sequences2 = util.get_sequences(file2) data, labels = create_data(sequences1, sequences2) return data, labels # pipeline if __name__ == '__main__': print("Python version:\n{}\n".format(sys.version)) print("matplotlib version: {}".format(matplotlib.__version__)) print(plt.style.available) print("numpy version: {}".format(np.__version__)) input_file = sys.argv[1] output_dir = sys.argv[2] util.remove_old_output_files(output_dir) sys.stdout = open(util.generate_output_filename(output_dir), "w") lines = util.readFile(input_file) threegrams = create_keys(3) twograms = create_keys(2) proteins = [] accuracies = [] for line in lines: if line: print("-----------------------------------------") print(line[0], line[1], line[2], line[3]) print("start processing ", line[2], line[3]) file1 = line[0] file2 = line[1] analysis_type = line[2] protein = line[3] util.start_roc_plot(analysis_type + '-' + protein)
def _segment_label(description):
    """Return the label stored in the fourth ``|`` field of a description.

    The fourth ``|``-separated field is split on ``:`` and its second piece
    is returned. ``None`` is returned when the description has fewer than
    four fields or when that field contains no ``:``, so malformed headers
    are ignored instead of raising ``IndexError``.
    """
    fields = description.split("|")
    if len(fields) <= 3:
        return None
    parts = fields[3].split(":")
    if len(parts) < 2:
        return None
    return parts[1]


def get_m1_m2_sequences(fasta_file):
    """Collect the M1 and M2 sequences found in a FASTA source.

    Args:
        fasta_file: Anything accepted by ``SeqIO.parse(..., "fasta")``.

    Returns:
        A tuple ``(m1_sequences, m2_sequences)`` of lists holding the
        ``seq`` attribute of every record whose description is labelled
        ``M1`` or ``M2`` respectively, in file order.
    """
    m1_sequences = []
    m2_sequences = []
    # Single pass: parsing twice doubles the I/O and, for an already-open
    # handle, leaves nothing to read on the second pass.
    for record in SeqIO.parse(fasta_file, "fasta"):
        label = _segment_label(record.description)
        if label == 'M1':
            m1_sequences.append(record.seq)
        elif label == 'M2':
            m2_sequences.append(record.seq)
    return m1_sequences, m2_sequences


def readFile(filename):
    """Read a CSV file and return its rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of strings per line of the file; blank lines
        yield empty lists. The rows are read eagerly so that the file can
        be closed before returning.
    """
    # newline="" lets the csv module do its own newline handling.
    with open(filename, newline="") as f:
        return list(csv.reader(f))


# pipeline
if __name__ == '__main__':
    # Command line: <input csv> <output directory>
    input_file = sys.argv[1]
    output_dir = sys.argv[2]
    util.remove_old_output_files(output_dir)
    # From here on every print() goes to the file opened below.
    sys.stdout = open(util.generate_output_filename(output_dir, basename='dna_results'), "w")
    lines = util.readFile(input_file)
    threegrams = create_keys(3)
    proteins = []
    accuracies = []
    for line in lines:
        # Skip empty rows.
        if line:
            print("-----------------------------------------")
            print(line[0], line[1], line[2], line[3])
            print("start processing ", line[2], line[3])
            # Row layout: file1, file2, analysis type, protein.
            file1 = line[0]
            file2 = line[1]
            analysis_type = line[2]
            protein = line[3]