def Process_690(args):
    """Convert per-folder 690-dataset .npy train/test splits into k-mer TSVs.

    For every folder under ``args.file_path`` this loads sequence/target
    arrays, pairs each sequence with its label, shuffles reproducibly with
    ``args.seed``, and writes ``train.tsv`` / ``dev.tsv`` (columns:
    sequence, label) under ``args.output_path/<kmer>/<folder>``.
    """
    count = 0
    for folder in os.listdir(args.file_path):
        # Load the raw numpy arrays for this folder.
        train_sequences = np.load(os.path.join(args.file_path, folder, "train", "sequences_alph.npy"))
        test_sequences = np.load(os.path.join(args.file_path, folder, "test", "sequences_alph.npy"))
        train_labels = np.load(os.path.join(args.file_path, folder, "train", "targets.npy"))
        test_labels = np.load(os.path.join(args.file_path, folder, "test", "targets.npy"))

        # Pair each sequence with its label as rows of a 2-column array.
        trains = list(np.concatenate((train_sequences.reshape(-1, 1),
                                      train_labels.reshape(-1, 1)), axis=1))
        tests = list(np.concatenate((test_sequences.reshape(-1, 1),
                                     test_labels.reshape(-1, 1)), axis=1))

        # Shuffle reproducibly; the double shuffle is kept deliberately so the
        # output ordering matches the original implementation exactly.
        random.seed(args.seed)
        random.shuffle(trains)
        random.shuffle(trains)
        random.shuffle(tests)
        random.shuffle(tests)

        output_path = os.path.join(args.output_path, str(args.kmer), folder)
        # Fixed: exist_ok avoids the exists()/makedirs() race of the original.
        os.makedirs(output_path, exist_ok=True)

        # Fixed: `with` guarantees the handles are closed (original leaked both).
        with open(os.path.join(output_path, "train.tsv"), 'wt') as f_train:
            tsv_train = csv.writer(f_train, delimiter='\t')
            tsv_train.writerow(["sequence", "label"])
            for seq, label in trains:
                # Sequences are stored as bytes in the .npy files.
                tsv_train.writerow([get_kmer_sentence(seq.decode("utf-8"), args.kmer), int(label)])

        with open(os.path.join(output_path, "dev.tsv"), 'wt') as f_dev:
            tsv_dev = csv.writer(f_dev, delimiter='\t')
            tsv_dev.writerow(["sequence", "label"])
            for seq, label in tests:
                tsv_dev.writerow([get_kmer_sentence(seq.decode("utf-8"), args.kmer), int(label)])

        count += 1
        print("Finish %s folders" % (count))
def write_file_pair(lines, writer, seq1_index=0, seq2_index=1, label_index=2):
    """Emit (sequence-pair, label) rows through *writer* as k-mer sentences.

    NOTE(review): reads the module-level ``args`` for the k-mer size —
    confirm it is initialized before this is called.
    """
    kmer = args.kmer
    for row in lines:
        first = get_kmer_sentence(row[seq1_index], kmer)
        second = get_kmer_sentence(row[seq2_index], kmer)
        label = str(int(row[label_index]))
        writer.writerow([first, second, label])
def write_file(lines, writer, seq_index=2, label_index=3):
    """Emit (sequence, label) rows through *writer*, tracking the longest one.

    Updates the module-level ``max_length`` with the largest token count seen.
    NOTE(review): another ``write_file`` with a different signature exists in
    this file; whichever is defined later shadows the other — confirm intent.
    NOTE(review): reads the module-level ``args`` for the k-mer size.
    """
    global max_length
    for line in lines:
        sentence = get_kmer_sentence(line[seq_index], args.kmer)
        # Fixed: the original split the sentence twice just to measure it.
        n_tokens = len(sentence.split())
        if n_tokens > max_length:
            max_length = n_tokens
        writer.writerow([sentence, str(line[label_index])])
def Process(args):
    """Slide ``args.slide`` 500-char windows over each sequence; write dev.tsv.

    Reads a comma-separated file (header skipped), takes the label from
    column 6 and the sequence from column 8, writes one k-mer sentence per
    window to ``dev.tsv``, and saves all labels to ``label.npy``.
    """
    # Window start offsets. Fixed: slide == 1 now yields a single window at
    # offset 0 instead of raising ZeroDivisionError.
    if args.slide > 1:
        scan_list = [int(500 / (args.slide - 1)) * i for i in range(args.slide)]
    else:
        scan_list = [0]

    # Fixed: `with` closes the input handle (original leaked it).
    with open(args.file_path, "r", encoding="utf-8-sig") as old_file:
        old_lines = list(csv.reader(old_file, delimiter=",", quotechar=None))[1:]

    if args.output_path:
        root_path = args.output_path + "/"
    else:
        root_path = "/".join(args.file_path.split("/")[:-1]) + "/" + str(args.kmer) + "/"
    if not os.path.exists(root_path):
        os.makedirs(root_path)

    labels = np.array([])
    # Fixed: `with` guarantees dev.tsv is flushed and closed.
    with open(root_path + "dev.tsv", 'wt') as new_file:
        tsv_w = csv.writer(new_file, delimiter='\t')
        # Header typo "setence" kept byte-for-byte: downstream readers and the
        # sibling write_file() both use this exact column name.
        tsv_w.writerow(["setence", "label"])
        for line in old_lines:
            label = line[6]
            labels = np.append(labels, int(label))
            for index in scan_list:
                sub_sequence = line[8][index:index + 500]
                sub_sentence = get_kmer_sentence(sub_sequence, kmer=args.kmer)
                tsv_w.writerow([sub_sentence, label])

    np.save(root_path + "label.npy", labels)
def write_file(lines, path, kmer, head=True, seq_index=0, label_index=1):
    """Write (sequence, label) rows from *lines* to a TSV file at *path*.

    kmer == 0 writes the raw sequence unchanged; otherwise whitespace is
    stripped and the sequence is re-tokenized into overlapping k-mers.
    label_index=None writes a constant "0" label for every row.
    """
    with open(path, 'wt') as f:
        tsv_w = csv.writer(f, delimiter='\t')
        if head:
            # Header typo "setence" kept byte-for-byte for downstream readers.
            tsv_w.writerow(["setence", "label"])
        for line in lines:
            if kmer == 0:
                sentence = str(line[seq_index])
            else:
                sentence = str(get_kmer_sentence("".join(line[seq_index].split()), kmer))
            # Fixed: identity comparison with None (was `label_index == None`).
            label = "0" if label_index is None else str(line[label_index])
            tsv_w.writerow([sentence, label])
def Process_splice(args):
    """Write splice-site test data as k-mer ``dev.tsv`` files for k in 3..6.

    Expects ``x_test.npy`` (sequences) and ``y_test.npy`` (one-hot labels)
    under ``args.file_path``. Each one-hot row is converted back to its
    class index via ``np.where``.

    Removed: a large block of commented-out train/dev processing code that
    duplicated this logic for the training split.
    """
    X_test = np.load(os.path.join(args.file_path, "x_test.npy"))
    Y_test = np.load(os.path.join(args.file_path, "y_test.npy"))
    assert len(X_test) == len(Y_test)

    for kmer in range(3, 7):
        root_path = os.path.join(args.file_path, str(kmer))
        # Fixed: exist_ok so re-running the script does not crash.
        os.makedirs(root_path, exist_ok=True)
        # Fixed: `with` closes dev.tsv even on error (original leaked it).
        with open(os.path.join(root_path, "dev.tsv"), "wt") as f_test:
            tsv_test = csv.writer(f_test, delimiter='\t')
            tsv_test.writerow(["seq", "label"])
            for seq, one_hot in zip(X_test, Y_test):
                sequence = get_kmer_sentence(str(seq), kmer)
                # One-hot row -> class index; assumes exactly one 1 per row
                # (int() on np.where's result requires a single match).
                label = int(np.where(one_hot == 1)[0])
                tsv_test.writerow([sequence, label])
def Visualize(args):
    """Render a heatmap of per-position attention scores for a DNA sequence.

    With ``args.kmer == 0`` the scores of the 3/4/5/6-mer models found under
    ``args.model_path/<k>`` are each L2-normalized and summed into one row;
    otherwise the single model at ``args.model_path`` is used as-is.
    The sequence comes from ``args.sequence`` (falling back to the module
    constant ``SEQUENCE``).
    """
    raw_sentence = args.sequence if args.sequence else SEQUENCE

    def model_scores(kmer, model_path):
        # Load one model, run attention over the k-mer sentence, and collapse
        # the per-token attention into per-position real scores.
        model = BertModel.from_pretrained(model_path, output_attentions=True)
        tokenizer = DNATokenizer.from_pretrained('dna' + str(kmer),
                                                 do_lower_case=False)
        sentence_a = get_kmer_sentence(raw_sentence, kmer)
        attention = get_attention_dna(model, tokenizer, sentence_a,
                                      start=args.start_layer,
                                      end=args.end_layer)
        column = np.array(attention).reshape(np.array(attention).shape[0], 1)
        return get_real_score(column, kmer, args.metric)

    if args.kmer == 0:
        # Ensemble mode: normalize each model's scores, then sum across k.
        scores = None
        for kmer in [3, 4, 5, 6]:
            real_scores = model_scores(kmer, os.path.join(args.model_path, str(kmer)))
            real_scores = real_scores / np.linalg.norm(real_scores)
            row = real_scores.reshape(1, real_scores.shape[0])
            scores = row if scores is None else scores + row
    else:
        # Single-model mode: no normalization, matching the ensemble's absence
        # of a final rescale.
        real_scores = model_scores(args.kmer, args.model_path)
        scores = real_scores.reshape(1, real_scores.shape[0])

    ave = np.sum(scores) / scores.shape[1]
    print(ave)
    print(scores)

    # Plot the single-row score matrix as a heatmap.
    sns.set()
    ax = sns.heatmap(scores, cmap='YlGnBu', vmin=0)
    plt.show()