def run_RF(X, test_x, labels_x, f, REPORT, out_dir, dic_dir):
    '''Train a random forest on vectorised text plus dictionary features.

    The texts are vectorised (TF-IDF or bag-of-words, at most 1000 terms) and
    stacked column-wise with the output of ``build_features(dic_dir, ...)``.

    Args:
        X: training texts.
        test_x: texts to predict; only used when ``REPORT == 'yes'``.
        labels_x: training labels, one per entry of ``X``.
        f: vectoriser to use, ``'TF-IDF'`` or ``'BoW'``.
        REPORT: ``'yes'`` fits on all of ``X`` and writes the predictions for
            ``test_x`` to ``<out_dir>/models/re_predictions.txt``; ``'no'``
            evaluates on 10 random 80/20 hold-out splits and prints the mean
            macro precision, recall and F1.
        out_dir: output directory; ``<out_dir>/models`` is created if missing.
        dic_dir: passed through to ``build_features``.

    Raises:
        ValueError: if ``f`` or ``REPORT`` is not one of the supported values.
    '''
    if REPORT not in ('yes', 'no'):
        raise ValueError("REPORT must be 'yes' or 'no', got " + repr(REPORT))
    if f == 'TF-IDF':
        vect = TfidfVectorizer(max_features=1000)
    elif f == 'BoW':
        vect = CountVectorizer(max_features=1000)
    else:
        raise ValueError("f must be 'TF-IDF' or 'BoW', got " + repr(f))

    np.random.seed(500)
    n_folds = 1 if REPORT == 'yes' else 10
    VALIDATION_SPLIT = 0.2  # Validation %
    l_precision, l_recall, l_f1 = [], [], []

    # Everything below is identical for every fold, so it is computed once.
    all_labels = LabelEncoder().fit_transform(labels_x)
    vect.fit(X)
    all_train = hstack((vect.transform(X), build_features(dic_dir, X)))
    if REPORT == 'yes':
        all_test = hstack((vect.transform(test_x),
                           build_features(dic_dir, test_x)))

    os.makedirs(out_dir + '/models', exist_ok=True)

    for n_ in range(n_folds):
        print('Running fold ' + str(n_))
        RF = RandomForestClassifier(random_state=0, n_estimators=100)

        if REPORT == 'yes':
            RF.fit(all_train, all_labels)
            y_pred = RF.predict(all_test)
            write_list(y_pred, out_dir + '/models/re_predictions.txt',
                       iterate=True, encoding=encoding)
            continue

        # A fixed seed here would make all ten folds the same split (and the
        # reported mean a single result repeated); offset it by the fold
        # index. Fold 0 keeps the historical seed 42.
        train, test, labels, test_labels = train_test_split(
            all_train, all_labels, test_size=VALIDATION_SPLIT,
            random_state=42 + n_)
        RF.fit(train, labels)
        y_pred = RF.predict(test)
        l_precision.append(precision_score(y_true=test_labels, y_pred=y_pred,
                                           average='macro'))
        l_recall.append(recall_score(y_true=test_labels, y_pred=y_pred,
                                     average='macro'))
        l_f1.append(f1_score(y_true=test_labels, y_pred=y_pred,
                             average='macro'))

    if REPORT == 'no':
        l_results = 're_RF' + '\t' + str(np.mean(l_precision)) + '\t' + str(
            np.mean(l_recall)) + '\t' + str(np.mean(l_f1))
        print(l_results)
positive.append(j.split('.')[0]) if str(j.split('.')[0]) in positive_abstracts: c.append(j) if str(j.split('.')[0]) not in positive_abstracts: d.append(j) c, d = list(set(c)), list(set(d)) c_tp, c_fp = len(c), len(d) precision = c_tp / (c_tp + c_fp) recall = c_tp / (c_tp + (len(positive_abstracts) - c_tp)) f1 = 2 * recall * precision / (recall + precision) print(predictions + ': PRECISION = ' + str(precision) + ', RECALL = ' + str(recall) + ', F1-SCORE = ' + str(f1)) write_list(list(set(positive)), args.i2 + '/triage_pmids.output', True, 'latin-1') if task == 're': labels = read_as_list(predictions + '.txt', encoding='latin-1') map_idx = {} map_idx['0'], map_idx['1'], map_idx['2'], map_idx[ '3'] = 'activation', 'none', 'repression', 'undefined' labels = [map_idx[x] for x in labels] export, final = [], [] df = pd.read_csv(args.i2 + '/re_test.csv') values = df.values.tolist() for l, label in zip(values, labels):
def _normalise_mention(text):
    '''Replace '_' and '-' by spaces and collapse runs of whitespace.'''
    return ' '.join(text.replace('_', ' ').replace('-', ' ').split())


def _parse_entity(line):
    '''Return (start, end, mention) from a tab-separated entity line.'''
    cols = line.split('\t')
    location = cols[1].split(' ')
    return int(location[1]), int(location[2]), str(cols[2])


def _entity_type(line):
    '''Return the first token of the second tab-separated column.'''
    return line.split('\t')[1].split(' ')[0]


def _mask_entity_pair(txt, b1, e1, b2, e2):
    '''Return txt with span 1 replaced by 'gene1' and span 2 by 'gene2'.'''
    if b1 > b2:
        return txt[:b2] + 'gene2' + txt[e2:b1] + 'gene1' + txt[e1:]
    return txt[:b1] + 'gene1' + txt[e1:b2] + 'gene2' + txt[e2:]


def _build_pair_example(txt, span1, span2, mask_words, doc_id,
                        sentence_index, f2, nlp, seen):
    '''Build one example for an entity pair.

    Returns (preprocessed, original, ID), or None when no sentence holds both
    placeholders, when preprocessing loses a placeholder, or when the
    preprocessed sentence is already in ``seen``.
    '''
    masked = _mask_entity_pair(txt, span1[0], span1[1], span2[0], span2[1])

    # Keep the last sentence that contains both placeholders.
    sentence, idx = None, None
    for pos, candidate in enumerate(nltk.sent_tokenize(masked)):
        if 'gene1' in candidate and 'gene2' in candidate:
            sentence, idx = candidate, pos
    if not sentence:
        return None

    sentence = sentence.replace('_', ' ').replace('-', ' ')
    for w in mask_words:
        sentence = sentence.replace(w, 'genex')
    sentence, all_entities = find_experimental_methods(f2, sentence, ['DBTF'])
    out1 = preprocess_text(sentence, nlp, all_entities)
    if 'gene1' not in out1 or 'gene2' not in out1 or out1 in seen:
        return None

    if sentence_index == 'None':
        # Whole-abstract file: recover the matching sentence of the raw text.
        source = nltk.sent_tokenize(txt)[int(idx)]
        return out1, out1[0:4] + _normalise_mention(source), \
            doc_id + ':' + str(idx)
    return out1, out1[0:4] + _normalise_mention(txt), doc_id


def build_data(l_texts, l_ann, type_data, f2, option, out_file):
    '''Export relation-extraction examples built from annotation files.

    For every annotation file, each annotated relation yields a positive
    example (labelled with the lower-cased relation type) and each remaining
    DBTF/entity pair yields a negative example (labelled 'none'). The two
    mentions are replaced by 'gene1'/'gene2' and every other entity mention
    by 'genex'. A preprocessed sentence is exported only once.

    Args:
        l_texts: unused.
        l_ann: annotation file names inside ``type_data``; the matching text
            is read from ``<name up to the first dot>.txt``.
        type_data: directory holding the files; when it equals 'train' only
            'T' lines that are not annotator notes count as entities.
        f2: passed through to ``find_experimental_methods``.
        option: used in the output file names; 'test' additionally writes
            ``re_test.csv``.
        out_file: output directory.

    Files that raise while being processed are skipped with a warning on
    stderr; examples exported before the failure are kept.
    '''
    import sys  # local import: only needed for the skip warning below

    nlp = spacy.load('en')
    original, all_sentences, tags = [], [], []
    l_gene1, l_gene2, l_pmids = [], [], []
    seen = set()  # preprocessed sentences already exported (O(1) lookups)

    def keep(example, tag, gene1, gene2):
        # Append to all six lists together so they can never get out of step.
        out1, original_, ID = example
        seen.add(out1)
        all_sentences.append(out1)
        tags.append(tag)
        original.append(original_)
        l_gene1.append(gene1)
        l_gene2.append(gene2)
        l_pmids.append(ID)

    for a in l_ann:
        doc_id = a.split('.')[0]
        id_parts = doc_id.split(':')
        sentence_index = id_parts[1] if len(id_parts) == 2 else 'None'
        already = set()  # mentions used by an exported positive pair
        try:
            ann = read_as_list(type_data + '/' + a, encoding=encoding)
            txt = ''.join(
                read_as_list(type_data + '/' + doc_id + '.txt',
                             encoding=encoding))
            relations = [x for x in ann if x.startswith('R')]
            if type_data == 'train':
                entities = [
                    x for x in ann if x.startswith('T')
                    and x.split('\t')[1][0:14] != 'AnnotatorNotes'
                ]
            else:
                entities = ann
            n_dbtfs = [x for x in entities if _entity_type(x) == 'DBTF']

            # At least one DBTF and at least two entities are required.
            if not n_dbtfs or len(entities) < 2:
                continue

            # Mentions to blank out as 'genex'. Empty mentions are dropped:
            # str.replace('', ...) would insert the filler between all chars.
            mask_words = []
            for e in entities:
                cols = e.split('\t')
                if cols[0][0] == 'T' and cols[1][0:14] != 'AnnotatorNotes':
                    w = _normalise_mention(cols[2])
                    if w:
                        mask_words.append(w)

            # Build positive sentences!
            for r in relations:
                fields = r.split('\t')[1].split(' ')
                tag = fields[0]
                ent1 = fields[1].split(':')[1]
                ent2 = fields[2].split(':')[1]
                first = second = None
                for e in entities:
                    cols = e.split('\t')
                    if cols[1][0:14] == 'AnnotatorNotes':
                        continue
                    if cols[0] == ent1:
                        first = _parse_entity(e)
                    if cols[0] == ent2:
                        second = _parse_entity(e)
                if first is None or second is None:
                    # An argument is missing from the entity list; never
                    # fall back to offsets left over from another relation.
                    continue
                example = _build_pair_example(txt, first, second, mask_words,
                                              doc_id, sentence_index, f2, nlp,
                                              seen)
                if example is None:
                    continue
                word1 = _normalise_mention(first[2])
                word2 = _normalise_mention(second[2])
                keep(example, tag.lower(), word1, word2)
                # NOTE(review): only exported pairs mark their mentions as
                # used; confirm this matches the intended negative sampling.
                already.add(word1)
                already.add(word2)

            # Build negative sentences!
            non_relations = [
                e for e in entities if e.startswith('T')
                and e.split('\t')[1][0:14] != 'AnnotatorNotes'
                and _normalise_mention(e.split('\t')[2]) not in already
            ]
            partners = [
                y for y in non_relations
                if _entity_type(y) in ('DBTF', 'NONDBTF')
            ]
            for x in non_relations:
                if _entity_type(x) != 'DBTF':
                    continue
                b1, e1, word1 = _parse_entity(x)
                word1 = _normalise_mention(word1)
                for y in partners:
                    b2, e2, word2 = _parse_entity(y)
                    word2 = _normalise_mention(word2)
                    # No self-regulation: skip identical spans or mentions.
                    if b1 == b2 or word1 == word2:
                        continue
                    example = _build_pair_example(txt, (b1, e1), (b2, e2),
                                                  mask_words, doc_id,
                                                  sentence_index, f2, nlp,
                                                  seen)
                    if example is not None:
                        keep(example, 'none', word1, word2)
        except Exception as err:
            # Best effort: a malformed or missing file must not stop the
            # export, but it should not vanish silently either.
            print('Skipping ' + a + ': ' + repr(err), file=sys.stderr)
            continue

    df = pd.DataFrame()
    df['all_sentences'] = all_sentences
    df['tags'] = tags
    df['original'] = original
    df['l_gene1'] = l_gene1
    df['l_gene2'] = l_gene2
    df['l_pmids'] = l_pmids
    if option == 'test':
        df.to_csv(out_file + '/re_test.csv', index=False)
    write_list(tags, out_file + '/re_' + option + '_labels.txt',
               iterate=True, encoding=encoding)
    write_list(original, out_file + '/re_' + option + '_original.txt',
               iterate=True, encoding=encoding)
    write_list(all_sentences,
               out_file + '/re_' + option + '_preprocessed.txt',
               iterate=True, encoding=encoding)
def _annotation_type(line):
    '''Return the entity type token (e.g. 'DBTF') of a tool-prefixed line.'''
    return line.split('\t')[1].split(' ')[0]


def _annotation_span(line):
    '''Return the (start, end) offset strings of a tool-prefixed line.'''
    location = line.split('\t')[1].split(' ')
    return location[1], location[2]


def merge(f, f2, f3, f_out):
    '''Merge the entity annotations of two taggers into one .ann per text.

    Lines from the first tool are prefixed with 'N_' and lines from the
    second with 'G_'. Entities are kept in four passes (N DBTF, G NONDBTF,
    N NONDBTF, G DBTF); from the second pass on, an entity is dropped when
    its (start, end) span was already kept by an earlier pass. '#' note lines
    of the first tool are kept when their number matches a kept entity.

    Args:
        f: directory with the first tool's ``<text>.out.minfner`` files.
        f2: directory with the second tool's
            ``<text stem>:0.txt.out.minfner`` files.
        f3: directory with the ``.txt`` files to process.
        f_out: output directory for the merged ``.ann`` and the text files.
    '''
    ntnu_files = {name for name in listdir(f) if name.endswith('.minfner')}
    gnormplus_files = {
        name for name in listdir(f2) if name.endswith('.minfner')
    }
    l_text = [name for name in listdir(f3) if name.endswith('.txt')]

    # (tool prefix, entity type) in decreasing priority.
    priority = (('N', 'DBTF'), ('G', 'NONDBTF'), ('N', 'NONDBTF'),
                ('G', 'DBTF'))

    for text in l_text:
        tx = read_as_list(f3 + '/' + text, encoding=encoding)

        # Collect both tools' lines. A text that neither tool annotated
        # yields an empty list (previously a non-iterable False).
        entities = []
        ann = text + '.out.minfner'
        if ann in ntnu_files:
            entities += [
                'N_' + s for s in read_as_list(f + '/' + ann,
                                               encoding=encoding)
            ]
        ann = text.split('.')[0] + ':0.txt.out.minfner'
        if ann in gnormplus_files:
            entities += [
                'G_' + s for s in read_as_list(f2 + '/' + ann,
                                               encoding=encoding)
            ]

        final_merge = []
        for tool, kind in priority:
            # Spans claimed by earlier passes only; two entities of the same
            # pass may share a span.
            taken = {_annotation_span(x) for x in final_merge if x[2] == 'T'}
            final_merge += [
                x for x in entities
                if x[0] == tool and _annotation_type(x) == kind
                and _annotation_span(x) not in taken
            ]

        # Keep the first tool's notes whose number matches a kept entity.
        kept_numbers = {x.split('\t')[0][3:] for x in final_merge}
        final_merge += [
            x for x in entities
            if x[0:3] == 'N_#' and x.split('\t')[0][3:] in kept_numbers
        ]

        write_out = []
        for e in (x[2:] for x in final_merge):  # strip the tool prefix
            e_ = e.split('\t')
            if e[0] == 'T':
                location = e_[1].split(' ')
                entity, tag, start, end, word = (e_[0], location[0],
                                                 location[1], location[2],
                                                 e_[2])
                write_out.append(
                    str(entity) + '\t' + tag + ' ' + str(start) + ' ' +
                    str(end) + '\t' + word)
            if e[0] == '#':
                entity = e_[0]
                ID = ' '.join(e_[1].split(' ')[1:]) + ' ' + e_[2]
                write_out.append(
                    str(entity) + '\t' + 'AnnotatorNotes T' +
                    str(entity[1:]) + '\t' + str(ID))

        if write_out:
            write_list(write_out, f_out + '/' + text.split('.')[0] + '.ann',
                       iterate=True, encoding=encoding)
        # NOTE(review): the text is copied even when no entity survived the
        # merge; confirm downstream steps accept a .txt without an .ann.
        write_list(tx, f_out + '/' + text, iterate=True, encoding=encoding)
def _read_glove(path):
    '''Load a GloVe text file into a {word: float32 vector} dictionary.'''
    vectors = {}
    with open(path, encoding='utf8') as handle:
        for line in handle:
            values = line.split()
            if not values:
                continue
            vectors[values[0]] = np.asarray(values[1:], dtype='float32')
    return vectors


def run_RNN(X, test, labels, path_to_glove, REPORT, out_folder):
    '''Bidirectional LSTM with attention for relation classification.

    Args:
        X: training texts.
        test: texts to predict; predictions are written only when
            ``REPORT == 'yes'``.
        labels: training labels, one per entry of ``X``.
        path_to_glove: GloVe text file; the third '/'-separated component of
            this path (up to its first dot) is used in the model file names.
        REPORT: ``'yes'`` trains once on all data, writes
            ``<out_folder>/models/re_predictions.txt`` and saves the model;
            ``'no'`` runs 10 random 80/20 splits and prints the mean macro
            precision, recall and F1.
        out_folder: output directory.

    Raises:
        ValueError: if ``REPORT`` is neither ``'yes'`` nor ``'no'``.
    '''
    if REPORT not in ('yes', 'no'):
        raise ValueError("REPORT must be 'yes' or 'no', got " + repr(REPORT))

    # Hyper-parameters of the model
    MAX_SEQUENCE_LENGTH = 100
    MAX_NB_WORDS = 500
    LSTM_DIM = 100
    VALIDATION_SPLIT = 0.2
    n_folds = 1 if REPORT == 'yes' else 10

    # Categorize target labels !
    n_labels = len(set(labels))
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(array(labels))
    onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
    labels = onehot_encoder.fit_transform(
        integer_encoded.reshape(len(integer_encoded), 1))
    # Class names in encoded order, for however many classes there are.
    print(list(label_encoder.classes_))

    # NOTE(review): str() of a bytes object yields its repr ("b'...'"), so
    # every text carries that wrapper. Left unchanged because altering it
    # changes the token sequences the model is trained on.
    texts = [
        str(BeautifulSoup(doc, 'html.parser').get_text().encode())
        for doc in X
    ]
    texts_test = [
        str(BeautifulSoup(doc, 'html.parser').get_text().encode())
        for doc in test
    ]

    # One tokenizer, fitted on the training texts, encodes both sets.
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(texts)
    word_index = tokenizer.word_index
    data = pad_sequences(tokenizer.texts_to_sequences(texts),
                         maxlen=MAX_SEQUENCE_LENGTH)
    data_test = pad_sequences(tokenizer.texts_to_sequences(texts_test),
                              maxlen=MAX_SEQUENCE_LENGTH)

    # Build GloVe dictionaries once; the file does not change between folds.
    embeddings_index = _read_glove(path_to_glove)
    if embeddings_index:
        embedding_dim = len(next(iter(embeddings_index.values())))
    else:
        embedding_dim = MAX_SEQUENCE_LENGTH  # historical default width

    glove_tag = path_to_glove.split('/')[2].split('.')[0]
    model_type = out_folder + "/Bidirectional LSTM with Attention"
    os.makedirs(out_folder + '/models', exist_ok=True)

    # Run Model with Cross-validation
    l_precision, l_recall, l_f1, l_val_accuracy = [], [], [], []
    for n_ in range(n_folds):
        print('Running fold ' + str(n_))

        # Shuffle data
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

        # Split data
        if REPORT == 'yes':
            x_train, y_train = data, labels
            x_val, y_val = data, labels
        else:
            x_train = data[:-nb_validation_samples]
            y_train = labels[:-nb_validation_samples]
            x_val = data[-nb_validation_samples:]
            y_val = labels[-nb_validation_samples:]

        # Oversampling of minority label in training set
        ada = RandomOverSampler(random_state=42,
                                sampling_strategy='minority')
        X_train_resampled, y_train_resampled = ada.fit_resample(
            x_train, y_train)

        # Words missing from GloVe keep their random initialisation.
        embedding_matrix = np.random.random(
            (len(word_index) + 1, embedding_dim))
        for word, i in word_index.items():
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        embedding_layer = Embedding(len(word_index) + 1,
                                    embedding_dim,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=True)
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded_sequences = embedding_layer(sequence_input)

        out = Bidirectional(
            LSTM(LSTM_DIM, return_sequences=True, dropout=0.30,
                 recurrent_dropout=0.30))(embedded_sequences)
        out = Attention(MAX_SEQUENCE_LENGTH)(out)
        out = Dense(LSTM_DIM, activation="relu")(out)
        out = Dropout(0.30)(out)
        out = Dense(n_labels, activation="softmax")(out)
        model = Model(sequence_input, out)

        # Save model architecture
        if not os.path.isfile(model_type + '.png'):
            plot_model(model, to_file=model_type + '.png', show_shapes=True,
                       show_layer_names=True)

        # Model optimizer and metrics
        opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None,
                   decay=0.0, amsgrad=False)
        model.compile(loss='categorical_crossentropy', optimizer=opt,
                      metrics=['accuracy'])

        # Model parameters
        early_stopping = EarlyStopping(monitor='val_loss', patience=5,
                                       verbose=0, mode='min')
        model_filepath_weights = (out_folder + '/models/triage_' + glove_tag
                                  + '_' + str(n_) + '.h5')
        model_filepath_json = (out_folder + '/models/triage_' + glove_tag +
                               '_' + str(n_) + '.json')
        checkpoint = ModelCheckpoint(model_filepath_weights,
                                     monitor='val_acc', verbose=0,
                                     save_best_only=True, mode='max')
        callbacks_list = [early_stopping, checkpoint]
        history = model.fit(X_train_resampled, y_train_resampled,
                            validation_data=(x_val, y_val), epochs=100,
                            callbacks=callbacks_list, verbose=0)

        # Predictions
        if REPORT == 'yes':
            y_pred = model.predict(data_test).argmax(axis=-1)
            y_pred = [str(x) for x in y_pred]
            write_list(y_pred, out_folder + '/models/re_predictions.txt',
                       iterate=True, encoding=encoding)
            # NOTE(review): the model is serialised only in this mode;
            # confirm that per-fold models are not needed for REPORT == 'no'.
            # serialize model to JSON
            with open(model_filepath_json, "w") as json_file:
                json_file.write(model.to_json())
            # serialize weights to HDF5
            keras.models.save_model(model, model_filepath_weights)
            print("Saved model to disk")
        else:
            y_pred = model.predict(x_val).argmax(axis=-1)
            y_val = y_val.argmax(axis=-1)
            y_pred = [str(x) for x in y_pred]
            y_val = [str(x) for x in y_val]
            # Results
            l_precision.append(precision_score(y_true=y_val, y_pred=y_pred,
                                               average='macro'))
            l_recall.append(recall_score(y_true=y_val, y_pred=y_pred,
                                         average='macro'))
            l_f1.append(f1_score(y_true=y_val, y_pred=y_pred,
                                 average='macro'))
            l_val_accuracy.append(history.history['val_acc'][-1])

    if REPORT == 'no':
        l_results = 're_RNN' + '\t' + str(np.mean(l_precision)) + '\t' + str(
            np.mean(l_recall)) + '\t' + str(np.mean(l_f1))
        print(l_results)
return out, out_original, labels, list_pmids if '__main__' == __name__: ''' Exports triage data. ''' encoding = "latin-1" nltk.download('punkt') nltk.download('stopwords') parser = argparse.ArgumentParser(description='Options') parser.add_argument('--i1', type=str, help="""Folder with abstracts and annotations.""") parser.add_argument('--i2', type=str, help="""Data folder.""") parser.add_argument('--i3', type=str, help="""Data folder.""") parser.add_argument('--option', type=str, help="""Train or test.""") parser.add_argument('--o', type=str, help="""Output folder.""") args = parser.parse_args() pubtator_articles = [f for f in listdir(args.i1) if f.endswith('.txt')] # Replace words by entities! data, data_original, labels, list_pmids = build_data(args.i1, args.i2, args.i3, pubtator_articles, args.option) if args.option == 'test': write_list(list_pmids, args.o + '/triage_test_pmids.txt', iterate=True, encoding=encoding) write_list(data, args.o + '/triage_' + args.option + '_preprocessed.txt', iterate=True, encoding=encoding) write_list(data_original, args.o + '/triage_' + args.option + '_original.txt', iterate=True, encoding=encoding) write_list(labels, args.o + '/triage_' + args.option + '_labels.txt', iterate=True, encoding=encoding)
import argparse
from os import listdir
from export_abstracts import read_as_list, write_list


def _filter_annotations(ann, to_avoid):
    '''Return the annotation lines to keep, de-duplicated, in a stable order.

    'T' lines are kept unless their third tab-separated field, with spaces
    removed, is listed in ``to_avoid``. '#' lines are kept when their number
    matches a kept 'T' line. All other lines are dropped.
    '''
    blocked = set(to_avoid)
    kept, kept_numbers = [], set()
    for line in ann:
        # startswith() also copes with blank lines, where line[0] would fail.
        if not line.startswith('T'):
            continue
        if line.split('\t')[2].replace(' ', '') not in blocked:
            kept.append(line)
            kept_numbers.add(line.split('\t')[0][1:])
    kept += [
        line for line in ann
        if line.startswith('#') and line.split('\t')[0][1:] in kept_numbers
    ]

    # Drop duplicates but keep first-seen order, so that re-running the
    # script on the same input always writes the same file.
    seen, unique = set(), []
    for line in kept:
        if line not in seen:
            seen.add(line)
            unique.append(line)
    return unique


if '__main__' == __name__:
    # Removes wrong genes.
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--i1', type=str, help="""Data folder.""")
    parser.add_argument('--i2', type=str, help="""Data folder.""")
    args = parser.parse_args()

    to_avoid = read_as_list(args.i1 + '/to_avoid.txt', encoding='latin-1')
    l_ann = [f for f in listdir(args.i2) if f.endswith('.ann')]
    for a in l_ann:
        ann = read_as_list(args.i2 + '/' + a, encoding='latin-1')
        # Each annotation file is rewritten in place.
        write_list(_filter_annotations(ann, to_avoid), args.i2 + '/' + a,
                   True, 'latin-1')
# Build the triage training PMID and label lists from three annotation files.
encoding = "latin-1"
parser = argparse.ArgumentParser(description='Options')
parser.add_argument('--i', type=str, help="""Input directory.""")
parser.add_argument('--o', type=str, help="""Output directory.""")
args = parser.parse_args()

ann1 = pd.read_csv(args.i + '/abstracts.all.labeled.csv', sep='\n|\t',
                   encoding=encoding, engine='python')
# Blank rows are skipped; they have no PMID or label column to index.
ann2 = read_as_list(args.i + '/hackaton_1.tsv', encoding=encoding)
ann2 = [x.split('\t') for x in ann2 if x.strip()]
ann3 = read_as_list(args.i + '/hackaton_2.tsv', encoding=encoding)
ann3 = [x.split('\t') for x in ann3 if x.strip()]

# Map the boolean labels True/False to 1/0. The column is reassigned rather
# than replaced in place: an in-place replace on a selected column is not
# guaranteed to write back to the DataFrame.
di = {True: 1, False: 0}
ann1['label'] = ann1['label'].replace(di)

# In the .tsv files column 0 is used as the PMID and column 2 as the label.
pmid_1, l_1 = list(ann1['pmid']), list(ann1['label'])
pmid_2, l_2 = [x[0] for x in ann2], [x[2] for x in ann2]
pmid_3, l_3 = [x[0] for x in ann3], [x[2] for x in ann3]

df = pd.DataFrame()
df['pmid'] = pmid_1 + pmid_2 + pmid_3
df['label'] = l_1 + l_2 + l_3
df = df.astype({'pmid': int, 'label': int})
df = remove_duplicates(df, 'pmid')

list_PMID = list(df['pmid'])
list_labels = list(df['label'])
write_list(list_PMID, args.o + '/triage_train_pmids.txt', iterate=True,
           encoding=encoding)
write_list(list_labels, args.o + '/triage_train_labels.txt', iterate=True,
           encoding=encoding)