def export_entities(dictionary, list_of_files, label, T, f, f2):
    ''' Tags dictionary matches in each text file and appends brat-style annotations. '''

    # Build a keyword matcher from the dictionary terms.
    keyword_processor = KeywordProcessor()
    dic = eval(open(f2 + "/" + dictionary, "r").read())
    for x in dic.keys():
        keyword_processor.add_keyword(x)

    for i, file_name in enumerate(list_of_files):
        try:
            with open(f + '/' + file_name.replace('txt2', 'out'), 'a') as file_out:
                sentence = read_as_list(f + '/' + file_name, encoding='latin-1')
                if sentence and len(sentence) == 1:
                    sentence = sentence[0]
                sentence2 = sentence.lower()
                nomes = keyword_processor.extract_keywords(sentence2)
                if nomes:
                    for nome in nomes:
                        try:
                            # Locate every whole-word occurrence of the matched keyword.
                            p = re.compile(r'\b%s\b' % re.escape(nome))
                            for m in p.finditer(sentence2):
                                file_out.write(
                                    str(T) + str(m.start()) + '\t' + label + '\t' +
                                    str(m.start()) + '\t' + str(m.end()) + '\t' +
                                    str(sentence[m.start():m.end()]) + '\t' +
                                    '#' + T[1] + str(m.start()) + '\t' +
                                    'AnnotatorNotes_' + T + str(m.start()) + '\t' +
                                    str(dic[nome]) + '\n')
                        except Exception:
                            continue
        except Exception:
            continue
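# Example (hypothetical arguments, shown only to illustrate the expected call shape;
# the dictionary file name, folders and entity prefix below are illustrative assumptions):
#
#   export_entities('dbtf.dic', ['12345678.txt2'], 'DBTF', 'T1', 'texts_dir', 'dics_dir')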
help="""Predictions file.""") parser.add_argument('--i1', type=str, default=None, help="""Data folder.""") parser.add_argument('--i2', type=str, default=None, help="""Root folder.""") args = parser.parse_args() task = args.task predictions = args.predictions silver_standard = read_as_list(args.i1 + '/ExTRI_confidence', encoding='latin-1')[2:] positive_abstracts, positive = [], [] for l in silver_standard: l_ = l.split('\t') try: if l_[-1] == 'High': positive_abstracts.append(str(l_[0].split(':')[0])) except: continue positive_abstracts = list(set(positive_abstracts)) if task == 'triage': labels = read_as_list(predictions + '.txt', encoding='latin-1') pmids = read_as_list(args.i2 + '/triage_test_pmids.txt',
def build_data(l_texts, l_ann, type_data, f2, option, out_file):
    ''' Exports data. '''

    nlp = spacy.load('en')

    original, all_sentences, tags, l_gene1, l_gene2, l_pmids = [], [], [], [], [], []

    def find_s_e(e, tag):
        # Returns the start offset, end offset and surface form of a brat entity line.
        word = e.split('\t')[2]
        b = e.split('\t')[1].split(' ')[1]
        e = e.split('\t')[1].split(' ')[2]
        return int(b), int(e), str(word)

    for i, a in enumerate(l_ann):

        if len(a.split('.')[0].split(':')) == 2:
            sentence_index = a.split('.')[0].split(':')[1]
        else:
            sentence_index = 'None'

        already = []

        try:
            ann = read_as_list(type_data + '/' + a, encoding=encoding)
            txt = read_as_list(type_data + '/' + a.split('.')[0] + '.txt', encoding=encoding)
            txt = ''.join(txt)

            relations = [x for x in ann if x[0] == 'R']

            if type_data == 'train':
                entities = [x for x in ann if x[0] == 'T' and x.split('\t')[1][0:14] != 'AnnotatorNotes']
            else:
                entities = ann

            n_dbtfs = [x for x in entities if x.split('\t')[1].split(' ')[0] == 'DBTF']

            # If there is at least one DBTF and at least two entities...
            if len(n_dbtfs) > 0 and len(entities) > 1:

                # Build positive sentences!
                if relations:
                    for r in relations:
                        tag = r.split('\t')[1].split(' ')[0]
                        ent1, ent2 = r.split('\t')[1].split(' ')[1].split(':')[1], r.split('\t')[1].split(' ')[2].split(':')[1]

                        for e in entities:
                            if e.split('\t')[0] == ent1 and e.split('\t')[1][0:14] != 'AnnotatorNotes':
                                b1, e1, word1 = find_s_e(e, ent1)
                            if e.split('\t')[0] == ent2 and e.split('\t')[1][0:14] != 'AnnotatorNotes':
                                b2, e2, word2 = find_s_e(e, ent2)

                        invert = False
                        if b1 > b2:
                            invert = True
                        if not invert:
                            out = txt[:b1] + 'gene1' + txt[e1:b2] + 'gene2' + txt[e2:]
                        if invert:
                            out = txt[:b2] + 'gene2' + txt[e2:b1] + 'gene1' + txt[e1:]

                        s_ = nltk.sent_tokenize(out)
                        sentence = []
                        for i, s in enumerate(s_):
                            if "gene1" in s and "gene2" in s:
                                sentence = s
                                idx = i

                        if sentence:
                            sentence = sentence.replace('_', ' ')
                            sentence = sentence.replace('-', ' ')

                            # Mask every other annotated entity as 'genex'.
                            for e in entities:
                                if e.split('\t')[0][0] == 'T' and e.split('\t')[1][0:14] != 'AnnotatorNotes':
                                    w = e.split('\t')[2].replace('_', ' ')
                                    w = ' '.join(w.replace('-', ' ').split())
                                    if w:
                                        sentence = sentence.replace(w, 'genex')

                            all_entities = []
                            all_entities.append('DBTF')
                            sentence, all_entities = find_experimental_methods(f2, sentence, all_entities)
                            out1 = preprocess_text(sentence, nlp, all_entities)

                            if 'gene1' in out1 and 'gene2' in out1 and out1 not in all_sentences:
                                all_sentences.append(out1)
                                tags.append(tag.lower())

                                if sentence_index == 'None':
                                    export = ' '.join(nltk.sent_tokenize(txt)[int(idx)].replace('_', ' ').replace('-', ' ').split())
                                    original_ = out1[0:4] + export
                                    original.append(original_)
                                    ID = a.split('.')[0] + ':' + str(idx)
                                else:
                                    txt = ' '.join(txt.replace('_', ' ').replace('-', ' ').split())
                                    original_ = out1[0:4] + txt
                                    original.append(original_)
                                    ID = a.split('.')[0]

                                word1, word2 = word1.replace('_', ' '), word2.replace('_', ' ')
                                word1, word2 = ' '.join(word1.replace('-', ' ').split()), ' '.join(word2.replace('-', ' ').split())

                                l_gene1.append(word1)
                                l_gene2.append(word2)
                                l_pmids.append(ID)
                                already.append(word1)
                                already.append(word2)

                # Build negative sentences!
                non_relations = []
                for e in entities:
                    if e[0] == 'T' and e.split('\t')[1][0:14] != 'AnnotatorNotes':
                        if ' '.join(e.split('\t')[2].replace('_', ' ').replace('-', ' ').split()) not in already:
                            non_relations.append(e)

                DBTF = [x for x in non_relations if x.split('\t')[1].split(' ')[0] == 'DBTF']
                combinations = [(x, y) for x in DBTF for y in non_relations
                                if y.split('\t')[1].split(' ')[0] == 'DBTF' or y.split('\t')[1].split(' ')[0] == 'NONDBTF']

                if combinations:
                    for d in combinations:
                        # Combine all DBTF with every possible other entity different from it. No self-regulation!
                        b1, e1 = int(d[0].split('\t')[1].split(' ')[1]), int(d[0].split('\t')[1].split(' ')[2])
                        word1 = ' '.join(d[0].split('\t')[2].replace('_', ' ').replace('-', ' ').split())
                        b2, e2 = int(d[1].split('\t')[1].split(' ')[1]), int(d[1].split('\t')[1].split(' ')[2])
                        word2 = ' '.join(d[1].split('\t')[2].replace('_', ' ').replace('-', ' ').split())

                        if b1 != b2 and word1 != word2:
                            invert = False
                            if b1 > b2:
                                invert = True
                            if not invert:
                                out = txt[:b1] + 'gene1' + txt[e1:b2] + 'gene2' + txt[e2:]
                            if invert:
                                out = txt[:b2] + 'gene2' + txt[e2:b1] + 'gene1' + txt[e1:]

                            s_ = nltk.sent_tokenize(out)
                            sentence = []
                            for i, s in enumerate(s_):
                                if "gene1" in s and "gene2" in s:
                                    sentence = s
                                    idx = i

                            if sentence:
                                sentence = sentence.replace('_', ' ')
                                sentence = sentence.replace('-', ' ')

                                # Mask every other annotated entity as 'genex'.
                                for e in entities:
                                    if e.split('\t')[0][0] == 'T' and e.split('\t')[1][0:14] != 'AnnotatorNotes':
                                        w = e.split('\t')[2].replace('_', ' ')
                                        w = ' '.join(w.replace('-', ' ').split())
                                        sentence = sentence.replace(w, 'genex')

                                all_entities = []
                                all_entities.append('DBTF')
                                sentence, all_entities = find_experimental_methods(f2, sentence, all_entities)
                                out1 = preprocess_text(sentence, nlp, all_entities)

                                if 'gene1' in out1 and 'gene2' in out1 and out1 not in all_sentences:
                                    all_sentences.append(out1)
                                    tags.append('none')

                                    if sentence_index == 'None':
                                        export = ' '.join(nltk.sent_tokenize(txt)[int(idx)].replace('_', ' ').replace('-', ' ').split())
                                        original_ = out1[0:4] + export
                                        original.append(original_)
                                        ID = a.split('.')[0] + ':' + str(idx)
                                    else:
                                        txt = ' '.join(txt.replace('_', ' ').replace('-', ' ').split())
                                        original_ = out1[0:4] + txt
                                        original.append(original_)
                                        ID = a.split('.')[0]

                                    l_gene1.append(word1)
                                    l_gene2.append(word2)
                                    l_pmids.append(ID)

        except Exception:
            continue

    df = pd.DataFrame()
    df['all_sentences'] = all_sentences
    df['tags'] = tags
    df['original'] = original
    df['l_gene1'] = l_gene1
    df['l_gene2'] = l_gene2
    df['l_pmids'] = l_pmids

    if option == 'test':
        df.to_csv(out_file + '/re_test.csv', index=False)

    write_list(tags, out_file + '/re_' + option + '_labels.txt', iterate=True, encoding=encoding)
    write_list(original, out_file + '/re_' + option + '_original.txt', iterate=True, encoding=encoding)
    write_list(all_sentences, out_file + '/re_' + option + '_preprocessed.txt', iterate=True, encoding=encoding)
parser = argparse.ArgumentParser(description='Hyper-parameters of the model.')
parser.add_argument('--train', type=str, help="""Train file.""")
parser.add_argument('--test', type=str, help="""Test file.""")
parser.add_argument('--labels', type=str, help="""Labels file.""")
parser.add_argument('--features', type=str, help="""Features to be used.""")
parser.add_argument('--report', type=str, help="""If yes, predict unseen data.""")
parser.add_argument('--dictionary', type=str, help="""Dictionary directory.""")
parser.add_argument('--o', type=str, help="""Output folder.""")
args = parser.parse_args()

train = read_as_list(args.train, encoding='latin-1')

if args.report == 'yes':
    test = read_as_list(args.test, encoding='latin-1')
if args.report == 'no':
    test = []

labels = read_as_list(args.labels, encoding='latin-1')

f = args.features
REPORT = args.report
out_dic = args.o
dic_dir = args.dictionary

run_SVM(train, test, labels, f, REPORT, out_dic, dic_dir)
def merge(f, f2, f3, f_out):
    ''' Merges NTNU and GNormPlus entity annotations for each abstract. '''

    l_ntnu = [x for x in listdir(f) if x.endswith('.minfner')]
    l_gnormplus = [x for x in listdir(f2) if x.endswith('.minfner')]
    l_text = [x for x in listdir(f3) if x.endswith('.txt')]

    for text in l_text:

        rl, write_out, already, final_merge = [], [], [], []
        tx = read_as_list(f3 + '/' + text, encoding=encoding)

        ann = text + '.out.minfner'
        ntnu_boolean, gn_boolean = False, False

        if ann in l_ntnu:
            ntnu = read_as_list(f + '/' + ann, encoding=encoding)
            ntnu = ['N_' + s for s in ntnu]
            ntnu_boolean = True

        ann = text.split('.')[0] + ':0.txt.out.minfner'

        if ann in l_gnormplus:
            gn = read_as_list(f2 + '/' + ann, encoding=encoding)
            gn = ['G_' + s for s in gn]
            gn_boolean = True

        # Merge both tools and keep only entities
        if ntnu_boolean and gn_boolean:
            entities = ntnu + gn
        elif ntnu_boolean and not gn_boolean:
            entities = ntnu
        elif not ntnu_boolean and gn_boolean:
            entities = gn
        else:
            entities = []

        # Keep all N_DBTF
        final_merge += [x for x in entities
                        if x[0] == 'N' and x.split('\t')[1].split(' ')[0] == 'DBTF']
        already = [(x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2])
                   for x in final_merge if x[2] == 'T']

        # Keep all G_NONDBTF
        final_merge += [x for x in entities
                        if x[0] == 'G' and x.split('\t')[1].split(' ')[0] == 'NONDBTF'
                        and (x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2]) not in already]
        already = [(x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2])
                   for x in final_merge if x[2] == 'T']

        # Keep all N_NONDBTF
        final_merge += [x for x in entities
                        if x[0] == 'N' and x.split('\t')[1].split(' ')[0] == 'NONDBTF'
                        and (x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2]) not in already]
        already = [(x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2])
                   for x in final_merge if x[2] == 'T']

        # Keep all G_DBTF
        final_merge += [x for x in entities
                        if x[0] == 'G' and x.split('\t')[1].split(' ')[0] == 'DBTF'
                        and (x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2]) not in already]
        already = [(x.split('\t')[1].split(' ')[1], x.split('\t')[1].split(' ')[2])
                   for x in final_merge if x[2] == 'T']

        # Keep the annotator notes attached to the entities kept above.
        elements = [x.split('\t')[0][3:] for x in final_merge]
        final_merge += [x for x in entities if x[0:3] == 'N_#' and x.split('\t')[0][3:] in elements]

        entities = [x[2:] for x in final_merge]
        ann_out = text.split('.')[0] + '.ann'

        if entities:
            for e in entities:
                if e[0] == 'T':
                    e_ = e.split('\t')
                    entity, tag, start, end, word = e_[0], e_[1].split(' ')[0], e_[1].split(' ')[1], e_[1].split(' ')[2], e_[2]
                    write_out.append(str(entity) + '\t' + tag + ' ' + str(start) + ' ' + str(end) + '\t' + word)
                if e[0] == '#':
                    e_ = e.split('\t')
                    entity, ID = e_[0], ' '.join(e_[1].split(' ')[1:]) + ' ' + e_[2]
                    write_out.append(str(entity) + '\t' + 'AnnotatorNotes T' + str(entity[1:]) + '\t' + str(ID))

        if write_out:
            write_list(write_out, f_out + '/' + ann_out, iterate=True, encoding=encoding)
            write_list(tx, f_out + '/' + text, iterate=True, encoding=encoding)
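# Example (hypothetical folder names, shown only to illustrate the expected call;
# the real pipeline wires these folders through its own driver scripts):
#
#   merge('ntnu_minfner_dir', 'gnormplus_minfner_dir', 'abstracts_txt_dir', 'merged_out_dir')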
def build_data(folder, folder2, folder3, pubtator_articles, option):
    ''' Exports data. '''

    out, out_original, labels, list_pmids = [], [], [], []

    for txt in pubtator_articles:

        text_original = read_as_list(folder + '/' + txt, encoding=encoding)[0]
        text = text_original
        PMID = txt.split('.')[0]

        ann = read_as_list(folder + '/' + PMID + '.ann', encoding=encoding)  # Read annotations
        ann = [x for x in ann if x[0] == 'T']

        if option == 'train':
            label = read_as_list(folder2 + '/triage_train_labels.txt', encoding=encoding)  # Read labels
            PMIDs = read_as_list(folder2 + '/triage_train_pmids.txt', encoding=encoding)  # Read PMIDs
            label_txt = label[PMIDs.index(PMID)]

        lookback, delete, all_entities = [], [], []

        # Build the list of entity start/end offsets and drop entities nested inside other entities ...
        for x in ann:
            lookback.append((x.split('\t')[1].split(' ')[1] + ' ' + x.split('\t')[1].split(' ')[2]))

        for x in lookback:
            copy = lookback.copy()
            copy.remove(x)
            start = x.split(' ')[0]
            for el in copy:
                if int(start) in range(int(el.split(' ')[0]), int(el.split(' ')[1])):
                    delete.append(start)

        ann = [x.split('\t') for x in ann if x.split('\t')[1].split(' ')[1] not in delete]
        ann = [x[1].split(' ') for x in ann]
        ann.sort(key=lambda x: int(x[1]))

        # Replace each entity span by its tag, keeping character offsets consistent.
        c = 0
        for a in ann:
            tag, start, end = a[0], int(a[1]) + c, int(a[2]) + c
            all_entities.append(tag)
            c += len(tag) - end + start
            text = text[:start] + tag + text[end:]

        # Find experimental settings !
        text, all_entities = find_experimental_methods(folder3, text, all_entities)
        text = preprocess(text)

        count_entities = Counter(all_entities)

        # Features: DBTF and EXPMETHOD (boolean)
        order_entities = ['EXPMETHOD', 'DBTF']
        for entity in order_entities:
            if entity in count_entities:
                count_entities[entity] = 1
            else:
                count_entities[entity] = 0
            text = str(count_entities[entity]) + ' ' + text
            text_original = str(count_entities[entity]) + ' ' + text_original

        out.append(text)
        out_original.append(text_original)

        if option == 'train':
            labels.append(label_txt)
        if option == 'test':
            list_pmids.append(':'.join(txt.split(':')[0:2]))

    return out, out_original, labels, list_pmids
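# Example (hypothetical folders and file name, shown only to illustrate the call shape
# and the returned lists; not part of the original pipeline):
#
#   out, out_original, labels, list_pmids = build_data(
#       'brat_dir', 'triage_labels_dir', 'methods_dir', ['12345678.txt'], 'train')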
Date: 07.10.2019
'''

import argparse
from os import listdir

from export_abstracts import read_as_list, write_list

if '__main__' == __name__:

    ''' Removes wrong genes. '''

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--i1', type=str, help="""Data folder.""")
    parser.add_argument('--i2', type=str, help="""Annotations folder (.ann files).""")
    args = parser.parse_args()

    to_avoid = read_as_list(args.i1 + '/to_avoid.txt', encoding='latin-1')
    l_ann = [f for f in listdir(args.i2) if f.endswith('.ann')]

    for a in l_ann:
        ann = read_as_list(args.i2 + '/' + a, encoding='latin-1')
        final = []
        for p in ann:
            if p[0] == 'T':
                # Keep only entities whose surface form is not in the blacklist.
                if p.split('\t')[2].replace(' ', '') not in to_avoid:
                    final.append(p)
        # Keep the annotator notes attached to the entities that were kept.
        entities = [x.split('\t')[0][1:] for x in final]
        final += [x for x in ann if x[0] == '#' and x.split('\t')[0][1:] in entities]
    return df


if '__main__' == __name__:

    ''' Export triage PMIDs and labels of annotated abstracts. '''

    encoding = "latin-1"

    parser = argparse.ArgumentParser(description='Options')
    parser.add_argument('--i', type=str, help="""Input directory.""")
    parser.add_argument('--o', type=str, help="""Output directory.""")
    args = parser.parse_args()

    ann1 = pd.read_csv(args.i + '/abstracts.all.labeled.csv', sep='\n|\t', encoding=encoding, engine='python')
    ann2 = read_as_list(args.i + '/hackaton_1.tsv', encoding=encoding)
    ann2 = [x.split('\t') for x in ann2]
    ann3 = read_as_list(args.i + '/hackaton_2.tsv', encoding=encoding)
    ann3 = [x.split('\t') for x in ann3]

    # Map boolean labels to integers: True -> 1, False -> 0.
    di = {True: 1, False: 0}
    ann1['label'].replace(di, inplace=True)

    pmid_1, l_1 = list(ann1['pmid']), list(ann1['label'])
    pmid_2, l_2 = [x[0] for x in ann2], [x[2] for x in ann2]
    pmid_3, l_3 = [x[0] for x in ann3], [x[2] for x in ann3]

    df = pd.DataFrame()
    df['pmid'] = pmid_1 + pmid_2 + pmid_3
    df['label'] = l_1 + l_2 + l_3
    df = df.astype({'pmid': int, 'label': int})