def genia_tokenizer(self,
                    input_path='./chemprot_test_gs/new_testing_examples.json',
                    output_path='./chemprot_test_gs/testing_tokenized.json'):
    '''Tokenize each example's sentence with the GENIA tagger and save the result.

    Reads a JSON list of example dicts (each carrying a 'sentence' key) from
    *input_path*, normalizes the GENIA token stream for every sentence
    (punctuation dropped, cardinal numbers collapsed to 'NUM'), rewrites each
    'sentence' in place, and dumps the updated list to *output_path*.

    Args:
        input_path: JSON file of examples to tokenize. Default keeps the
            original hard-coded test-set path, so existing callers are
            unaffected.
        output_path: destination JSON file for the tokenized examples.
    '''
    def _normalize(parsed):
        # GENIA parse entries are tuples; only entry[0] (surface word) and
        # entry[2] (POS tag) are used here. Skip bare punctuation and tokens
        # whose word equals its own POS tag (GENIA tags punctuation like '('
        # as '('); trim a trailing '..'; collapse cardinals (POS 'CD') to
        # the placeholder 'NUM'.
        tokens = []
        for entry in parsed:
            word, pos = entry[0], entry[2]
            if word in string.punctuation or word == pos:
                continue
            if word.endswith('..'):
                tokens.append(word[:-2])
            elif pos == 'CD':
                tokens.append('NUM')
            else:
                tokens.append(word)
        return tokens

    tagger = GeniaTagger('./tools/geniatagger-3.0.2/geniatagger')
    with open(input_path, 'r') as f:
        examples = json.load(f)
    for example in examples:
        parsed = tagger.parse(example['sentence'])
        example['sentence'] = ' '.join(_normalize(parsed))
    # 'w' (not the original 'w+'): the file is only written, never read back.
    with open(output_path, 'w') as out:
        json.dump(examples, out, indent=4)
corpusReader = PlaintextCorpusReader(corpusRoot, ".*.txt", encoding=codec)
# Context manager guarantees the output file is flushed and closed even if
# tagging raises -- the original opened the handle and never closed it.
with open(corpusRoot + "genia_and_backoff.txt", "w") as outFile:
    for journal in corpusReader.fileids():
        print("******* start " + journal)
        sentList = corpusReader.sents(journal)
        for sent in sentList:
            taggedList = t2.tag(sent)
            for tag in taggedList:
                if tag[1] == "UNK":
                    # Backoff: re-tag unknown words with GENIA. Column [2]
                    # of a GENIA entry is the POS tag; column [4] is a
                    # BIO-style tag ('O' or 'B-XXX'/'I-XXX') -- when it is
                    # not 'O', the class after the dash is used instead.
                    genia_tag_list = tagger.parse(tag[0])
                    for genia_tag in genia_tag_list:
                        if genia_tag[4] == "O":
                            outFile.write(genia_tag[0] + "/" + genia_tag[2] + " ")
                        else:
                            new_tag = genia_tag[4].split("-")[1]
                            outFile.write(genia_tag[0] + "/" + new_tag + " ")
                else:
                    outFile.write(tag[0] + "/" + tag[1] + " ")
        # NOTE(review): source arrived with indentation collapsed; the blank
        # separator is placed per journal (next to the matching end marker) --
        # confirm against the original layout.
        outFile.write("\n\n")
        print("##### end " + journal)
import os

from nltk.tokenize import PunktSentenceTokenizer
from geniatagger import GeniaTagger

# expanduser is required here: neither GeniaTagger nor the underlying
# open()/subprocess call expands '~', so the literal path would not resolve.
tagger = GeniaTagger(os.path.expanduser('~/qwerty/shashank/geniatagger-3.0.2/geniatagger'))
print(tagger.parse('This is a pen.'))
#print(tagger.parse('tis is pen'))
#print(data)
# NOTE(review): train_data is not defined in this chunk -- presumably loaded
# earlier in the file; verify before running standalone.
med_tokenizer = PunktSentenceTokenizer(train_data)
corpusReader = PlaintextCorpusReader(corpusRoot, ".*.txt", encoding=codec)
# 'with' instead of a manual close(): the original close() was not in a
# try/finally, so an exception mid-loop would leak the handle and could
# lose buffered output.
with open(corpusRoot + "genia_and_backoff.txt", "w") as outFile:
    for journal in corpusReader.fileids():
        print("******* start " + journal)
        sentList = corpusReader.sents(journal)
        for sent in sentList:
            taggedList = t2.tag(sent)
            for tag in taggedList:
                if tag[1] == "UNK":
                    # Backoff: GENIA re-tags words the primary tagger (t2)
                    # marked UNK. genia_tag[4] looks like a BIO tag: 'O'
                    # keeps the plain POS (genia_tag[2]); otherwise the
                    # class after the dash becomes the tag.
                    genia_tag_list = tagger.parse(tag[0])
                    for genia_tag in genia_tag_list:
                        if genia_tag[4] == "O":
                            outFile.write(genia_tag[0] + "/" + genia_tag[2] + " ")
                        else:
                            new_tag = genia_tag[4].split("-")[1]
                            outFile.write(genia_tag[0] + "/" + new_tag + " ")
                else:
                    outFile.write(tag[0] + "/" + tag[1] + " ")
        # NOTE(review): indentation was collapsed in the source; separator
        # placed per journal to match the end marker -- confirm.
        outFile.write("\n\n")
        print("##### end " + journal)
def annotate_text(tager=''):
    """Re-annotate EBM outcome spans using POS-based span correction.

    For each abstract in the aggregated EBM extract, collects the tokens of
    every annotated outcome span, POS-tags them with the tagger chosen by
    *tager* ('genia', 'medpost' or 'stanford'), asks
    correcting_spans.correct_text() to clean the span, writes the corrected
    token/label sequence to a .bmes file and the (label, outcome) pairs to a
    CSV.

    NOTE(review): this function arrived with all indentation collapsed onto
    two physical lines; the nesting below is a careful reconstruction --
    verify against the original file before relying on it.
    """
    # Three alternative taggers are constructed; only the one named by
    # `tager` is actually used below.
    genia = GeniaTagger('../genia-tagger/geniatagger-3.0.2/geniatagger')
    medpost = spacy.load(os.path.abspath('trained_tagger'))
    stanford = StanfordCoreNLP('http://localhost:9000')
    main_dir = 'corrected_outcomes'
    data_dir = os.path.abspath(os.path.join(main_dir, 'aggregated'))
    create_storage_dirs([data_dir])
    sub_dir = os.path.abspath(os.path.join(data_dir, 'test'))
    if not os.path.exists(os.path.dirname(sub_dir)):
        os.makedirs(os.path.dirname(sub_dir))
    # Load aggregated annotations; `e` is a project module -- presumably the
    # EBM-NLP reader. TODO confirm the shape of ebm_extract's values.
    turker, ebm_extract = e.read_anns('hierarchical_labels', 'outcomes', \
        ann_type='aggregated', model_phase='train')
    seq_dir = os.path.abspath(os.path.join(os.path.curdir, 'corrected_outcomes', 'test'))
    create_storage_dirs([seq_dir])
    ebm_csv = []
    start = time.time()
    with open(os.path.join(seq_dir, 'test_medpost.bmes'), 'w') as f:
        for pmid, doc in ebm_extract.items():
            abstract = ' '.join(i for i in doc.tokens)
            #pprint(abstract)
            u = doc.anns['AGGREGATED']  # per-token label ids (0 = no outcome) -- TODO confirm
            v = doc.tokens              # per-token surface text
            o = []                      # (token index, label id) pairs of the current span
            corr_outcomes = []
            temp, temp_2 = [], []
            t = 0
            m = 0
            o_come = e.print_labeled_spans_2(doc)[0] #extract outcomes from the abstract being examined, [(Outcome-type, Outcome), (Outcome-type, Outcome2)]
            #store the annotations and the index of the annotations for each abstract
            for x in range(len(u)):
                if x == t:
                    if u[x] != 0:
                        # NOTE(review): nesting reconstructed from collapsed
                        # source. Effectively only the first entry of o_come
                        # is consumed: one (index, label) pair is recorded
                        # per token of that outcome's text, then it is popped.
                        for ff in o_come:
                            for j in range(len(u)):
                                if j < len(ff[1].split()):
                                    o.append((t, u[x]))
                                    t += 1
                            break
                        o_come.pop(0)
                        txt_toks = [v[i[0]] for i in o]
                        text_wrds = ' '.join(i for i in txt_toks)
                        corr = correcting_spans.correct_text()
                        text_wrds = corr.statTerm_keyWord_punct_remove(text_wrds)
                        # POS-tag the span text with the selected tagger.
                        if tager.lower() == 'genia':
                            tagged = genia.parse(text_wrds)
                            pos = [i[2] for i in tagged]
                        elif tager.lower() == 'medpost':
                            tagged = medpost(text_wrds)
                            pos = [i.tag_ for i in tagged]
                        elif tager.lower() == 'stanford':
                            pos = []
                            for elem in word_tokenize(text_wrds):
                                stan = stanford.annotate(elem, properties={'annotators':'pos', 'outputFormat':'json'})
                                pos.append(stan['sentences'][0]['tokens'][0]['pos'])
                        text_pos = ' '.join(i for i in pos)
                        label = core_outcome[u[x]]
                        corrected_spans = corr.pos_co_occurrence_cleaning(text_wrds, text_pos, label)
                        # Write the (possibly corrected) span back into the
                        # token/label arrays: no spans -> clear labels;
                        # otherwise relabel only tokens that appear in the
                        # corrected span text.
                        if len(corrected_spans) == 0:
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = [0 for i in range(len(txt_toks))]
                        elif len(corrected_spans) < 2:
                            span = corrected_spans[0]
                            s = [i for i in span[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll
                        else:
                            s = [i for j in corrected_spans for i in j[1].split()]
                            ll = [o[0][1] if i in s else 0 for i in txt_toks]
                            v[o[0][0]:(o[-1][0] + 1)] = txt_toks
                            u[o[0][0]:(o[-1][0] + 1)] = ll
                        p = [i for i in corrected_spans]
                        if len(p) > 0:
                            for i in p:
                                corr_outcomes.append(i)
                        o.clear()
                    else:
                        # No outcome at this position: advance the cursor.
                        t += 1
            # Emit the corrected sequence for this abstract, one item per
            # line, blank line between abstracts (BMES-style output).
            if corr_outcomes:
                temp_2 = build_sequence_model(v, u, core_outcome, corr_outcomes)
                qq = 1
                for i in temp_2:
                    print(qq, i)
                    f.write('{}\n'.format(i))
                    qq += 1
                f.write('\n')
                for k in corr_outcomes:
                    ebm_csv.append(k)
    ebm_csv_df = pd.DataFrame(ebm_csv, columns=['Label','Outcome'])
    ebm_csv_df.to_csv(os.path.join(os.path.abspath(os.path.curdir), 'corrected_outcomes/test/labels_outcomes_medpost.csv'))
    # NOTE: f was already closed by the with-block; this close is a no-op.
    f.close()
    print("Duration {}".format(time.time() - start))