def main(): start_time = time() wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl' links_file = 'e:/el/tmpres/demo/wiki-all/links.txt' description_file = 'e:/el/tmpres/demo/wiki-all/text.txt' wiki_candidates_file = 'e:/el/tmpres/wiki/dict/name_candidates.pkl' word_idf_file = 'e:/el/tmpres/demo/word_idf.txt' tfidf = TfIdf(word_idf_file) wiki_info = WikiInfo(wiki_info_file, links_file, description_file) wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf) input_file = 'input/00000001.txt' fin = open(input_file, 'rb') doc_text = fin.read() doc_text = doc_text.decode('utf-8') fin.close() pos = input_file.rfind('/') file_name = input_file[pos + 1:] ner_result_file = os.path.join('output', file_name + '.ner') merged_mention_list = mentiondetection.clean_ner_result(ner_result_file) merged_mention_list.sort(key=lambda x: x.span[0]) wiki_link.link_all(doc_text, merged_mention_list) for mention in merged_mention_list: if (not mention.mesh_id) and mention.chebi_id < 0 < mention.wid: cur_name = doc_text[mention.span[0]:mention.span[1] + 1].lower() print cur_name, mention.wid, wiki_info.get_info(mention.wid)[0] print time() - start_time
def __test(): word_idf_file = 'e:/data/el/tmpres/demo/word_idf.txt' wiki_candidates_file = 'e:/data/el/tmpres/wiki/dict/name_candidates.pkl' wiki_info_file = 'e:/data/el/tmpres/demo/wiki-all/wiki-info.pkl' links_file = 'e:/data/el/tmpres/demo/wiki-all/links.txt' description_file = 'e:/data/el/tmpres/demo/wiki-all/text.txt' input_file = 'input/rsv1407.txt' ner_result_file = 'output/rsv1407.txt.ner' sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') fin = open(input_file, 'rb') doc_text = fin.read() doc_text = doc_text.replace('\r\n', '\n') doc_text = doc_text.decode('utf-8') fin.close() mentions = mentiondetection.clean_ner_result(ner_result_file) # wiki_info = WikiInfo(wiki_info_file, links_file, description_file) # tfidf = TfIdf(word_idf_file) # wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf) # med_link = MedLink(wiki_info=wiki_info, wiki_link=wiki_link) # mentions = med_link.mdel(input_file) # med_link.link_mentions(mentions, doc_text) for m in mentions: print '%d\t%d\t%s\t%s\t%d\t%d' % (m.span[0], m.span[1], m.name, m.mesh_id, m.chebi_id, m.wid)
def mdel(self, file_path, ner_result_file):
    """Mention detection + entity linking for one document.

    Reads the UTF-8 document text from *file_path*, loads the pre-computed
    NER output from *ner_result_file*, and returns the result of
    self.link_text on the text and cleaned mentions.
    """
    # mention detection
    # pos = file_path.rfind('/')
    # file_name = file_path[pos + 1:]
    # ner_result_file = os.path.join('output', file_name + '.ner')
    # read text; 'with' guarantees the handle is closed even if decode raises.
    # NOTE(review): unlike the other mdel overload, this one does NOT
    # normalize '\r\n' before decoding — confirm that is intentional.
    with open(file_path, 'rb') as fin:
        doc_text = fin.read().decode('utf-8')
    merged_result_list = mentiondetection.clean_ner_result(ner_result_file)
    return self.link_text(doc_text, merged_result_list)
def __process_file(med_link, sent_detector, text_file, ner_file,
                   dbcursor=None, dbconn=None, fout_me=None, fout_el=None):
    """Link the pre-computed NER mentions of one document and store them.

    med_link: linker with a link_mentions(mentions, doc_text) method.
    sent_detector: sentence tokenizer providing span_tokenize(doc_text).
    text_file / ner_file: paths to the raw document and its NER output.
    dbcursor/dbconn/fout_me/fout_el: optional sinks forwarded unchanged
    to __store_mentions (database and/or file output).
    """
    # mentions = med_link.mdel(filepath)
    mentions = mentiondetection.clean_ner_result(ner_file)

    # 'with' guarantees the handle is closed even if decode raises.
    # Normalize CRLF before decoding so spans match the NER offsets.
    with open(text_file, 'rb') as fin:
        doc_text = fin.read().replace('\r\n', '\n').decode('utf-8')

    med_link.link_mentions(mentions, doc_text)
    sent_spans = sent_detector.span_tokenize(doc_text)
    __store_mentions(mentions, doc_text, sent_spans, text_file, dbcursor, dbconn, fout_me, fout_el)
def mdel(self, file_path):
    """Mention detection for one document.

    Derives the NER result path as output/<basename>.ner, reads and
    normalizes the document text, and returns the result of
    self.link_mentions_info on the text and cleaned mentions.
    """
    # mention detection: os.path.basename handles both '/' and '\\'
    # separators, unlike the old rfind('/') (which broke for Windows
    # backslash paths — the hard-coded 'e:/...' paths show this runs there).
    file_name = os.path.basename(file_path)
    ner_result_file = os.path.join('output', file_name + '.ner')

    # read text; 'with' guarantees the handle is closed even if decode raises.
    # Normalize CRLF before decoding so spans match the NER offsets.
    with open(file_path, 'rb') as fin:
        doc_text = fin.read().replace('\r\n', '\n').decode('utf-8')

    merged_result_list = mentiondetection.clean_ner_result(ner_result_file)
    return self.link_mentions_info(doc_text, merged_result_list)
def main(): mention_result_file = 'e:/el/tmpres/NER/NER/output/00000001.txt' merged_result_list = mentiondetection.clean_ner_result(mention_result_file) for val in merged_result_list: print val.span, val.mtype