Example #1
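End-to-end demo: builds the TF-IDF model and the Wikipedia resources, loads the cleaned NER result for one input document, links every mention, and prints the mentions that resolved to a Wikipedia page but not to MeSH or ChEBI. The 'e:/...' paths are the original author's local resource files.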
from time import time
import os

import mentiondetection  # project module providing clean_ner_result
# TfIdf, WikiInfo and WikiLink are this project's own classes; their
# import paths are not shown in the original snippet.


def main():
    start_time = time()

    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    wiki_candidates_file = 'e:/el/tmpres/wiki/dict/name_candidates.pkl'
    word_idf_file = 'e:/el/tmpres/demo/word_idf.txt'

    tfidf = TfIdf(word_idf_file)

    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)

    input_file = 'input/00000001.txt'
    fin = open(input_file, 'rb')
    doc_text = fin.read()
    doc_text = doc_text.decode('utf-8')
    fin.close()

    file_name = os.path.basename(input_file)
    ner_result_file = os.path.join('output', file_name + '.ner')
    merged_mention_list = mentiondetection.clean_ner_result(ner_result_file)

    merged_mention_list.sort(key=lambda x: x.span[0])
    wiki_link.link_all(doc_text, merged_mention_list)

    # report mentions that resolved to a Wikipedia page but not to a
    # MeSH or ChEBI entry
    for mention in merged_mention_list:
        if (not mention.mesh_id) and mention.chebi_id < 0 < mention.wid:
            cur_name = doc_text[mention.span[0]:mention.span[1] + 1].lower()
            print cur_name, mention.wid, wiki_info.get_info(mention.wid)[0]

    print time() - start_time  # elapsed seconds
Example #2
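Test driver that stops after mention detection: it loads the cleaned NER result and dumps each mention as a tab-separated line. The entity-linking steps are kept but commented out, so the resource paths and the Punkt sentence detector are defined without being used.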
import nltk

import mentiondetection  # project module providing clean_ner_result


def __test():
    word_idf_file = 'e:/data/el/tmpres/demo/word_idf.txt'
    wiki_candidates_file = 'e:/data/el/tmpres/wiki/dict/name_candidates.pkl'
    wiki_info_file = 'e:/data/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/data/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/data/el/tmpres/demo/wiki-all/text.txt'

    input_file = 'input/rsv1407.txt'
    ner_result_file = 'output/rsv1407.txt.ner'
    # sentence splitter; not used in this trimmed-down test
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    fin = open(input_file, 'rb')
    doc_text = fin.read()
    doc_text = doc_text.replace('\r\n', '\n')
    doc_text = doc_text.decode('utf-8')
    fin.close()

    mentions = mentiondetection.clean_ner_result(ner_result_file)

    # wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    # tfidf = TfIdf(word_idf_file)
    # wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    # med_link = MedLink(wiki_info=wiki_info, wiki_link=wiki_link)

    # mentions = med_link.mdel(input_file)

    # med_link.link_mentions(mentions, doc_text)

    for m in mentions:
        print '%d\t%d\t%s\t%s\t%d\t%d' % (m.span[0], m.span[1], m.name, m.mesh_id, m.chebi_id, m.wid)
Example #3
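A variant of the MedLink.mdel method that takes the NER result file as an explicit argument rather than deriving it from the input path (the old derivation is kept as comments). Assumes mentiondetection is imported at module level.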
    def mdel(self, file_path, ner_result_file):
        # mention detection
        # pos = file_path.rfind('/')
        # file_name = file_path[pos + 1:]
        # ner_result_file = os.path.join('output', file_name + '.ner')

        # read text
        fin = open(file_path, 'rb')
        doc_text = fin.read()
        doc_text = doc_text.decode('utf-8')
        fin.close()

        merged_result_list = mentiondetection.clean_ner_result(ner_result_file)
        return self.link_text(doc_text, merged_result_list)
Example #4
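Per-file driver: cleans the NER result, reads and normalizes the document text, links the mentions, computes sentence spans with the Punkt detector, and hands everything to the module's __store_mentions helper (not shown).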
def __process_file(med_link, sent_detector, text_file, ner_file, dbcursor=None, dbconn=None,
                   fout_me=None, fout_el=None):
    # mentions = med_link.mdel(filepath)
    mentions = mentiondetection.clean_ner_result(ner_file)

    fin = open(text_file, 'rb')
    doc_text = fin.read()
    doc_text = doc_text.replace('\r\n', '\n')
    doc_text = doc_text.decode('utf-8')
    fin.close()

    med_link.link_mentions(mentions, doc_text)

    # sentence spans are passed along, presumably so __store_mentions can
    # attach each mention's containing sentence
    sent_spans = sent_detector.span_tokenize(doc_text)
    __store_mentions(mentions, doc_text, sent_spans, text_file, dbcursor, dbconn, fout_me, fout_el)
Example #5
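The single-argument form of mdel: the NER result path is derived from the input file name under 'output/', the text is read and normalized, and the cleaned mentions are passed on to link_mentions_info. Assumes os and mentiondetection are imported at module level.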
    def mdel(self, file_path):
        # mention detection
        file_name = os.path.basename(file_path)
        ner_result_file = os.path.join('output', file_name + '.ner')

        # read text
        fin = open(file_path, 'rb')
        doc_text = fin.read()
        doc_text = doc_text.replace('\r\n', '\n')
        doc_text = doc_text.decode('utf-8')
        fin.close()

        merged_result_list = mentiondetection.clean_ner_result(ner_result_file)
        return self.link_mentions_info(doc_text, merged_result_list)
Example #6
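Minimal usage: clean one NER result file and print each mention's span and type.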
import mentiondetection  # project module providing clean_ner_result


def main():
    mention_result_file = 'e:/el/tmpres/NER/NER/output/00000001.txt'
    merged_result_list = mentiondetection.clean_ner_result(mention_result_file)
    for val in merged_result_list:
        print val.span, val.mtype
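All of these snippets consume the list returned by mentiondetection.clean_ner_result, but none shows the mention record itself. The stand-in below (Python 2, like the snippets above) is inferred purely from the attribute accesses in the examples: span carries inclusive character offsets (Example #1 slices span[1] + 1), mesh_id is falsy when unlinked, and chebi_id and wid are negative when unlinked. It is a hypothetical sketch for illustration, not the project's actual class.

# Hypothetical stand-in for the records returned by
# mentiondetection.clean_ner_result, inferred from the examples above;
# not taken from the project's source.
class Mention(object):
    def __init__(self, span, name, mtype, mesh_id='', chebi_id=-1, wid=-1):
        self.span = span          # (start, end) char offsets, end inclusive
        self.name = name          # surface form of the mention
        self.mtype = mtype        # NER type label (printed in Example #6)
        self.mesh_id = mesh_id    # MeSH id; falsy when unlinked
        self.chebi_id = chebi_id  # ChEBI id; negative when unlinked
        self.wid = wid            # Wikipedia page id; positive when linked


def dump_mentions(mentions):
    # same tab-separated layout the __test drivers above print
    for m in mentions:
        print '%d\t%d\t%s\t%s\t%d\t%d' % (m.span[0], m.span[1], m.name,
                                          m.mesh_id, m.chebi_id, m.wid)


# prints: 0	6	aspirin		-1	-1
dump_mentions([Mention((0, 6), u'aspirin', 'CHEMICAL')])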