Example #1
def gen_extra_description_for_mesh():
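    """Generate extra Wikipedia-derived descriptions for MeSH records that
    are linked to a Wikipedia page and write (mesh_id, description) pairs
    to dst_file."""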
    res_dir = 'e:/el/tmpres/demo/del-data/'
    mesh_record_file = res_dir + 'records_info_with_wiki.txt'
    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    word_idf_file = 'e:/el/tmpres/demo/word_idf.txt'

    dst_file = 'e:/el/tmpres/demo/extra_description_for_mesh.txt'

    tfidf = TfIdf(word_idf_file)
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    mesh_records = MeshRecord.load_mesh_records(mesh_record_file)

    fout = open(dst_file, 'wb')
    for i, (mesh_id, record) in enumerate(mesh_records.iteritems()):
        if record.wid < 0:
            continue
        info = wiki_info.get_info(record.wid)
        if info and info[1]:
            mesh_text = record.mesh_desc.decode('utf-8')
            wiki_text = info[1].decode('utf-8')
            extra_desc = __get_extra_wiki_description(mesh_text, wiki_text,
                                                      tfidf)
            fout.write('%s\n%s\n' % (mesh_id, extra_desc.encode('utf-8')))
    fout.close()
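The init_model examples below read this file back through ioutils.load_wiki_extra_descriptions. A minimal reader for the two-line id/description records written above might look like the following sketch (an assumption, not the project's actual ioutils code):

def load_extra_descriptions_sketch(filename):
    # each record is two lines: a MeSH id, then its extra description
    descs = dict()
    f = open(filename, 'rb')
    while True:
        mesh_id = f.readline()
        if not mesh_id:
            break
        descs[mesh_id.strip()] = f.readline().strip().decode('utf-8')
    f.close()
    return descs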
Example #2
import os
from time import time

import mentiondetection

def main():
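    """Run Wikipedia linking over one document's NER mentions and print the
    mentions that were resolved to a Wikipedia page only."""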
    start_time = time()

    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    wiki_candidates_file = 'e:/el/tmpres/wiki/dict/name_candidates.pkl'
    word_idf_file = 'e:/el/tmpres/demo/word_idf.txt'

    tfidf = TfIdf(word_idf_file)

    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)

    input_file = 'input/00000001.txt'
    fin = open(input_file, 'rb')
    doc_text = fin.read()
    doc_text = doc_text.decode('utf-8')
    fin.close()

    file_name = os.path.basename(input_file)
    ner_result_file = os.path.join('output', file_name + '.ner')
    merged_mention_list = mentiondetection.clean_ner_result(ner_result_file)

    merged_mention_list.sort(key=lambda x: x.span[0])
    wiki_link.link_all(doc_text, merged_mention_list)
    for mention in merged_mention_list:
        if (not mention.mesh_id) and mention.chebi_id < 0 < mention.wid:
            cur_name = doc_text[mention.span[0]:mention.span[1] + 1].lower()
            print cur_name, mention.wid, wiki_info.get_info(mention.wid)[0]

    print time() - start_time
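The mention objects produced by mentiondetection.clean_ner_result are accessed through their span, mesh_id, chebi_id and wid attributes; a hypothetical minimal record of the same shape (not the project's actual class) would be:

class MentionSketch(object):
    def __init__(self, span, mesh_id=None, chebi_id=-1, wid=-1):
        self.span = span          # (start, end) character offsets, end inclusive
        self.mesh_id = mesh_id    # resolved MeSH id, or None
        self.chebi_id = chebi_id  # resolved ChEBI id, -1 if none
        self.wid = wid            # resolved Wikipedia page id, -1 if none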
Example #3
from time import time

def test():
    start_time = time()

    text = 'last opportunities Texas senator Cruz'

    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)

    wiki_info_file = 'e:/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/el/tmpres/demo/wiki-all/text.txt'
    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    wiki_link = WikiLink('e:/el/tmpres/wiki/dict/name_candidates.pkl',
                         wiki_info, tfidf)
    context_tfidf = tfidf.get_tfidf_from_text(text)
    print wiki_link.link_with_context('cruz', context_tfidf)

    print time() - start_time
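TfIdf is constructed from a word-IDF file and used through get_tfidf_from_text and the static sim throughout these examples. The following is a rough, hypothetical stand-in showing one plausible implementation; the file format (word, tab, idf per line), the tokenization, and the normalization are all assumptions:

import math

class SimpleTfIdf(object):
    # hypothetical stand-in for the TfIdf class used in these examples
    def __init__(self, word_idf_file):
        self.word_idf = dict()
        f = open(word_idf_file, 'rb')
        for line in f:
            word, idf = line.strip().split('\t')
            self.word_idf[word] = float(idf)
        f.close()

    def get_tfidf_from_text(self, text):
        # term frequencies over a naive whitespace tokenization
        tf = dict()
        for word in text.lower().split():
            tf[word] = tf.get(word, 0) + 1
        # weight by IDF, then L2-normalize
        vec = dict()
        for word, cnt in tf.iteritems():
            if word in self.word_idf:
                vec[word] = cnt * self.word_idf[word]
        norm = math.sqrt(sum(v * v for v in vec.itervalues()))
        for word in vec:
            vec[word] = vec[word] / norm if norm > 0 else 0.0
        return vec

    @staticmethod
    def sim(vec_a, vec_b):
        # cosine similarity; with unit-length vectors this is a dot product
        return sum(w * vec_b[t] for t, w in vec_a.iteritems() if t in vec_b)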
Example #4
import ioutils

def init_model():
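    """Build a MedLink model from the MeSH, ChEBI and Wikipedia resources
    under res_dir."""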
    res_dir = 'e:/data/el/tmpres/demo/del-data/'
    extra_wiki_desc_file = res_dir + 'wiki_extra_sentences.txt'
    extra_parents_file = res_dir + 'extra_parents.txt'
    mesh_record_file = res_dir + 'records_info_with_wiki.txt'
    mesh_dict_file = res_dir + 'med_dict_ascii_with_ids_edited.txt'
    exclude_words_file = res_dir + 'exclude_words.txt'
    tree_number_file = res_dir + 'id_tn.txt'
    obo_file = res_dir + 'chebi.obo'

    word_idf_file = 'e:/data/el/tmpres/demo/word_idf.txt'

    wiki_candidates_file = 'e:/data/el/tmpres/wiki/dict/name_candidates.pkl'

    wiki_info_file = 'e:/data/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/data/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/data/el/tmpres/demo/wiki-all/text.txt'

    mesh_extra_description_file = 'e:/data/el/tmpres/demo/extra_description_for_mesh.txt'

    chebi_terms = ChebiTerm.load_obo_file(obo_file)
    print '%d chebi terms' % len(chebi_terms)

    mesh_match = MeshMatch(mesh_dict_file, exclude_words_file)
    mesh_records = MeshRecord.load_mesh_records(mesh_record_file)
    mesh_tree = MeshTree(tree_number_file, mesh_records)

    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    tfidf = TfIdf(word_idf_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    extra_wiki_desc = ioutils.load_wiki_extra_descriptions(
        mesh_extra_description_file)

    med_link = MedLink(extra_parents_file, mesh_match, mesh_records, mesh_tree,
                       chebi_terms, wiki_info, extra_wiki_desc, wiki_link)
    return med_link
Example #5
import os
import ioutils

def init_model():
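    """Build a MedLink model like init_model in the previous example, but
    with all resource paths assembled via os.path.join."""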
    res_dir = 'e:/data/el/tmpres/'
    # res_dir = '/home/dhl/data/el/tmpres/'
    del_res_dir = os.path.join(res_dir, 'demo/del-data/')

    extra_wiki_desc_file = os.path.join(del_res_dir,
                                        'wiki_extra_sentences.txt')
    extra_parents_file = os.path.join(del_res_dir, 'extra_parents.txt')
    mesh_record_file = os.path.join(del_res_dir, 'records_info_with_wiki.txt')
    mesh_dict_file = os.path.join(del_res_dir,
                                  'med_dict_ascii_with_ids_edited.txt')
    exclude_words_file = os.path.join(del_res_dir, 'exclude_words.txt')
    tree_number_file = os.path.join(del_res_dir, 'id_tn.txt')
    obo_file = os.path.join(del_res_dir, 'chebi.obo')

    word_idf_file = os.path.join(res_dir, 'demo/word_idf.txt')
    wiki_candidates_file = os.path.join(res_dir,
                                        'wiki/dict/name_candidates.pkl')
    wiki_info_file = os.path.join(res_dir, 'demo/wiki-all/wiki-info.pkl')
    links_file = os.path.join(res_dir, 'demo/wiki-all/links.txt')
    description_file = os.path.join(res_dir, 'demo/wiki-all/text.txt')
    mesh_extra_description_file = os.path.join(
        res_dir, 'demo/extra_description_for_mesh.txt')

    chebi_terms = ChebiTerm.load_obo_file(obo_file)

    mesh_match = MeshMatch(mesh_dict_file, exclude_words_file)
    mesh_records = MeshRecord.load_mesh_records(mesh_record_file)
    mesh_tree = MeshTree(tree_number_file, mesh_records)

    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    tfidf = TfIdf(word_idf_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    extra_wiki_desc = ioutils.load_wiki_extra_descriptions(
        mesh_extra_description_file)

    tmp_med_link = MedLink(extra_parents_file, mesh_match, mesh_records,
                           mesh_tree, chebi_terms, wiki_info, extra_wiki_desc,
                           wiki_link)
    return tmp_med_link
Example #6
def __init_mellink():
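    """Build a minimal MedLink that performs Wikipedia linking only."""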
    word_idf_file = 'e:/data/el/tmpres/demo/word_idf.txt'
    wiki_candidates_file = 'e:/data/el/tmpres/wiki/dict/name_candidates.pkl'
    wiki_info_file = 'e:/data/el/tmpres/demo/wiki-all/wiki-info.pkl'
    links_file = 'e:/data/el/tmpres/demo/wiki-all/links.txt'
    description_file = 'e:/data/el/tmpres/demo/wiki-all/text.txt'

    wiki_info = WikiInfo(wiki_info_file, links_file, description_file)
    tfidf = TfIdf(word_idf_file)
    wiki_link = WikiLink(wiki_candidates_file, wiki_info, tfidf)
    return MedLink(wiki_info=wiki_info, wiki_link=wiki_link)
Example #7
    def link_with_context(self, sname, context_tfidf):
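        """Rank the Wikipedia candidates for sname by TF-IDF similarity
        with the mention context. Returns (best_wid, ranked_wids), or
        (-1, []) when the name is unknown or resolves only to
        disambiguation pages."""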
        if not self.tfidf:
            return -1, []

        pos = bisect_left(self.name_list, sname)
        if pos == len(self.name_list) or self.name_list[pos] != sname:
            # surface name not in the candidate dictionary
            return -1, []
        beg_idx = self.beg_indices[pos]
        if pos == len(self.beg_indices) - 1:
            end_idx = len(self.candidates)
        else:
            end_idx = self.beg_indices[pos + 1]

        if end_idx == beg_idx + 1:
            tmpwid = self.candidates[beg_idx]
            wiki_info = self.wiki_info.get_info(tmpwid)
            if wiki_info and wiki_info[1]:
                if 'may refer to' in wiki_info[1] or 'may stand for' in wiki_info[1]:
                    return -1, []
            return tmpwid, [tmpwid]

        cur_candidates = list()
        for i in xrange(beg_idx, end_idx):
            cur_wid = self.candidates[i]
            wiki_info = self.wiki_info.get_info(cur_wid)
            if wiki_info and wiki_info[1]:
                if 'may refer to' in wiki_info[1] or 'may stand for' in wiki_info[1]:
                    continue

                candidate_tfidf = self.tfidf.get_tfidf_from_text(
                    wiki_info[1].decode('utf-8'))
                sim = TfIdf.sim(candidate_tfidf, context_tfidf)
                cur_candidates.append((cur_wid, sim))
        cur_candidates.sort(key=lambda x: -x[1])
        cur_candidates = [x[0] for x in cur_candidates]
        if not cur_candidates:
            # every candidate was a disambiguation page or had no description
            return -1, []
        return cur_candidates[0], cur_candidates
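link_with_context assumes a candidate dictionary laid out as parallel arrays: a sorted name_list, with beg_indices[pos] giving the offset of name_list[pos]'s first entry in the flat candidates array. A sketch of how such arrays might be built from a plain dict (this helper and its input format are assumptions, not part of the original class):

def build_candidate_arrays(name_to_candidates):
    # name_to_candidates: surface name -> list of candidate Wikipedia ids
    name_list, beg_indices, candidates = [], [], []
    for name in sorted(name_to_candidates):  # sorted, so bisect_left works
        name_list.append(name)
        beg_indices.append(len(candidates))  # start of this name's block
        candidates.extend(name_to_candidates[name])
    return name_list, beg_indices, candidates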
Example #8
from itertools import izip

def gen_extra_sentences():
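    """For every MeSH record with a linked Wikipedia page, keep the
    Wikipedia sentences that are not near-duplicates of sentences in the
    MeSH description. The merged description files interleave records: a
    MeSH line followed by its paired Wikipedia line."""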
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)

    mesh_id_wid_file = 'e:/el/tmpres/demo/merge/mesh_id_wid.txt'
    merged_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions.txt'
    merged_tokenized_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions_tokenized.txt'
    extra_sentence_file = 'e:/el/tmpres/demo/merge/wiki_extra_sentences.txt'

    mesh_ids = list()
    wids = list()
    fin = open(mesh_id_wid_file, 'rb')
    for line in fin:
        vals = line.strip().split('\t')
        mesh_ids.append(vals[0])
        wids.append(int(vals[1]))
    fin.close()

    fin_desc = open(merged_desc_file, 'rb')
    fin_token_desc = open(merged_tokenized_desc_file, 'rb')
    fout = open(extra_sentence_file, 'wb')
    for idx, (mesh_id, mesh_desc, mesh_token_desc) in enumerate(
            izip(mesh_ids, fin_desc, fin_token_desc)):
        mesh_token_desc = mesh_token_desc.strip()
        mesh_desc_words = mesh_token_desc.split(' ')
        mesh_sentence_ends = find_sentence_ends(mesh_desc_words)

        # izip consumed the MeSH line; the next line in each file holds the
        # paired Wikipedia description
        wiki_desc = fin_desc.next().strip()
        wiki_token_desc = fin_token_desc.next().strip()
        wiki_desc_words = wiki_token_desc.split(' ')
        wiki_sentence_ends = find_sentence_ends(wiki_desc_words)

        extra_sentence_indices = __get_sentences_to_add(
            mesh_desc_words, mesh_sentence_ends, wiki_desc_words,
            wiki_sentence_ends, tfidf)

        wiki_words_to_pos_list = tokenized_text_match(wiki_desc,
                                                      wiki_desc_words)
        original_sentences = get_original_sentences(wiki_desc,
                                                    wiki_words_to_pos_list,
                                                    wiki_sentence_ends)
        fout.write('%s\t%d\n' % (mesh_id, len(extra_sentence_indices)))
        for j in extra_sentence_indices:
            fout.write('%s\n' % original_sentences[j])

    fin_desc.close()
    fin_token_desc.close()
    fout.close()
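find_sentence_ends, tokenized_text_match and get_original_sentences come from the surrounding project. Judging only from the usage above, find_sentence_ends maps a token list to sentence boundaries; a hypothetical version under that assumption:

def find_sentence_ends_sketch(words):
    # assume a token ending in '.', '!' or '?' closes a sentence and
    # return the exclusive end index of each sentence
    ends = list()
    for i, word in enumerate(words):
        if word and word[-1] in '.!?':
            ends.append(i + 1)
    if not ends or ends[-1] != len(words):
        ends.append(len(words))  # trailing text counts as a sentence
    return ends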
Example #9
def __get_sentences_to_add(prev_text_words, prev_sentence_ends, new_text_words,
                           new_sentence_ends, tfidf):
    """Return indices of sentences in the new text whose TF-IDF vectors are
    not near-duplicates (cosine similarity > 0.95) of any sentence already
    in the previous text."""
    prev_tfidf_vecs = get_tfidf_of_sentences(prev_text_words,
                                             prev_sentence_ends, tfidf)
    new_tfidf_vecs = get_tfidf_of_sentences(new_text_words, new_sentence_ends,
                                            tfidf)
    wanted_sentence_indices = list()
    for nidx, new_tfidf_vec in enumerate(new_tfidf_vecs):
        to_add = True
        for prev_tfidf_vec in prev_tfidf_vecs:
            sim_val = TfIdf.sim(new_tfidf_vec, prev_tfidf_vec)
            if sim_val > 0.95:
                # nearly identical to an existing sentence; skip it
                to_add = False
                break
        if to_add:
            wanted_sentence_indices.append(nidx)
    return wanted_sentence_indices