コード例 #1
0
ファイル: nytdataset.py プロジェクト: hldai/emadr-exp
def __setup_entity_pairs_file():
    """Run the dataarange preprocessing calls for one data directory.

    All paths are built with os.path.join from the hard-coded ``datadir``.
    The dataarange helpers are then called in a fixed order: a file handed
    to one call as its last path argument is passed to a later call as an
    earlier argument (presumably output then input -- confirm in
    dataarange), so the call order is kept exactly as it was.
    """
    # Alternative data directory that was used before:
    #   'e:/data/emadr/nyt-world-full/processed/'
    datadir = 'e:/data/emadr/nyt-less-docs/sports'
    filter_loc = False

    def in_datadir(relative_name):
        # Single place where a name is resolved against datadir.
        return os.path.join(datadir, relative_name)

    docs_path = in_datadir('docs.txt')
    ner_path = in_datadir('ner-result.txt')
    cooccur_path = in_datadir('cooccur-mentions.txt')
    name_dict_path = in_datadir('entity-names.txt')
    doc_mentions_path = in_datadir('doc-mentions.txt')
    ee_path = in_datadir('bindata/ee.bin')
    de_path = in_datadir('bindata/de.bin')
    cnts_path = in_datadir('bindata/entity-cnts.bin')

    # Co-occurring mentions from the docs and their NER result.
    dataarange.gen_ee_pairs_with_ner_result(docs_path, ner_path, cooccur_path)

    # Entity name dict; filter_loc is forwarded unchanged.
    dataarange.gen_entity_name_dict(ner_path, name_dict_path, filter_loc)

    # Per-document mentions, then the doc-entity pairs built from them.
    dataarange.ner_result_to_tab_sep(ner_path, doc_mentions_path)
    dataarange.gen_doc_entity_pairs(name_dict_path, doc_mentions_path,
                                    de_path)

    # Entity-entity pairs from the co-occurrence file.
    dataarange.gen_entity_entity_pairs(name_dict_path, cooccur_path, ee_path)

    # Counts file derived from the doc-entity file.
    dataarange.gen_cnts_file(de_path, cnts_path)
コード例 #2
0
ファイル: nytdataset.py プロジェクト: hldai/emadr-exp
def retrieve_mentions():
    """Run the doc-entity and entity-entity pair generation for nyt-world-full.

    Calls dataarange.gen_doc_entity_pairs and then
    dataarange.gen_entity_entity_pairs on files under the hard-coded
    ``mentions`` directory, passing paths under ``processed/bin`` as the
    trailing arguments.

    NOTE: the calls that used to precede these two (init_entity_net and
    gen_entity_name_dict) were commented out, so the candidate list, the
    clique file and the entity name file are presumably expected to exist
    already -- confirm before running on a fresh directory.
    """
    mentions_dir = 'e:/dc/nyt-world-full/processed/mentions/'
    doc_candidates_file = mentions_dir + 'doc_mention_candidates.txt'
    candidate_cliques_file = mentions_dir + 'mention_candidate_cliques.txt'
    entity_names_file = mentions_dir + 'entity_names.txt'

    de_file_bin = 'e:/dc/nyt-world-full/processed/bin/de.bin'
    de_file_txt = 'e:/dc/nyt-world-full/processed/bin/de.txt'
    dataarange.gen_doc_entity_pairs(entity_names_file,
                                    doc_candidates_file,
                                    de_file_bin, de_file_txt)

    ee_file = 'e:/dc/nyt-world-full/processed/bin/ee.bin'
    dataarange.gen_entity_entity_pairs(entity_names_file,
                                       candidate_cliques_file, ee_file)
コード例 #3
0
ファイル: nytdataset.py プロジェクト: hldai/emadr-exp
def setup_entity_pairs_file():
    """Run the entity-name-dict and entity-entity-pair steps for the train split.

    Calls dataarange.gen_entity_name_dict on the train NER result and then
    dataarange.gen_entity_entity_pairs with the resulting name dict path and
    the train co-occurrence file.

    NOTE: the other pipeline calls (gen_ee_pairs_with_ner_result,
    ner_result_to_tab_sep, gen_doc_entity_pairs) were commented out here, so
    cooccur-mentions.txt is presumably expected to exist already -- confirm
    before running on a fresh directory.
    """
    ner_result_file = 'e:/dc/nyt-world-full/processed/train/ner-result.txt'
    cooccur_mentions_file = 'e:/dc/nyt-world-full/processed/train/cooccur-mentions.txt'
    entity_name_dict_file = 'e:/dc/nyt-world-full/processed/train/entity-names-nloc.txt'
    ee_file = 'e:/dc/nyt-world-full/processed/bin/ee-ner-train.bin'

    # Third positional argument of gen_entity_name_dict. Named filter_loc
    # because that is the local passed in this position elsewhere in the
    # project; its exact meaning is defined in dataarange.
    filter_loc = True

    # gen entity name dict
    dataarange.gen_entity_name_dict(ner_result_file, entity_name_dict_file, filter_loc)

    dataarange.gen_entity_entity_pairs(entity_name_dict_file, cooccur_mentions_file, ee_file)
コード例 #4
0
ファイル: nytdataset.py プロジェクト: hldai/emadr-exp
def retrieve_mentions():
    """Run the doc-entity and entity-entity pair generation for nyt-world-full.

    Two dataarange calls are made, in this order:

    1. gen_doc_entity_pairs(entity names, doc mention candidates,
       de.bin, de.txt)
    2. gen_entity_entity_pairs(entity names, mention candidate cliques,
       ee.bin)

    NOTE: the init_entity_net and gen_entity_name_dict calls that used to
    come first were commented out, so the three files under ``mentions/``
    are presumably expected to exist already -- confirm before running on a
    fresh directory.
    """
    output_dir = 'e:/dc/nyt-world-full/processed/mentions/'
    doc_entity_candidates_file = output_dir + 'doc_mention_candidates.txt'
    entity_candidate_cliques_file = output_dir + 'mention_candidate_cliques.txt'
    proper_entity_dict_file = output_dir + 'entity_names.txt'

    de_file_bin = 'e:/dc/nyt-world-full/processed/bin/de.bin'
    de_file_txt = 'e:/dc/nyt-world-full/processed/bin/de.txt'
    dataarange.gen_doc_entity_pairs(proper_entity_dict_file, doc_entity_candidates_file, de_file_bin,
                                    de_file_txt)

    ee_file = 'e:/dc/nyt-world-full/processed/bin/ee.bin'
    dataarange.gen_entity_entity_pairs(proper_entity_dict_file, entity_candidate_cliques_file, ee_file)
コード例 #5
0
ファイル: 20ngdataset.py プロジェクト: hldai/emadr-exp
def setup_entity_pairs_file():
    """Run the pair-file and count-file steps for the 20ng_bydate data.

    Calls, in order, dataarange.gen_doc_entity_pairs,
    dataarange.gen_entity_entity_pairs and dataarange.gen_cnts_file on
    hard-coded paths under 'e:/dc/20ng_bydate/'.

    NOTE: the earlier steps (pack_docs_for_ner, gen_ee_pairs_with_ner_result,
    gen_entity_name_dict, ner_result_to_tab_sep) were commented out here, so
    the name dict, doc mentions and co-occurrence files are presumably
    expected to exist already -- confirm before running on a fresh directory.
    """
    name_dict_file = 'e:/dc/20ng_bydate/entity-names-ner.txt'
    doc_all_mentions_file = 'e:/dc/20ng_bydate/doc-mentions-ner.txt'
    cooccur_mentions_file = 'e:/dc/20ng_bydate/cooccur-mentions.txt'

    doc_entity_file = 'e:/dc/20ng_bydate/bin/de-ner.bin'
    dataarange.gen_doc_entity_pairs(name_dict_file, doc_all_mentions_file, doc_entity_file)

    ee_file = 'e:/dc/20ng_bydate/bin/ee-ner.bin'
    dataarange.gen_entity_entity_pairs(name_dict_file, cooccur_mentions_file, ee_file)

    # The counts file is generated from the doc-entity file passed above.
    cnts_file = 'e:/dc/20ng_bydate/bin/entity-cnts-ner.bin'
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
コード例 #6
0
def __setup_entity_pairs_file():
    """Pack the 20ng docs for NER, then run the pair-file and count-file steps.

    Calls, in order, pack_docs_for_ner, dataarange.gen_doc_entity_pairs,
    dataarange.gen_entity_entity_pairs and dataarange.gen_cnts_file on
    hard-coded paths under 'e:/dc/20ng_bydate/'.

    NOTE: the intermediate steps (gen_ee_pairs_with_ner_result,
    gen_entity_name_dict, ner_result_to_tab_sep) were commented out here, so
    the name dict, doc mentions and co-occurrence files are presumably
    expected to exist already -- confirm before running on a fresh directory.
    """
    doc_list_file = 'e:/dc/20ng_bydate/all_doc_path_list.txt'
    docs_ner_file = 'e:/dc/20ng_bydate/docs-for-ner.txt'
    pack_docs_for_ner(doc_list_file, docs_ner_file)

    name_dict_file = 'e:/dc/20ng_bydate/entity-names-ner.txt'
    doc_all_mentions_file = 'e:/dc/20ng_bydate/doc-mentions-ner.txt'
    cooccur_mentions_file = 'e:/dc/20ng_bydate/cooccur-mentions.txt'

    doc_entity_file = 'e:/dc/20ng_bydate/bin/de-ner.bin'
    dataarange.gen_doc_entity_pairs(name_dict_file, doc_all_mentions_file,
                                    doc_entity_file)

    ee_file = 'e:/dc/20ng_bydate/bin/ee-ner.bin'
    dataarange.gen_entity_entity_pairs(name_dict_file,
                                       cooccur_mentions_file, ee_file)

    # The counts file is generated from the doc-entity file passed above.
    cnts_file = 'e:/dc/20ng_bydate/bin/entity-cnts-ner.bin'
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)