def __setup_entity_pairs_file():
    """Run the NER-result based pair generation for the NYT sports data.

    Every file is resolved relative to one hard-coded data directory.  The
    dataarange steps are invoked in this order:
    gen_ee_pairs_with_ner_result, gen_entity_name_dict,
    ner_result_to_tab_sep, gen_doc_entity_pairs, gen_entity_entity_pairs
    and gen_cnts_file.

    NOTE(review): a second function with this exact name is defined further
    down in this file.  At module level the later definition rebinds the
    name, so confirm which of the two is actually meant to be reachable.
    """
    # Root used by an earlier configuration:
    #   'e:/data/emadr/nyt-world-full/processed/'
    datadir = 'e:/data/emadr/nyt-less-docs/sports'
    # Forwarded unchanged to gen_entity_name_dict.
    filter_loc = False

    def in_datadir(relative_name):
        # Resolve a file name against the data directory.
        return os.path.join(datadir, relative_name)

    # Text files.
    docs_path = in_datadir('docs.txt')
    ner_result_path = in_datadir('ner-result.txt')
    cooccur_path = in_datadir('cooccur-mentions.txt')
    name_dict_path = in_datadir('entity-names.txt')
    doc_mentions_path = in_datadir('doc-mentions.txt')

    # Binary files, kept in the 'bindata' sub-directory.
    ee_path = in_datadir('bindata/ee.bin')
    de_path = in_datadir('bindata/de.bin')
    cnts_path = in_datadir('bindata/entity-cnts.bin')

    dataarange.gen_ee_pairs_with_ner_result(
        docs_path, ner_result_path, cooccur_path)

    # Entity name dictionary.
    dataarange.gen_entity_name_dict(
        ner_result_path, name_dict_path, filter_loc)

    dataarange.ner_result_to_tab_sep(ner_result_path, doc_mentions_path)

    dataarange.gen_doc_entity_pairs(
        name_dict_path, doc_mentions_path, de_path)
    dataarange.gen_entity_entity_pairs(
        name_dict_path, cooccur_path, ee_path)
    dataarange.gen_cnts_file(de_path, cnts_path)
def retrieve_mentions():
    """Generate the doc-entity and entity-entity pair files for nyt-world-full.

    Only two steps are executed: dataarange.gen_doc_entity_pairs followed by
    dataarange.gen_entity_entity_pairs.  Both read the entity name
    dictionary and a mention candidate file from the 'mentions' directory
    and are given destination paths in the 'bin' directory.

    Two upstream steps are intentionally not run here (they were disabled
    in the original code):

    * init_entity_net, which was called with
      'e:/dc/nyt-world-full/processed/docs-tokenized.txt',
      'e:/dc/20ng_bydate/stopwords.txt', the two candidate files used below
      and 'doc_mention_candidate_indices.txt' in the 'mentions' directory;
    * gen_entity_name_dict, which was called with the doc candidate file,
      'e:/dc/el/wiki/wiki_word_cnts_lc.txt',
      'e:/dc/el/wiki/wiki_word_cnts_with_case.txt' and the entity name
      dictionary used below.

    NOTE(review): the three input files read here are presumably left over
    from an earlier run of those steps -- confirm they exist before calling.
    """
    mentions_dir = 'e:/dc/nyt-world-full/processed/mentions/'
    bin_dir = 'e:/dc/nyt-world-full/processed/bin/'

    # Inputs (both directory prefixes already end with '/').
    doc_candidates_file = mentions_dir + 'doc_mention_candidates.txt'
    candidate_cliques_file = mentions_dir + 'mention_candidate_cliques.txt'
    entity_names_file = mentions_dir + 'entity_names.txt'

    # Outputs.
    de_file_bin = bin_dir + 'de.bin'
    de_file_txt = bin_dir + 'de.txt'
    ee_file = bin_dir + 'ee.bin'

    # Unlike the other call sites in this file, this call also passes a
    # fourth, text-file path.
    dataarange.gen_doc_entity_pairs(
        entity_names_file, doc_candidates_file, de_file_bin, de_file_txt)

    dataarange.gen_entity_entity_pairs(
        entity_names_file, candidate_cliques_file, ee_file)
def setup_entity_pairs_file():
    """Generate the NER-based pair and count files for the 20ng_bydate data.

    Executes, in order, dataarange.gen_doc_entity_pairs,
    dataarange.gen_entity_entity_pairs and dataarange.gen_cnts_file.  The
    count file is generated from the doc-entity file written just before.

    The following upstream steps are intentionally not run here (they were
    disabled in the original code); file names are relative to
    'e:/dc/20ng_bydate/':

    * pack_docs_for_ner('all_doc_path_list.txt', 'docs-for-ner.txt')
    * gen_ee_pairs_with_ner_result('docs-for-ner.txt', 'ner-result.txt',
      'cooccur-mentions.txt')
    * gen_entity_name_dict('ner-result.txt', 'entity-names-ner.txt')
    * ner_result_to_tab_sep('ner-result.txt', 'doc-mentions-ner.txt')

    NOTE(review): the input files read below are presumably the results of
    an earlier run of those steps -- confirm they exist before calling.
    """
    data_dir = 'e:/dc/20ng_bydate/'
    bin_dir = data_dir + 'bin/'

    # Inputs.
    name_dict_file = data_dir + 'entity-names-ner.txt'
    doc_all_mentions_file = data_dir + 'doc-mentions-ner.txt'
    cooccur_mentions_file = data_dir + 'cooccur-mentions.txt'

    # Outputs.
    doc_entity_file = bin_dir + 'de-ner.bin'
    ee_file = bin_dir + 'ee-ner.bin'
    cnts_file = bin_dir + 'entity-cnts-ner.bin'

    dataarange.gen_doc_entity_pairs(
        name_dict_file, doc_all_mentions_file, doc_entity_file)
    dataarange.gen_entity_entity_pairs(
        name_dict_file, cooccur_mentions_file, ee_file)
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
def __setup_entity_pairs_file():
    """Pack the 20ng_bydate documents for NER, then build pair/count files.

    Executes, in order, pack_docs_for_ner,
    dataarange.gen_doc_entity_pairs, dataarange.gen_entity_entity_pairs and
    dataarange.gen_cnts_file.  The count file is generated from the
    doc-entity file written just before.

    The following intermediate steps are intentionally not run here (they
    were disabled in the original code); file names are relative to
    'e:/dc/20ng_bydate/':

    * gen_ee_pairs_with_ner_result('docs-for-ner.txt', 'ner-result.txt',
      'cooccur-mentions.txt')
    * gen_entity_name_dict('ner-result.txt', 'entity-names-ner.txt')
    * ner_result_to_tab_sep('ner-result.txt', 'doc-mentions-ner.txt')

    NOTE(review): the files read by the three dataarange calls are not
    produced by anything run in this function, so they are presumably left
    over from an earlier run -- confirm they exist before calling.

    NOTE(review): a function with this exact name is also defined earlier
    in this file.  Being the later definition, this one is what the
    module-level name refers to (unless it is rebound again further down);
    confirm that shadowing the earlier one is intended.
    """
    data_dir = 'e:/dc/20ng_bydate/'
    bin_dir = data_dir + 'bin/'

    # Packing step: called unqualified, unlike the dataarange.* calls below.
    doc_list_file = data_dir + 'all_doc_path_list.txt'
    docs_ner_file = data_dir + 'docs-for-ner.txt'
    pack_docs_for_ner(doc_list_file, docs_ner_file)

    # Inputs of the pair generation steps.
    name_dict_file = data_dir + 'entity-names-ner.txt'
    doc_all_mentions_file = data_dir + 'doc-mentions-ner.txt'
    cooccur_mentions_file = data_dir + 'cooccur-mentions.txt'

    # Outputs.
    doc_entity_file = bin_dir + 'de-ner.bin'
    ee_file = bin_dir + 'ee-ner.bin'
    cnts_file = bin_dir + 'entity-cnts-ner.bin'

    dataarange.gen_doc_entity_pairs(
        name_dict_file, doc_all_mentions_file, doc_entity_file)
    dataarange.gen_entity_entity_pairs(
        name_dict_file, cooccur_mentions_file, ee_file)
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
def __setup_doc_entities_file():
    """Build the doc-entity file for the data under 'el/tac/2014/eval/'.

    Calls dataarange.init_entity_net with the tokenized documents, the
    illegal-start-words file and two candidate file paths, and then calls
    dataarange.gen_doc_entity_pairs with the wiki entity name dictionary,
    the doc entity candidates file and the 'de.bin' destination path.
    """
    emadr_data_dir = 'e:/data/emadr/el/tac/2014/eval/'

    def in_data_dir(file_name):
        # Resolve a file name against the eval data directory.
        return os.path.join(emadr_data_dir, file_name)

    stopwords_path = 'e:/data/emadr/20ng_bydate/stopwords.txt'
    tokenized_docs_path = in_data_dir('docs-tokenized.txt')
    doc_candidates_path = in_data_dir('doc_entity_candidates.txt')
    candidate_cliques_path = in_data_dir('entity_candidate_cliques.txt')

    dataarange.init_entity_net(
        tokenized_docs_path,
        stopwords_path,
        doc_candidates_path,
        candidate_cliques_path,
    )

    # The name dictionary comes from the wiki directory, not the eval one.
    # An earlier configuration also took the candidates from there:
    #   'e:/data/emadr/el/wiki/doc_entity_candidates.txt'
    entity_names_path = 'e:/data/emadr/el/wiki/entity_names.txt'
    doc_entity_bin_path = in_data_dir('de.bin')

    dataarange.gen_doc_entity_pairs(
        entity_names_path, doc_candidates_path, doc_entity_bin_path)