def count_datasets_vocabulary(self):
    new_dataset_folder = config.base_folder + "data/new_datasets/"
    # old hard-coded dataset list, kept for reference:
    # datasets = ['aida_train.txt', 'aida_dev.txt', 'aida_test.txt', 'ace2004.txt',
    #             'aquaint.txt', 'clueweb.txt', 'msnbc.txt', 'wikipedia.txt']
    for dataset in util.get_immediate_files(new_dataset_folder):
        dataset = os.path.basename(os.path.normpath(dataset))
        print("Processing dataset: ", dataset)
        self.add(new_dataset_folder + dataset)
    self.print_statistics()
    self.serialize(folder=config.base_folder + "data/vocabulary/")
def create_tfrecords():
    new_dataset_folder = config.base_folder + "data/new_datasets/"
    datasets = [os.path.basename(os.path.normpath(d))
                for d in util.get_immediate_files(new_dataset_folder)]
    print("datasets: ", datasets)
    tfrecords_generator = TFRecordsGenerator()
    # first pass: gold mentions only (gmonly), second pass: all candidate spans (allspans)
    tfrecords_generator.set_gmonly_mode()
    for file in datasets:
        tfrecords_generator.process(filepath=new_dataset_folder + file)
    tfrecords_generator.set_allspans_mode()
    for file in datasets:
        tfrecords_generator.process(filepath=new_dataset_folder + file)
def create_entity_universe(language, gmonly_files=None, allspans_files=None, printSamples=None):
    new_dataset_folder = config.base_folder + "data/new_datasets/" + language + "/"
    if gmonly_files is None:
        gmonly_files = []
    if allspans_files is None:
        # old hard-coded dataset list, kept for reference:
        # allspans_files = ['aida_train.txt', 'aida_dev.txt', 'aida_test.txt', 'ace2004.txt',
        #                   'aquaint.txt', 'clueweb.txt', 'msnbc.txt', 'wikipedia.txt']
        allspans_files = []
        for dataset in util.get_immediate_files(new_dataset_folder):
            if language in dataset:
                allspans_files.append(os.path.basename(os.path.normpath(dataset)))
    print("gmonly_files: ", gmonly_files)
    print("allspans_files: ", allspans_files)

    def create_entity_universe_aux(generator, datasets):
        entities_universe = set()
        for dataset in datasets:
            print("Processing dataset: ", dataset)
            for sample in generator.process(filepath=new_dataset_folder + dataset):
                # cand_entities holds one candidate list per mention, hence the unpacking
                entities_universe.update(*sample.cand_entities)
                entities_universe.update(sample.ground_truth)
                if printSamples:
                    printSamples.print_sample(sample)
        print("Overall statistics: ")
        print("all_gm_misses: ", generator.all_gm_misses)
        print("all_gt_misses: ", generator.all_gt_misses)
        print("all_gm: ", generator.all_gm)
        print("recall % : ", (1 - (generator.all_gm_misses + generator.all_gt_misses) /
                              (generator.all_gm + 1.0)) * 100, " %")
        print("len(entities_universe):\t\t\t", colored(len(entities_universe), 'red'))
        return entities_universe

    gmonly_entities, allspans_entities = set(), set()
    samplesGenerator = SamplesGenerator()
    if gmonly_files:
        print("gmonly files statistics: ")
        samplesGenerator.set_gmonly_mode()
        gmonly_entities = create_entity_universe_aux(samplesGenerator, gmonly_files)
    if allspans_files:
        print("Test files statistics: ")
        samplesGenerator.set_allspans_mode()
        allspans_entities = create_entity_universe_aux(samplesGenerator, allspans_files)

    all_entities = gmonly_entities | allspans_entities
    print("len(all_entities) = ", len(all_entities))
    # write the entities of our universe to a file together with their Wikipedia names
    with open(config.base_folder + "data/entities/" + language + "/entities_universe.txt", "w") as fout:
        _, wiki_id_name_map = util.load_wiki_name_id_map()
        for ent_id in all_entities:
            fout.write(ent_id + "\t" + wiki_id_name_map[ent_id].replace(' ', '_') + "\n")
    return all_entities
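# Usage sketch (illustrative only, not part of the original pipeline): how the module-level
# entry points above might be chained for a single run. The "en" language code is an
# assumption, and count_datasets_vocabulary() would be called on whatever vocabulary-counter
# class it belongs to (not shown here).
#
# if __name__ == "__main__":
#     create_tfrecords()             # build gmonly and allspans TFRecords for all datasets
#     create_entity_universe("en")   # dump data/entities/en/entities_universe.txt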