# Dead code kept from the original source: entity extraction from the
# long-answer annotation, the long-answer candidates, and the short answers.
# if 'entity_map' in ann['long_answer'].keys():
#     entities.extend([ent for k, v in ann["long_answer"]["entity_map"].items()
#                      for (ids, ent) in v])
# for cand in item["long_answer_candidates"]:
#     if 'entity_map' in cand.keys():
#         entities.extend([ent for k, v in cand["entity_map"].items()
#                          for (ids, ent) in v])
# for ann in item["annotations"]:
#     for sa in ann['short_answers']:
#         if 'entity_map' in sa.keys():
#             entities.extend([ent for k, v in sa["entity_map"].items()
#                              for (ids, ent) in v])

# NOTE(review): this chunk reads like the body of a per-example loop —
# `example_id`, `item`, `entities` and `question_embedding` are bound outside
# this span — and the final "No ent questions" print presumably sits after
# that loop. TODO confirm against the enclosing code.

# Count questions that arrived with no linked entities at all.
if not entities:
    empty_ents += 1

start_time = time.time()
# print(example_id)
# print("Size of all entities: %d" % len(entities))

# Expand the seed entities to their k-hop neighbourhood and gather the facts
# connecting them.
k_hop_entities, k_hop_facts = apr.get_khop_facts(entities,
                                                 FLAGS.csr_num_hops)
# print("Size of two hop entities: %d" % len(k_hop_entities))
# print("Size of two hop facts: %d" % len(k_hop_facts))

# Build and persist the per-question CSR subgraph.
csr_writer = CsrData()
csr_writer.create_and_save_csr_data(
    full_wiki=FLAGS.full_wiki,
    decompose_ppv=FLAGS.decompose_ppv,
    files_dir=FLAGS.apr_files_dir,
    sub_entities=k_hop_entities,
    question_id=example_id,
    question_embedding=question_embedding,
    relation_embeddings=relation_embeddings,
    sub_facts=k_hop_facts)
# print('Time taken for CSR: ' + str(time.time() - start_time))

print("No ent questions: " + str(empty_ents))
# Per-split shard bookkeeping (kept for reference when iterating all shards).
max_tasks = {"train": 50, "dev": 5}
max_shards = {"train": 7, "dev": 17}
apr = ApproximatePageRank()
# Parse all shards in each mode.
# Currently sequential; can be parallelized later.
for mode in [FLAGS.split]:
    for task_id in [FLAGS.task_id]:  # range(0, max_tasks[mode]):
        for shard_id in [FLAGS.shard_split_id]:  # range(0, max_shards[mode]):
            # if task_id == 0 and shard_id in range(0, 16):
            #     print("skipping finished job")
            #     continue
            nq_data, entities = get_examples(FLAGS.nq_dir, mode, task_id,
                                             shard_id)
            if nq_data is None:
                print("No examples here")
                continue
            # BUG FIX: these prints previously used logging-style
            # ("...%d", value) arguments, so `print` emitted the literal
            # format string followed by the value; interpolate with `%`
            # so the intended message is produced.
            print("Size of all entities: %d" % len(entities))
            # Expand the shard's seed entities to their k-hop neighbourhood.
            two_hop_entities = apr.get_khop_entities(entities,
                                                     FLAGS.csr_num_hops)
            print("Size of two hop entities: %d" % len(two_hop_entities))
            # Build and persist the CSR subgraph for this (mode, task, shard).
            csr_data = CsrData()
            csr_data.create_and_save_csr_data(
                full_wiki=FLAGS.full_wiki,
                decompose_ppv=FLAGS.decompose_ppv,
                files_dir=FLAGS.apr_files_dir,
                sub_entities=two_hop_entities,
                mode=mode,
                task_id=task_id,
                shard_id=shard_id)
# NOTE(review): the line below is a whitespace-mangled paste — the newlines of
# the original loop body were collapsed into one physical line, and because it
# begins with '#', Python now treats the entire line as a comment. It is not
# safe to reflow here: it contains a bare `continue` whose enclosing loop, and
# the indentation level of the trailing summary prints, are outside this view.
# Intended logic (reconstruction — confirm against the full file):
#   * optionally look up relation-classifier scores for `half_qid` when
#     FLAGS.rel_classifier_scores is set, skipping (`continue`) examples whose
#     qid is missing from `que_rel_scores`;
#   * increment `proc_q`, then build and save the per-question CSR subgraph via
#     CsrData().create_and_save_csr_data(..., relation_scores=relation_scores,
#     relations_to_filter=apr.data.relations_to_filter) into
#     FLAGS.output_apr_files_dir;
#   * presumably after the loop, print the "No ent questions" / "No proc q"
#     counters.
#print("Size of two hop entities: %d", len(k_hop_entities)) #print("Size of two hop facts: %d", len(k_hop_facts)) relation_scores = None if FLAGS.rel_classifier_scores: print(example_id, half_qid) if str(half_qid) not in que_rel_scores: print("not in qid") print(example_id, half_qid) print(item['question_text'], entities) continue print(item['question_text'], entities) print(que_rel_scores[str(half_qid)]) #print([(rel ,apr.data.entity_names['r'][str(apr.data.rel2id[rel])], score) for rel, score in list(que_rel_scores[str(half_qid)].items())]) relation_scores = que_rel_scores[str(half_qid)] proc_q +=1 csr_data = CsrData() csr_data.create_and_save_csr_data(full_wiki=FLAGS.full_wiki, decompose_ppv=FLAGS.decompose_ppv, files_dir=FLAGS.output_apr_files_dir, sub_entities=k_hop_entities, question_id=example_id, question_embedding=question_embedding, relation_embeddings=relation_embeddings, relation_scores=relation_scores, sub_facts=k_hop_facts, relations_to_filter=apr.data.relations_to_filter) #print('Time taken for CSR: '+str(time.time() - st)) print("No ent questions: "+str(empty_ents)) print("No proc q: "+str(proc_q))