def test_minimal_examples(self):
  """Checks that bare-bones examples round-trip through get_nq_examples."""
  expected_count = 20
  bare_example = {
      "annotations": [],
      "long_answer_candidates": [],
      "question_text": "",
      "document_url": "",
      "document_title": "",
      "example_id": 1,
  }
  # Write the same minimal example many times; every one must come back.
  self.write_examples([bare_example for _ in range(expected_count)])
  parsed = nq_data_utils.get_nq_examples(self.test_file)
  self.assertEqual(expected_count, len(list(parsed)))
def main(_):
  """Splits one NQ jsonl.gz input shard into fixed-size output shards.

  Streams every line of the input shard and writes consecutive groups of
  FLAGS.split_size lines to output files obtained from get_output_fp().

  Fixes over the previous version:
    * the final (partial) shard file is now always closed, so buffered
      data (e.g. for gzip writers) is flushed to disk;
    * output files are opened lazily, so no empty trailing shard is
      created when the line count is an exact multiple of split_size.
  """
  input_file = nq_data_utils.get_nq_filename(FLAGS.input_data_dir, FLAGS.split,
                                             FLAGS.task_id, "jsonl.gz")
  shard_counter = 0
  counter = 0
  op = None
  try:
    for line in nq_data_utils.get_nq_examples(input_file):
      if op is None:
        # Open the next shard only when there is a line to write.
        op = get_output_fp(shard_counter)
      # Round-trip through UTF-8 to fail fast on malformed input bytes;
      # the bytes written are unchanged for valid input.
      op.write(line.decode("utf-8").encode("utf-8"))
      counter += 1
      if counter % FLAGS.split_size == 0:
        op.close()
        op = None
        shard_counter += 1
  finally:
    # Bug fix: the original never closed the last shard's file handle.
    if op is not None:
      op.close()
def test_example_metadata(self):
  """Verifies title, id, question text, and default answer on a parsed example."""
  raw_example = {
      "annotations": [],
      "long_answer_candidates": [],
      "question_text": "test_q",
      "document_url": "test_url",
      "document_title": "test_title",
      "example_id": 10,
  }
  self.write_examples([raw_example])
  parsed = next(nq_data_utils.get_nq_examples(self.test_file))
  # Scalar metadata fields copied from the raw example.
  for key, want in (("name", "test_title"), ("id", "10")):
    self.assertEqual(parsed[key], want)
  # Question text is preserved; with no annotations the answer defaults
  # to the "long" input_text.
  self.assertEqual(parsed["questions"][0]["input_text"], "test_q")
  self.assertEqual(parsed["answers"][0]["input_text"], "long")
def main(_):
  """Converts one sharded NQ jsonl.gz file into TF examples.

  Three mutually exclusive modes, selected by flags:
    * FLAGS.is_training: build training instances for this shard, shuffle
      them, and write a sharded tf-record file.
    * eval, per-shard (not FLAGS.merge_eval): featurize eval examples for
      this shard into a sharded tf-record file.
    * eval, merge (FLAGS.merge_eval): concatenate every per-shard eval
      tf-record into a single eval.tf-record.
  """
  examples_processed = 0
  instances_processed = 0
  num_examples_with_correct_context = 0
  if FLAGS.is_training:
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
    instances = []
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    for example in nq_data_utils.get_nq_examples(input_file):
      # One NQ example can yield several serialized TF instances.
      for instance in creator_fn.process(example):
        instances.append(instance)
        instances_processed += 1
      if example["has_correct_context"]:
        num_examples_with_correct_context += 1
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)
        tf.logging.info("Instances processed: %d", instances_processed)
      examples_processed += 1
      # FLAGS.max_examples <= 0 means "no cap".
      if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
        break
    tf.logging.info("Examples with correct context retained: %d of %d",
                    num_examples_with_correct_context, examples_processed)
    # Shuffle within the shard so training order is not document order.
    random.shuffle(instances)
    tf.logging.info("Total no: of instances in current shard: %d",
                    len(instances))
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance)
  # For eval - First process every shard in parallel
  elif not FLAGS.is_training and not FLAGS.merge_eval:
    input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                    FLAGS.split, FLAGS.task_id,
                                                    FLAGS.shard_split_id,
                                                    "jsonl.gz")
    tf.logging.info("Reading file %s", input_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    eval_examples = run_nq.read_nq_examples(
        input_file=input_file, is_training=False)
    output_file = nq_data_utils.get_sharded_filename(FLAGS.output_data_dir,
                                                     FLAGS.split, FLAGS.task_id,
                                                     FLAGS.shard_split_id,
                                                     "tf-record")
    eval_writer = run_nq.FeatureWriter(filename=output_file, is_training=False)
    eval_features = []
    examples_processed = 0

    def append_feature(feature):
      # Callback invoked once per generated feature: records it and streams
      # it to the on-disk writer.
      eval_features.append(feature)
      eval_writer.process_feature(feature)
      # NOTE(review): this rebinds a local name, not the outer
      # `examples_processed`, and it counts features, not examples.
      examples_processed = len(eval_features)
      if examples_processed % 100 == 0:
        tf.logging.info("Examples processed: %d", examples_processed)

    _ = run_nq.convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        is_training=False,
        output_fn=append_feature)
    eval_writer.close()
  # For eval - finally merge all shards into one file.
  else:
    instances = []
    for task in range(FLAGS.max_dev_tasks):
      for shard_split in range(FLAGS.max_dev_shard_splits):
        input_file = nq_data_utils.get_sharded_filename(FLAGS.input_data_dir,
                                                        FLAGS.split, task,
                                                        shard_split,
                                                        "tf-record")
        tf.logging.info("Reading file %s", input_file)
        instances.extend([
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(input_file)
        ])
    output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance.SerializeToString())
def test_multi_candidate_document(self):
  """A document with two long-answer candidates flattens into one context string."""
  annotation = {
      "long_answer": {
          "candidate_index": 1,
          "start_token": 0,
          "end_token": 3,
          "entity_map": {},
      },
      "short_answers": [{
          "start_token": 2,
          "end_token": 3,
          "entity_map": {},
      }],
      "yes_no_answer": "NONE",
  }
  example = {
      "annotations": [annotation],
      "long_answer_candidates": [
          {"start_token": 0, "end_token": 3, "top_level": True,
           "entity_map": {}},
          {"start_token": 0, "end_token": 3, "top_level": True},
      ],
      "document_tokens": [
          {"token": "<P>", "html_token": True},
          {"token": "the", "html_token": False},
          {"token": "document", "html_token": False},
      ],
      "question_text": "the question",
      "document_url": "",
      "document_title": "",
      "example_id": 1,
  }
  self.write_examples([example])
  parsed = next(nq_data_utils.get_nq_examples(self.test_file))
  # The document in this case should be a single string with all contexts.
  self.assertEqual(
      "[ContextId=-1] [NoLongAnswer] [ContextId=0] [Paragraph=1] the document "
      "[ContextId=1] [Paragraph=2] the document",
      parsed["contexts"])
  # The windowed passages are identical for training and eval mode, so the
  # expectation is shared between both assertions below.
  expected_passages = [
      "[CLS] [Q] the question [SEP] [ContextId=-1] [NoLongAnswer] "
      "[ContextId=0] [Paragraph=1] the docum [SEP] [SEP] [PAD] [PAD] "
      "[PAD] [PAD] [PAD] [PAD] [PAD]",
      "[CLS] [Q] the question [SEP]ent [ContextId=1] [Paragraph=2] the document [SEP] "
      "[SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",
  ]
  passages, spans, _ = self.make_tf_examples(parsed, is_training=True)
  self.assertEqual(expected_passages, passages)
  self.assertEqual(["[CLS]", "document"], spans)
  passages, _, tok_maps = self.make_tf_examples(parsed, is_training=False)
  self.assertEqual(expected_passages, passages)
  self.assertEqual(
      [[-1] * 9 + [1, 2] + [-1] * 9,
       [-1] * 5 + [2, -1, -1, 1, 2, 2] + [-1] * 9],
      tok_maps)
def main(_):
  """Builds TF examples (plus a pretraining text file) for one NQ sub-shard.

  Three mutually exclusive modes, selected by flags:
    * FLAGS.is_training: create training instances, shuffle them into a
      sharded tf-record file, and write a side-car stats.txt with
      fact-recall / answer-reach statistics.
    * eval, per-shard (not FLAGS.merge_eval): featurize eval examples for
      this shard into a sharded tf-record file.
    * eval, merge (FLAGS.merge_eval): concatenate every per-shard eval
      tf-record into a single eval.tf-record.
  """
  examples_processed = 0
  instances_processed = 0
  examples_with_instances = 0
  num_examples_with_correct_context = 0
  # if FLAGS.create_pretrain_data or FLAGS.create_fact_annotation_data:
  #   pretrain_file = open(nq_data_utils.get_sharded_filename(
  #       FLAGS.pretrain_data_dir, FLAGS.split, FLAGS.task_id,
  #       FLAGS.shard_split_id, "txt"), 'w')
  # else:
  #   pretrain_file = None
  # NOTE(review): pretrain_file is opened unconditionally and never closed;
  # buffered data is only flushed at interpreter exit -- confirm intent.
  pretrain_file = open(
      nq_data_utils.get_sharded_filename(FLAGS.pretrain_data_dir, FLAGS.split,
                                         FLAGS.task_id, FLAGS.shard_split_id,
                                         "txt"), 'w')
  if FLAGS.is_training:
    fixed_train_list = None
    if FLAGS.use_fixed_training_data:
      # One integer example id per line. NOTE(review): fp is never closed.
      fp = open(FLAGS.fixed_training_data_filepath)
      fixed_train_list = []
      for line in fp:
        fixed_train_list.append(int(line))
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)
    instances = []
    input_file = nq_data_utils.get_sharded_filename(
        FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "jsonl.gz")
    # NOTE(review): print with comma-separated args emits tuple-style output
    # ("Reading file %s <path>"); %-interpolation was probably intended.
    # Applies to every print(...) in this function.
    print("Reading file %s", input_file)
    # Per-shard statistics accumulated over all examples below.
    file_stats_counter = {
        'example_count': 0,
        'sp_recall_sum': 0,
        'answer_reach_counter': 0,
        'single_answer_reach_counter': 0,
        'multi_answer_recall': 0,
        'single_answer_counter': 0,
        'multi_answer_counter': 0,
        'multi_answer_size_counter': 0
    }
    for example in nq_data_utils.get_nq_examples(input_file):
      ins_count = 0
      stats_count = None
      # One NQ example may yield several instances; after the loop,
      # stats_count holds the stats dict of the last yielded instance.
      for instance, stats_count in creator_fn.process(
          example, pretrain_file, fixed_train_list):
        instances.append(instance)
        instances_processed += 1
        ins_count = 1
      if FLAGS.use_passage_rw_facts_in_shortest_path or FLAGS.use_question_to_passage_facts_in_shortest_path:
        # NOTE(review): unlike the question-facts branch below, this branch
        # does not guard stats_count against None (example with no
        # instances) -- confirm whether that can occur here.
        file_stats_counter['example_count'] += 1
        file_stats_counter['sp_recall_sum'] += stats_count[
            'fact_recall_counter']
        if len(stats_count['answers_reached']) > 0:
          file_stats_counter['answer_reach_counter'] += 1
          if len(stats_count['answer_entity_ids']) > 1:
            file_stats_counter['multi_answer_recall'] += stats_count[
                'answer_recall_counter']
          else:
            file_stats_counter['single_answer_reach_counter'] += 1
        if len(stats_count['answer_entity_ids']) > 1:
          file_stats_counter['multi_answer_counter'] += 1
          file_stats_counter['multi_answer_size_counter'] += len(
              stats_count['answer_entity_ids'])
        else:
          file_stats_counter['single_answer_counter'] += 1
      if stats_count is not None and FLAGS.use_question_rw_facts_in_shortest_path:
        # Same bookkeeping as the passage-facts branch above, keyed on
        # question-derived facts.
        file_stats_counter['example_count'] += 1
        file_stats_counter['sp_recall_sum'] += stats_count[
            'fact_recall_counter']
        if len(stats_count['answers_reached']) > 0:
          file_stats_counter['answer_reach_counter'] += 1
          if len(stats_count['answer_entity_ids']) > 1:
            file_stats_counter['multi_answer_recall'] += stats_count[
                'answer_recall_counter']
          else:
            file_stats_counter['single_answer_reach_counter'] += 1
        if len(stats_count['answer_entity_ids']) > 1:
          file_stats_counter['multi_answer_counter'] += 1
          file_stats_counter['multi_answer_size_counter'] += len(
              stats_count['answer_entity_ids'])
        else:
          file_stats_counter['single_answer_counter'] += 1
      if example["has_correct_context"]:
        num_examples_with_correct_context += 1
      if examples_processed % 100 == 0:
        print("Examples processed: %d", examples_processed)
        print("Instances processed: %d", instances_processed)
      examples_processed += 1
      examples_with_instances += ins_count
      # FLAGS.max_examples <= 0 means "no cap".
      if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
        break
      #time.sleep(5)
    print("Examples with correct context retained: %d of %d",
          num_examples_with_correct_context, examples_processed)
    # Shuffle within the shard so training order is not document order.
    random.shuffle(instances)
    print("Total no: of instances in current shard: %d", len(instances))
    output_file = nq_data_utils.get_sharded_filename(
        FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "tf-record")
    stats_file = nq_data_utils.get_sharded_filename(
        FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "stats.txt")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance)
    # Echo the shard statistics to stdout and persist them to stats.txt.
    with open(stats_file, 'w') as fp:
      print("Example count: %d", file_stats_counter['example_count'])
      print("Fact Recall sum: %d", file_stats_counter['sp_recall_sum'])
      print("Count with answers reached: %d",
            file_stats_counter['answer_reach_counter'])
      print("Single Answer Example count: %d",
            file_stats_counter['single_answer_counter'])
      print("Single Answer Reached count: %d",
            file_stats_counter['single_answer_reach_counter'])
      print("Multi Answer Example count: %d",
            file_stats_counter['multi_answer_counter'])
      print("Multi Answer recall sum: %d",
            file_stats_counter['multi_answer_recall'])
      print("Multi Answer Size counter: %d",
            file_stats_counter['multi_answer_size_counter'])
      fp.write("Example count: " + str(file_stats_counter['example_count']) +
               "\n")
      fp.write("Fact Recall sum: " + str(file_stats_counter['sp_recall_sum']) +
               "\n")
      fp.write("Count with answers reached: " +
               str(file_stats_counter['answer_reach_counter']) + "\n")
      fp.write("Single Answer Example count: " +
               str(file_stats_counter['single_answer_counter']) + "\n")
      fp.write("Single Answer Reached count: " +
               str(file_stats_counter['single_answer_reach_counter']) + "\n")
      fp.write("Multi Answer Example count: " +
               str(file_stats_counter['multi_answer_counter']) + "\n")
      fp.write("Multi Answer recall sum: " +
               str(file_stats_counter['multi_answer_recall']) + "\n")
      fp.write("Multi Answer Size counter: " +
               str(file_stats_counter['multi_answer_size_counter']) + "\n")
      fp.write("Examples with instances Processed: " +
               str(examples_with_instances) + "\n")
  # For eval - First process every shard in parallel
  elif not FLAGS.is_training and not FLAGS.merge_eval:
    input_file = nq_data_utils.get_sharded_filename(
        FLAGS.input_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "jsonl.gz")
    print("Reading file %s", input_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
    eval_examples = run_nq.read_nq_examples(input_file=input_file,
                                            is_training=False)
    output_file = nq_data_utils.get_sharded_filename(
        FLAGS.output_data_dir, FLAGS.split, FLAGS.task_id, FLAGS.shard_split_id,
        "tf-record")
    eval_writer = run_nq.FeatureWriter(filename=output_file, is_training=False)
    eval_features = []
    examples_processed = 0

    def append_feature(feature):
      # Callback invoked once per generated feature: records it and streams
      # it to the on-disk writer.
      eval_features.append(feature)
      eval_writer.process_feature(feature)
      # NOTE(review): rebinds a local name, not the outer
      # `examples_processed`, and it counts features, not examples.
      examples_processed = len(eval_features)
      if examples_processed % 10 == 0:
        print("Examples processed: %d", examples_processed)

    _ = run_nq.convert_examples_to_features(examples=eval_examples,
                                            tokenizer=tokenizer,
                                            is_training=False,
                                            output_fn=append_feature,
                                            pretrain_file=pretrain_file)
    eval_writer.close()
  # For eval - finally merge all shards into one file.
  else:
    instances = []
    for task in range(FLAGS.max_dev_tasks):
      for shard_split in range(FLAGS.max_dev_shard_splits):
        input_file = nq_data_utils.get_sharded_filename(
            FLAGS.input_data_dir, FLAGS.split, task, shard_split, "tf-record")
        print("Reading file %s", input_file)
        instances.extend([
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(input_file)
        ])
    output_file = os.path.join(FLAGS.output_data_dir, "eval.tf-record")
    with tf.python_io.TFRecordWriter(output_file) as writer:
      for instance in instances:
        writer.write(instance.SerializeToString())