Example #1
0
def main(_):
    """Converts input JSONL examples into a shuffled TFRecord file.

    Reads examples from FLAGS.input_jsonl, converts each into zero or more
    serialized tf.train.Example records via run_nq.CreateTFExampleFn,
    shuffles the records, and writes them to FLAGS.output_tfrecord.
    Processing stops early once FLAGS.max_examples (if positive) is reached.
    """
    examples_processed = 0
    num_examples_with_correct_context = 0
    creator_fn = run_nq.CreateTFExampleFn(is_training=FLAGS.is_training)

    instances = []
    for example in get_examples(FLAGS.input_jsonl):
        print("Processing example #: " + str(examples_processed))
        # process() may yield several serialized records per input example.
        instances.extend(creator_fn.process(example))
        if example["has_correct_context"]:
            num_examples_with_correct_context += 1
        examples_processed += 1
        # Increment before the modulo check so the first progress message
        # reports 100 rather than 0 (the original logged before counting).
        if examples_processed % 100 == 0:
            tf.logging.info("Examples processed: %d", examples_processed)
        if FLAGS.max_examples > 0 and examples_processed >= FLAGS.max_examples:
            break
    print("Examples with correct context retained: " +
          str(num_examples_with_correct_context) + " of " +
          str(examples_processed))

    # Shuffle so downstream training does not see the input file's ordering.
    random.shuffle(instances)
    with tf.python_io.TFRecordWriter(FLAGS.output_tfrecord) as writer:
        for instance in instances:
            writer.write(instance)
Example #2
0
    def make_tf_examples(self, example, is_training):
        """Runs run_nq.CreateTFExampleFn over `example` and decodes the output.

        Args:
            example: an input example dict accepted by CreateTFExampleFn.
            is_training: if True, extract the answer span from each record;
                otherwise collect the record's token map.

        Returns:
            A (passages, spans, token_maps) tuple of lists. `spans` is
            populated only when `is_training` is True; `token_maps` only
            when it is False. `passages` is always populated, one detokenized
            string per generated record.
        """
        passages = []
        spans = []
        token_maps = []
        tf_example_creator = run_nq.CreateTFExampleFn(is_training=is_training)
        # process() yields serialized tf.train.Example protos; iterate the
        # generator directly instead of materializing it into a throwaway list.
        for record in tf_example_creator.process(example):
            tfexample = tf.train.Example()
            tfexample.ParseFromString(record)
            # Hoist the repeated attribute chain out of the feature lookups.
            features = tfexample.features.feature
            tokens = [
                self.VOCAB_TOKENS[x]
                for x in features["input_ids"].int64_list.value
            ]
            # " ##" removal undoes WordPiece sub-token splitting.
            passages.append(" ".join(tokens).replace(" ##", ""))
            if is_training:
                start = features["start_positions"].int64_list.value[0]
                end = features["end_positions"].int64_list.value[0]
                # end is inclusive, hence the +1 in the slice.
                spans.append(
                    " ".join(tokens[start:end + 1]).replace(" ##", ""))
            else:
                token_maps.append(features["token_map"].int64_list.value)

        return passages, spans, token_maps