Code example #1
# `bert_utils` and `tokenization` come from the bert-joint-baseline dataset;
# DATASET_PATH and TEST_FILE are assumed to be defined earlier in the notebook.
from pathlib import Path

import tqdm

def write_eval_records(filepath: Path):
    """Convert NQ test examples into BERT features and write them to `filepath` as a TFRecord file."""
    eval_writer = bert_utils.FeatureWriter(filename=str(filepath),
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=str(DATASET_PATH /
                                                          'vocab-nq.txt'),
                                           do_lower_case=True)
    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)
    n_examples = 0
    for examples in bert_utils.nq_examples_iter(input_file=TEST_FILE,
                                                is_training=False,
                                                tqdm=tqdm.tqdm):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d' %
          (n_examples, eval_writer.num_features))
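
A hypothetical invocation of the helper above (DATASET_PATH is an assumption carried over from the function body; the output filename mirrors the one used in code example #2):

write_eval_records(DATASET_PATH / 'nq-test.tfrecords')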
Code example #2
class DummyObject:
    """Simple attribute container standing in for parsed command-line flags."""
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

# The head of this call was cut off in the excerpt; a FLAGS-style container
# (an assumption, matching how these hyperparameters are used downstream) is
# used here to hold the original keyword arguments.
FLAGS = DummyObject(max_position=50,
                    max_contexts=48,
                    max_query_length=64,
                    max_seq_length=512,
                    doc_stride=128,
                    include_unknowns=-1.0,
                    n_best_size=20,
                    max_answer_length=30)
import os

import tqdm
eval_records = "../input/bert-joint-baseline/nq-test.tfrecords"
nq_test_file = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'
if on_kaggle_server and private_dataset:  # flags defined earlier in the notebook
    eval_records = 'nq-test.tfrecords'
if not os.path.exists(eval_records):
    # tf2baseline.FLAGS.max_seq_length = 512
    eval_writer = bert_utils.FeatureWriter(filename=eval_records,
                                           is_training=False)

    tokenizer = tokenization.FullTokenizer(
        vocab_file='../input/bert-joint-baseline/vocab-nq.txt',
        do_lower_case=True)

    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)

    n_examples = 0
    tqdm_notebook = tqdm.tqdm_notebook if not on_kaggle_server else None
    # The tail of this cell was cut off in the excerpt; the loop is completed
    # here to mirror code examples #1 and #3.
    for examples in bert_utils.nq_examples_iter(input_file=nq_test_file,
                                                is_training=False,
                                                tqdm=tqdm_notebook):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
Code example #3
    """Decodes a record to a TensorFlow example."""
    feature_description = feature_description or FEATURE_DESCRIPTION
    example = tf.io.parse_single_example(serialized=record, features=feature_description)
    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
    # So cast all int64 to int32.
    for key in [k for k in example.keys() if k not in ['example_id', 'unique_id']]:
        example[key] = tf.cast(example[key], dtype=tf.int32)

    #     example.pop('example_id')
    return example
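
The function above references FEATURE_DESCRIPTION without defining it. A minimal sketch of such a parsing spec, assuming the standard BERT input feature names and the max_seq_length of 512 used elsewhere in these excerpts (the exact keys and sizes are assumptions, not taken from the excerpt):

seq_length = 512  # must match max_seq_length used when the records were written
FEATURE_DESCRIPTION = {
    'example_id': tf.io.FixedLenFeature([], tf.int64),
    'unique_id': tf.io.FixedLenFeature([], tf.int64),
    'input_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
    'input_mask': tf.io.FixedLenFeature([seq_length], tf.int64),
    'segment_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
}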

#%%

if not url_exists(NQ_TEST_TFRECORD_PATH):
    # tf2baseline.F.max_seq_length = 512
    eval_writer = bert_utils.FeatureWriter(filename=NQ_TEST_TFRECORD_PATH,
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_PATH,
                                           do_lower_case=True)
    convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer,
                                                  is_training=False,
                                                  output_fn=eval_writer.process_feature,
                                                  collect_stat=False)
    n_examples = 0
    # tqdm_notebook = tqdm.tqdm_notebook  # if not on_kaggle_server else None
    for examples in bert_utils.nq_examples_iter(input_file=NQ_TEST_JSONL_PATH,
                                                is_training=False,
                                                tqdm=tqdm.tqdm):  # pass the callable, not the module, as in code example #1
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
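
Once written, the records can be read back through decode_record with tf.data; a minimal sketch (the batch size and the take(1) probe are illustrative only):

ds = tf.data.TFRecordDataset(str(NQ_TEST_TFRECORD_PATH))
ds = ds.map(decode_record).batch(16)
for batch in ds.take(1):
    print({k: v.shape for k, v in batch.items()})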