def write_eval_records(filepath: Path):
    """Converts the NQ test examples to features and writes them to a TFRecord file."""
    eval_writer = bert_utils.FeatureWriter(filename=str(filepath), is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=str(DATASET_PATH / 'vocab-nq.txt'),
                                           do_lower_case=True)
    convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer,
                                                  is_training=False,
                                                  output_fn=eval_writer.process_feature,
                                                  collect_stat=False)
    n_examples = 0
    # Iterate over the test JSONL file, converting each example to features;
    # the converter forwards every feature to the writer via output_fn.
    for examples in bert_utils.nq_examples_iter(input_file=TEST_FILE,
                                                is_training=False,
                                                tqdm=tqdm.tqdm):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d'
          % (n_examples, eval_writer.num_features))
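#%%
# A minimal usage sketch (an assumption, not part of the baseline): build the
# eval records once and report the resulting file size. Assumes DATASET_PATH
# and TEST_FILE are defined as above; the output file name is hypothetical.
eval_path = DATASET_PATH / 'nq-test.tfrecords'
if not eval_path.exists():
    write_eval_records(eval_path)
print('%s: %d bytes' % (eval_path, eval_path.stat().st_size))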
        max_position=50,
        max_contexts=48,
        max_query_length=64,
        max_seq_length=512,
        doc_stride=128,
        include_unknowns=-1.0,
        n_best_size=20,
        max_answer_length=30)

import tqdm

eval_records = '../input/bert-joint-baseline/nq-test.tfrecords'
nq_test_file = '../input/tensorflow2-question-answering/simplified-nq-test.jsonl'
if on_kaggle_server and private_dataset:
    eval_records = 'nq-test.tfrecords'

if not os.path.exists(eval_records):
    eval_writer = bert_utils.FeatureWriter(filename=eval_records, is_training=False)
    tokenizer = tokenization.FullTokenizer(
        vocab_file='../input/bert-joint-baseline/vocab-nq.txt', do_lower_case=True)
    convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer,
                                                  is_training=False,
                                                  output_fn=eval_writer.process_feature,
                                                  collect_stat=False)
    n_examples = 0
    # Notebook-style progress bars only work off the Kaggle server.
    tqdm_notebook = tqdm.tqdm_notebook if not on_kaggle_server else None
    for examples in bert_utils.nq_examples_iter(input_file=nq_test_file,
                                                is_training=False,
                                                tqdm=tqdm_notebook):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
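#%%
# The configuration above caps sequences at max_seq_length=512 and slides a
# window of stride doc_stride=128 over longer documents. A minimal sketch of
# that windowing arithmetic (the helper n_doc_spans is hypothetical, not part
# of bert_utils; it assumes the standard BERT budget of max_seq_length minus
# the query tokens and the three [CLS]/[SEP] markers):
def n_doc_spans(n_doc_tokens,
                max_seq_length=512,
                max_query_length=64,
                doc_stride=128):
    # Tokens available for document text in each span.
    max_tokens_for_doc = max_seq_length - max_query_length - 3
    spans, start = 0, 0
    while True:
        spans += 1
        # The last span is the one whose window reaches the document end.
        if start + max_tokens_for_doc >= n_doc_tokens:
            break
        start += doc_stride
    return spans

print(n_doc_spans(445))   # 1 span: the document fits in a single window
print(n_doc_spans(1000))  # 6 spans: overlapping windows start every 128 tokens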
"""Decodes a record to a TensorFlow example.""" feature_description = feature_description or FEATURE_DESCRIPTION example = tf.io.parse_single_example(serialized=record, features=feature_description) # tf.Example only supports tf.int64, but the TPU only supports tf.int32. # So cast all int64 to int32. for key in [k for k in example.keys() if k not in ['example_id', 'unique_id']]: example[key] = tf.cast(example[key], dtype=tf.int32) # example.pop('example_id') return example #%% if not url_exists(NQ_TEST_TFRECORD_PATH): # tf2baseline.F.max_seq_length = 512 eval_writer = bert_utils.FeatureWriter(filename=NQ_TEST_TFRECORD_PATH, is_training=False) tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_PATH, do_lower_case=True) features = [] convert = bert_utils.ConvertExamples2Features(tokenizer=tokenizer, is_training=False, output_fn=eval_writer.process_feature, collect_stat=False) n_examples = 0 # tqdm_notebook = tqdm.tqdm_notebook # if not on_kaggle_server else None for examples in bert_utils.nq_examples_iter(input_file=NQ_TEST_JSONL_PATH, is_training=False, tqdm=tqdm): for example in examples: n_examples += convert(example) eval_writer.close()