def write_eval_records(filepath: Path):
    """Convert NQ test examples into BERT input features and write a TFRecord file.

    Reads examples from the module-level ``TEST_FILE`` (JSONL), tokenizes them
    with the vocab at ``DATASET_PATH / 'vocab-nq.txt'``, and streams the
    resulting features to ``filepath`` via ``bert_utils.FeatureWriter``.

    Args:
        filepath: Destination path for the eval TFRecord file.

    Side effects:
        Writes ``filepath`` and prints a summary line with example/feature counts.
    """
    eval_writer = bert_utils.FeatureWriter(filename=str(filepath),
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=str(DATASET_PATH / 'vocab-nq.txt'), do_lower_case=True)
    # The converter pushes each finished feature straight to the writer, so no
    # intermediate feature list is needed.
    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)
    n_examples = 0
    for examples in bert_utils.nq_examples_iter(input_file=TEST_FILE,
                                                is_training=False,
                                                tqdm=tqdm.tqdm):
        for example in examples:
            # convert() returns the number of examples it consumed.
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d'
          % (n_examples, eval_writer.num_features))
# Convert NQ test examples into BERT features and stream them to the
# `eval_records` TFRecord file, then declare the feature schema used to
# decode that file later.
# NOTE: the original wrapped the path in a single-argument os.path.join(),
# which is a no-op; the path is now passed directly.
eval_writer = bert_utils.FeatureWriter(filename=eval_records,
                                       is_training=False)
tokenizer = tokenization.FullTokenizer(vocab_file='vocab-nq.txt',
                                       do_lower_case=True)
# Features go straight to the writer via output_fn; no list is accumulated.
convert = bert_utils.ConvertExamples2Features(
    tokenizer=tokenizer,
    is_training=False,
    output_fn=eval_writer.process_feature,
    collect_stat=False)
n_examples = 0
for examples in bert_utils.nq_examples_iter(input_file=nq_test_file,
                                            is_training=False,
                                            tqdm=None):
    for example in examples:
        n_examples += convert(example)
eval_writer.close()

# Schema for decoding the records written above.  All sequence features are
# padded to max_seq_length, so fixed-length parsing is safe.
seq_length = bert_utils.FLAGS.max_seq_length  # same as config['max_position_embeddings']
name_to_features = {
    "unique_id": tf.io.FixedLenFeature([], tf.int64),
    "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
}
# Tokenize NQ test examples and stream BERT features to the (externally
# created) `eval_writer`, then declare the schema used to decode the records.
tokenizer = tokenization.FullTokenizer(
    vocab_file='../input/bert-joint-baseline/vocab-nq.txt',
    do_lower_case=True)
# Features are forwarded to eval_writer via output_fn; nothing is accumulated
# locally.
convert = bert_utils.ConvertExamples2Features(
    tokenizer=tokenizer,
    is_training=False,
    output_fn=eval_writer.process_feature,
    collect_stat=False)
n_examples = 0
# Progress bars are suppressed on the Kaggle server (no notebook frontend).
tqdm_notebook = tqdm.tqdm_notebook if not on_kaggle_server else None
for examples in bert_utils.nq_examples_iter(input_file=nq_test_file,
                                            is_training=False,
                                            tqdm=tqdm_notebook):
    for example in examples:
        n_examples += convert(example)
eval_writer.close()
print('number of test examples: %d, written to file: %d'
      % (n_examples, eval_writer.num_features))

# Schema for decoding the records written above; sequences are padded to
# max_seq_length so fixed-length parsing applies.
seq_length = FLAGS.max_seq_length  # same as config['max_position_embeddings']
name_to_features = {
    "unique_id": tf.io.FixedLenFeature([], tf.int64),
    "input_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
    "input_mask": tf.io.FixedLenFeature([seq_length], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([seq_length], tf.int64),
}
# Build the eval TFRecord only when it doesn't already exist, so re-running
# this cell is idempotent, then run inference over the decoded dataset.
if not url_exists(NQ_TEST_TFRECORD_PATH):
    eval_writer = bert_utils.FeatureWriter(filename=NQ_TEST_TFRECORD_PATH,
                                           is_training=False)
    tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_PATH,
                                           do_lower_case=True)
    # Features are streamed to the writer via output_fn; no local buffer.
    convert = bert_utils.ConvertExamples2Features(
        tokenizer=tokenizer,
        is_training=False,
        output_fn=eval_writer.process_feature,
        collect_stat=False)
    n_examples = 0
    for examples in bert_utils.nq_examples_iter(input_file=NQ_TEST_JSONL_PATH,
                                                is_training=False,
                                                tqdm=tqdm):
        for example in examples:
            n_examples += convert(example)
    eval_writer.close()
    print('number of test examples: %d, written to file: %d'
          % (n_examples, eval_writer.num_features))
#%%
# Decode the records and run batched inference.  drop_remainder is required
# on TPU, where all batches must have a static shape.
raw_ds = tf.data.TFRecordDataset(NQ_TEST_TFRECORD_PATH)
decoded_ds = raw_ds.map(_decode_record)
batched_ds = decoded_ds.batch(batch_size=BATCH_SIZE,
                              drop_remainder=(TPU is not None))
result = model.predict(batched_ds, verbose=1)
#%%