def convert_examples_to_features(self, examples, is_training, output_fn, batch_size):
  """Converts examples to features and write them into TFRecord file.

  Thin wrapper around `squad_lib.convert_examples_to_features` that fills in
  the tokenizer and the length limits configured on this instance.

  Args:
    examples: SQuAD examples to convert.
    is_training: Whether features are generated for training (vs. prediction).
    output_fn: Callback invoked for each produced feature.
    batch_size: Batch size used to pad the final batch during prediction.

  Returns:
    Whatever `squad_lib.convert_examples_to_features` returns (the number of
    examples processed).
  """
  conversion_kwargs = {
      'examples': examples,
      'tokenizer': self.tokenizer,
      'max_seq_length': self.seq_len,
      'doc_stride': self.doc_stride,
      'max_query_length': self.query_len,
      'is_training': is_training,
      'output_fn': output_fn,
      'batch_size': batch_size,
  }
  return squad_lib.convert_examples_to_features(**conversion_kwargs)
def _generate_tf_record_from_squad_file(cls,
                                        input_file_path,
                                        tokenizer,
                                        output_path,
                                        is_training,
                                        predict_batch_size=8,
                                        max_seq_length=384,
                                        max_query_length=64,
                                        doc_stride=128,
                                        version_2_with_negative=False):
  """Generates and saves training/validation data into a tf record file.

  Args:
    input_file_path: Path to the SQuAD-format JSON input file.
    tokenizer: Tokenizer used to split text into wordpieces.
    output_path: Path of the TFRecord file to write.
    is_training: Whether the data is for training (vs. prediction).
    predict_batch_size: Batch size used to pad the final predict batch.
    max_seq_length: Maximum total input sequence length after tokenization.
    max_query_length: Maximum number of tokens for the question.
    doc_stride: Stride between document chunks when splitting long documents.
    version_2_with_negative: Whether examples may contain no answer (SQuAD 2.0).

  Returns:
    Tuple of (meta_data dict, examples, features). For training, examples and
    features are returned empty to save memory.
  """
  examples = squad_lib.read_squad_examples(
      input_file=input_file_path,
      is_training=is_training,
      version_2_with_negative=version_2_with_negative)

  feature_writer = squad_lib.FeatureWriter(
      filename=output_path, is_training=is_training)
  kept_features = []

  def _append_feature(feature, is_padding):
    # Padding features only round out the final predict batch: write them to
    # the TFRecord but keep them out of the in-memory list.
    if not is_padding:
      kept_features.append(feature)
    feature_writer.process_feature(feature)

  # Training streams features straight to the writer with no batch padding;
  # prediction pads to predict_batch_size and also collects features in memory.
  example_count = squad_lib.convert_examples_to_features(
      examples=examples,
      tokenizer=tokenizer,
      max_seq_length=max_seq_length,
      doc_stride=doc_stride,
      max_query_length=max_query_length,
      is_training=is_training,
      output_fn=feature_writer.process_feature if is_training else _append_feature,
      batch_size=None if is_training else predict_batch_size)
  feature_writer.close()

  meta = {
      'size': example_count,
      'version_2_with_negative': version_2_with_negative,
  }
  if is_training:
    # Training examples are not needed by callers; drop them to save memory.
    examples = []
  return meta, examples, kept_features
# Feature-extraction hyperparameters for evaluation. Defined once up front so
# the values baked into the eval TFRecord and the dataset reader stay in sync
# (previously 384 and the batch size 4 were hard-coded in several places).
MAX_SEQ_LENGTH = 384  # input_meta_data['max_seq_length']
DOC_STRIDE = 128
MAX_QUERY_LENGTH = 64
BATCH_SIZE = 4

# Rebuild the tokenizer from the vocab asset shipped with the BERT layer so
# eval examples are tokenized exactly as during training.
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

# Bind eval_features BEFORE defining the closure that captures it; the
# original relied on late binding with the list created after the def.
eval_features = []

def _append_feature(feature, is_padding):
  # Padding features exist only to round out the final batch: write them to
  # the TFRecord but keep them out of the in-memory feature list.
  if not is_padding:
    eval_features.append(feature)
  eval_writer.process_feature(feature)

# Convert eval examples to features, writing them through eval_writer and
# collecting the non-padding ones in eval_features. Returns the example count.
dataset_size = convert_examples_to_features(
    examples=eval_examples,
    tokenizer=tokenizer,
    max_seq_length=MAX_SEQ_LENGTH,
    doc_stride=DOC_STRIDE,
    max_query_length=MAX_QUERY_LENGTH,
    is_training=False,
    output_fn=_append_feature,
    batch_size=BATCH_SIZE)
eval_writer.close()

# Build the evaluation tf.data pipeline from the TFRecord written above.
eval_dataset = create_squad_dataset(
    "/content/drive/My Drive/BERT/data/squad/eval.tf_record",
    MAX_SEQ_LENGTH,
    BATCH_SIZE,
    is_training=False)