def _dataset_fn(ctx=None):
  """Returns tf.data.Dataset for distributed SQuAD training."""
  batch_size = ctx.get_per_replica_batch_size(
      global_batch_size) if ctx else global_batch_size
  dataset = input_pipeline.create_squad_dataset(
      input_file_pattern,
      max_seq_length,
      batch_size,
      is_training=is_training,
      input_pipeline_context=ctx)
  return dataset
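# For context: this per-replica function is normally returned from a closure
# that captures the pipeline arguments, and the distribution strategy calls it
# once per worker with an InputContext. A minimal sketch, assuming `strategy`
# and the pipeline arguments are defined elsewhere (not part of the snippet):
def get_dataset_fn(input_file_pattern, max_seq_length, global_batch_size,
                   is_training):
  """Returns a closure over the pipeline arguments for the strategy to call."""
  def _dataset_fn(ctx=None):
    ...  # body as defined above
  return _dataset_fn

train_iter = iter(
    strategy.experimental_distribute_datasets_from_function(
        get_dataset_fn(input_file_pattern, max_seq_length, 32,
                       is_training=True)))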
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
  """Make predictions using a BERT-based SQuAD model."""
  primary_cpu_task = '/job:worker' if FLAGS.tpu else ''

  with tf.device(primary_cpu_task):
    predict_dataset = input_pipeline.create_squad_dataset(
        predict_tfrecord_path,
        input_meta_data['max_seq_length'],
        FLAGS.predict_batch_size,
        is_training=False)
    predict_iterator = iter(
        strategy.experimental_distribute_dataset(predict_dataset))

    with strategy.scope():
      # Prediction always uses float32, even if training uses mixed precision.
      tf.keras.mixed_precision.experimental.set_policy('float32')
      squad_model, _ = bert_models.squad_model(
          bert_config,
          input_meta_data['max_seq_length'],
          float_type=tf.float32)

    checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
    logging.info('Restoring checkpoints from %s', checkpoint_path)
    checkpoint = tf.train.Checkpoint(model=squad_model)
    checkpoint.restore(checkpoint_path).expect_partial()

    @tf.function
    def predict_step(iterator):
      """Predicts on distributed devices."""

      def _replicated_step(inputs):
        """Replicated prediction calculation."""
        x, _ = inputs
        unique_ids, start_logits, end_logits = squad_model(x, training=False)
        return dict(
            unique_ids=unique_ids,
            start_logits=start_logits,
            end_logits=end_logits)

      outputs = strategy.experimental_run_v2(
          _replicated_step, args=(next(iterator),))
      return tf.nest.map_structure(strategy.experimental_local_results, outputs)

    all_results = []
    for _ in range(num_steps):
      predictions = predict_step(predict_iterator)
      for result in get_raw_results(predictions):
        all_results.append(result)
      if len(all_results) % 100 == 0:
        logging.info('Made predictions for %d records.', len(all_results))
    return all_results
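# Hypothetical call site for the function above. `num_eval_features` (the
# number of features written to the prediction TFRecord) and
# `eval_tfrecord_path` are placeholder names, not taken from this snippet.
import math

num_steps = int(math.ceil(num_eval_features / FLAGS.predict_batch_size))
all_results = predict_squad_customized(
    strategy, input_meta_data, bert_config, eval_tfrecord_path, num_steps)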
def build_inputs(self, params, input_context=None):
  """Returns tf.data.Dataset for the question answering task."""
  if params.input_path == 'dummy':
    # Dummy training data for unit test.
    def dummy_data(_):
      dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
      x = dict(
          input_word_ids=dummy_ids,
          input_mask=dummy_ids,
          input_type_ids=dummy_ids)
      y = dict(
          start_positions=tf.constant(0, dtype=tf.int32),
          end_positions=tf.constant(1, dtype=tf.int32))
      return (x, y)

    dataset = tf.data.Dataset.range(1)
    dataset = dataset.repeat()
    dataset = dataset.map(
        dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

  if params.is_training:
    input_path = params.input_path
  else:
    input_path, self._eval_examples, self._eval_features = (
        self._preprocess_eval_data(params))

  batch_size = input_context.get_per_replica_batch_size(
      params.global_batch_size) if input_context else params.global_batch_size
  # TODO(chendouble): add and use nlp.data.question_answering_dataloader.
  dataset = input_pipeline.create_squad_dataset(
      input_path,
      params.seq_length,
      batch_size,
      is_training=params.is_training,
      input_pipeline_context=input_context)
  return dataset
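# Minimal smoke test through the built-in 'dummy' branch above. A plain
# namespace stands in for whatever config object the task actually takes
# (only the four fields build_inputs reads need to exist), and `task` is
# assumed to be an instance of the class this method belongs to.
import types

params = types.SimpleNamespace(
    input_path='dummy', seq_length=384, global_batch_size=4, is_training=True)
ds = task.build_inputs(params)
x, y = next(iter(ds))
print(x['input_word_ids'].shape)  # (1, 384)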
if __name__ == "__main__":
    input_meta_data = generate_tf_record_from_json_file(
        cf.INPUTS_FILE_TRAIN,
        cf.INPUTS_FILE_VOCAB,
        cf.INPUTS_FILE_DEV)

    with tf.io.gfile.GFile(cf.TRAIN_META_DATA, "w") as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + "\n")

    train_dataset = create_squad_dataset(
        cf.INPUTS_FILE_DEV,
        input_meta_data['max_seq_length'],  # 384
        cf.BATCH_SIZE,
        is_training=True)
    train_dataset_light = train_dataset.take(cf.NB_BATCHES_TRAIN)

    bert_squad = BERTSquad()

    optimizer = optimization.create_optimizer(
        init_lr=cf.INIT_LR,
        num_train_steps=cf.NB_BATCHES_TRAIN,
        num_warmup_steps=cf.WARMUP_STEPS)

    train_loss = tf.keras.metrics.Mean(name="train_loss")
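# The snippet stops before the optimization loop. A minimal sketch of the
# custom train step these tutorials typically pair with this setup;
# `squad_loss_fn` is defined here for illustration, not taken from the code.
def squad_loss_fn(labels, model_outputs):
    # Average the sparse cross-entropies over start and end positions.
    start_logits, end_logits = model_outputs
    start_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels['start_positions'], start_logits, from_logits=True)
    end_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels['end_positions'], end_logits, from_logits=True)
    return (tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)) / 2

@tf.function
def train_step(inputs, targets):
    with tf.GradientTape() as tape:
        loss = squad_loss_fn(targets, bert_squad(inputs, training=True))
    gradients = tape.gradient(loss, bert_squad.trainable_variables)
    optimizer.apply_gradients(zip(gradients, bert_squad.trainable_variables))
    train_loss(loss)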
import json

import tensorflow as tf
from google.colab import drive

drive.mount("/content/drive")

input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.json",
    "/content/drive/MyDrive/BERT/ChatBot/vocab.txt",
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record")

with tf.io.gfile.GFile("/content/drive/MyDrive/BERT/ChatBot/train_meta_data", "w") as writer:
    writer.write(json.dumps(input_meta_data, indent=4) + "\n")

BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record",
    input_meta_data["max_seq_length"],
    BATCH_SIZE,
    is_training=True)

class BertSquadLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))

    def call(self, inputs):
        logits = self.final_dense(inputs)         # (batch_size, seq_len, 2)
        logits = tf.transpose(logits, [2, 0, 1])  # (2, batch_size, seq_len)
        # The snippet was cut off after the transpose; the standard ending
        # splits the two score maps and returns them separately.
        unstacked_logits = tf.unstack(logits, axis=0)
        return unstacked_logits[0], unstacked_logits[1]  # start, end logits
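# Quick shape check on random activations: the transpose turns the per-token
# pair of scores into two separate (batch, seq_len) tensors. The 768-dim
# input is an assumption matching the BERT-base hidden size.
layer = BertSquadLayer()
sequence_output = tf.random.uniform((4, 384, 768))  # (batch, seq_len, hidden)
start_logits, end_logits = layer(sequence_output)
print(start_logits.shape, end_logits.shape)  # (4, 384) (4, 384)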
input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/My Drive/BERT/data/squad/train-v1.1.json",
    "/content/drive/My Drive/BERT/data/squad/vocab.txt",
    "/content/drive/My Drive/BERT/data/squad/train-v1.1.tf_record")

BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "/content/drive/My Drive/BERT/data/squad/train-v1.1.tf_record",
    input_meta_data['max_seq_length'],  # 384
    BATCH_SIZE,
    is_training=True)

# Stage 3: Model building

class BertSquadLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))

    def call(self, inputs):
        logits = self.final_dense(inputs)         # (batch_size, seq_len, 2)
        logits = tf.transpose(logits, [2, 0, 1])  # (2, batch_size, seq_len)
        # Completed as in the parallel snippet above: split the two score
        # maps and return them separately.
        unstacked_logits = tf.unstack(logits, axis=0)
        return unstacked_logits[0], unstacked_logits[1]
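# The layer above sits on top of a BERT encoder. A sketch of the wrapping
# model these tutorials build next, assuming the TF Hub BERT-base module used
# later in this section; the (pooled_output, sequence_output) return order is
# the documented convention for that hub module.
import tensorflow_hub as hub

class BERTSquad(tf.keras.Model):
    def __init__(self, name="bert_squad"):
        super(BERTSquad, self).__init__(name=name)
        self.bert_layer = hub.KerasLayer(
            "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
            trainable=True)
        self.squad_layer = BertSquadLayer()

    def call(self, inputs, training=False):
        # Only the per-token sequence output feeds the span-prediction head.
        _, sequence_output = self.bert_layer(
            [inputs["input_word_ids"], inputs["input_mask"],
             inputs["input_type_ids"]])
        return self.squad_layer(sequence_output)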
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=_append_feature,
    batch_size=4)

eval_writer.close()

# Load the ready-to-use dataset into our session
BATCH_SIZE = 4

eval_dataset = create_squad_dataset(
    "./data/squad/eval.tf_record",
    384,  # input_meta_data['max_seq_length']
    BATCH_SIZE,
    is_training=False)  # it's not for training

## -- Making the predictions

# The raw outputs must be collected in the input format Google's SQuAD
# post-processing tooling expects. A namedtuple is a tuple whose elements
# also carry names, so it behaves like a lightweight dictionary.
RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

# Yields each element of a batched output one at a time. The snippet was cut
# off mid-loop; the body below is the standard completion for this pattern.
def get_raw_results(predictions):
    for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'],
                                                    predictions['start_logits'],
                                                    predictions['end_logits']):
        yield RawResult(
            unique_id=unique_ids.numpy(),
            start_logits=start_logits.numpy().tolist(),
            end_logits=end_logits.numpy().tolist())
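# Once the RawResults are collected, these pipelines hand everything to the
# official post-processing helper to turn logits into answer strings. A
# sketch, assuming `squad_lib` from the TensorFlow Models repo and the
# example/feature lists collected during preprocessing (names below are
# placeholders, not from this snippet):
from official.nlp.data import squad_lib

squad_lib.write_predictions(
    all_predict_examples,   # SquadExample list from preprocessing
    eval_features,          # InputFeatures collected by _append_feature
    all_results,            # RawResult list gathered at prediction time
    n_best_size=20,
    max_answer_length=30,
    do_lower_case=True,
    output_prediction_file="./data/squad/predictions.json",
    output_nbest_file="./data/squad/nbest_predictions.json",
    output_null_log_odds_file="./data/squad/null_odds.json")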
import json
import math
import time
import os

# ********** Phase 2: Data Preprocessing **********

with open('./Bert_QA/data/squad/train_meta_data') as json_file:
    input_meta_data = json.load(json_file)

BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "./Bert_QA/data/squad/train-v1.1.tf_record",
    input_meta_data['max_seq_length'],  # 384
    BATCH_SIZE,
    is_training=True)

# ********** Phase 3: Model Building **********

# SQuAD layer: a head added after BERT that produces two lists of scores,
# one for the answer's start word and one for its end word.
class BertSquadLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,  # 2 output units: start score and end score
            # TruncatedNormal keeps initial weights small; stddev controls
            # how much the Gaussian is spread.
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
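# Before wiring the model, it helps to peek at one batch from the pipeline.
# The key names below match the features used elsewhere in this section; the
# exact set emitted by create_squad_dataset in training mode may differ.
x, y = next(iter(train_dataset))
print({k: v.shape for k, v in x.items()})
# e.g. input_word_ids / input_mask / input_type_ids -> (4, 384)
print({k: v.shape for k, v in y.items()})
# e.g. start_positions / end_positions -> (4,)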
try:
    with open('./data/squad/train_meta_data') as json_file:
        input_meta_data = json.load(json_file)
    print("train meta data file is available")
except FileNotFoundError:
    print("Generating train meta data file")
    input_meta_data = get_input_metadata()

# SQuAD inputs are much bigger than tweets: instead of a single short
# sentence, each example pairs one or more paragraphs with a question, so a
# large batch size can run into CPU/memory limits.
BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "./data/train-v1.1.tf_record",
    input_meta_data['max_seq_length'],  # no input sequence longer than 384 tokens
    BATCH_SIZE,
    is_training=True)  # stating it is for the training set

# ********** Phase 3: Model Building **********

# SQuAD layer: a head added after BERT that produces two lists of scores,
# one for the answer's start word and one for its end word.
class BertSquadLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,  # 2 output units
            # TruncatedNormal keeps initial weights small; stddev controls
            # how much the Gaussian is spread.
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=0.02))
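# `get_input_metadata()` is referenced above but not shown. A plausible
# sketch based on the preprocessing used elsewhere in this section; the
# helper's body and all paths are assumptions.
def get_input_metadata():
    input_meta_data = generate_tf_record_from_json_file(
        "./data/train-v1.1.json",       # raw SQuAD JSON (assumed path)
        "./data/vocab.txt",             # BERT vocab file (assumed path)
        "./data/train-v1.1.tf_record")  # TFRecord output used above
    with tf.io.gfile.GFile('./data/squad/train_meta_data', "w") as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + "\n")
    return input_meta_data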
    output_fn=_append_feature,
    batch_size=4)

my_bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False)
vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file, do_lower_case)

eval_dataset = create_squad_dataset(
    cf.OUTPUT_EVAL_FILE,
    384,  # input_meta_data['max_seq_length']
    cf.BATCH_SIZE,
    is_training=False)

RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

# Run bert_squad over the evaluation set and collect the raw logits.
all_results = []
for count, inputs in enumerate(eval_dataset):
    x, _ = inputs
    unique_ids = x.pop("unique_ids")
    start_logits, end_logits = bert_squad(x, training=False)
    output_dict = dict(
        unique_ids=unique_ids,
        start_logits=start_logits,
        end_logits=end_logits)
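    # The snippet cuts off at output_dict. A sketch of the usual continuation
    # of the loop body, assuming the get_raw_results generator defined in the
    # prediction code earlier in this section; the progress cadence is an
    # arbitrary choice for illustration.
    for result in get_raw_results(output_dict):
        all_results.append(result)
    if (count + 1) % 50 == 0:
        print(f"{count + 1} batches processed, {len(all_results)} results.")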