Example #1
 def _dataset_fn(ctx=None):
     """Returns tf.data.Dataset for distributed BERT pretraining."""
     batch_size = ctx.get_per_replica_batch_size(
         global_batch_size) if ctx else global_batch_size
     dataset = input_pipeline.create_squad_dataset(
         input_file_pattern,
         max_seq_length,
         batch_size,
         is_training=is_training,
         input_pipeline_context=ctx)
     return dataset
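A dataset function with this signature is typically handed to a tf.distribute strategy, which calls it once per input pipeline and passes in a tf.distribute.InputContext. The wiring below is a minimal sketch, assuming the variables used above (global_batch_size, input_file_pattern, max_seq_length, is_training) are in scope; the MirroredStrategy is just a stand-in for whatever strategy the training script actually uses.

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

# The strategy invokes _dataset_fn with an InputContext, which is what
# get_per_replica_batch_size() above uses to shard the global batch size.
dist_dataset = strategy.experimental_distribute_datasets_from_function(_dataset_fn)

for per_replica_batch in dist_dataset:
    # feed per_replica_batch to a distributed train step (strategy.run / experimental_run_v2)
    break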
Example #2
def predict_squad_customized(strategy, input_meta_data, bert_config,
                             predict_tfrecord_path, num_steps):
    """Make predictions using a Bert-based squad model."""
    primary_cpu_task = '/job:worker' if FLAGS.tpu else ''

    with tf.device(primary_cpu_task):
        predict_dataset = input_pipeline.create_squad_dataset(
            predict_tfrecord_path,
            input_meta_data['max_seq_length'],
            FLAGS.predict_batch_size,
            is_training=False)
        predict_iterator = iter(
            strategy.experimental_distribute_dataset(predict_dataset))

        with strategy.scope():
            # Prediction always uses float32, even if training uses mixed precision.
            tf.keras.mixed_precision.experimental.set_policy('float32')
            squad_model, _ = bert_models.squad_model(
                bert_config,
                input_meta_data['max_seq_length'],
                float_type=tf.float32)

        checkpoint_path = tf.train.latest_checkpoint(FLAGS.model_dir)
        logging.info('Restoring checkpoints from %s', checkpoint_path)
        checkpoint = tf.train.Checkpoint(model=squad_model)
        checkpoint.restore(checkpoint_path).expect_partial()

        @tf.function
        def predict_step(iterator):
            """Predicts on distributed devices."""
            def _replicated_step(inputs):
                """Replicated prediction calculation."""
                x, _ = inputs
                unique_ids, start_logits, end_logits = squad_model(
                    x, training=False)
                return dict(unique_ids=unique_ids,
                            start_logits=start_logits,
                            end_logits=end_logits)

            outputs = strategy.experimental_run_v2(_replicated_step,
                                                   args=(next(iterator), ))
            return tf.nest.map_structure(strategy.experimental_local_results,
                                         outputs)

        all_results = []
        for _ in range(num_steps):
            predictions = predict_step(predict_iterator)
            for result in get_raw_results(predictions):
                all_results.append(result)
            if len(all_results) % 100 == 0:
                logging.info('Made predictions for %d records.',
                             len(all_results))
        return all_results
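The get_raw_results helper used in the loop above is not defined in this snippet. A sketch of what it usually looks like in this kind of code is shown below; the exact field handling is an assumption, not the original implementation.

import collections

RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

def get_raw_results(predictions):
    """Yields one RawResult per example from the per-replica prediction dicts."""
    # Each value in `predictions` is a list of per-replica tensors, as returned
    # by strategy.experimental_local_results in predict_step above.
    for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'],
                                                    predictions['start_logits'],
                                                    predictions['end_logits']):
        for values in zip(unique_ids.numpy(), start_logits.numpy(),
                          end_logits.numpy()):
            yield RawResult(unique_id=values[0],
                            start_logits=values[1].tolist(),
                            end_logits=values[2].tolist())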
Example #3
  def build_inputs(self, params, input_context=None):
    """Returns tf.data.Dataset for the question answering (SQuAD) task."""
    if params.input_path == 'dummy':
      # Dummy training data for unit test.
      def dummy_data(_):
        dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
        x = dict(
            input_word_ids=dummy_ids,
            input_mask=dummy_ids,
            input_type_ids=dummy_ids)
        y = dict(
            start_positions=tf.constant(0, dtype=tf.int32),
            end_positions=tf.constant(1, dtype=tf.int32))
        return (x, y)

      dataset = tf.data.Dataset.range(1)
      dataset = dataset.repeat()
      dataset = dataset.map(
          dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
      return dataset

    if params.is_training:
      input_path = params.input_path
    else:
      input_path, self._eval_examples, self._eval_features = (
          self._preprocess_eval_data(params))

    batch_size = input_context.get_per_replica_batch_size(
        params.global_batch_size) if input_context else params.global_batch_size
    # TODO(chendouble): add and use nlp.data.question_answering_dataloader.
    dataset = input_pipeline.create_squad_dataset(
        input_path,
        params.seq_length,
        batch_size,
        is_training=params.is_training,
        input_pipeline_context=input_context)
    return dataset
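For a quick smoke test of the 'dummy' branch above, the method can be called with a stand-in config object. The SimpleNamespace below is a hypothetical substitute for the real params config, and `task` is assumed to be an instance of the surrounding task class.

import types

params = types.SimpleNamespace(
    input_path='dummy',
    seq_length=384,
    global_batch_size=4,
    is_training=True)

dataset = task.build_inputs(params)
features, labels = next(iter(dataset))
# Per dummy_data above, each feature tensor has shape (1, seq_length).
print(features['input_word_ids'].shape)  # (1, 384)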
Example #4
if __name__ == "__main__":

    input_meta_data = generate_tf_record_from_json_file(
        cf.INPUTS_FILE_TRAIN,
        cf.INPUTS_FILE_VOCAB,
        cf.INPUTS_FILE_DEV
    )

    with tf.io.gfile.GFile(cf.TRAIN_META_DATA, "w") as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + "\n")


    train_dataset = create_squad_dataset(
        cf.INPUTS_FILE_DEV,
        input_meta_data['max_seq_length'], # 384
        cf.BATCH_SIZE,
        is_training=True
    )

    train_dataset_light = train_dataset.take(cf.NB_BATCHES_TRAIN)

    bert_squad = BERTSquad()

    optimizer = optimization.create_optimizer(
        init_lr=cf.INIT_LR,
        num_train_steps=cf.NB_BATCHES_TRAIN,
        num_warmup_steps=cf.WARMUP_STEPS
    )

    train_loss = tf.keras.metrics.Mean(name="train_loss")
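The optimizer and train_loss metric set up above usually feed a custom training loop. The sketch below shows one plausible shape for it: the loss (sparse cross-entropy over start/end positions) is the standard SQuAD formulation, BERTSquad is assumed to return (start_logits, end_logits), and cf.NB_EPOCHS is a hypothetical config value not defined in the snippet.

def squad_loss_fn(labels, start_logits, end_logits):
    # Cross-entropy of the predicted start/end distributions against the
    # ground-truth start/end token positions.
    start_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels['start_positions'], start_logits, from_logits=True)
    end_loss = tf.keras.losses.sparse_categorical_crossentropy(
        labels['end_positions'], end_logits, from_logits=True)
    return tf.reduce_mean(start_loss) + tf.reduce_mean(end_loss)

@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        start_logits, end_logits = bert_squad(inputs, training=True)
        loss = squad_loss_fn(labels, start_logits, end_logits)
    gradients = tape.gradient(loss, bert_squad.trainable_variables)
    optimizer.apply_gradients(zip(gradients, bert_squad.trainable_variables))
    train_loss(loss)

for epoch in range(cf.NB_EPOCHS):
    for inputs, labels in train_dataset_light:
        train_step(inputs, labels)
    print(f"Epoch {epoch + 1}: loss {train_loss.result():.4f}")
    train_loss.reset_states()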
Example #5
from google.colab import drive

drive.mount("/content/drive")

input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.json",
    "/content/drive/MyDrive/BERT/ChatBot/vocab.txt",
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record"
)

with tf.io.gfile.GFile("/content/drive/MyDrive/BERT/ChatBot/train_meta_data","w") as writer:
    writer.write(json.dumps(input_meta_data, indent=4)+"\n")

BATCH_SIZE = 4
train_dataset = create_squad_dataset("/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record",
                                     input_meta_data["max_seq_length"],
                                     BATCH_SIZE,
                                     is_training=True)

class BertSquadLayer(tf.keras.layers.Layer):

    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02)
        )
    
    def call(self, inputs):
        logits = self.final_dense(inputs)

        logits = tf.transpose(logits, [2, 0, 1])
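The call method is truncated here. After the transpose the logits have shape (2, batch_size, seq_len), and a common continuation, shown below as an assumption rather than the original code, splits them into start and end scores.

        # Assumed continuation of call(): split the (2, batch, seq_len) tensor
        # into per-token start and end logits.
        unstacked_logits = tf.unstack(logits, axis=0)
        return unstacked_logits[0], unstacked_logits[1]  # start_logits, end_logits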
Example #6
input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/My Drive/BERT/data/squad/train-v1.1.json",
    "/content/drive/My Drive/BERT/data/squad/vocab.txt",
    "/content/drive/My Drive/BERT/data/squad/train-v1.1.tf_record")

BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "/content/drive/My Drive/BERT/data/squad/train-v1.1.tf_record",
    input_meta_data['max_seq_length'], # 384
    BATCH_SIZE,
    is_training=True)


# Stage 3: Model building

class BertSquadLayer(tf.keras.layers.Layer):

  def __init__(self):
    super(BertSquadLayer, self).__init__()
    self.final_dense = tf.keras.layers.Dense(
        units=2,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))

  def call(self, inputs):
Example #7
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    output_fn=_append_feature,
    batch_size=4)

eval_writer.close()

# Load the ready-to-be-used dataset to our session
BATCH_SIZE = 4

eval_dataset = create_squad_dataset(
    "./data/squad/eval.tf_record",
    384,        #input_meta_data['max_seq_length'],
    BATCH_SIZE,
    is_training=False)  # this dataset is not for training

## -- Making the predictions
'''
The predictions have to be converted into the input format expected by the
official Google SQuAD evaluation utilities.
'''

# collections.namedtuple defines a lightweight tuple type whose fields can be
# accessed by name (an immutable, dictionary-like record).
RawResult = collections.namedtuple("RawResult", ["unique_id", "start_logits", "end_logits"])
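# Illustration only (not part of the original script): a RawResult behaves like
# a tuple whose fields are also accessible by name.
_example = RawResult(unique_id=1000001, start_logits=[0.1, 2.3], end_logits=[0.4, 1.7])
print(_example.unique_id, _example.start_logits)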

# Yields the elements of the batched prediction output one at a time
def get_raw_results(predictions):
    for unique_ids, start_logits, end_logits in zip(predictions['unique_ids'],
Example #8
import json
import math
import time
import os


# ********** Phase 2: Data Preprocessing  **********

with open('./Bert_QA/data/squad/train_meta_data') as json_file: 
    input_meta_data = json.load(json_file)

BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "./Bert_QA/data/squad/train-v1.1.tf_record",
    input_meta_data['max_seq_length'], # 384
    BATCH_SIZE,
    is_training=True)

# ********** Phase 3: Model Building  **********

# SQuAD layer: a head added on top of BERT.
# Produces two lists of scores: one for the answer's start word, one for its end word.
class BertSquadLayer(tf.keras.layers.Layer):

    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,  # two output units: start score and end score
            # 'TruncatedNormal' keeps the initial weights small; stddev controls their spread.
            kernel_initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02))
Example #9
try:
    with open('./data/squad/train_meta_data') as json_file:
        input_meta_data = json.load(json_file)
    print("train meta data file is available")
except FileNotFoundError:
    print("Generating train meta data file")
    input_meta_data = get_input_metadata()
'''
* The inputs here are much larger, so a big batch size can run into memory/CPU limits.
* Unlike tweets, which are short single sentences, here we usually deal with
  several paragraphs of context plus the question sentence.
'''
BATCH_SIZE = 4

train_dataset = create_squad_dataset(
    "./data/train-v1.1.tf_record",
    input_meta_data['max_seq_length'],  # input sequences are capped at 384 tokens
    BATCH_SIZE,
    is_training=True)  # this dataset is for training

# ********** Phase 3: Model Building  **********


# SQuAD layer: a head added on top of BERT.
# Produces two lists of scores: one for the answer's start word, one for its end word.
class BertSquadLayer(tf.keras.layers.Layer):
    def __init__(self):
        super(BertSquadLayer, self).__init__()
        self.final_dense = tf.keras.layers.Dense(
            units=2,  # two output units: start score and end score
            # 'TruncatedNormal' keeps the initial weights small; stddev controls their spread.
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
Example #10
                                                output_fn=_append_feature,
                                                batch_size=4)

    my_bert_layer = hub.KerasLayer(
        "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
        trainable=False)

    vocab_file = my_bert_layer.resolved_object.vocab_file.asset_path.numpy()

    do_lower_case = my_bert_layer.resolved_object.do_lower_case.numpy()

    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    eval_dataset = create_squad_dataset(
        cf.OUTPUT_EVAL_FILE,
        384,
        #input_meta_data['max_seq_length'],
        cf.BATCH_SIZE,
        is_training=False)

    RawResult = collections.namedtuple(
        "RawResult", ["unique_id", "start_logits", "end_logits"])

    # Run inference with the previously built `bert_squad` model.

    all_results = []
    for count, inputs in enumerate(eval_dataset):
        x, _ = inputs
        unique_ids = x.pop("unique_ids")
        start_logits, end_logits = bert_squad(x, training=False)
        output_dict = dict(unique_ids=unique_ids,
                           start_logits=start_logits,