def generate_squad_dataset():
    """Builds the SQuAD training TF records and returns the input meta data.

    Dispatches on FLAGS.tokenization: "WordPiece" uses a vocab file,
    "SentencePiece" uses an sp model file; everything else is shared.
    """
    assert FLAGS.squad_data_file
    # Keyword arguments common to both tokenizer-specific converters.
    shared_kwargs = dict(
        input_file_path=FLAGS.squad_data_file,
        output_path=FLAGS.train_data_output_path,
        translated_input_folder=FLAGS.translated_squad_data_folder,
        max_seq_length=FLAGS.max_seq_length,
        do_lower_case=FLAGS.do_lower_case,
        max_query_length=FLAGS.max_query_length,
        doc_stride=FLAGS.doc_stride,
        version_2_with_negative=FLAGS.version_2_with_negative,
        xlnet_format=FLAGS.xlnet_format)
    if FLAGS.tokenization == "WordPiece":
        return squad_lib_wp.generate_tf_record_from_json_file(
            vocab_file_path=FLAGS.vocab_file, **shared_kwargs)
    # Only two tokenizations are supported; anything else is a flag error.
    assert FLAGS.tokenization == "SentencePiece"
    return squad_lib_sp.generate_tf_record_from_json_file(
        sp_model_file=FLAGS.sp_model_file, **shared_kwargs)
def get_input_meta_data():
    """Converts the local SQuAD v1.1 JSON into TF records and returns meta data.

    Also persists the meta data as JSON under ./data/squad/ so later runs
    can rebuild the dataset without re-running the conversion.
    """
    train_json = "./data/train-v1.1.json"
    vocab_file = "./data/vocab.txt"
    record_out = "./data/train-v1.1.tf_record"

    # The converter writes the TF-record file and hands back the meta data
    # (e.g. max_seq_length) required when constructing the training dataset.
    input_meta_data = generate_tf_record_from_json_file(
        train_json, vocab_file, record_out)

    # Serialize once, then write; the trailing newline is deliberate
    # (\n just to make sure there is no issue with the code).
    serialized = json.dumps(input_meta_data, indent=4) + "\n"
    with tf.io.gfile.GFile("./data/squad/train_meta_data", "w") as writer:
        writer.write(serialized)

    return input_meta_data
# --- Example #3 (score: 0) ---
def generate_squad_dataset():
  """Generates squad training dataset and returns input meta data.

  Selects the converter by FLAGS.tokenizer_impl: "word_piece" (vocab-file
  based) or "sentence_piece" (sp-model based); all other flags are shared.
  """
  assert FLAGS.squad_data_file
  if FLAGS.tokenizer_impl == "word_piece":
    # WordPiece path: tokenizer is driven by the vocab file.
    return squad_lib_wp.generate_tf_record_from_json_file(
        FLAGS.squad_data_file,
        FLAGS.vocab_file,
        FLAGS.train_data_output_path,
        FLAGS.max_seq_length,
        FLAGS.do_lower_case,
        FLAGS.max_query_length,
        FLAGS.doc_stride,
        FLAGS.version_2_with_negative)
  # Otherwise SentencePiece is the only remaining supported tokenizer.
  assert FLAGS.tokenizer_impl == "sentence_piece"
  return squad_lib_sp.generate_tf_record_from_json_file(
      FLAGS.squad_data_file,
      FLAGS.sp_model_file,
      FLAGS.train_data_output_path,
      FLAGS.max_seq_length,
      FLAGS.do_lower_case,
      FLAGS.max_query_length,
      FLAGS.doc_stride,
      FLAGS.version_2_with_negative)
# --- Example #4 (score: 0) ---
from joblib import dump, load

import app.config as cf


from app.model.BERTSquad import BERTSquad
from app.model.squad_loss_fn import squad_loss_fn



if __name__ == "__main__":

    input_meta_data = generate_tf_record_from_json_file(
        cf.INPUTS_FILE_TRAIN,
        cf.INPUTS_FILE_VOCAB,
        cf.INPUTS_FILE_DEV
    )

    with tf.io.gfile.GFile(cf.TRAIN_META_DATA, "w") as writer:
        writer.write(json.dumps(input_meta_data, indent=4) + "\n")


    train_dataset = create_squad_dataset(
        cf.INPUTS_FILE_DEV,
        input_meta_data['max_seq_length'], # 384
        cf.BATCH_SIZE,
        is_training=True
    )

    train_dataset_light = train_dataset.take(cf.NB_BATCHES_TRAIN)
# --- Example #5 (score: 0) ---
import numpy as np
import math
import random
import time
import json
import collections
import os

from google.colab import drive

# Mount Google Drive so the SQuAD data and output records live under
# /content/drive and survive the Colab session.
drive.mount("/content/drive")

# Convert the raw SQuAD v1.1 JSON into a TF-record file on Drive; the
# returned meta data (e.g. max_seq_length) is reused below.
input_meta_data = generate_tf_record_from_json_file(
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.json",
    "/content/drive/MyDrive/BERT/ChatBot/vocab.txt",
    "/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record"
)

# Persist the meta data as pretty-printed JSON next to the records.
with tf.io.gfile.GFile("/content/drive/MyDrive/BERT/ChatBot/train_meta_data","w") as writer:
    writer.write(json.dumps(input_meta_data, indent=4)+"\n")

# Small batch size — presumably chosen for Colab GPU memory limits; TODO confirm.
BATCH_SIZE = 4
# Build the batched training dataset from the TF-record file written above.
train_dataset = create_squad_dataset("/content/drive/MyDrive/BERT/ChatBot/train-v1.1.tf_record",
                                     input_meta_data["max_seq_length"],
                                     BATCH_SIZE,
                                     is_training=True)

class BertSquadLayer(tf.keras.layers.Layer):

    def __init__(self):