Example #1
def load_model(train_steps, num_warmup_steps):
    try:  # try to load a fine-tuned model from the local path first
        tokenizer = load_tokenizer()
        config = GPT2Config.from_pretrained(configs.model_path,
                                            return_dict=False)
        model = TFGPT2LMHeadModel.from_pretrained(configs.model_path,
                                                  return_dict=False)
        print("model loaded from local!")
    except Exception as e:
        tokenizer = BertTokenizer.from_pretrained(
            "mymusise/gpt2-medium-chinese")
        model = TFGPT2LMHeadModel.from_pretrained(
            "mymusise/gpt2-medium-chinese", return_dict=False)
        print("model loaded from remote!")

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        # metrics=[metric]
    )
    return model
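The snippet above omits its imports; the lines below sketch a plausible minimal set. `configs` and `load_tokenizer` are project-local assumptions, and `nlp` is assumed to come from the TensorFlow Model Garden package (`tf-models-official`).

import tensorflow as tf
from official import nlp                    # tf-models-official (assumption)
import official.nlp.optimization            # makes nlp.optimization.create_optimizer available
from transformers import GPT2Config, TFGPT2LMHeadModel, BertTokenizer

import configs                              # hypothetical project settings module (configs.model_path)
from tokenization import load_tokenizer     # hypothetical helper returning the local tokenizer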
Example #2
    def load_or_train_model(tokenizer, file_paths, gpt_model_path, cumulative_string_path):
        '''
        Tries to load previously trained model
        If there is none, runs the training on the generated dataset
        '''

        if os.path.exists(gpt_model_path):
            print('Loading GPT model')

            return TFGPT2LMHeadModel.from_pretrained(gpt_model_path)  # load from the saved directory

        else:

            print('GPT model not found, training one')

            # Creating the configurations from which the model can be made
            config = GPT2Config(
                vocab_size=tokenizer.vocab_size,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id
            )

            # Creating the model
            model = TFGPT2LMHeadModel(config)

            # Defining our optimizer
            optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)

            # Definining our loss function
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

            # Defining our metric which we want to observe
            metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

            # Compiling the model
            model.compile(optimizer=optimizer, loss=[loss, *[None] * model.config.n_layer], metrics=[metric])

            # Prepare training dataset
            dataset = GPTModel.load_or_generate_dataset(tokenizer, file_paths, cumulative_string_path)

            # Execute training
            num_epoch = 3
            model.fit(dataset, epochs=num_epoch)

            # Creating directory if it is not present
            os.mkdir(gpt_model_path)

            # Save the model (save_pretrained expects a directory, not a weights file path)
            model.save_pretrained(gpt_model_path)

            # Save the config
            model_to_save = model.module if hasattr(model, 'module') else model
            output_config_file = os.path.join(gpt_model_path, CONFIG_NAME)
            model_to_save.config.to_json_file(output_config_file)

            return model
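A minimal call sketch for the method above, assuming it is exposed on the GPTModel class it references; the tokenizer choice and paths are illustrative.

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")               # illustrative tokenizer
model = GPTModel.load_or_train_model(tokenizer,
                                     file_paths=["data/corpus.txt"],          # illustrative
                                     gpt_model_path="models/gpt2_scratch/",
                                     cumulative_string_path="data/corpus.cache")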
Example #3
def init_model(
    tokenizer: BertTokenizer,
    train_steps: int = 20000,
    num_warmup_steps: int = 1000,
    model_path: str = configs.model_path,
) -> TFGPT2LMHeadModel:

    try:
        model = TFGPT2LMHeadModel.from_pretrained(
            model_path, return_dict=False)
    except EnvironmentError:
        config = GPT2Config(
            architectures=["TFGPT2LMHeadModel"],
            model_type="TFGPT2LMHeadModel",
            tokenizer_class="BertTokenizer",
            vocab_size=tokenizer.vocab_size,
            n_positions=configs.model.n_positions,
            n_ctx=configs.model.n_ctx,
            n_embd=configs.model.n_embd,
            n_layer=configs.model.n_layer,
            n_head=configs.model.n_head,
            d_model=configs.model.n_embd,
            num_heads=configs.model.n_head,
            pad_token_id=tokenizer.pad_token_id,
            task_specific_params={
                "text-generation": {
                    "do_sample": True,
                    "max_length": 120
                }
            },
            return_dict=False,
            output_attentions=False,
            output_hidden_states=False,
            use_cache=False,
        )
        model = TFGPT2LMHeadModel(config)

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)

    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    # metric = Mymetrice('accuracy')

    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        metrics=[metric]
    )

    return model
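A short usage sketch for init_model; the Chinese BERT vocabulary is an assumption consistent with the BertTokenizer type hint, and the dataset preparation is not shown.

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")   # assumed vocabulary
model = init_model(tokenizer, train_steps=20000, num_warmup_steps=1000)
# model.fit(train_dataset, epochs=..., steps_per_epoch=...)      # dataset prep not shown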
Example #4
def load_model():
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # add the EOS token as PAD token to avoid warnings
    model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

    return tokenizer, model
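A short generation sketch using the pair returned above; the prompt and sampling settings are illustrative.

tokenizer, model = load_model()
input_ids = tokenizer.encode("The meaning of life is", return_tensors="tf")
output = model.generate(input_ids, max_length=50, do_sample=True, top_k=50, top_p=0.9)
print(tokenizer.decode(output[0], skip_special_tokens=True))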
Example #5
def alternate_sentences(pos, sentence):
    GPT2tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    GPT2model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2", pad_token_id=GPT2tokenizer.eos_token_id)
    #     GPT2tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    #     GPT2model = TFGPT2LMHeadModel.from_pretrained("distilgpt2",pad_token_id=GPT2tokenizer.eos_token_id)
    partial_sentence = get_np_vp(pos, sentence)
    input_ids = GPT2tokenizer.encode(partial_sentence, return_tensors='tf')
    maximum_length = len(partial_sentence.split()) + 40
    # Activate top_k sampling and top_p sampling with only from 90% most likely words
    sample_outputs = GPT2model.generate(
        input_ids,
        do_sample=True,
        max_length=maximum_length,
        top_p=0.80,  # 0.85 
        top_k=30,  #30
        repetition_penalty=10.0,
        num_return_sequences=10)
    generated_sentences = []
    sentence = sentence.replace("\n", "")
    for i, sample_output in enumerate(sample_outputs):
        decoded_sentence = GPT2tokenizer.decode(sample_output,
                                                skip_special_tokens=True)
        # final_sentence = decoded_sentence
        final_sentence = tokenize.sent_tokenize(decoded_sentence)[0]
        final_sentence = final_sentence.replace("\r\n", "")
        final_sentence = final_sentence.replace("\n", "")
        generated_sentences.append(final_sentence)
    generated_sentences.append(sentence)

    if len(generated_sentences) > 2:
        return generated_sentences[-2:]

    else:
        return generated_sentences
Example #6
def load_model(category):
    model_path = f"../models/model_gpt2_{category}"
    tokenizer = GPT2Tokenizer.from_pretrained(model_path,
                                              local_files_only=True)
    model = TFGPT2LMHeadModel.from_pretrained(model_path,
                                              local_files_only=True)
    return model, tokenizer
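Because local_files_only=True is passed, the per-category directory must already exist; a sketch of how it would typically be produced after fine-tuning (the category name is illustrative).

model.save_pretrained("../models/model_gpt2_sports")       # writes tf_model.h5 and config.json
tokenizer.save_pretrained("../models/model_gpt2_sports")   # writes vocab.json, merges.txt, ...
model, tokenizer = load_model("sports")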
Example #7
def generate_text(self,
                  prefix=None,
                  file_data=True,
                  max_length=512,
                  do_sample=True,
                  top_k=50,
                  top_p=0.9,
                  temperature=0.3,
                  return_sequences=2):
    '''
    Takes in initial text and generates additional text up to max_length using top-p sampling
    :param prefix: initial text to start with
    :param several other parameters: sampling hyperparameters with the given defaults
    :return: complete generated text
    '''

    if return_sequences < 1:
        raise Exception(
            "return sequences number is less than 1 (need an integer of atleast 1)"
        )

    if max_length < 1:
        raise Exception("Max text length must be equal to or greater than 1")

    with NoStdStreams():
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        model = TFGPT2LMHeadModel.from_pretrained(
            "gpt2", pad_token_id=tokenizer.eos_token_id)

    if file_data:
        f = open(self.dataset, "r")
        input_ids = tokenizer.encode(f.read(),
                                     return_tensors='tf',
                                     max_length=max_length - 1,
                                     truncation=True)
        f.close()
    else:
        input_ids = tokenizer.encode(prefix,
                                     return_tensors='tf',
                                     max_length=max_length - 1,
                                     truncation=True)

    logger("Generating text now...")
    tf.random.set_seed(0)
    output = model.generate(input_ids,
                            do_sample=do_sample,
                            max_length=max_length,
                            top_k=top_k,
                            top_p=top_p,
                            temperature=temperature,
                            num_return_sequences=return_sequences)
    total_text = ""
    for i, sample_output in enumerate(output):
        value = "{}: {}".format(
            i, tokenizer.decode(sample_output, skip_special_tokens=True))
        total_text += value

    self.models['text_generation'] = {"generated_text": total_text}
    return self.models['text_generation']
Example #8
 def __init__(self, inp_context):
     # user input context
     self.inp_context = inp_context
     # the transformers
     self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     # add the EOS token as PAD token to avoid warnings
     self.model = TFGPT2LMHeadModel.from_pretrained("gpt2",
             pad_token_id=self.tokenizer.eos_token_id)
Example #9
def load_model_tokenizer_GPT2():
    """
    Loads the GPT-2 model and tokenizer from a local directory ("gpt2" can be used instead to download from the Hub).
    """
    dir_path = os.path.dirname(os.path.realpath(__file__))
    tokenizer = GPT2Tokenizer.from_pretrained(f'{dir_path}\\gpt2_model')
    model = TFGPT2LMHeadModel.from_pretrained(f'{dir_path}\\gpt2_model')
    return tokenizer, model
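A one-time preparation sketch for the local gpt2_model directory the loader above expects (Windows-style path kept as in the snippet).

dir_path = os.path.dirname(os.path.realpath(__file__))
GPT2Tokenizer.from_pretrained("gpt2").save_pretrained(f'{dir_path}\\gpt2_model')
TFGPT2LMHeadModel.from_pretrained("gpt2").save_pretrained(f'{dir_path}\\gpt2_model')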
Example #10
    def load(self):
        self._tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        # check if model exists
        if os.path.exists(self._model_path):
            print('loading')
            # https://huggingface.co/transformers/training.html#fine-tuning-in-native-tensorflow-2
            self._model = TFGPT2LMHeadModel.from_pretrained(self._model_path)
            # self._model.load_weights(self._model_path)

        return self
Example #11
    def __init__(self, next_node):
        super().__init__(next_node)
        # self.dir_path = r'D:\BaiduNetdiskDownload\huggingface\gpt2-chinese-poem'
        # self.dir_path = r'D:\python\nlp_chat_robot\models\model_file\gpt2-chinese-poem'
        self.dir_path = config.poem_gen_node_dir_path
        self.tokenizer = BertTokenizer.from_pretrained(self.dir_path)
        self.model = TFGPT2LMHeadModel.from_pretrained(self.dir_path)

        self.text_generator = TextGenerationPipeline(self.model,
                                                     self.tokenizer)
Example #12
 def __init__(self, flags, model_path=HF_MODEL_PATH):
     if flags.model_type == 'tf':
         from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
         self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
         self.model = TFGPT2LMHeadModel.from_pretrained(
             model_path, pad_token_id=self.tokenizer.eos_token_id)
     else:
         from transformers import GPT2LMHeadModel, GPT2Tokenizer
         self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
         self.model = GPT2LMHeadModel.from_pretrained(
             model_path, pad_token_id=self.tokenizer.eos_token_id)
     self.flags = flags
Example #13
    def test_encoder_decoder_save_load_from_encoder_decoder(self):
        config = self.get_encoder_decoder_config_small()

        # create two random ViT/GPT2 models for vit-gpt2 & initialize weights (+cross_attention weights)
        encoder = TFViTModel(config.encoder)
        encoder(encoder.dummy_inputs)
        decoder = TFGPT2LMHeadModel(config.decoder)
        decoder(decoder.dummy_inputs)

        encoder_decoder_orig = TFVisionEncoderDecoderModel(encoder=encoder,
                                                           decoder=decoder)

        pixel_values = floats_tensor([
            13,
            encoder.config.num_channels,
            encoder.config.image_size,
            encoder.config.image_size,
        ])
        decoder_input_ids = ids_tensor([13, 1], decoder.config.vocab_size)

        logits_orig = encoder_decoder_orig(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids).logits

        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_path = os.path.join(tmp_dirname, "encoder")
            decoder_path = os.path.join(tmp_dirname, "decoder")

            encoder.save_pretrained(encoder_path)
            decoder.save_pretrained(decoder_path)

            encoder_decoder = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
                encoder_path, decoder_path)

        logits_1 = encoder_decoder(pixel_values=pixel_values,
                                   decoder_input_ids=decoder_input_ids).logits

        self.assertTrue(
            logits_orig.numpy().sum() - logits_1.numpy().sum() < 1e-3)

        max_diff = np.max(np.abs(logits_1.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)

        with tempfile.TemporaryDirectory() as tmp_dirname:
            encoder_decoder.save_pretrained(tmp_dirname)
            encoder_decoder = TFVisionEncoderDecoderModel.from_pretrained(
                tmp_dirname)

        logits_2 = encoder_decoder(pixel_values=pixel_values,
                                   decoder_input_ids=decoder_input_ids).logits

        max_diff = np.max(np.abs(logits_2.numpy() - logits_orig.numpy()))
        self.assertAlmostEqual(max_diff, 0.0, places=4)
Example #14
    def __init__(self, model_name, device, tf_pt='tf'):
        self.tf_pt = tf_pt
        self.device = device
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)

        if self.tf_pt == 'tf':
            self.model = TFGPT2LMHeadModel.from_pretrained(
                model_name, pad_token_id=self.tokenizer.eos_token_id)
        else:
            self.model = GPT2LMHeadModel.from_pretrained(
                model_name, pad_token_id=self.tokenizer.eos_token_id)

        if self.tf_pt != 'tf':
            # move the PyTorch model onto the requested device (TF manages placement itself)
            self.model.to(device)
Example #15
def main(highQualityMode=False):
    dir_path = os.path.dirname(os.path.realpath(__file__))
    tokenizer_save_path = dir_path + "/saved_tokenizer"
    model_out_dir = dir_path + "/model/"
    input_dir = dir_path + "/input.txt"
    output_dir = dir_path + "/output.txt"

    try:
        with open(input_dir, 'r') as r:
            text = r.read()
        os.remove(input_dir)
    except:
        raise SystemExit("Input not found!")

    translator = Translator()
    lang = translator.detect(text).lang
    if lang != "zh-CN":
        text = translator.translate(text, dest="zh-tw").text

    #load pretrained model
    tokenizer = getTokenizer(tokenizer_save_path)
    model = TFGPT2LMHeadModel.from_pretrained(model_out_dir)

    # encoding the input text
    start = time.time()
    input_ids = tokenizer.encode(text, return_tensors='tf')

    # generate output
    NUM_SEQUENCE = 3 if highQualityMode else 1
    beam_output = model.generate(input_ids,
                                 max_length=1000,
                                 num_beams=int(random.random() * 10) + 1,
                                 temperature=random.random() * 10 % 5 / 10 +
                                 0.5,
                                 no_repeat_ngram_size=2,
                                 num_return_sequences=NUM_SEQUENCE,
                                 top_k=int(random.random() * 100 % 40),
                                 top_p=1)

    beam_output = getHighQuality(tokenizer.eos_token_id, beam_output)

    with open(output_dir, 'w') as w:
        w.write("Time used: " + str(time.time() - start) + '\n')

        if lang != "zh-CN":
            output = translator.translate(tokenizer.decode(beam_output),
                                          dest=lang).text
        else:
            output = tokenizer.decode(beam_output)

        w.write(output)
Example #16
def create_model(args, vocab_size):
    """
    :param args:
    :param vocab_size:字典大小
    :return:
    """
    print('配置模型参数')
    # model_config = GPT2Config.from_json_file('config/model_config_dialogue_small.json')
    print(vocab_size)
    print('创建model')
    # model = TFGPT2LMHeadModel.from_pretrained('gpt2')
    if args.pretrained_model:  # 如果指定了预训练的GPT2模型
        model = TFGPT2LMHeadModel.from_pretrained(args.pretrained_model)
    else:  # 若没有指定预训练模型,则初始化模型
        print('初始化模型')
        model_config = GPT2Config.from_json_file(args.model_config)
        print('config:\n' + model_config.to_json_string())
        model = TFGPT2LMHeadModel(config=model_config)
        print('构造好模型')
        # 根据tokenizer的vocabulary调整GPT2模型的voca的大小
    #model.resize_token_embeddings(vocab_size)

    # model = TFGPT2LMHeadModel.from_pretrained()#实例化一个类
    return model, model.config.to_dict().get("n_ctx")
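A minimal call sketch, assuming args carries the two fields used above; the config path comes from the commented-out line, and the vocabulary size is illustrative.

import argparse
args = argparse.Namespace(pretrained_model=None,
                          model_config='config/model_config_dialogue_small.json')
model, n_ctx = create_model(args, vocab_size=13317)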
Example #17
    def _initialise_model_and_tokenizer(self):
        """
        Initialise model and tokenizer.
        ----------
        model_name_or_dir: str, optional
            either local dir containing the tf_model.h5, vocab.json, config.json, merges.txt OR the model
            shortcut name 'distilgpt2'
        """

        self.tokenizer = GPT2Tokenizer.from_pretrained(
            pretrained_model_name_or_path=self.model_name)
        self.tokenizer.pad_token = '[PAD]'
        self.tokenizer.decoder[
            self.tokenizer.pad_token_id] = self.tokenizer.pad_token
        self.keras_model = TFGPT2LMHeadModel.from_pretrained(self.model_name)
        self.get_candidate_word_probs(
            '.', ['warming', 'up'])  # because first prediction is always slow
Example #18
def main():
    # Initialize a tokenizer and model.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = TFGPT2LMHeadModel.from_pretrained("gpt2", from_pt=True)

    # Infinite loop.
    while True:
        #i = 0
        #while i == 0:
        # Take in the user input. If the input matches a certain
        # string, break out of the loop and exit the program.
        user_input = input("prompt:> ")
        if user_input == "<|endoftext|>":
            break
        #user_input = "And to the darkness, I cast a bright light. The shadow disolves "

        # Tokenize/encode the input text.
        #encoded_input = tokenizer.tokenize(user_input)
        encoded_input = tokenizer.encode(user_input, return_tensors="tf")

        # Generate samples.
        generated_samples = model.generate(encoded_input,
                                           max_length=150,
                                           num_return_sequences=10,
                                           no_repeat_ngram_size=2,
                                           repetition_penalty=1.5,
                                           top_p=0.92,
                                           temperature=0.85,
                                           do_sample=True,
                                           top_k=125,
                                           early_stopping=True)

        # Print samples
        for i, beam in enumerate(generated_samples):
            print("{}: {}".format(
                i, tokenizer.decode(beam, skip_special_tokens=True)))
            print()
        #i += 1

    # Exit the program.
    exit(0)
Example #19
  def __init__(self):
    """Possible states are 
    1. "await" (awaiting response)
    2. "proceed" (proceed with the conversation)- used to give the bot control over the converation"""
    self._state="await"
  
    """Possible Flags are 
    1. "Exec" (task Executed)
    2. "notExec" (proceed with the conversation)- used to give the bot control over the converation"""
    self._FLAG=None
    self._bert_base_case_mrpc_tokenizer=AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    self._bert_base_case_mrpc_model=TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
    self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self._gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=self._gpt2_tokenizer.eos_token_id)
    self.bert_large_uncased_whole_word_masking_finetuned_squad_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    self.bert_large_uncased_whole_word_masking_finetuned_squad_model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    self._DialoGP_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    self._DialoGP_model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")



    self._conversation_started=False
    self._conversation_ended=True
Example #20
def setup_model_finetuning(path_to_pretrained, tokenizer_en, tokenizer_lng):
    # load pre-trained models
    model = TFGPT2LMHeadModel.from_pretrained(path_to_pretrained)

    # setup new embedding matrix for fine-tuning
    weights = tf.stop_gradient(
        model.transformer.get_input_embeddings() \
        .weight.value()) \
        .numpy()

    # get mean embeddings
    mean_weights = tf.reduce_mean(weights, axis=0).numpy()

    new_vocab = tokenizer_lng.get_vocab()
    old_vocab = tokenizer_en.get_vocab()
    new_embeddings = tf.zeros([len(new_vocab), mean_weights.shape[0]]).numpy()

    for word, idx_new in new_vocab.items():
        idx_old = old_vocab.get(word, -1)
        if idx_old >= 0:
            new_embeddings[idx_new, :] = weights[idx_old, :]
        else:
            new_embeddings[idx_new, :] = mean_weights

    # set embeddings
    model.transformer.set_input_embeddings(tf.constant(new_embeddings))

    # freeze the transformer blocks; the embeddings and final layer norm stay trainable
    for layer in model.transformer.h:
        layer.trainable = False

    model.transformer.wte.trainable = True
    model.transformer.wpe.trainable = True
    model.transformer.ln_f.trainable = True

    return model
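A usage sketch for the fine-tuning setup above; the English tokenizer, the target-language tokenizer path, and the checkpoint name are assumptions.

from transformers import GPT2TokenizerFast
tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer_lng = GPT2TokenizerFast.from_pretrained("./bpe_tokenizer_lng")   # hypothetical tokenizer dir
model = setup_model_finetuning("gpt2", tokenizer_en, tokenizer_lng)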
Example #21
 def __init__(self, dir_path):
     super(GPT2Model, self).__init__()
     self.gpt2 = TFGPT2LMHeadModel.from_pretrained(dir_path)
Example #22
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
from transformers import TFGPT2LMHeadModel, BertTokenizer
import tensorflow as tf
import re
from flask import Flask
from flask import render_template
from flask import request, Response

app = Flask(__name__)
app.config['DEBUG'] = True
model = TFGPT2LMHeadModel.from_pretrained("gpt2-cn-50")


@app.route('/')
def index_main():
    return render_template('index.html')


@app.route('/random', methods=["GET", "POST"])
def get_text():
    if request.method == "GET":
        sentence = request.args.get("message")
        result = test_model(sentence)
        return Response(result)


def test_model(sentence):
    if " " not in sentence:
        sentence = re.sub("", " ", sentence)[1:]
Example #23
        return "negative"
    elif max((model.predict(text))[0]) == model.predict(text)[0][1]:
        return "neutral"
    else:
        return "positive"


def about_symbol(text):
    text = text.replace(".", ". ")
    text = text.replace(". . .", ". ")

    return text


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt = TFGPT2LMHeadModel.from_pretrained(
    "gpt2", pad_token_id=tokenizer.eos_token_id)

with open("train_pos_edit_full.json", encoding='utf-8') as json_file:
    json_data = json.load(json_file)
    json_string = json_data['splited_sentence']

seq_length = 1000
start = time.time()

text_list = []
output_list = []

for a in tqdm(range(len(json_string))):
    # print("-" * 100)
    for b in range(len(json_string[a])):
        input_text = json_string[a][b]
Example #24
# Reference: https://nlp.gluon.ai/api/modules/data.html
toked = tokenizer('안녕 하세요')   # the tokenizer seems mis-built ... it splits one character at a time?
print(toked)

toked_idx = vocab(toked)
print(toked_idx)

toked = vocab.to_tokens(toked_idx)
print(toked)

detoked = detokenizer(toked)
print(detoked)

''.join(toked).replace('▁', ' ')

model = TFGPT2LMHeadModel.from_pretrained(MODEL_PATH)
model.summary()

# build the seed input sentence for the model
tok = tokenizer('이때')   # tok = ['▁', '이', '때']
tok_idx = [vocab[vocab.bos_token]] + vocab[tok]     # tok_idx = [0, 47437, 47438, 47675]
input_ids = tf.convert_to_tensor(tok_idx)[None, :]  # convert to a tensor

# model output
output = model.generate(input_ids, max_length=50)

output

# convert the model output back to a string
out_tok_idx = output.numpy().tolist()[0]   # output token indices
out_tok = vocab.to_tokens(out_tok_idx)     # convert token indices back to token strings
Example #25
for i in range(num_gpus):
    devices.append("GPU:" + str(i))
strategy = tf.distribute.MirroredStrategy(devices=devices)
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
print(
    "============================ Loading model from pretrained and compiling ==========================="
)
with strategy.scope():
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    print("========================= Loading dataset ========================")
    train_dataset = tokenize(get_dataset(train_file), tokenizer,
                             truncate).batch(num_gpus)
    valid_dataset = tokenize(get_dataset(valid_file), tokenizer,
                             truncate).batch(num_gpus)
    model = TFGPT2LMHeadModel.from_pretrained(model_name)
    #Disable past key values
    model.config.use_cache = False
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = metrics.SparseCategoricalAccuracy(name='Accuracy')
    model.compile(optimizer=optimizer,
                  loss=[loss, *[None] * model.config.n_layer],
                  metrics=[metric])
print(
    "========================= Finetuning Model =================================="
)
model.fit(train_dataset, batch_size=64, epochs=num_epochs)
print(
    "========================= Evaluating Model =================================="
)
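The distributed snippet above relies on get_dataset and tokenize helpers that are not shown; a rough sketch under the assumption that get_dataset yields a list of strings and tokenize builds a shifted (inputs, labels) tf.data.Dataset.

def get_dataset(path):
    # one training example per non-empty line (assumption)
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def tokenize(texts, tokenizer, truncate):
    enc = tokenizer(texts, truncation=True, max_length=truncate,
                    padding="max_length", return_tensors="tf")
    ids = enc["input_ids"]
    # causal LM: predict token t+1 from tokens up to t
    return tf.data.Dataset.from_tensor_slices((ids[:, :-1], ids[:, 1:]))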
Example #26
 def __init__(self):
     self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
     self.model = TFGPT2LMHeadModel.from_pretrained(
         'gpt2', pad_token_id=self.tokenizer.eos_token_id)
Example #27
def run(inp, outString):
    #tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    #model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
    tokenizer = GPT2Tokenizer.from_pretrained("ATLA_GPT2", from_pt=True)
    model = TFGPT2LMHeadModel.from_pretrained(
        "ATLA_GPT2", from_pt=True, pad_token_id=tokenizer.eos_token_id)
    stop_token = tokenizer.encode('\n', return_tensors='tf')
    while (True):
        i = ""
        server_key = ''
        while (inp.empty()):
            pass
        while (not inp.empty()):
            tmp = inp.get()
            server_key = tmp['server_key']
            i += tmp['SpeechResult']
        '''
        #Collect total input from memory
        #Currently throws an error when using beams
        st = ''
        for s in paragraph:
        	st += s
        i = st + i
        '''
        input_ids = tokenizer.encode(i, return_tensors='tf')  # Batch size 1

        # set no_repeat_ngram_size to 2
        '''
        beam_output = model.generate(
            input_ids, 
            num_beams=3, 
            no_repeat_ngram_size=2, 
            early_stopping=True,
            s ='\n'
        )
        '''

        beam_output = model.generate(input_ids,
                                     do_sample=True,
                                     max_length=50,
                                     top_k=50)
        '''
        beam_output = model.generate(
            input_ids, 
            do_sample=True, 
            max_length=50, 
            top_p=0.92, 
            top_k=0
        )
        '''
        #max_length=len(i) + 30,
        '''
        beam_output = model.generate(
            input_ids
        )
        '''
        #print("Output:\n" + 100 * '-')
        #print(tokenizer.decode(beam_output[0], skip_special_tokens=True))
        out = (tokenizer.decode(beam_output[0], skip_special_tokens=False))
        paragraph.append(out)
        out = out[len(i):len(out)]
        outString.put(out)
        data = {"GPT2_RESULT": out, "server_key": server_key}
        print(data)
        #print(data)
        #print(paragraph)
        resp = requests.post(url, json=data)
Example #28
    config = base_model.config
    tokenizer = BertTokenizer.from_pretrained(vocab_name)
    trainer = Trainer
elif FLAGS.pretrained_model_name == "bert_mini_uncased":
    pretrained_model_name = "uncased_L-4_H-256_A-4"
    vocab_name = f"{PRETRAINED_MODELS_DIR}/{pretrained_model_name}/"
    config = read_bert_config(pretrained_model_name)
    base_model = TFBertModel(config)
    base_model = ModelManager.load_pretrained_model(
        base_model, f"{pretrained_model_name}/bert_model.ckpt.index")
    tokenizer = BertTokenizer.from_pretrained(vocab_name)
    trainer = Trainer
elif FLAGS.pretrained_model_name == "gpt2":
    pretrained_model_name = "gpt2"
    vocab_name = "gpt2"
    base_model = TFGPT2LMHeadModel.from_pretrained(pretrained_model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(vocab_name)
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    base_model.config.pad_token_id = tokenizer.pad_token_id
    config = base_model.config
    trainer = GPTTrainer
else:
    raise NotImplementedError()

embedder = Embedder(pretrained_model_name, tokenizer)
model = Transformer(embedder=embedder,
                    model=base_model,
                    hidden_state_size=config.hidden_size,
                    max_sequence_length=150)

if FLAGS.load_model:
 def get_encoder_decoder_models(self):
     encoder_model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k", name="encoder")
     decoder_model = TFGPT2LMHeadModel.from_pretrained("../gpt2", config=self.get_decoder_config(), name="decoder")
     return {"encoder": encoder_model, "decoder": decoder_model}
 def get_encoder_decoder_model(self, config, decoder_config):
     encoder_model = TFViTModel(config, name="encoder")
     decoder_model = TFGPT2LMHeadModel(decoder_config, name="decoder")
     return encoder_model, decoder_model
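For context, these two helpers are typically combined into a single vision-encoder / text-decoder model, as in the save/load test shown earlier.

model = TFVisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)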