import gpt_2_simple as gpt2


def trainModel(dictionary, model_size="124M"):
    # Collect the "texto" field of every article entry, skipping any integer values.
    text = ""
    for article in dictionary.values():
        if isinstance(article, int):
            continue
        text += article["texto"] + " "
    # Write the corpus to disk, then pre-encode it for faster finetuning.
    with open("data.txt", "w+") as f:
        f.write(text)
    gpt2.encode_dataset("data.txt")
Example #2
def encode_and_compress(inFile):
    # Pre-encode the corpus into a compressed .npz archive for faster loading during finetuning.
    gpt2.encode_dataset(inFile)
Example #3
#!/usr/bin/env python

import os

import gpt_2_simple as gpt2

STEPS = 500
MODEL_NAME = '355M'
FILE_PATH = 'tweets'

if not os.path.isdir(os.path.join('models', MODEL_NAME)):
    print(f'Downloading {MODEL_NAME} model...')
    gpt2.download_gpt2(model_name=MODEL_NAME)

# Pre-encode the CSV corpus; the resulting text_encoded.npz is what finetune() loads below.
gpt2.encode_dataset(f'{FILE_PATH}.csv')

sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              'text_encoded.npz',
              model_name=MODEL_NAME,
              steps=STEPS,
              restore_from='fresh',
              run_name='run1',
              print_every=10,
              sample_every=100)
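A possible follow-up to the script above (not part of the original example): once finetuning has finished, the 'run1' checkpoint can be reloaded and sampled with gpt-2-simple; the sampling parameters below are illustrative assumptions.

import gpt_2_simple as gpt2

sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='run1')  # reads checkpoint/run1 written by finetune()
gpt2.generate(sess, run_name='run1', length=140, temperature=0.7, nsamples=3)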
Example #4
import gpt_2_simple as gpt2
import pandas as pd
from tqdm import tqdm
import os

input_data_file = './data/2-quotes_filtered.csv'
output_folder = "output/"

split_quotes = pd.read_csv(input_data_file)
print(split_quotes.head())

## These were the top 20 most common topics in the dataset
topics_to_keep = ['life', 'love','inspirational', 'humor', 
                  'death', 'art', 'education', 'books', 'change', 'time', 
                  'beauty', 'god', 'happiness', 'children', 'work', 'faith', 
                  'funny', 'good', 'family', 'friendship']

quotes_to_keep = split_quotes[split_quotes['topic'].isin(topics_to_keep)]

## Create the GPT-ready dataset
file_name = os.path.join(output_folder, "processed_quotes.txt")
with open(file_name, mode='w') as open_file:
  for index, row in quotes_to_keep.iterrows():
    open_file.write("_TOPIC_ {} _QUOTE_ {} _AUTHOR_ {} _END_\n".format(row['topic'], row['quote'], row['author']))

## Encode it to make loading faster during training
gpt2.encode_dataset(file_name, out_path=os.path.join(output_folder,'text_encoded.npz'))
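A sketch of how the encoded dataset might be used next, assuming the 124M model is already downloaded; the run name, step count, and sampling parameters are illustrative, and only the _TOPIC_/_QUOTE_/_END_ control tokens come from the dataset format above.

import gpt_2_simple as gpt2

sess = gpt2.start_tf_sess()
gpt2.finetune(sess, 'output/text_encoded.npz', model_name='124M',
              steps=1000, run_name='quotes')

# Prompt with the same control tokens used when writing processed_quotes.txt,
# and cut the sample off at the end-of-quote marker.
gpt2.generate(sess, run_name='quotes', prefix='_TOPIC_ love _QUOTE_',
              truncate='_END_', length=120)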
Example #5
import sys

import gpt_2_simple as gpt2
import pandas as pd
from os import path

if len(sys.argv) < 3:
    print('You must enter the corpus file and model name as parameters, e.g.: bot.py comment_data.txt 355M')
    sys.exit(1)    

file_name = sys.argv[1]
model_name = sys.argv[2]

sess = gpt2.start_tf_sess()

# Check if file exists
if not path.isfile(file_name):
    print("File does not exist. Please use a text corpus for training.")
    sys.exit(1)    

# Encode data if not already encoded
if not file_name.endswith(".npz"):
    old_file_name = file_name
    # Derive the .npz path from the original filename regardless of its extension
    file_name = path.splitext(file_name)[0] + ".npz"
    print("Encoding data...")
    try:
        gpt2.encode_dataset(old_file_name, out_path=file_name, model_name=model_name)
    except Exception:
        print("Failed to encode data. Please check that your file is a text corpus that can be encoded.")
        sys.exit(1)

gpt2.finetune(sess, file_name, model_name=model_name, multi_gpu=False, overwrite=True)
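Note that this script assumes the chosen model has already been downloaded, since the model files are expected under models/<model_name>. A hedged sketch of a guard that could be added near the top, mirroring Example #3:

import os

if not os.path.isdir(os.path.join('models', model_name)):
    print(f'Downloading {model_name} model...')
    gpt2.download_gpt2(model_name=model_name)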

Example #6
import argparse
import os
import time

import gpt_2_simple as gpt2

# GPT2Finetuner, the TOKEN_* constants, and is_looping() are defined elsewhere
# in the surrounding module and are not shown in this example.


def main():
    """
    The main function.
    """
    parser = argparse.ArgumentParser(
        description="Finetune a GPT-2 model using ff2zim")
    parser.add_argument("-d",
                        "--debug",
                        action="store_true",
                        help="show debug information")
    subparsers = parser.add_subparsers(dest="action",
                                       help="action to perform",
                                       required=True)

    # parser for generating trainingfile
    tfparser = subparsers.add_parser(
        "generate-trainingfile",
        help="generate the trainingfile from a ff2zim project")
    tfparser.add_argument("project", help="path to ff2zim project")
    tfparser.add_argument("trainingfile", help="path to write trainingfile to")
    tfparser.add_argument(
        "--add-epub",
        action="store",
        nargs="*",
        help="add an epub or a directory of epubs to the trainingfile",
        metavar="PATH",
        dest="epubpaths")

    # parser for encoding the trainingfile
    eparser = subparsers.add_parser(
        "encode-trainingfile",
        help="encode a trainingfile for better performance")
    eparser.add_argument("trainingfile", help="path to trainingfile to encode")
    eparser.add_argument("outfile", help="path to write to")
    eparser.add_argument("model", help="model to encode for")

    # parser for finetuning
    finetuneparser = subparsers.add_parser(
        "finetune", help="finetune a gpt-2 model using a trainingfile")
    finetuneparser.add_argument("trainingfile", help="path to trainingfile")
    finetuneparser.add_argument("--model",
                                action="store",
                                default="124M",
                                help="model to use")
    finetuneparser.add_argument("--run-name",
                                action="store",
                                dest="runname",
                                default="run1",
                                help="run name for finetuned model.")

    # parser for generating
    genparser = subparsers.add_parser(
        "generate", help="generate a sample with an interactive prompt")
    genparser.add_argument("--model",
                           action="store",
                           default="124M",
                           help="model to use")
    genparser.add_argument("--run-name",
                           action="store",
                           dest="runname",
                           default="run1",
                           help="run name for finetuned model.")
    genparser.add_argument("-n",
                           "--numsamples",
                           action="store",
                           type=int,
                           help="number of samples to generate",
                           default=1)
    genparser.add_argument("-m",
                           "--mode",
                           action="store",
                           choices=("story", "chapter", "complete"),
                           default="story")

    ns = parser.parse_args()

    if ns.action == "generate-trainingfile":
        print("Generating trainingfile...")
        trainingfile = ns.trainingfile
        finetuner = GPT2Finetuner(ns.project, ns.epubpaths)
        num_stories, num_epubs = finetuner.create_training_file(trainingfile)
        print("Trainingfile successfully created.")
        print("Included: {} fanfics and {} epubs.".format(
            num_stories, num_epubs))
        return

    elif ns.action == "encode-trainingfile":
        print("Encoding trainingfile...")
        gpt2.encode_dataset(ns.trainingfile,
                            out_path=ns.outfile,
                            model_name=ns.model)
        print("Done.")
        return

    elif ns.action == "finetune":
        model = ns.model
        if not os.path.isdir(os.path.join("models", model)):
            print("Downloading the '{}' model...".format(model))
            gpt2.download_gpt2(model_name=model)
            print("Download finished.")
        print("Starting TF session...")
        sess = gpt2.start_tf_sess()
        print("TF session started.")
        print("Finetuning...")
        gpt2.finetune(
            sess,
            ns.trainingfile,
            model_name=model,
            run_name=ns.runname,
            print_every=100,
            sample_every=500,
            save_every=500,
            use_memory_saving_gradients=True,
            accumulate_gradients=1,
        )
    elif ns.action == "generate":
        prepend_story_start = False
        print("========== Generate a story ==========")
        if ns.mode in ("story", "chapter"):
            story_start = "\n" + TOKEN_STORY_START + "\n"
            description_s = "\n" + TOKEN_DESCRIPTION_START + "\n"
            description = input("Description of story: ")
            description_s += description + "\n" + TOKEN_DESCRIPTION_END + "\n"
            story_start += description_s + "\n" + TOKEN_CHAPTER_START + "\n"
            prepend_story_start = True
        elif ns.mode == "complete":
            story_start = input("Prompt: ")
        print("========== Generating... =========")
        print("Starting TF session...")
        sess = gpt2.start_tf_sess()
        print("TF session started.")
        print("Loading gpt-2...")
        gpt2.load_gpt2(sess, run_name=ns.runname)
        print("Loaded.")
        print("Generating: ", end="", flush=True)
        results = []
        for i in range(ns.numsamples):
            finished = False
            storyparts = []
            while not finished:
                if not storyparts:
                    # first generation
                    prefix = story_start
                elif prepend_story_start:
                    # include the story description, then continue from roughly
                    # the last 20 words of the previous part
                    prefix = description_s
                    prefix += " ".join(storyparts[-1].split(" ")[-21:-1])
                else:
                    # continue from roughly the last 20 words of the previous part
                    prefix = " ".join(storyparts[-1].split(" ")[-21:-1])
                multisamples = True
                gpt2results = gpt2.generate(
                    sess,
                    model_name=ns.model,
                    run_name=ns.runname,
                    prefix=prefix,
                    return_as_list=True,
                    # nsamples=ns.numsamples,
                    seed=int(time.time()),
                    temperature=0.8,
                    top_k=50,
                    top_p=0.9,
                    nsamples=(5 if multisamples else 1),
                )
                result = None
                for gpt2result in gpt2results:
                    gpt2result = gpt2result[len(prefix):]
                    if not is_looping(gpt2result):
                        result = gpt2result
                        break
                    if result is None:
                        # set default just to be sure
                        result = gpt2result

                if ns.debug:
                    print("=====")
                    print("#storyparts: ", len(storyparts))
                    if len(storyparts) > 0:
                        print("-----\nLast storypart: \n-----\n",
                              storyparts[-1])
                    print("-----\nResult: \n-----\n", result)
                    print("=====")

                if ns.mode == "story" or ns.mode == "chapter":
                    if is_looping(result):
                        print("L", end="", flush=True)
                        # remove last part to reduce chance of looping
                        storyparts = storyparts[:-1]
                        continue

                    # append result
                    storyparts.append(result)
                    if TOKEN_CHAPTER_END in result:
                        print("C", end="", flush=True)
                        if ns.mode == "chapter":
                            finished = True
                    elif TOKEN_STORY_END in result:
                        print("S", end="", flush=True)
                        finished = True
                    else:
                        print(".", end="", flush=True)
                elif ns.mode == "complete":
                    # set result
                    storyparts = [prefix + result]
                    finished = True
            # results.append(story[len(prefix):])
            results.append("".join(storyparts))
        print("\n", flush=True)
        for text in results:
            print("========= Result =========")
            print(text)
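The script relies on an is_looping() helper defined elsewhere in the project and not shown above. A minimal, hypothetical sketch of a repetition check in the same spirit (window size and repeat threshold are arbitrary):

def is_looping(text, window=8, max_repeats=3):
    # Flag the sample if any `window`-word sequence occurs `max_repeats` or more times.
    words = text.split()
    counts = {}
    for i in range(max(0, len(words) - window + 1)):
        chunk = " ".join(words[i:i + window])
        counts[chunk] = counts.get(chunk, 0) + 1
        if counts[chunk] >= max_repeats:
            return True
    return False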