import gpt_2_simple as gpt2


def trainModel(dictionary, model_size="124M"):
    # Concatenate the "texto" field of every article into one training corpus.
    text = ""
    for eachItem in dictionary.items():
        # eachItem is a (key, value) pair; skip the integer key and keep
        # the article dict.
        for eachItemArticle in eachItem:
            if isinstance(eachItemArticle, int):
                continue
            text += eachItemArticle["texto"] + " "
    with open("data.txt", "w") as f:
        f.write(text)
    # Pass model_size through so the matching encoder is used (the original
    # accepted this parameter but never used it).
    gpt2.encode_dataset("data.txt", model_name=model_size)
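# A minimal usage sketch (illustrative assumption, not from the original
# source): trainModel expects a dict whose values are article dicts with a
# "texto" field; the integer keys are skipped by the isinstance check above.
articles = {
    1: {"texto": "Body of the first article."},
    2: {"texto": "Body of the second article."},
}
trainModel(articles)  # writes data.txt, then encodes it to text_encoded.npz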
def encode_and_compress(inFile):
    # Tokenize the text corpus into the default compressed file,
    # text_encoded.npz, which finetune() loads much faster than raw text.
    gpt2.encode_dataset(inFile)
#!/usr/bin/env python
import os

import gpt_2_simple as gpt2

STEPS = 500
MODEL_NAME = '355M'
FILE_PATH = 'tweets'

# Download the base model once; later runs reuse the local copy.
if not os.path.isdir(os.path.join('models', MODEL_NAME)):
    print(f'Downloading {MODEL_NAME} model...')
    gpt2.download_gpt2(model_name=MODEL_NAME)

# Pre-encode the corpus so finetune() loads tokens instead of raw text.
# model_name must match the downloaded model; otherwise encode_dataset()
# looks for the default 124M encoder, which may not be present.
gpt2.encode_dataset(f'{FILE_PATH}.csv', model_name=MODEL_NAME)

sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              'text_encoded.npz',
              model_name=MODEL_NAME,
              steps=STEPS,
              restore_from='fresh',
              run_name='run1',
              print_every=10,
              sample_every=100)
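# A follow-up sketch (an assumption, not part of the original script): once
# finetune() has written checkpoints under checkpoint/run1, the tuned weights
# can be reloaded and sampled from in a fresh process (or after calling
# gpt2.reset_session(sess) in the same one).
sess = gpt2.start_tf_sess()
gpt2.load_gpt2(sess, run_name='run1')
gpt2.generate(sess, run_name='run1', length=140, temperature=0.8, nsamples=3)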
import os

import gpt_2_simple as gpt2
import pandas as pd
from tqdm import tqdm

input_data_file = './data/2-quotes_filtered.csv'
output_folder = "output/"

split_quotes = pd.read_csv(input_data_file)
print(split_quotes.head())

## These were the top 20 most common topics in the dataset
topics_to_keep = ['life', 'love', 'inspirational', 'humor', 'death', 'art',
                  'education', 'books', 'change', 'time', 'beauty', 'god',
                  'happiness', 'children', 'work', 'faith', 'funny', 'good',
                  'family', 'friendship']
quotes_to_keep = split_quotes[split_quotes['topic'].isin(topics_to_keep)]

## Create the GPT-ready dataset (iterate the filtered frame, not the raw one)
file_name = os.path.join(output_folder, "processed_quotes.txt")
with open(file_name, mode='w') as open_file:
    for index, row in tqdm(quotes_to_keep.iterrows(), total=len(quotes_to_keep)):
        open_file.write("_TOPIC_ {} _QUOTE_ {} _AUTHOR_ {} _END_\n".format(
            row['topic'], row['quote'], row['author']))

## Encode it to make loading faster during training
gpt2.encode_dataset(file_name,
                    out_path=os.path.join(output_folder, 'text_encoded.npz'))
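## A hedged sketch of the next step (not in the original notebook): finetune
## on the encoded quotes, then steer generation with the _TOPIC_ delimiter.
## The run name, step count, and model size below are illustrative
## assumptions; the 124M model must already be downloaded via
## gpt2.download_gpt2().
sess = gpt2.start_tf_sess()
gpt2.finetune(sess, os.path.join(output_folder, 'text_encoded.npz'),
              model_name='124M', steps=1000, run_name='quotes')
gpt2.generate(sess, run_name='quotes',
              prefix='_TOPIC_ love _QUOTE_',  # condition on a topic token
              truncate='_END_')               # stop at the end-of-quote marker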
import sys
from os import path

import gpt_2_simple as gpt2

if len(sys.argv) < 3:
    print('You must enter the corpus file and model name as parameters, '
          'e.g.: bot.py comment_data.txt 355M')
    sys.exit(1)

file_name = sys.argv[1]
model_name = sys.argv[2]

# Check that the corpus exists before starting a TF session
if not path.isfile(file_name):
    print("File does not exist. Please use a text corpus for training.")
    sys.exit(1)

sess = gpt2.start_tf_sess()

# Encode data if not already encoded. splitext() handles extensions of any
# length; the original sliced off the last four characters, which mangled
# names without a three-letter extension.
if not file_name.endswith(".npz"):
    old_file_name = file_name
    file_name = path.splitext(file_name)[0] + ".npz"
    print("Encoding data...")
    try:
        gpt2.encode_dataset(old_file_name, out_path=file_name,
                            model_name=model_name)
    except Exception:
        print("Failed to encode data. Please check that your file is a text "
              "corpus that can be encoded.")
        sys.exit(1)

gpt2.finetune(sess, file_name, model_name=model_name, multi_gpu=False,
              overwrite=True)
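# Example invocations (illustrative; assumes the script above is saved as
# bot.py and the chosen model has already been downloaded with
# gpt2.download_gpt2(model_name='355M')):
#
#   python bot.py comment_data.txt 355M   # encodes to comment_data.npz, then finetunes
#   python bot.py comment_data.npz 355M   # reuses an already-encoded corpus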
def main(): """ The main function. """ parser = argparse.ArgumentParser( description="Finetune a GPT-2 model using ff2zim") parser.add_argument("-d", "--debug", action="store_true", help="show debug information") subparsers = parser.add_subparsers(dest="action", help="action to perform", required=True) # parser for generating trainingfile tfparser = subparsers.add_parser( "generate-trainingfile", help="generate the trainingfile from a ff2zim project") tfparser.add_argument("project", help="path to ff2zim project") tfparser.add_argument("trainingfile", help="path to write trainingfile to") tfparser.add_argument( "--add-epub", action="store", nargs="*", help="add an epub or a directory of epubs to the trainingfile", metavar="PATH", dest="epubpaths") # parser for encoding the trainingfile eparser = subparsers.add_parser( "encode-trainingfile", help="encode a trainingfile for better performance") eparser.add_argument("trainingfile", help="path to trainingfile to encode") eparser.add_argument("outfile", help="path to write to") eparser.add_argument("model", help="model to encode for") # parser for finetuning finetuneparser = subparsers.add_parser( "finetune", help="finetune a gpt-2 model using a trainingfile") finetuneparser.add_argument("trainingfile", help="path to trainingfile") finetuneparser.add_argument("--model", action="store", default="124M", help="model to use") finetuneparser.add_argument("--run-name", action="store", dest="runname", default="run1", help="run name for finetuned model.") # parser for generating genparser = subparsers.add_parser( "generate", help="generate a sample with an interactive prompt") genparser.add_argument("--model", action="store", default="124M", help="model to use") genparser.add_argument("--run-name", action="store", dest="runname", default="run1", help="run name for finetuned model.") genparser.add_argument("-n", "--numsamples", action="store", type=int, help="number of samples to generate", default=1) genparser.add_argument("-m", "--mode", action="store", choices=("story", "chapter", "complete"), default="story") ns = parser.parse_args() if ns.action == "generate-trainingfile": print("Generating trainingfile...") trainingfile = ns.trainingfile finetuner = GPT2Finetuner(ns.project, ns.epubpaths) num_stories, num_epubs = finetuner.create_training_file(trainingfile) print("Trainingfile successfully created.") print("Included: {} fanfics and {} epubs.".format( num_stories, num_epubs)) return elif ns.action == "encode-trainingfile": print("Encoding trainingfile...") gpt2.encode_dataset(ns.trainingfile, out_path=ns.outfile, model_name=ns.model) print("Done.") return elif ns.action == "finetune": model = ns.model if not os.path.isdir(os.path.join("models", model)): print("Downloading the '{}' model...".format(model)) gpt2.download_gpt2(model_name=model) print("Download finished.") print("Starting TF session...") sess = gpt2.start_tf_sess() print("TF session started.") print("Finetuning...") gpt2.finetune( sess, ns.trainingfile, model_name=model, run_name=ns.runname, print_every=100, sample_every=500, save_every=500, use_memory_saving_gradients=True, accumulate_gradients=1, ) elif ns.action == "generate": prepend_story_start = False print("========== Generate a story ==========") if ns.mode in ("story", "chapter"): story_start = "\n" + TOKEN_STORY_START + "\n" description_s = "\n" + TOKEN_DESCRIPTION_START + "\n" description = input("Description of story: ") description_s += description + "\n" + TOKEN_DESCRIPTION_END + "\n" story_start += description_s + "\n" + 
TOKEN_CHAPTER_START + "\n" prepend_story_start = True elif ns.mode == "complete": story_start = input("Prompt: ") print("========== Generating... =========") print("Starting TF session...") sess = gpt2.start_tf_sess() print("TF session started.") print("Loading gpt-2...") gpt2.load_gpt2(sess) print("Loaded.") print("Generating: ", end="", flush=True) results = [] for i in range(ns.numsamples): finished = False storyparts = [] while not finished: if not storyparts: # first generation prefix = story_start elif prepend_story_start: # also include story start prefix = description_s prefix += " ".join(storyparts[-1].split(" ")[-21:-1]) else: prefix = " ".join(storyparts[-1].split(" ")[-21:-1]) multisamples = True gpt2results = gpt2.generate( sess, model_name=ns.model, run_name=ns.runname, prefix=prefix, return_as_list=True, # nsamples=ns.numsamples, seed=int(time.time()), temperature=0.8, top_k=50, top_p=0.9, nsamples=(5 if multisamples else 1), ) result = None for gpt2result in gpt2results: gpt2result = gpt2result[len(prefix):] if not is_looping(gpt2result): result = gpt2result break if result is None: # set default just to be sure result = gpt2result if ns.debug: print("=====") print("#storyparts: ", len(storyparts)) if len(storyparts) > 0: print("-----\nLast storypart: \n-----\n", storyparts[-1]) print("-----\nResult: \n-----\n", result) print("=====") if ns.mode == "story" or ns.mode == "chapter": if is_looping(result): print("L", end="", flush=True) # remove last part to reduce chance of looping storyparts = storyparts[:-1] continue # append result storyparts.append(result) if TOKEN_CHAPTER_END in result: print("C", end="", flush=True) if ns.mode == "chapter": finished = True elif TOKEN_STORY_END in result: print("S", end="", flush=True) finished = True else: print(".", end="", flush=True) elif ns.mode == "complete": # set result storyparts = [prefix + result] finished = True # results.append(story[len(prefix):]) results.append("".join(storyparts)) print("\n", flush=True) for text in results: print("========= Result =========") print(text)