def train_model(model_name="124M", data_path=r'C:\Users\pogop\OneDrive\Desktop\NKJ.txt', steps=600, run_name='run1'):
    gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/124M/
    sess = gpt2.start_tf_sess()  # a TF session must exist before finetuning
    gpt2.finetune(sess, data_path, model_name=model_name, steps=steps, run_name=run_name)  # steps is max number of training steps
def downloadGPT2Model(modelSize="simple"):
    if modelSize.lower() == "simple":
        model_name = "124M"
    elif modelSize.lower() == "medium":
        model_name = "355M"
    elif modelSize.lower() == "large":
        model_name = "774M"
    else:
        model_name = "1558M"
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
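# A minimal usage sketch for the helper above (hypothetical call; assumes
# `import os` and `import gpt_2_simple as gpt2` as in the other snippets):
downloadGPT2Model("medium")  # fetches the 355M checkpoint into ./models/355M/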
def main(steps=200):
    model_name = "774M"
    if not os.path.isdir(os.path.join('models', model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/774M/
    file_name = "./datasets/gpt2_dataset.txt"
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess, file_name, model_name=model_name, steps=steps)  # steps is max number of training steps
def fine_tune(inFile):
    model_name = "117M"
    gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/117M/
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess, inFile, model_name=model_name, steps=1000, save_every=100)  # steps is max number of training steps
    gpt2.generate(sess)
def train_GPT():
    model_name = "124M"
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/124M/
    file_name = "corpus.txt"
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess, file_name, model_name=model_name, steps=1)  # steps is max number of training steps
    return sess
def finetune():
    model_name = "124M"
    gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/124M/
    session = gpt2.start_tf_sess()
    gpt2.finetune(session, 'clnn.txt', model_name=model_name,
                  steps=arguments.training_iterations)  # steps is max number of training steps; `arguments` is parsed elsewhere in the script
    return session
def main():
    ## models:
    # model_name = "124M"
    # model_name = "355M"
    # model_name = "774M"
    # model_name = "1558M"
    model_name = "355M"
    file_name = "champ.txt"

    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under ./models/355M/

    if not os.path.isfile(file_name):
        print("Please provide a filename.")
        exit()

    # GPU config
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.77
    config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF
    sess = tf.compat.v1.Session(config=config)
    # sess = gpt2.start_tf_sess()  # old, for CPU

    print('\n+++ Train model (y)? +++')
    train = input()
    if train in ("", "y", "yes"):
        print('---> training model...\n')
        gpt2.finetune(sess, file_name, model_name=model_name, steps=100)  # steps is max number of training steps - default: 1000
    else:
        print('---> not training model...\n')

    # gpt2.generate(sess)  # generate in session to console

    # generate text to file
    gen_file = 'gpt2_gentext_{:%Y%m%d_%H%M%S}.txt'.format(
        datetime.datetime.now(datetime.timezone.utc))
    gpt2.generate_to_file(sess,
                          destination_path=gen_file,
                          length=10000,
                          temperature=0.7,
                          nsamples=1,
                          batch_size=1)
async def gpt2_download_model(self, ctx, *, arg=None):
    print('Command gpt2_download_model triggered')
    if arg:
        if arg in VALID_DEFAULT_MODELS:
            gpt2.download_gpt2(model_name=arg)
            await ctx.send("Model downloaded")
        else:
            await ctx.send("ERROR: Invalid argument")
    else:
        # If no model name is provided, download the one in the config
        model_name = self.config['model_name']
        if model_name in VALID_DEFAULT_MODELS:
            gpt2.download_gpt2(model_name=model_name)
        else:
            await ctx.send("ERROR: Invalid model_name in config")
def generate_models(self, model_name: str, data_path: str) -> None:
    """
    Fine-tune a new model given a base GPT-2 model name and a data source
    path, where the data source is the text corpus to train on.
    """
    print(os.path.join("models", model_name))
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
    gpt2.finetune(self.sess,
                  data_path,
                  model_name=model_name,
                  batch_size=1,
                  sample_every=100,
                  sample_length=100,
                  save_every=100)
    gpt2.generate(self.sess)
def gpt_2():
    model_name = "117M"
    gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/117M/
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess, 'titles.txt', model_name=model_name,
                  steps=1000, save_every=200, sample_every=25)  # steps is max number of training steps
    gpt2.generate(sess)
def generate_tweet(prefix):
    if not path.exists(path.join("models", MODEL)):
        gpt2.download_gpt2(model_name=MODEL)
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess, model_name=MODEL)
    output = gpt2.generate(
        sess,
        prefix=f"{prefix} ",  # seed generation with the supplied prefix
        top_k=40,
        return_as_list=True,
        length=240,
        truncate="<|endoftext|>",
        model_name=MODEL,
    )[0]
    return output
def fine_tune(args, model_name='124M'):
    print(f'Run fine-tuning for run {args.run_name} using GPT-2 model {model_name}...')
    if not os.path.isdir(os.path.join("models", model_name)):
        log.info(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  args.data_path,
                  model_name=model_name,
                  run_name=args.run_name,
                  steps=-1,
                  sample_every=10,
                  save_every=10)
def generate_models(self, name, data):
    model_name = name
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
    data_path = data
    gpt2.finetune(
        self.sess,
        data_path,
        model_name=model_name,
        batch_size=1,
        sample_every=100,
        sample_length=100,
    )
    gpt2.generate(self.sess)
def train_model(channel: str):
    file_name = 'data/%s.txt' % channel
    # download_gpt2 saves under models/<model_name>, so check that path;
    # `model` and `steps` are module-level settings
    if not os.path.isdir(os.path.join('models', model)):
        gpt2.download_gpt2(model_name=model)
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  dataset=file_name,
                  model_name=model,
                  steps=steps,
                  restore_from='latest',
                  run_name='run1',
                  print_every=100,
                  sample_every=2000,
                  save_every=500)
def train_gpt2_model(fileName):
    model_name = "124M"
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/124M/
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  dataset=fileName,
                  model_name=model_name,
                  steps=1000,
                  restore_from='fresh',
                  run_name='run' + fileName,
                  print_every=50,
                  sample_every=200,
                  save_every=500)
def train(
    tf_session: TfSession,
    file_path_input: str = DEFAULT_FILE_PATH_INPUT,
    training_steps: int = DEFAULT_TRAINING_STEPS,
) -> TfSession:
    if not os.path.isdir(os.path.join('models', MODEL_NAME)):
        print(f'Downloading {MODEL_NAME} model...')
        gpt2.download_gpt2(model_name=MODEL_NAME)
    gpt2.finetune(
        tf_session,
        file_path_input,
        model_name=MODEL_NAME,
        steps=training_steps,
        overwrite=True,
    )
    return tf_session
def create_data(data_path='data/BonIver.txt', model_name='124M', steps=70,
                restore_from='fresh', run_name='run1', print_every=5,
                sample_every=10, save_every=50):
    sess = gpt2.start_tf_sess()
    gpt2.download_gpt2(model_name=model_name)  # download the base model the caller asked for
    gpt2.finetune(sess,
                  dataset=data_path,
                  model_name=model_name,
                  steps=steps,
                  restore_from=restore_from,
                  run_name=run_name,
                  print_every=print_every,
                  sample_every=sample_every,
                  save_every=save_every)
def gpt2_finetuning(model_name, data_file, step):
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/124M/
    sess = gpt2.start_tf_sess()
    file_name = data_file
    '''
    For fine-tuning use this command. The full parameter list of the finetune
    function is:
        sess, dataset, steps=-1, model_name='124M', model_dir='models',
        combine=50000, batch_size=1, learning_rate=0.0001,
        accumulate_gradients=5, restore_from='latest', run_name='run1',
        checkpoint_dir='checkpoint', sample_every=100, sample_length=1023,
        sample_num=1, multi_gpu=False, save_every=1000, print_every=1,
        max_checkpoints=1, use_memory_saving_gradients=False,
        only_train_transformer_layers=False, optimizer='adam', overwrite=False
    '''
    gpt2.finetune(sess,
                  file_name,
                  model_name=model_name,
                  run_name='run' + str(step),
                  batch_size=4,
                  checkpoint_dir='checkpoint',
                  steps=step)  # steps is max number of training steps
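# Per the parameter list above, restore_from defaults to 'latest', so calling
# finetune again with the same run_name resumes from the last saved checkpoint
# rather than restarting from the base weights. A minimal sketch of that
# resume pattern, assuming the same setup as above; 'data.txt' and 'run1' are
# placeholder names:
sess = gpt2.start_tf_sess()
gpt2.finetune(sess, 'data.txt', model_name='124M',
              run_name='run1', restore_from='latest', steps=100)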
def main():
    parser = argparse.ArgumentParser(description='GPT-2 training utility.')
    parser.add_argument('--name', '-n', help='Name for the trained model')
    parser.add_argument('--corpus', '-c', help='Corpus to fine-tune the model')
    parser.add_argument('--model', '-m', help='Pretrained model name for fine-tuning', default='124M')
    args = parser.parse_args()

    import gpt_2_simple as gpt2

    # Download the pretrained model if it does not exist in models/
    if not os.path.exists(os.path.join('models', args.model)):
        gpt2.download_gpt2(model_name=args.model)

    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  dataset=args.corpus,
                  model_name=args.model,
                  steps=1000,
                  restore_from='fresh',
                  run_name=args.name,
                  print_every=10,
                  sample_every=100,
                  save_every=50)
def finetune_gpt2(iterations=1000, text_path='scraped_text/comments.txt'):
    """
    Fine-tune GPT-2.

    text_path  - path of the text file used for fine-tuning
    iterations - number of training steps
    """
    assert os.path.exists(text_path), "Text file {} doesn't exist!".format(text_path)
    if not gpt2.is_gpt2_downloaded(model_name='345M'):
        print('Warning: Downloading large file')
        gpt2.download_gpt2(model_name='345M')
    print('\nFinetuning\nWarning: Very slow without GPU')
    print('-' * 30)
    sess = gpt2.start_tf_sess()  # a TF session must exist before finetuning
    gpt2.finetune(sess,
                  text_path,
                  model_name='345M',
                  overwrite=True,
                  steps=iterations)
import pandas as pd
import time
import os
from random import randint, randrange, choice

import gpt_2_simple as gpt2
from tweepy import OAuthHandler, API  # needed for the Twitter login below

import twitter_credentials_trump as tc

auth = OAuthHandler(tc.CONSUMER_KEY, tc.CONSUMER_SECRET)
auth.set_access_token(tc.ACCESS_TOKEN, tc.ACCESS_TOKEN_SECRET)
api = API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
print('twitter logged in...')

model_name = "355M"
if not os.path.isdir(os.path.join("models", model_name)):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)
print("model loaded")


def generate_trending_tweet():
    topics = ["Biden", "Trump"]
    topic = choice(topics)  # this is just for testing repeat topics - remove before deployment
    print("generating topical tweets on subject: " + topic)
    # update the text file with current tweets
    file_name = '../data/' + topic + '.txt'
    topical_tweets = get_topic_tweets(topic, 5000)
    t_tweet_string = " || ".join(topical_tweets)
    with open(file_name, 'w') as f:
""" !nvidia-smi """## Downloading GPT-2 * `124M` (default): the "small" model, 500MB on disk. * `355M`: the "medium" model, 1.5GB on disk. * `774M`: doesnt work * `1558M`: really doesnt work Larger models have more knowledge, but take longer to finetune and longer to generate text. """ gpt2.download_gpt2(model_name="355M") """## Mounting Google Drive VM drive mounting to get a text file loaded for i/o """ gpt2.mount_gdrive() """## Uploading txt file Upload **any smaller text file** (<10 MB) and update the file name in the cell below, then run the cell. """ file_name = "shakespeare.txt" if not os.path.isfile(file_name): url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
def get_base_model(model_name="355M"):
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
def download_model(args):
    if not os.path.isdir(os.path.join("models", args.model_name)):
        print(f"Downloading {args.model_name} model...")
        gpt2.download_gpt2(model_name=args.model_name)
    else:
        print(f"Model {args.model_name} already downloaded")
@author: tbarton
"""
import sys
import subprocess
import os
# os.chdir('machine_learning_scripts/speed_test')

import tensorflow as tf

if tf.__version__ == '2.0.0':
    print('wrong tf version')
    # -y keeps the conda install non-interactive so check_call does not hang
    subprocess.check_call(['conda', 'install', '-y', 'tensorflow==1.15.0'])

import gpt_2_simple as gpt2

gpt2.download_gpt2(model_name='774M')
# gpt2.download_gpt2()

sess = gpt2.start_tf_sess()
# sess = gpt2.load_gpt2(sess, 'second_run', multi_gpu=True)
gpt2.finetune(sess,
              dataset='big_chess_set.txt',
              model_name='774M',  # match the checkpoint downloaded above
              run_name='new_run_large',
              print_every=1,
              multi_gpu=True,
              save_every=100,
              combine=100,
              steps=10000)  # steps is max number of training steps

# print('readying to generate!')
# single_text = gpt2.generate(sess, prefix='e4, e5 ', return_as_list=True, run_name='large_run')[0]
#!/usr/bin/env python
import os

import gpt_2_simple as gpt2

STEPS = 500
MODEL_NAME = '355M'
FILE_PATH = 'tweets'

if not os.path.isdir(os.path.join('models', MODEL_NAME)):
    print(f'Downloading {MODEL_NAME} model...')
    gpt2.download_gpt2(model_name=MODEL_NAME)

gpt2.encode_dataset(f'{FILE_PATH}.csv')  # pre-encodes the corpus; the .npz output is consumed below

sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              'text_encoded.npz',
              model_name=MODEL_NAME,
              steps=STEPS,
              restore_from='fresh',
              run_name='run1',
              print_every=10,
              sample_every=100)
!pip install -q gpt-2-simple
import gpt_2_simple as gpt2
from google.colab import files

gpt2.download_gpt2(model_name="124M")

file_name = "TaylorLyrics.txt"

sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=1000)
gpt2.generate(sess)
import gpt_2_simple as gpt2
import os
import requests

model_name = "124M"
if not os.path.isdir(os.path.join("models", model_name)):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)  # model is saved into current directory under /models/124M/

sess = gpt2.start_tf_sess()
gpt2.finetune(sess, "./train.txt", model_name=model_name, steps=1000)
gpt2.generate(sess)
def check_model(self):
    if not os.path.isdir(os.path.join("models", self.base_model)):
        gpt2.download_gpt2(model_name=self.base_model)
def main(
    # Required
    source: ('Source file to load data from', 'positional'),
    # Flags
    autodownload: ('Automatically download model if needed?', 'flag'),
    finetune: ('Run a fine-tuning pass on the model?', 'flag'),
    resume: ('Resume training from a prior run?', 'flag'),
    skip_plagiarism: ('Skip checking results for plagiarism?', 'flag'),
    # Options
    # There are multiple GPT-2 models that have been released. 124M is the
    # smallest, and it gives more than adequate results.
    model_name: ('Name of the GPT-2 model to use', 'option') = '124M',
    run_name: ('Name to give this run - used for resuming prior runs', 'option') = 'run1',
    steps: ('Number of steps of training to carry out', 'option', None, int) = 100,
    nsamples: ('Number of generation passes to run', 'option', None, int) = 1,
    save_every: ('Save a checkpoint every this many steps', 'option', None, int) = 200,
    sample_every: ('Sample the output during training every this many steps', 'option', None, int) = 100,
    restore_from: ('Checkpoint to resume from', 'option') = 'latest',
    output_file: ('Name of the csv file to write', 'option') = None,
    delimiter: ('Character that delimits columns in source', 'option') = ',',
    quote_column: ('Label for the column with quotes', 'option') = 'quotes',
    attribution_column: ('Label for the column with attributions', 'option') = 'attrib_name',
    source_attribution: ('Use only quotes from this source', 'option') = None,
):
    # If we want, we could make this configurable, but there's some testing
    # involved to make sure we do so consistently everywhere.
    model_directory = 'models'
    temporary_input_file = 'temp_input.csv'  # The file containing cleaned data from the quotes

    print(f"Using model: {model_name}")

    # Do we have the base model? If not, we need to get it.
    if not os.path.isdir(os.path.join(model_directory, model_name)):
        if autodownload:
            print(f"Downloading {model_name} model...")
            gpt2.download_gpt2(model_name=model_name, model_dir=model_directory)
        else:
            print(f"Couldn't find {model_name} in {model_directory}. Turn on autodownload to fetch it.")
            return

    attributions = Counter()
    # Track all individual quotes so that we can check whether we've regenerated them.
    source_quotes = []

    # TODO: Break this out into a parse phase and a write phase
    with open(source, newline='') as quote_file:
        with open(temporary_input_file, 'w', newline='') as out_file:
            quote_reader = csv.DictReader(quote_file, delimiter=delimiter)
            quote_writer = csv.DictWriter(out_file, fieldnames=[quote_column], extrasaction='ignore')
            quote_writer.writeheader()  # Loader assumes there will be a header and skips it.
            for row in quote_reader:
                if source_attribution is None or row[attribution_column] == source_attribution:
                    quote_writer.writerow(row)
                    source_quotes.append(row[quote_column])
                    attributions[row[attribution_column]] += 1

    print("Loaded {} quotes attributed to {} sources.".format(len(source_quotes), len(attributions)))
    print("Top 10 sources:")
    print(attributions.most_common(10))
    if source_attribution is not None:
        print("{} quotes by {}".format(attributions[source_attribution], source_attribution))

    sess = gpt2.start_tf_sess()
    if resume:
        print(f"Loading run {run_name}")  # If the model name is set, the run name will be ignored.
        gpt2.load_gpt2(sess, run_name=run_name)

    if finetune:
        print('Fine-tuning the model from training data')
        gpt2.finetune(
            sess,
            temporary_input_file,
            model_name=model_name,
            steps=steps,
            run_name=run_name,
            save_every=save_every,
            sample_every=sample_every,
            restore_from=restore_from,
            max_checkpoints=100,  # How many checkpoints to keep for each run
        )

    print('Generating quotes')
    quotes_with_delimiters = gpt2.generate(
        sess,
        run_name=run_name,
        nsamples=nsamples,
        return_as_list=True
    )

    print('Parsing quotes')
    generated_quotes = []
    for sample in quotes_with_delimiters:
        print("SAMPLE: [" + sample + "]")
        generated_quotes.extend(find_quotes(sample))

    results = []
    if skip_plagiarism:
        for quote in generated_quotes:
            print("QUOTE: [" + quote + "]")
            results.append({"quote": quote})
    else:
        print('Checking for plagiarism')
        novel_quote_count = 0
        for quote in generated_quotes:
            closest_match = process.extractOne(quote, source_quotes)
            print("QUOTE: [" + quote + "]")
            if closest_match[1] >= 90:
                # This is a bit too close.
                print("MATCH: [" + closest_match[0] + "]")
            else:
                novel_quote_count += 1
            result = {
                "quote": quote,
                "best_match": closest_match[0],
                "match_score": closest_match[1],
            }
            results.append(result)
        print('Novel quotes generated: {} of {} ({:.2%})'.format(
            novel_quote_count, len(results), novel_quote_count / len(results)))

    if output_file is None:
        output_file = run_name + '_' + source

    if skip_plagiarism:
        fieldnames = ['quote']
    else:
        fieldnames = ['quote', 'best_match', 'match_score']

    with open(output_file, 'w', newline='') as out_file:
        quote_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        quote_writer.writeheader()
        for row in results:
            quote_writer.writerow(row)
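# The plagiarism gate above hinges on fuzzy string matching. A standalone
# sketch of that check, assuming the fuzzywuzzy package (the sample quotes are
# made up): process.extractOne returns a (best_match, score) tuple with score
# in 0..100, and the script treats score >= 90 as a near-duplicate.
from fuzzywuzzy import process

corpus = ["Brevity is the soul of wit.", "All the world's a stage."]
match, score = process.extractOne("Brevity is the soul of wit", corpus)
print(match, score)  # a high score means the quote is too close to the corpus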