Example #1
def train_model(model_name="124M",
                data_path='C:\\Users\\pogop\\OneDrive\\Desktop\\NKJ.txt',
                steps=600,
                run_name='run1'):
    gpt2.download_gpt2(
        model_name=model_name
    )  # model is saved into current directory under /models/124M/
    sess = gpt2.start_tf_sess()  # start a TF session before fine-tuning
    gpt2.finetune(sess,
                  data_path,
                  model_name=model_name,
                  steps=steps,
                  run_name=run_name)  # steps is max number of training steps
    return
Example #2
def downloadGPT2Model(modelSize="simple"):
    if modelSize.lower() == "simple":
        model_name = "124M"
    elif modelSize.lower() == "medium":
        model_name = "355M"
    elif modelSize.lower() == "large":
        model_name = "774M"
    else:
        model_name = "1558M"

    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
Example #3
def main(steps=200):
    model_name = "774M"
    if not os.path.isdir(os.path.join('models', model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(
            model_name=model_name
        )  # model is saved into current directory under /models/774M/

    file_name = "./datasets/gpt2_dataset.txt"

    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess, file_name, model_name=model_name,
                  steps=steps)  # steps is max number of training steps
Example #4
def fine_tune(inFile):
    model_name = "117M"
    gpt2.download_gpt2(
        model_name=model_name
    )  # model is saved into current directory under /models/117M/

    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  inFile,
                  model_name=model_name,
                  steps=1000,
                  save_every=100)  # steps is max number of training steps

    gpt2.generate(sess)
Example #5
def train_GPT():
    model_name = "124M"
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(
            model_name=model_name
        )  # model is saved into current directory under /models/124M/

    file_name = "corpus.txt"
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess, file_name, model_name=model_name,
                  steps=1)  # steps is max number of training steps

    return sess
Example #6
def finetune():
    model_name = "124M"
    gpt2.download_gpt2(
        model_name=model_name
    )  # model is saved into current directory under /models/124M/

    session = gpt2.start_tf_sess()
    gpt2.finetune(session,
                  'clnn.txt',
                  model_name=model_name,
                  steps=arguments.training_iterations  # `arguments` is assumed to be a module-level argparse namespace
                  )  # steps is max number of training steps

    return session
Example #7
def main():

    ##models:
    #model_name = "124M"
    #model_name = "355M"
    #model_name = "774M"
    #model_name = "1558M"

    model_name = "355M"
    file_name = "champ.txt"

    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(
            model_name=model_name
        )  # model is saved into current directory under ./models/355M/

    if not os.path.isfile(file_name):
        print("please provide a filename..")
        exit()

    # GPU config (requires `import tensorflow as tf` and
    # `from tensorflow.core.protobuf import rewriter_config_pb2`)
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.77
    config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF
    sess = tf.compat.v1.Session(config=config)

    #sess = gpt2.start_tf_sess() #old for CPU

    print('\n+++ Train model (y)? +++')
    train = input()
    if train == "" or train == "y" or train == 'yes':
        print('---> training model...\n')
        gpt2.finetune(
            sess, file_name, model_name=model_name,
            steps=100)  # steps is max number of training steps - default: 1000
    else:
        print('---> not training model...\n')
        # without training, a previously fine-tuned checkpoint must be loaded for generation below
        gpt2.load_gpt2(sess, run_name='run1')
    # gpt2.generate(sess)  # generate inside the session instead of writing to a file

    ## generate text to file
    gen_file = 'gpt2_gentext_{:%Y%m%d_%H%M%S}.txt'.format(
        datetime.datetime.now(datetime.timezone.utc))
    gpt2.generate_to_file(sess,
                          destination_path=gen_file,
                          length=10000,
                          temperature=0.7,
                          nsamples=1,
                          batch_size=1)
Example #8
    async def gpt2_download_model(self, ctx, *, arg=None):
        print('Command gpt2_download_model triggered')

        if arg:
            if arg in VALID_DEFAULT_MODELS:
                gpt2.download_gpt2(model_name=arg)
                await ctx.send("Model downloaded")
            else:
                await ctx.send("ERROR: Invalid argument")
        else:  # If no model name is provided, download the one in the config
            model_name = self.config['model_name']
            if model_name in VALID_DEFAULT_MODELS:
                gpt2.download_gpt2(model_name=model_name)
            else:
                await ctx.send("ERROR: Invalid model_name in config")
Example #9
    def generate_models(self, model_name: str, data_path: str) -> None:
        """ Generate new models given a model name and data source path. Data source path being a pre-existing gpt2 learning model """
        print(os.path.join("models", model_name))
        if not os.path.isdir(os.path.join("models", model_name)):
            print(f"Downloading {model_name} model...")
            gpt2.download_gpt2(model_name=model_name)

        gpt2.finetune(self.sess,
                      data_path,
                      model_name=model_name,
                      batch_size=1,
                      sample_every=100,
                      sample_length=100,
                      save_every=100)
        gpt2.generate(self.sess)
Example #10
def gpt_2():
    model_name = "117M"
    gpt2.download_gpt2(
        model_name=model_name
    )  # model is saved into current directory under /models/117M/

    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  'titles.txt',
                  model_name=model_name,
                  steps=1000,
                  save_every=200,
                  sample_every=25)  # steps is max number of training steps

    gpt2.generate(sess)
Example #11
def generate_tweet(prefix):
    if not path.exists(path.join("models", MODEL)):
        gpt2.download_gpt2(model_name=MODEL)
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess, model_name=MODEL)
    output = gpt2.generate(
        sess,
        prefix=f"{top_trend} ",
        top_k=40,
        return_as_list=True,
        length=240,
        truncate="<|endoftext|>",
        model_name=MODEL,
    )[0]
    return output
Example #12
def fine_tune(args, model_name='124M'):
    print(
        f'Run fine-tuning for run {args.run_name} using GPT2 model {model_name}...'
    )
    if not os.path.isdir(os.path.join("models", model_name)):
        log.info(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
                  args.data_path,
                  model_name=model_name,
                  run_name=args.run_name,
                  steps=-1,  # -1 trains until the run is interrupted manually
                  sample_every=10,
                  save_every=10)
Example #13
    def generate_models(self, name, data):
        model_name = name
        if not os.path.isdir(os.path.join("models", model_name)):
            print(f"Downloading {model_name} model...")
            gpt2.download_gpt2(model_name=model_name)

        data_path = data
        gpt2.finetune(
            self.sess,
            data_path,
            model_name=model_name,
            batch_size=1,
            sample_every=100,
            sample_length=100,
        )
        gpt2.generate(self.sess)
Example #14
def train_model(channel: str):
    file_name = 'data/%s.txt' % channel

    if not os.path.isdir(os.path.join('models', model)):
        gpt2.download_gpt2(model_name=model)  # `model` and `steps` are assumed to be module-level settings

    sess = gpt2.start_tf_sess()

    gpt2.finetune(sess,
                  dataset=file_name,
                  model_name=model,
                  steps=steps,
                  restore_from='latest',
                  run_name='run1',
                  print_every=100,
                  sample_every=2000,
                  save_every=500)
Example #15
def train_gpt2_model(fileName):
  model_name = "124M"
  if not os.path.isdir(os.path.join("models", model_name)):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)   # model is saved into current directory under /models/124M/
    

  sess = gpt2.start_tf_sess()
  gpt2.finetune(sess,
              dataset=fileName,
              model_name=model_name,
              steps=1000,
              restore_from='fresh',
              run_name='run'+fileName,
              print_every=50,
              sample_every=200,
              save_every=500)
Example #16
def train(
    tf_session: TfSession,
    file_path_input: str = DEFAULT_FILE_PATH_INPUT,
    training_steps: int = DEFAULT_TRAINING_STEPS,
) -> TfSession:
    if not os.path.isdir(os.path.join('models', MODEL_NAME)):
        print(f'Downloading { MODEL_NAME } model...')
        gpt2.download_gpt2(model_name=MODEL_NAME)

    gpt2.finetune(
        tf_session,
        file_path_input,
        model_name=MODEL_NAME,
        steps=training_steps,
        overwrite=True,
    )

    return tf_session
Example #17
def create_data(data_path='data/BonIver.txt',
                model_name='124M',
                steps=70,
                restore_from='fresh',
                run_name='run1',
                print_every=5,
                sample_every=10,
                save_every=50):

    sess = gpt2.start_tf_sess()
    gpt2.download_gpt2(model_name=model_name)
    gpt2.finetune(sess,
                  dataset=data_path,
                  model_name=model_name,
                  steps=steps,
                  restore_from=restore_from,
                  run_name=run_name,
                  print_every=print_every,
                  sample_every=sample_every,
                  save_every=save_every)
Example #18
def gpt2_finetuning(model_name, data_file, step):
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)   # model is saved into current directory under /models/124M/

    sess = gpt2.start_tf_sess()
    file_name = data_file
    '''
    For fine-tuning, use the command below. The other parameters of the finetune function are:
        sess,
        dataset,
        steps=-1,
        model_name='124M',
        model_dir='models',
        combine=50000,
        batch_size=1,
        learning_rate=0.0001,
        accumulate_gradients=5,
        restore_from='latest',
        run_name='run1',
        checkpoint_dir='checkpoint',
        sample_every=100,
        sample_length=1023,
        sample_num=1,
        multi_gpu=False,
        save_every=1000,
        print_every=1,
        max_checkpoints=1,
        use_memory_saving_gradients=False,
        only_train_transformer_layers=False,
        optimizer='adam',
        overwrite=False
    '''
    gpt2.finetune(sess,
            file_name,
            model_name=model_name,
            run_name='run'+str(step),
            batch_size=4,
            checkpoint_dir='checkpoint',
            steps=step)   # steps is max number of training steps
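Only a few of the parameters listed in the docstring appear in the call above. As an illustration only (not part of the original example, reusing the variable names from the function above), a second pass that resumes the same run with more of those parameters might look like this; `reset_session` is assumed to be available in the installed gpt_2_simple to clear the graph before `finetune` is called again:

sess = gpt2.reset_session(sess)  # fresh graph/session before a second finetune call
gpt2.finetune(sess,
              file_name,
              model_name=model_name,
              run_name='run' + str(step),
              restore_from='latest',   # pick up from the last saved checkpoint
              learning_rate=5e-5,      # lower learning rate for continued training
              sample_every=200,        # print a sample every 200 steps
              save_every=500,          # write a checkpoint every 500 steps
              steps=step)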
Example #19
def main():
    parser = argparse.ArgumentParser(description='GPT-2 training utility.')
    parser.add_argument('--name', '-n', help='Name for the trained model')
    parser.add_argument('--corpus', '-c', help='Corpus to fine-tune the model')
    parser.add_argument('--model', '-m', help='Pretrained Model name for fine-tuning', default='124M')
    args = parser.parse_args()

    import gpt_2_simple as gpt2
    # Download the pretrained model if not exists in models
    if not os.path.exists(os.path.join('models',args.model)):
        gpt2.download_gpt2(model_name=args.model)

    sess = gpt2.start_tf_sess()
    gpt2.finetune(sess,
        dataset=args.corpus,
        model_name=args.model,
        steps=1000,
        restore_from='fresh',
        run_name=args.name,
        print_every=10,
        sample_every=100,
        save_every=50)
Example #20
def finetune_gpt2(iterations=1000, text_path=r'scraped_text\comments.txt'):
    """
    Finetune GPT-2.

    iterations - number of training steps to run
    text_path - path to the text file used for fine-tuning
    """
    assert os.path.exists(text_path), 'Text file {} doesn\'t exist!'.format(
        text_path)

    if not os.path.isdir(os.path.join('models', '345M')):
        print('Warning: Downloading large file')
        gpt2.download_gpt2(model_name='345M')

    print('\nFinetuning\nWarning: Very slow without GPU')
    print('-' * 30)

    sess = gpt2.start_tf_sess()  # a TF session is needed before fine-tuning
    gpt2.finetune(sess,
                  text_path,
                  model_name='345M',
                  overwrite=True,
                  steps=iterations)
Example #21
import pandas as pd
import time
import os
from random import randint, randrange, choice
import gpt_2_simple as gpt2

from tweepy import OAuthHandler, API

import twitter_credentials_trump as tc

auth = OAuthHandler(tc.CONSUMER_KEY, tc.CONSUMER_SECRET)
auth.set_access_token(tc.ACCESS_TOKEN, tc.ACCESS_TOKEN_SECRET)
api = API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
print('twitter logged in...')
model_name = "355M"
if not os.path.isdir(os.path.join("models", model_name)):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(model_name=model_name)
print("model loaded")


def generate_trending_tweet():
    topics = ["Biden", "Trump"]
    topic = choice(topics)
    # this is just for testing repeat topics- remove before deployment
    print("generating topical tweets on subject: " + topic)

    # update the text file with current tweets
    file_name = '../data/'+topic+'.txt'
    topical_tweets = get_topic_tweets(topic, 5000)
    t_tweet_string = " || ".join(topical_tweets)

    with open(file_name, 'w') as f:
        f.write(t_tweet_string)  # write the joined tweets out as the fine-tuning corpus
"""

!nvidia-smi

"""## Downloading GPT-2


* `124M` (default): the "small" model, 500MB on disk.
* `355M`: the "medium" model, 1.5GB on disk.
* `774M`: the "large" model; too big to fine-tune in this notebook.
* `1558M`: the "extra-large" model; far too big to fine-tune in this notebook.

Larger models have more knowledge, but take longer to finetune and longer to generate text. 
"""

gpt2.download_gpt2(model_name="355M")

"""## Mounting Google Drive

VM drive mounting to get a text file loaded for i/o
"""

gpt2.mount_gdrive()
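# Sketch (not part of the original notebook): with Drive mounted, a corpus that
# already lives in Google Drive can be copied into the Colab VM instead of being
# uploaded by hand. The file name below is hypothetical, and copy_file_from_gdrive
# is assumed to be present in the installed gpt-2-simple version.
gpt2.copy_file_from_gdrive("my_corpus.txt")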

"""## Uploading txt file
Upload **any smaller text file**  (<10 MB) and update the file name in the cell below, then run the cell.
"""

file_name = "shakespeare.txt"
if not os.path.isfile(file_name):
	url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
Example #23
def get_base_model(model_name="355M"):
    if not os.path.isdir(os.path.join("models", model_name)):
        print(f"Downloading {model_name} model...")
        gpt2.download_gpt2(model_name=model_name)
Example #24
def download_model(args):
    if not os.path.isdir(os.path.join("models", args.model_name)):
        print(f"Downloading {args.model_name} model...")
        gpt2.download_gpt2(model_name=args.model_name)
    else:
        print(f"Model {args.model_name}  already downloaded")
Example #25
@author: tbarton
"""
import sys
import subprocess
import os
# os.chdir('machine_learning_scripts/speed_test')
import tensorflow as tf

if tf.__version__.startswith('2'):
    print('wrong tf version, installing tensorflow 1.15')
    subprocess.check_call(
        [sys.executable, '-m', 'pip', 'install', 'tensorflow==1.15.0'])
import gpt_2_simple as gpt2

gpt2.download_gpt2(model_name='774M')  # model names are case-sensitive ('774M', not '774m')
# gpt2.download_gpt2()

sess = gpt2.start_tf_sess()
#sess = gpt2.load_gpt2(sess, 'second_run', multi_gpu=True)
gpt2.finetune(sess,
              dataset='big_chess_set.txt',
              model_name='774M',  # match the model downloaded above; finetune defaults to 124M otherwise
              run_name='new_run_large',
              print_every=1,
              multi_gpu=True,
              save_every=100,
              combine=100,
              steps=10000)  # steps is max number of training steps

# print('readying to generate!')
# single_text = gpt2.generate(sess, prefix='e4, e5 ', return_as_list=True, run_name='large_run')[0]
Example #26
#!/usr/bin/env python

import os

import gpt_2_simple as gpt2

STEPS = 500
MODEL_NAME = '355M'
FILE_PATH = 'tweets'

if not os.path.isdir(os.path.join('models', MODEL_NAME)):
    print(f'Downloading {MODEL_NAME} model...')
    gpt2.download_gpt2(model_name=MODEL_NAME)

gpt2.encode_dataset(f'{FILE_PATH}.csv')  # writes text_encoded.npz, which is fine-tuned on below

sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              'text_encoded.npz',
              model_name=MODEL_NAME,
              steps=STEPS,
              restore_from='fresh',
              run_name='run1',
              print_every=10,
              sample_every=100)
Example #27
!pip install -q gpt-2-simple
import gpt_2_simple as gpt2
from google.colab import files

gpt2.download_gpt2(model_name="124M")
file_name = "TaylorLyrics.txt"
sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=1000
              )

gpt2.generate(sess)
Example #28
import gpt_2_simple as gpt2
import os
import requests

model_name = "124M"

if not os.path.isdir(os.path.join("models", model_name)):
    print(f"Downloading {model_name} model...")
    gpt2.download_gpt2(
        model_name=model_name
    )  # model is saved into current directory under /models/124M/

sess = gpt2.start_tf_sess()

gpt2.finetune(sess, "./train.txt", model_name=model_name, steps=1000)

gpt2.generate(sess)
Example #29
    def check_model(self):
        if not os.path.isdir(os.path.join("models", self.base_model)):
            gpt2.download_gpt2(model_name=self.base_model)
Example #30
def main(
        # Required
        source: ('Source file to load data from', 'positional'),

        # Flags
        autodownload: ('Automatically download model if needed?', 'flag'),
        finetune: ('Run a fine-tuning pass on the model?', 'flag'),
        resume: ('Resume training from a prior run?', 'flag'),
        skip_plagiarism: ('Skip checking results for plagiarism?', 'flag'),

        # Options
        # There are multiple GPT-2 models that have been released. 124M is the smallest, and it
        # gives more than adequate results.
        model_name: ('Name of the GPT-2 model to use', 'option')='124M',
        run_name: ('Name to give this run - used for resuming prior runs', 'option')='run1',
        steps: ('Number of steps of training to carry out', 'option', None, int)=100,
        nsamples: ('Number of generation passes to run', 'option', None, int)=1,
        save_every: ('Save a checkpoint every this many steps', 'option', None, int)=200,
        sample_every: ('Sample the output during training every this many steps', 'option', None, int)=100,
        restore_from: ('Checkpoint to resume from', 'option')='latest',
        output_file: ('Name of the csv file to write', 'option')=None,
        delimiter: ('Character that delimits columns in source', 'option')=',',
        quote_column: ('Label for the column with quotes', 'option')='quotes',
        attribution_column: ('Label for the column with attributions', 'option')='attrib_name',
        source_attribution: ('Use only quotes from this source', 'option')=None,
    ):

    model_directory = 'models' # If we want, we could make this configurable, but there's some
                               # testing involved to make sure we do so consistently everywhere.
    temporary_input_file = 'temp_input.csv' # The file containing cleaned data from the quotes

    print(f"Using model: {model_name}")

    # Do we have the base model? If not, we need to get it.
    if not os.path.isdir(os.path.join(model_directory, model_name)):
        if autodownload:
            print(f"Downloading {model_name} model...")
            gpt2.download_gpt2(model_name=model_name, model_dir=model_directory)
        else:
            print(f"Couldn't find {model_name} in {model_directory}. Turn on autodownload to fetch it.")
            return


    attributions = Counter()
    # Track all individual quotes so that we can check whether we've regenerated them.
    source_quotes = []

    # TODO: Break this out into a parse phase and a write phase
    with open(source, newline='') as quote_file:
        with open(temporary_input_file, 'w', newline='') as out_file:
            quote_reader = csv.DictReader(quote_file, delimiter=delimiter)
            quote_writer = csv.DictWriter(out_file, fieldnames=[quote_column], extrasaction='ignore')
            quote_writer.writeheader() # Loader assumes there will be a header and skips it.
            for row in quote_reader:
                if source_attribution is None or row[attribution_column] == source_attribution:
                    quote_writer.writerow(row)
                source_quotes.append(row[quote_column])
                attributions[row[attribution_column]] += 1

    print("Loaded {} quotes attributed to {} sources.".format(len(source_quotes), len(attributions)))
    print("Top 10 sources:")
    print(attributions.most_common(10))

    if source_attribution is not None:
        print("{} quotes by {}".format(attributions[source_attribution], source_attribution))

    sess = gpt2.start_tf_sess()

    if resume:
        print(f"Loading run {run_name}")
        # If the model name is set, the run name will be ignored.
        gpt2.load_gpt2(sess, run_name=run_name)


    if finetune:
        print('Fine-tuning the model from training data')
        gpt2.finetune(
            sess,
            temporary_input_file,
            model_name=model_name,
            steps=steps,
            run_name=run_name,
            save_every=save_every,
            sample_every=sample_every,
            restore_from=restore_from,
            max_checkpoints=100, # How many checkpoints to keep for each run
        )


    print('Generating quotes')
    quotes_with_delimiters = gpt2.generate(
        sess,
        run_name=run_name,
        nsamples=nsamples,
        return_as_list=True
    )

    print('Parsing quotes')
    generated_quotes = []
    for sample in quotes_with_delimiters:
        print("SAMPLE: [" + sample + "]")
        generated_quotes.extend(find_quotes(sample))

    results = []
    if skip_plagiarism:
        for quote in generated_quotes:
            print("QUOTE: [" + quote + "]")
            results.append({"quote": quote})
    else:
        print('Checking for plagiarism')
        novel_quote_count = 0
        for quote in generated_quotes:
            closest_match = process.extractOne(quote, source_quotes)
            print("QUOTE: [" + quote + "]")
            if closest_match[1] >= 90:
                # This is a bit too close.
                print("MATCH: [" + closest_match[0] + "]")
            else:
                novel_quote_count += 1
            result = {
                "quote": quote,
                "best_match": closest_match[0],
                "match_score": closest_match[1],
            }
            results.append(result)

        print('Novel quotes generated: {} of {} ({:.2%})'.format(novel_quote_count, len(results), novel_quote_count/len(results)))

    if output_file is None:
        output_file = run_name + '_' + source

    if skip_plagiarism:
        fieldnames=['quote']
    else:
        fieldnames=['quote', 'best_match', 'match_score']

    with open(output_file, 'w', newline='') as out_file:
        quote_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        quote_writer.writeheader()
        for row in results:
            quote_writer.writerow(row)
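The annotated signature of `main` above follows the plac calling convention. Assuming plac is what drives this script (the entry point is not shown in the example), it would be wired up roughly like this:

import plac

if __name__ == '__main__':
    plac.call(main)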