Example #1
# Loop over the lyric rows: while a row still belongs to the current song,
# append its lyric line; otherwise start a new entry for the next song.
# (The loop header and if-condition are reconstructed from the else branch;
# the original cell is truncated and starts mid-loop.)
for index, row in df.iterrows():
    if row['track_n'] == songNumber:
        lyrics[i] += row['lyric'] + '.\n'
    # When this song's lyric lines are all joined, move on to the next song:
    else:
        lyrics, songID, songName = processFirstLine(lyrics, songID, songName, row)
        songNumber = row['track_n']
        i += 1
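"""`processFirstLine` is defined earlier in the source notebook and is not shown here. Based on how it is called, a plausible sketch is below; `track_title` is an assumed column name (only `track_n` and `lyric` appear in the code above)."""

# Hypothetical reconstruction of the helper used in the loop above:
# it starts a fresh lyrics entry and records the new song's ID and name.
def processFirstLine(lyrics, songID, songName, row):
    lyrics.append(row['lyric'] + '.\n')   # first lyric line of the new song
    songID.append(row['track_n'])         # song ID column, as in the loop above
    songName.append(row['track_title'])   # assumed title column (hypothetical)
    return lyrics, songID, songName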

# Collect songID, songName, and lyrics into a new pandas DataFrame for later use
lyrics_data = pd.DataFrame({'songID': songID, 'songName': songName, 'lyrics': lyrics})
lyrics_data.head(3)

# Write every song's lyrics to a single text file on Drive for fine-tuning
with open('/content/drive/My Drive/lyrics.txt', 'w') as f:
    for item in lyrics_data.lyrics.to_list():
        f.write("%s\n" % item)

gpt2.copy_file_from_gdrive('lyrics.txt')

sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset='lyrics.txt',
              model_name='124M',
              steps=1000,
              restore_from='fresh',
              run_name='run1',
              print_every=10,
              sample_every=200,
              save_every=500
              )
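"""Colab's local disk is wiped when the runtime disconnects, so it is worth copying the fine-tuned checkpoint back to Drive right after training. gpt-2-simple ships a helper for this; a minimal sketch, assuming Drive was mounted with `gpt2.mount_gdrive()`:"""

# Copy the run1 checkpoint folder from the Colab disk to Google Drive
# so the fine-tuned model survives a runtime disconnect.
gpt2.copy_checkpoint_to_gdrive(run_name='run1')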

# The original cell is truncated here; a minimal call matching the run above:
gpt2.generate(sess, run_name='run1')
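"""`gpt2.generate` also accepts sampling controls. A sketch of a more tailored call (the prompt and parameter values are illustrative, not from the original):"""

# Generate five samples in one batch, seeded with a short prompt.
gpt2.generate(sess,
              run_name='run1',
              length=200,          # tokens per sample
              temperature=0.7,     # lower values give more conservative output
              prefix="Verse 1:",   # illustrative prompt
              nsamples=5,
              batch_size=5)        # nsamples must be divisible by batch_size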
Example #2
gpt2.mount_gdrive()

"""## Uploading txt file
Upload **any smaller text file**  (<10 MB) and update the file name in the cell below, then run the cell.
"""

file_name = "shakespeare.txt"
if not os.path.isfile(file_name):
	url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
	data = requests.get(url)
	
	with open(file_name, 'w') as f:
		f.write(data.text)

gpt2.copy_file_from_gdrive(file_name)

"""## training ... may take a while.  TensorFlow magic, saving checkpoints, be sure to save if using coLab so its not a bunch of wasted time..

* **`restore_from`**: Set to `fresh` to start training from the base GPT-2, or set to `latest` to resume training from an existing checkpoint.
* **`sample_every`**: Number of steps between printing example output.
* **`print_every`**: Number of steps between printing training progress.
* **`learning_rate`**: Learning rate for the training. (default `1e-4`; can lower to `1e-5` if you have <1MB of input data)
* **`run_name`**: subfolder within `checkpoint` in which to save the model. This is useful if you want to work with multiple models (you will also need to specify `run_name` when loading the model).
* **`overwrite`**: Set to `True` to continue finetuning an existing model (with `restore_from='latest'`) without creating duplicate copies; see the resume sketch after the training cell below.
"""

sess = gpt2.start_tf_sess()

# (Arguments below the dataset mirror Example #1's call; the original cell is truncated.)
gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=1000,
              restore_from='fresh',
              run_name='run1')
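"""To continue a previous run instead of starting fresh, combine `restore_from='latest'` with `overwrite=True` as described in the list above. A sketch, to be run in a fresh session (the step count is illustrative):"""

# Resume fine-tuning from the existing run1 checkpoint, updating it
# in place rather than creating a duplicate copy.
sess = gpt2.start_tf_sess()
gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=500,               # illustrative number of extra steps
              restore_from='latest',   # pick up from the saved checkpoint
              run_name='run1',
              overwrite=True)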
Example #3
!pip install requests

!pip install tqdm

import tensorflow as tf

print(tf.keras.__version__)
print(tf.__version__)

!ls

"""# Loading Dataset"""

from google.colab import drive

gpt2.mount_gdrive()            # mounts Google Drive at /content/drive
drive.mount("/content/drive")  # redundant with the call above, but harmless

gpt2.copy_file_from_gdrive('emily_dickinson_titleless.txt')

"""The data set loaded above was taken from [JenLooper's Kraggle data set](https://www.kaggle.com/jenlooper/emily-dickinson-poetry). In my earlier test iterations, I had tried to incorporate an API between my Colab Notebook and Kraggle. However, I realised it failed quite often while I was running it. So I extracted out the .txt file and placed it within my Google Drive instead."""

import io

# Read the corpus and lowercase it to shrink the character vocabulary
with io.open('emily_dickinson_titleless.txt', encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

text[:100]

len(text[:50])

!ls

chars = sorted(list(set(text)))  # the unique characters form the vocabulary
print('total chars:', len(chars))
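"""A character-level pipeline usually follows the vocabulary build with index lookups in both directions; a minimal sketch of that next step (the dictionary names follow the usual Keras char-RNN convention):"""

# Map each character to an integer index and back again; these tables
# turn raw text into model-ready integer sequences.
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

encoded = [char_indices[c] for c in text[:20]]       # encode a short snippet
decoded = ''.join(indices_char[i] for i in encoded)  # decode it back
print(encoded)
print(decoded == text[:20])  # True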