        lyrics[i] += row['lyric'] + '.\n'
    # When all of a song's lyric lines have been joined, move on to the next song
    else:
        lyrics, songID, songName = processFirstLine(lyrics, songID, songName, row)
        songNumber = row['track_n']
        i += 1

# Define a new pandas DataFrame to save songID, songName, and lyrics for later use
lyrics_data = pd.DataFrame({'songID': songID, 'songName': songName, 'lyrics': lyrics})
lyrics_data.head(3)

# Write one song's lyrics per line to a text file on Google Drive
with open('/content/drive/My Drive/lyrics.txt', 'w') as f:
    for item in lyrics_data.lyrics.to_list():
        f.write("%s\n" % item)

gpt2.copy_file_from_gdrive('lyrics.txt')

sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset='lyrics.txt',
              model_name='124M',
              steps=1000,
              restore_from='fresh',
              run_name='run1',
              print_every=10,
              sample_every=200,
              save_every=500
              )

gpt2.generate(sess,
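"""`processFirstLine` is called above but not defined in this excerpt. A minimal sketch of what such a helper might do, assuming the dataset holds one lyric line per row (the `track_title` column name is hypothetical; only `track_n` and `lyric` appear in the original code):"""

def processFirstLine(lyrics, songID, songName, row):
    # Start a new list entry holding the song's first lyric line,
    # so subsequent rows of the same song can be appended to lyrics[i].
    lyrics.append(row['lyric'] + '.\n')
    songID.append(row['track_n'])        # assumed ID column
    songName.append(row['track_title'])  # hypothetical title column
    return lyrics, songID, songName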
gpt2.mount_gdrive()

"""## Uploading a txt file

Upload **any smaller text file** (<10 MB) and update the file name in the cell below, then run the cell.
"""

file_name = "shakespeare.txt"
if not os.path.isfile(file_name):
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    data = requests.get(url)
    with open(file_name, 'w') as f:
        f.write(data.text)

gpt2.copy_file_from_gdrive(file_name)

"""## Training

Training may take a while. TensorFlow does its magic and checkpoints are saved along the way; if you are using Colab, be sure to save them so the time spent isn't wasted.

* **`restore_from`**: Set to `fresh` to start training from the base GPT-2, or to `latest` to restart training from an existing checkpoint.
* **`sample_every`**: Number of steps between printing example output.
* **`print_every`**: Number of steps between printing training progress.
* **`learning_rate`**: Learning rate for the training (default `1e-4`; can be lowered to `1e-5` if you have <1 MB of input data).
* **`run_name`**: Subfolder within `checkpoint` to save the model to. This is useful if you want to work with multiple models (you will also need to specify `run_name` when loading the model).
* **`overwrite`**: Set to `True` if you want to continue finetuning an existing model (with `restore_from='latest'`) without creating duplicate copies.
"""

sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset=file_name,
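"""Once finetuning finishes, the gpt-2-simple API can copy the checkpoint to Google Drive so a Colab disconnect doesn't lose the run, and then sample from the model. A short illustrative sketch, assuming the `run1` run name used earlier:"""

# Persist the finetuned checkpoint to Drive, then generate a sample from it.
gpt2.copy_checkpoint_to_gdrive(run_name='run1')
gpt2.generate(sess, run_name='run1')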
!pip install requests
!pip install tqdm

import io

import tensorflow as tf
import gpt_2_simple as gpt2
from google.colab import drive

print(tf.keras.__version__)
print(tf.__version__)

!ls

"""# Loading Dataset"""

gpt2.mount_gdrive()
drive.mount("/content/drive")
gpt2.copy_file_from_gdrive('emily_dickinson_titleless.txt')

"""The dataset loaded above was taken from [JenLooper's Kaggle dataset](https://www.kaggle.com/jenlooper/emily-dickinson-poetry). In earlier test iterations, I tried to connect this Colab notebook to Kaggle through its API, but it failed quite often while running, so I extracted the .txt file and placed it in my Google Drive instead."""

with io.open('emily_dickinson_titleless.txt', encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

text[:100]
len(text[:50])

!ls

chars = sorted(list(set(text)))
print('total chars:', len(chars))
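"""With the character vocabulary built, the usual next step in the Keras char-RNN recipe is a pair of lookup tables between characters and integer indices, used later to vectorize the corpus. A brief sketch of that step (not taken from the original excerpt):"""

# Map each character to an integer index and back.
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))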