## songStats.py
import csv

import langdetect as ld
from sklearn.feature_extraction.text import CountVectorizer

import getSysArgs


def main():
    ## To ensure consistent identification of language for lyrics
    ld.DetectorFactory.seed = 0

    ## CSV file of lyrics
    [fName] = getSysArgs.usage(['songStats.py',
                                '<lyric_data_file_path>'])[1:]

    ## Open CSV file of lyrics
    dataCSV = csv.reader(open(fName, 'rU'))

    ## Column headers
    colNames = dataCSV.next()

    ## Index of lyrics column
    lyricI = colNames.index('lyrics')

    ## To store song statistics (assumes lyrics are in right-most column)
    stats = [colNames[:-1] + ['lyricCharCt', 'lyricWordCt', 'lyricVocabSize',
                              'lines', 'lang']]

    ## To obtain counts of words where strings of non-alphanumeric characters
    #  are not considered words
    tokenize = CountVectorizer().build_analyzer()

    for row in dataCSV:
        print 'Number of song statistics collected: ', row[0]

        ## Song's language
        lang = None

        ## Lyrics for a song
        lyrics = row[lyricI]

        ## Lines in song
        lines = lyrics.count('\n')

        ## Words in song
        words = tokenize(lyrics)

        ## Number of words in song
        wrdCt = len(words)

        if wrdCt < 1:
            lang = 'none'
        else:
            try:
                lang = ld.detect(lyrics.decode('utf-8'))
            except ld.lang_detect_exception.LangDetectException:
                lang = 'none'
                print lyrics

        stats.append(row[:-1] + [len(lyrics), wrdCt, len(set(words)), lines,
                                 lang])

    ## File to write song statistics
    statsCSV = csv.writer(open('songStats.csv', 'wb', buffering = 0))

    statsCSV.writerows(stats)

    return
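## Hypothetical sketch of the getSysArgs module that all four scripts here
## import; the real helper is not shown in this extract, so the message text
## and exit behavior below are assumptions. The call pattern above implies
## usage() validates the argument count against the expected list and
## returns sys.argv.
import sys


def usage(expected):
    ## expected[0] is the script name; the rest describe required arguments
    if len(sys.argv) != len(expected):
        print 'Usage: python ' + ' '.join(expected)
        sys.exit(1)

    return sys.argv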
## readLyrics.py
import csv

import nltk
import numpy as np

import getSysArgs


def main():
    ## CSV file of lyrics
    [fName] = getSysArgs.usage(['readLyrics.py',
                                '<lyric_data_file_path>'])[1:]

    ## Open CSV file of lyrics
    dataCSV = csv.reader(open(fName, 'rU'))

    ## To store lyric statistics
    lyricStats = [['Song', 'No. Lines', 'No. Words']]

    ## To store artists
    artists = {}

    ## To store genres
    genres = {}

    ## To store years
    years = {}

    ## Column headers
    colNames = dataCSV.next()

    for row in dataCSV:
        ## Index of row
        i = row[colNames.index('index')]

        ## Song in row
        song = row[colNames.index('song')]

        ## Lyrics (song text) in row
        lyrics = row[colNames.index('lyrics')]

        lyricStats.append([(song, i), lyrics.count('\n') + 1,
                           len(nltk.word_tokenize(lyrics.decode('utf-8')))])

        ## Artist in row
        artist = row[colNames.index('artist')]

        ## Genre in row
        genre = row[colNames.index('genre')]

        ## Year in row
        year = row[colNames.index('year')]

        if artist not in artists:
            artists[artist] = 0

        if genre not in genres:
            genres[genre] = 0

        if year not in years:
            years[year] = 0

        artists[artist] += 1
        genres[genre] += 1
        years[year] += 1

    ## File to write lyric statistics
    lyricCSV = csv.writer(open('lyricStats.csv', 'wb', buffering=0))

    artists = np.transpose(np.array([artists.keys(), artists.values()]))
    genres = np.transpose(np.array([genres.keys(), genres.values()]))
    years = np.transpose(np.array([years.keys(), years.values()]))

    ## File to write artists
    artistCSV = csv.writer(open('artistStats.csv', 'wb', buffering=0))

    ## File to write genres
    genreCSV = csv.writer(open('genreStats.csv', 'wb', buffering=0))

    ## File to write years
    yearCSV = csv.writer(open('yearStats.csv', 'wb', buffering=0))

    lyricCSV.writerows(lyricStats)
    artistCSV.writerows(artists)
    genreCSV.writerows(genres)
    yearCSV.writerows(years)

    return
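## Minimal smoke-test input for readLyrics.py and songStats.py, assuming the
## lyric CSV uses the column names referenced above (index, song, year,
## artist, genre, lyrics) with lyrics in the right-most column; the filename
## toy_lyrics.csv and the rows are illustrative only.
import csv

toyRows = [['index', 'song', 'year', 'artist', 'genre', 'lyrics'],
           ['0', 'song-a', '2006', 'artist-1', 'Rock',
            'hello world\ngoodbye world'],
           ['1', 'song-b', '2007', 'artist-2', 'Pop', 'la la la\nla la']]

csv.writer(open('toy_lyrics.csv', 'wb')).writerows(toyRows)

## Example invocations:
##   python readLyrics.py toy_lyrics.csv
##   python songStats.py toy_lyrics.csv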
## generate.py
import csv
from pickle import load

import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

import getSysArgs


def main():
    ## The specified group from which to generate lyrics and lyric data
    [groupType, group, seedFile, variation] = getSysArgs.usage(
        ['generate.py', '<group_type>', '<group_name>',
         '<seed_lyrics_file_path>', '<variation_score>'])[1:]

    try:
        variation = int(variation)
    except ValueError:
        variation = float(variation)

    ## Type of variation score provided
    typeVar = type(variation)

    if ((typeVar == int and variation < 1)
        or (typeVar == float and (variation <= 0.0 or variation > 1.0))):
        print 'Invalid variation score: ', variation
        print 'Input a float between 0.0 and 1.0 or an int above 1'
        return

    ## Load in seed lyrics
    seed = open(seedFile, 'rU')

    ## Seed lyrics
    seedLyrics = ''

    for line in seed:
        seedLyrics += line

    if seedLyrics[-1] == '\n':  #remove newline char if last char in lyrics
        seedLyrics = seedLyrics[:-1]

    ## Open CSV tracking the best model filenames for various groups
    bestModels = csv.reader(open('bestModels.csv', 'rU'))

    ## Dropout rate of trained model
    dropout = None

    ## Filename of trained model
    modelFile = ''

    for row in bestModels:  #assumes 1st and 2nd cols are groupType and group
        if row[:2] == [groupType, group]:
            dropout = float(row[2])  #assumes dropout is in 3rd column
            modelFile = row[3]  #assumes model filename is in 4th column
            break

    if modelFile == '':
        print 'No model has been trained for the', groupType, group, 'yet'
        return

    modelFile = './weights/' + modelFile

    ## Length of training sequence
    seqLen = 30

    ## To gather training lyric data
    train = load(open('./seqs/' + groupType + '-' + group + '-seq.pkl', 'rb'))

    ## Seed lyric sequence
    seedSeq = train.getWordSeq(seedLyrics)

    if len(seedSeq) < 1:  #check before padding so an empty seed is caught
        print 'Seed lyrics require at least one word\nGiven seed lyrics:'
        print seedLyrics
        return

    seedSeq = ['ppaadd'] * (seqLen - len(seedSeq)) + seedSeq + ['endofline']
    seedSeq = seedSeq[len(seedSeq) - seqLen:]

    ## Number of words in vocabulary
    vocabSize = len(train.words)

    ## Build generator model
    model = Sequential()
    model.add(LSTM(256, input_shape = (seqLen, 1), return_sequences = True)) #1
    model.add(Dropout(dropout))
    model.add(LSTM(256))  #2nd layer
    model.add(Dropout(dropout))
    model.add(Dense(vocabSize, activation = 'softmax'))
    model.load_weights(modelFile)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

    ## Seed numerical sequence
    seedNS = []

    for word in seedSeq:
        if word in train.words:
            seedNS.append(train.words[word])
        else:  #use most-likely word if seed word not in vocabulary
            seedNS.append(
                np.argmax(
                    model.predict(
                        train.normObs(
                            np.array([train.words['ppaadd']]
                                     * (seqLen - len(seedNS)) + seedNS,
                                     dtype = np.float64),
                            (1, seqLen, 1)),
                        verbose = 0)))

    ## Generated text
    genTxt = seedLyrics + ' |\n'

    ## To store prediction distributions
    predDists = np.empty((200, vocabSize))

    ## Vocabulary as a list for use in randomly selecting word
    wordList = [None] * len(train.words)

    for word in train.words:
        wordList[train.words[word]] = word

    for i in range(200):
        ## Generation initialization for model
        genX = train.normObs(np.array(seedNS, dtype = np.float64),
                             (1, seqLen, 1))

        ## Word-prediction distribution
        pred = model.predict(genX, verbose = 0)

        predDists[i] = pred[0]

        if type(variation) == int and variation > vocabSize:
            print variation, 'larger than vocabulary size of', vocabSize
            variation = vocabSize

        if type(variation) == float:
            variation = int(variation * vocabSize)

        ## Indices of highest probability next words
        topIs = np.argsort(-pred[0])[:variation]

        pred = pred[0][topIs]
        pred = pred / np.sum(pred)

        ## Highest probability words
        topWords = np.array(wordList)[topIs]

        ## Next word based on randomly choosing from word distribution
        next = np.random.choice(topWords, p = pred)

        if next == 'endofsong':
            break
        elif next == 'endofline':
            genTxt += '\n'
        elif next == 'commachar':
            genTxt += ','
        elif next == 'questionmark':
            genTxt += '?'
        else:
            genTxt += ' ' + next

        seedNS.append(train.words[next])
        seedNS = seedNS[1:]

    genTxt = genTxt.replace('endofline', '\n')
    genTxt = genTxt.replace('commachar', ',')
    genTxt = genTxt.replace('questionmark', '?')

    print genTxt

    ## File to write prediction distributions for each predicted word
    distsCSV = csv.writer(open('predDists.csv', 'wb', buffering = 0))

    distsCSV.writerows(predDists)

    return
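## Sketch of the bestModels.csv layout generate.py expects, inferred from the
## columns read above (group type, group name, dropout, weights filename);
## the row values below are illustrative only, and the weights filename
## follows the checkpoint naming used by trainRnn.py further down.
import csv

bestRows = [['genre', 'Rock', '0.2',
             'Rock-weights-improvement-1.2345-epoch_42of50-chunk_1of3.hdf5']]

csv.writer(open('bestModels.csv', 'wb')).writerows(bestRows)

## Example invocations (group names illustrative):
##   python generate.py genre Rock seed.txt 0.1   # sample from the top 10% of the vocabulary
##   python generate.py genre Rock seed.txt 5     # sample from the 5 most likely next words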
## trainRnn.py
from __future__ import print_function

import csv
import time
from pickle import dump

import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

import getSysArgs
## The Lyrics class is assumed to come from another module in this project;
## its import is not shown in this extract.


def main():
    ## The specified group on which to train, lyric data, and GBs of memory
    [fName, groupType, group, memGB] = getSysArgs.usage([
        'trainRnn.py', '<lyric_data_file_path>', '<group_type>',
        '<group_name>', '<memory_size_limit_in_GB>'])[1:]

    ## Length of training sequence
    seqLen = 30

    ## To gather training lyric data
    train = Lyrics()

    train.lyrics2seqs(groupType, group, [fName], seqLen)
    dump(train, open('./seqs/' + groupType + '-' + group + '-seq.pkl', 'wb'))

    ## Number of epochs for training
    epochs = 50

    ## Vocabulary size
    vocabSize = len(train.words)

    ## Training data chunk size
    #  (size chosen to be the number of data points that can fit into the
    #  user-specified memory limit)
    chunkSize = int(memGB) * 1073741824 / ((seqLen + vocabSize) * 8)

    ## Range of epochs
    epochRng = range(epochs)

    ## Trained weights file from previous iteration
    prevCP = ''

    ## Number of training datapoints for model
    dataPts = 0

    for song in train.lyricSeq:
        dataPts += len(song) - seqLen

    ## Chunks for training model
    chunks = [chunkSize] * (dataPts / chunkSize) + [dataPts % chunkSize]

    if chunks[-1] == 0:
        chunks = chunks[:-1]

    ## Number of chunks
    numChunks = len(chunks)

    ## Build model
    model = Sequential()
    model.add(LSTM(256, input_shape=(seqLen, 1), return_sequences=True)) #1
    model.add(Dropout(0.2))
    model.add(LSTM(256))  #2nd layer
    model.add(Dropout(0.2))
    model.add(Dense(vocabSize, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    ## Chunk counter
    chunkC = 0

    ## Filename for datapoints
    dpFname = "./datapoints/%s-%s_chunk_%dof%d.npz"

    ## Position in lyric sequence
    seqI = 0

    ## Index of song in review
    songI = 0

    for chunk in chunks:
        print("chunk %d of %d" % (chunkC + 1, numChunks))
        print("starting at song %d of %d at word %d/%d"
              % (songI, len(train.lyricSeq), seqI,
                 len(train.lyricSeq[songI])))

        ## Observations
        dataX = []

        ## Responses
        dataY = []

        ## Number range the size of the given chunk
        chunkRng = range(chunk)

        ## Starting time
        ti = time.time()

        for i in chunkRng:
            print("datapoint %d/%d %ds" % (i + 1, chunk, time.time() - ti),
                  end='\r')

            if seqI >= len(train.lyricSeq[songI]) - seqLen:
                songI += 1
                seqI = 0

            dataX.append(train.numSeq[songI][seqI:seqI + seqLen])
            dataY.append([0] * vocabSize)
            dataY[-1][train.numSeq[songI][seqI + seqLen]] = 1
            seqI += 1

        print('\n')

        ti = time.time()
        dataX = train.normObs(np.array(dataX, dtype=np.float64),
                              (chunk, seqLen, 1))
        dataY = np.array(dataY, dtype=np.float64)
        print("numpy arrays created in %d seconds" % (time.time() - ti))

        np.savez_compressed(dpFname % (groupType, group, chunkC + 1,
                                       numChunks),
                            X=dataX, Y=dataY)
        chunkC += 1

    ## Loss Data
    losses = [['Step', 'Loss']]

    ## Counter for loss steps
    lossC = 1

    ## Lowest loss from model training
    minLoss = float('inf')

    for i in epochRng:
        print("Epoch %d of %d" % (i + 1, epochs))

        chunkC = 0

        for chunk in chunks:
            print("chunk %d of %d" % (chunkC + 1, numChunks))

            ti = time.time()

            data = np.load(dpFname % (groupType, group, chunkC + 1,
                                      numChunks))
            print("chunk loaded in %d seconds" % (time.time() - ti))

            ## Train model with datapoints and store callbacks history
            cbHist = model.fit(data['X'], data['Y'], epochs=1, batch_size=64)

            ## Loss of current chunk's training
            currLoss = cbHist.history['loss'][0]

            if currLoss < minLoss:
                ## Checkpoint filename
                fNameCP = "./weights/%s-weights-improvement-%.4f-epoch_%dof%d-chunk_%dof%d.hdf5" % (
                    group, currLoss, i + 1, epochs, chunkC + 1, numChunks)

                model.save(fNameCP)
                minLoss = currLoss

            losses.append([lossC, currLoss])
            lossC += 1
            chunkC += 1

        print("end of epoch %d" % (i + 1))

    ## File to write loss data
    lossCSV = csv.writer(open('losses.csv', 'wb', buffering=0))

    lossCSV.writerows(losses)

    return
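## Hypothetical sketch of the Lyrics interface that trainRnn.py and
## generate.py rely on; the real class lives in another module of this
## project, so everything below is inferred from the call sites above and
## the normalization in normObs is an assumption.
import numpy as np


class Lyrics(object):

    def __init__(self):
        ## Word-to-index vocabulary, including special tokens such as
        #  'ppaadd', 'endofline', 'endofsong', 'commachar' and 'questionmark'
        self.words = {}

        ## Per-song word-token sequences
        self.lyricSeq = []

        ## Per-song integer sequences aligned with lyricSeq
        self.numSeq = []

    def lyrics2seqs(self, groupType, group, fNames, seqLen):
        ## Read the lyric CSVs, keep songs in the requested group and fill
        #  words, lyricSeq and numSeq (details omitted in this sketch)
        raise NotImplementedError

    def getWordSeq(self, text):
        ## Tokenize raw text into the same word tokens used during training
        raise NotImplementedError

    def normObs(self, obs, shape):
        ## Assumed normalization: scale word indices into [0, 1] by the
        #  vocabulary size and reshape for the LSTM input
        return np.reshape(obs / float(len(self.words)), shape)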