Example 1
## Imports needed by this function (getSysArgs is a project-local helper module)
import csv

import getSysArgs
import langdetect as ld
from sklearn.feature_extraction.text import CountVectorizer


def main():
    ## To ensure consistent identification of language for lyrics
    ld.DetectorFactory.seed = 0

    ## CSV file of lyrics
    [fName] = getSysArgs.usage(['songStats.py', '<lyric_data_file_path>'])[1:]

    ## Open CSV file of lyrics
    dataCSV = csv.reader(open(fName, 'rU'))

    ## Column headers
    colNames = dataCSV.next()

    ## Index of lyrics column
    lyricI = colNames.index('lyrics')

    ## To store song statistics (assumes lyrics are in right-most column)
    stats = [colNames[:-1] + ['lyricCharCt', 'lyricWordCt', 'lyricVocabSize', 
                              'lines', 'lang']]

    ## To obtain counts of words where strings of non-alphanumeric characters
    #  are not considered words
    tokenize = CountVectorizer().build_analyzer()

    for row in dataCSV:
        print 'Number of song statistics collected: ', row[0]

        ## Song's language
        lang = None

        ## Lyrics for a song
        lyrics = row[lyricI]

        ## Number of newline characters in the song (used as its line count)
        lines = lyrics.count('\n')

        ## Words in song
        words = tokenize(lyrics)

        ## Number of words in song
        wrdCt = len(words)

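        # Songs with no words get language 'none'; detection failures also
        #  fall back to 'none'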
        if wrdCt < 1:
            lang = 'none'
        else:
            try:
                lang = ld.detect(lyrics.decode('utf-8'))
            except ld.lang_detect_exception.LangDetectException:
                lang = 'none'
                print lyrics

        stats.append(row[:-1] + [len(lyrics), wrdCt, len(set(words)), lines,
                                 lang])

    ## File to write song statistics
    statsCSV = csv.writer(open('songStats.csv', 'wb', buffering = 0))

    statsCSV.writerows(stats)

    return
Example 2
## Imports needed by this function (getSysArgs is a project-local helper module)
import csv

import getSysArgs
import nltk
import numpy as np


def main():
    ## CSV file of lyrics
    [fName] = getSysArgs.usage(['readLyrics.py', '<lyric_data_file_path>'])[1:]

    ## Open CSV file of lyrics
    dataCSV = csv.reader(open(fName, 'rU'))

    ## To store lyric statistics
    lyricStats = [['Song', 'No. Lines', 'No. Words']]

    ## To store artists
    artists = {}

    ## To store genres
    genres = {}

    ## To store years
    years = {}

    ## Column headers
    colNames = dataCSV.next()

    for row in dataCSV:
        ## Index of row
        i = row[colNames.index('index')]

        ## Song in row
        song = row[colNames.index('song')]

        ## Lyrics (song text) in row
        lyrics = row[colNames.index('lyrics')]

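        ## Record the song with its row index, its line count, and its word count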
        lyricStats.append([(song, i),
                           lyrics.count('\n') + 1,
                           len(nltk.word_tokenize(lyrics.decode('utf-8')))])

        ## Artist in row
        artist = row[colNames.index('artist')]

        ## Genre in row
        genre = row[colNames.index('genre')]

        ## Year in row
        year = row[colNames.index('year')]

        if artist not in artists:
            artists[artist] = 0

        if genre not in genres:
            genres[genre] = 0

        if year not in years:
            years[year] = 0

        artists[artist] += 1
        genres[genre] += 1
        years[year] += 1

    ## File to write lyric statistics
    lyricCSV = csv.writer(open('lyricStats.csv', 'wb', buffering=0))

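    ## Convert each tally dict into a two-column (name, count) array for CSV output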
    artists = np.transpose(np.array([artists.keys(), artists.values()]))
    genres = np.transpose(np.array([genres.keys(), genres.values()]))
    years = np.transpose(np.array([years.keys(), years.values()]))

    ## File to write artists
    artistCSV = csv.writer(open('artistStats.csv', 'wb', buffering=0))

    ## File to write genres
    genreCSV = csv.writer(open('genreStats.csv', 'wb', buffering=0))

    ## File to write years
    yearCSV = csv.writer(open('yearStats.csv', 'wb', buffering=0))

    lyricCSV.writerows(lyricStats)
    artistCSV.writerows(artists)
    genreCSV.writerows(genres)
    yearCSV.writerows(years)

    return
Example 3
## Imports needed by this function (getSysArgs is a project-local helper module;
#  load is assumed to come from pickle)
import csv
from pickle import load

import getSysArgs
import numpy as np
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential


def main():
    ## The specified group from which to generate lyrics and lyric data
    [groupType, group, seedFile, variation] = getSysArgs.usage(
        ['generate.py', '<group_type>', '<group_name>',
         '<seed_lyrics_file_path>', '<variation_score>'])[1:]

    try:
        variation = int(variation)
    except ValueError:
        variation = float(variation)

    ## Type of variation score provided
    typeVar = type(variation)

    if ((typeVar == int and variation < 1)
        or (typeVar == float
            and (variation <= 0.0 or variation > 1.0))):
        print 'Invalid variation score: ', variation
        print 'Input a float above 0.0 and at most 1.0 or an integer of at least 1'
        return

    ## Load in seed lyrics
    seed = open(seedFile, 'rU')

    ## Seed lyrics
    seedLyrics = ''

    for line in seed:
        seedLyrics += line

    if seedLyrics[-1] == '\n':  #remove newline char if last char in lyrics
        seedLyrics = seedLyrics[:-1]

    ## Open CSV tracking the best model filenames for various groups
    bestModels = csv.reader(open('bestModels.csv', 'rU'))

    ## Dropout rate of trained model
    dropout = None

    ## Filename of trained model
    modelFile = ''

    for row in bestModels:  #assumes 1st and 2nd cols are groupType and group
        if row[:2] == [groupType, group]:
            dropout = float(row[2])  #assumes dropout is in 3rd column
            modelFile = row[3]  #assumes model filename is in 4th column
            break

    if modelFile == '':
        print 'No model has been trained for the', groupType, group, 'yet'
        return

    modelFile = './weights/' + modelFile

    ## Length of training sequence
    seqLen = 30

    ## To gather training lyric data
    train = load(open('./seqs/' + groupType + '-' + group + '-seq.pkl', 'rb'))

    ## Seed lyric sequence
    seedSeq = train.getWordSeq(seedLyrics)

    if len(seedSeq) < 1:  #check before padding, which always fills to seqLen
        print 'Seed lyrics require at least one word\nGiven seed lyrics:'
        print seedLyrics
        return

    ## Pad the start with 'ppaadd' tokens, append the 'endofline' marker, then
    #  keep only the last seqLen tokens
    seedSeq = ['ppaadd'] * (seqLen - len(seedSeq)) + seedSeq + ['endofline']
    seedSeq = seedSeq[len(seedSeq) - seqLen:]

    ## Number of words in vocabulary
    vocabSize = len(train.words)

    ## Build generator model
    model = Sequential()

    model.add(LSTM(256, input_shape = (seqLen, 1), return_sequences = True))  #1st layer
    model.add(Dropout(dropout))
    model.add(LSTM(256))  #2nd layer
    model.add(Dropout(dropout))
    model.add(Dense(vocabSize, activation = 'softmax'))

    model.load_weights(modelFile)
    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

    ## Seed numerical sequence
    seedNS = []

    for word in seedSeq:
        if word in train.words:
            seedNS.append(train.words[word])
        else:  #use most-likely word if seed word not in vocabulary
            seedNS.append(
                np.argmax(
                    model.predict(
                        train.normObs(
                            np.array([train.words['ppaadd']]
                                     * (seqLen - len(seedNS)) + seedNS,
                                     dtype = np.float64),
                            (1, seqLen, 1)), verbose = 0)))

    ## Generated text
    genTxt = seedLyrics + ' |\n'

    ## To store prediction distributions
    predDists = np.empty((200, vocabSize))

    ## Vocabulary as a list for use in randomly selecting word
    wordList = [None] * len(train.words)

    for word in train.words:
        wordList[train.words[word]] = word

    for i in range(200):
        ## Generation initialization for model
        genX = train.normObs(np.array(seedNS, dtype = np.float64),
                             (1, seqLen, 1))

        ## Word-prediction distribution
        pred = model.predict(genX, verbose = 0)

        predDists[i] = pred[0]

        if type(variation) == int and variation > vocabSize:
            print variation, 'larger than vocabulary size of', vocabSize
            variation = vocabSize

        if type(variation) == float:
            variation = int(variation * vocabSize)

        ## Indices of highest probability next words
        topIs = np.argsort(-pred[0])[:variation]

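        # Keep only the probabilities of the top words and renormalize them to sum to 1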
        pred = pred[0][topIs]
        pred = pred / np.sum(pred)

        ## Highest probability words
        topWords = np.array(wordList)[topIs]

        ## Next word, chosen at random from the truncated word distribution
        #  (named nextWord to avoid shadowing the built-in next)
        nextWord = np.random.choice(topWords, p = pred)

        if nextWord == 'endofsong':
            break
        elif nextWord == 'endofline':
            genTxt += '\n'
        elif nextWord == 'commachar':
            genTxt += ','
        elif nextWord == 'questionmark':
            genTxt += '?'
        else:
            genTxt += ' ' + nextWord

        ## Slide the seed window forward by one word
        seedNS.append(train.words[nextWord])
        seedNS = seedNS[1:]

    genTxt = genTxt.replace('endofline', '\n')
    genTxt = genTxt.replace('commachar', ',')
    genTxt = genTxt.replace('questionmark', '?')

    print genTxt

    ## File to write prediction distributions for each predicted word
    distsCSV = csv.writer(open('predDists.csv', 'wb', buffering = 0))

    distsCSV.writerows(predDists)

    return
Example 4
## Imports needed by this function (getSysArgs and the Lyrics class are
#  project-local modules; the Lyrics import path below is an assumption)
from __future__ import print_function

import csv
import time
from pickle import dump

import getSysArgs
import numpy as np
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential

from Lyrics import Lyrics


def main():
    ## Lyric data file, the specified group on which to train, and the memory limit in GB
    [fName, groupType, group, memGB] = getSysArgs.usage([
        'trainRnn.py', '<lyric_data_file_path>', '<group_type>',
        '<group_name>', '<memory_size_limit_in_GB>'
    ])[1:]

    ## Length of training sequence
    seqLen = 30

    ## To gather training lyric data
    train = Lyrics()

    train.lyrics2seqs(groupType, group, [fName], seqLen)
    dump(train, open('./seqs/' + groupType + '-' + group + '-seq.pkl', 'wb'))

    ## Number of epochs for training
    epochs = 50

    ## Vocabulary size
    vocabSize = len(train.words)

    ## Training data chunk size
    #  (size chosen to be the number of data points that can fit into the
    #   user-specified memory limit)
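    #   (1073741824 bytes per GB; each datapoint holds seqLen + vocabSize
    #    float64 values of 8 bytes apiece)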
    chunkSize = int(memGB) * 1073741824 / ((seqLen + vocabSize) * 8)

    ## Range of epochs
    epochRng = range(epochs)

    ## Trained weights file from previous iteration
    prevCP = ''

    ## Number of training datapoints for model
    dataPts = 0

    for song in train.lyricSeq:
        dataPts += len(song) - seqLen

    ## Chunks for training model
    chunks = [chunkSize] * (dataPts / chunkSize) + [dataPts % chunkSize]

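    # Drop the empty final chunk when dataPts divides evenly by chunkSize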
    if chunks[-1] == 0:
        chunks = chunks[:-1]

    ## Number of chunks
    numChunks = len(chunks)

    ## Build model
    model = Sequential()

    model.add(LSTM(256, input_shape=(seqLen, 1), return_sequences=True))  #1st layer
    model.add(Dropout(0.2))
    model.add(LSTM(256))  #2nd layer
    model.add(Dropout(0.2))
    model.add(Dense(vocabSize, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    ## Chunk counter
    chunkC = 0

    ## Filename for datapoints
    dpFname = "./datapoints/%s-%s_chunk_%dof%d.npz"

    ## Position in lyric sequence
    seqI = 0

    ## Index of the song currently being processed
    songI = 0

    for chunk in chunks:
        print("chunk %d of %d" % (chunkC + 1, numChunks))
        print("starting at song %d of %d at word %d/%d" %
              (songI, len(train.lyricSeq), seqI, len(train.lyricSeq[songI])))

        ## Observations
        dataX = []

        ## Responses
        dataY = []

        ## Number range the size of the given chunk
        chunkRng = range(chunk)

        ## Starting time
        ti = time.time()

        for i in chunkRng:
            print("datapoint %d/%d %ds" % (i + 1, chunk, time.time() - ti),
                  end='\r')

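            # Advance to the next song once the current one has no full sequence left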
            if seqI >= len(train.lyricSeq[songI]) - seqLen:
                songI += 1
                seqI = 0

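            # Observation: a seqLen-word sequence; response: one-hot vector for the word that follows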
            dataX.append(train.numSeq[songI][seqI:seqI + seqLen])
            dataY.append([0] * vocabSize)
            dataY[-1][train.numSeq[songI][seqI + seqLen]] = 1
            seqI += 1

        print('\n')
        ti = time.time()
        dataX = train.normObs(np.array(dataX, dtype=np.float64),
                              (chunk, seqLen, 1))

        dataY = np.array(dataY, dtype=np.float64)
        print("numpy arrays created in %d seconds" % (time.time() - ti))
        np.savez_compressed(dpFname %
                            (groupType, group, chunkC + 1, numChunks),
                            X=dataX,
                            Y=dataY)

        chunkC += 1

    ## Loss Data
    losses = [['Step', 'Loss']]

    ## Counter for loss steps
    lossC = 1

    ## Lowest loss from model training
    minLoss = float('inf')

    for i in epochRng:
        print("Epoch %d of %d" % (i + 1, epochs))
        chunkC = 0

        for chunk in chunks:
            print("chunk %d of %d" % (chunkC + 1, numChunks))
            ti = time.time()
            data = np.load(dpFname % (groupType, group, chunkC + 1, numChunks))
            print("chunk loaded in %d seconds" % (time.time() - ti))

            ## Train model with datapoints and store callbacks history
            cbHist = model.fit(data['X'], data['Y'], epochs=1, batch_size=64)

            ## Loss of current chunk's training
            currLoss = cbHist.history['loss'][0]

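            # Save a checkpoint only when this chunk's loss beats the best seen so far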
            if currLoss < minLoss:
                ## Checkpoint filename
                fNameCP = "./weights/%s-weights-improvement-%.4f-epoch_%dof%d-chunk_%dof%d.hdf5" % (
                    group, currLoss, i + 1, epochs, chunkC + 1, numChunks)

                model.save(fNameCP)
                minLoss = currLoss

            losses.append([lossC, currLoss])
            lossC += 1
            chunkC += 1

        print("end of epoch %d" % (i + 1))

    ## File to write loss data
    lossCSV = csv.writer(open('losses.csv', 'wb', buffering=0))

    lossCSV.writerows(losses)

    return