# Example no. 1
# 0
def readLyrics(csvPath,
               vocabPath,
               indices=None,
               convert2IndicesAndAddEOS=True):
    '''
    Read a lyrics CSV and return its rows as [artist, sequence] pairs.

    A character vocabulary is built from a fixed alphabet (lowercase
    letters, digits, space and newline) and saved to ``vocabPath`` before
    the CSV is read.

    :param csvPath: path of the CSV file; its first row must be a header
        containing the columns "lyrics" and "artist".
    :param vocabPath: path the vocabulary is saved to.
    :param indices: optional selection applied to the resulting list.
        An int or slice is used to index the list directly; any other
        iterable of positions yields a list with the entries at those
        positions. ``None`` returns everything.
    :param convert2IndicesAndAddEOS: if true, each preprocessed lyric is
        converted with ``vocab.char2index`` and ``vocab.size`` is appended
        as the sequence-end token; otherwise the output of
        ``preprocessLyrics`` is stored unchanged.
    :return: tuple ``(data, vocab)`` where ``data`` is a list of
        ``[artist, seq]`` entries (artist lowercased and stripped), or the
        selection of it described by ``indices``.
    :raises ValueError: if the header lacks a "lyrics" or "artist" column.
    :raises StopIteration: if the CSV file has no header row at all.
    '''

    # Create vocabulary (mapping between characters and ints)
    vocab = Vocabulary.create_from_text(
        'abcdefghijklmnopqrstuvwxyz0123456789 \n')

    # Save vocab
    vocab.save(vocabPath)

    # Read in CSV lyrics row by row
    data = list()
    with open(csvPath, 'r') as csvFile:
        reader = csv.reader(csvFile, delimiter=",")

        # next(reader) works on both Python 2.6+ and Python 3, unlike the
        # Python-2-only reader.next().
        header = next(reader)
        lyricsIndex = header.index("lyrics")
        artistIndex = header.index("artist")

        for rowNumber, row in enumerate(reader, 1):
            # Progress indicator for large files
            if rowNumber % 10000 == 0:
                print(rowNumber)

            seq = preprocessLyrics(row[lyricsIndex])

            if convert2IndicesAndAddEOS:
                # Convert chars to ints
                seq = vocab.char2index(seq)
                # vocab.size is used as the sequence-end token
                seq.append(vocab.size)

            data.append([row[artistIndex].lower().strip(), seq])

    if indices is None:
        return data, vocab

    try:
        # int / slice (anything a list accepts as an index): unchanged
        # behavior.
        subset = data[indices]
    except TypeError:
        # A list cannot be indexed by a sequence of positions; pick the
        # requested entries one by one instead.
        subset = [data[i] for i in indices]
    return subset, vocab