Beispiel #1
0
def writeToFiles(data, artist_path):
    """Write every song of a dataset to its own text file under artist_path.

    Each file holds the artist on the first line, the title on the second,
    an empty line, and then the lyrics entries joined by newlines.

    Args:
        data: Iterable of dicts providing the 'artist', 'title' and 'lyrics'
            keys; 'lyrics' must be an iterable of strings.
        artist_path: Directory that receives one file per song. It is
            created when it does not exist yet.
    """
    import os  # Local import keeps this function self-contained.

    l.getLogger().debug("pinkySpeaker.lib.dataloader.writeToFiles()")
    l.getLogger().info("Writing to path {}".format(artist_path))
    os.makedirs(artist_path, exist_ok=True)
    for index, d in enumerate(data):
        ## One file per song: reopening a single path in 'w' mode for every
        ## song truncates it each time, so only the last song would survive.
        song_path = os.path.join(artist_path, "{:05d}.txt".format(index))
        with open(song_path, 'w') as f:
            f.write("{}\n{}\n\n{}".format(d['artist'], d['title'],
                                          "\n".join(d['lyrics'])))
    return
Beispiel #2
0
def readDataset(artist_path):
    """Read every entry of the artist_path directory into a dataset list.

    Args:
        artist_path: Directory whose entries are each handed to readFile().

    Returns:
        A list with one readFile() result per directory entry, in the
        order reported by os.listdir().
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.readDataset()")

    return [
        readFile(os.path.join(artist_path, entry))
        for entry in os.listdir(artist_path)
    ]
Beispiel #3
0
 def print_out(self, q, k, v):
     """Log the attention weights and the output computed for q, k and v.

     Calls self.scaled_dot_product_attention(q, k, v, None) and writes both
     returned values to the info log.
     """
     attention_output, attention_weights = self.scaled_dot_product_attention(
         q, k, v, None)
     ## Messages are emitted in the same order: weights first, output second
     log_entries = (
         'Attention weights are:',
         attention_weights,
         'Output is:',
         attention_output,
     )
     for entry in log_entries:
         l.getLogger().info(entry)
Beispiel #4
0
def prunedSentences(sentence):
    """Normalize one line of text and split it into a list of tokens.

    Processing steps, in order:
      1. Every "[...]" tag is removed together with the text that precedes
         it on the same line.
      2. The text is lower-cased.
      3. The substitutions listed in `replacements` are applied one after
         the other (contraction expansion, newline to " <ENDLINE>",
         spacing or removal of punctuation).
      4. The result is split on whitespace.

    Args:
        sentence: String to be normalized.

    Returns:
        List of string tokens; empty when nothing is left after pruning.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.prunedSentences()")

    ## Raw string: "\[" inside a plain literal is an invalid escape sequence
    text = re.sub(r".*?\[(.*?)\]", "", sentence).lower()

    ## Order matters: the newline token is inserted after lower-casing so it
    ## stays upper-case, and punctuation is handled after the contractions.
    replacements = (
        ("i'm", "i am"), ("it's", "it is"),
        ("isn't", "is not"), ("there's", "there is"),
        ("they've", "they have"), ("\n", " <ENDLINE>"),
        ("we've", "we have"), ("wasn't", "was not"),
        (".", " . "), (",", " , "),
        ("-", ""), ("\"", ""),
        (":", ""), ("(", ""),
        (")", ""), ("?", " ?"),
        ("!", " !"),
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text.split()
Beispiel #5
0
    def __init__(self, data=None, model=None, batch_size=4, **kwargs):
        """Set up the TfTransformer wrapper from raw data or a stored model.

        Args:
            data: Raw dataset; when truthy it is handed, together with
                batch_size, to self._initArchitecture().
            model: Used only when data is falsy; when truthy it is passed to
                self._loadNNModel() and the result replaces self._model.
            batch_size: Forwarded to self._initArchitecture().
            **kwargs: Passed as a dict to self._parsekwArgs(), whose four
                return values become the model specs.
        """
        self._logger = l.getLogger()
        self._logger.debug("pinkySpeaker.lib.model.TfTransformer.__init__()")

        ## Default specs are in place before the kwargs get parsed
        self._num_layers, self._d_model = 2, 64
        self._dff, self._num_heads = 64, 2

        parsed_specs = self._parsekwArgs(kwargs)
        (self._num_layers, self._d_model,
         self._dff, self._num_heads) = parsed_specs

        ## Training history object
        self._history = history.history("TfTransformer",
                                        num_layers=self._num_layers,
                                        d_model=self._d_model,
                                        dff=self._dff,
                                        num_heads=self._num_heads)

        ## _dataset and _model are the two member variables of the class
        self._raw_data, self._model, self._dataset = data, model, None

        self._startToken, self._padToken = "</START>", "</PAD>"
        self._endToken = "</END>"  ## TODO

        if data:
            self._initArchitecture(data, batch_size)
        elif model:
            self._model = self._loadNNModel(model)
        self._logger.info("TfTransformer model")
Beispiel #6
0
    def __init__(self, data = None, model = None, LSTM_Depth = 3, sequence_length = 320):
        """Set up the simpleRNN wrapper from raw data or a stored model.

        Args:
            data: Raw dataset; when truthy it is handed to
                self._initArchitecture().
            model: Used only when data is falsy; when truthy it is passed to
                self._loadNNModel() and the result replaces self._model.
            LSTM_Depth: Stored as self._LSTM_depth and recorded in the
                history object.
            sequence_length: Stored as self._lyric_sequence_length and
                recorded in the history object.
        """
        self._logger = l.getLogger()
        self._logger.debug("pinkySpeaker.lib.model.simpleRNN.__init__()")

        ## Model specs
        self._lyric_sequence_length, self._LSTM_depth = sequence_length, LSTM_Depth

        ## _dataset and _model are the two member variables of the class
        self._raw_data, self._model, self._dataset = data, model, None

        ## Special tokens
        self._maskToken, self._startToken, self._endToken = (
            "</PAD>", "</START>", "</END>")

        history_specs = {
            'LSTM_Depth': self._LSTM_depth,
            'sequence_length': self._lyric_sequence_length,
        }
        self._history = history.history("simpleRNN", **history_specs)

        if data:
            self._initArchitecture(data)
        elif model:
            self._model = self._loadNNModel(model)
        self._logger.info("SimpleRNN model")
Beispiel #7
0
def readFile(song_path):
    """Read one song file and return it as a tokenized datapoint.

    Lines consisting of a bare newline are skipped; every other line is
    tokenized with prunedSentences() and kept when the token list is not
    empty. The token "endfile" is appended to the last kept line.

    Args:
        song_path: Path of the song file to read.

    Returns:
        Dict with 'artist' (first kept line), 'title' (second kept line) and
        'lyrics' (list of all remaining kept lines), each line being a list
        of tokens.

    Raises:
        ValueError: If the file yields fewer than two usable lines, i.e.
            the artist or the title is missing.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.readFile()")

    song = []
    with open(song_path, 'r') as f:
        for line in f:
            if line == "\n":
                continue
            sentence = prunedSentences(line)
            if sentence:
                song.append(sentence)

    ## Fail with a clear message instead of an opaque IndexError below
    if len(song) < 2:
        raise ValueError(
            "{}: expected at least an artist and a title line, found {} "
            "usable line(s)".format(song_path, len(song)))

    # Add endfile token to the end
    song[-1].append("endfile")

    # 0th and 1st lines of datapoint correspond to artist and title
    # The rest of it is the lyrics
    return {'artist': song[0], 'title': song[1], 'lyrics': song[2:]}
Beispiel #8
0
 def __init__(self, modeltype, **kwargs):
     """Create a history record for a model type.

     Args:
         modeltype: Stored as self._modeltype.
         **kwargs: Passed as a dict to self._createProperty(); its result
             is stored as self._properties.
     """
     self._logger = l.getLogger()
     self._logger.debug("pinkySpeaker.lib.history.history.__init__()")
     self._modeltype = modeltype
     self._properties = self._createProperty(kwargs)
     ## Loss and accuracy start out as two separate empty lists
     self._loss, self._accuracy = [], []
Beispiel #9
0
def fetchData(artist_path_list, plot_sample):
    """Collect the datasets of all artists in artist_path_list.

    For every artist path the dataset is read from disk when
    datasetExists() reports it; otherwise it is fetched with fetchArtist()
    and stored with writeToFiles(). The datapoints of all artists are
    gathered in one list.

    Args:
        artist_path_list: Iterable of dataset paths, one per artist. The
            basename of each path is used as the artist name.
        plot_sample: Falsy to skip plotting; otherwise forwarded to
            plotSamples() as its stream_out argument.

    Returns:
        List with the datapoints of every artist; empty when
        artist_path_list is empty.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.fetchData()")

    l.getLogger().info("Fetch data of artist list.")
    ## Start from an empty list so an empty artist list returns cleanly and
    ## the data of earlier artists is not overwritten by later ones.
    data = []
    for artist_path in artist_path_list:
        l.getLogger().info(artist_path)
        basename = os.path.basename(artist_path)
        if not datasetExists(artist_path, basename):
            l.getLogger().info("Extract dataset for {}.".format(basename))
            artist_data = fetchArtist(basename)
            ## writeToFiles() needs the destination path as well
            writeToFiles(artist_data, artist_path)
        else:
            l.getLogger().info("OK")
            artist_data = readDataset(artist_path)
        data.extend(artist_data)

    ## Nothing to plot when no datapoint was collected
    if plot_sample and data:
        plotSamples(data, plot_sample)

    return data
Beispiel #10
0
def plotSamples(data, stream_out="save"):
    """Plot how many songs fall into each song-length range.

    The length of a song is the number of items in its title plus the
    number of items in all of its lyrics lines. Lengths are grouped into 38
    equally sized ranges and handed to plt.plotBars().

    Args:
        data: Iterable of dicts with a 'title' and a 'lyrics' key, where
            'lyrics' is an iterable of sized lines.
        stream_out: "save" to save the plot, "show" to display it.

    Raises:
        ValueError: If stream_out is neither "save" nor "show".
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.plotSamples()")

    if stream_out not in {"save", "show"}:
        l.getLogger().error("Wrong value provided for stream_out argument")
        raise ValueError("stream_out: must be one of %r." % {"save", "show"})

    ## Length of every song in num of tokens (title + flattened lyrics)
    song_lengths = [
        len(d['title']) + sum(len(line) for line in d['lyrics'])
        for d in data
    ]
    max_len = max(song_lengths, default=0)

    num_chunks = 38
    ## Ceiling division keeps max_len // chunk_size <= num_chunks - 1, so the
    ## longest song always lands inside the last bucket. The lower bound of 1
    ## avoids a zero step / division by zero for short or missing songs.
    chunk_size = max(1, -(-max_len // (num_chunks - 1)))

    ## Upper edge of every bucket: chunk_size, 2 * chunk_size, ...
    bucket_edges = [chunk_size * i for i in range(1, num_chunks + 1)]
    frequencies = [0] * num_chunks
    for length in song_lengths:
        frequencies[length // chunk_size] += 1

    plot_list = [{
        'x': [bucket_edges],
        'y': [frequencies],
        'label': ["Song frequency per song length range"]
    }]

    plt.plotBars(plot_list,
                 show_file=stream_out == "show",
                 save_file=stream_out == "save",
                 bar_annotations=True,
                 show_xlabels=True)
    return
Beispiel #11
0
def datasetExists(path, basename):
    """Tell whether the dataset directory of an artist is present.

    Args:
        path: Location that must be an existing directory.
        basename: Artist name, used only in the log messages.

    Returns:
        True when path is a directory, False otherwise (a warning is
        logged in that case).
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.datasetExists()")

    l.getLogger().info("Check if {} dataset exists.".format(basename))
    found = os.path.isdir(path)
    if not found:
        l.getLogger().warning("{} dataset does not exist.".format(basename))
    return found
Beispiel #12
0
def crawl(artist):
    """Run the AZLyricsSpider crawler for one artist and return its data.

    Args:
        artist: Key of ARTIST_MAP naming the artist to crawl.

    Returns:
        The module-level _data object as it is after the crawl finished.

    Raises:
        ValueError: If artist is not a key of ARTIST_MAP.
    """
    l.getLogger().debug("eupy.mrcrawley.spider.crawl()")
    l.getLogger().info("Set up web crawler to fetch {} data.".format(artist))
    if artist not in ARTIST_MAP:
        raise ValueError("{} not available for crawling".format(artist))

    crawler_settings = {
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    }
    crawler = CrawlerProcess(crawler_settings)
    crawler.crawl(AZLyricsSpider, ARTIST_MAP[artist], artist)
    ## The script will block here until the crawling is finished.
    ## NOTE(review): if CrawlerProcess is Scrapy's, start() presumably cannot
    ## be run a second time in the same process - confirm before calling
    ## crawl() for several artists in one run.
    crawler.start()
    l.getLogger().info("Crawling {} succeeded".format(artist))
    ## _data is only read here, so no global declaration is needed
    return _data
Beispiel #13
0
def setupFolders(folds):
    """Create every directory listed in folds.

    Missing parent directories are created as well, and directories that
    already exist are left untouched.

    Args:
        folds: Iterable of directory paths.
    """
    for folder in folds:
        l.getLogger().info("Setting up {}".format(folder))
        os.makedirs(folder, exist_ok=True)
Beispiel #14
0
def fetchArtist(artist):
    """Fetch the data of an artist through the crawler.

    Args:
        artist: Artist name handed to cr.crawl().

    Returns:
        Whatever cr.crawl(artist) returns.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.fetchArtist()")
    return cr.crawl(artist)