def writeToFiles(data, artist_path):
    """Persist crawled song datapoints under *artist_path*, one file per song.

    Each song file holds the artist on line 1, the title on line 2, a blank
    line, then the lyrics.

    Args:
        data: iterable of dicts with 'artist', 'title' and 'lyrics' keys.
        artist_path: directory the song files are written into (created if
            missing). readDataset() later consumes every file in it.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.writeToFiles()")
    l.getLogger().info("Writing to path {}".format(artist_path))
    os.makedirs(artist_path, exist_ok=True)
    for index, d in enumerate(data):
        # Bug fix: the original reopened the same path in 'w' mode on every
        # iteration, so each song clobbered the previous one and only the
        # last datapoint survived. Write one file per song instead, which
        # also matches readDataset()/datasetExists() treating artist_path
        # as a directory.
        song_path = os.path.join(artist_path, "song_{}.txt".format(index))
        with open(song_path, 'w') as f:
            f.write("{}\n{}\n\n{}".format(d['artist'], d['title'],
                                          "\n".join(d['lyrics'])))
    return
def readDataset(artist_path):
    """Read every song file under *artist_path* into a list of datapoints.

    Args:
        artist_path: directory containing one file per song.

    Returns:
        list of dicts as produced by readFile() ('artist'/'title'/'lyrics').
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.readDataset()")
    # The original loop variable 'file' shadowed the builtin; use a
    # comprehension with a non-shadowing name instead.
    return [readFile(os.path.join(artist_path, entry))
            for entry in os.listdir(artist_path)]
def print_out(self, q, k, v):
    """Log the scaled dot-product attention weights and output for (q, k, v).

    Runs self.scaled_dot_product_attention without a mask and writes both
    results to the logger at INFO level. Debug helper; returns nothing.
    """
    attention_output, attention_weights = \
        self.scaled_dot_product_attention(q, k, v, None)
    logger = l.getLogger()
    logger.info('Attention weights are:')
    logger.info(attention_weights)
    logger.info('Output is:')
    logger.info(attention_output)
    return
def prunedSentences(sentence):
    """Tokenize one lyric line into a list of normalized tokens.

    Strips bracketed annotations (e.g. "[Chorus]"), lowercases, expands a
    fixed set of contractions, maps newlines to the <ENDLINE> marker,
    isolates sentence punctuation into standalone tokens, drops the rest of
    the punctuation, then splits on whitespace.

    Args:
        sentence: raw lyric line (str).

    Returns:
        list of token strings (possibly empty).
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.prunedSentences()")
    # Bug fix: the pattern must be a raw string — "\[" is an invalid escape
    # sequence in a plain string literal (SyntaxWarning on modern Pythons,
    # slated to become an error).
    pruned = re.sub(r".*?\[(.*?)\]", "", sentence).lower()
    # NOTE: replacement order is significant and preserved from the
    # original chain; "\n" is replaced after lower() so the <ENDLINE>
    # marker keeps its uppercase form.
    replacements = (
        ("i'm", "i am"), ("it's", "it is"),
        ("isn't", "is not"), ("there's", "there is"),
        ("they've", "they have"), ("\n", " <ENDLINE>"),
        ("we've", "we have"), ("wasn't", "was not"),
        (".", " . "), (",", " , "),
        ("-", ""), ("\"", ""),
        (":", ""), ("(", ""),
        (")", ""), ("?", " ?"),
        ("!", " !"),
    )
    for old, new in replacements:
        pruned = pruned.replace(old, new)
    return pruned.split()
def __init__(self, data=None, model=None, batch_size=4, **kwargs):
    """Construct a TfTransformer: build from raw data or load a saved model.

    Args:
        data: raw song datapoints; when given, the full architecture is
            initialized from them via _initArchitecture.
        model: handle/path of a previously stored model; used only when
            `data` is not provided.
        batch_size: forwarded to _initArchitecture.
        **kwargs: hyperparameter overrides parsed by _parsekwArgs —
            presumably num_layers/d_model/dff/num_heads; confirm against
            _parsekwArgs.
    """
    self._logger = l.getLogger()
    self._logger.debug("pinkySpeaker.lib.model.TfTransformer.__init__()")
    # Default hyperparameters, immediately overwritten by _parsekwArgs.
    # NOTE(review): _parsekwArgs may read these attributes as fallback
    # values — confirm before reordering or removing these assignments.
    self._num_layers = 2
    self._d_model = 64
    self._dff = 64
    self._num_heads = 2
    self._num_layers, self._d_model, self._dff, self._num_heads = self._parsekwArgs(
        kwargs)
    ## Training history object
    self._history = history.history(
        "TfTransformer",
        num_layers=self._num_layers,
        d_model=self._d_model,
        dff=self._dff,
        num_heads=self._num_heads,
    )
    ## _dataset and _model are the two member variables of the class
    self._raw_data = data
    self._model = model
    self._dataset = None
    # Special vocabulary tokens used by the data pipeline.
    self._startToken = "</START>"
    self._endToken = "</END>"
    ## TODO
    self._padToken = "</PAD>"
    # Building from raw data takes precedence over loading a stored model.
    if data:
        self._initArchitecture(data, batch_size)
    elif model:
        self._model = self._loadNNModel(model)
    self._logger.info("TfTransformer model")
    return
def __init__(self, data = None, model = None, LSTM_Depth = 3, sequence_length = 320):
    """Construct a simpleRNN model.

    Builds the full architecture when raw `data` is supplied; otherwise
    loads a previously stored model when `model` is supplied.
    """
    self._logger = l.getLogger()
    self._logger.debug("pinkySpeaker.lib.model.simpleRNN.__init__()")
    # Architecture hyperparameters.
    self._LSTM_depth = LSTM_Depth
    self._lyric_sequence_length = sequence_length
    # Raw input, trained network and derived dataset holders.
    self._raw_data = data
    self._model = model
    self._dataset = None
    # Special vocabulary tokens.
    self._maskToken = "</PAD>"
    self._startToken = "</START>"
    self._endToken = "</END>"
    # Training-history bookkeeping for this model type.
    self._history = history.history(
        "simpleRNN",
        LSTM_Depth=self._LSTM_depth,
        sequence_length=self._lyric_sequence_length,
    )
    # Building from raw data takes precedence over loading a stored model.
    if data:
        self._initArchitecture(data)
    elif model:
        self._model = self._loadNNModel(model)
    self._logger.info("SimpleRNN model")
    return
def readFile(song_path):
    """Parse one song file into a datapoint dict.

    Expected file layout: artist on the first non-empty line, title on the
    second, lyrics on the rest. Every line is tokenized with
    prunedSentences(); lines that prune down to nothing are skipped.

    Args:
        song_path: path of the song file.

    Returns:
        dict with 'artist', 'title' and 'lyrics' (list of token lists). An
        "endfile" token is appended to the last line.

    Raises:
        ValueError: when the file yields fewer than two usable lines, so no
            artist/title pair can be extracted.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.readFile()")
    song = []
    with open(song_path, 'r') as f:
        for line in f:
            if line != "\n":
                sentence = prunedSentences(line)
                if sentence:
                    song.append(sentence)
    # Guard: the original indexed song[-1]/song[0]/song[1] blindly and
    # crashed with a bare IndexError on empty or truncated files.
    if len(song) < 2:
        raise ValueError(
            "Malformed song file, need at least artist and title lines: {}".format(song_path))
    # Add endfile token to the end
    song[-1].append("endfile")
    # 0th and 1st lines of datapoint correspond to artist and title
    # The rest of it is the lyrics
    return {'artist': song[0], 'title': song[1], 'lyrics': song[2:]}
def __init__(self, modeltype, **kwargs):
    """Track training history for a model of *modeltype*.

    The kwargs are recorded as model properties via _createProperty();
    loss and accuracy accumulate per training step/epoch.
    """
    self._logger = l.getLogger()
    self._logger.debug("pinkySpeaker.lib.history.history.__init__()")
    self._modeltype = modeltype
    self._properties = self._createProperty(kwargs)
    # Metric accumulators, filled in during training.
    self._loss, self._accuracy = [], []
    return
def fetchData(artist_path_list, plot_sample):
    """Fetch the dataset for every artist path, crawling when missing.

    For each path: when the on-disk dataset is absent, crawl the artist's
    lyrics and persist them; otherwise read the stored dataset.

    Args:
        artist_path_list: iterable of dataset directories, one per artist.
        plot_sample: falsy to skip plotting; otherwise forwarded to
            plotSamples() as its stream_out argument.

    Returns:
        The dataset of the last processed artist (empty list when the input
        list is empty). NOTE(review): each iteration overwrites `data`, so
        earlier artists' data is discarded — confirm whether aggregation
        was intended for multi-artist input.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.fetch_data()")
    l.getLogger().info("Fetch data of artist list.")
    # Bug fix: `data` was unbound (NameError on return) when the list was
    # empty.
    data = []
    for artist_path in artist_path_list:
        l.getLogger().info(artist_path)
        basename = os.path.basename(artist_path)
        if not datasetExists(artist_path, basename):
            l.getLogger().info("Extract dataset for {}.".format(basename))
            data = fetchArtist(basename)
            # Bug fix: writeToFiles() requires the target path as its
            # second argument; it was called with only `data`.
            writeToFiles(data, artist_path)
        else:
            l.getLogger().info("OK")
            data = readDataset(artist_path)
        if plot_sample:
            plotSamples(data, plot_sample)
            # (stray debug `print(plot_sample)` removed)
    return data
def plotSamples(data, stream_out="save"):
    """Plot a bar chart of song-length (token count) frequencies.

    Args:
        data: list of datapoints with 'title' and 'lyrics' keys.
        stream_out: "save" to write the plot to a file, "show" to display.

    Raises:
        ValueError: when stream_out is neither "save" nor "show".
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.plotSamples()")
    if stream_out not in {"save", "show"}:
        l.getLogger().error("Wrong value provided for stream_out argument")
        raise ValueError("stream_out: must be one of %r." % {"save", "show"})
    # Histogram: song length in tokens -> number of songs with that length.
    stream_length = {}
    max_len = 0
    for d in data:
        ## dlen is the length of the song in num of tokens
        ## (title tokens plus the flattened lyrics token lists).
        dlen = len(d['title']) + sum(len(line) for line in d['lyrics'])
        stream_length[dlen] = stream_length.get(dlen, 0) + 1
        max_len = max(max_len, dlen)
    ordered_list = collections.OrderedDict(sorted(
        stream_length.items())).items()
    num_chunks = 38
    # Bug fix: int(max_len / (num_chunks - 1)) truncates, so a song in the
    # final partial chunk indexed past the end of the 38-entry y list
    # (IndexError), and max_len < num_chunks - 1 gave chunk_size == 0
    # (ZeroDivisionError below). Use ceiling division, clamped to >= 1.
    chunk_size = max(1, -(-max_len // (num_chunks - 1)))
    plot_list = [{
        'x': [[
            x for x in range(chunk_size, num_chunks * chunk_size + chunk_size,
                             chunk_size)
        ]],
        'y': [[0] * num_chunks],
        'label': ["Song frequency per song length range"]
    }]
    for leng, freq in ordered_list:
        plot_list[0]['y'][0][leng // chunk_size] += freq
    plt.plotBars(plot_list,
                 show_file=(stream_out == "show"),
                 save_file=(stream_out == "save"),
                 bar_annotations=True,
                 show_xlabels=True)
    return
def datasetExists(path, basename):
    """Return True when the dataset directory for *basename* exists at *path*.

    Logs a warning when the directory is missing.
    """
    l.getLogger().debug("pinkySpeaker.lib.dataloader.datasetExists()")
    l.getLogger().info("Check if {} dataset exists.".format(basename))
    exists = os.path.isdir(path)
    if not exists:
        l.getLogger().warning("{} dataset does not exist.".format(basename))
    return exists
def crawl(artist):
    """Crawl lyrics for *artist* and return the scraped data.

    Blocks until the crawl finishes. The spider is expected to populate the
    module-level `_data` global, which is returned.

    Args:
        artist: artist name; must be a key of ARTIST_MAP.

    Raises:
        ValueError: when the artist has no ARTIST_MAP entry.
    """
    l.getLogger().debug("eupy.mrcrawley.spider.crawl()")
    l.getLogger().info("Set up web crawler to fetch {} data.".format(artist))
    if artist not in ARTIST_MAP:
        raise ValueError("{} not available for crawling".format(artist))
    # Spoof a desktop browser user agent so the site serves regular pages.
    process = CrawlerProcess(
        {'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'})
    process.crawl(AZLyricsSpider, ARTIST_MAP[artist], artist)
    process.start(
    )  # the script will block here until the crawling is finished
    l.getLogger().info("Crawling {} succeeded".format(artist))
    # The spider fills this module global during the crawl; declare it so
    # the result can be read here — presumably written by a spider
    # callback, confirm against AZLyricsSpider.
    global _data
    return _data
def setupFolders(folds):
    """Create every directory in *folds*; directories that already exist are kept."""
    for folder in folds:
        l.getLogger().info("Setting up {}".format(folder))
        os.makedirs(folder, exist_ok=True)
    return
def fetchArtist(artist):
    """Crawl and return the raw lyrics dataset for *artist*."""
    l.getLogger().debug("pinkySpeaker.lib.dataloader.fetchArtist()")
    return cr.crawl(artist)