def _fill(csv_file_list):
    """
    Fill the database given the list of data CSVs.

    :param csv_file_list: List of CSV file paths where the data is.
    :return: Database filled.
    """
    song_added = 0
    # Hoist the column lookups out of the row loop: SCRIPT_ROW is constant,
    # so each .index() (an O(n) scan) only needs to run once, not per row.
    song_name_col = SCRIPT_ROW.index('SONG_NAME')
    artist_name_col = SCRIPT_ROW.index('ARTIST_NAME')
    lyrics_col = SCRIPT_ROW.index('LYRICS')
    artist_url_col = SCRIPT_ROW.index('ARTIST_URL')
    song_url_col = SCRIPT_ROW.index('SONG_URL')
    for csv_file_name in tqdm(csv_file_list, total=len(csv_file_list)):
        with open(csv_file_name, 'r') as csv_file:
            # Drop empty rows, then skip the header line ([1:]).
            rows = [row for row in csv.reader(csv_file) if row][1:]
            for row in rows:
                try:
                    # Ignore malformed rows whose column count differs
                    # from the expected schema.
                    if len(row) == len(SCRIPT_ROW):
                        song_name = row[song_name_col]
                        artist_name = row[artist_name_col]
                        # Only insert songs not already in the database.
                        if not song_service.get_song_by_name_and_artist(
                                song_name, artist_name):
                            song_id = song_service.add_song(
                                artist_name, song_name, row[lyrics_col],
                                row[artist_url_col], row[song_url_col])
                            # add_song presumably returns a falsy value on
                            # failure; count only successful inserts.
                            song_added += int(bool(song_id))
                except Exception as e:
                    # Best-effort import: one bad row must not abort the
                    # whole fill. log.warn is a deprecated alias — use
                    # log.warning.
                    log.warning(f'Skipping row due to [{e}]')
                    log.warning(f'Row: {row}')
    log.info(f'Songs added: [{song_added}]')
def _extract(id_list):
    """
    Extract the maximum distance given all the song identifiers.

    :param id_list: List full of song identifiers.
    :return: Maximum distance.
    """
    log.info('Extracting maximum distance...')
    if SCRIPT_PARALLEL:
        # Fan out one task per song id; each worker computes that song's
        # maximum distance against the whole index.
        with ProcessPool(SCRIPT_PROCESS_AMOUNT) as pool:
            args_list = [(song_id, len(id_list)) for song_id in id_list]
            r = list(
                tqdm(pool.imap(__get_maximum_distance,
                               args_list,
                               chunksize=SCRIPT_CHUNK_SIZE),
                     total=len(args_list)))
        # default=0.0 keeps an empty id_list from raising ValueError and
        # matches the sequential branch's starting value.
        maximum_distance = max(r, default=0.0)
    else:
        nmslib_index = Nmslib()
        nmslib_index.load(FILE_NAME_INDEX)
        maximum_distance = 0.0
        for idx, song_id in tqdm(enumerate(id_list), total=len(id_list)):
            features = searcher.extract_features_from_song(song_id)
            # Songs without extractable features are skipped.
            if features is not None:
                song_maximum_distance = searcher.get_maximum_distance(
                    features, nmslib_index, len(id_list))
                if song_maximum_distance > maximum_distance:
                    maximum_distance = song_maximum_distance
            # Checkpoint the running maximum every 1000 songs so a crash
            # does not lose all progress.
            if idx % 1000 == 0:
                _save(maximum_distance)
    log.info('Extracted!')
    return maximum_distance
def _get_all_lyrics():
    """
    Retrieve all the song lyrics from the database.

    :return: List of song lyrics.
    """
    log.info('Getting all songs...')
    # Pull every song row, then project down to just the lyrics column.
    all_songs = song_service.get_all_songs()
    lyrics_list = []
    for song in all_songs:
        lyrics_list.append(song.lyrics)
    log.info(f'Lyrics: [{len(lyrics_list)}]')
    return lyrics_list
def _save(maximum_distance):
    """
    Persist the maximum distance to its file.

    :param maximum_distance: Maximum distance to save.
    :return: Maximum distance saved.
    """
    log.info(f'Saving maximum distance: [{maximum_distance}]')
    # Overwrite the checkpoint file with the value's string form.
    with open(FILE_NAME_MAXIMUM_DISTANCE, 'w') as out_file:
        out_file.write(str(maximum_distance))
    log.info('Done!')
def _shape(lyrics_list):
    """
    Shape the normalized lyrics list.

    :param lyrics_list: List of lyrics.
    :return: Shaped list.
    """
    log.info('Shaping lyrics list...')
    # Delegate the reshaping to the word2vec helper; the result exposes
    # a .shape attribute (array-like).
    shaped = word2vec.shape(lyrics_list)
    log.info(f'Shaping done. Shape: [{shaped.shape}]')
    return shaped
def _get_csv_file_list(unzipping_output_folder):
    """
    Extract all the CSV file paths given the generated folder.

    :param unzipping_output_folder: Folder path.
    :return: List of CSV file paths.
    """
    # recursive=True makes '**' match any number of nested directories;
    # without it glob treats '**' as a plain '*' and only searches one
    # level deep. glob.glob already returns a list, so no identity
    # comprehension is needed.
    csv_file_list = glob.glob(f'{unzipping_output_folder}/**/*.csv',
                              recursive=True)
    log.info(f'{len(csv_file_list)} CSV files extracted.')
    return csv_file_list
def _build(lyrics_list):
    """
    Build the NMSLIB index given the prepared input of data.

    :param lyrics_list: Index needed data input.
    :return: NMSLIB index built and saved.
    """
    log.info('Building index...')
    # Fit the index on the shaped lyrics and persist it to disk.
    nmslib_index = Nmslib()
    nmslib_index.fit(lyrics_list)
    nmslib_index.save(FILE_NAME_INDEX)
    log.info('Index built!')
def _clean_lyrics(lyrics_list):
    """
    Clean all lyrics for being useful for the training.

    :param lyrics_list: List of lyrics to clean.
    :return: Cleaned representation of the given lyrics list.
    """
    log.info('Cleaning all lyrics...')
    # Comprehension over a tqdm iterator keeps the progress bar while
    # avoiding the manual append loop.
    lyrics_list_cleaned = [
        word2vec.clean_lyrics(lyrics)
        for lyrics in tqdm(lyrics_list, total=len(lyrics_list))
    ]
    log.info(f'Lyrics cleaned: [{len(lyrics_list_cleaned)}]')
    return lyrics_list_cleaned
def _normalize_lyrics(lyrics_list, w2v_instance):
    """
    Normalize a list of song lyrics given the trained word2vec model.

    :param lyrics_list: List of songs.
    :param w2v_instance: Trained word2vec model instance.
    :return: Normalized lyrics list.
    """
    normalized_list = []
    log.info('Normalizing all lyrics...')
    for position, raw_lyrics in tqdm(enumerate(lyrics_list),
                                     total=len(lyrics_list)):
        normalized = word2vec.normalize(raw_lyrics, w2v_instance)
        # Lyrics that cannot be normalized are dropped from the index.
        if normalized is None:
            continue
        # position + 1 looks like a 1-based database id matching insertion
        # order — TODO confirm against song_service. The index slot is the
        # count of entries accepted so far (the next position in the list).
        song_service.set_index_id(position + 1, len(normalized_list))
        normalized_list.append(normalized)
    log.info(f'Lyrics normalized: [{len(normalized_list)}]')
    return normalized_list
def _unzip():
    """
    Unzip the input file to the given folder.

    :return: File unzipped (the output folder path).
    :raises ValueError: If the input file is not a ``.zip`` archive.
    """
    # An assert would be stripped under `python -O`; validate explicitly.
    if not args.input_file.endswith('.zip'):
        raise ValueError(
            f'Expected a .zip input file, got [{args.input_file}]')
    log.info('Opening input file...')
    with zipfile.ZipFile(args.input_file, 'r') as zip_file:
        log.info(
            f'Extracting all file from [{args.input_file}] into [{args.unzipping_output_folder}]...'
        )
        zip_file.extractall(args.unzipping_output_folder)
    log.info('Unzipping done!')
    return args.unzipping_output_folder
def _train(w2c_instance, lyrics_list):
    """
    Train the word2vec instance given the prepared data.

    :param w2c_instance: Word2vec instance to train.
    :param lyrics_list: Data needed for training the instance.
    :return: Model trained and saved.
    """
    log.info('Introducing the vocabulary...')
    w2c_instance.build_vocab(lyrics_list)
    log.info('Vocabulary introduced.')
    # Read the corpus/epoch figures off the instance after the vocabulary
    # pass, then feed them straight back into train().
    corpus_total = w2c_instance.corpus_count
    training_epochs = w2c_instance.epochs
    log.info(f'Lyrics count: [{corpus_total}]')
    log.info(f'Epochs count: [{training_epochs}]')
    log.info('Start training...')
    w2c_instance.train(lyrics_list,
                       total_examples=corpus_total,
                       epochs=training_epochs)
    log.info('Trained!')
    log.info('Saving instance...')
    word2vec.save_w2v_instance(FILE_NAME_W2V, w2c_instance)
    log.info(f'Saved in [{FILE_NAME_W2V}]')