def generate_levenshtein_seed_dict(zero_seed_playlists, all_playlist_names,
                                   all_playlists_dict, playlist_df,
                                   RESULTS_FOLDER, filename, recompute,
                                   seed_k=100):
    """Build (or reload) a mapping from zero-seed playlist pids to seed
    tracks picked via Levenshtein name similarity.

    Parameters:
    --------------
    zero_seed_playlists: iterable of playlist dicts with 'pid' and 'name'
    all_playlist_names: pd.Series of candidate playlist names
    all_playlists_dict: dict, pid -> playlist dict
    playlist_df: pd.DataFrame with playlist metadata
    RESULTS_FOLDER: str, cache directory
    filename: str, cache file name inside RESULTS_FOLDER
    recompute: bool, rebuild when True, otherwise load the cached pickle
    seed_k: int, number of seed tracks to retrieve per playlist

    Returns:
    --------------
    seed_set: dict, pid -> list of seed track uris
    """
    fname = os.path.join(RESULTS_FOLDER, filename)
    if not recompute:
        # fast path: reuse previously computed seeds
        return load_obj(fname, 'pickle')

    comp_memory = {}  # memoizes pairwise distance computations across playlists
    seed_set = {}
    n_playlists = len(zero_seed_playlists)
    for idx, playl in enumerate(zero_seed_playlists):
        playlist_name = Levenshtein.pre_process(playl['name'])
        progress = ((idx + 1) / n_playlists) * 100
        print('\r{:.2f} % :: Retrieving levenshtein similarities for \'{}\''.format(
            progress, playlist_name), end='')
        # accumulator filled as a side effect of the apply() below
        return_dict = {'counter': 0, 'lowest': [], 'targets': []}
        _ = all_playlist_names.apply(
            Levenshtein.get_closest,
            args=(playlist_name, return_dict, comp_memory))
        seeds = Levenshtein.get_seed_tracks(
            playlist_df, return_dict, all_playlists_dict, seed_k=seed_k)
        # keep only the track uri (first tuple element) of each seed
        seed_set[playl['pid']] = [s[0] for s in seeds]
    store_obj(seed_set, fname, 'pickle')
    return seed_set
def train_and_predict(df_matrix, dev_set, dev_pidx_row_dict, model_dict,
                      recompute=False, exclude_cold=False):
    """Fit an implicit ALS model and predict top-500 tracks per dev playlist.

    Parameters:
    --------------
    df_matrix: sparse interaction matrix used for fitting and as the
        user_items argument of `recommend`
    dev_set: dict, seed-bucket key (0, 1, 5, ...) -> dev playlists
    dev_pidx_row_dict: dict, seed-bucket key -> list of matrix row ids
    model_dict: dict with hyper-parameters ('factors', 'regularization',
        'use_gpu', 'calculate_training_loss') and file names
        ('prediction_fname', 'model_fname')
    recompute: bool, fit + predict when True, otherwise reload cached results
    exclude_cold: bool, skip the 0-seed (cold-start) bucket when True

    Returns:
    --------------
    prediction_results: dict, bucket key -> list of recommendation lists
    als: the fitted (or reloaded) AlternatingLeastSquares estimator
    """
    prediction_fname = model_dict['prediction_fname']
    model_fname = model_dict['model_fname']

    # define estimator
    als = implicit.als.AlternatingLeastSquares(
        factors=model_dict['factors'],
        regularization=model_dict['regularization'],
        use_gpu=model_dict['use_gpu'],
        calculate_training_loss=model_dict['calculate_training_loss'])

    if recompute:
        print('Fitting model ...')
        als.fit(df_matrix)
        prediction_results = {}
        for key in dev_set.keys():
            if exclude_cold and key == 0:
                continue  # no seeds to predict from in the cold-start bucket
            prediction_results[key] = []
            df_len = len(dev_pidx_row_dict[key])
            # BUGFIX: int(df_len / 100) is 0 for buckets with fewer than 100
            # playlists, which made `counter % perc` raise ZeroDivisionError;
            # clamp the progress step to at least 1
            perc = max(1, df_len // 100)
            for counter, playlist_row_id in enumerate(dev_pidx_row_dict[key]):
                if counter % perc == 0:
                    print('Predicting: {} % (k = {})'.format(
                        counter / perc, key), end='\r')
                preds = als.recommend(playlist_row_id, df_matrix, N=500)
                prediction_results[key].append(preds)
        with open(os.path.join(baseline_results_folder, prediction_fname),
                  'wb') as f:
            pickle.dump(prediction_results, f)
        with open(os.path.join(baseline_results_folder, model_fname),
                  'wb') as f:
            pickle.dump(als, f)
    else:
        prediction_results = load_obj(
            os.path.join(baseline_results_folder, prediction_fname), 'pickle')
        als = load_obj(os.path.join(baseline_results_folder, model_fname),
                       'pickle')
    return prediction_results, als
def __init__(self, data, seq_length, n_batch_size, n_vocab, step=5,
             test=False, store_folder='step_point/'):
    """
    data: can be either training, validation or test data
    seq_length: number of tracks that will be fed into the network
    step: number of words to be skipped over between training samples
        within each batch
    """
    self.data = data
    self.seq_length = seq_length
    self.n_batch_size = n_batch_size
    self.n_vocab = n_vocab
    self.store_folder = store_folder
    if not os.path.exists(self.store_folder):
        os.makedirs(self.store_folder)

    # current_idx saves progress and serves as a pointer into the data;
    # it resets to 0 once the end is reached
    step_point_fname = os.path.join(self.store_folder,
                                    'global_step_point.pckl')
    if os.path.exists(step_point_fname):
        self.current_idx = load_obj(step_point_fname, 'pickle')
    else:
        self.current_idx = 0
    self.step = step

    # calculate how many batches make up one epoch
    self.steps_per_epoch = (len(self.data) //
                            (self.n_batch_size) - 1) // self.step

    # reload or initialize the epoch counter
    epoch_point_fname = os.path.join(self.store_folder,
                                     'global_epoch_point.pckl')
    if os.path.exists(epoch_point_fname):
        self.epoch_counter = load_obj(epoch_point_fname, 'pickle')
    else:
        self.epoch_counter = 0
def bucketing_eval_playlists(x_dev_pids, x_test_pids, all_playlists_dict,
                             RESULTS_FOLDER, recompute):
    """Assemble (or reload) bucketed dev and test evaluation playlists.

    Playlist lengths drive the bucket boundaries via get_testing_indices;
    get_complete_testing_sets then materializes the per-bucket sets.

    Returns:
    --------------
    dev_playlist_dict, test_playlist_dict: bucketed evaluation sets
    """
    test_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                            'test_playlist_dict.pckl')
    dev_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                           'dev_playlist_dict.pckl')
    if not recompute:
        # reuse previously bucketed sets
        return (load_obj(dev_playlist_dict_fname, 'pickle'),
                load_obj(test_playlist_dict_fname, 'pickle'))

    dev_playlists = [all_playlists_dict[pid] for pid in x_dev_pids]
    test_playlists = [all_playlists_dict[pid] for pid in x_test_pids]
    # gather lengths to generate buckets
    dev_lengths = [len(p['tracks']) for p in dev_playlists]
    test_lengths = [len(p['tracks']) for p in test_playlists]
    dev_indices = get_testing_indices(dev_lengths)
    test_indices = get_testing_indices(test_lengths)
    dev_playlist_dict = get_complete_testing_sets(dev_playlists, dev_indices)
    test_playlist_dict = get_complete_testing_sets(test_playlists,
                                                   test_indices)
    store_obj(dev_playlist_dict, dev_playlist_dict_fname, 'pickle')
    store_obj(test_playlist_dict, test_playlist_dict_fname, 'pickle')
    return dev_playlist_dict, test_playlist_dict
def get_correspondant_list(pid_to_name, seed_k, results_folder, recompute):
    """Retrieve (or reload) CWVA-similar playlists and their probabilities.

    Parameters:
    --------------
    pid_to_name: dict, pid -> playlist name
    seed_k: int, number of similar playlists to request per name
    results_folder: str, cache directory
    recompute: bool, rebuild when True, otherwise load cached pickles

    Returns:
    --------------
    correspondant_list: dict, pid -> similar playlists
    correspondant_list_probs: dict, pid -> similarity probabilities
    """
    list_fname = os.path.join(results_folder,
                              'w2v_dev_correspondant_list.pckl')
    probs_fname = os.path.join(results_folder,
                               'w2v_dev_correspondant_list_probas.pckl')
    if recompute:
        correspondant_list = {}
        correspondant_list_probs = {}
        for ix, pid in enumerate(pid_to_name):
            print('Retrieving CWVA for \'{}\' ({:.2f} %)'.format(
                pid_to_name[pid], ((ix + 1) / len(pid_to_name)) * 100),
                end='\r')
            try:
                playlists, probabilities = get_similar_playlists(
                    pid_to_name[pid], seed_k)
                correspondant_list[pid] = playlists
                correspondant_list_probs[pid] = probabilities
            except KeyboardInterrupt:
                break  # allow a clean manual abort; partial results are stored
            except Exception as err:
                # BUGFIX: narrowed from a bare `except:` (which also swallowed
                # SystemExit etc.) and surface the actual error for debugging
                print('Something went wrong with playlist: \'{}\' (pid: {}) (Error: {})'.
                      format(pid_to_name[pid], pid, err))
        store_obj(correspondant_list, list_fname, 'pickle')
        store_obj(correspondant_list_probs, probs_fname, 'pickle')
    else:
        correspondant_list = load_obj(list_fname, 'pickle')
        correspondant_list_probs = load_obj(probs_fname, 'pickle')
    return correspondant_list, correspondant_list_probs
def generate_all_train_playlist_set(x_train_pids, statistician,
                                    results_folder, recompute):
    """Collect (or reload) the full playlist dicts for all training pids.

    Returns:
    --------------
    all_train_playlist_set: dict, pid -> playlist dict
    """
    all_train_playlist_set_fname = os.path.join(
        results_folder, 'all_train_playlist_set.pckl')
    if recompute:
        # project the statistician's full playlist lookup onto the train pids
        all_train_playlist_set = {
            pid: statistician.all_playlists_dict[pid]
            for pid in x_train_pids
        }
        store_obj(all_train_playlist_set, all_train_playlist_set_fname,
                  'pickle')
    else:
        all_train_playlist_set = load_obj(all_train_playlist_set_fname,
                                          'pickle')
    return all_train_playlist_set
# extend the module search path so the `tools` package (one level up) resolves
sys.path.append('../')
from collections import Counter
from copy import deepcopy
from keras.utils import to_categorical
from tools.io import extract_pids, load_obj, store_obj, write_recommendations_to_file

print('#' * 80)
print('Track2Seq Model')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

# central configuration; all paths and names below come from config.json
t2s_config = load_obj('config.json', 'json')
input_folder = t2s_config['RESULTS_FOLDER']  # data of pre-processing steps
model_folder = t2s_config[
    'T2S_MODEL_FOLDER']  # where model checkpoints are stored
model_name = t2s_config['T2S_MODEL_NAME']  # name of model
full_model_path = os.path.join(model_folder, model_name)

# generate folder
if not os.path.exists(full_model_path):
    print('Created {} ...'.format(full_model_path))
    os.makedirs(full_model_path)

print('Loading data ...')
# id_sequence: flat sequence of track ids produced by pre-processing;
# track2id: vocabulary mapping track uri -> integer id
data = load_obj(os.path.join(input_folder, 'id_sequence.pckl'), 'pickle')
vocab = load_obj(os.path.join(input_folder, 'track2id.pckl'), 'pickle')
track2int = vocab
def prepare_data_full_cf(
        df_filename=os.path.join(baseline_results_folder,
                                 'playlist_train.csv'),
        playlist_src_folder=t2s_config['PLAYLIST_FOLDER'],
        item_dict_filename=os.path.join(baseline_results_folder,
                                        'track_uri_to_item_id.pckl'),
        user_dict_filename=os.path.join(baseline_results_folder,
                                        'playlist_id_to_pidx.pckl'),
        test_playlist_fname=os.path.join(t2s_config['RESULTS_FOLDER'],
                                         'filled_dev_playlists_dict.pckl'),
        train_pid_ids_fname=os.path.join(t2s_config['RESULTS_FOLDER'],
                                         'x_train_pids.pckl'),
        test_pidx_row_dict_fname=os.path.join(baseline_results_folder,
                                              'test_pidx_row_dict.pckl'),
        recompute=True):
    """
    Prepares a list of lists where every individual list stores track ids.
    Also stores pid to match information at a later point.

    Each appended triple is [row id, item id, 1]: an implicit-feedback
    "rating" of 1 per (playlist, track) interaction. Training playlists
    are added first, then the seed tracks of the dev/test buckets so the
    evaluation playlists get matrix rows too.

    Parameters:
    --------------
    df_filename: str
        target CSV for the interaction triples (tab-separated, no header)
    playlist_src_folder: str
        folder of playlist JSON slices
    item_dict_filename, user_dict_filename, test_pidx_row_dict_fname: str
        pickle targets for the lookup dicts built below
    test_playlist_fname: str
        pickle with the bucketed dev playlists (keys 0, 1, 5, 10, 25, 100)
    train_pid_ids_fname: str
        pickle with the training pid collection
    recompute: bool
        flag which determines if stored information should be used

    Returns:
    --------------
    res_df: pd.DataFrame, mapping user to item interaction
    item_dict: dict, item id to track uri
    user_dict: dict, simplified playlist id to pid (inverted on return)
    test_pidx_row_dict: dict, bucket key -> matrix row ids of dev playlists
    """
    if recompute:
        counter = 0
        total_files = len(os.listdir(playlist_src_folder))
        list_of_list = []
        item_dict = {}
        user_dict = {}
        item_counter = 0
        # NOTE(review): never written or read below — appears to be dead
        playlists_of_tracks_uri = []
        pidx = 0  # next free matrix row id
        train_pid_ids_dict = load_obj(train_pid_ids_fname, dtype='pickle')
        for playlist_json in os.listdir(playlist_src_folder):
            print(
                "Working on slice {} ({:.2f} %) (File Name: {} || Total Slices: {})"
                .format(counter, (counter / total_files) * 100, playlist_json,
                        total_files),
                end='\r')
            counter += 1
            data_json = load_obj(
                os.path.join(playlist_src_folder, playlist_json),
                dtype='json')
            for playlist in data_json['playlists']:
                if playlist['pid'] not in train_pid_ids_dict:
                    continue  # filter out any test and dev playlists
                if playlist['pid'] not in user_dict:
                    user_dict[playlist['pid']] = pidx
                    pidx += 1
                for track in playlist['tracks']:
                    # assign a stable integer item id per track uri
                    if track['track_uri'] in item_dict:
                        track_id = item_dict[track['track_uri']]
                    else:
                        track_id = item_counter
                        item_dict[track['track_uri']] = track_id
                        item_counter += 1
                    list_of_list.append(
                        [user_dict[playlist['pid']], track_id,
                         1])  # pid, track_id, rating
        # add dev set to matrix and dicts
        print('Loading Test/Dev Set...')
        test_pidx_row_dict = {}
        test_set = load_obj(test_playlist_fname, 'pickle')
        for key in [0, 1, 5, 10, 25, 100]:  # seed-size buckets
            list_of_dev_playlists = test_set[key]
            test_pidx_row_dict[key] = []
            for playlist in list_of_dev_playlists:
                if len(playlist['seed']) < 1:
                    continue  # filter out any 0 seed playlists
                if playlist['pid'] not in user_dict:
                    # remember which matrix row belongs to this dev playlist
                    test_pidx_row_dict[key].append(pidx)
                    user_dict[playlist['pid']] = pidx
                    pidx += 1
                for track in playlist['seed']:
                    if track in item_dict:
                        track_id = item_dict[track]
                    else:
                        track_id = item_counter
                        item_dict[track] = track_id
                        item_counter += 1
                    list_of_list.append(
                        [user_dict[playlist['pid']], track_id,
                         1])  # pid, track_id, rating
        print('Storing results ...')
        # store results
        with open(item_dict_filename, 'wb') as f:
            pickle.dump(item_dict, f)
        with open(user_dict_filename, 'wb') as f:
            pickle.dump(user_dict, f)
        with open(test_pidx_row_dict_fname, 'wb') as f:
            pickle.dump(test_pidx_row_dict, f)
        res_df = pd.DataFrame(list_of_list)
        res_df.to_csv(df_filename, sep='\t', index=False, header=False)
    else:
        # load results
        res_df = load_obj(df_filename, dtype='pandas')
        item_dict = load_obj(item_dict_filename, dtype='pickle')
        user_dict = load_obj(user_dict_filename, dtype='pickle')
        test_pidx_row_dict = load_obj(test_pidx_row_dict_fname,
                                      dtype='pickle')
    # user_dict is inverted on return: matrix row id -> pid
    return res_df, item_dict, {v: k
                               for k, v in user_dict.items()
                               }, test_pidx_row_dict
from scipy.sparse import csr_matrix, lil_matrix from tools.io import load_obj from tools.metrics import recsys_metrics ################################################################## ############################## SETUP ############################# ################################################################## recompute = True baseline_results_folder = 'baselines/' dev_playlist_fname = 'results/dev_playlist_dict.pckl' if not os.path.exists(baseline_results_folder): os.makedirs(baseline_results_folder) t2s_config = load_obj('config.json', 'json') ################################################################## ######################### HYPER PARAMETERS ####################### ################################################################## # define hyper-parameter for alternating least-squares model als_model_dict = { 'one': { 'factors': 30, 'regularization': 0.01, 'use_gpu': True, 'calculate_training_loss': True, 'model_fname': 'model_wmf_30_001_18_04_13.pckl', 'prediction_fname': 'prediction_wmf_30_001_18_04_13.pckl' },
import string
import pandas as pd
from collections import Counter
from tools.io import load_obj, store_obj, extract_pids, write_to_file

print('#' * 80)
print('Track2Seq CWVA Seeds')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

t2s_config = load_obj(
    'config.json',
    'json')  # all configuration files can be set manually as well
PLAYLIST_FOLDER = t2s_config[
    'PLAYLIST_FOLDER']  # set folder of playlist information
RESULTS_FOLDER = t2s_config[
    'RESULTS_FOLDER']  # all information will be stored here
W2V_FOLDER = t2s_config['W2V_FNAME']
RANDOM_STATE = t2s_config['RANDOM_STATE']
recompute = True
# NOTE(review): relies on `np` (numpy) being imported earlier in this file
np.random.seed(RANDOM_STATE)

# download `GoogleNews-vectors-negative300.bin.gz` from
# https://github.com/mmihaltz/word2vec-GoogleNews-vectors
w2v_fname = t2s_config['W2V_BINARY_FNAME']
def get_playlist_df(self, recompute):
    """
    Build (or reload) a pandas DataFrame with one row per playlist,
    combining each playlist's scalar metadata with a median track
    popularity, the description and a track count.

    Parameters:
    ---------------
    recompute: bool
        when True (or when no cached CSV exists) the DataFrame is rebuilt
        from `self.all_playlist_filenames`, otherwise it is read from
        'playlist_df.csv' in `self.results_folder`

    Returns:
    ---------------
    self.playlist_df: pd.DataFrame
        playlist metadata plus 'track_popularity_median', 'description'
        and 'num_tracks' columns
    """
    playlist_df_fname = os.path.join(self.results_folder, 'playlist_df.csv')
    if not os.path.exists(playlist_df_fname) or recompute:
        # the popularity lookup is required for the median computation
        if not self.track_popularity_dict:
            _ = self.create_track_popularity_dict(recompute)
        playlist_popularity = []
        # `columns` is re-derived per playlist; the value left after the
        # loops (from the last playlist seen) labels the DataFrame — this
        # assumes all playlists share the same key set. Initialized here
        # so an empty collection yields an empty frame instead of a
        # NameError.
        columns = []
        for playlist_coll_fname in self.all_playlist_filenames:
            tmp_playlist_list = []
            playlist_coll = load_obj(playlist_coll_fname, 'json')
            for playlist in playlist_coll['playlists']:
                tmp_track_pop = []
                track_count = 0
                # scalar metadata only: drop the track list and handle the
                # (optional) description separately below
                columns = [
                    x for x in playlist.keys()
                    if 'tracks' not in str(x) and 'description' not in str(x)
                ]
                columns.extend(
                    ['track_popularity_median', 'description', 'num_tracks'])
                tmp_playlist_features = [
                    playlist[x] for x in playlist.keys()
                    if 'tracks' not in str(x) and 'description' not in str(x)
                ]
                for track in playlist['tracks']:
                    track_count += 1
                    # look up popularity by track uri (the original's unused
                    # artist lookup and `track` re-binding were dropped)
                    track_uri = track['track_uri']
                    tmp_track_pop.append(
                        self.track_popularity_dict[track_uri])
                tmp_playlist_features.append(np.median(tmp_track_pop))
                tmp_playlist_features.append(
                    playlist['description']
                    if 'description' in playlist.keys() else None)
                tmp_playlist_features.append(track_count)
                tmp_playlist_list.append(tmp_playlist_features)
            playlist_popularity.extend(tmp_playlist_list)
        self.playlist_df = pd.DataFrame(playlist_popularity, columns=columns)
        # store DataFrame to HDD
        self.playlist_df.to_csv(playlist_df_fname)
    else:
        self.playlist_df = pd.read_csv(playlist_df_fname, index_col=0)
    return self.playlist_df
def track_uri_to_artist_and_title(self, uri):
    """Resolve a track uri via the lazily-loaded uri lookup dict."""
    if self.uri_dict:
        return self.uri_dict[uri]
    # first access: pull the mapping from disk before answering
    print('Loading URI dict...')
    self.uri_dict = load_obj(self.uri_dict_fname, 'pickle')
    return self.uri_dict[uri]
import numpy as np import os import pandas as pd from collections import Counter from tools.io import load_obj, store_obj print ('#' * 80) print ('Track2Seq Levenshtein Seeds') print ('#' * 80) ################################################################## ############################## SETUP ############################# ################################################################## t2s_config = load_obj('config.json', 'json') # all configuration files can be set manually as well RESULTS_FOLDER = t2s_config['RESULTS_FOLDER'] # all information will be stored here RANDOM_STATE = t2s_config['RANDOM_STATE'] recompute = True np.random.seed(RANDOM_STATE) ################################################################## ############################# METHODS ############################ ################################################################## class Levenshtein(object): def __init__(self): version = '0.1' @staticmethod
def create_track_popularity_dict(self, recompute=False):
    """
    Iteration method leveraging count_artists_and_tracks method to
    aggregate information out of all playlist collections.

    Side effect: also fills `self.all_playlists_dict`
    (pid -> {'pid', 'name', 'tracks': [track_uri, ...]}).

    Parameters:
    --------------
    recompute: bool
        flag determining whether precomputed results should be used or not

    Returns:
    --------------
    track_popularity_dict: dict
        mapping track uris to their popularity count in all playlists
    """
    track_popularity_dict_fname = os.path.join(
        self.results_folder, 'track_popularity_dict.pckl')
    all_playlists_dict_fname = os.path.join(self.results_folder,
                                            'all_playlists_dict.pckl')
    # NOTE(review): this filename is defined but nothing is ever stored
    # under it below — presumably intended for the TODO dict
    track_uri_to_track_artist_string_fname = os.path.join(
        self.results_folder, 'track_uri_to_track_artist_string.pckl')
    if not os.path.exists(track_popularity_dict_fname) or recompute:
        track_uri_to_track_artist_string = {}  # TODO: fill with goods
        track_popularity_dict = {}
        total_files = len(self.all_playlist_filenames)
        counter = 0
        for playlist_file in self.all_playlist_filenames:
            counter += 1
            print(
                "Working on slice {} ({:.2f} %) (File Name: {} || Total Slices: {})"
                .format(counter, (counter / total_files) * 100,
                        playlist_file, total_files),
                end='\r')
            playlist_collection = load_obj(playlist_file, 'json')
            for playlist in playlist_collection['playlists']:
                # register a slimmed-down entry per playlist
                self.all_playlists_dict[playlist['pid']] = {
                    'pid': playlist['pid'],
                    'name': playlist['name'],
                    'tracks': []
                }
                for t in playlist['tracks']:
                    track_uri = t['track_uri']
                    # create popularity dict
                    if track_uri in track_popularity_dict:
                        track_popularity_dict[track_uri] += 1
                    else:
                        track_popularity_dict[track_uri] = 1
                    # create all playlist dict
                    self.all_playlists_dict[
                        playlist['pid']]['tracks'].append(track_uri)
        # store dict
        print('\nStoring all_playlist and popularity dicts ...')
        store_obj(track_popularity_dict, track_popularity_dict_fname,
                  'pickle')
        store_obj(self.all_playlists_dict, all_playlists_dict_fname,
                  'pickle')
        self.track_popularity_dict = track_popularity_dict
    else:
        self.track_popularity_dict = load_obj(track_popularity_dict_fname,
                                              'pickle')
        self.all_playlists_dict = load_obj(all_playlists_dict_fname,
                                           'pickle')
    return self.track_popularity_dict
def split_playlist_df(df, random_state, all_playlists_dict, results_folder,
                      recompute=False):
    """
    Split the playlist DataFrame into train / dev / test pids.

    A stratified 90/10 split is produced first; dev and test candidates
    are then restricted to playlists whose tracks all occur in the
    training portion, so evaluation never asks for an unseen track.

    Parameters:
    --------------
    df: pd.DataFrame
        playlist metadata incl. the *_class_quantile columns used for
        stratification
    random_state: int, seed for the stratified split and shuffling
    all_playlists_dict: dict, pid -> playlist dict with a 'tracks' list
    results_folder: str, where the pid pickles are cached
    recompute: bool, recompute the splits or load cached ones

    Returns:
    --------------
    x_train_pids, x_dev_pids, x_test_pids: arrays of playlist ids
    """
    x_train_pids_fname = os.path.join(results_folder, 'x_train_pids.pckl')
    x_dev_pids_fname = os.path.join(results_folder, 'x_dev_pids.pckl')
    x_test_pids_fname = os.path.join(results_folder, 'x_test_pids.pckl')
    if recompute:
        # To meet the second criteria for all tracks in the dev
        # and test sets to be in the training set
        # a bigger split is being produced.
        X_train_full, X_test = train_test_split(
            df,
            test_size=.1,
            random_state=random_state,
            stratify=df[[
                'track_popularity_median_class_quantile',
                'num_tracks_class_quantile', 'modified_at_class_quantile'
            ]])
        # filter playlist for rare tracks that occur only in one set but not in the other
        x_train_pids = X_train_full.pid.values
        x_test_pids = X_test.pid.values
        # PERF: `pid in numpy_array` is a linear scan, turning the loop
        # below into O(n^2) over the whole collection; sets give O(1)
        # membership with identical results
        train_pid_set = set(x_train_pids)
        test_pid_set = set(x_test_pids)
        all_tracks = set()
        test_playlists = {}
        for p in all_playlists_dict:
            if p in train_pid_set:
                for track in all_playlists_dict[p]['tracks']:
                    all_tracks.add(track)
            elif p in test_pid_set:
                test_playlists[p] = all_playlists_dict[p]
        missing_pid = {}  # diagnostic only: pid -> count of unseen tracks
        candidates = []
        for p in test_playlists:
            is_candidate = True
            for track in test_playlists[p]['tracks']:
                if track not in all_tracks:
                    is_candidate = False
                    if p not in missing_pid:
                        missing_pid[p] = 1
                    else:
                        missing_pid[p] += 1
            if is_candidate:
                candidates.append(p)
        # do final dev / test split
        dev_test = np.random.choice(candidates, 20000, replace=False)
        dev_test = shuffle(dev_test, random_state=random_state)
        x_dev_pids, x_test_pids = dev_test[:10000], dev_test[10000:]
        print('Storing train, dev and test playlist ids ...')
        store_obj(x_train_pids, x_train_pids_fname, 'pickle')
        store_obj(x_dev_pids, x_dev_pids_fname, 'pickle')
        store_obj(x_test_pids, x_test_pids_fname, 'pickle')
    else:
        x_train_pids = load_obj(x_train_pids_fname, 'pickle')
        x_dev_pids = load_obj(x_dev_pids_fname, 'pickle')
        x_test_pids = load_obj(x_test_pids_fname, 'pickle')
    return x_train_pids, x_dev_pids, x_test_pids
def artist_uri_to_artist_string(self, uri):
    """Resolve an artist uri via the lazily-loaded artist lookup dict."""
    if self.artist_uri_dict:
        return self.artist_uri_dict[uri]
    # first access: pull the mapping from disk before answering
    print('Loading Artist URI dict...')
    self.artist_uri_dict = load_obj(self.artist_uri_dict_fname, 'pickle')
    return self.artist_uri_dict[uri]
def main():
    """Train the Seq2Track model or, when `training` is False, generate
    recommendations for all challenge playlists and append them to
    `result_fname`.

    Relies on module-level configuration: data, int2track, track2int,
    seq_length, n_batch_size, skips, n_layers, latent_size, epochs,
    save_steps, training, full_model_path, result_fname,
    evaluation_set_fname, challenge_track, team_name, contact_info.
    """
    # in case a specific GPU should be used
    #gpu_options = tf.GPUOptions(visible_device_list='0')
    #config = tf.ConfigProto(gpu_options=gpu_options)
    #sess = tf.Session(config=config)
    sess = tf.Session()

    # initialize data generator
    n_vocab = len(int2track)
    bg = BatchGenerator(
        data=data,
        seq_length=seq_length,
        n_batch_size=n_batch_size,
        n_vocab=n_vocab,
        step=skips,
        store_folder=os.path.join(full_model_path, 'step_point'))
    current_epoch = bg.epoch_counter

    # intialize model for training
    model = Seq2Track(
        n_batch_size=n_batch_size,
        seq_length=seq_length,
        n_vocab=n_vocab,
        n_layers=n_layers,
        latent_size=latent_size)

    # initialize model for prediction
    # reusing scope for recommendations
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        pred_model = Seq2Track(
            n_batch_size=n_batch_size,
            seq_length=seq_length,
            n_vocab=n_vocab,
            n_layers=n_layers,
            latent_size=latent_size,
            recommendation=True)

    # pick up the process where we left off - if possible
    saver = tf.train.Saver(tf.global_variables())
    init_operation = tf.global_variables_initializer()
    sess.run(init_operation)

    # check if a model exists, if so - load it
    if os.path.exists(os.path.join(full_model_path, 'checkpoint')):
        saver.restore(sess, tf.train.latest_checkpoint(full_model_path))

    # training routine
    if training:
        # run epochs
        for e in range(current_epoch, epochs):
            avg_epoch_cost = []  # store average cost per epoch
            # for any epoch initialize state as zeros
            current_state = np.zeros((n_layers, 2, n_batch_size, latent_size))
            for step in range(bg.current_idx, bg.steps_per_epoch):
                X_batch, y_batch = next(
                    bg.generate())  # generate fresh training batch
                # BUGFIX: the % 1000 branch is tested FIRST. In the original
                # it sat in an `elif` after `% 10`, making it unreachable
                # (every multiple of 1000 is also a multiple of 10).
                if step % 1000 == 0:
                    # show recommendation examples every 1000 steps
                    start_time = time.time()
                    cost, _, current_state, acc = sess.run(
                        [
                            model.cost, model.training_op, model.state,
                            model.accuracy
                        ],
                        feed_dict={
                            model.X: X_batch,
                            model.y: y_batch,
                            model.initial_state: current_state
                        })  # Compute cost and accuracy
                    avg_epoch_cost.append(cost)
                    end_time = (time.time() - start_time)
                    print(
                        'Epoch: {} - Step: {} / {} - Cost: {} - Accuracy: {} - Time: {}s'
                        .format(e, step, bg.steps_per_epoch,
                                np.mean(avg_epoch_cost), acc, end_time))
                    # Show recommendations
                    # can be changed to incorporate any track that's in int2track
                    sample_seed_sequence = [
                        'spotify:track:14AaSKhUMiR5qbNvhjlj9L',
                        'spotify:track:2tznHmp70DxMyr2XhWLOW0',
                        'spotify:track:0uqPG793dkDDN7sCUJJIVC'
                    ]
                    # BUGFIX: the original formatted a generator expression,
                    # printing its repr instead of the seed uris
                    print('Seeds: {} '.format(sample_seed_sequence))
                    results = pred_model.recommend(
                        sess, sample_seed_sequence, int2track, track2int,
                        n=500)
                    print('Recommendations: {}'.format(list(results)))
                elif step % 10 == 0:  # show progress every 10 steps
                    start_time = time.time()
                    cost, _, current_state = sess.run(
                        [model.cost, model.training_op, model.state],
                        feed_dict={
                            model.X: X_batch,
                            model.y: y_batch,
                            model.initial_state: current_state
                        })
                    avg_epoch_cost.append(cost)
                    end_time = (time.time() - start_time)
                    print('Epoch: {} - Step: {} / {} - Cost: {} - Time: {}s'.
                          format(e, step, bg.steps_per_epoch,
                                 np.mean(avg_epoch_cost), end_time))
                else:
                    # plain training step without logging
                    cost, _, current_state = sess.run(
                        [model.cost, model.training_op, model.state],
                        feed_dict={
                            model.X: X_batch,
                            model.y: y_batch,
                            model.initial_state: current_state
                        })
                    avg_epoch_cost.append(cost)
                # Save the model and the vocab
                if step != 0 and step % save_steps == 0:
                    # Save model
                    bg.store_step_counter(step)
                    bg.store_epoch_counter(e)
                    model_file_name = os.path.join(full_model_path, 'model')
                    saver.save(sess, model_file_name, global_step=step)
                    print('Model Saved To: {}'.format(model_file_name))
            # if epoch is over
            bg.store_epoch_counter(e)
            bg.current_idx = 0
            bg.store_step_counter(0)
            model_file_name = os.path.join(full_model_path, 'model')
            saver.save(sess, model_file_name, global_step=step)
            print('Model Saved To: {}'.format(model_file_name))
    else:
        # recommendation mode: resume from already-written pids
        pid_collection = extract_pids(result_fname)
        all_challenge_playlists = load_obj(evaluation_set_fname, 'pickle')
        init = tf.global_variables_initializer()
        sess.run(init)
        if os.path.exists(os.path.join(full_model_path, 'checkpoint')):
            saver.restore(sess, tf.train.latest_checkpoint(full_model_path))
        num_playlists = 0
        for k in all_challenge_playlists:
            num_playlists += len(all_challenge_playlists[k])
        print(
            'Recommending tracks for {:,} playlists...'.format(num_playlists))
        avg_time = []
        for k in all_challenge_playlists:
            for ix, playlist in enumerate(all_challenge_playlists[k]):
                start_wall_time = time.time()
                if playlist['pid'] in pid_collection:
                    continue  # already recommended in a previous run
                reco_per_playlist = []
                try:
                    # over-generate (600) so trimming to 500 below survives
                    # potential duplicates/filtering in recommend
                    reco_per_playlist = pred_model.recommend(
                        sess,
                        playlist['seed'],
                        int2track,
                        track2int,
                        n=600)
                    if not reco_per_playlist:
                        print('Something went wrong with playlist {}'.format(
                            playlist['pid']))
                        continue
                except KeyboardInterrupt:
                    sys.exit()
                except Exception as err:
                    print('Something went wrong with playlist {} (Error: {})'.
                          format(playlist['pid'], err))
                    continue
                # store recommendations
                reco_per_playlist = reco_per_playlist[:500]
                pid_collection.append(playlist['pid'])
                time_elapsed = time.time() - start_wall_time
                avg_time.append(time_elapsed)
                print(
                    'Recommended {} songs ({} / {}). Avg time per playlist: {:.2f} seconds.'
                    .format(len(reco_per_playlist), ix, num_playlists,
                            np.mean(avg_time)))
                write_recommendations_to_file(challenge_track, team_name,
                                              contact_info, playlist['pid'],
                                              reco_per_playlist, result_fname)
                with open(result_fname, 'a') as f:
                    f.write(str(playlist['pid']) + ', ')
                    f.write(', '.join(reco_per_playlist))
                    f.write('\n\n')
from collections import Counter
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tools.io import load_obj, store_obj

print('#' * 80)
print('Track2Seq Preprocessing')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

t2s_config = load_obj(
    'config.json',
    'json')  # all configuration files can be set manually as well
PLAYLIST_FOLDER = t2s_config[
    'PLAYLIST_FOLDER']  # set folder of playlist information
RESULTS_FOLDER = t2s_config[
    'RESULTS_FOLDER']  # all information will be stored here
RANDOM_STATE = t2s_config['RANDOM_STATE']
recompute = True
# NOTE(review): relies on `np` (numpy) being imported earlier in this file
np.random.seed(RANDOM_STATE)

##################################################################
############################# METHODS ############################
##################################################################