def generate_levenshtein_seed_dict(zero_seed_playlists, all_playlist_names,
                                   all_playlists_dict, playlist_df,
                                   RESULTS_FOLDER, filename, recompute,
                                   seed_k=100):
    fname = os.path.join(RESULTS_FOLDER, filename)
    if recompute:
        comp_memory = {}
        seed_set = {}
        for idx, playl in enumerate(zero_seed_playlists):
            playlist_name = Levenshtein.pre_process(playl['name'])
            print('\r{:.2f} % :: Retrieving levenshtein similarities for \'{}\''.format(
                ((idx + 1) / len(zero_seed_playlists)) * 100, playlist_name),
                end='')
            # accumulator for the closest playlist names found so far
            return_dict = {'counter': 0, 'lowest': [], 'targets': []}
            _ = all_playlist_names.apply(
                Levenshtein.get_closest,
                args=(playlist_name, return_dict, comp_memory))
            seeds = Levenshtein.get_seed_tracks(
                playlist_df, return_dict, all_playlists_dict, seed_k=seed_k)
            # keep only the track identifier from each seed tuple
            seed_set[playl['pid']] = [x[0] for x in seeds]
        store_obj(seed_set, fname, 'pickle')
    else:
        seed_set = load_obj(fname, 'pickle')
    return seed_set

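# Usage sketch (an illustration, not from the original source): zero-seed
# playlists only carry a title, so seed tracks are borrowed from the
# Levenshtein-closest training playlist names. `name_series` is assumed to
# be a pandas Series of training playlist names, since the function calls
# .apply() on it.
#
#   seed_set = generate_levenshtein_seed_dict(
#       zero_seed_playlists, name_series, all_playlists_dict, playlist_df,
#       RESULTS_FOLDER, 'levenshtein_seed_dict.pckl', recompute=True,
#       seed_k=100)
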
def generate_all_train_playlist_set(x_train_pids, statistician,
                                    results_folder, recompute):
    all_train_playlist_set_fname = os.path.join(
        results_folder, 'all_train_playlist_set.pckl')
    if recompute:
        all_train_playlist_set = {}
        for pid in x_train_pids:
            all_train_playlist_set[pid] = statistician.all_playlists_dict[pid]
        store_obj(all_train_playlist_set, all_train_playlist_set_fname,
                  'pickle')
    else:
        all_train_playlist_set = load_obj(all_train_playlist_set_fname,
                                          'pickle')
    return all_train_playlist_set

def bucketing_eval_playlists(x_dev_pids, x_test_pids, all_playlists_dict,
                             RESULTS_FOLDER, recompute):
    test_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                            'test_playlist_dict.pckl')
    dev_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                           'dev_playlist_dict.pckl')
    if recompute:
        dev_playlists = [all_playlists_dict[pid] for pid in x_dev_pids]
        test_playlists = [all_playlists_dict[pid] for pid in x_test_pids]
        # gather playlist lengths to generate the evaluation buckets
        dev_lengths = [len(x['tracks']) for x in dev_playlists]
        test_lengths = [len(x['tracks']) for x in test_playlists]
        dev_indices = get_testing_indices(dev_lengths)
        test_indices = get_testing_indices(test_lengths)
        dev_playlist_dict = get_complete_testing_sets(dev_playlists,
                                                      dev_indices)
        test_playlist_dict = get_complete_testing_sets(test_playlists,
                                                       test_indices)
        store_obj(dev_playlist_dict, dev_playlist_dict_fname, 'pickle')
        store_obj(test_playlist_dict, test_playlist_dict_fname, 'pickle')
    else:
        dev_playlist_dict = load_obj(dev_playlist_dict_fname, 'pickle')
        test_playlist_dict = load_obj(test_playlist_dict_fname, 'pickle')
    return dev_playlist_dict, test_playlist_dict

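# Usage sketch (inferred from later code, not authoritative): the returned
# dicts appear to be keyed by seed-bucket size, e.g. dev_playlist_dict[0]
# holds the dev playlists evaluated with zero seed tracks.
#
#   dev_playlist_dict, test_playlist_dict = bucketing_eval_playlists(
#       x_dev_pids, x_test_pids, all_playlists_dict, RESULTS_FOLDER,
#       recompute=True)
#   zero_dev = dev_playlist_dict[0]  # zero-seed evaluation bucket
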
def get_correspondant_list(pid_to_name, seed_k, results_folder, recompute):
    list_fname = os.path.join(results_folder,
                              'w2v_dev_correspondant_list.pckl')
    probs_fname = os.path.join(results_folder,
                               'w2v_dev_correspondant_list_probas.pckl')
    if recompute:
        correspondant_list_tmp_fname = 'cwva_dev_correspondant_list.csv'
        correspondant_probs_tmp_fname = 'cwva_dev_correspondant_probs.csv'
        correspondant_list = {}
        correspondant_list_probs = {}
        for ix, pid in enumerate(pid_to_name):
            print('Retrieving CWVA for \'{}\' ({:.2f} %)'.format(
                pid_to_name[pid], ((ix + 1) / len(pid_to_name)) * 100),
                end='\r')
            try:
                playlists, probabilities = get_similar_playlists(
                    pid_to_name[pid], seed_k)
                correspondant_list[pid] = playlists
                correspondant_list_probs[pid] = probabilities
                #write_to_file(pid, playlists, correspondant_list_tmp_fname)
                #write_to_file(pid, probabilities, correspondant_probs_tmp_fname)
            except KeyboardInterrupt:
                break
            except Exception:
                print('Something went wrong with playlist: \'{}\' (pid: {})'.
                      format(pid_to_name[pid], pid))
        store_obj(correspondant_list, list_fname, 'pickle')
        store_obj(correspondant_list_probs, probs_fname, 'pickle')
    else:
        correspondant_list = load_obj(list_fname, 'pickle')
        correspondant_list_probs = load_obj(probs_fname, 'pickle')
    return correspondant_list, correspondant_list_probs

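# A minimal sketch (an assumption, not the author's implementation) of the
# write_to_file helper referenced by the commented-out checkpointing calls
# above: append one CSV row per pid so partial progress survives a crash.
def write_to_file(pid, values, fname):
    import csv
    with open(fname, 'a', newline='') as f:
        # hypothetical row layout: pid followed by the retrieved values
        csv.writer(f).writerow([pid] + list(values))
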
def store_epoch_counter(self, e):
    # persist the current epoch so training can resume from a checkpoint
    self.epoch_counter = e
    store_obj(self.epoch_counter,
              os.path.join(self.store_folder, 'global_epoch_point.pckl'),
              'pickle')

def store_step_counter(self, s):
    store_obj(s, os.path.join(self.store_folder, 'global_step_point.pckl'),
              'pickle')

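def load_step_counter(self, default=0):
    # Hedged counterpart (not in the original source) to the two store
    # methods above: restores the persisted step counter, assuming load_obj
    # mirrors store_obj's (path, 'pickle') interface as used elsewhere.
    fname = os.path.join(self.store_folder, 'global_step_point.pckl')
    if os.path.exists(fname):
        return load_obj(fname, 'pickle')
    return default
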
        binary=True)
    x_train_pids = load_obj(x_train_pids_fname, 'pickle')
    print('Calculating average tokens for playlist titles ...')
    playlist_df = pd.read_csv(playlist_df_fname, index_col=0)
    df = playlist_df[playlist_df['pid'].isin(x_train_pids)]
    del playlist_df
    return_vecs = {}
    _ = df.apply(get_vecs, axis=1, args=(return_vecs, ))
    return_vecs_norm = mean_and_unify(return_vecs)
    playlist_title_2_vec, translation_dict = avg_vector_to_matrix(
        return_vecs_norm)
    store_obj(playlist_title_2_vec, playlist_title_2_vec_fname, 'pickle')
    store_obj(translation_dict, translation_dict_fname, 'pickle')
else:
    playlist_title_2_vec = load_obj(playlist_title_2_vec_fname, 'pickle')
    translation_dict = load_obj(translation_dict_fname, 'pickle')

complete_dev_seed_list_fname = os.path.join(W2V_FOLDER,
                                            'complete_dev_seed_list.pckl')
if recompute:
    dev_playlist_dict = load_obj(dev_playlist_dict_fname, 'pickle')
    zero_dev = dev_playlist_dict[0]
    dev_pid_to_name = {}
    for dplaylist in zero_dev:
        dev_pid_to_name[dplaylist['pid']] = dplaylist['name']
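# Likely hand-off (a hedged sketch; the surrounding lines are truncated
# here): dev_pid_to_name built above matches the pid_to_name parameter of
# get_correspondant_list defined earlier.
#
#   correspondant_list, correspondant_list_probs = get_correspondant_list(
#       dev_pid_to_name, seed_k, W2V_FOLDER, recompute=True)
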
def create_track_popularity_dict(self, recompute=False):
    """
    Iteration method leveraging the count_artists_and_tracks method to
    aggregate information out of all playlist collections.

    Parameters:
    --------------
    recompute: bool
        flag determining whether precomputed results should be used or not

    Returns:
    --------------
    track_popularity_dict: dict
        mapping track uris to their popularity count in all playlists
    """
    track_popularity_dict_fname = os.path.join(
        self.results_folder, 'track_popularity_dict.pckl')
    all_playlists_dict_fname = os.path.join(self.results_folder,
                                            'all_playlists_dict.pckl')
    track_uri_to_track_artist_string_fname = os.path.join(
        self.results_folder, 'track_uri_to_track_artist_string.pckl')
    if not os.path.exists(track_popularity_dict_fname) or recompute:
        track_uri_to_track_artist_string = {}  # TODO: fill with goods
        track_popularity_dict = {}
        total_files = len(self.all_playlist_filenames)
        for counter, playlist_file in enumerate(self.all_playlist_filenames,
                                                1):
            print(
                'Working on slice {} ({:.2f} %) (File Name: {} || Total Slices: {})'
                .format(counter, (counter / total_files) * 100,
                        playlist_file, total_files),
                end='\r')
            playlist_collection = load_obj(playlist_file, 'json')
            for playlist in playlist_collection['playlists']:
                self.all_playlists_dict[playlist['pid']] = {
                    'pid': playlist['pid'],
                    'name': playlist['name'],
                    'tracks': []
                }
                for t in playlist['tracks']:
                    track_uri = t['track_uri']
                    # update the popularity count for this track
                    if track_uri in track_popularity_dict:
                        track_popularity_dict[track_uri] += 1
                    else:
                        track_popularity_dict[track_uri] = 1
                    # append the track to the all-playlists entry
                    self.all_playlists_dict[
                        playlist['pid']]['tracks'].append(track_uri)
        # store dicts
        print('\nStoring all_playlist and popularity dicts ...')
        store_obj(track_popularity_dict, track_popularity_dict_fname,
                  'pickle')
        store_obj(self.all_playlists_dict, all_playlists_dict_fname,
                  'pickle')
        self.track_popularity_dict = track_popularity_dict
    else:
        self.track_popularity_dict = load_obj(track_popularity_dict_fname,
                                              'pickle')
        self.all_playlists_dict = load_obj(all_playlists_dict_fname,
                                           'pickle')
    return self.track_popularity_dict

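# Hedged example (not from the original source): the popularity dict built
# above can back a per-playlist median-popularity feature, e.g. a precursor
# to the 'track_popularity_median_class_quantile' column used for the
# stratified split below (the original binning step itself is not shown).
def median_track_popularity(playlist, track_popularity_dict):
    # median popularity of the playlist's tracks; unseen tracks count as 0
    counts = [track_popularity_dict.get(uri, 0) for uri in playlist['tracks']]
    return float(np.median(counts)) if counts else 0.0
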
def split_playlist_df(df, random_state, all_playlists_dict, results_folder,
                      recompute=False):
    x_train_pids_fname = os.path.join(results_folder, 'x_train_pids.pckl')
    x_dev_pids_fname = os.path.join(results_folder, 'x_dev_pids.pckl')
    x_test_pids_fname = os.path.join(results_folder, 'x_test_pids.pckl')
    if recompute:
        # To satisfy the second criterion (every track in the dev and test
        # sets must also occur in the training set), an oversized held-out
        # split is produced first and filtered afterwards.
        X_train_full, X_test = train_test_split(
            df,
            test_size=.1,
            random_state=random_state,
            stratify=df[[
                'track_popularity_median_class_quantile',
                'num_tracks_class_quantile', 'modified_at_class_quantile'
            ]])
        # filter playlists containing rare tracks that occur in the
        # held-out split but not in the training split
        x_train_pids = X_train_full.pid.values
        x_test_pids = X_test.pid.values
        all_tracks = set()
        test_playlists = {}
        for p in all_playlists_dict:
            if p in x_train_pids:
                for track in all_playlists_dict[p]['tracks']:
                    all_tracks.add(track)
            elif p in x_test_pids:
                test_playlists[p] = all_playlists_dict[p]
        missing_pid = {}
        candidates = []
        for p in test_playlists:
            is_candidate = True
            for track in test_playlists[p]['tracks']:
                if track not in all_tracks:
                    is_candidate = False
                    if p not in missing_pid:
                        missing_pid[p] = 1
                    else:
                        missing_pid[p] += 1
            if is_candidate:
                candidates.append(p)
        # do the final dev / test split
        dev_test = np.random.choice(candidates, 20000, replace=False)
        dev_test = shuffle(dev_test, random_state=random_state)
        x_dev_pids, x_test_pids = dev_test[:10000], dev_test[10000:]
        print('Storing train, dev and test playlist ids ...')
        store_obj(x_train_pids, x_train_pids_fname, 'pickle')
        store_obj(x_dev_pids, x_dev_pids_fname, 'pickle')
        store_obj(x_test_pids, x_test_pids_fname, 'pickle')
    else:
        x_train_pids = load_obj(x_train_pids_fname, 'pickle')
        x_dev_pids = load_obj(x_dev_pids_fname, 'pickle')
        x_test_pids = load_obj(x_test_pids_fname, 'pickle')
    return x_train_pids, x_dev_pids, x_test_pids

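# Sanity-check sketch (not in the original source): verifies the split's
# stated guarantee that every track in a dev/test playlist also occurs in
# some training playlist. Returns the offending pids, which should be empty.
def check_split_coverage(x_train_pids, eval_pids, all_playlists_dict):
    train_tracks = set()
    for pid in x_train_pids:
        train_tracks.update(all_playlists_dict[pid]['tracks'])
    return [pid for pid in eval_pids
            if any(t not in train_tracks
                   for t in all_playlists_dict[pid]['tracks'])]
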
if recompute:
    # load playlists and flatten them into one '<eos>'-delimited stream
    x_train = []
    print('Working on training set ...')
    for p in all_train_playlists:
        # copy before appending so the cached playlist is not mutated
        tmp_playlist = all_train_playlists[p]['tracks'] + ['<eos>']
        x_train.extend(tmp_playlist)
    print('Extracting sequences and building vocabulary ...')
    track2id, track_sequence = build_vocabulary(x_train)
    print('Filtering sequences ...')
    track2id, track_sequence = filter_sequence(track_sequence, track2id, 5,
                                               c_set_tracks)
    print('Transforming track-uri sequences into int sequences ...')
    track_sequence = sequences_to_ids(track_sequence, track2id)
    print('Storing id_sequence file ...')
    store_obj(track_sequence, id_sequence_fname, 'pickle')
    print('Storing vocabulary file ...')
    store_obj(track2id, track2id_fname, 'pickle')
else:
    track_sequence = load_obj(id_sequence_fname, 'pickle')
    track2id = load_obj(track2id_fname, 'pickle')

## END OF PRE-PROCESSING
print('Generated all files for next steps ...')
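# Toy walk-through (illustrative only) of the id-mapping step above: each
# distinct token in the flat '<eos>'-delimited stream gets an integer id so
# downstream models can consume playlists as int sequences.
#
#   toy_stream = ['uri:a', 'uri:b', '<eos>', 'uri:b', 'uri:c', '<eos>']
#   toy_vocab = {tok: i for i, tok in enumerate(dict.fromkeys(toy_stream))}
#   toy_ids = [toy_vocab[tok] for tok in toy_stream]  # [0, 1, 2, 1, 3, 2]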