def generate_levenshtein_seed_dict(
        zero_seed_playlists,
        all_playlist_names,
        all_playlists_dict,
        playlist_df,
        RESULTS_FOLDER,
        filename,
        recompute,
        seed_k=100):
    fname = os.path.join(RESULTS_FOLDER, filename)
    if recompute:
        comp_memory = {}
        seed_set = {}
        for idx, playl in enumerate(zero_seed_playlists):
            playlist_name = Levenshtein.pre_process(playl['name'])
            print('\r{:.2f} % :: Retrieving levenshtein similarities for \'{}\''.format(
                ((idx + 1) / len(zero_seed_playlists)) * 100, playlist_name), end='')
            return_dict = {}
            return_dict['counter'] = 0
            return_dict['lowest'] = []
            return_dict['targets'] = []
            _ = all_playlist_names.apply(Levenshtein.get_closest, args=(playlist_name, return_dict, comp_memory))
            seeds = Levenshtein.get_seed_tracks(playlist_df, return_dict, all_playlists_dict, seed_k=seed_k)
            seed_set[playl['pid']] = [x[0] for x in seeds]

        store_obj(seed_set, fname, 'pickle')
    else:
        seed_set = load_obj(fname, 'pickle')

    return seed_set
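All of the examples in this listing assume two small persistence helpers, store_obj and load_obj, that are defined elsewhere in the repository. Their actual implementation is not shown here; a minimal sketch that matches the way they are called (a target path plus a 'pickle' or 'json' format flag) might look like this:

import json
import pickle


def store_obj(obj, fname, serializer='pickle'):
    # Serialize obj to fname; the flag selects pickle or json output.
    if serializer == 'pickle':
        with open(fname, 'wb') as f:
            pickle.dump(obj, f)
    elif serializer == 'json':
        with open(fname, 'w') as f:
            json.dump(obj, f)
    else:
        raise ValueError('Unknown serializer: {}'.format(serializer))


def load_obj(fname, serializer='pickle'):
    # Deserialize an object previously written by store_obj (or a raw
    # .json slice of the dataset, as in Example #8).
    if serializer == 'pickle':
        with open(fname, 'rb') as f:
            return pickle.load(f)
    elif serializer == 'json':
        with open(fname) as f:
            return json.load(f)
    raise ValueError('Unknown serializer: {}'.format(serializer))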
Example #2
def generate_all_train_playlist_set(x_train_pids, statistician, results_folder,
                                    recompute):
    all_train_playlist_set_fname = os.path.join(results_folder,
                                                'all_train_playlist_set.pckl')
    if recompute:
        all_train_playlist_set = {}
        for pid in x_train_pids:
            all_train_playlist_set[pid] = statistician.all_playlists_dict[pid]
        store_obj(all_train_playlist_set, all_train_playlist_set_fname,
                  'pickle')
    else:
        all_train_playlist_set = load_obj(all_train_playlist_set_fname,
                                          'pickle')

    return all_train_playlist_set
Example #3
def bucketing_eval_playlists(x_dev_pids, x_test_pids, all_playlists_dict,
                             RESULTS_FOLDER, recompute):
    test_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                            'test_playlist_dict.pckl')
    dev_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                           'dev_playlist_dict.pckl')

    if recompute:
        dev_playlists = []
        test_playlists = []
        dev_pid_order = []
        test_pid_order = []

        for pid in x_dev_pids:
            dev_playlists.append(all_playlists_dict[pid])

        for pid in x_test_pids:
            test_playlists.append(all_playlists_dict[pid])

        # gather lengths to generate buckets
        dev_lengths = [len(x['tracks']) for x in dev_playlists]
        test_lengths = [len(x['tracks']) for x in test_playlists]

        dev_indices = get_testing_indices(dev_lengths)
        test_indices = get_testing_indices(test_lengths)

        dev_playlist_dict = get_complete_testing_sets(dev_playlists,
                                                      dev_indices)
        test_playlist_dict = get_complete_testing_sets(test_playlists,
                                                       test_indices)

        store_obj(dev_playlist_dict, dev_playlist_dict_fname, 'pickle')
        store_obj(test_playlist_dict, test_playlist_dict_fname, 'pickle')
    else:
        dev_playlist_dict = load_obj(dev_playlist_dict_fname, 'pickle')
        test_playlist_dict = load_obj(test_playlist_dict_fname, 'pickle')

    return dev_playlist_dict, test_playlist_dict
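get_testing_indices and get_complete_testing_sets are helpers from the same repository and are not reproduced in this excerpt. Purely to illustrate the length-based bucketing the comment above refers to, a hypothetical get_testing_indices could group playlist indices by how many seed tracks each playlist can supply (the bucket sizes below are an assumption, not the repository's actual values):

def get_testing_indices(lengths, bucket_sizes=(5, 10, 25, 100)):
    # Hypothetical sketch: for every bucket size, collect the indices of
    # playlists that contain at least that many tracks.
    buckets = {size: [] for size in bucket_sizes}
    for idx, num_tracks in enumerate(lengths):
        for size in bucket_sizes:
            if num_tracks >= size:
                buckets[size].append(idx)
    return buckets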
Example #4
def get_correspondant_list(pid_to_name, seed_k, results_folder, recompute):
    list_fname = os.path.join(results_folder,
                              'w2v_dev_correspondant_list.pckl')
    probs_fname = os.path.join(results_folder,
                               'w2v_dev_correspondant_list_probas.pckl')

    if recompute:
        correspondant_list_tmp_fname = 'cwva_dev_correspondant_list.csv'
        correspondant_probs_tmp_fname = 'cwva_dev_correspondant_probs.csv'

        correspondant_list = {}
        correspondant_list_probs = {}
        for ix, pid in enumerate(pid_to_name):

            print('Retrieving CWVA for \'{}\' ({:.2f} %)'.format(
                pid_to_name[pid], ((ix + 1) / len(pid_to_name)) * 100),
                  end='\r')
            try:
                playlists, probabilities = get_similar_playlists(
                    pid_to_name[pid], seed_k)
                correspondant_list[pid] = playlists
                correspondant_list_probs[pid] = probabilities
                #write_to_file(pid, playlists, correspondant_list_tmp_fname)
                #write_to_file(pid, probabilities, correspondant_probs_tmp_fname)
            except KeyboardInterrupt:
                break
            except Exception:
                print('Something went wrong with playlist: \'{}\' (pid: {})'.
                      format(pid_to_name[pid], pid))
        store_obj(correspondant_list, list_fname, 'pickle')
        store_obj(correspondant_list_probs, probs_fname, 'pickle')
    else:
        correspondant_list = load_obj(list_fname, 'pickle')
        correspondant_list_probs = load_obj(probs_fname, 'pickle')

    return correspondant_list, correspondant_list_probs
Example #5
    def store_epoch_counter(self, e):
        self.epoch_counter = e
        store_obj(self.epoch_counter,
                  os.path.join(self.store_folder, 'global_epoch_point.pckl'),
                  'pickle')
Example #6
    def store_step_counter(self, s):
        store_obj(s, os.path.join(self.store_folder, 'global_step_point.pckl'),
                  'pickle')
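Examples #5 and #6 only persist the training counters; a matching restore step is not part of this excerpt. Assuming the same store_folder layout and the load_obj helper sketched under Example #1, a counterpart method might look like this:

    def load_counters(self):
        # Hypothetical helper: restore the persisted epoch and step counters,
        # falling back to 0 when no checkpoint files have been written yet.
        epoch_fname = os.path.join(self.store_folder, 'global_epoch_point.pckl')
        step_fname = os.path.join(self.store_folder, 'global_step_point.pckl')
        self.epoch_counter = (load_obj(epoch_fname, 'pickle')
                              if os.path.exists(epoch_fname) else 0)
        self.step_counter = (load_obj(step_fname, 'pickle')
                             if os.path.exists(step_fname) else 0)
        return self.epoch_counter, self.step_counter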
Example #7
                                                                binary=True)

        x_train_pids = load_obj(x_train_pids_fname, 'pickle')
        print('Calculating average tokens for playlist titles ...')
        playlist_df = pd.read_csv(playlist_df_fname, index_col=0)
        df = playlist_df[playlist_df['pid'].isin(x_train_pids)]
        del playlist_df

        return_vecs = {}
        _ = df.apply(get_vecs, axis=1, args=(return_vecs, ))
        return_vecs_norm = mean_and_unify(return_vecs)

        playlist_title_2_vec, translation_dict = avg_vector_to_matrix(
            return_vecs_norm)

        store_obj(playlist_title_2_vec, playlist_title_2_vec_fname, 'pickle')
        store_obj(translation_dict, translation_dict_fname, 'pickle')
    else:
        playlist_title_2_vec = load_obj(playlist_title_2_vec_fname, 'pickle')
        translation_dict = load_obj(translation_dict_fname, 'pickle')

    complete_dev_seed_list_fname = os.path.join(W2V_FOLDER,
                                                'complete_dev_seed_list.pckl')

    if recompute:
        dev_playlist_dict = load_obj(dev_playlist_dict_fname, 'pickle')
        zero_dev = dev_playlist_dict[0]
        dev_pid_to_name = {}
        for dplaylist in zero_dev:
            dev_pid_to_name[dplaylist['pid']] = dplaylist['name']
Example #8
    def create_track_popularity_dict(self, recompute=False):
        """
        Iteration method leveraging count_artists_and_tracks method 
        to aggregate information out of all playlist collections.
        
        Parameters:
        --------------
        recompute:    bool flag determining whether precomputed results should be used or not
        
        Returns:
        --------------
        track_popularity_dict:     dict mapping track uris to their popularity count in all playlists
        """
        track_popularity_dict_fname = os.path.join(
            self.results_folder, 'track_popularity_dict.pckl')
        all_playlists_dict_fname = os.path.join(self.results_folder,
                                                'all_playlists_dict.pckl')
        track_uri_to_track_artist_string_fname = os.path.join(
            self.results_folder, 'track_uri_to_track_artist_string.pckl')

        if not os.path.exists(track_popularity_dict_fname) or recompute:
            track_uri_to_track_artist_string = {}  # TODO: fill with goods
            track_popularity_dict = {}
            total_files = len(self.all_playlist_filenames)
            counter = 0
            for playlist_file in self.all_playlist_filenames:
                counter += 1
                print(
                    "Working on slice {} ({:.2f} %) (File Name:  {} || Total Slices: {})"
                    .format(counter, (counter / total_files) * 100,
                            playlist_file, total_files),
                    end='\r')
                playlist_collection = load_obj(playlist_file, 'json')
                for playlist in playlist_collection['playlists']:

                    self.all_playlists_dict[playlist['pid']] = {
                        'pid': playlist['pid'],
                        'name': playlist['name'],
                        'tracks': []
                    }

                    for t in playlist['tracks']:
                        track_uri = t['track_uri']
                        # create popularity dict
                        if track_uri in track_popularity_dict:
                            track_popularity_dict[track_uri] += 1
                        else:
                            track_popularity_dict[track_uri] = 1

                        # create all playlist dict
                        self.all_playlists_dict[
                            playlist['pid']]['tracks'].append(track_uri)

            # store dict
            print('\nStoring all_playlist and popularity dicts ...')
            store_obj(track_popularity_dict, track_popularity_dict_fname,
                      'pickle')
            store_obj(self.all_playlists_dict, all_playlists_dict_fname,
                      'pickle')
            self.track_popularity_dict = track_popularity_dict
        else:
            self.track_popularity_dict = load_obj(track_popularity_dict_fname,
                                                  'pickle')
            self.all_playlists_dict = load_obj(all_playlists_dict_fname,
                                               'pickle')

        return self.track_popularity_dict
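A hedged usage sketch for the method above: 'statistician' stands in for whatever object carries results_folder, all_playlist_filenames and all_playlists_dict (the surrounding class is not shown in this excerpt), and the top-10 report is an illustration, not part of the original code.

track_popularity = statistician.create_track_popularity_dict(recompute=True)

# Rank track URIs by how often they occur across all playlists.
most_popular = sorted(track_popularity.items(),
                      key=lambda item: item[1], reverse=True)[:10]
for uri, count in most_popular:
    print('{} occurs in {} playlist entries'.format(uri, count))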
Example #9
def split_playlist_df(df,
                      random_state,
                      all_playlists_dict,
                      results_folder,
                      recompute=False):
    x_train_pids_fname = os.path.join(results_folder, 'x_train_pids.pckl')
    x_dev_pids_fname = os.path.join(results_folder, 'x_dev_pids.pckl')
    x_test_pids_fname = os.path.join(results_folder, 'x_test_pids.pckl')

    if recompute:
        # To meet the second criterion (all tracks in the dev and test sets
        # must also appear in the training set), a larger initial split is
        # produced here.

        X_train_full, X_test = train_test_split(
            df,
            test_size=.1,
            random_state=random_state,
            stratify=df[[
                'track_popularity_median_class_quantile',
                'num_tracks_class_quantile', 'modified_at_class_quantile'
            ]])

        # filter playlists containing rare tracks that occur only in one set
        # but not in the other; plain Python sets make the pid membership
        # checks below fast
        x_train_pids = X_train_full.pid.values
        x_test_pids = X_test.pid.values
        x_train_pid_set = set(x_train_pids)
        x_test_pid_set = set(x_test_pids)

        all_tracks = set()
        test_playlists = {}

        for p in all_playlists_dict:
            if p in x_train_pid_set:
                for track in all_playlists_dict[p]['tracks']:
                    all_tracks.add(track)
            elif p in x_test_pid_set:
                test_playlists[p] = all_playlists_dict[p]

        missing_pid = {}
        candidates = []
        for p in test_playlists:
            is_candidate = True
            for track in test_playlists[p]['tracks']:
                if track not in all_tracks:
                    is_candidate = False
                    if p not in missing_pid:
                        missing_pid[p] = 1
                    else:
                        missing_pid[p] += 1
            if is_candidate:
                candidates.append(p)

        # do final dev / test split
        dev_test = np.random.choice(candidates, 20000, replace=False)
        dev_test = shuffle(dev_test, random_state=random_state)
        x_dev_pids, x_test_pids = dev_test[:10000], dev_test[10000:]
        print('Storing train, dev and test playlist ids ...')
        store_obj(x_train_pids, x_train_pids_fname, 'pickle')
        store_obj(x_dev_pids, x_dev_pids_fname, 'pickle')
        store_obj(x_test_pids, x_test_pids_fname, 'pickle')
    else:
        x_train_pids = load_obj(x_train_pids_fname, 'pickle')
        x_dev_pids = load_obj(x_dev_pids_fname, 'pickle')
        x_test_pids = load_obj(x_test_pids_fname, 'pickle')

    return x_train_pids, x_dev_pids, x_test_pids
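A possible call site for Example #9, assuming the DataFrame already carries the pid column and the three quantile-class columns used for stratification; the random_state value below is an arbitrary placeholder.

# Hypothetical usage; playlist_df and all_playlists_dict come from the
# earlier preprocessing steps, RESULTS_FOLDER is the shared output directory.
x_train_pids, x_dev_pids, x_test_pids = split_playlist_df(
    playlist_df,
    random_state=2018,
    all_playlists_dict=all_playlists_dict,
    results_folder=RESULTS_FOLDER,
    recompute=True)

print('train / dev / test sizes: {} / {} / {}'.format(
    len(x_train_pids), len(x_dev_pids), len(x_test_pids)))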
Example #10
    if recompute:
        # load playlists
        x_train = []
        print('Working on training set ...')
        for p in all_train_playlists:
            tmp_playlist = all_train_playlists[p]['tracks']
            tmp_playlist.append('<eos>')
            x_train.extend(tmp_playlist)

        print('Extracting sequences and building vocabulary ...')
        track2id, track_sequence = build_vocabulary(x_train)

        print('Filtering sequences ...')
        track2id, track_sequence = filter_sequence(track_sequence, track2id, 5,
                                                   c_set_tracks)

        print('Transforming track-uri sequences in int sequences ...')
        track_sequence = sequences_to_ids(track_sequence, track2id)

        print('Storing id_sequence file ...')
        store_obj(track_sequence, id_sequence_fname, 'pickle')
        print('Storing vocabulary file ...')
        store_obj(track2id, track2id_fname, 'pickle')
    else:
        track_sequence = load_obj(id_sequence_fname, 'pickle')
        track2id = load_obj(track2id_fname, 'pickle')

    ## END OF PRE-PROCESSING
    print('Generated all files for next steps ...')
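build_vocabulary, filter_sequence and sequences_to_ids are repository helpers that are not included in this excerpt. As a rough, simplified illustration of the two steps the print statements describe (building a track-uri vocabulary and turning each '<eos>'-terminated playlist into an integer sequence), they might be sketched as follows; the real implementations may differ:

from collections import Counter


def build_vocabulary(token_stream):
    # Hypothetical sketch: split the flat '<eos>'-separated token stream back
    # into per-playlist sequences and assign one integer id per distinct token.
    sequences, current = [], []
    for token in token_stream:
        current.append(token)
        if token == '<eos>':
            sequences.append(current)
            current = []
    token2id = {token: idx for idx, token in enumerate(Counter(token_stream))}
    return token2id, sequences


def sequences_to_ids(sequences, token2id):
    # Replace every token with its integer id; tokens missing from the
    # vocabulary (e.g. dropped during filtering) are skipped.
    return [[token2id[t] for t in seq if t in token2id] for seq in sequences]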