Example No. 1
def generate_levenshtein_seed_dict(
        zero_seed_playlists,
        all_playlist_names,
        all_playlists_dict,
        playlist_df,
        RESULTS_FOLDER,
        filename,
        recompute,
        seed_k=100):
    fname = os.path.join(RESULTS_FOLDER, filename)
    if recompute:
        comp_memory = {}  # memoizes pairwise name comparisons across playlists
        seed_set = {}
        for idx, playl in enumerate(zero_seed_playlists):
            playlist_name = Levenshtein.pre_process(playl['name'])
            print('\r{:.2f} % :: Retrieving levenshtein similarities for \'{}\''.format(
                ((idx + 1) / len(zero_seed_playlists)) * 100, playlist_name), end='')
            return_dict = {'counter': 0, 'lowest': [], 'targets': []}
            # get_closest fills return_dict as a side effect; the apply result is discarded
            _ = all_playlist_names.apply(
                Levenshtein.get_closest,
                args=(playlist_name, return_dict, comp_memory))
            seeds = Levenshtein.get_seed_tracks(
                playlist_df, return_dict, all_playlists_dict, seed_k=seed_k)
            seed_set[playl['pid']] = [x[0] for x in seeds]

        store_obj(seed_set, fname, 'pickle')
    else:
        seed_set = load_obj(fname, 'pickle')

    return seed_set
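
A hedged usage sketch; all inputs below are assumed to come from the pre-processing artifacts built in the other examples (the zero-seed bucket, the playlist DataFrame, and the playlist lookup dict), and the file name is illustrative:

import pandas as pd

all_playlist_names = pd.Series(
    [p['name'] for p in all_playlists_dict.values()])
seed_set = generate_levenshtein_seed_dict(
    zero_seed_playlists=dev_playlist_dict[0],  # bucket of playlists with no seed tracks
    all_playlist_names=all_playlist_names,
    all_playlists_dict=all_playlists_dict,
    playlist_df=playlist_df,
    RESULTS_FOLDER=RESULTS_FOLDER,
    filename='levenshtein_seed_dict.pckl',  # illustrative file name
    recompute=True)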
Example No. 2
def train_and_predict(df_matrix,
                      dev_set,
                      dev_pidx_row_dict,
                      model_dict,
                      recompute=False,
                      exclude_cold=False):

    prediction_fname = model_dict['prediction_fname']
    model_fname = model_dict['model_fname']

    # define estimator
    als = implicit.als.AlternatingLeastSquares(
        factors=model_dict['factors'],
        regularization=model_dict['regularization'],
        use_gpu=model_dict['use_gpu'],
        calculate_training_loss=model_dict['calculate_training_loss'])

    if recompute:
        print('Fitting model ...')
        als.fit(df_matrix)
        prediction_results = {}
        for key in dev_set.keys():
            if exclude_cold and key == 0:
                continue
            prediction_results[key] = []
            df_len = len(dev_pidx_row_dict[key])
            perc = max(1, df_len // 100)  # guard against modulo-by-zero for buckets with fewer than 100 playlists
            for counter, playlist_row_id in enumerate(dev_pidx_row_dict[key]):
                if counter % perc == 0:
                    print('Predicting: {} % (k = {})'.format(
                        counter / perc, key),
                          end='\r')
                preds = als.recommend(playlist_row_id, df_matrix, N=500)
                prediction_results[key].append(preds)
        with open(os.path.join(baseline_results_folder, prediction_fname),
                  'wb') as f:
            pickle.dump(prediction_results, f)
        with open(os.path.join(baseline_results_folder, model_fname),
                  'wb') as f:
            pickle.dump(als, f)
    else:
        prediction_results = load_obj(
            os.path.join(baseline_results_folder, prediction_fname), 'pickle')
        als = load_obj(os.path.join(baseline_results_folder, model_fname),
                       'pickle')

    return prediction_results, als
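
A hedged call sketch wiring in the hyper-parameter dict from Example No. 9; df_matrix is assumed to be the sparse playlist-track matrix derived from prepare_data_full_cf's output (see Example No. 8):

prediction_results, als = train_and_predict(
    df_matrix,
    dev_set=filled_dev_playlists_dict,      # buckets keyed by seed count (0, 1, 5, ...)
    dev_pidx_row_dict=test_pidx_row_dict,   # maps each bucket to matrix row ids
    model_dict=als_model_dict['one'],
    recompute=True,
    exclude_cold=True)                      # skip the 0-seed bucket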
Example No. 3
    def __init__(self,
                 data,
                 seq_length,
                 n_batch_size,
                 n_vocab,
                 step=5,
                 test=False,
                 store_folder='step_point/'):
        """
        data: can be either training, validation or test data
        seq_length: number of tracks that will be fed into the network
        step: number of words to be skipped over between training samples within each batch
        
        """
        self.data = data
        self.seq_length = seq_length
        self.n_batch_size = n_batch_size
        self.n_vocab = n_vocab
        self.store_folder = store_folder

        if not os.path.exists(self.store_folder):
            os.makedirs(self.store_folder)

        # current_idx will save progress and serve as pointer
        # will reset to 0 once end is reached
        if os.path.exists(
                os.path.join(self.store_folder, 'global_step_point.pckl')):
            self.current_idx = load_obj(
                os.path.join(self.store_folder, 'global_step_point.pckl'),
                'pickle')
        else:
            self.current_idx = 0

        self.step = step
        # calculate steps per epoch
        self.steps_per_epoch = (len(self.data) // self.n_batch_size - 1) // self.step

        # reload or initialize epoch and step counter
        if os.path.exists(
                os.path.join(self.store_folder, 'global_epoch_point.pckl')):
            self.epoch_counter = load_obj(
                os.path.join(self.store_folder, 'global_epoch_point.pckl'),
                'pickle')
        else:
            self.epoch_counter = 0
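
Example No. 17's main() constructs this class as BatchGenerator; a hedged construction sketch, where data and track2int are the artifacts loaded in Example No. 7 and the hyper-parameter values are illustrative:

bg = BatchGenerator(data=data,
                    seq_length=50,       # illustrative value
                    n_batch_size=128,    # illustrative value
                    n_vocab=len(track2int),
                    step=5,
                    store_folder='step_point/')
print(bg.steps_per_epoch, bg.epoch_counter, bg.current_idx)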
Example No. 4
def bucketing_eval_playlists(x_dev_pids, x_test_pids, all_playlists_dict,
                             RESULTS_FOLDER, recompute):
    test_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                            'test_playlist_dict.pckl')
    dev_playlist_dict_fname = os.path.join(RESULTS_FOLDER,
                                           'dev_playlist_dict.pckl')

    if recompute:
        dev_playlists = [all_playlists_dict[pid] for pid in x_dev_pids]
        test_playlists = [all_playlists_dict[pid] for pid in x_test_pids]

        # gather lengths to generate buckets
        dev_lengths = [len(x['tracks']) for x in dev_playlists]
        test_lengths = [len(x['tracks']) for x in test_playlists]

        dev_indices = get_testing_indices(dev_lengths)
        test_indices = get_testing_indices(test_lengths)

        dev_playlist_dict = get_complete_testing_sets(dev_playlists,
                                                      dev_indices)
        test_playlist_dict = get_complete_testing_sets(test_playlists,
                                                       test_indices)

        store_obj(dev_playlist_dict, dev_playlist_dict_fname, 'pickle')
        store_obj(test_playlist_dict, test_playlist_dict_fname, 'pickle')
    else:
        dev_playlist_dict = load_obj(dev_playlist_dict_fname, 'pickle')
        test_playlist_dict = load_obj(test_playlist_dict_fname, 'pickle')

    return dev_playlist_dict, test_playlist_dict
Example No. 5
def get_correspondant_list(pid_to_name, seed_k, results_folder, recompute):
    list_fname = os.path.join(results_folder,
                              'w2v_dev_correspondant_list.pckl')
    probs_fname = os.path.join(results_folder,
                               'w2v_dev_correspondant_list_probas.pckl')

    if recompute:
        correspondant_list_tmp_fname = 'cwva_dev_correspondant_list.csv'
        correspondant_probs_tmp_fname = 'cwva_dev_correspondant_probs.csv'

        correspondant_list = {}
        correspondant_list_probs = {}
        for ix, pid in enumerate(pid_to_name):

            print('Retrieving CWVA for \'{}\' ({:.2f} %)'.format(
                pid_to_name[pid], ((ix + 1) / len(pid_to_name)) * 100),
                  end='\r')
            try:
                playlists, probabilities = get_similar_playlists(
                    pid_to_name[pid], seed_k)
                correspondant_list[pid] = playlists
                correspondant_list_probs[pid] = probabilities
                #write_to_file(pid, playlists, correspondant_list_tmp_fname)
                #write_to_file(pid, probabilities, correspondant_probs_tmp_fname)
            except KeyboardInterrupt:
                break
            except Exception as err:
                print('Something went wrong with playlist: \'{}\' (pid: {}) (Error: {})'.
                      format(pid_to_name[pid], pid, err))
        store_obj(correspondant_list, list_fname, 'pickle')
        store_obj(correspondant_list_probs, probs_fname, 'pickle')
    else:
        correspondant_list = load_obj(list_fname, 'pickle')
        correspondant_list_probs = load_obj(probs_fname, 'pickle')

    return correspondant_list, correspondant_list_probs
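
A hedged call sketch; pid_to_name is assumed to map dev pids to playlist names, built from the artifacts shown in the other examples:

pid_to_name = {pid: all_playlists_dict[pid]['name'] for pid in x_dev_pids}
correspondant_list, correspondant_probs = get_correspondant_list(
    pid_to_name, seed_k=100, results_folder=RESULTS_FOLDER, recompute=True)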
Example No. 6
def generate_all_train_playlist_set(x_train_pids, statistician, results_folder,
                                    recompute):
    all_train_playlist_set_fname = os.path.join(results_folder,
                                                'all_train_playlist_set.pckl')
    if recompute:
        all_train_playlist_set = {
            pid: statistician.all_playlists_dict[pid] for pid in x_train_pids
        }
        store_obj(all_train_playlist_set, all_train_playlist_set_fname,
                  'pickle')
    else:
        all_train_playlist_set = load_obj(all_train_playlist_set_fname,
                                          'pickle')

    return all_train_playlist_set
Example No. 7
import os
import sys

sys.path.append('../')

from collections import Counter
from copy import deepcopy
from keras.utils import to_categorical
from tools.io import extract_pids, load_obj, store_obj, write_recommendations_to_file

print('#' * 80)
print('Track2Seq Model')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

t2s_config = load_obj('config.json', 'json')
input_folder = t2s_config['RESULTS_FOLDER']  # data of pre-processing steps
model_folder = t2s_config[
    'T2S_MODEL_FOLDER']  # where model checkpoints are stored
model_name = t2s_config['T2S_MODEL_NAME']  # name of model
full_model_path = os.path.join(model_folder, model_name)

# generate folder
if not os.path.exists(full_model_path):
    print('Created {} ...'.format(full_model_path))
    os.makedirs(full_model_path)

print('Loading data ...')
data = load_obj(os.path.join(input_folder, 'id_sequence.pckl'), 'pickle')
vocab = load_obj(os.path.join(input_folder, 'track2id.pckl'), 'pickle')
track2int = vocab
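
The scripts in these examples all read the same config.json; a minimal sketch of the keys they reference (every path and value below is a placeholder, not the project's actual setting):

import json

t2s_config = {
    'PLAYLIST_FOLDER': 'data/mpd/',       # placeholder
    'RESULTS_FOLDER': 'results/',         # placeholder
    'T2S_MODEL_FOLDER': 'models/',        # placeholder
    'T2S_MODEL_NAME': 'track2seq_v1',     # placeholder
    'W2V_FNAME': 'w2v/',                  # placeholder
    'W2V_BINARY_FNAME': 'GoogleNews-vectors-negative300.bin',  # placeholder
    'RANDOM_STATE': 2018                  # placeholder
}

with open('config.json', 'w') as f:
    json.dump(t2s_config, f, indent=2)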
Example No. 8
def prepare_data_full_cf(
        df_filename=os.path.join(baseline_results_folder,
                                 'playlist_train.csv'),
        playlist_src_folder=t2s_config['PLAYLIST_FOLDER'],
        item_dict_filename=os.path.join(baseline_results_folder,
                                        'track_uri_to_item_id.pckl'),
        user_dict_filename=os.path.join(baseline_results_folder,
                                        'playlist_id_to_pidx.pckl'),
        test_playlist_fname=os.path.join(t2s_config['RESULTS_FOLDER'],
                                         'filled_dev_playlists_dict.pckl'),
        train_pid_ids_fname=os.path.join(t2s_config['RESULTS_FOLDER'],
                                         'x_train_pids.pckl'),
        test_pidx_row_dict_fname=os.path.join(baseline_results_folder,
                                              'test_pidx_row_dict.pckl'),
        recompute=True):
    """
    Prepares a list of lists where every individual list stores track ids. 
    Also stores pid to match information at a later point.
    
    Parameters:
    --------------
    recompute: bool flag which determines if stored information should be used
    
    Returns:
    --------------
    res_df:       pd.DataFrame, mapping user to item interaction
    item_dict:    dict, item id to track uri
    user_dict:    dict, simplified playlist id to pid
    """

    if recompute:
        counter = 0
        total_files = len(os.listdir(playlist_src_folder))

        list_of_list = []
        item_dict = {}
        user_dict = {}
        item_counter = 0

        playlists_of_tracks_uri = []
        pidx = 0

        train_pid_ids_dict = load_obj(train_pid_ids_fname, dtype='pickle')

        for playlist_json in os.listdir(playlist_src_folder):
            print(
                "Working on slice {} ({:.2f} %) (File Name:  {} || Total Slices: {})"
                .format(counter, (counter / total_files) * 100, playlist_json,
                        total_files),
                end='\r')

            counter += 1
            data_json = load_obj(os.path.join(playlist_src_folder,
                                              playlist_json),
                                 dtype='json')

            for playlist in data_json['playlists']:
                if playlist['pid'] not in train_pid_ids_dict:
                    continue  # filter out any test and dev playlists

                if playlist['pid'] not in user_dict:
                    user_dict[playlist['pid']] = pidx
                    pidx += 1

                for track in playlist['tracks']:
                    if track['track_uri'] in item_dict:
                        track_id = item_dict[track['track_uri']]
                    else:
                        track_id = item_counter
                        item_dict[track['track_uri']] = track_id
                        item_counter += 1
                    list_of_list.append(
                        [user_dict[playlist['pid']], track_id,
                         1])  # pid, track_id, rating

        # add dev set to matrix and dicts
        print('Loading Test/Dev Set...')
        test_pidx_row_dict = {}
        test_set = load_obj(test_playlist_fname, 'pickle')

        for key in [0, 1, 5, 10, 25, 100]:
            list_of_dev_playlists = test_set[key]
            test_pidx_row_dict[key] = []

            for playlist in list_of_dev_playlists:
                if len(playlist['seed']) < 1:
                    continue  # filter out any 0 seed playlists
                if playlist['pid'] not in user_dict:
                    test_pidx_row_dict[key].append(pidx)
                    user_dict[playlist['pid']] = pidx
                    pidx += 1

                for track in playlist['seed']:
                    if track in item_dict:
                        track_id = item_dict[track]
                    else:
                        track_id = item_counter
                        item_dict[track] = track_id
                        item_counter += 1
                    list_of_list.append(
                        [user_dict[playlist['pid']], track_id,
                         1])  # pid, track_id, rating

        print('Storing results ...')
        # store results
        with open(item_dict_filename, 'wb') as f:
            pickle.dump(item_dict, f)
        with open(user_dict_filename, 'wb') as f:
            pickle.dump(user_dict, f)
        with open(test_pidx_row_dict_fname, 'wb') as f:
            pickle.dump(test_pidx_row_dict, f)

        res_df = pd.DataFrame(list_of_list)
        res_df.to_csv(df_filename, sep='\t', index=False, header=False)
    else:
        # load results
        res_df = load_obj(df_filename, dtype='pandas')
        item_dict = load_obj(item_dict_filename, dtype='pickle')
        user_dict = load_obj(user_dict_filename, dtype='pickle')
        test_pidx_row_dict = load_obj(test_pidx_row_dict_fname, dtype='pickle')
    return res_df, item_dict, {v: k
                               for k, v in user_dict.items()
                               }, test_pidx_row_dict
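
res_df holds (pidx, track_id, rating) triples; a hedged sketch of turning it into the sparse matrix the ALS baseline consumes (the playlists-by-tracks orientation is an assumption and may need transposing depending on the implicit version in use):

from scipy.sparse import csr_matrix

res_df.columns = ['pidx', 'track_id', 'rating']
df_matrix = csr_matrix(
    (res_df['rating'].astype(float),
     (res_df['pidx'], res_df['track_id'])))  # playlists x tracks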
Example No. 9
import os

from scipy.sparse import csr_matrix, lil_matrix
from tools.io import load_obj
from tools.metrics import recsys_metrics

##################################################################
############################## SETUP #############################
##################################################################

recompute = True
baseline_results_folder = 'baselines/'
dev_playlist_fname = 'results/dev_playlist_dict.pckl'

if not os.path.exists(baseline_results_folder):
    os.makedirs(baseline_results_folder)

t2s_config = load_obj('config.json', 'json')

##################################################################
######################### HYPER PARAMETERS #######################
##################################################################

# define hyper-parameter for alternating least-squares model
als_model_dict = {
    'one': {
        'factors': 30,
        'regularization': 0.01,
        'use_gpu': True,
        'calculate_training_loss': True,
        'model_fname': 'model_wmf_30_001_18_04_13.pckl',
        'prediction_fname': 'prediction_wmf_30_001_18_04_13.pckl'
    },
Example No. 10
import numpy as np
import string
import pandas as pd

from collections import Counter
from tools.io import load_obj, store_obj, extract_pids, write_to_file

print('#' * 80)
print('Track2Seq CWVA Seeds')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

t2s_config = load_obj(
    'config.json',
    'json')  # all configuration files can be set manually as well
PLAYLIST_FOLDER = t2s_config[
    'PLAYLIST_FOLDER']  # set folder of playlist information
RESULTS_FOLDER = t2s_config[
    'RESULTS_FOLDER']  # all information will be stored here
W2V_FOLDER = t2s_config['W2V_FNAME']
RANDOM_STATE = t2s_config['RANDOM_STATE']
recompute = True

np.random.seed(RANDOM_STATE)

# download `GoogleNews-vectors-negative300.bin.gz` from
# https://github.com/mmihaltz/word2vec-GoogleNews-vectors
w2v_fname = t2s_config['W2V_BINARY_FNAME']
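
Loading the pre-trained vectors is typically done with gensim; this excerpt does not show the loader, so the following is an assumption:

from gensim.models import KeyedVectors

# Assumes gensim is what the project uses to read the word2vec binary.
w2v_model = KeyedVectors.load_word2vec_format(w2v_fname, binary=True)
print(w2v_model.most_similar('music', topn=5))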
Example No. 11
    def get_playlist_df(self, recompute):
        """
        Method that iterates over a playlist collection and retrieves all potential information 
        to store in one list of lists. This list can be used to create a well-formed pandas
        DataFrame.
        
        Parameters:
        ---------------
        columns: list storing all available and additional features for playlists
        artist_popularity_dict: lookup dict for artist popularity metrics
        artist_popularity_dict: lookup dict for track popularity metrics
        playlist_collection: retrieved playlist json
        
        Returns:
        ---------------
        tmp_playlist_list: list of lists every list containing features of a playlist
        columns: list of column names
        """
        playlist_df_fname = os.path.join(self.results_folder,
                                         'playlist_df.csv')

        if not os.path.exists(playlist_df_fname) or recompute:

            # check if popularity dict has been created and loaded
            if not self.track_popularity_dict:
                _ = self.create_track_popularity_dict(recompute)

            playlist_popularity = []
            for playlist_coll_fname in self.all_playlist_filenames:
                tmp_playlist_list = []
                playlist_coll = load_obj(playlist_coll_fname, 'json')
                for playlist in playlist_coll['playlists']:
                    tmp_track_pop = []
                    track_count = 0
                    columns = [
                        x for x in playlist.keys() if 'tracks' not in str(x)
                        and 'description' not in str(x)
                    ]
                    columns.extend([
                        'track_popularity_median', 'description', 'num_tracks'
                    ])
                    tmp_playlist_features = [
                        playlist[x] for x in playlist.keys() if
                        'tracks' not in str(x) and 'description' not in str(x)
                    ]
                    for track in playlist['tracks']:
                        track_count += 1
                        track_uri = track['track_uri']
                        tmp_track_pop.append(
                            self.track_popularity_dict[track_uri])
                    tmp_playlist_features.extend([np.median(tmp_track_pop)])
                    tmp_playlist_features.append(
                        playlist['description'] if 'description' in
                        playlist.keys() else None)
                    tmp_playlist_features.append(track_count)
                    tmp_playlist_list.append(tmp_playlist_features)
                playlist_popularity.extend(tmp_playlist_list)

            self.playlist_df = pd.DataFrame(playlist_popularity,
                                            columns=columns)
            # store DataFrame to HDD
            self.playlist_df.to_csv(playlist_df_fname)
        else:
            self.playlist_df = pd.read_csv(playlist_df_fname, index_col=0)

        return self.playlist_df
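
A hedged usage sketch, assuming the class instance is the statistician object referenced in the other examples:

playlist_df = statistician.get_playlist_df(recompute=True)
print(playlist_df[['name', 'num_tracks', 'track_popularity_median']].head())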
Example No. 12
    def track_uri_to_artist_and_title(self, uri):
        if not self.uri_dict:
            print('Loading URI dict...')
            self.uri_dict = load_obj(self.uri_dict_fname, 'pickle')
        return self.uri_dict[uri]
Example No. 13
import numpy as np
import os
import pandas as pd

from collections import Counter
from tools.io import load_obj, store_obj

print('#' * 80)
print('Track2Seq Levenshtein Seeds')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

t2s_config = load_obj('config.json', 'json')  # all configuration files can be set manually as well
RESULTS_FOLDER = t2s_config['RESULTS_FOLDER']  # all information will be stored here
RANDOM_STATE = t2s_config['RANDOM_STATE']
recompute = True  

np.random.seed(RANDOM_STATE)

##################################################################
############################# METHODS ############################
##################################################################

class Levenshtein(object):
    def __init__(self):
        self.version = '0.1'

    @staticmethod
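
For reference, the edit distance this class is named for can be computed with the classic two-row dynamic program. A generic sketch, not the project's own pre_process/get_closest code:

def levenshtein_distance(a, b):
    """Classic dynamic-programming edit distance (insert/delete/substitute)."""
    if len(a) < len(b):
        a, b = b, a
    previous = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        current = [i]
        for j, cb in enumerate(b, start=1):
            current.append(min(previous[j] + 1,                  # deletion
                               current[j - 1] + 1,               # insertion
                               previous[j - 1] + (ca != cb)))    # substitution
        previous = current
    return previous[-1]

# levenshtein_distance('kitten', 'sitting') == 3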
Example No. 14
    def create_track_popularity_dict(self, recompute=False):
        """
        Iteration method leveraging count_artists_and_tracks method 
        to aggregate information out of all playlist collections.
        
        Parameters:
        --------------
        recompute:    bool flag determining whether precomputed results should be used or not
        
        Returns:
        --------------
        track_popularity_dict:     dict mapping track uris to their popularity count in all playlists
        """
        track_popularity_dict_fname = os.path.join(
            self.results_folder, 'track_popularity_dict.pckl')
        all_playlists_dict_fname = os.path.join(self.results_folder,
                                                'all_playlists_dict.pckl')
        track_uri_to_track_artist_string_fname = os.path.join(
            self.results_folder, 'track_uri_to_track_artist_string.pckl')

        if not os.path.exists(track_popularity_dict_fname) or recompute:
            track_uri_to_track_artist_string = {}  # TODO: fill with goods
            track_popularity_dict = {}
            total_files = len(self.all_playlist_filenames)
            counter = 0
            for playlist_file in self.all_playlist_filenames:
                counter += 1
                print(
                    "Working on slice {} ({:.2f} %) (File Name:  {} || Total Slices: {})"
                    .format(counter, (counter / total_files) * 100,
                            playlist_file, total_files),
                    end='\r')
                playlist_collection = load_obj(playlist_file, 'json')
                for playlist in playlist_collection['playlists']:

                    self.all_playlists_dict[playlist['pid']] = {
                        'pid': playlist['pid'],
                        'name': playlist['name'],
                        'tracks': []
                    }

                    for t in playlist['tracks']:
                        track_uri = t['track_uri']
                        # create popularity dict
                        if track_uri in track_popularity_dict:
                            track_popularity_dict[track_uri] += 1
                        else:
                            track_popularity_dict[track_uri] = 1

                        # create all playlist dict
                        self.all_playlists_dict[
                            playlist['pid']]['tracks'].append(track_uri)

            # store dict
            print('\nStoring all_playlist and popularity dicts ...')
            store_obj(track_popularity_dict, track_popularity_dict_fname,
                      'pickle')
            store_obj(self.all_playlists_dict, all_playlists_dict_fname,
                      'pickle')
            self.track_popularity_dict = track_popularity_dict
        else:
            self.track_popularity_dict = load_obj(track_popularity_dict_fname,
                                                  'pickle')
            self.all_playlists_dict = load_obj(all_playlists_dict_fname,
                                               'pickle')

        return self.track_popularity_dict
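
The manual popularity counting above is equivalent to a collections.Counter update (Counter is already imported in the surrounding scripts); a compact sketch of the same aggregation:

from collections import Counter

track_popularity = Counter()
for playlist in playlist_collection['playlists']:
    track_popularity.update(t['track_uri'] for t in playlist['tracks'])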
Example No. 15
def split_playlist_df(df,
                      random_state,
                      all_playlists_dict,
                      results_folder,
                      recompute=False):
    x_train_pids_fname = os.path.join(results_folder, 'x_train_pids.pckl')
    x_dev_pids_fname = os.path.join(results_folder, 'x_dev_pids.pckl')
    x_test_pids_fname = os.path.join(results_folder, 'x_test_pids.pckl')

    if recompute:
        # To satisfy the second criterion (all tracks in the dev
        # and test sets must also appear in the training set),
        # a bigger split is produced first.

        X_train_full, X_test = train_test_split(
            df,
            test_size=.1,
            random_state=random_state,
            stratify=df[[
                'track_popularity_median_class_quantile',
                'num_tracks_class_quantile', 'modified_at_class_quantile'
            ]])

        # filter playlists for rare tracks that occur in one set but not the other
        x_train_pids = X_train_full.pid.values
        x_test_pids = X_test.pid.values
        # sets give O(1) membership tests instead of O(n) scans over numpy arrays
        train_pid_set = set(x_train_pids)
        test_pid_set = set(x_test_pids)

        all_tracks = set()
        test_playlists = {}

        for p in all_playlists_dict:
            if p in train_pid_set:
                for track in all_playlists_dict[p]['tracks']:
                    all_tracks.add(track)
            elif p in test_pid_set:
                test_playlists[p] = all_playlists_dict[p]

        missing_pid = {}
        candidates = []
        for p in test_playlists:
            is_candidate = True
            for track in test_playlists[p]['tracks']:
                if track not in all_tracks:
                    is_candidate = False
                    if p not in missing_pid:
                        missing_pid[p] = 1
                    else:
                        missing_pid[p] += 1
            if is_candidate:
                candidates.append(p)

        # do final dev / test split
        dev_test = np.random.choice(candidates, 20000, replace=False)
        dev_test = shuffle(dev_test, random_state=random_state)
        x_dev_pids, x_test_pids = dev_test[:10000], dev_test[10000:]
        print('Storing train, dev and test playlist ids ...')
        store_obj(x_train_pids, x_train_pids_fname, 'pickle')
        store_obj(x_dev_pids, x_dev_pids_fname, 'pickle')
        store_obj(x_test_pids, x_test_pids_fname, 'pickle')
    else:
        x_train_pids = load_obj(x_train_pids_fname, 'pickle')
        x_dev_pids = load_obj(x_dev_pids_fname, 'pickle')
        x_test_pids = load_obj(x_test_pids_fname, 'pickle')

    return x_train_pids, x_dev_pids, x_test_pids
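
A hedged call sketch; playlist_df is assumed to already carry the quantile class columns used in the stratify argument above:

x_train_pids, x_dev_pids, x_test_pids = split_playlist_df(
    playlist_df,
    random_state=RANDOM_STATE,
    all_playlists_dict=statistician.all_playlists_dict,
    results_folder=RESULTS_FOLDER,
    recompute=True)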
Example No. 16
    def artist_uri_to_artist_string(self, uri):
        if not self.artist_uri_dict:
            print('Loading Artist URI dict...')
            self.artist_uri_dict = load_obj(self.artist_uri_dict_fname,
                                            'pickle')
        return self.artist_uri_dict[uri]
Example No. 17
def main():
    # in case a specific GPU should be used
    #gpu_options = tf.GPUOptions(visible_device_list='0')
    #config = tf.ConfigProto(gpu_options=gpu_options)
    #sess = tf.Session(config=config)

    sess = tf.Session()

    # initialize data generator
    n_vocab = len(int2track)
    bg = BatchGenerator(data=data,
                        seq_length=seq_length,
                        n_batch_size=n_batch_size,
                        n_vocab=n_vocab,
                        step=skips,
                        store_folder=os.path.join(full_model_path,
                                                  'step_point'))

    current_epoch = bg.epoch_counter

    # initialize model for training
    model = Seq2Track(n_batch_size=n_batch_size,
                      seq_length=seq_length,
                      n_vocab=n_vocab,
                      n_layers=n_layers,
                      latent_size=latent_size)

    # initialize model for prediction
    # reusing scope for recommendations
    with tf.variable_scope(tf.get_variable_scope(), reuse=True):
        pred_model = Seq2Track(n_batch_size=n_batch_size,
                               seq_length=seq_length,
                               n_vocab=n_vocab,
                               n_layers=n_layers,
                               latent_size=latent_size,
                               recommendation=True)

    # pick up the process where we left off - if possible
    saver = tf.train.Saver(tf.global_variables())
    init_operation = tf.global_variables_initializer()
    sess.run(init_operation)

    # check if a model exists, if so - load it
    if os.path.exists(os.path.join(full_model_path, 'checkpoint')):
        saver.restore(sess, tf.train.latest_checkpoint(full_model_path))

    # training routine
    if training:
        # run epochs
        for e in range(current_epoch, epochs):
            avg_epoch_cost = []  # store average cost per epoch

            # for any epoch initialize state as zeros
            current_state = np.zeros((n_layers, 2, n_batch_size, latent_size))
            batch_iter = bg.generate()  # one generator per epoch; resumes from bg.current_idx
            for step in range(bg.current_idx, bg.steps_per_epoch):
                X_batch, y_batch = next(batch_iter)  # fetch the next training batch

                if step % 10 == 0 and step % 1000 != 0:  # show progress every 10 steps; multiples of 1000 fall through below
                    start_time = time.time()
                    cost, _, current_state = sess.run(
                        [model.cost, model.training_op, model.state],
                        feed_dict={
                            model.X: X_batch,
                            model.y: y_batch,
                            model.initial_state: current_state
                        })
                    avg_epoch_cost.append(cost)
                    end_time = (time.time() - start_time)
                    print('Epoch: {} - Step: {} / {} - Cost: {} - Time: {}s'.
                          format(e, step, bg.steps_per_epoch,
                                 np.mean(avg_epoch_cost), end_time))

                elif step % 1000 == 0:  # show recommendation examples every 1000 steps
                    start_time = time.time()

                    cost, _, current_state, acc = sess.run(
                        [
                            model.cost, model.training_op, model.state,
                            model.accuracy
                        ],
                        feed_dict={
                            model.X: X_batch,
                            model.y: y_batch,
                            model.initial_state: current_state
                        })

                    # Compute cost and accuracy
                    avg_epoch_cost.append(cost)
                    end_time = (time.time() - start_time)
                    print(
                        'Epoch: {} - Step: {} / {} - Cost: {} - Accuracy: {} - Time: {}s'
                        .format(e, step, bg.steps_per_epoch,
                                np.mean(avg_epoch_cost), acc, end_time))

                    # Show recommendations
                    # can be changed to incorporate any track that's in int2track
                    sample_seed_sequence = [
                        'spotify:track:14AaSKhUMiR5qbNvhjlj9L',
                        'spotify:track:2tznHmp70DxMyr2XhWLOW0',
                        'spotify:track:0uqPG793dkDDN7sCUJJIVC'
                    ]

                    print('Seeds: {}'.format(sample_seed_sequence))
                    results = pred_model.recommend(sess,
                                                   sample_seed_sequence,
                                                   int2track,
                                                   track2int,
                                                   n=500)
                    print('Recommendations: {}'.format(list(results)))

                else:
                    cost, _, current_state = sess.run(
                        [model.cost, model.training_op, model.state],
                        feed_dict={
                            model.X: X_batch,
                            model.y: y_batch,
                            model.initial_state: current_state
                        })
                    avg_epoch_cost.append(cost)

                # Save the model and the vocab
                if step != 0 and step % save_steps == 0:
                    # Save model
                    bg.store_step_counter(step)
                    bg.store_epoch_counter(e)

                    model_file_name = os.path.join(full_model_path, 'model')
                    saver.save(sess, model_file_name, global_step=step)
                    print('Model Saved To: {}'.format(model_file_name))
        # if epoch is over
        bg.store_epoch_counter(e)
        bg.current_idx = 0
        bg.store_step_counter(0)
        model_file_name = os.path.join(full_model_path, 'model')
        saver.save(sess, model_file_name, global_step=step)
        print('Model Saved To: {}'.format(model_file_name))

    else:
        pid_collection = extract_pids(result_fname)
        all_challenge_playlists = load_obj(evaluation_set_fname, 'pickle')

        init = tf.global_variables_initializer()
        sess.run(init)
        if os.path.exists(os.path.join(full_model_path, 'checkpoint')):
            saver.restore(sess, tf.train.latest_checkpoint(full_model_path))

        num_playlists = 0
        for k in all_challenge_playlists:
            num_playlists += len(all_challenge_playlists[k])

        print(
            'Recommending tracks for {:,} playlists...'.format(num_playlists))

        avg_time = []
        for k in all_challenge_playlists:
            for ix, playlist in enumerate(all_challenge_playlists[k]):
                start_wall_time = time.time()

                if playlist['pid'] in pid_collection:
                    continue
                reco_per_playlist = []

                try:
                    reco_per_playlist = pred_model.recommend(sess,
                                                             playlist['seed'],
                                                             int2track,
                                                             track2int,
                                                             n=600)
                    if not reco_per_playlist:
                        print('Something went wrong with playlist {}'.format(
                            playlist['pid']))
                        continue
                except KeyboardInterrupt:
                    sys.exit()
                except Exception as err:
                    print('Something went wrong with playlist {} (Error: {})'.
                          format(playlist['pid'], err))
                    continue

                # store recommendations
                reco_per_playlist = reco_per_playlist[:500]
                pid_collection.append(playlist['pid'])
                time_elapsed = time.time() - start_wall_time
                avg_time.append(time_elapsed)

                print(
                    'Recommended {} songs ({} / {}). Avg time per playlist: {:.2f} seconds.'
                    .format(len(reco_per_playlist), ix, num_playlists,
                            np.mean(avg_time)))

                write_recommendations_to_file(challenge_track, team_name,
                                              contact_info, playlist['pid'],
                                              reco_per_playlist, result_fname)

                with open(result_fname, 'a') as f:
                    f.write(str(playlist['pid']) + ', ')
                    f.write(', '.join(reco_per_playlist))
                    f.write('\n\n')
Example No. 18
import numpy as np

from collections import Counter
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tools.io import load_obj, store_obj

print('#' * 80)
print('Track2Seq Preprocessing')
print('#' * 80)

##################################################################
############################## SETUP #############################
##################################################################

t2s_config = load_obj(
    'config.json',
    'json')  # all configuration files can be set manually as well
PLAYLIST_FOLDER = t2s_config[
    'PLAYLIST_FOLDER']  # set folder of playlist information
RESULTS_FOLDER = t2s_config[
    'RESULTS_FOLDER']  # all information will be stored here
RANDOM_STATE = t2s_config['RANDOM_STATE']
recompute = True

np.random.seed(RANDOM_STATE)

##################################################################
############################# METHODS ############################
##################################################################