Example #1
 def get_r_hat(self, load_from_file=False, path=''):
     """
     :param load_from_file: if True, load the matrix from the file it was previously saved to
     :param path: path where the matrix has been saved
     -------
     :return: the estimated URM from the recommender
     """
     r_hat = data.get_empty_urm()
     r_hat[data.get_target_playlists()] = self.U[data.get_target_playlists()].dot(self.s_Vt)
     return r_hat
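
The row-filling pattern above — compute scores only for the target rows and leave the rest of the estimated URM empty — can be reproduced standalone. A minimal sketch with toy factors (not the project's code; `lil_matrix` is used because row assignment into a CSR matrix raises a `SparseEfficiencyWarning`):

import numpy as np
import scipy.sparse as sps

n_playlists, n_tracks, n_factors = 6, 8, 3
U = np.random.rand(n_playlists, n_factors)       # user factors
s_Vt = np.random.rand(n_factors, n_tracks)       # sigma * Vt
target_rows = [1, 3, 4]                          # stand-in for data.get_target_playlists()

r_hat = sps.lil_matrix((n_playlists, n_tracks))  # stand-in for data.get_empty_urm()
r_hat[target_rows] = U[target_rows].dot(s_Vt)    # fill only the target rows
r_hat = r_hat.tocsr()
print(r_hat.shape, r_hat.nnz)                    # (6, 8) 24
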
Example #2
 def get_r_hat(self):
     """
     compute the r_hat for the model
     :return: r_hat, computed only for the target playlists
     """
     if self.W_sparse is None:
         log.error(
             'the recommender has not been trained, call the fit() method to compute W'
         )
     r_hat = data.get_empty_urm()
     r_hat[data.get_target_playlists()] = self.URM_train[
         data.get_target_playlists()].dot(self.W_sparse)
     return r_hat
Example #3
def cluster_users_by_interactions_count(clip):
    """
    Split the playlists based on interactions count above or below the specified clip value.
    
    Parameters
    ----------
    clip : (int), clip value for splitting. The playlists will be split into 2 groups:
        those with an interactions count <= clip and those with an interactions count > clip.

    Returns
    -------
    2 lists of playlists ids
    """
    playlists = data.get_playlists_df()

    target_playlist = pd.DataFrame({'playlist_id':data.get_target_playlists()})
    target_playlist['index'] = target_playlist.index

    counts = target_playlist.merge(playlists).groupby(['playlist_id', 'index']).size().reset_index(name='counts')

    #counts = counts.reset_index()
    #counts.columns[2] = 'index'
    #counts['index'] = counts.index

    # build dataframe of number of interactions: playlist_id | tracks_count
    #counts = playlists.groupby('playlist_id').size().reset_index(name='counts')

    # split based on the interactions counts
    return counts[counts['counts']<=clip]['index'].values, counts[counts['counts']>clip]['index'].values
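
The same clip-based split can be checked on toy data, without the project's `data` module (a sketch; `playlists` stands in for the playlist_id/track_id interactions DataFrame):

import pandas as pd

playlists = pd.DataFrame({
    'playlist_id': [0, 0, 0, 1, 2, 2, 2, 2],
    'track_id':    [5, 6, 7, 5, 1, 2, 3, 4],
})
clip = 3

counts = playlists.groupby('playlist_id').size().reset_index(name='counts')
sparse_ids = counts[counts['counts'] <= clip]['playlist_id'].values
dense_ids = counts[counts['counts'] > clip]['playlist_id'].values
print(sparse_ids, dense_ids)   # [0 1] [2]
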
Example #4
    def process(self, df):
        """
        From each non-sequential playlist, removes the percentage of songs specified in the constructor by randomly
        picking songs inside the playlist. From each sequential playlist, removes just the last songs.
        This is done to reproduce the train-test split used on Kaggle.
        See https://www.kaggle.com/c/recommender-system-2018-challenge-polimi/discussion/69325

        @Param
        df:             (pandas DataFrame) dataframe associated to train.csv

        @Output
        df:             the dataframe from which the picked songs have been removed
        """
        seq_l = d.get_target_playlists()[0:d.N_SEQUENTIAL]
        non_seq_l = list(set(d.get_all_playlists()) - set(seq_l))

        seq_df = df[df.playlist_id.isin(seq_l)]
        non_seq_df = df[df.playlist_id.isin(non_seq_l)]

        seq_df_dropped = seq_df.groupby('playlist_id').apply(
            lambda x: x.iloc[:-math.floor(len(x) * self.perc)]).reset_index(
                drop=True)
        non_seq_df_dropped = non_seq_df.groupby('playlist_id').apply(
            lambda x: x.drop(x.sample(n=math.floor(len(x) * self.perc)).index))
        return pd.concat([seq_df_dropped,
                          non_seq_df_dropped]).sort_values(by='playlist_id',
                                                           kind='mergesort')
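
Both drop strategies can be verified in isolation. A toy sketch with `perc = 0.5` (note that `math.floor(len(x) * perc) == 0` would make `iloc[:-0]` drop the whole group, so very short playlists need care):

import math
import pandas as pd

df = pd.DataFrame({
    'playlist_id': [0, 0, 0, 0, 1, 1, 1, 1],
    'track_id':    [10, 11, 12, 13, 20, 21, 22, 23],
})
perc = 0.5

# sequential strategy: drop the tail of each playlist
seq_dropped = df.groupby('playlist_id').apply(
    lambda x: x.iloc[:-math.floor(len(x) * perc)]).reset_index(drop=True)

# non-sequential strategy: drop random rows of each playlist
rnd_dropped = df.groupby('playlist_id').apply(
    lambda x: x.drop(x.sample(n=math.floor(len(x) * perc)).index)).reset_index(drop=True)

print(seq_dropped['track_id'].tolist())   # [10, 11, 20, 21]
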
Example #5
    def validate(self, factors_array, iteration_array, urm_train=data.get_urm_train_1(), urm_test=data.get_urm_test_1(), verbose=True,
                 write_on_file=True, userids=data.get_target_playlists(), N=10, filter_already_liked=True, items_to_exclude=[]):

        #create the initial model
        recommender = Pure_SVD()

        path = 'validation_results/'
        name = 'pure_SVD'
        folder = time.strftime('%d-%m-%Y')
        filename = '{}/{}/{}{}.csv'.format(path, folder, name, time.strftime('_%H-%M-%S'))
        # create dir if not exists
        os.makedirs(os.path.dirname(filename), exist_ok=True)

        with open(filename, 'w') as out:
            for f in factors_array:
                for i in iteration_array:
                    #train the model with the parameters
                    if verbose:
                        print('\n\nTraining PURE_SVD with\n Factors: {}\n Iteration: {}\n'.format(f, i))
                        print('\n training phase...')
                    recommender.fit(urm_train=urm_train, num_factors=f, iteration=i)

                    #get the recommendations from the trained model
                    recommendations = recommender.recommend_batch(userids=userids, N=N, filter_already_liked=filter_already_liked,
                                                                  items_to_exclude=items_to_exclude)
                    #evaluate the model with map10
                    map10 = recommender.evaluate(recommendations, test_urm=urm_test)
                    if verbose:
                        print('map@10: {}'.format(map10))

                    # write results to an external file in the validation_results folder
                    if write_on_file:
                        out.write('\n\nFactors: {}\n Iteration: {}\n evaluation map@10: {}'.format(f, i, map10))
Example #6
def cluster_ensemble(clip, path_sparse, path_dense):
    sparse_pl, dense_pl = cluster.cluster_users_by_interactions_count(clip=clip)

    log.success('Cluster 1 (interactions count <= {}): {} playlists'.format(clip, len(sparse_pl)))
    log.success('Cluster 2 (interactions count  > {}): {} playlists'.format(clip, len(dense_pl)))

    # filter target playlists from the 2 clusters
    s1 = set(sparse_pl)
    s2 = set(dense_pl)
    s_target = set(data.get_target_playlists())
    s1_target = s1 & s_target
    s2_target = s2 & s_target

    sparse_pl = pd.DataFrame({'playlist_id':list(s1_target)})
    dense_pl = pd.DataFrame({'playlist_id': list(s2_target)})

    df_sparse = pd.read_csv(path_sparse)
    df_dense = pd.read_csv(path_dense)

    cluster1 = df_sparse.merge(sparse_pl)
    cluster2 = df_dense.merge(dense_pl)

    final = pd.concat([cluster1, cluster2])
    final.to_csv(path_or_buf='submissions/cluster_ensemble' + t.strftime('_%H-%M-%S'), index=False)
Example #7
    def get_r_hat(self):
        """
        compute the r_hat for the model filled with zeros in playlists not target
        :return  r_hat
        """

        if self.user_vecs is None:
            log.error('the recommender has not been trained, call the fit() method')

        r_hat = data.get_empty_urm()
        # densify to allow row assignment (memory-heavy for large URMs), then back to CSR
        r_hat = r_hat.todense()
        r_estimated = np.dot(self.user_vecs[data.get_target_playlists()], self.item_vecs.T)
        r_hat[data.get_target_playlists()] = r_estimated
        r_hat = sps.csr_matrix(r_hat)
        print('saving matrix')
        return r_hat
Example #8
def histogram_of_interactions():
    """
    Plot the histogram of the interactions counts:
    x axis: interactions
    y axis: count of playlists with that number of interactions
    """
    playlists = data.get_playlists_df()
    target_playlist = pd.DataFrame(
        {'playlist_id': data.get_target_playlists()})

    counts = playlists.merge(target_playlist).groupby(
        'playlist_id').size().reset_index(name='interactions')

    # plot counts for each playlist
    #counts.plot(x='playlist_id', y='interactions', kind='scatter', figsize=(200,100))

    hist = counts.groupby('interactions').size().reset_index(name='counts')
    hist.plot(x='interactions',
              y='counts',
              kind='bar',
              fontsize=7,
              figsize=(150, 100))

    # plot histogram
    plt.show(block=True)
Example #9
    def fit(self,
            URM,
            n_factors=10,
            learning_rate=1e-4,
            epochs=10,
            user_regularization=0.001,
            positive_item_regularization=0.001,
            negative_item_regularization=0.001,
            evaluate_every=1):
        self.URM = URM
        self.epochs = epochs
        self.n_users = self.URM.shape[0]
        self.n_items = self.URM.shape[1]

        e = MFBPR_Epoch(
            URM,
            n_factors=n_factors,
            learning_rate=learning_rate,
            user_regularization=user_regularization,
            positive_item_regularization=positive_item_regularization,
            negative_item_regularization=negative_item_regularization)
        print('Fitting MFBPR...')

        for numEpoch in range(self.epochs):
            print('Epoch:', numEpoch)
            e.epochIteration()
            if (numEpoch + 1) % evaluate_every == 0:
                self.user_factors, self.item_factors = e.get_user_item_factors(
                )
                recs = self.recommend_batch(userids=d.get_target_playlists())
                self.evaluate(recs, d.get_urm_test_1())

        self.user_factors, self.item_factors = e.get_user_item_factors()

        # let's see how well it performs on the test set:
        # positive sample: an item that is in the test set but not in the training set
        trials = 10000
        count_wrong = 0
        test = d.get_urm_test_1()
        for _ in range(trials):
            user_id = np.random.choice(self.n_users)
            user_seen_items = d.get_urm()[user_id, :].indices
            test_items = test[user_id, :].indices
            pos_item_id = np.random.choice(test_items)
            neg_item_selected = False
            while (not neg_item_selected):
                neg_item_id = np.random.randint(0, self.n_items)
                if (neg_item_id not in user_seen_items):
                    neg_item_selected = True
            xui = np.dot(self.user_factors[user_id, :],
                         self.item_factors[pos_item_id, :])
            xuj = np.dot(self.user_factors[user_id, :],
                         self.item_factors[neg_item_id, :])
            xuij = xui - xuj
            if xuij < 0:
                count_wrong += 1
            # print('u: {}, i: {}, j: {}. xui - xuj: {}'.format(user_id, pos_item_id, neg_item_id, xuij))
        print('percentage of wrong preferences in the test set: {}'.format(
            count_wrong / trials))
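
The trailing loop is effectively a sampled estimate of 1 − AUC: it counts how often a (positive, negative) item pair is ranked the wrong way, where the BPR score difference is x_uij = x_ui − x_uj. A compact standalone sketch of one such check (toy factors and hypothetical indices, not the project's data):

import numpy as np

rng = np.random.default_rng(0)
user_factors = rng.normal(size=(4, 3))
item_factors = rng.normal(size=(10, 3))

u, pos, neg = 0, 2, 7                      # sampled user, positive and negative items
x_ui = user_factors[u] @ item_factors[pos]
x_uj = user_factors[u] @ item_factors[neg]
x_uij = x_ui - x_uj                        # BPR wants x_uij > 0
print('wrong preference' if x_uij < 0 else 'correct preference')
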
Example #10
    def run(self,
            epochs=70,
            batch_size=1000,
            lambda_i=0.0,
            lambda_j=0.0,
            learning_rate=0.01,
            topK=1500,
            sgd_mode='adagrad',
            export_results=True,
            export_r_hat=False):
        """
        meant as a shortcut to run the model after the validation procedure,
        allowing the export of the scores on the playlists or of the estimated csr matrix

        :param epochs(int)
        :param batch_size(int) after how many items the params should be updated
        :param lambda_i(float) first regularization term
        :param lambda_j(float) second regularization term
        :param learning_rate(float) algorithm learning rate
        :param topK(int) how many elements should be taken into account while computing URM*W
        :param sgd_mode(string) optimization algorithm
        :param export_results(bool) export a ready-to-submit csv with the predicted songs for each playlist
        :param export_r_hat(bool) whether to export the estimated csr matrix
        """

        self.fit(URM_train=d.get_urm(),
                 epochs=epochs,
                 URM_test=None,
                 user_ids=None,
                 batch_size=batch_size,
                 validate_every_N_epochs=1,
                 start_validation_after_N_epochs=epochs + 1,
                 lambda_i=lambda_i,
                 lambda_j=lambda_j,
                 learning_rate=learning_rate,
                 topK=topK,
                 sgd_mode=sgd_mode)
        if export_results:
            print('exporting results')
            recs = self.recommend_batch(d.get_target_playlists(),
                                        N=10,
                                        urm=d.get_urm(),
                                        filter_already_liked=True,
                                        with_scores=False,
                                        items_to_exclude=[],
                                        verbose=False)
            importexport.exportcsv(
                recs, 'submission',
                self._print(epochs=epochs,
                            batch_size=batch_size,
                            lambda_i=lambda_i,
                            lambda_j=lambda_j,
                            learning_rate=learning_rate,
                            topK=topK,
                            sgd_mode=sgd_mode))
        elif export_r_hat:
            print('saving estimated urm')
            self.save_r_hat()
Example #11
    def fit(self, URM_train=d.get_urm_train_1(), epochs=190, URM_test=d.get_urm_test_1(),
            user_ids=d.get_target_playlists(),
            batch_size=1000, validate_every_N_epochs=2, start_validation_after_N_epochs=191, lambda_i=0.0,
            lambda_j=0.0, learning_rate=0.01, topK=1500, sgd_mode='adagrad'):

        """
        train the model finding matrix W
        :param epochs(int)
        :param batch_size(int) after how many items the params should be updated
        :param lambda_i(float) first regularization term
        :param lambda_j(float) second regularization term
        :param learning_rate(float) algorithm learning rate
        :param topK(int) how many elements should be taken into account while computing URM*W
        :param sgd_mode(string) optimization algorithm
        :param URM_train(csr_matrix) the URM used to train the model. Either the full or the validation one
        :param URM_test(csr_matrix) needed if we'd like to perform validation
        :param user_ids(list) needed if we'd like to perform validation
        :param validate_every_N_epochs(int) how often the MAP evaluation should be displayed
        :param start_validation_after_N_epochs(int)
        """

        self.URM_train = URM_train.T
        self.n_users = URM_train.shape[0]
        self.n_items = URM_train.shape[1]

        self.sgd_mode = sgd_mode

        print('before cython')

        from cythoncompiled.SLIM_BPR.SLIM_BPR_Cython_Epoch import SLIM_BPR_Cython_Epoch

        self.cythonEpoch = SLIM_BPR_Cython_Epoch(self.URM_train,
                                                 sparse_weights=False,
                                                 topK=topK,
                                                 learning_rate=learning_rate,
                                                 li_reg=lambda_i,
                                                 lj_reg=lambda_j,
                                                 batch_size=100,
                                                 symmetric=True,
                                                 sgd_mode=sgd_mode)
        print('after cython')

        # Call the super fit to start training
        self._fit_alreadyInitialized(epochs=epochs,
                                     logFile=None,
                                     URM_test=URM_test,
                                     user_ids=user_ids,
                                     filterTopPop=False,
                                     minRatingsPerUser=1,
                                     batch_size=batch_size,
                                     validate_every_N_epochs=validate_every_N_epochs,
                                     start_validation_after_N_epochs=start_validation_after_N_epochs,
                                     lambda_i=lambda_i,
                                     lambda_j=lambda_j,
                                     learning_rate=learning_rate,
                                     topK=topK)
        print('after already_initialized')
Example #12
    def get_r_hat(self, weights_array):
        hybrid_matrix = sps.csr_matrix(self.normalized_matrices_array[0].shape)
        for m, w in zip(self.normalized_matrices_array, weights_array):
            hybrid_matrix += m * w

        if self.name == 'HybridSimilarity':
            # compute the r_hat if we have the similarity
            if not self.INVERSE:
                hybrid_matrix = self.urm_filter_tracks[data.get_target_playlists()].dot(hybrid_matrix)
            else:
                hybrid_matrix = hybrid_matrix[data.get_target_playlists()].dot(self.urm_filter_tracks)
            r_hat = data.get_empty_urm()
            r_hat[data.get_target_playlists()] = hybrid_matrix
            hybrid_matrix = r_hat

        return hybrid_matrix
Example #13
 def get_r_hat(self, load_from_file=False, path=''):
     """
     :param load_from_file: if True, load the matrix from the file it was previously saved to
     :param path: path where the matrix has been saved
     -------
     :return: the estimated URM from the recommender
     """
     U_filtered = self.U[data.get_target_playlists()]
     r_hat = U_filtered.dot(self.s_Vt)
     return sps.csr_matrix(r_hat)
Example #14
    def run(self,
            num_factors,
            urm_train=None,
            urm=None,
            urm_test=None,
            targetids=None,
            with_scores=False,
            export=True,
            verbose=True):
        """
        Run the model and export the results to a file

        Parameters
        ----------
        num_factors : int, number of latent factors
        urm : csr matrix, URM. If None, use: data.get_urm_train(). This should be the
            entire URM for which the targetids correspond to the row indexes.
        urm_test : csr matrix, urm where to test the model. If None, use: data.get_urm_test()
        targetids : list, target user ids. If None, use: data.get_target_playlists()

        Returns
        -------
        recs: (list) recommendations
        map10: (float) MAP10 for the provided recommendations
        """
        _urm = data.get_urm_train()
        _icm = data.get_icm()
        _urm_test = data.get_urm_test()
        _targetids = data.get_target_playlists()
        #_targetids = data.get_all_playlists()

        start = time.time()

        urm_train = _urm if urm_train is None else urm_train
        #urm = _urm if urm is None else urm
        urm_test = _urm_test if urm_test is None else urm_test
        targetids = _targetids if targetids is None else targetids

        self.fit(urm_train=urm_train, num_factors=num_factors)
        recs = self.recommend_batch(userids=targetids)

        map10 = None
        if len(recs) > 0:
            map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
        else:
            log.warning('No recommendations available, skip evaluation')

        if export:
            exportcsv(recs, path='submission', name=self.name, verbose=verbose)

        if verbose:
            log.info('Run in: {:.2f}s'.format(time.time() - start))

        return recs, map10
Example #15
 def get_r_hat(self, verbose=False):
     """
     Return the r_hat matrix as: R^ = R•S or R^ = S•R
     """
     R = self.urm
     targetids = data.get_target_playlists()
     if self._matrix_mul_order == 'inverse':
         return sim.dot_product(self._sim_matrix, R, target_rows=targetids, k=R.shape[0],
                                 format_output='csr', verbose=verbose)
     else:
         return sim.dot_product(R, self._sim_matrix, target_rows=targetids, k=R.shape[0],
                                 format_output='csr', verbose=verbose)
Example #16
    def validate(self, epochs=200, user_ids=d.get_target_playlists(),
            batch_size=[1000], validate_every_N_epochs=5, start_validation_after_N_epochs=0, lambda_i=[0.0],
            lambda_j=[0.0], learning_rate=[0.01], topK=[200], sgd_mode='adagrad', log_path=None):
        """
        train the model finding matrix W
        :param epochs(int)
        :param batch_size(list) after how many items the params should be updated
        :param lambda_i(list) first regularization term
        :param lambda_j(list) second regularization term
        :param learning_rate(list) algorithm learning rate
        :param topK(list) how many elements should be taken into account while computing URM*W
        :param sgd_mode(string) optimization algorithm
        :param user_ids(list) needed if we'd like to perform validation
        :param validate_every_N_epochs(int) how often the MAP evaluation should be displayed
        :param start_validation_after_N_epochs(int)
        :param log_path(string) folder to which the validation results should be saved
        """
        if log_path is not None:
            orig_stdout = sys.stdout
            f = open(log_path + '/' + self.name + ' ' + time.strftime('_%H-%M-%S') + ' ' +
                     time.strftime('%d-%m-%Y') + '.txt', 'w')
            sys.stdout = f

        for li in lambda_i:
            for lj in lambda_j:
                for tk in topK:
                    for lr in learning_rate:
                        for b in batch_size:
                            print(self._print(epochs=epochs,
                                              batch_size=b,
                                              lambda_i=li,
                                              lambda_j=lj,
                                              learning_rate=lr,
                                              topK=tk,
                                              sgd_mode=sgd_mode))
                            self.fit(URM_train=d.get_urm_train(),
                                  epochs=epochs,
                                  URM_test=d.get_urm_test(),
                                  user_ids=user_ids,
                                  batch_size=b,
                                  validate_every_N_epochs=validate_every_N_epochs,
                                  start_validation_after_N_epochs=start_validation_after_N_epochs,
                                  lambda_i=li,
                                  lambda_j=lj,
                                  learning_rate=lr,
                                  topK=tk,
                                  sgd_mode=sgd_mode
                                  )

        if log_path is not None:
            sys.stdout = orig_stdout
            f.close()
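
Reassigning `sys.stdout` by hand as above works, but the original stream is not restored if `fit` raises. A sketch of the same logging with `contextlib.redirect_stdout` from the standard library (`run_grid` is a hypothetical callable standing in for the nested loops):

import contextlib
import time

def validate_with_log(log_path, run_grid):
    filename = '{}/validation_{}_{}.txt'.format(
        log_path, time.strftime('%H-%M-%S'), time.strftime('%d-%m-%Y'))
    with open(filename, 'w') as f, contextlib.redirect_stdout(f):
        run_grid()   # prints inside run_grid() go to the file; stdout is restored on exit
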
Example #17
def histogram_of_top_pop_items(top_n, only_target=True):
    playlists_df = data.get_playlists_df()
    if only_target:
        # filter only target playlist
        target_playlist_df = pd.DataFrame({'playlist_id' : data.get_target_playlists()})
        playlists_df = playlists_df.merge(target_playlist_df)
    # track_id | count
    toptracks_df = playlists_df.groupby('track_id').size().reset_index(name='count')
    toptracks_df = toptracks_df.sort_values('count', ascending=False).head(top_n)
    toptracks_df.plot(x='track_id', y='count', kind='bar', fontsize=6, figsize=(150,100))

    # plot histogram
    plt.show(block=True)
Example #18
 def __init__(self, h):
     """
     h: (int), length of the sequences
     """
     super(SequentialRecommender, self).__init__()
     self.name = 'sequential'
     self.h = h
     
     # build sequences dataset and cache it
     self.sequences, self.target_indices = ps.get_sequences(h=h)
     target_ids = data.get_target_playlists()[0:data.N_SEQUENTIAL]
     self.target_ids = np.array(target_ids)
     self.already_liked_indices = (data.get_urm_train_1()[target_ids]).nonzero()
     self.H = seqsim.getH(self.sequences)
Example #19
 def get_r_hat(self, load_from_file=False, path=''):
     """
     compute the r_hat for the model
     :return: r_hat only for the target playlists
     """
     if load_from_file:
         r_hat = sps.load_npz(path)
     else:
         if self.W_sparse is None:
             log.error(
                 'the recommender has not been trained, call the fit() method to compute W'
             )
         r_hat = self.URM_train[data.get_target_playlists()].dot(
             self.W_sparse)
     return r_hat
Example #20
 def get_r_hat(self, load_from_file=False, path=''):
     """
     compute the r_hat for the model
     :return: r_hat
     """
     if load_from_file:
         r_hat = sps.load_npz(path)
     else:
         if self.user_vecs is None:
             log.error(
                 'the recommender has not been trained, call the fit() method'
             )
         s_user_vecs = sps.csr_matrix(self.user_vecs)
         s_item_vecs_t = sps.csr_matrix(self.item_vecs.T)
         r_hat = s_user_vecs[data.get_target_playlists()].dot(s_item_vecs_t)
     return r_hat
Example #21
    def run(self, distance, ucm_train=None, urm=None, urm_test=None, targetids=None, k=100, shrink=10, threshold=0,
            implicit=True, alpha=None, beta=None, l=None, c=None, with_scores=False, export=True, verbose=True):
        """
        Run the model and export the results to a file

        Parameters
        ----------
        distance : str, distance metric
        targetids : list, target user ids. If None, use: data.get_target_playlists()
        k : int, K nearest neighbour to consider
        shrink : float, shrink term used in the normalization
        threshold : float, all the values under this value are cut from the final result
        implicit : bool, if true, treat the URM as implicit, otherwise consider explicit ratings (real values) in the URM

        Returns
        -------
        recs: (list) recommendations
        map10: (float) MAP10 for the provided recommendations
        """
        start = time.time()
        
        _ucm_train = data.get_ucm_train()
        _urm = data.get_urm_train_1()
        _urm_test = data.get_urm_test_1()
        _targetids = data.get_target_playlists()

        ucm_train = _ucm_train if ucm_train is None else ucm_train
        urm = _urm if urm is None else urm
        urm_test = _urm_test if urm_test is None else urm_test
        targetids = _targetids if targetids is None else targetids

        self.fit(ucm_train, k=k, distance=distance, alpha=alpha, beta=beta, c=c, l=l, shrink=shrink, threshold=threshold, implicit=implicit)
        recs = self.recommend_batch(targetids, urm=urm, with_scores=with_scores, verbose=verbose)

        map10 = None
        if len(recs) > 0:
            map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
        else:
            log.warning('No recommendations available, skip evaluation')

        if export:
            exportcsv(recs, path='submission', name='{}_{}'.format(self.name,distance), verbose=verbose)

        if verbose:
            log.info('Run in: {:.2f}s'.format(time.time()-start))
        
        return recs, map10
Example #22
    def validate(self,
                 user_ids=d.get_target_playlists(),
                 log_path=None,
                 normalize_similarity=[False],
                 damp_coeff=[1],
                 add_zeros_quota=[1],
                 loss_tolerance=[1e-6],
                 iteration_limit=[30],
                 use_incremental=[False]):
        if log_path is not None:
            orig_stdout = sys.stdout
            f = open(
                log_path + '/' + self.name + ' ' + time.strftime('_%H-%M-%S') +
                ' ' + time.strftime('%d-%m-%Y') + '.txt', 'w')
            sys.stdout = f

        for ns in normalize_similarity:
            for dc in damp_coeff:
                for adq in add_zeros_quota:
                    for lt in loss_tolerance:
                        for il in iteration_limit:
                            for ui in use_incremental:
                                print(
                                    self._print(normalize_similarity=ns,
                                                add_zeros_quota=adq,
                                                loss_tolerance=lt,
                                                iteration_limit=il,
                                                damp_coeff=dc,
                                                use_incremental=ui))
                                self.fit(ICM=d.get_icm(),
                                         URM_train=d.get_urm_train_1(),
                                         normalize_similarity=ns,
                                         add_zeros_quota=adq,
                                         loss_tolerance=lt,
                                         iteration_limit=il,
                                         damp_coeff=dc,
                                         use_incremental=ui)

                                recs = self.recommend_batch(
                                    user_ids, urm=d.get_urm_train_1())
                                self.evaluate(recs, d.get_urm_test_1())
        if log_path is not None:
            sys.stdout = orig_stdout
            f.close()
Example #23
def cluster_users_by_top_pop_count(clip_perc, top_n=100, only_target=True):
    """
    Return the ids of the playlists containing at least the specified percentage of top
    popular tracks (in descending order based on contained top pop tracks count)
    
    Parameters
    ----------
    clip_perc: (float) returns only playlist with a percentage of top pop tracks over the total 
                tracks count >= clip_perc
    top_n: consider only the top_n most popular tracks (it should be set equal to the max
            track count among all playlists)
    only_target: (bool) consider only the target playlist

    Returns
    -------
    List of playlist_id
    """
    playlists_df = data.get_playlists_df()
    #tot_interactions = playlists_df.shape[0]
    if only_target:
        # filter only target playlist
        target_playlist_df = pd.DataFrame({'playlist_id' : data.get_target_playlists()})
        playlists_df = playlists_df.merge(target_playlist_df)

    # track_id | count
    toptracks_df = playlists_df.groupby('track_id').size().reset_index(name='count')
    #toptracks_df['relative_count'] = toptracks_df['count'] / tot_interactions
    toptracks_df = toptracks_df.sort_values('count', ascending=False).head(top_n)

    # playlist_id | top_pop_count
    filtered_df = playlists_df.merge(toptracks_df)
    filtered_df = filtered_df.groupby('playlist_id').size().reset_index(name='top_pop_count')
    #filtered_df = filtered_df.sort_values('top_pop_count', ascending=False)

    # playlist_id | count | top_pop_count | perc
    playlists_count_df = playlists_df.groupby('playlist_id').size().reset_index(name='count')

    final_df = playlists_count_df.merge(filtered_df)
    final_df['perc'] = np.divide(final_df['top_pop_count'], final_df['count'])
    # filter only playlist with top pop perc >= clip_perc
    final_df = final_df[final_df['perc']>=clip_perc]
    final_df.sort_values(['perc','top_pop_count'], ascending=False, inplace=True)
    return final_df['playlist_id'].values
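
The same pipeline can be traced on toy data (a sketch, independent of the project's `data` module): with `top_n = 1` the single most popular track is 9, and only playlist 1 has at least 40% of its tracks among the top pop ones:

import pandas as pd

playlists_df = pd.DataFrame({
    'playlist_id': [0, 0, 0, 1, 1, 2],
    'track_id':    [9, 3, 4, 9, 5, 6],
})
top_n, clip_perc = 1, 0.4

# track_id | count, keeping only the top_n most popular tracks
toptracks_df = (playlists_df.groupby('track_id').size().reset_index(name='count')
                .sort_values('count', ascending=False).head(top_n))
# playlist_id | top_pop_count
top_pop_df = (playlists_df.merge(toptracks_df[['track_id']])
              .groupby('playlist_id').size().reset_index(name='top_pop_count'))
# playlist_id | count | top_pop_count | perc
final_df = playlists_df.groupby('playlist_id').size().reset_index(name='count').merge(top_pop_df)
final_df['perc'] = final_df['top_pop_count'] / final_df['count']
print(final_df[final_df['perc'] >= clip_perc]['playlist_id'].values)   # [1]
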
Example #24
    def run(self,
            normalize_similarity=False,
            add_zeros_quota=1,
            loss_tolerance=1e-6,
            iteration_limit=30,
            damp_coeff=1,
            use_incremental=False,
            export_results=True,
            export_r_hat=False,
            export_for_validation=False):
        if export_r_hat and export_for_validation:
            urm = d.get_urm_train_1()
        else:
            urm = d.get_urm()

        self.fit(ICM=d.get_icm(),
                 URM_train=urm,
                 normalize_similarity=normalize_similarity,
                 add_zeros_quota=add_zeros_quota,
                 loss_tolerance=loss_tolerance,
                 iteration_limit=iteration_limit,
                 damp_coeff=damp_coeff,
                 use_incremental=use_incremental)
        if export_results:
            print('exporting results')
            recs = self.recommend_batch(d.get_target_playlists(),
                                        N=10,
                                        urm=urm,
                                        filter_already_liked=True,
                                        with_scores=False,
                                        items_to_exclude=[],
                                        verbose=False)
            importexport.exportcsv(
                recs, 'submission',
                self._print(normalize_similarity=normalize_similarity,
                            add_zeros_quota=add_zeros_quota,
                            loss_tolerance=loss_tolerance,
                            iteration_limit=iteration_limit,
                            damp_coeff=damp_coeff,
                            use_incremental=use_incremental))
        elif export_r_hat:
            print('saving estimated urm')
            self.save_r_hat(export_for_validation)
Example #25
    def fit(self, clip=7):
        sparse_pl, dense_pl = cluster.cluster_users_by_interactions_count(
            clip=clip)

        log.success(
            'Cluster 1 (interactions count <= {}): {} playlists'.format(
                clip, len(sparse_pl)))
        log.success(
            'Cluster 2 (interactions count  > {}): {} playlists'.format(
                clip, len(dense_pl)))

        # filter target playlists from the 2 clusters
        s1 = set(sparse_pl)
        s2 = set(dense_pl)
        s_target = set(data.get_target_playlists())
        s1_target = s1 & s_target
        s2_target = s2 & s_target
        self.sparse_pl = list(s1_target)
        self.dense_pl = list(s2_target)
Example #26
    def test(self, distance=DistanceBasedRecommender.SIM_SPLUS, k=600, shrink=10, threshold=0, alpha=0.25, beta=0.5, l=0.5, c=0.25,
             export_results=True, export_r_hat=False):
        """
        meant as a shortcut to run the model after the validation procedure,
        allowing the export of the scores on the playlists or of the estimated csr matrix
        """
        recs, map10 = self.run(urm=d.get_urm(),
                             icm=d.get_icm(),
                             targetids=d.get_target_playlists(),
                             distance=distance,
                             k=k, shrink=shrink,
                             threshold=threshold,
                             alpha=alpha,
                             beta=beta,
                             l=l,
                             c=c,
                             export=export_results)

        if export_r_hat:
            print('saving estimated urm')
            self.save_r_hat()
        return recs, map10
Example #27
def _check_presence_test_samples(df_test, target='target'):
    """
    Checks that in the test dataframe there is at least one track for each target playlist
    :param df_test: (pandas dataframe)
    :param target: (string) either 'all' or 'target', the set of playlists to check
    """
    if target == 'all':
        p = d.get_all_playlists()
    elif target == 'target':
        p = d.get_target_playlists()

    if len(df_test[df_test['playlist_id'].isin(p)].groupby(
            'playlist_id')) != len(p):
        if target == 'all':
            print(
                "WARNING: not all the playlists (ALL OF THEM) have a song in the test set"
            )
        elif target == 'target':
            print(
                "WARNING: not all the target playlists (JUST THE TARGETS) have a song in the test set"
            )
Example #28
def validate(l1_ratio_array,
             alpha_array,
             max_iter_array,
             topK_array,
             userids=data.get_target_playlists(),
             urm_train=data.get_urm_train_1(),
             urm_test=data.get_urm_test_1(),
             filter_already_liked=True,
             items_to_exclude=[],
             N=10,
             verbose=True,
             write_on_file=True):
    """
    -----------
    :return: _
    """

    #create the initial model
    recommender = SLIMElasticNetRecommender()

    path = 'validation_results/'
    name = 'slim_rmse'
    folder = time.strftime('%d-%m-%Y')
    filename = '{}/{}/{}{}.csv'.format(path, folder, name,
                                       time.strftime('_%H-%M-%S'))
    # create dir if not exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    with open(filename, 'w') as out:
        for l in l1_ratio_array:
            for a in alpha_array:
                for m in max_iter_array:
                    for k in topK_array:

                        #train the model with the parameters
                        if verbose:
                            print(
                                '\n\nTraining slim_rmse with\n l1_ratio: {}\n alpha: {}\n'
                                'Iterations: {}\n topK: {}'.format(l, a, m, k))
                            print('\n training phase...')
                        recommender.fit(urm=urm_train,
                                        l1_ratio=l,
                                        alpha=a,
                                        max_iter=m,
                                        topK=k)

                        #get the recommendations from the trained model
                        recommendations = recommender.recommend_batch(
                            userids=userids,
                            N=N,
                            filter_already_liked=filter_already_liked,
                            items_to_exclude=items_to_exclude)
                        #evaluate the model with map10
                        map10 = recommender.evaluate(recommendations,
                                                     test_urm=urm_test)
                        if verbose:
                            print('map@10: {}'.format(map10))

                        # write results to an external file in the validation_results folder
                        if write_on_file:
                            out.write(
                                '\n\nl1_ratio: {}\n alpha: {}\n Iterations: {}\n '
                                'topK: {}\n evaluation map@10: {}'.format(
                                    l, a, m, k, map10))
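
The four nested loops are a plain exhaustive grid search; `itertools.product` expresses the same sweep in one loop. A sketch with a stand-in `fit_and_score` callable (hypothetical, returning the MAP@10 of one configuration):

import itertools

def grid_search(l1_ratio_array, alpha_array, max_iter_array, topK_array, fit_and_score):
    best_params, best_map10 = None, -1.0
    for l, a, m, k in itertools.product(l1_ratio_array, alpha_array,
                                        max_iter_array, topK_array):
        map10 = fit_and_score(l1_ratio=l, alpha=a, max_iter=m, topK=k)
        if map10 > best_map10:
            best_params, best_map10 = {'l1_ratio': l, 'alpha': a,
                                       'max_iter': m, 'topK': k}, map10
    return best_params, best_map10
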
Example #29
    def run(self,
            urm_train=None,
            urm=None,
            urm_test=None,
            targetids=None,
            factors=100,
            regularization=0.01,
            iterations=100,
            alpha=25,
            with_scores=False,
            export=True,
            verbose=True):
        """
        Run the model and export the results to a file

        Returns
        -------
        :return: recs: (list) recommendations
        :return: map10: (float) MAP10 for the provided recommendations
        """
        _urm_train = data.get_urm_train_1()
        _urm = data.get_urm()
        _icm = data.get_icm()
        _urm_test = data.get_urm_test_1()
        _targetids = data.get_target_playlists()
        # _targetids = data.get_all_playlists()

        start = time.time()

        urm_train = _urm_train if urm_train is None else urm_train
        urm = _urm if urm is None else urm
        urm_test = _urm_test if urm_test is None else urm_test
        targetids = _targetids if targetids is None else targetids

        # NOTE: the ElasticNet hyperparameters are hardcoded here; the factors,
        # regularization, iterations and alpha arguments above are not used
        self.fit(l1_ratio=0.1,
                 positive_only=True,
                 alpha=1e-4,
                 fit_intercept=False,
                 copy_X=False,
                 precompute=False,
                 selection='random',
                 max_iter=100,
                 topK=100,
                 tol=1e-4,
                 workers=multiprocessing.cpu_count())
        recs = self.recommend_batch(userids=targetids,
                                    with_scores=with_scores,
                                    verbose=verbose)

        map10 = None
        if len(recs) > 0:
            map10 = self.evaluate(recs, test_urm=urm_test, verbose=verbose)
        else:
            log.warning('No recommendations available, skip evaluation')

        if export:
            exportcsv(recs, path='submission', name=self.name, verbose=verbose)

        if verbose:
            log.info('Run in: {:.2f}s'.format(time.time() - start))

        return recs, map10
Example #30


"""
If this file is executed, test the SPLUS distance metric
"""
if __name__ == '__main__':
    rec = SLIMElasticNetRecommender()
    rec.fit(urm=data.get_urm_train_1(),
            max_iter=1,
            topK=400,
            alpha=1e-4,
            l1_ratio=0.5)
    recs = rec.recommend_batch(userids=data.get_target_playlists())
    rec.evaluate(recommendations=recs, test_urm=data.get_urm_test_1())