Example #1
    def applyPearsonCorrelation(self):
        """
        Remove from every data point the average for the corresponding column
        :return:
        """

        self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csc')

        interactionsPerCol = np.diff(self.dataMatrix.indptr)

        nonzeroCols = interactionsPerCol > 0
        sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

        colAverage = np.zeros_like(sumPerCol)
        colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

        # Split in blocks to avoid duplicating the whole data structure
        start_col = 0
        end_col = 0

        blockSize = 1000

        while end_col < self.n_columns:
            end_col = min(self.n_columns, end_col + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
                np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

            start_col += blockSize
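
The trick above relies on the CSC layout: `dataMatrix.data` stores nonzeros column by column, so repeating each column average by that column's nonzero count aligns one-to-one with `data`. Below is a minimal, self-contained sketch of the same centering step on a plain scipy `csc_matrix` (the toy matrix is illustrative, and an explicit `csc_matrix` construction stands in for the project's `cm.check_matrix` helper). Example #2 below applies the identical trick row-wise on a CSR matrix.

import numpy as np
import scipy.sparse as sps

X = sps.csc_matrix(np.array([[1.0, 0.0, 3.0],
                             [0.0, 0.0, 5.0],
                             [2.0, 0.0, 0.0]]))

interactions_per_col = np.diff(X.indptr)                  # nonzeros per column
col_sum = np.asarray(X.sum(axis=0)).ravel()

col_average = np.zeros_like(col_sum)
nonzero_cols = interactions_per_col > 0
col_average[nonzero_cols] = col_sum[nonzero_cols] / interactions_per_col[nonzero_cols]

# In CSC format X.data is laid out column by column, so repeating each
# column average by that column's nonzero count lines up exactly with X.data
X.data -= np.repeat(col_average, interactions_per_col)

print(X.toarray())  # every stored value is now centered on its column mean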
Example #2
    def applyAdjustedCosine(self):
        """
        Remove from every data point the average for the corresponding row
        :return:
        """

        self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csr')

        interactionsPerRow = np.diff(self.dataMatrix.indptr)

        nonzeroRows = interactionsPerRow > 0
        sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

        rowAverage = np.zeros_like(sumPerRow)
        rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

        # Split in blocks to avoid duplicating the whole data structure
        start_row = 0
        end_row = 0

        blockSize = 1000

        while end_row < self.n_rows:
            end_row = min(self.n_rows, end_row + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
                np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

            start_row += blockSize
Example #3
    def __init__(self, URM_train, sparse_weights=True):
        super(ItemKNNCFRecommender, self).__init__()

        # CSR is faster during evaluation
        self.URM_train = cm.check_matrix(URM_train, 'csr')

        self.dataset = None

        self.sparse_weights = sparse_weights
Example #4
    def __init__(self, ICM, URM_train, sparse_weights=True):
        super(ItemKNNCBFRecommender, self).__init__()

        self.ICM = ICM.copy()

        # CSR is faster during evaluation
        self.URM_train = cm.check_matrix(URM_train.copy(), 'csr')

        self.sparse_weights = sparse_weights
Example #5
    def fit(self, R):
        self.dataset = R
        R = cm.check_matrix(R, 'csr', dtype=np.float32)
        self.X, self.Y = AsySVD_sgd(R, self.num_factors, self.lrate, self.reg, self.iters, self.init_mean,
                                    self.init_std,
                                    self.lrate_decay, self.rnd_seed)
        # precompute the user factors
        M = R.shape[0]

        self.U = np.vstack([AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])
Example #6
    def __init__(self, URM_train, sparse_weights=True):
        super(UserKNNCFRecommender, self).__init__()
        self.name = 'UserKNN'
        # Not sure if CSR here is faster
        self.URM_train = cm.check_matrix(URM_train, 'csr')

        self.dataset = None

        self.sparse_weights = sparse_weights

        self.compute_item_score = self.compute_score_user_based
Example #7
    def __init__(self, urm_filter_tracks):

        self.urm_filter_tracks = urm_filter_tracks

        for index, r in enumerate(self.matrices_array):
            self.matrices_array[index] = cm.check_matrix(r, 'csr')

        self._normalization(normalization_mode=self.normalization_mode)

        print('matrices_normalized')
Example #8
    def save_r_hat(self, evaluation):

        r_hat = self.W_sparse
        r_hat = check_matrix(r_hat, format='csr')

        # create dir if not exists
        if evaluation:
            filename = 'raw_data/saved_r_hat_evaluation/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
            os.makedirs(os.path.dirname(filename), exist_ok=True)
        else:
            filename = 'raw_data/saved_r_hat/{}_{}'.format(self.name, time.strftime('%H-%M-%S'))
            os.makedirs(os.path.dirname(filename), exist_ok=True)

        sps.save_npz(filename, r_hat)
        log.success('R_hat successfully saved in: {}.npz'.format(filename))
Example #9
    def save_r_hat(self, evaluation=False):

        r_hat = self.get_r_hat()
        r_hat = check_matrix(r_hat, format='csr')

        # create dir if not exists
        if evaluation:
            filename = 'raw_data/saved_r_hat_evaluation/{}_{}'.format(
                self.name, time.strftime('%H-%M-%S'))
            os.makedirs(os.path.dirname(filename), exist_ok=True)
        else:
            filename = 'raw_data/saved_r_hat/{}_{}'.format(
                self.name, time.strftime('%H-%M-%S'))
            os.makedirs(os.path.dirname(filename), exist_ok=True)

        sps.save_npz(filename, r_hat)
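
As a round-trip sketch of the storage format used by both `save_r_hat` variants (paths are illustrative; `scipy.sparse.save_npz` appends the `.npz` extension when it is missing, which is why Example #8 adds it manually in the log message):

import numpy as np
import scipy.sparse as sps

m = sps.csr_matrix(np.eye(3))
sps.save_npz('/tmp/r_hat_example', m)            # writes /tmp/r_hat_example.npz
loaded = sps.load_npz('/tmp/r_hat_example.npz')

print((loaded != m).nnz == 0)                    # True: the matrices are identical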
Example #10
def apply_top_k(matrix, k):
    start = time.time()
    matrix = cm.check_matrix(matrix, format='csr')
    # dense matrix that will hold only the top-k entries of each row
    # (zeros, not np.empty: cells that are never written must stay 0)
    filtered_matrix = np.zeros(shape=(matrix.shape[0], matrix.shape[1]))
    for i in range(matrix.shape[0]):
        row = matrix.getrow(i).toarray().ravel()
        # indices of the k largest entries, found with a partial sort
        relevant_items_row_indices = (-row).argpartition(k)[0:k]
        filtered_matrix[i, relevant_items_row_indices] = row[relevant_items_row_indices]
    # convert the matrix back to a sparse format
    sp_filtered_matrix = sps.csr_matrix(filtered_matrix)
    print('topK applied in {} s'.format(time.time() - start))
    return sp_filtered_matrix
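
A hypothetical usage of `apply_top_k`, assuming the function above and its `cm.check_matrix` helper are importable; the similarity matrix here is a toy:

import numpy as np
import scipy.sparse as sps

sim = sps.csr_matrix(np.random.rand(5, 5))  # toy similarity matrix
sim_top2 = apply_top_k(sim, k=2)

print(sim_top2.getnnz(axis=1))  # at most 2 stored values per row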
Example #11
    def __init__(self, name, cluster, mode, matrices_array, normalization_mode,
                 weights_array):

        super(Hybrid, self).__init__(name=name, cluster=cluster, mode=mode)

        # load the handle and dictionary for this mode; they will be used during the recommend batch
        self.dict_col = data.dictionary_col(mode=self.mode)
        self.df_handle = data.handle_df(mode=self.mode)
        self.targetids = data.target_urm_rows(self.mode)
        self.r_hat = None

        # will be set if the hybrid is done via similarity matrices
        self.urm_name = None
        self.weights_array = weights_array

        # store the array of matrices in the hybrid recommender
        self.matrices_array = matrices_array

        # check the shapes of the matrices
        self._check_matrices_array_shapes()

        # normalize the matrices
        self.normalization_mode = normalization_mode

        # will be filled when the _normalization method is called
        self.normalized_matrices_array = None

        print(
            'checking that all the matrices in matrices_array are in CSR format...\n'
        )
        for index in range(len(self.matrices_array)):
            self.matrices_array[index] = cm.check_matrix(
                self.matrices_array[index], 'csr')
        print('done\n')

        print('normalizing the matrices in matrices_array...\n')
        self._normalization(normalization_mode=self.normalization_mode)
        print('matrices_normalized\n')
Example #12
    def fit(self, R):
        '''
        Fit the model. The hyperparameters below are hardcoded in the BPRMF_sgd call:
        :param num_factors: number of latent factors
        :param lrate: initial learning rate used in SGD
        :param user_reg: regularization for the user factors
        :param pos_reg: regularization for the factors of the positive sampled items
        :param neg_reg: regularization for the factors of the negative sampled items
        :param iters: number of iterations in training the model with SGD
        :param sampling_type: type of sampling. Supported types are 'user_uniform_item_uniform' and 'user_uniform_item_pop'
        :param sample_with_replacement: `True` to sample positive items with replacement (doesn't work with 'user_uniform_item_pop')
        :param use_resampling: `True` to resample at each iteration during training
        :param sampling_pop_alpha: float smoothing factor for popularity based samplers (e.g., 'user_uniform_item_pop')
        :param init_mean: mean used to initialize the latent factors
        :param init_std: standard deviation used to initialize the latent factors
        :param lrate_decay: learning rate decay
        :param rnd_seed: random seed
        :param verbose: controls verbosity in output
        '''
        self.dataset = R
        R = cm.check_matrix(R, 'csr', dtype=np.float32)
        self.X, self.Y = BPRMF_sgd(R,
                                   num_factors=100,
                                   lrate=0.1,
                                   user_reg=0.0015,
                                   pos_reg=0.0015,
                                   neg_reg=0.0015,
                                   iters=10,
                                   sampling_type='user_uniform_item_uniform',
                                   sample_with_replacement=True,
                                   use_resampling=True,
                                   sampling_pop_alpha=1.0,
                                   init_mean=0.0,
                                   init_std=0.1,
                                   lrate_decay=1.0,
                                   rnd_seed=42,
                                   verbose=True)
Example #13
    def fit(self):
        print('hybrid matrix creation...')
        start = time.time()
        hybrid_matrix = sps.csr_matrix(self.normalized_matrices_array[0].shape)

        for m, weight in zip(self.normalized_matrices_array, self.weights_array):
            hybrid_matrix += m * weight

        if self.name == 'HybridSimilarity':
            # compute the r_hat if we have the similarity
            urm = data.urm(self.mode, self.urm_name)
            # check that urm is in CSR format
            urm = cm.check_matrix(urm, 'csr')
            # check if the similarity is user-user or item-item
            if hybrid_matrix.shape[0] == urm.shape[1]:
                # user - user similarity
                hybrid_matrix = urm[self.targetids].dot(hybrid_matrix)
            else:
                # item - item similarity
                hybrid_matrix = hybrid_matrix[self.targetids].dot(urm)
        print('hybrid matrix created in {:.2f} s'.format(time.time() - start))
        self.r_hat = hybrid_matrix
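
The weighted-sum step in isolation, as a minimal sketch with two toy similarity matrices and hand-picked weights (all names here are illustrative):

import numpy as np
import scipy.sparse as sps

S1 = sps.csr_matrix(np.array([[0.0, 0.8], [0.8, 0.0]]))
S2 = sps.csr_matrix(np.array([[0.0, 0.3], [0.5, 0.0]]))
weights = [0.7, 0.3]

# start from an empty sparse matrix and accumulate, as in fit() above
hybrid = sps.csr_matrix(S1.shape)
for S, w in zip((S1, S2), weights):
    hybrid += S * w

print(hybrid.toarray())  # equals 0.7 * S1 + 0.3 * S2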
Example #14
    def compute_similarity(self, start_col=None, end_col=None, block_size=100):
        """
        Compute the Similarity_MFD for the given dataset
        :param start_col: column to begin with
        :param end_col: column to stop before; end_col is excluded
        :param block_size: number of columns to process at a time
        :return:
        """

        values = []
        rows = []
        cols = []

        start_time = time.time()
        start_time_print_batch = start_time
        processedItems = 0

        if self.adjusted_cosine:
            self.applyAdjustedCosine()

        elif self.pearson_correlation:
            self.applyPearsonCorrelation()

        elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
            self.useOnlyBooleanInteractions()

        # We explore the matrix column-wise
        self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csc')

        # Compute sum of squared values to be used in normalization
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

        # Tanimoto does not require the square root to be applied
        if not (self.tanimoto_coefficient or self.dice_coefficient
                or self.tversky_coefficient):
            sumOfSquared = np.sqrt(sumOfSquared)

        if self.asymmetric_cosine:
            sumOfSquared_to_1_minus_alpha = np.power(
                sumOfSquared, 2 * (1 - self.asymmetric_alpha))
            sumOfSquared_to_alpha = np.power(sumOfSquared,
                                             2 * self.asymmetric_alpha)

        self.dataMatrix = cm.check_matrix(self.dataMatrix, 'csc')

        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col > 0 and start_col < self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
            end_col_local = end_col

        start_col_block = start_col_local

        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            # Add previous block size
            processedItems += this_block_size

            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
                columnPerSec = processedItems / (time.time() - start_time)

                print(
                    "Similarity_MFD column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min"
                    .format(
                        processedItems, processedItems /
                        (end_col_local - start_col_local) * 100, columnPerSec,
                        (time.time() - start_time) / 60))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray().squeeze()

            if self.use_row_weights:
                # item_data = np.multiply(item_data, self.row_weights)
                # item_data = item_data.T.dot(self.row_weights_diag).T
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)

            else:
                # Compute item similarities
                this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights
                else:
                    this_column_weights = this_block_weights[:, col_index_in_block]

                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:

                    if self.asymmetric_cosine:
                        denominator = sumOfSquared_to_alpha[
                            columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                    else:
                        denominator = sumOfSquared[
                            columnIndex] * sumOfSquared + self.shrink + 1e-6

                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                # Apply the specific denominator for Tanimoto
                elif self.tanimoto_coefficient:
                    denominator = sumOfSquared[
                        columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                elif self.dice_coefficient:
                    denominator = sumOfSquared[
                        columnIndex] + sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                elif self.tversky_coefficient:
                    denominator = this_column_weights + \
                                  (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                                  (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                # If no normalization or tanimoto is selected, apply only shrink
                elif self.shrink != 0:
                    this_column_weights = this_column_weights / self.shrink

                # this_column_weights = this_column_weights.toarray().ravel()

                if self.TopK == 0:
                    self.W_dense[:, columnIndex] = this_column_weights

                else:
                    # Sort indices and select TopK
                    # Sorting is done in three steps. Faster than plain np.argsort for a higher number of items
                    # - Partition the data to extract the set of relevant items
                    # - Sort only the relevant items
                    # - Get the original item index
                    relevant_items_partition = (
                        -this_column_weights).argpartition(self.TopK -
                                                           1)[0:self.TopK]
                    relevant_items_partition_sorting = np.argsort(
                        -this_column_weights[relevant_items_partition])
                    top_k_idx = relevant_items_partition[
                        relevant_items_partition_sorting]

                    # Incrementally build sparse matrix, do not add zeros
                    notZerosMask = this_column_weights[top_k_idx] != 0.0
                    numNotZeros = np.sum(notZerosMask)

                    values.extend(this_column_weights[top_k_idx][notZerosMask])
                    rows.extend(top_k_idx[notZerosMask])
                    cols.extend(np.ones(numNotZeros) * columnIndex)

            start_col_block += block_size

        # End while on columns

        if self.TopK == 0:
            return self.W_dense

        else:

            W_sparse = sps.csr_matrix((values, (rows, cols)),
                                      shape=(self.n_columns, self.n_columns),
                                      dtype=np.float32)

            return W_sparse
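
For the plain cosine branch, the normalization computed inside the block loop reduces, for a single target column j, to the dot products of column j against all columns divided by the product of the corresponding column norms (plus the shrink term and the 1e-6 guard against division by zero). A tiny dense sketch of that step, with shrink assumed to be 0:

import numpy as np

X = np.array([[1.0, 0.0, 2.0],
              [0.0, 3.0, 2.0]])

dot_products = X.T.dot(X[:, 0])         # numerators of column 0 against all columns
norms = np.sqrt((X ** 2).sum(axis=0))   # sumOfSquared after the sqrt, one value per column

shrink = 0.0
cosine_col0 = dot_products / (norms[0] * norms + shrink + 1e-6)
cosine_col0[0] = 0.0                    # zero out the self-similarity

print(cosine_col0)                      # [0.0, 0.0, ~0.707]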
Example #15
    def __init__(self, URM_train):
        super(RP3betaRecommender, self).__init__()

        self.URM_train = cm.check_matrix(URM_train, format='csr', dtype=np.float32)
        self.sparse_weights = True