def __init__(self, URM_train, ICM, S_matrix_target):
    super(CFW_D_Similarity_Linalg, self).__init__()

    if URM_train.shape[1] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. URM contains {} but ICM contains {}".format(
            URM_train.shape[1], ICM.shape[0]))

    if S_matrix_target.shape[0] != S_matrix_target.shape[1]:
        raise ValueError("Item similarity matrix is not square: rows are {}, columns are {}".format(
            S_matrix_target.shape[0], S_matrix_target.shape[1]))

    if S_matrix_target.shape[0] != ICM.shape[0]:
        raise ValueError("Number of items not consistent. S_matrix contains {} but ICM contains {}".format(
            S_matrix_target.shape[0], ICM.shape[0]))

    self.URM_train = check_matrix(URM_train, 'csr')
    self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
    self.ICM = check_matrix(ICM, 'csr')

    self.n_items = self.URM_train.shape[1]
    self.n_users = self.URM_train.shape[0]
    self.n_features = self.ICM.shape[1]

    self.sparse_weights = True
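# A minimal usage sketch for the constructor above, with hypothetical toy matrices:
# the URM is users x items, the ICM is items x features, and the target similarity
# matrix is items x items, matching the consistency checks performed in __init__.

import numpy as np
import scipy.sparse as sps

n_users, n_items, n_features = 100, 50, 20

# Hypothetical random inputs with mutually consistent shapes
URM_train = sps.random(n_users, n_items, density=0.05, format='csr')
ICM = sps.random(n_items, n_features, density=0.10, format='csr')
S_matrix_target = sps.random(n_items, n_items, density=0.02, format='csr')

recommender = CFW_D_Similarity_Linalg(URM_train, ICM, S_matrix_target)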
def __init__(self, URM_recommendations_items):
    super(PredefinedListRecommender, self).__init__()

    # store the per-user recommendation lists in CSR format for fast row slicing
    self.URM_recommendations = check_matrix(URM_recommendations_items, 'csr', dtype=int)

    self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)
def __init__(self, URM_train):
    super(RP3betaRecommender, self).__init__()

    self.URM_train = check_matrix(URM_train, format='csr', dtype=np.float32)
    self.sparse_weights = True
def __init__(self, URM_train):
    super(PureSVDRecommender, self).__init__()

    # CSR is faster during evaluation
    self.URM_train = check_matrix(URM_train, 'csr')

    self.compute_item_score = self.compute_score_SVD
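# A minimal usage sketch for PureSVDRecommender, assuming a hypothetical random URM;
# the fit() hyperparameter shown is an illustrative assumption, not a value taken from this file.

import numpy as np
import scipy.sparse as sps

# Hypothetical implicit-feedback matrix: 1000 users x 500 items, ~1% dense
URM_train = sps.random(1000, 500, density=0.01, format='csr', dtype=np.float32)

recommender = PureSVDRecommender(URM_train)
# recommender.fit(num_factors=50)   # hypothetical hyperparameter name/value
# item scores would then be produced through compute_item_score, which the
# constructor binds to compute_score_SVD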
def applyPearsonCorrelation(self):
    """
    Remove from every data point the average for the corresponding column
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    interactionsPerCol = np.diff(self.dataMatrix.indptr)

    nonzeroCols = interactionsPerCol > 0
    sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

    colAverage = np.zeros_like(sumPerCol)
    colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]

    # Split in blocks to avoid duplicating the whole data structure
    start_col = 0
    end_col = 0
    blockSize = 1000

    while end_col < self.n_columns:
        end_col = min(self.n_columns, end_col + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
            np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

        start_col += blockSize
def applyAdjustedCosine(self):
    """
    Remove from every data point the average for the corresponding row
    :return:
    """

    self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

    interactionsPerRow = np.diff(self.dataMatrix.indptr)

    nonzeroRows = interactionsPerRow > 0
    sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

    rowAverage = np.zeros_like(sumPerRow)
    rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

    # Split in blocks to avoid duplicating the whole data structure
    start_row = 0
    end_row = 0
    blockSize = 1000

    while end_row < self.n_rows:
        end_row = min(self.n_rows, end_row + blockSize)

        self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
            np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

        start_row += blockSize
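# A small self-contained sketch of the same centering idea used by the two methods above
# (here in the column-wise / Pearson variant), assuming a toy CSC matrix: each column's
# average over its stored entries is subtracted in place, which is exactly what the
# block-wise loops do at scale without materializing a dense copy.

import numpy as np
import scipy.sparse as sps

# Toy 3x3 matrix; only nonzero (stored) entries participate in the per-column mean
X = sps.csc_matrix(np.array([[4.0, 0.0, 2.0],
                             [0.0, 3.0, 4.0],
                             [2.0, 5.0, 0.0]]))

interactions_per_col = np.diff(X.indptr)            # stored entries per column
col_sum = np.asarray(X.sum(axis=0)).ravel()
col_avg = np.divide(col_sum, interactions_per_col,
                    out=np.zeros_like(col_sum), where=interactions_per_col > 0)

# Subtract each column's average from its stored values, exactly once per entry
X.data -= np.repeat(col_avg, interactions_per_col)
print(X.toarray())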
def __init__(self, ICM, URM_train, sparse_weights=True):
    super(RevisedCBF, self).__init__()

    self.ICM = ICM.copy()

    # CSR is faster during evaluation
    self.URM_train = check_matrix(URM_train.copy(), 'csr')

    self.sparse_weights = sparse_weights
def fit(self, R):
    print("Starting AsySVD fitting")

    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)

    self.X, self.Y = AsySVD_sgd(R, self.num_factors, self.lrate, self.reg,
                                self.iters, self.init_mean, self.init_std,
                                self.lrate_decay, self.rnd_seed)

    # precompute the user factors
    M = R.shape[0]
    self.U = np.vstack([AsySVD_compute_user_factors(R[i], self.Y) for i in range(M)])

    print("Done AsySVD fitting")
def fit(self, R):
    self.dataset = R
    R = check_matrix(R, 'csr', dtype=np.float32)

    self.X, self.Y = BPRMF_sgd(R,
                               num_factors=self.num_factors,
                               lrate=self.lrate,
                               user_reg=self.user_reg,
                               pos_reg=self.pos_reg,
                               neg_reg=self.neg_reg,
                               iters=self.iters,
                               sampling_type=self.sampling_type,
                               sample_with_replacement=self.sample_with_replacement,
                               use_resampling=self.use_resampling,
                               sampling_pop_alpha=self.sampling_pop_alpha,
                               init_mean=self.init_mean,
                               init_std=self.init_std,
                               lrate_decay=self.lrate_decay,
                               rnd_seed=self.rnd_seed,
                               verbose=self.verbose)
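# BPRMF_sgd itself is implemented elsewhere. For reference, a schematic NumPy sketch of a
# single BPR-Opt SGD update on one sampled (user, positive item, negative item) triple,
# using the textbook BPR gradients; this is an illustrative assumption, not necessarily
# the exact update rule used by this repository's Cython implementation.

import numpy as np

def bpr_single_update(U, V, user, pos_item, neg_item,
                      lrate=0.05, user_reg=0.0025, pos_reg=0.0025, neg_reg=0.00025):
    # Difference of predicted scores for the positive and negative items
    u_factors = U[user].copy()
    x_uij = u_factors.dot(V[pos_item] - V[neg_item])
    z = 1.0 / (1.0 + np.exp(x_uij))  # sigmoid(-x_uij)

    # Gradient ascent on the BPR-Opt objective with L2 regularization
    U[user] += lrate * (z * (V[pos_item] - V[neg_item]) - user_reg * u_factors)
    V[pos_item] += lrate * (z * u_factors - pos_reg * V[pos_item])
    V[neg_item] += lrate * (-z * u_factors - neg_reg * V[neg_item])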
def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100,
        workers=multiprocessing.cpu_count()):

    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.positive_only = positive_only
    self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty)
    self.topK = topK
    self.workers = workers

    self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = self.URM_train.shape[1]

    # fit each item's factors in parallel

    # partial object bound to the function, with the fixed part of the input predefined
    _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

    # create a pool with the requested number of worker processes
    pool = Pool(processes=self.workers)

    # run the pool, passing the partially-applied function and the remaining,
    # variable parameter (the item index)
    res = pool.map(_pfit, np.arange(n_items))

    # res contains a vector of (values, rows, cols) tuples
    values, rows, cols = [], [], []
    for values_, rows_, cols_ in res:
        values.extend(values_)
        rows.extend(rows_)
        cols.extend(cols_)

    # generate the sparse weight matrix
    self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
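# The parallel fit above relies on functools.partial to freeze the shared arguments so that
# Pool.map only streams the variable item index to the workers. A minimal standalone sketch
# of the same pattern, with a hypothetical square_plus worker standing in for _partial_fit:

from functools import partial
from multiprocessing import Pool

def square_plus(item_index, offset):
    # hypothetical stand-in for _partial_fit: one independent task per item
    return item_index ** 2 + offset

if __name__ == "__main__":
    task = partial(square_plus, offset=10)     # bind the fixed argument
    with Pool(processes=4) as pool:
        results = pool.map(task, range(8))     # stream only the variable argument
    print(results)                             # [10, 11, 14, 19, 26, 35, 46, 59]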
def __init__(self, URM_train):
    super(FunkSVD, self).__init__()
    self.URM_train = check_matrix(URM_train, 'csr', dtype=np.float32)
def fit(self, l1_penalty=0.1, l2_penalty=0.1, positive_only=True, topK=100):

    self.l1_penalty = l1_penalty
    self.l2_penalty = l2_penalty
    self.positive_only = positive_only
    self.l1_ratio = self.l1_penalty / (self.l1_penalty + self.l2_penalty)
    self.topK = topK

    X = check_matrix(self.URM_train, 'csc', dtype=np.float32)
    n_items = X.shape[1]

    # initialize the ElasticNet model
    self.model = ElasticNet(alpha=.001,
                            l1_ratio=self.l1_ratio,
                            positive=self.positive_only,
                            fit_intercept=False,
                            copy_X=False,
                            precompute=True,
                            selection='random',
                            max_iter=100,
                            tol=1e-4)

    # we'll store the W matrix in a sparse format,
    # so initialize the vectors used by the sparse.csc_matrix constructor
    values, rows, cols = [], [], []

    start_time = time.time()
    start_time_printBatch = start_time

    # fit each item's factors sequentially (not in parallel)
    for currentItem in range(n_items):

        # get the target column
        y = X[:, currentItem].toarray()

        # set the j-th column of X to zero
        startptr = X.indptr[currentItem]
        endptr = X.indptr[currentItem + 1]

        bak = X.data[startptr:endptr].copy()
        X.data[startptr:endptr] = 0.0

        # fit one ElasticNet model per column
        self.model.fit(X, y)

        # self.model.coef_ contains the coefficients of the ElasticNet model
        # let's keep only the non-zero values
        # nnz_idx = self.model.coef_ > 0.0

        # Select topK values.
        # Sorting is done in three steps. Faster than plain np.argsort for a high number of items:
        # - Partition the data to extract the set of relevant items
        # - Sort only the relevant items
        # - Get the original item index
        relevant_items_partition = (-self.model.coef_).argpartition(self.topK)[0:self.topK]
        relevant_items_partition_sorting = np.argsort(-self.model.coef_[relevant_items_partition])
        ranking = relevant_items_partition[relevant_items_partition_sorting]

        notZerosMask = self.model.coef_[ranking] > 0.0
        ranking = ranking[notZerosMask]

        values.extend(self.model.coef_[ranking])
        rows.extend(ranking)
        cols.extend([currentItem] * len(ranking))

        # finally, restore the original values of the j-th column
        X.data[startptr:endptr] = bak

        if time.time() - start_time_printBatch > 300:
            print("Processed {} ( {:.2f}% ) in {:.2f} minutes. Columns per second: {:.0f}".format(
                currentItem,
                100.0 * float(currentItem) / n_items,
                (time.time() - start_time) / 60,
                float(currentItem) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_printBatch = time.time()

    # generate the sparse weight matrix
    self.W_sparse = sps.csc_matrix((values, (rows, cols)),
                                   shape=(n_items, n_items),
                                   dtype=np.float32)
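# The three-step top-K selection used above (partition, sort only the candidates, map back
# to the original indices) can be illustrated in isolation. A small sketch under the
# assumption of a made-up dense score vector:

import numpy as np

scores = np.array([0.1, 0.9, 0.0, 0.4, 0.7, 0.05])
topK = 3

# 1) partition: the first topK positions hold the topK largest scores, in arbitrary order
candidates = (-scores).argpartition(topK)[0:topK]
# 2) sort only those topK candidates by descending score
order = np.argsort(-scores[candidates])
# 3) map back to the original item indices
ranking = candidates[order]

print(ranking)            # [1 4 3]
print(scores[ranking])    # [0.9 0.7 0.4]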
def _generateTrainData_low_ram(self):

    print(self.RECOMMENDER_NAME + ": Generating train data")

    start_time_batch = time.time()

    # Here only the structure is important
    self.similarity = Compute_Similarity(self.ICM.T, shrink=0, topK=self.topK, normalize=False)
    S_matrix_contentKNN = self.similarity.compute_similarity()
    S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Collaborative S density: {:.2E}, nonzero cells {}".format(
                       self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
                       self.S_matrix_target.nnz))

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Content S density: {:.2E}, nonzero cells {}".format(
                       S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
                       S_matrix_contentKNN.nnz))

    if self.normalize_similarity:
        # Compute the sum of squared feature values, then take the square root
        sum_of_squared_features = np.array(self.ICM.T.power(2).sum(axis=0)).ravel()
        sum_of_squared_features = np.sqrt(sum_of_squared_features)

    num_common_coordinates = 0

    estimated_n_samples = int(S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota) * 1.2)

    self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
    self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

    num_samples = 0

    for row_index in range(self.n_items):

        start_pos_content = S_matrix_contentKNN.indptr[row_index]
        end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

        content_coordinates = S_matrix_contentKNN.indices[start_pos_content:end_pos_content]

        start_pos_target = self.S_matrix_target.indptr[row_index]
        end_pos_target = self.S_matrix_target.indptr[row_index + 1]

        target_coordinates = self.S_matrix_target.indices[start_pos_target:end_pos_target]

        # Check whether the content coordinate is associated to a non-zero target value.
        # If true, the content coordinate has a collaborative non-zero value;
        # if false, the content coordinate has a collaborative zero value.
        is_common = np.in1d(content_coordinates, target_coordinates)

        num_common_in_current_row = is_common.sum()
        num_common_coordinates += num_common_in_current_row

        for index in range(len(is_common)):

            if num_samples == estimated_n_samples:
                dataBlock = 1000000
                self.row_list = np.concatenate((self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                self.col_list = np.concatenate((self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                self.data_list = np.concatenate((self.data_list, np.zeros(dataBlock, dtype=np.float64)))

            if is_common[index]:
                # If the cell exists in the target matrix, add its value;
                # otherwise it will remain zero with a certain probability

                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index

                new_data_value = self.S_matrix_target[row_index, col_index]

                if self.normalize_similarity:
                    new_data_value *= sum_of_squared_features[row_index] * sum_of_squared_features[col_index]

                self.data_list[num_samples] = new_data_value

                num_samples += 1

            elif np.random.rand() <= self.add_zeros_quota:

                col_index = content_coordinates[index]

                self.row_list[num_samples] = row_index
                self.col_list[num_samples] = col_index
                self.data_list[num_samples] = 0.0

                num_samples += 1

        if time.time() - start_time_batch > 30 or num_samples == S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota):

            print(self.RECOMMENDER_NAME + ": Generating train data. Sample {} ( {:.2f} %) ".format(
                num_samples,
                num_samples / (S_matrix_contentKNN.nnz * (1 + self.add_zeros_quota)) * 100))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_batch = time.time()

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Content S structure has {} out of {} ( {:.2f}%) nonzero collaborative cells".format(
                       num_common_coordinates, S_matrix_contentKNN.nnz,
                       num_common_coordinates / S_matrix_contentKNN.nnz * 100))

    # Discard the unused cells at the end of the arrays
    self.row_list = self.row_list[:num_samples]
    self.col_list = self.col_list[:num_samples]
    self.data_list = self.data_list[:num_samples]

    data_nnz = sum(np.array(self.data_list) != 0)
    data_sum = sum(self.data_list)

    collaborative_nnz = self.S_matrix_target.nnz
    collaborative_sum = sum(self.S_matrix_target.data)

    self._writeLog(self.RECOMMENDER_NAME +
                   ": Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
                   "average over all collaborative data is {:.2E}".format(
                       data_sum, data_sum / data_nnz, collaborative_sum / collaborative_nnz))
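# The core of the sampling loop above is matching the nonzero column indices of one CSR row
# against the corresponding row of the target matrix. A tiny standalone sketch of that
# np.in1d check, with made-up index arrays:

import numpy as np

# Hypothetical nonzero column indices of one row in the content and target matrices
content_coordinates = np.array([2, 5, 7, 11])
target_coordinates = np.array([5, 11, 40])

# True where the content coordinate also has a nonzero collaborative (target) value
is_common = np.in1d(content_coordinates, target_coordinates)

print(is_common)         # [False  True False  True]
print(is_common.sum())   # 2 coordinates are shared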
def compute_similarity(self, start_col=None, end_col=None, block_size=100):
    """
    Compute the similarity for the given dataset
    :param self:
    :param start_col: column to begin with
    :param end_col: column to stop before, end_col is excluded
    :return:
    """

    values = []
    rows = []
    cols = []

    start_time = time.time()
    start_time_print_batch = start_time
    processedItems = 0

    if self.adjusted_cosine:
        self.applyAdjustedCosine()

    elif self.pearson_correlation:
        self.applyPearsonCorrelation()

    elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
        self.useOnlyBooleanInteractions()

    # We explore the matrix column-wise
    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    # Compute sum of squared values to be used in normalization
    sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

    # Tanimoto does not require the square root to be applied
    if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
        sumOfSquared = np.sqrt(sumOfSquared)

    if self.asymmetric_cosine:
        sumOfSquared_to_1_minus_alpha = np.power(sumOfSquared, 2 * (1 - self.asymmetric_alpha))
        sumOfSquared_to_alpha = np.power(sumOfSquared, 2 * self.asymmetric_alpha)

    self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

    start_col_local = 0
    end_col_local = self.n_columns

    if start_col is not None and start_col > 0 and start_col < self.n_columns:
        start_col_local = start_col

    if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
        end_col_local = end_col

    start_col_block = start_col_local
    this_block_size = 0

    # Compute all similarities for each item using vectorization
    while start_col_block < end_col_local:

        # Add previous block size
        processedItems += this_block_size

        end_col_block = min(start_col_block + block_size, end_col_local)
        this_block_size = end_col_block - start_col_block

        if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
            columnPerSec = processedItems / (time.time() - start_time)

            print("Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min".format(
                processedItems,
                processedItems / (end_col_local - start_col_local) * 100,
                columnPerSec,
                (time.time() - start_time) / 60))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print_batch = time.time()

        # All data points for a given item
        item_data = self.dataMatrix[:, start_col_block:end_col_block]
        item_data = item_data.toarray().squeeze()

        if self.use_row_weights:
            # item_data = np.multiply(item_data, self.row_weights)
            # item_data = item_data.T.dot(self.row_weights_diag).T
            this_block_weights = self.dataMatrix_weighted.T.dot(item_data)

        else:
            # Compute item similarities
            this_block_weights = self.dataMatrix.T.dot(item_data)

        for col_index_in_block in range(this_block_size):

            if this_block_size == 1:
                this_column_weights = this_block_weights
            else:
                this_column_weights = this_block_weights[:, col_index_in_block]

            columnIndex = col_index_in_block + start_col_block
            this_column_weights[columnIndex] = 0.0

            # Apply normalization and shrinkage, ensure denominator != 0
            if self.normalize:

                if self.asymmetric_cosine:
                    denominator = sumOfSquared_to_alpha[columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                else:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared + self.shrink + 1e-6

                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # Apply the specific denominator for Tanimoto
            elif self.tanimoto_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.dice_coefficient:
                denominator = sumOfSquared[columnIndex] + sumOfSquared + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            elif self.tversky_coefficient:
                denominator = this_column_weights + \
                              (sumOfSquared[columnIndex] - this_column_weights) * self.tversky_alpha + \
                              (sumOfSquared - this_column_weights) * self.tversky_beta + self.shrink + 1e-6
                this_column_weights = np.multiply(this_column_weights, 1 / denominator)

            # If no normalization or tanimoto is selected, apply only shrink
            elif self.shrink != 0:
                this_column_weights = this_column_weights / self.shrink

            # this_column_weights = this_column_weights.toarray().ravel()

            if self.TopK == 0:
                self.W_dense[:, columnIndex] = this_column_weights

            else:
                # Sort indices and select TopK.
                # Sorting is done in three steps. Faster than plain np.argsort for a high number of items:
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build the sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

        start_col_block += block_size

    # End while on columns

    if self.TopK == 0:
        return self.W_dense

    else:
        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)

        return W_sparse
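# To make the normalization step concrete, a small self-contained sketch of shrunk cosine
# similarity between one item column and all others, mirroring the sumOfSquared bookkeeping
# used in compute_similarity; the toy matrix and shrink value are assumptions.

import numpy as np
import scipy.sparse as sps

# Toy user-item matrix: 4 users x 3 items
X = sps.csc_matrix(np.array([[5.0, 3.0, 0.0],
                             [4.0, 0.0, 4.0],
                             [0.0, 2.0, 5.0],
                             [1.0, 1.0, 0.0]]))
shrink = 10.0

# Per-item Euclidean norms, as in the sumOfSquared precomputation
item_norms = np.sqrt(np.asarray(X.power(2).sum(axis=0)).ravel())

column_index = 0
dot_products = np.asarray(X.T.dot(X[:, column_index]).todense()).ravel()
dot_products[column_index] = 0.0   # exclude self-similarity

# Shrunk cosine: dot product divided by the product of norms plus the shrink term
similarities = dot_products / (item_norms[column_index] * item_norms + shrink + 1e-6)
print(similarities)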