Example #1
    def __init__(self, URM_train, UCM, S_matrix_target):

        super(User_CFW_D_Similarity_Linalg, self).__init__(URM_train)

        if (URM_train.shape[0] != UCM.shape[0]):
            raise ValueError(
                "Number of users not consistent. URM contains {} but UCM contains {}"
                .format(URM_train.shape[0], UCM.shape[0]))

        if (S_matrix_target.shape[0] != S_matrix_target.shape[1]):
            raise ValueError(
                "User similarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if (S_matrix_target.shape[0] != UCM.shape[0]):
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], UCM.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
        self.UCM = check_matrix(UCM, 'csr')

        self.n_items = self.URM_train.shape[1]
        self.n_users = self.URM_train.shape[0]
        self.n_features = self.UCM.shape[1]

        self.sparse_weights = True
Example #2
    def fit(self, topK=50, shrink=100, similarity='cosine', normalize=True, feature_weighting="none",
            **similarity_args):

        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'".format(
                    self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        similarity = Compute_Similarity(self.URM_train.T, shrink=shrink, topK=topK, normalize=normalize,
                                        similarity=similarity, **similarity_args)

        self.W_sparse = similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Example #3
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            interactions_feature_weighting="none",
            **similarity_args):

        if interactions_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES,
                        interactions_feature_weighting))

        if interactions_feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif interactions_feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        super().fit(topK=topK,
                    shrink=shrink,
                    similarity=similarity,
                    normalize=normalize,
                    feature_weighting=feature_weighting,
                    **similarity_args)
Example #4
    def get_S_incremental_and_set_W(self):

        self.S_incremental = self.cythonEpoch.get_S()

        if self.train_with_sparse_weights:
            self.W_sparse = self.S_incremental
            self.W_sparse = check_matrix(self.W_sparse, format='csr')
        else:
            self.W_sparse = similarityMatrixTopK(self.S_incremental, k=self.topK)
            self.W_sparse = check_matrix(self.W_sparse, format='csr')
Example #5
    def __init__(self, URM_train, Similarity_1, Similarity_2, verbose=True):
        super(ItemKNNSimilarityHybridRecommender,
              self).__init__(URM_train, verbose=verbose)

        if Similarity_1.shape != Similarity_2.shape:
            raise ValueError(
                "ItemKNNSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}"
                .format(Similarity_1.shape, Similarity_2.shape))

        # CSR is faster during evaluation
        self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
        self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
Example #6
    def fit(self, lambda_user=10, lambda_item=25):

        self.lambda_user = lambda_user
        self.lambda_item = lambda_item
        self.n_items = self.URM_train.shape[1]

        # convert to csc matrix for faster column-wise sum
        self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        # 1) global average
        self.mu = self.URM_train.data.sum(
            dtype=np.float32) / self.URM_train.data.shape[0]

        # 2) item average bias
        # compute the number of non-zero elements for each column
        col_nnz = np.diff(self.URM_train.indptr)

        # it is equivalent to:
        # col_nnz = X.indptr[1:] - X.indptr[:-1]
        # and it is **much faster** than
        # col_nnz = (X != 0).sum(axis=0)

        URM_train_unbiased = self.URM_train.copy()
        URM_train_unbiased.data -= self.mu
        self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz +
                                                           self.lambda_item)
        # convert the 2-d matrix to a 1-d array without making a copy
        self.item_bias = np.asarray(self.item_bias).ravel()

        # 3) user average bias
        # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.

        # first subtract the item biases from each column
        # then repeat each element of the item bias vector a number of times equal to col_nnz
        # and subtract it from the data vector
        URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

        # now convert the csc matrix to csr for efficient row-wise computation
        URM_train_unbiased_csr = URM_train_unbiased.tocsr()
        row_nnz = np.diff(URM_train_unbiased_csr.indptr)
        # finally, let's compute the bias
        self.user_bias = URM_train_unbiased_csr.sum(
            axis=1).ravel() / (row_nnz + self.lambda_user)

        # 4) precompute the item ranking by using the item bias only
        # the global average and user bias won't change the ranking, so there is no need to use them
        # self.item_ranking = np.argsort(self.item_bias)[::-1]

        self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
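To make the bias computation above concrete, here is a minimal self-contained sketch (toy data, only numpy and scipy.sparse assumed) of the global average and the regularized item bias:

import numpy as np
import scipy.sparse as sps

# Toy URM: 3 users x 2 items, explicit ratings stored as non-zeros
URM = sps.csc_matrix(np.array([[5.0, 0.0],
                               [3.0, 4.0],
                               [0.0, 2.0]], dtype=np.float32))
lambda_item = 25

# 1) global average over the stored ratings only
mu = URM.data.sum(dtype=np.float32) / URM.data.shape[0]

# 2) regularized item bias: column-wise mean of the de-meaned ratings
col_nnz = np.diff(URM.indptr)
URM_unbiased = URM.copy()
URM_unbiased.data -= mu
item_bias = np.asarray(URM_unbiased.sum(axis=0)).ravel() / (col_nnz + lambda_item)

print(mu, item_bias)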
Example #7
    def applyPearsonCorrelation(self):
        """
        Remove from every data point the average for the corresponding column
        :return:
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

        interactionsPerCol = np.diff(self.dataMatrix.indptr)

        nonzeroCols = interactionsPerCol > 0
        sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

        colAverage = np.zeros_like(sumPerCol)
        colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[
            nonzeroCols]

        # Split in blocks to avoid duplicating the whole data structure
        start_col = 0
        end_col = 0

        blockSize = 1000

        while end_col < self.n_columns:

            end_col = min(self.n_columns, end_col + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
                np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

            start_col += blockSize
Example #8
    def applyAdjustedCosine(self):
        """
        Remove from every data point the average for the corresponding row
        :return:
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csr')

        interactionsPerRow = np.diff(self.dataMatrix.indptr)

        nonzeroRows = interactionsPerRow > 0
        sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

        rowAverage = np.zeros_like(sumPerRow)
        rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[
            nonzeroRows]

        # Split in blocks to avoid duplicating the whole data structure
        start_row = 0
        end_row = 0

        blockSize = 1000

        while end_row < self.n_rows:

            end_row = min(self.n_rows, end_row + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
                np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

            start_row += blockSize
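The block loop above amounts to subtracting each row's average from that row's stored values; a compact self-contained sketch of the same idea without the blocking follows (the blocking in the original only limits peak memory).

import numpy as np
import scipy.sparse as sps

X = sps.csr_matrix(np.array([[4.0, 0.0, 2.0],
                             [0.0, 5.0, 0.0]], dtype=np.float32))

interactionsPerRow = np.diff(X.indptr)                   # stored elements per row
sumPerRow = np.asarray(X.sum(axis=1)).ravel()

rowAverage = np.zeros_like(sumPerRow)
nonzeroRows = interactionsPerRow > 0
rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]

# Repeat each row average once per stored element of that row, then subtract in place
X.data -= np.repeat(rowAverage, interactionsPerRow)
print(X.toarray())                                       # each row's stored entries now sum to zero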
Example #9
    def __init__(self, URM_train, verbose=True):

        super(BaseRecommender, self).__init__()

        self.URM_train = check_matrix(URM_train.copy(),
                                      'csr',
                                      dtype=np.float32)
        self.URM_train.eliminate_zeros()

        self.n_users, self.n_items = self.URM_train.shape
        self.verbose = verbose

        self.filterTopPop = False
        self.filterTopPop_ItemsID = np.array([], dtype=int)

        self.items_to_ignore_flag = False
        self.items_to_ignore_ID = np.array([], dtype=int)

        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        if self._cold_user_mask.any():
            self._print("URM Detected {} ({:.2f} %) cold users.".format(
                self._cold_user_mask.sum(),
                self._cold_user_mask.sum() / self.n_users * 100))

        self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

        if self._cold_item_mask.any():
            self._print("URM Detected {} ({:.2f} %) cold items.".format(
                self._cold_item_mask.sum(),
                self._cold_item_mask.sum() / self.n_items * 100))
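A quick sketch of the cold-user and cold-item masks used above: a CSR row (or CSC column) with no stored interactions has two equal consecutive indptr entries, so np.ediff1d marks it with a zero.

import numpy as np
import scipy.sparse as sps

URM = sps.csr_matrix(np.array([[1, 0, 1],
                               [0, 0, 0],
                               [0, 1, 0]], dtype=np.float32))

cold_user_mask = np.ediff1d(URM.indptr) == 0             # users with no interactions
cold_item_mask = np.ediff1d(URM.tocsc().indptr) == 0     # items with no interactions

print(cold_user_mask)                                    # [False  True False]
print(cold_item_mask)                                    # [False False False]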
Example #10
def remove_empty_rows_and_cols(URM, ICM = None):

    URM = check_matrix(URM, "csr")
    rows = URM.indptr
    numRatings = np.ediff1d(rows)
    user_mask = numRatings >= 1

    URM = URM[user_mask,:]

    cols = URM.tocsc().indptr
    numRatings = np.ediff1d(cols)
    item_mask = numRatings >= 1

    URM = URM[:,item_mask]

    removedUsers = np.arange(0, len(user_mask))[np.logical_not(user_mask)]
    removedItems = np.arange(0, len(item_mask))[np.logical_not(item_mask)]

    if ICM is not None:

        ICM = ICM[item_mask,:]

        return URM.tocsr(), ICM.tocsr(), removedUsers, removedItems


    return URM.tocsr(), removedUsers, removedItems
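The same masking logic can be exercised on a toy matrix; this is a minimal self-contained sketch of what remove_empty_rows_and_cols does, not a call to the function itself.

import numpy as np
import scipy.sparse as sps

URM = sps.csr_matrix(np.array([[1, 0, 0],
                               [0, 0, 0],
                               [2, 0, 3]], dtype=np.float32))

user_mask = np.ediff1d(URM.indptr) >= 1                  # keep users with at least one rating
URM = URM[user_mask, :]

item_mask = np.ediff1d(URM.tocsc().indptr) >= 1          # keep items with at least one rating
URM = URM[:, item_mask]

removedUsers = np.where(np.logical_not(user_mask))[0]
removedItems = np.where(np.logical_not(item_mask))[0]
print(URM.toarray(), removedUsers, removedItems)         # user 1 and item 1 are dropped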
Example #11
    def _build_confidence_matrix(self, confidence_scaling):
        if confidence_scaling == 'linear':
            self.C = self._linear_scaling_confidence()
        else:
            self.C = self._log_scaling_confidence()

        self.C_iu = check_matrix(self.C.transpose().copy(),
                                 format="csr",
                                 dtype=np.float32)
Example #12
    def __init__(self, URM_recommendations_items):
        super(PredefinedListRecommender, self).__init__()

        # store the precomputed recommendation lists in CSR format
        self.URM_recommendations = check_matrix(URM_recommendations_items,
                                                'csr',
                                                dtype=int)

        self.URM_train = sps.csr_matrix((self.URM_recommendations.shape))
Example #13
    def fit(self, topK=100, alpha=0.5):

        self.topK = topK
        self.alpha = alpha

        W_sparse = self.Similarity_1 * self.alpha + self.Similarity_2 * (
            1 - self.alpha)

        self.W_sparse = similarityMatrixTopK(W_sparse, k=self.topK)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
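A minimal sketch of the convex combination computed in this fit (the top-K pruning done by similarityMatrixTopK is left out; random toy similarities are used, so this is only illustrative).

import numpy as np
import scipy.sparse as sps

S1 = sps.random(5, 5, density=0.3, format="csr", dtype=np.float32)
S2 = sps.random(5, 5, density=0.3, format="csr", dtype=np.float32)
alpha = 0.5

W_sparse = S1 * alpha + S2 * (1 - alpha)                 # element-wise convex combination
print(W_sparse.toarray())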
Example #14
def apply_feature_weighting(matrix, feature_weighting="none"):
    from course_lib.Base.IR_feature_weighting import okapi_BM_25, TF_IDF
    from course_lib.Base.Recommender_utils import check_matrix

    FEATURE_WEIGHTING_VALUES = ["BM25", "TF-IDF", "none"]

    if feature_weighting not in FEATURE_WEIGHTING_VALUES:
        raise ValueError(
            "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
            .format(FEATURE_WEIGHTING_VALUES, feature_weighting))

    if feature_weighting == "BM25":
        matrix = matrix.astype(np.float32)
        matrix = okapi_BM_25(matrix)
        matrix = check_matrix(matrix, 'csr')
    elif feature_weighting == "TF-IDF":
        matrix = matrix.astype(np.float32)
        matrix = TF_IDF(matrix)
        matrix = check_matrix(matrix, 'csr')
    return matrix
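A short usage sketch for apply_feature_weighting, assuming the function above is in scope and that the course_lib package is installed so its internal imports resolve.

import numpy as np
import scipy.sparse as sps

URM = sps.random(100, 50, density=0.05, format="csr", dtype=np.float32)

URM_bm25 = apply_feature_weighting(URM, feature_weighting="BM25")
URM_tfidf = apply_feature_weighting(URM, feature_weighting="TF-IDF")
URM_raw = apply_feature_weighting(URM, feature_weighting="none")    # returned unchanged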
Example #15
    def precompute_best_item_indices(self, URM: sps.csr_matrix):
        URM = URM.copy()
        if self.feature_weighting == "BM25":
            URM = URM.astype(np.float32)
            URM = okapi_BM_25(URM)
            URM = check_matrix(URM, 'csr')

        elif self.feature_weighting == "TF-IDF":
            URM = URM.astype(np.float32)
            URM = TF_IDF(URM)
            URM = check_matrix(URM, 'csr')

        similarity = Compute_Similarity(URM,
                                        shrink=self.shrink,
                                        topK=self.topK,
                                        normalize=self.normalize,
                                        similarity="cosine")
        similarity_matrix = similarity.compute_similarity()
        self.sorted_indices = np.array(
            np.argsort(-similarity_matrix.todense(), axis=1))
Example #16
    def test_mse(self):
        curr_item = 0
        URM_train = check_matrix(self.urm1, 'csc', dtype=np.float32)
        target_column = URM_train[:, curr_item].toarray()

        start_pos = URM_train.indptr[curr_item]
        end_pos = URM_train.indptr[curr_item + 1]
        URM_train.data[start_pos: end_pos] = 0.0

        loss = MSELoss(only_positive=False)
        qubo = loss.get_qubo_problem(urm=URM_train, target_column=target_column)

        self.assertEqual(qubo.tolist(), [[0, 0, 0], [0, -7, 10], [0, 10, -9]])
Example #17
    def test_non_zero_sim_norm_mse(self):
        curr_item = 0
        URM_train = check_matrix(self.urm1, 'csc', dtype=np.float32)
        target_column = URM_train[:, curr_item].toarray()

        start_pos = URM_train.indptr[curr_item]
        end_pos = URM_train.indptr[curr_item + 1]
        URM_train.data[start_pos: end_pos] = 0.0

        loss = NormMSELoss(only_positive=True, is_simplified=True)
        qubo = loss.get_qubo_problem(urm=URM_train, target_column=target_column)

        self.assertEqual(qubo.tolist(), [[0, 0, 0], [-16, -8, -6], [-26, -16, -9]])
Example #18
    def fit(self, W_sparse, selectTopK=False, topK=100):

        assert W_sparse.shape[0] == W_sparse.shape[1],\
            "ItemKNNCustomSimilarityRecommender: W_sparse matrice is not square. Current shape is {}".format(W_sparse.shape)

        assert self.URM_train.shape[1] == W_sparse.shape[0],\
            "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
            "The number of columns in URM_train must be equal to the rows in W_sparse. " \
            "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

        if selectTopK:
            W_sparse = similarityMatrixTopK(W_sparse, k=topK)

        self.W_sparse = check_matrix(W_sparse, format='csr')
Example #19
    def fit(self,
            topK=50,
            shrink=100,
            normalize=True,
            feature_weighting="none"):

        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train.T).T
            self.URM_train = check_matrix(self.URM_train, 'csr')

        denominator = 1 if shrink == 0 else shrink

        self.W_sparse = self.URM_train.T.dot(
            self.URM_train) * (1 / denominator)

        if self.topK >= 0:
            self.W_sparse = userSimilarityMatrixTopK(self.W_sparse,
                                                     k=self.topK).tocsr()

        if normalize:
            self.W_sparse = normalize_sk(self.W_sparse, norm="l2", axis=1)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Example #20
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):

        self.topK = topK
        self.topComputeK = topK + len(self.cold_users)
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = okapi_BM_25(self.UCM_train)

        elif feature_weighting == "TF-IDF":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = TF_IDF(self.UCM_train)

        similarity = Compute_Similarity(self.UCM_train.T,
                                        shrink=shrink,
                                        topK=self.topComputeK,
                                        normalize=normalize,
                                        similarity=similarity,
                                        **similarity_args)

        self.W_sparse = similarity.compute_similarity()
        self.W_sparse = self.W_sparse.tocsc()

        for user in self.cold_users:
            self.W_sparse.data[self.W_sparse.indptr[user]:
                               self.W_sparse.indptr[user + 1]] = 0

        self.W_sparse.eliminate_zeros()
        self.W_sparse = self.W_sparse.tocsr()

        self.W_sparse = similarityMatrixTopK(self.W_sparse,
                                             k=self.topK).tocsr()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')

        # Add identity matrix to the recommender
        self.recommender.W_sparse = self.recommender.W_sparse + sps.identity(
            self.recommender.W_sparse.shape[0], format="csr")
Example #21
def userSimilarityMatrixTopK(user_weights, k=100):
    """
    The function selects the TopK most similar elements, column-wise

    :param user_weights:
    :param k:
    :return:
    """

    assert (user_weights.shape[0] == user_weights.shape[1]
            ), "selectTopK: UserWeights is not a square matrix"

    n_users = user_weights.shape[1]
    k = min(k, n_users)

    # iterate over each column and keep only the top-k most similar users
    data, rows_indices, cols_indptr = [], [], []

    user_weights = check_matrix(user_weights, format='csc', dtype=np.float32)

    for user_idx in range(n_users):

        cols_indptr.append(len(data))

        start_position = user_weights.indptr[user_idx]
        end_position = user_weights.indptr[user_idx + 1]

        column_data = user_weights.data[start_position:end_position]
        column_row_index = user_weights.indices[start_position:end_position]

        if min(k, column_data.size) == k:
            top_k_idx = np.argpartition(-column_data,
                                        kth=min(k, column_data.size - 1))[:k]
        else:
            top_k_idx = np.ones(column_data.size, dtype=bool)

        data.extend(column_data[top_k_idx])
        rows_indices.extend(column_row_index[top_k_idx])

    cols_indptr.append(len(data))

    # Build the matrix column-wise in CSC format from the collected top-k entries
    W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr),
                              shape=(n_users, n_users),
                              dtype=np.float32)

    return W_sparse
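A usage sketch for userSimilarityMatrixTopK on a toy user-user weight matrix, assuming the function above and its check_matrix helper are in scope; only the two largest weights per column are kept.

import numpy as np
import scipy.sparse as sps

user_weights = sps.csr_matrix(np.array([[0.0, 0.9, 0.1, 0.4],
                                        [0.9, 0.0, 0.8, 0.2],
                                        [0.1, 0.8, 0.0, 0.7],
                                        [0.4, 0.2, 0.7, 0.0]], dtype=np.float32))

W_topk = userSimilarityMatrixTopK(user_weights, k=2)
print(W_topk.toarray())                                  # each column retains its 2 largest entries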
Example #22
    def fit(self,
            l1_ratio=0.1,
            positive_only=True,
            topK=100,
            workers=multiprocessing.cpu_count()):

        assert l1_ratio >= 0 and l1_ratio <= 1, "SLIM_ElasticNet: l1_ratio must be between 0 and 1, provided value was {}".format(
            l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        self.workers = workers

        self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)
        n_items = self.URM_train.shape[1]
        # fit item's factors in parallel

        # partial object binding the fixed part of the input to the per-item fitting function
        _pfit = partial(self._partial_fit, X=self.URM_train, topK=self.topK)

        # create a pool with the requested number of worker processes
        pool = Pool(processes=self.workers)

        # run the pool, passing the function (with its fixed inputs)
        # and the remaining, varying parameter
        res = pool.map(_pfit, np.arange(n_items))

        # res contains a vector of (values, rows, cols) tuples
        values, rows, cols = [], [], []
        for values_, rows_, cols_ in res:
            values.extend(values_)
            rows.extend(rows_)
            cols.extend(cols_)

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix((values, (rows, cols)),
                                       shape=(n_items, n_items),
                                       dtype=np.float32)
Example #23
def removeFeatures(ICM, minOccurrence = 5, maxPercOccurrence = 0.30, reconcile_mapper = None):
    """
    The function removes features occurring in fewer than minOccurrence items
    or in more than the maximum allowed percentage of items. The shape of the ICM is reduced by deleting features.
    :param ICM:
    :param minOccurrence:
    :param maxPercOccurrence:
    :param reconcile_mapper: DICT mapper [token] -> index
    :return: ICM
    :return: deletedFeatures
    :return: DICT mapper [token] -> index
    """

    ICM = check_matrix(ICM, 'csc')

    n_items = ICM.shape[0]

    cols = ICM.indptr
    numOccurrences = np.ediff1d(cols)

    feature_mask = np.logical_and(numOccurrences >= minOccurrence, numOccurrences <= n_items*maxPercOccurrence)

    ICM = ICM[:,feature_mask]

    deletedFeatures = np.arange(0, len(feature_mask))[np.logical_not(feature_mask)]

    print("RemoveFeatures: removed {} features with less then {} occurrencies, removed {} features with more than {} occurrencies".format(
        sum(numOccurrences < minOccurrence), minOccurrence,
        sum(numOccurrences > n_items*maxPercOccurrence), int(n_items*maxPercOccurrence)
    ))

    if reconcile_mapper is not None:
        reconcile_mapper = reconcile_mapper_with_removed_tokens(reconcile_mapper, deletedFeatures)

        return ICM, deletedFeatures, reconcile_mapper


    return ICM, deletedFeatures
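A usage sketch for removeFeatures on a toy ICM, assuming the function above and check_matrix are in scope; feature 0 occurs only once and feature 2 occurs in every item, so both are removed.

import numpy as np
import scipy.sparse as sps

ICM = sps.csr_matrix(np.array([[1, 1, 1],
                               [0, 1, 1],
                               [0, 0, 1],
                               [0, 1, 1],
                               [0, 1, 1],
                               [0, 0, 1]], dtype=np.float32))

ICM_filtered, deletedFeatures = removeFeatures(ICM, minOccurrence=2, maxPercOccurrence=0.8)
print(ICM_filtered.shape, deletedFeatures)               # (6, 1) [0 2]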
Example #24
    def fit(self, agg_strategy="FIRST", filter_sample_method="NONE", topK=5, alpha_multiplier=0,
            constraint_multiplier=1, chain_multiplier=1, filter_items_method="NONE", filter_items_n=100,
            num_reads=100, **filter_items_parameters):
        """
        It fits the data (i.e. URM_train) by solving an optimization problem for each item. Each optimization problem
        is built from URM_train with the target column removed, plus the target column itself, and is turned into a
        QUBO through "transform_fn" together with some regularization terms; the QUBO is then solved by the solver
        given at the initialization of the class.

        Then by using the samples collected from the solver, it builds the item-similarity matrix.

        :param agg_strategy: the post-processing aggregation to be used on the samples
        :param filter_sample_method: the filter technique used before the post-processing aggregation
        :param topK: a regulator number that indicates the number of selected variables forced during the optimization
        :param alpha_multiplier: a multiplier number applied on the constraint of the sparsity regulator term
        :param constraint_multiplier: a multiplier number applied on the constraint strength of the variable
                                      selection regulator
        :param chain_multiplier: a multiplier number applied on the chain strength of the embedding
        :param filter_items_method: name of the filtering method to select a set of items for the resolution of the
                                    optimization problem
        :param filter_items_n: number of items to be selected by the filtering method
        :param num_reads: number of samples to compute from the solver
        :param filter_items_parameters: other parameters regarding the filter items method
        """
        self._check_fit_parameters(agg_strategy, filter_items_method, filter_sample_method)
        if filter_items_method == "COSINE":
            self.FILTER_ITEMS_METHODS["COSINE"] = ItemSelectorByCosineSimilarity(**filter_items_parameters)
        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]
        item_pop = np.array((URM_train > 0).sum(axis=0)).flatten()

        # Label the variables so that they sort from 0 to n_items - 1, zero-padding each index to the
        # number of digits of the largest item id
        leading_zeros = len(str(n_items - 1))
        variables = ["a{:0{}d}".format(i, leading_zeros) for i in range(n_items)]

        if self.to_resume:
            start_item = self.df_responses[self.ITEM_ID_COLUMN_NAME].max()
        else:
            self.df_responses = pd.DataFrame()
            start_item = 0

        self.FILTER_ITEMS_METHODS[filter_items_method].precompute_best_item_indices(URM_train)
        matrix_builder = IncrementalSparseMatrix(n_rows=n_items, n_cols=n_items)

        for curr_item in tqdm(range(start_item, n_items), desc="%s: Computing W_sparse matrix" % self.RECOMMENDER_NAME):
            # get the target column
            target_column = URM_train[:, curr_item].toarray()

            # set the "curr_item"-th column of URM_train to zero
            start_pos = URM_train.indptr[curr_item]
            end_pos = URM_train.indptr[curr_item + 1]
            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # select items to be used in the QUBO optimization problem
            URM = URM_train.copy()
            URM, mapping_array = self.FILTER_ITEMS_METHODS[filter_items_method].filter_items(URM, target_column,
                                                                                             curr_item,
                                                                                             filter_items_n)
            n_variables = len(mapping_array)

            # get BQM/QUBO problem for the current item
            qubo = self.LOSSES[self.obj_function].get_qubo_problem(URM, target_column)
            qubo = qubo + (np.log1p(item_pop[curr_item]) ** 2 + 1) * alpha_multiplier * (np.max(qubo) - np.min(qubo)) \
                   * np.identity(n_variables)
            if topK > -1:
                constraint_strength = max(self.MIN_CONSTRAINT_STRENGTH,
                                          constraint_multiplier * (np.max(qubo) - np.min(qubo)))
                # avoid using the "combinations" function of dimod in order to speed up the computation
                qubo += -2 * constraint_strength * topK * np.identity(n_variables) + constraint_strength * np.ones(
                    (n_variables, n_variables))

            # Generation of the BQM with qubo in a quicker way checked with some performance measuring. On a test of
            # 2000 n_items, this method is quicker w.r.t. from_numpy_matrix function of dimod
            bqm = dimod.BinaryQuadraticModel.empty(dimod.BINARY)
            bqm.add_variables_from(dict(zip(variables, np.diag(qubo))))

            for i in range(n_variables):
                values = np.array(qubo[i, i + 1:]).flatten() + np.array(qubo[i + 1:, i]).flatten()
                keys = [(variables[i], variables[j]) for j in range(i + 1, n_variables)]
                bqm.add_interactions_from(dict(zip(keys, values)))

            self._print("The BQM for item {} is {}".format(curr_item, bqm))

            # solve the problem with the solver
            try:
                if ("child_properties" in self.solver.properties and
                    self.solver.properties["child_properties"]["category"] == "qpu") \
                        or "qpu_properties" in self.solver.properties:
                    chain_strength = max(self.MIN_CONSTRAINT_STRENGTH,
                                         chain_multiplier * (np.max(qubo) - np.min(qubo)))
                    response = self.solver.sample(bqm, chain_strength=chain_strength, num_reads=num_reads)
                    self._print("Break chain percentage of item {} is {}"
                                .format(curr_item, list(response.data(fields=["chain_break_fraction"]))))
                    self._print("Timing of QPU is %s" % response.info["timing"])
                else:
                    response = self.solver.sample(bqm, num_reads=num_reads)

                self._print("The response for item {} is {}".format(curr_item, response.aggregate()))
            except OSError as err:
                traceback.print_exc()
                raise err

            # save response in self.responses if self.do_save_responses is True; otherwise apply post-processing
            # and put the results in the matrix builder
            response_df = response.to_pandas_dataframe()
            response_df[self.ITEM_ID_COLUMN_NAME] = curr_item
            if self.do_save_responses:
                self.df_responses = pd.concat([self.df_responses, response_df], ignore_index=True)
                self.mapping_matrix.append(mapping_array)
            else:
                self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1)
                self.add_sample_responses_to_matrix_builder(matrix_builder, agg_strategy, filter_sample_method,
                                                            response_df, curr_item, mapping_array)

            # restore URM_train
            URM_train.data[start_pos:end_pos] = current_item_data_backup

        if self.do_save_responses:
            self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1)
            self.W_sparse = self.build_similarity_matrix(self.df_responses, agg_strategy, filter_sample_method,
                                                         self.mapping_matrix)
        else:
            self.W_sparse = matrix_builder.get_SparseMatrix()
Example #25
    def fit(self,
            topK=100,
            alpha=1.,
            min_rating=0,
            implicit=False,
            normalize_similarity=False):

        self.topK = topK
        self.alpha = alpha
        self.min_rating = min_rating
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        #
        # if X.dtype != np.float32:
        #     print("P3ALPHA fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size,
                                              dtype=np.float32)

        self.Pui = sps.hstack([self.user_W_sparse, self.URM_train],
                              format="csr")
        self.Piu = sps.hstack([self.URM_train.T, self.item_W_sparse],
                              format="csr")
        self.P = sps.vstack([self.Pui, self.Piu], format="csr")

        # Pui is the row-normalized urm
        Pui = normalize(self.P.copy(), norm='l1', axis=1)

        # Piu is the column-normalized
        X_bool = self.P.copy()
        X_bool.data = np.ones(X_bool.data.size, np.float32)
        Piu = normalize(X_bool, norm='l1', axis=0)
        del (X_bool)

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200
        d_t = Piu

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[
                current_block_start_row:current_block_start_row +
                block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = similarity_block[row_in_block, :]
                row_data[current_block_start_row + row_in_block] = 0

                best = row_data.argsort()[::-1][:self.topK]

                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                for index in range(len(values_to_add)):

                    if numCells == len(rows):
                        rows = np.concatenate(
                            (rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate(
                            (cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate(
                            (values, np.zeros(dataBlock, dtype=np.float32)))

                    rows[numCells] = current_block_start_row + row_in_block
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]

                    numCells += 1

            if time.time() - start_time_printBatch > 60:
                self._print(
                    "Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}"
                    .format(
                        current_block_start_row,
                        100.0 * float(current_block_start_row) / Pui.shape[1],
                        (time.time() - start_time) / 60,
                        float(current_block_start_row) /
                        (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # Set rows, cols, values
        rows = rows[:numCells]
        cols = cols[:numCells]
        values = values[:numCells]

        self.W_sparse = sps.csr_matrix((values, (rows, cols)),
                                       shape=self.P.shape)

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Example #26
    def fit(self,
            user_topK=50,
            user_shrink=100,
            user_similarity_type='cosine',
            user_normalize=True,
            user_feature_weighting="none",
            user_asymmetric_alpha=0.5,
            item_topK=50,
            item_shrink=100,
            item_similarity_type='cosine',
            item_normalize=True,
            item_feature_weighting="none",
            item_asymmetric_alpha=0.5,
            interactions_feature_weighting="none"):

        if interactions_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES,
                        interactions_feature_weighting))

        if interactions_feature_weighting == "BM25":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = okapi_BM_25(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        elif interactions_feature_weighting == "TF-IDF":
            self.URM_train = self.URM_train.astype(np.float32)
            self.URM_train = TF_IDF(self.URM_train)
            self.URM_train = check_matrix(self.URM_train, 'csr')

        # User Similarity Computation
        self.user_topK = user_topK
        self.user_shrink = user_shrink

        if user_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, user_feature_weighting))

        if user_feature_weighting == "BM25":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = okapi_BM_25(self.UCM_train)

        elif user_feature_weighting == "TF-IDF":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = TF_IDF(self.UCM_train)

        kwargs = {"asymmetric_alpha": user_asymmetric_alpha}
        user_similarity_compute = Compute_Similarity(
            self.UCM_train.T,
            shrink=user_shrink,
            topK=user_topK,
            normalize=user_normalize,
            similarity=user_similarity_type,
            **kwargs)

        self.user_W_sparse = user_similarity_compute.compute_similarity()
        self.user_W_sparse = check_matrix(self.user_W_sparse, format='csr')

        # Item Similarity Computation
        self.item_topK = item_topK
        self.item_shrink = item_shrink

        if item_feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, item_feature_weighting))

        if item_feature_weighting == "BM25":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = okapi_BM_25(self.ICM_train)

        elif item_feature_weighting == "TF-IDF":
            self.ICM_train = self.ICM_train.astype(np.float32)
            self.ICM_train = TF_IDF(self.ICM_train)

        kwargs = {"asymmetric_alpha": item_asymmetric_alpha}
        item_similarity_compute = Compute_Similarity(
            self.ICM_train.T,
            shrink=item_shrink,
            topK=item_topK,
            normalize=item_normalize,
            similarity=item_similarity_type,
            **kwargs)

        self.item_W_sparse = item_similarity_compute.compute_similarity()
        self.item_W_sparse = check_matrix(self.item_W_sparse, format='csr')
Example #27
    def fit(self,
            alpha=1.,
            beta=0.6,
            min_rating=0,
            topK=100,
            implicit=False,
            normalize_similarity=True):

        self.alpha = alpha
        self.beta = beta
        self.min_rating = min_rating
        self.topK = topK
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        # if X.dtype != np.float32:
        #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size,
                                              dtype=np.float32)

        # Pui is the row-normalized urm
        Pui_raw = sps.hstack([self.URM_train, self.UCM_train], format="csr")
        Pui_raw = TF_IDF(Pui_raw).tocsr()
        Pui = normalize(Pui_raw, norm='l1', axis=1)

        # Piu is the column-normalized, "boolean" urm transposed
        # X_bool = self.URM_train.transpose(copy=True)
        X_bool = Pui_raw.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)

        # Taking the degree of each item to penalize top popular
        # Some rows might be zero, make sure their degree remains zero
        X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

        degree = np.zeros(Pui_raw.shape[1])

        nonZeroMask = X_bool_sum != 0.0

        degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

        # ATTENTION: axis is still 1 because the matrix was transposed before the normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del (X_bool)

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200
        d_t = Piu

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[
                current_block_start_row:current_block_start_row +
                block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = np.multiply(similarity_block[row_in_block, :],
                                       degree)
                row_data[current_block_start_row + row_in_block] = 0

                best = row_data.argsort()[::-1][:self.topK]

                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                for index in range(len(values_to_add)):

                    if numCells == len(rows):
                        rows = np.concatenate(
                            (rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate(
                            (cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate(
                            (values, np.zeros(dataBlock, dtype=np.float32)))

                    rows[numCells] = current_block_start_row + row_in_block
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]

                    numCells += 1

            if time.time() - start_time_printBatch > 60:
                self._print(
                    "Processed {} ( {:.2f}% ) in {:.2f} minutes. Rows per second: {:.0f}"
                    .format(
                        current_block_start_row,
                        100.0 * float(current_block_start_row) / Pui.shape[1],
                        (time.time() - start_time) / 60,
                        float(current_block_start_row) /
                        (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(Pui.shape[1], Pui.shape[1]))

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Example #28
    def _log_scaling_confidence(self):
        C = check_matrix(self.URM_train, format="csr", dtype=np.float32)
        C.data = self.alpha * np.log(1.0 + C.data / self.epsilon)
        return C
Example #29
    def _linear_scaling_confidence(self):
        C = check_matrix(self.URM_train, format="csr", dtype=np.float32)
        C.data = self.alpha * C.data
        return C
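The two confidence transforms above differ only in how the raw interaction values are scaled; here is a self-contained sketch of both, with alpha and epsilon as plain variables rather than recommender attributes.

import numpy as np
import scipy.sparse as sps

URM = sps.csr_matrix(np.array([[3.0, 0.0],
                               [1.0, 5.0]], dtype=np.float32))
alpha, epsilon = 40.0, 1.0

C_linear = URM.copy()
C_linear.data = alpha * C_linear.data                    # linear scaling: c = alpha * r

C_log = URM.copy()
C_log.data = alpha * np.log(1.0 + C_log.data / epsilon)  # log scaling: c = alpha * log(1 + r / eps)

print(C_linear.toarray())
print(C_log.toarray())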
Example #30
    def compute_similarity(self, start_col=None, end_col=None, block_size=100):
        """
        Compute the similarity for the given dataset
        :param self:
        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :param block_size: number of columns processed per block
        :return:
        """

        values = []
        rows = []
        cols = []

        start_time = time.time()
        start_time_print_batch = start_time
        processedItems = 0

        if self.adjusted_cosine:
            self.applyAdjustedCosine()

        elif self.pearson_correlation:
            self.applyPearsonCorrelation()

        elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
            self.useOnlyBooleanInteractions()

        # We explore the matrix column-wise
        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

        # Compute sum of squared values to be used in normalization
        sumOfSquared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

        # Tanimoto does not require the square root to be applied
        if not (self.tanimoto_coefficient or self.dice_coefficient
                or self.tversky_coefficient):
            sumOfSquared = np.sqrt(sumOfSquared)

        if self.asymmetric_cosine:
            sumOfSquared_to_1_minus_alpha = np.power(
                sumOfSquared, 2 * (1 - self.asymmetric_alpha))
            sumOfSquared_to_alpha = np.power(sumOfSquared,
                                             2 * self.asymmetric_alpha)

        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')

        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col > 0 and start_col < self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
            end_col_local = end_col

        start_col_block = start_col_local

        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray().squeeze()

            # If there is only one column, keep the last dimension from disappearing
            if item_data.ndim == 1:
                item_data = np.atleast_2d(item_data)

            if self.use_row_weights:
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)

            else:
                # Compute item similarities
                this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights
                else:
                    this_column_weights = this_block_weights[:,
                                                             col_index_in_block]

                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:

                    if self.asymmetric_cosine:
                        denominator = sumOfSquared_to_alpha[
                            columnIndex] * sumOfSquared_to_1_minus_alpha + self.shrink + 1e-6
                    else:
                        denominator = sumOfSquared[
                            columnIndex] * sumOfSquared + self.shrink + 1e-6

                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                # Apply the specific denominator for Tanimoto
                elif self.tanimoto_coefficient:
                    denominator = sumOfSquared[
                        columnIndex] + sumOfSquared - this_column_weights + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                elif self.dice_coefficient:
                    denominator = sumOfSquared[
                        columnIndex] + sumOfSquared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                elif self.tversky_coefficient:
                    denominator = this_column_weights + \
                                  (sumOfSquared[columnIndex] - this_column_weights)*self.tversky_alpha + \
                                  (sumOfSquared - this_column_weights)*self.tversky_beta + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights,
                                                      1 / denominator)

                # If no normalization or tanimoto is selected, apply only shrink
                elif self.shrink != 0:
                    this_column_weights = this_column_weights / self.shrink

                #this_column_weights = this_column_weights.toarray().ravel()

                # Sort indices and select TopK
                # Sorting is done in three steps. Faster than plain np.argsort for a large number of items
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (
                    -this_column_weights).argpartition(self.TopK -
                                                       1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(
                    -this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[
                    relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

            # Add previous block size
            processedItems += this_block_size

            if time.time() - start_time_print_batch >= 30 or end_col_block == end_col_local:
                columnPerSec = processedItems / (time.time() - start_time +
                                                 1e-9)

                print(
                    "Similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min"
                    .format(
                        processedItems, processedItems /
                        (end_col_local - start_col_local) * 100, columnPerSec,
                        (time.time() - start_time) / 60))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()

            start_col_block += block_size

        # End while on columns

        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)

        return W_sparse
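For reference, a small self-contained check of the shrunk cosine used above: the weight between columns i and j is dot(i, j) / (norm_i * norm_j + shrink + 1e-6), with the diagonal set to zero, which is what the block-wise loop computes when normalize is True and no asymmetric/Tanimoto option is active.

import numpy as np

X = np.array([[1.0, 0.0, 2.0],
              [0.0, 3.0, 1.0],
              [4.0, 1.0, 0.0]], dtype=np.float32)
shrink = 10.0

sumOfSquared = np.sqrt((X ** 2).sum(axis=0))             # column norms, as in the code above
W = X.T @ X / (np.outer(sumOfSquared, sumOfSquared) + shrink + 1e-6)
np.fill_diagonal(W, 0.0)
print(W)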