Code example #1
    def __init__(self, URM_train, ICM, S_matrix_target):

        super(HP3_Similarity_Cython, self).__init__(URM_train)

        if (URM_train.shape[1] != ICM.shape[0]):
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM.shape[0]))

        if (S_matrix_target.shape[0] != S_matrix_target.shape[1]):
            raise ValueError(
                "Items imilarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if (S_matrix_target.shape[0] != ICM.shape[0]):
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
        self.ICM = check_matrix(ICM, 'csr')

        self.n_features = self.ICM.shape[1]

        self.D_incremental = np.ones(self.n_features, dtype=np.float64)
        self.D_best = self.D_incremental.copy()
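Every snippet on this page relies on a check_matrix helper defined elsewhere in the repository. Its real implementation is not shown here; the following is only a minimal sketch of the behavior the snippets assume, namely format conversion plus an optional dtype cast:

import numpy as np
import scipy.sparse as sps

def check_matrix(X, format='csc', dtype=np.float32):
    # Assumed behavior: convert X to the requested scipy.sparse format
    # and cast its values to the requested dtype
    if format == 'csc':
        return sps.csc_matrix(X, dtype=dtype)
    elif format == 'csr':
        return sps.csr_matrix(X, dtype=dtype)
    elif format == 'npy':
        # 'npy' is used by these snippets to request a dense ndarray
        return np.asarray(X.todense() if sps.issparse(X) else X, dtype=dtype)
    raise ValueError("format not recognized: {}".format(format))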
Code example #2
    def __init__(self, URM_train, ICM, S_matrix_target):

        super(CFW_DVV_Similarity_Cython, self).__init__(URM_train)

        self.ICM = check_matrix(ICM, 'csr')
        self.n_features = self.ICM.shape[1]

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')
Code example #3
    def fit(self,
            topK=None,
            l2_norm=1e3,
            normalize_matrix=False,
            verbose=True):

        self.verbose = verbose

        start_time = time.time()
        self._print("Fitting model... ")

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
            self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # Gram matrix is X^t X, compute dot product
        similarity = Compute_Similarity(self.URM_train,
                                        shrink=0,
                                        topK=self.URM_train.shape[1],
                                        normalize=False,
                                        similarity="cosine")
        gram_matrix = similarity.compute_similarity().toarray()

        diag_indices = np.diag_indices(gram_matrix.shape[0])

        # The Compute_Similarity object ensures the diagonal of the similarity matrix is zero,
        # but in this case we need the diagonal as well: it is simply the item popularity
        item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
        gram_matrix[diag_indices] = item_popularity + l2_norm

        P = np.linalg.inv(gram_matrix)

        B = P / (-np.diag(P))

        B[diag_indices] = 0.0

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Fitting model... done in {:.2f} {}".format(
            new_time_value, new_time_unit))

        # Check whether the matrix should be saved in a sparse or a dense format
        # The matrix is considered sparse, regardless of any topK pruning, if its fraction of nonzero cells is below sparse_threshold_quota
        if topK is not None:
            B = similarityMatrixTopK(B, k=topK, verbose=False)

        if self._is_content_sparse_check(B):
            self._print("Detected model matrix to be sparse, changing format.")
            self.W_sparse = check_matrix(B, format='csr', dtype=np.float32)

        else:
            self.W_sparse = check_matrix(B, format='npy', dtype=np.float32)
            self._W_sparse_format_checked = True
            self._compute_item_score = self._compute_score_W_dense
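Example #3 appears to be the closed-form item-based model popularized as EASE^R (Steck, 2019): with the Gram matrix G = X^T X + λI and P = G^{-1}, the weights are B_ij = -P_ij / P_jj with a zeroed diagonal. A minimal dense sketch of that algebra on a toy interaction matrix (the data is an assumption, and the Gram matrix is built directly instead of via Compute_Similarity):

import numpy as np

# Toy binary user-item matrix, 4 users x 3 items (values are assumptions)
X = np.array([[1, 1, 0],
              [1, 0, 1],
              [0, 1, 1],
              [1, 1, 1]], dtype=np.float64)

l2_norm = 10.0

G = X.T @ X                             # Gram matrix
G[np.diag_indices_from(G)] += l2_norm   # ridge term on the diagonal
P = np.linalg.inv(G)

B = P / (-np.diag(P))                   # column j divided by -P[j, j]
B[np.diag_indices_from(B)] = 0.0        # zero self-similarity

scores = X @ B                          # item scores for every user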
Code example #4
    def get_S_incremental_and_set_W(self):

        self.S_incremental = self.cythonEpoch.get_S()

        if self.train_with_sparse_weights:
            self.W_sparse = self.S_incremental
            self.W_sparse = check_matrix(self.W_sparse, format='csr')
        else:
            self.W_sparse = similarityMatrixTopK(self.S_incremental, k = self.topK)
            self.W_sparse = check_matrix(self.W_sparse, format='csr')
Code example #5
    def __init__(self, URM_train, Similarity_1, Similarity_2, verbose=True):
        super(ItemKNNSimilarityHybridRecommender,
              self).__init__(URM_train, verbose=verbose)

        if Similarity_1.shape != Similarity_2.shape:
            raise ValueError(
                "ItemKNNSimilarityHybridRecommender: similarities have different size, S1 is {}, S2 is {}"
                .format(Similarity_1.shape, Similarity_2.shape))

        # CSR is faster during evaluation
        self.Similarity_1 = check_matrix(Similarity_1.copy(), 'csr')
        self.Similarity_2 = check_matrix(Similarity_2.copy(), 'csr')
Code example #6
    def fit(self, lambda_user=10, lambda_item=25):

        self.lambda_user = lambda_user
        self.lambda_item = lambda_item
        self.n_items = self.URM_train.shape[1]

        # convert to csc matrix for faster column-wise sum
        self.URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        # 1) global average
        self.mu = self.URM_train.data.sum(
            dtype=np.float32) / self.URM_train.data.shape[0]

        # 2) item average bias
        # compute the number of non-zero elements for each column
        col_nnz = np.diff(self.URM_train.indptr)

        # it is equivalent to:
        # col_nnz = X.indptr[1:] - X.indptr[:-1]
        # and it is **much faster** than
        # col_nnz = (X != 0).sum(axis=0)

        URM_train_unbiased = self.URM_train.copy()
        URM_train_unbiased.data -= self.mu
        self.item_bias = URM_train_unbiased.sum(axis=0) / (col_nnz +
                                                           self.lambda_item)
        self.item_bias = np.asarray(self.item_bias).ravel()  # converts the 2-d matrix to a 1-d array without any copy

        # 3) user average bias
        # NOTE: the user bias is *useless* for the sake of ranking items. We just show it here for educational purposes.

        # first subtract the item biases from each column
        # then repeat each element of the item bias vector a number of times equal to col_nnz
        # and subtract it from the data vector
        URM_train_unbiased.data -= np.repeat(self.item_bias, col_nnz)

        # now convert the csc matrix to csr for efficient row-wise computation
        URM_train_unbiased_csr = URM_train_unbiased.tocsr()
        row_nnz = np.diff(URM_train_unbiased_csr.indptr)
        # finally, let's compute the bias
        self.user_bias = URM_train_unbiased_csr.sum(
            axis=1).ravel() / (row_nnz + self.lambda_user)

        # 4) precompute the item ranking by using the item bias only
        # the global average and user bias won't change the ranking, so there is no need to use them
        # self.item_ranking = np.argsort(self.item_bias)[::-1]

        self.URM_train = check_matrix(self.URM_train, 'csr', dtype=np.float32)
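The item bias above is a shrunk average: the column sum of mean-centered ratings divided by the rating count plus lambda_item. A toy check of that computation (the 2x3 rating matrix is an assumption):

import numpy as np
import scipy.sparse as sps

# Toy explicit URM: 2 users x 3 items (values are assumptions)
URM = sps.csc_matrix(np.array([[5.0, 3.0, 0.0],
                               [4.0, 0.0, 1.0]], dtype=np.float32))

mu = URM.data.sum(dtype=np.float32) / URM.data.shape[0]   # global average = 3.25
col_nnz = np.diff(URM.indptr)                             # ratings per item: [2, 1, 1]

unbiased = URM.copy()
unbiased.data -= mu                                       # mean-center the stored ratings

lambda_item = 25
item_bias = np.asarray(unbiased.sum(axis=0)).ravel() / (col_nnz + lambda_item)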
Code example #7
    def applyAdjustedCosine(self):
        """
        Remove from every data point the average for the corresponding row
        :return:
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csr')


        interactionsPerRow = np.diff(self.dataMatrix.indptr)

        nonzeroRows = interactionsPerRow > 0
        sumPerRow = np.asarray(self.dataMatrix.sum(axis=1)).ravel()

        rowAverage = np.zeros_like(sumPerRow)
        rowAverage[nonzeroRows] = sumPerRow[nonzeroRows] / interactionsPerRow[nonzeroRows]


        # Split in blocks to avoid duplicating the whole data structure
        start_row = 0
        end_row = 0

        blockSize = 1000


        while end_row < self.n_rows:

            end_row = min(self.n_rows, end_row + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_row]:self.dataMatrix.indptr[end_row]] -= \
                np.repeat(rowAverage[start_row:end_row], interactionsPerRow[start_row:end_row])

            start_row += blockSize
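The centering trick above hinges on np.repeat: repeating each row average by that row's nonzero count yields an array aligned with the CSR data array, so a whole block of rows is centered in one vectorized subtraction. A small standalone sketch (the toy matrix is an assumption):

import numpy as np
import scipy.sparse as sps

X = sps.csr_matrix(np.array([[4.0, 2.0, 0.0],
                             [0.0, 5.0, 1.0]]))

nnz_per_row = np.diff(X.indptr)                  # [2, 2]
row_sum = np.asarray(X.sum(axis=1)).ravel()      # [6.0, 6.0]

row_avg = np.zeros_like(row_sum)
mask = nnz_per_row > 0
row_avg[mask] = row_sum[mask] / nnz_per_row[mask]

# One average per stored value, aligned with X.data, subtracted in one shot
X.data -= np.repeat(row_avg, nnz_per_row)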
Code example #8
    def set_URM_train(self, URM_train_new, estimate_item_similarity_for_cold_users = False, **kwargs):
        """

        :param URM_train_new:
        :param estimate_item_similarity_for_cold_users: Set to True to estimate the USER_factors for cold users
        :param kwargs:
        :return:
        """

        assert self.URM_train.shape == URM_train_new.shape, "{}: set_URM_train old and new URM train have different shapes".format(self.RECOMMENDER_NAME)

        if len(kwargs)>0:
            self._print("set_URM_train keyword arguments not supported for this recommender class. Received: {}".format(kwargs))

        self.URM_train = check_matrix(URM_train_new.copy(), 'csr', dtype=np.float32)
        self.URM_train.eliminate_zeros()

        # No need to ever use a knn model
        self._cold_user_KNN_model_available = False
        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        if estimate_item_similarity_for_cold_users:

            self._print("Estimating USER_factors for cold users...")

            self.USER_factors = self._estimate_user_factors(self.ITEM_factors_Y_best)

            self._print("Estimating USER_factors for cold users... done!")
Code example #9
    def applyPearsonCorrelation(self):
        """
        Remove from every data point the average for the corresponding column
        :return:
        """

        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')


        interactionsPerCol = np.diff(self.dataMatrix.indptr)

        nonzeroCols = interactionsPerCol > 0
        sumPerCol = np.asarray(self.dataMatrix.sum(axis=0)).ravel()

        colAverage = np.zeros_like(sumPerCol)
        colAverage[nonzeroCols] = sumPerCol[nonzeroCols] / interactionsPerCol[nonzeroCols]


        # Split in blocks to avoid duplicating the whole data structure
        start_col = 0
        end_col = 0

        blockSize = 1000


        while end_col < self.n_columns:

            end_col = min(self.n_columns, end_col + blockSize)

            self.dataMatrix.data[self.dataMatrix.indptr[start_col]:self.dataMatrix.indptr[end_col]] -= \
                np.repeat(colAverage[start_col:end_col], interactionsPerCol[start_col:end_col])

            start_col += blockSize
Code example #10
    def compute_W_sparse(self, model_to_use="best"):

        if model_to_use == "last":
            feature_weights = self.D_incremental
        elif model_to_use == "best":
            feature_weights = self.D_best
        else:
            assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(
                self.RECOMMENDER_NAME)

        block_dim = 300
        d_t = self.ICM * sps.diags([feature_weights.squeeze()], [0])
        icm_t = self.ICM.astype(bool).T
        indptr, indices, data = [0], [], []
        for r in range(0, self.n_items, block_dim):
            if r + block_dim > self.n_items:
                block_dim = self.n_items - r
            sim = d_t[r:r + block_dim, :] * icm_t
            for s in range(block_dim):
                row = sim[s].toarray().ravel()
                row[r + s] = 0
                best = row.argsort()[::-1][:self.topK]
                indices.extend(best)
                indptr.append(len(indices))
                data.extend(row[best].flatten().tolist())

        self.W_sparse = normalize(sps.csr_matrix(
            (data, indices, indptr), shape=(self.n_items, self.n_items)),
                                  norm="l1",
                                  axis=1)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Code example #11
    def __init__(self, URM_train, verbose=True):

        super(BaseRecommender, self).__init__()

        self.URM_train = check_matrix(URM_train.copy(),
                                      'csr',
                                      dtype=np.float32)
        self.URM_train.eliminate_zeros()

        self.n_users, self.n_items = self.URM_train.shape
        self.verbose = verbose

        self.filterTopPop = False
        self.filterTopPop_ItemsID = np.array([], dtype=int)

        self.items_to_ignore_flag = False
        self.items_to_ignore_ID = np.array([], dtype=int)

        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        if self._cold_user_mask.any():
            self._print(
                "URM Detected {} ({:4.1f}%) users with no interactions.".
                format(self._cold_user_mask.sum(),
                       self._cold_user_mask.sum() / self.n_users * 100))

        self._cold_item_mask = np.ediff1d(self.URM_train.tocsc().indptr) == 0

        if self._cold_item_mask.any():
            self._print(
                "URM Detected {} ({:4.1f}%) items with no interactions.".
                format(self._cold_item_mask.sum(),
                       self._cold_item_mask.sum() / self.n_items * 100))
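Cold users and items are detected by differencing the sparse index pointer: np.ediff1d(indptr) gives the number of stored entries per row in CSR format, or per column in CSC. A quick illustration with made-up data:

import numpy as np
import scipy.sparse as sps

URM = sps.csr_matrix(np.array([[1, 0, 1],
                               [0, 0, 0],
                               [0, 1, 0]]))

interactions_per_user = np.ediff1d(URM.indptr)    # [2, 0, 1]
cold_user_mask = interactions_per_user == 0       # only user 1 is cold
cold_item_mask = np.ediff1d(URM.tocsc().indptr) == 0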
Code example #12
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):

        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = okapi_BM_25(self.UCM_train)

        elif feature_weighting == "TF-IDF":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = TF_IDF(self.UCM_train)

        similarity = Compute_Similarity(self.UCM_train.T,
                                        shrink=shrink,
                                        topK=topK,
                                        normalize=normalize,
                                        similarity=similarity,
                                        **similarity_args)

        self.W_sparse = similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Code example #13
def remove_empty_rows_and_cols(URM, ICM=None):

    URM = check_matrix(URM, "csr")
    rows = URM.indptr
    numRatings = np.ediff1d(rows)
    user_mask = numRatings >= 1

    URM = URM[user_mask, :]

    cols = URM.tocsc().indptr
    numRatings = np.ediff1d(cols)
    item_mask = numRatings >= 1

    URM = URM[:, item_mask]

    removedUsers = np.arange(0, len(user_mask))[np.logical_not(user_mask)]
    removedItems = np.arange(0, len(item_mask))[np.logical_not(item_mask)]

    if ICM is not None:

        ICM = ICM[item_mask, :]

        return URM.tocsr(), ICM.tocsr(), removedUsers, removedItems

    return URM.tocsr(), removedUsers, removedItems
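A hypothetical use of remove_empty_rows_and_cols, assuming the function above and check_matrix are importable; the 3x3 matrix is invented for illustration. The empty user row and item column are dropped, and the returned arrays identify which original indices were removed:

import numpy as np
import scipy.sparse as sps

URM = sps.csr_matrix(np.array([[1, 1, 0],
                               [0, 0, 0],
                               [1, 0, 0]]))

URM_clean, removed_users, removed_items = remove_empty_rows_and_cols(URM)
# URM_clean.shape == (2, 2); removed_users == [1]; removed_items == [2]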
Code example #14
    def __init__(self, URM_train, ICM):

        super(FBSM_Rating_Cython, self).__init__(URM_train)

        self.n_items_icm, self.n_features = ICM.shape

        self.ICM = check_matrix(ICM, 'csr')
Code example #15
    def _add_zeros_in_train_data_row_wise(self):
        """
        This function uses a set of tuples to ensure the zero elements to be added are not already existent
        :return:
        """

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Adding zeros in train data...")

        self.S_matrix_target = check_matrix(self.S_matrix_target, "csr")

        numSamples = self.S_matrix_target.nnz
        n_items = self.S_matrix_target.shape[0]

        zeros_to_add_global = int(numSamples * self.add_zeros_quota)
        zeros_added_global = 0

        if zeros_to_add_global + numSamples >= n_items**2:
            raise ValueError(
                self.RECOMMENDER_NAME +
                ": Too many zeros to add, not enough unique coordinates in matrix"
            )

        zeros_to_add_per_item = int(zeros_to_add_global / n_items)

        while zeros_added_global < zeros_to_add_global:

            for current_item_row in range(n_items):

                start_pos = self.S_matrix_target.indptr[current_item_row]
                end_pos = self.S_matrix_target.indptr[current_item_row + 1]

                nonzero_coordinates = set(
                    self.S_matrix_target.indices[start_pos:end_pos])
                zeros_added_per_item = 0

                while zeros_added_per_item < zeros_to_add_per_item and zeros_added_global < zeros_to_add_global:

                    new_coordinate = np.random.randint(0, n_items)

                    if new_coordinate not in nonzero_coordinates:

                        nonzero_coordinates.add(new_coordinate)

                        self.row_list[numSamples +
                                      zeros_added_global] = current_item_row
                        self.col_list[numSamples +
                                      zeros_added_global] = new_coordinate
                        self.data_list[numSamples + zeros_added_global] = 0.0

                        zeros_added_per_item += 1
                        zeros_added_global += 1

        if self.verbose:
            print("Added: {} zeros. Average per item is: {} ".format(
                zeros_added_global, zeros_to_add_per_item))
            print(self.RECOMMENDER_NAME +
                  ": Added zeros, data points are {}".format(
                      len(self.data_list)))
Code example #16
    def _build_confidence_matrix(self, confidence_scaling):

        if confidence_scaling == 'linear':
            self.C = self._linear_scaling_confidence()
        else:
            self.C = self._log_scaling_confidence()

        self.C_csc = check_matrix(self.C.copy(), format="csc", dtype=np.float32)
Code example #17
    def __init__(self, URM_recommendations_items):
        super(PredefinedListRecommender, self).__init__()

        # store the recommendation lists in csr format for fast row access
        self.URM_recommendations = check_matrix(URM_recommendations_items,
                                                'csr',
                                                dtype=int)

        self.URM_train = sps.csr_matrix(self.URM_recommendations.shape)
Code example #18
    def __init__(self, URM_train, ICM=None, UCM=None):

        super(SVDFeature, self).__init__()

        self.URM_train = check_matrix(URM_train, "csr")
        self.ICM = ICM
        self.UCM = UCM
        self.n_users, self.n_items = URM_train.shape
        self.normalize = False
Code example #19
    def fit(self, topK=100, alpha=0.5):

        self.topK = topK
        self.alpha = alpha

        W_sparse = self.Similarity_1 * self.alpha + self.Similarity_2 * (
            1 - self.alpha)

        self.W_sparse = similarityMatrixTopK(W_sparse, k=self.topK)
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
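The fit above is a plain convex combination of the two similarity matrices, W = alpha*S1 + (1-alpha)*S2, followed by topK pruning. A standalone sketch of the combination step on two random matrices (shapes and density are assumptions):

import scipy.sparse as sps

S1 = sps.random(5, 5, density=0.3, format='csr', random_state=1)
S2 = sps.random(5, 5, density=0.3, format='csr', random_state=2)

alpha = 0.5
W_sparse = S1 * alpha + S2 * (1 - alpha)   # same-shape requirement as checked in __init__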
Code example #20
    def compute_W_sparse(self):

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=self.D_best)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Code example #21
    def __init__(self, URM_train, ICM_train, S_matrix_target):
        super(CFW_D_Similarity_Linalg, self).__init__(URM_train, ICM_train)

        if (URM_train.shape[1] != ICM_train.shape[0]):
            raise ValueError(
                "Number of items not consistent. URM contains {} but ICM contains {}"
                .format(URM_train.shape[1], ICM_train.shape[0]))

        if (S_matrix_target.shape[0] != S_matrix_target.shape[1]):
            raise ValueError(
                "Items imilarity matrix is not square: rows are {}, columns are {}"
                .format(S_matrix_target.shape[0], S_matrix_target.shape[1]))

        if (S_matrix_target.shape[0] != ICM_train.shape[0]):
            raise ValueError(
                "Number of items not consistent. S_matrix contains {} but ICM contains {}"
                .format(S_matrix_target.shape[0], ICM_train.shape[0]))

        self.S_matrix_target = check_matrix(S_matrix_target, 'csr')

        self.ICM = check_matrix(ICM_train, 'csr')
        self.n_features = self.ICM.shape[1]
Code example #22
    def _write_feature_format_file(self):

        self.n_item_features = self.n_items
        if self.ICM is not None:
            self.ICM = check_matrix(self.ICM, "csr")
            self.n_item_features += self.ICM.shape[1]

        self.n_user_features = self.n_users
        if self.UCM is not None:
            self.UCM = check_matrix(self.UCM, "csr")
            self.n_user_features += self.UCM.shape[1]

        nnz_rows, nnz_cols = self.URM_train.nonzero()

        with open(self.temp_file_folder + self.FILE_MODEL_NAME, "w") as fileout:

            for i in tqdm(range(len(nnz_rows))):

                userid, itemid = nnz_rows[i], nnz_cols[i]
                output = self._get_feature_format(userid, itemid)

                print(output, file=fileout)
Code example #23
    def fit(self, W_sparse, selectTopK=False, topK=100):

        assert W_sparse.shape[0] == W_sparse.shape[1],\
            "ItemKNNCustomSimilarityRecommender: W_sparse matrix is not square. Current shape is {}".format(W_sparse.shape)

        assert self.URM_train.shape[1] == W_sparse.shape[0],\
            "ItemKNNCustomSimilarityRecommender: URM_train and W_sparse matrices are not consistent. " \
            "The number of columns in URM_train must be equal to the rows in W_sparse. " \
            "Current shapes are: URM_train {}, W_sparse {}".format(self.URM_train.shape, W_sparse.shape)

        if selectTopK:
            W_sparse = similarityMatrixTopK(W_sparse, k=topK)

        self.W_sparse = check_matrix(W_sparse, format='csr')
Code example #24
def remove_features(ICM,
                    min_occurrence=5,
                    max_percentage_occurrence=0.30,
                    reconcile_mapper=None):
    """
    The function eliminates the values associated to feature occurring in less than the minimal percentage of items
    or more then the max. Shape of ICM is reduced deleting features.
    :param ICM:
    :param minPercOccurrence:
    :param max_percentage_occurrence:
    :param reconcile_mapper: DICT mapper [token] -> index
    :return: ICM
    :return: deletedFeatures
    :return: DICT mapper [token] -> index
    """

    ICM = check_matrix(ICM, 'csc')

    n_items = ICM.shape[0]

    cols = ICM.indptr
    numOccurrences = np.ediff1d(cols)

    feature_mask = np.logical_and(
        numOccurrences >= min_occurrence,
        numOccurrences <= n_items * max_percentage_occurrence)

    ICM = ICM[:, feature_mask]

    deletedFeatures = np.arange(
        0, len(feature_mask))[np.logical_not(feature_mask)]

    print(
        "RemoveFeatures: removed {} features with fewer than {} occurrences, removed {} features with more than {} occurrences"
        .format(sum(numOccurrences < min_occurrence), min_occurrence,
                sum(numOccurrences > n_items * max_percentage_occurrence),
                int(n_items * max_percentage_occurrence)))

    if reconcile_mapper is not None:
        reconcile_mapper = reconcile_mapper_with_removed_tokens(
            reconcile_mapper, deletedFeatures)

        return ICM, deletedFeatures, reconcile_mapper

    return ICM, deletedFeatures
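A hypothetical call to remove_features, assuming the function above and check_matrix are importable; the toy 4x3 ICM is invented for illustration:

import numpy as np
import scipy.sparse as sps

# 4 items x 3 features: feature 0 is in every item, feature 2 in none
ICM = sps.csc_matrix(np.array([[1, 1, 0],
                               [1, 0, 0],
                               [1, 1, 0],
                               [1, 0, 0]]))

ICM_clean, deleted = remove_features(ICM,
                                     min_occurrence=1,
                                     max_percentage_occurrence=0.75)
# feature 2 has 0 occurrences, feature 0 occurs in 100% > 75% of items
# deleted == [0, 2]; ICM_clean.shape == (4, 1)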
Code example #25
    def compute_W_sparse(self, model_to_use="best"):

        if model_to_use == "last":
            feature_weights = self.D_incremental
        elif model_to_use == "best":
            feature_weights = self.D_best
        else:
            assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(
                self.RECOMMENDER_NAME)

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=feature_weights)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
Code example #26
    def __init__(self, URM_train, UCM_train, verbose=True):
        super(BaseUserCBFRecommender, self).__init__(URM_train,
                                                     verbose=verbose)

        assert self.n_users == UCM_train.shape[
            0], "{}: URM_train has {} users but UCM_train has {}".format(
                self.RECOMMENDER_NAME, self.n_users, UCM_train.shape[0])

        self.UCM_train = check_matrix(UCM_train.copy(),
                                      'csr',
                                      dtype=np.float32)
        self.UCM_train.eliminate_zeros()

        _, self.n_features = self.UCM_train.shape

        self._cold_user_CBF_mask = np.ediff1d(self.UCM_train.indptr) == 0

        if self._cold_user_CBF_mask.any():
            print("{}: UCM Detected {} ({:4.1f}%) cold users.".format(
                self.RECOMMENDER_NAME, self._cold_user_CBF_mask.sum(),
                self._cold_user_CBF_mask.sum() / self.n_users * 100))
Code example #27
    def set_URM_train(self, URM_train_new, **kwargs):

        assert self.URM_train.shape == URM_train_new.shape, "{}: set_URM_train old and new URM train have different shapes".format(
            self.RECOMMENDER_NAME)

        if len(kwargs) > 0:
            self._print(
                "set_URM_train keyword arguments not supported for this recommender class. Received: {}"
                .format(kwargs))

        self.URM_train = check_matrix(URM_train_new.copy(),
                                      'csr',
                                      dtype=np.float32)
        self.URM_train.eliminate_zeros()

        self._cold_user_mask = np.ediff1d(self.URM_train.indptr) == 0

        if self._cold_user_mask.any():
            self._print(
                "Detected {} ({:4.1f}%) users with no interactions.".format(
                    self._cold_user_mask.sum(),
                    self._cold_user_mask.sum() / len(self._cold_user_mask) *
                    100))
Code example #28
    def fit(self, alpha=1., beta=0.6, min_rating=0, topK=100, implicit=False, normalize_similarity=True):

        self.alpha = alpha
        self.beta = beta
        self.min_rating = min_rating
        self.topK = topK
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        
        # if X.dtype != np.float32:
        #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

        # Pui is the row-normalized URM
        Pui = normalize(self.URM_train, norm='l1', axis=1)

        # Piu is the column-normalized, "boolean" URM transposed
        X_bool = self.URM_train.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)

        # Taking the degree of each item to penalize top popular
        # Some rows might be zero, make sure their degree remains zero
        X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

        degree = np.zeros(self.URM_train.shape[1])

        nonZeroMask = X_bool_sum!=0.0

        degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

        # ATTENTION: axis is still 1 because the matrix was transposed before normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del X_bool

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200
        d_t = Piu


        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0


        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = np.multiply(similarity_block[row_in_block, :], degree)
                row_data[current_block_start_row + row_in_block] = 0

                best = row_data.argsort()[::-1][:self.topK]

                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                for index in range(len(values_to_add)):

                    if numCells == len(rows):
                        rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))


                    rows[numCells] = current_block_start_row + row_in_block
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]

                    numCells += 1


            if time.time() - start_time_printBatch > 300:
                new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)

                self._print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                     current_block_start_row + block_dim,
                    100.0 * float( current_block_start_row + block_dim) / Pui.shape[1],
                    float( current_block_start_row + block_dim) / (time.time() - start_time),
                    new_time_value, new_time_unit))


                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()


        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])), shape=(Pui.shape[1], Pui.shape[1]))

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)


        if self.topK is not False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)


        self.W_sparse = check_matrix(self.W_sparse, format='csr')
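Stripped of the blocking, buffering, and topK machinery, example #28 computes an RP3beta similarity: a two-step random walk item -> user -> item whose target items are penalized by their degree raised to -beta. A dense toy sketch of that core, with alpha left at 1 and invented data (the URM is binary, so the "boolean" copy equals X itself):

import numpy as np
import scipy.sparse as sps
from sklearn.preprocessing import normalize

X = sps.csr_matrix(np.array([[1, 1, 0],
                             [1, 0, 1],
                             [0, 1, 1]], dtype=np.float32))

beta = 0.6

Pui = normalize(X, norm='l1', axis=1)        # user -> item transition probabilities
Piu = normalize(X.T, norm='l1', axis=1)      # item -> user transition probabilities

item_degree = np.asarray(X.sum(axis=0)).ravel()
penalty = item_degree ** -beta               # all degrees are nonzero in this toy

W = (Piu @ Pui).toarray() * penalty          # two-step walk, popularity-penalized
np.fill_diagonal(W, 0.0)                     # remove self-similarity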
Code example #29
    def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100):

        assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster than plain np.argsort for a higher number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (
                -nonzero_model_coef_value
            ).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(
                -nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[
                relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate(
                        (rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate(
                        (cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate(
                        (values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if time.time(
            ) - start_time_printBatch > 300 or currentItem == n_items - 1:
                self._print(
                    "Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}"
                    .format(currentItem + 1,
                            100.0 * float(currentItem + 1) / n_items,
                            new_time_value, new_time_unit,
                            float(currentItem) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(n_items, n_items),
            dtype=np.float32)
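Example #29 is SLIM trained with an ElasticNet solver: each item column is regressed on all the other columns, with the target column temporarily zeroed so an item cannot predict itself. A minimal single-column sketch on invented data (precompute and the topK selection are omitted here):

import numpy as np
import scipy.sparse as sps
from sklearn.linear_model import ElasticNet

URM = sps.csc_matrix(np.array([[5.0, 3.0, 0.0],
                               [4.0, 0.0, 2.0],
                               [0.0, 2.0, 4.0]], dtype=np.float32))

j = 0                                     # item whose similarity column we learn
y = URM[:, j].toarray().ravel()           # target: interactions with item j

# Temporarily zero column j so the model cannot use item j to predict itself
start, end = URM.indptr[j], URM.indptr[j + 1]
backup = URM.data[start:end].copy()
URM.data[start:end] = 0.0

model = ElasticNet(alpha=1.0, l1_ratio=0.1, positive=True,
                   fit_intercept=False, copy_X=False)
model.fit(URM, y)

URM.data[start:end] = backup              # restore the original column

# Nonzero coefficients are the learned item-item weights toward item j
print(model.sparse_coef_)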
Code example #30
    def _generate_train_data(self):

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Here is important only the structure
        self.compute_W_sparse()
        S_matrix_contentKNN = check_matrix(self.W_sparse, "csr")

        self.write_log(
            "Collaborative S density: {:.2E}, nonzero cells {}".format(
                self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
                self.S_matrix_target.nnz))

        self.write_log("Content S density: {:.2E}, nonzero cells {}".format(
            S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
            S_matrix_contentKNN.nnz))

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Check whether each content coordinate is associated to a nonzero target value
            # If true, the content coordinate has a collaborative nonzero value
            # If false, the content coordinate has a collaborative zero value
            is_common = np.in1d(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # If cell exists in target matrix, add its value
                    # Otherwise it will remain zero with a certain probability

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = self.S_matrix_target[
                        row_index, col_index]

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

            if self.verbose and (time.time() - start_time_batch > 30
                                 or num_samples == S_matrix_contentKNN.nnz *
                                 (1 + self.add_zeros_quota)):

                print(self.RECOMMENDER_NAME +
                      ": Generating train data. Sample {} ({:4.1f}%) ".format(
                          num_samples, num_samples /
                          (S_matrix_contentKNN.nnz *
                           (1 + self.add_zeros_quota)) * 100))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self.write_log(
            "Content S structure has {} out of {} ({:4.1f}%) nonzero collaborative cells"
            .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                    num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard the unused cells at the end of the arrays
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = sum(np.array(self.data_list) != 0)
        data_sum = sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = sum(self.S_matrix_target.data)

        self.write_log(
            "Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz,
                collaborative_sum / collaborative_nnz))