Example 1
    def fit(self,
            topK=50,
            shrink=100,
            similarity='cosine',
            normalize=True,
            feature_weighting="none",
            **similarity_args):

        self.topK = topK
        self.shrink = shrink

        if feature_weighting not in self.FEATURE_WEIGHTING_VALUES:
            raise ValueError(
                "Value for 'feature_weighting' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.FEATURE_WEIGHTING_VALUES, feature_weighting))

        if feature_weighting == "BM25":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = okapi_BM_25(self.UCM_train)

        elif feature_weighting == "TF-IDF":
            self.UCM_train = self.UCM_train.astype(np.float32)
            self.UCM_train = TF_IDF(self.UCM_train)

        similarity = Compute_Similarity(self.UCM_train.T,
                                        shrink=shrink,
                                        topK=topK,
                                        normalize=normalize,
                                        similarity=similarity,
                                        **similarity_args)

        self.W_sparse = similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
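
Compute_Similarity itself is not shown on this page. As a rough reference, below is a minimal NumPy sketch of the shrunk cosine similarity the call above is assumed to compute between users (the matrix is passed as UCM_train.T, so similarity is taken between its columns); the function name, the placement of the shrink term in the denominator, and the zeroed diagonal are illustrative assumptions, not the library's actual code.

import numpy as np

def shrunk_cosine_similarity(X, shrink=100, topK=50):
    # X: (n_users x n_features); similarity is computed between rows.
    dot = X @ X.T
    norms = np.linalg.norm(X, axis=1)
    sim = dot / (np.outer(norms, norms) + shrink + 1e-6)   # assumed shrink placement
    np.fill_diagonal(sim, 0.0)                             # drop self-similarity
    for i in range(sim.shape[0]):                          # keep only topK per row
        sim[i, np.argsort(-sim[i])[topK:]] = 0.0
    return sim

W = shrunk_cosine_similarity(np.random.rand(10, 8), shrink=10, topK=3)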
Example 2
    def fit(self,
            topK=None,
            l2_norm=1e3,
            normalize_matrix=False,
            verbose=True):

        self.verbose = verbose

        start_time = time.time()
        self._print("Fitting model... ")

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
            self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # The Gram matrix is X^T X; compute it as a dot product
        similarity = Compute_Similarity(self.URM_train,
                                        shrink=0,
                                        topK=self.URM_train.shape[1],
                                        normalize=False,
                                        similarity="cosine")
        gram_matrix = similarity.compute_similarity().toarray()

        diag_indices = np.diag_indices(gram_matrix.shape[0])

        # Compute_Similarity sets the diagonal of the similarity matrix to zero;
        # here the diagonal is needed as well, and it is simply the item popularity
        item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
        gram_matrix[diag_indices] = item_popularity + l2_norm

        P = np.linalg.inv(gram_matrix)

        B = P / (-np.diag(P))

        B[diag_indices] = 0.0

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Fitting model... done in {:.2f} {}".format(
            new_time_value, new_time_unit))

        # Check whether the matrix should be saved in a sparse or a dense format.
        # Regardless of topK, the matrix is treated as sparse if the fraction of
        # nonzero cells is below sparse_threshold_quota.
        if topK is not None:
            B = similarityMatrixTopK(B, k=topK, verbose=False)

        if self._is_content_sparse_check(B):
            self._print("Detected model matrix to be sparse, changing format.")
            self.W_sparse = check_matrix(B, format='csr', dtype=np.float32)

        else:
            self.W_sparse = check_matrix(B, format='npy', dtype=np.float32)
            self._W_sparse_format_checked = True
            self._compute_item_score = self._compute_score_W_dense
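
The block above is the closed-form EASE^R solution: with the Gram matrix G = X^T X, it computes P = (G + l2_norm * I)^-1 and B = -P / diag(P), then zeroes the diagonal. Below is a self-contained sketch of the same computation on a small random matrix; the dimensions are made up, and Compute_Similarity is replaced by a plain dot product, whose diagonal already holds the squared column norms (for implicit 0/1 data these equal the item popularity used above).

import numpy as np
import scipy.sparse as sps

l2_norm = 1e3
URM = sps.random(50, 20, density=0.2, format='csr')    # toy user-item matrix

gram_matrix = (URM.T @ URM).toarray()                  # item-item Gram matrix X^T X
diag_indices = np.diag_indices(gram_matrix.shape[0])
gram_matrix[diag_indices] += l2_norm                   # L2 regularization on the diagonal

P = np.linalg.inv(gram_matrix)
B = P / (-np.diag(P))                                  # column j divided by -P[j, j]
B[diag_indices] = 0.0                                  # EASE constrains diag(B) = 0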
Example 3
    def compute_W_sparse(self):

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=self.D_best)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
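
row_weights here applies the learned vector D_best to the rows of ICM.T, i.e. to the features. A minimal sketch of what a feature-weighted dot-product similarity amounts to follows; how exactly Compute_Similarity applies the weights internally is an assumption, and feature_weighted_similarity is a hypothetical name.

import numpy as np

def feature_weighted_similarity(ICM, feature_weights):
    # ICM: (n_items x n_features); feature_weights: (n_features,)
    # s[i, j] = sum_f d_f * x[i, f] * x[j, f]
    X = ICM * feature_weights          # scale each feature column by its weight
    sim = X @ ICM.T
    np.fill_diagonal(sim, 0.0)
    return sim

W = feature_weighted_similarity(np.random.rand(6, 4), np.random.rand(4))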
Example 4
    def compute_W_sparse(self, model_to_use="best"):

        if model_to_use == "last":
            feature_weights = self.D_incremental
        elif model_to_use == "best":
            feature_weights = self.D_best
        else:
            assert False, "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(
                self.RECOMMENDER_NAME)

        self.similarity = Compute_Similarity(
            self.ICM.T,
            shrink=0,
            topK=self.topK,
            normalize=self.normalize_similarity,
            row_weights=feature_weights)

        self.W_sparse = self.similarity.compute_similarity()
        self.W_sparse = check_matrix(self.W_sparse, format='csr')
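
Both variants rely on topK to sparsify each similarity column. For reference, here is a small sketch of the argpartition-based top-K selection used elsewhere in this codebase (the same idiom appears in the loop of Example 5); top_k_column is a hypothetical helper, not part of the framework.

import numpy as np

def top_k_column(column, topK):
    # Keep the topK largest entries of a dense similarity column, zero the rest.
    relevant_partition = (-column).argpartition(topK - 1)[:topK]
    sorting = np.argsort(-column[relevant_partition])
    top_k_idx = relevant_partition[sorting]
    pruned = np.zeros_like(column)
    pruned[top_k_idx] = column[top_k_idx]
    return pruned

print(np.count_nonzero(top_k_column(np.random.rand(100), 10)))   # -> 10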
Example 5
    def _generate_train_data(self):

        if self.verbose:
            print(self.RECOMMENDER_NAME + ": Generating train data")

        start_time_batch = time.time()

        # Only the structure (sparsity pattern) of the similarity matrix matters here
        self.similarity = Compute_Similarity(self.ICM.T,
                                             shrink=0,
                                             topK=self.topK,
                                             normalize=False)
        S_matrix_contentKNN = self.similarity.compute_similarity()
        S_matrix_contentKNN = check_matrix(S_matrix_contentKNN, "csr")

        self._print("Collaborative S density: {:.2E}, nonzero cells {}".format(
            self.S_matrix_target.nnz / self.S_matrix_target.shape[0]**2,
            self.S_matrix_target.nnz))

        self._print("Content S density: {:.2E}, nonzero cells {}".format(
            S_matrix_contentKNN.nnz / S_matrix_contentKNN.shape[0]**2,
            S_matrix_contentKNN.nnz))

        if self.normalize_similarity:

            # Compute the square root of the sum of squared features (per-item L2 norm)
            sum_of_squared_features = np.array(
                self.ICM.T.power(2).sum(axis=0)).ravel()
            sum_of_squared_features = np.sqrt(sum_of_squared_features)

        num_common_coordinates = 0

        estimated_n_samples = int(S_matrix_contentKNN.nnz *
                                  (1 + self.add_zeros_quota) * 1.2)

        self.row_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.col_list = np.zeros(estimated_n_samples, dtype=np.int32)
        self.data_list = np.zeros(estimated_n_samples, dtype=np.float64)

        num_samples = 0

        for row_index in range(self.n_items):

            start_pos_content = S_matrix_contentKNN.indptr[row_index]
            end_pos_content = S_matrix_contentKNN.indptr[row_index + 1]

            content_coordinates = S_matrix_contentKNN.indices[
                start_pos_content:end_pos_content]

            start_pos_target = self.S_matrix_target.indptr[row_index]
            end_pos_target = self.S_matrix_target.indptr[row_index + 1]

            target_coordinates = self.S_matrix_target.indices[
                start_pos_target:end_pos_target]

            # Check whether each content coordinate is associated with a nonzero target value.
            # If true, the content coordinate has a nonzero collaborative value;
            # if false, the content coordinate has a zero collaborative value.
            is_common = np.isin(content_coordinates, target_coordinates)

            num_common_in_current_row = is_common.sum()
            num_common_coordinates += num_common_in_current_row

            for index in range(len(is_common)):

                if num_samples == estimated_n_samples:
                    dataBlock = 1000000
                    self.row_list = np.concatenate(
                        (self.row_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.col_list = np.concatenate(
                        (self.col_list, np.zeros(dataBlock, dtype=np.int32)))
                    self.data_list = np.concatenate(
                        (self.data_list, np.zeros(dataBlock,
                                                  dtype=np.float64)))

                if is_common[index]:
                    # The cell exists in the target matrix: add its value.
                    # Otherwise (elif branch below) it is kept as an explicit zero
                    # sample with probability add_zeros_quota.

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index

                    new_data_value = self.S_matrix_target[row_index, col_index]

                    if self.normalize_similarity:
                        new_data_value *= sum_of_squared_features[
                            row_index] * sum_of_squared_features[col_index]

                    self.data_list[num_samples] = new_data_value

                    num_samples += 1

                elif np.random.rand() <= self.add_zeros_quota:

                    col_index = content_coordinates[index]

                    self.row_list[num_samples] = row_index
                    self.col_list[num_samples] = col_index
                    self.data_list[num_samples] = 0.0

                    num_samples += 1

            if self.verbose and (time.time() - start_time_batch > 30
                                 or num_samples == S_matrix_contentKNN.nnz *
                                 (1 + self.add_zeros_quota)):

                print(self.RECOMMENDER_NAME +
                      ": Generating train data. Sample {} ({:4.1f}%) ".format(
                          num_samples, num_samples /
                          (S_matrix_contentKNN.nnz *
                           (1 + self.add_zeros_quota)) * 100))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_batch = time.time()

        self._print(
            "Content S structure has {} out of {} ({:4.1f}%) nonzero collaborative cells"
            .format(num_common_coordinates, S_matrix_contentKNN.nnz,
                    num_common_coordinates / S_matrix_contentKNN.nnz * 100))

        # Discard the unused preallocated cells at the end of the arrays
        self.row_list = self.row_list[:num_samples]
        self.col_list = self.col_list[:num_samples]
        self.data_list = self.data_list[:num_samples]

        data_nnz = np.count_nonzero(self.data_list)
        data_sum = np.sum(self.data_list)

        collaborative_nnz = self.S_matrix_target.nnz
        collaborative_sum = np.sum(self.S_matrix_target.data)

        self._print(
            "Nonzero collaborative cell sum is: {:.2E}, average is: {:.2E}, "
            "average over all collaborative data is {:.2E}".format(
                data_sum, data_sum / data_nnz,
                collaborative_sum / collaborative_nnz))
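
The row_list, col_list and data_list arrays built above are (row, col, value) training triplets sampled from the target similarity, presumably consumed by a training routine not shown in this example. A minimal sketch of how such triplets can be assembled into a sparse matrix (the sizes and values are made up):

import numpy as np
import scipy.sparse as sps

n_items = 5
row_list = np.array([0, 1, 3], dtype=np.int32)
col_list = np.array([2, 4, 3], dtype=np.int32)
data_list = np.array([0.5, 0.0, 1.2], dtype=np.float64)

# Explicit zeros (the negative samples) stay stored until eliminate_zeros()
S_train = sps.csr_matrix((data_list, (row_list, col_list)),
                         shape=(n_items, n_items))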
    def compute_W_sparse(self, use_D=True, use_V=True, model_to_use="best"):

        assert model_to_use in [
            "last", "best"
        ], "{}: compute_W_sparse, 'model_to_use' parameter not recognized".format(
            self.RECOMMENDER_NAME)

        if self.verbose:
            print("FBSM_Rating_Cython: Building similarity matrix...")

        start_time = time.time()
        start_time_print_batch = start_time

        # Diagonal feature-weight component D
        if use_D:

            if model_to_use == "last":
                D = self.D_incremental
            else:
                D = self.D_best

            similarity = Compute_Similarity(self.ICM.T,
                                            shrink=0,
                                            topK=self.topK,
                                            normalize=False,
                                            row_weights=D)
            self.W_sparse = similarity.compute_similarity()
        else:
            self.W_sparse = sps.csr_matrix((self.n_items, self.n_items))

        if use_V:

            if model_to_use == "last":
                V = self.V_incremental
            else:
                V = self.V_best

            # Latent component: (ICM V^T)(ICM V^T)^T, built incrementally below
            W1 = self.ICM.dot(V.T)

            # Dense equivalent of the loop below:
            # self.W_sparse += W1.dot(W1.T)

            # Use preallocated arrays, which reduce memory requirements compared to lists
            dataBlock = 10000000

            values = np.zeros(dataBlock, dtype=np.float32)
            rows = np.zeros(dataBlock, dtype=np.int32)
            cols = np.zeros(dataBlock, dtype=np.int32)

            numCells = 0

            for numItem in range(self.n_items):

                V_weights = W1[numItem, :].dot(W1.T)
                V_weights[numItem] = 0.0

                relevant_items_partition = (
                    -V_weights).argpartition(self.topK - 1)[0:self.topK]
                relevant_items_partition_sorting = np.argsort(
                    -V_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[
                    relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros
                notZerosMask = V_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values_to_add = V_weights[top_k_idx][notZerosMask]
                rows_to_add = top_k_idx[notZerosMask]
                cols_to_add = np.full(numNotZeros, numItem, dtype=np.int32)

                for index in range(len(values_to_add)):

                    if numCells == len(rows):
                        rows = np.concatenate(
                            (rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate(
                            (cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate(
                            (values, np.zeros(dataBlock, dtype=np.float32)))

                    rows[numCells] = rows_to_add[index]
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]

                    numCells += 1

                if self.verbose and (time.time() - start_time_print_batch >= 30
                                     or numItem == self.n_items - 1):
                    columnPerSec = numItem / (time.time() - start_time)

                    print(
                        "Weighted similarity column {} ( {:2.0f} % ), {:.2f} column/sec, elapsed time {:.2f} min"
                        .format(numItem, numItem / self.n_items * 100,
                                columnPerSec, (time.time() - start_time) / 60))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print_batch = time.time()

            V_weights = sps.csr_matrix(
                (values[:numCells], (rows[:numCells], cols[:numCells])),
                shape=(self.n_items, self.n_items),
                dtype=np.float32)

            self.W_sparse += V_weights
            self.W_sparse = check_matrix(self.W_sparse, format='csr')

        if self.verbose:
            print("FBSM_Rating_Cython: Building similarity matrix... complete")