Code example #1
    def __init__(self, max_total_time_seconds, current_total_time):
        max_total_time_seconds_value, max_total_time_seconds_unit = seconds_to_biggest_unit(
            max_total_time_seconds)
        current_total_time_seconds_value, current_total_time_seconds_unit = seconds_to_biggest_unit(
            current_total_time)

        message = "Total training and evaluation time is {:.2f} {}, exceeding the maximum threshold of {:.2f} {}".format(
            current_total_time_seconds_value, current_total_time_seconds_unit,
            max_total_time_seconds_value, max_total_time_seconds_unit)

        super().__init__(message)
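Every snippet on this page relies on a seconds_to_biggest_unit helper that is never shown. Judging from its call sites (it returns a (value, unit) pair, plus a rescaled array as a third element when data_array is passed, and example #12 compares the unit against "s"), a minimal sketch could look like the following; the unit names and thresholds are assumptions, not the actual implementation.

import numpy as np

def seconds_to_biggest_unit(time_in_seconds, data_array=None):
    # Hypothetical sketch: rescale seconds into the largest unit
    # (s -> min -> hour -> day) for which the value stays >= 1.
    units = [("s", 60), ("min", 60), ("hour", 24), ("day", None)]
    new_time_value = float(time_in_seconds)
    unit_index = 0

    while units[unit_index][1] is not None and new_time_value >= units[unit_index][1]:
        factor = units[unit_index][1]
        new_time_value /= factor
        if data_array is not None:
            # Rescale the optional array by the same conversion factor.
            data_array = np.asarray(data_array) / factor
        unit_index += 1

    new_time_unit = units[unit_index][0]

    if data_array is not None:
        return new_time_value, new_time_unit, data_array
    return new_time_value, new_time_unit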
Code example #2
    def compute_eigenvalues(self, lamda=None, U=None):

        start_time = time.time()

        if lamda is None or U is None:

            print("SpectralCF: Computing adjacient_matrix...")
            self.A = self._adjacient_matrix(self_connection=True)

            print("SpectralCF: Computing degree_matrix...")
            self.D = self._degree_matrix()

            print("SpectralCF: Computing laplacian_matrix...")
            self.L = self._laplacian_matrix(normalized=True)

            print("SpectralCF: Computing eigenvalues...")
            self.lamda, self.U = np.linalg.eig(self.L)
            self.lamda = np.diag(self.lamda)

            new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)
            print("SpectralCF: Initialization complete in {:.2f} {}".format(new_time_value, new_time_unit))

        else:

            self.lamda = lamda
            self.U = U
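A side note on the decomposition above: a normalized graph Laplacian is symmetric, so np.linalg.eigh is generally preferable to np.linalg.eig; it guarantees real eigenvalues in ascending order and orthonormal eigenvectors. A self-contained sketch on a toy matrix (the values are made up for illustration):

import numpy as np

# Toy symmetric Laplacian-like matrix, hypothetical values.
L = np.array([[ 1.0, -0.5, -0.5],
              [-0.5,  1.0, -0.5],
              [-0.5, -0.5,  1.0]])

# eigh exploits symmetry: real, sorted eigenvalues, avoiding the
# complex output np.linalg.eig can produce from round-off.
eigenvalues, U = np.linalg.eigh(L)
lamda = np.diag(eigenvalues)  # same diagonal form as in the snippet above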
Code example #3
    def fit(self,
            topK=None,
            l2_norm=1e3,
            normalize_matrix=False,
            verbose=True):

        self.verbose = verbose

        start_time = time.time()
        self._print("Fitting model... ")

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
            self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # Gram matrix is X^t X, compute dot product
        similarity = Compute_Similarity(self.URM_train,
                                        shrink=0,
                                        topK=self.URM_train.shape[1],
                                        normalize=False,
                                        similarity="cosine")
        grahm_matrix = similarity.compute_similarity().toarray()

        diag_indices = np.diag_indices(grahm_matrix.shape[0])

        # The Compute_Similarity object ensures the diagonal of the similarity matrix is zero,
        # but here we need the diagonal as well, which is just the item popularity
        item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
        grahm_matrix[diag_indices] = item_popularity + l2_norm

        P = np.linalg.inv(grahm_matrix)

        B = P / (-np.diag(P))

        B[diag_indices] = 0.0

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Fitting model... done in {:.2f} {}".format(
            new_time_value, new_time_unit))

        # Check if the matrix should be saved in a sparse or dense format.
        # The matrix is stored as sparse, regardless of topK, if the fraction of nonzero cells is below sparse_threshold_quota.
        if topK is not None:
            B = similarityMatrixTopK(B, k=topK, verbose=False)

        if self._is_content_sparse_check(B):
            self._print("Detected model matrix to be sparse, changing format.")
            self.W_sparse = check_matrix(B, format='csr', dtype=np.float32)

        else:
            self.W_sparse = check_matrix(B, format='npy', dtype=np.float32)
            self._W_sparse_format_checked = True
            self._compute_item_score = self._compute_score_W_dense
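The closed form above (invert the regularized Gram matrix, divide by the negated diagonal, zero the diagonal) is the EASE-style item-weights solution. A self-contained NumPy sketch of the same math on a toy interaction matrix (the data is hypothetical, for illustration only):

import numpy as np

# Toy user-item interaction matrix (3 users x 4 items), hypothetical data.
X = np.array([[1., 0., 1., 0.],
              [1., 1., 0., 0.],
              [0., 1., 1., 1.]])
l2_norm = 10.0

# Regularized Gram matrix: X^T X + l2_norm * I
G = X.T @ X + l2_norm * np.eye(X.shape[1])

# Closed-form item-item weights: B = P / (-diag(P)), then zero the diagonal.
P = np.linalg.inv(G)
B = P / (-np.diag(P))
np.fill_diagonal(B, 0.0)

# Predicted scores are simply X @ B.
scores = X @ B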
Code example #4
def _convert_sec_list_into_biggest_unit(data_list):

    mean_sec, stddev_sec = _mean_and_stdd_of_list(data_list)

    _, new_time_unit, data_list = seconds_to_biggest_unit(mean_sec,
                                                          data_array=data_list)

    mean_new_unit, stddev_new_unit = _mean_and_stdd_of_list(data_list)

    return mean_sec, stddev_sec, new_time_unit, mean_new_unit, stddev_new_unit
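Note that this variant passes the raw Python list as data_array; code example #7 below is the same helper after converting the input to a NumPy array first, which is what makes the elementwise rescaling inside seconds_to_biggest_unit well defined.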
Code example #5
    def fit(self,
            topK=None,
            l2_norm=1e3,
            normalize_matrix=False,
            verbose=True):

        self.verbose = verbose

        start_time = time.time()
        self._print("Fitting model... ")

        if normalize_matrix:
            # Normalize rows and then columns
            self.URM_train = normalize(self.URM_train, norm='l2', axis=1)
            self.URM_train = normalize(self.URM_train, norm='l2', axis=0)
            self.URM_train = sps.csr_matrix(self.URM_train)

        # Gram matrix is X^t X, compute dot product
        similarity = Compute_Similarity(self.URM_train,
                                        shrink=0,
                                        topK=self.URM_train.shape[1],
                                        normalize=False,
                                        similarity="cosine")
        grahm_matrix = similarity.compute_similarity().toarray()

        diag_indices = np.diag_indices(grahm_matrix.shape[0])

        # The Compute_Similarity object ensures the diagonal of the similarity matrix is zero,
        # but here we need the diagonal as well, which is just the item popularity
        item_popularity = np.ediff1d(self.URM_train.tocsc().indptr)
        grahm_matrix[diag_indices] = item_popularity + l2_norm

        P = np.linalg.inv(grahm_matrix)

        B = P / (-np.diag(P))

        B[diag_indices] = 0.0

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Fitting model... done in {:.2f} {}".format(
            new_time_value, new_time_unit))

        if topK is None:
            self.W_sparse = B
            self._W_sparse_format_checked = True
            self._compute_item_score = self._compute_score_W_dense

        else:
            self.W_sparse = similarityMatrixTopK(B, k=topK, verbose=False)
            self.W_sparse = sps.csr_matrix(self.W_sparse)
Code example #6
    def fit(self,
            num_factors=100,
            l1_ratio=0.5,
            solver="multiplicative_update",
            init_type="random",
            beta_loss="frobenius",
            verbose=False,
            random_seed=None):

        assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

        if solver not in self.SOLVER_VALUES:
            raise ValueError(
                "Value for 'solver' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.SOLVER_VALUES.keys(), solver))

        if init_type not in self.INIT_VALUES:
            raise ValueError(
                "Value for 'init_type' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.INIT_VALUES, init_type))

        if beta_loss not in self.BETA_LOSS_VALUES:
            raise ValueError(
                "Value for 'beta_loss' not recognized. Acceptable values are {}, provided was '{}'"
                .format(self.BETA_LOSS_VALUES, beta_loss))

        start_time = time.time()
        self._print("Computing NMF decomposition...")

        nmf_solver = NMF(n_components=num_factors,
                         init=init_type,
                         solver=self.SOLVER_VALUES[solver],
                         beta_loss=beta_loss,
                         random_state=random_seed,
                         l1_ratio=l1_ratio,
                         shuffle=True,
                         verbose=verbose,
                         max_iter=500)

        nmf_solver.fit(self.URM_train)

        self.ITEM_factors = nmf_solver.components_.copy().T
        self.USER_factors = nmf_solver.transform(self.URM_train)

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Computing NMF decomposition... done in {:.2f} {}".format(
            new_time_value, new_time_unit))
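For context, a minimal end-to-end sketch of how the factors learned above are typically turned into scores; the toy matrix and its sizes are assumptions:

import numpy as np
import scipy.sparse as sps
from sklearn.decomposition import NMF

# Hypothetical toy URM (4 users x 5 items), non-negative by construction.
URM_train = sps.csr_matrix(np.random.default_rng(42).random((4, 5)))

nmf_solver = NMF(n_components=2, init="random", random_state=42, max_iter=500)
USER_factors = nmf_solver.fit_transform(URM_train)   # shape (n_users, k)
ITEM_factors = nmf_solver.components_.T              # shape (n_items, k)

# The predicted score of user u for item i is the dot product of their factors.
scores = USER_factors @ ITEM_factors.T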
Code example #7
def _convert_sec_list_into_biggest_unit(data_list):
    """
    Converts a list of values in seconds into an equivalent list in a larger time unit,
    adjusting the standard deviation as well
    :param data_list:
    :return:
    """

    data_array = np.array(data_list)

    mean_sec, stddev_sec = _mean_and_stdd_of_array(data_array)

    _, new_time_unit, data_array = seconds_to_biggest_unit(
        mean_sec, data_array=data_array)

    mean_new_unit, stddev_new_unit = _mean_and_stdd_of_array(data_array)

    return mean_sec, stddev_sec, new_time_unit, mean_new_unit, stddev_new_unit
Code example #8
    def fit(self, num_factors=100, random_seed=None):

        start_time = time.time()
        self._print("Computing SVD decomposition...")

        U, Sigma, QT = randomized_svd(
            self.URM_train,
            n_components=num_factors,
            #n_iter=5,
            random_state=random_seed)

        U_s = U * sps.diags(Sigma)

        self.USER_factors = U_s
        self.ITEM_factors = QT.T

        new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() -
                                                                start_time)
        self._print("Computing SVD decomposition... done in {:.2f} {}".format(
            new_time_value, new_time_unit))
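A self-contained sketch of the same PureSVD-style decomposition on toy data; randomized_svd is the scikit-learn function used above, while the matrix itself is made up:

import numpy as np
import scipy.sparse as sps
from sklearn.utils.extmath import randomized_svd

# Toy URM, hypothetical data.
URM_train = sps.random(50, 30, density=0.1, format="csr", random_state=0)

U, Sigma, QT = randomized_svd(URM_train, n_components=5, random_state=0)

# Fold the singular values into the user factors (equivalent to U @ diag(Sigma)).
USER_factors = U * Sigma
ITEM_factors = QT.T

# Scores are the rank-k reconstruction of the URM.
scores = USER_factors @ ITEM_factors.T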
Code example #9
    def _train_with_early_stopping(
            self,
            epochs_max,
            epochs_min=0,
            validation_every_n=None,
            stop_on_validation=False,
            validation_metric=None,
            lower_validations_allowed=None,
            evaluator_object=None,
            algorithm_name="Incremental_Training_Early_Stopping"):
        """

        :param epochs_max:                  max number of epochs the training will last
        :param epochs_min:                  min number of epochs the training will last
        :param validation_every_n:          number of epochs after which the model will be evaluated and a best_model selected
        :param stop_on_validation:          [True/False] whether to stop the training before the max number of epochs
        :param validation_metric:           which metric to use when selecting the best model, higher values are better
        :param lower_validations_allowed:   number of contiguous validation steps without improvement required for the training to early-stop
        :param evaluator_object:            evaluator instance used to compute the validation metrics.
                                                If multiple cutoffs are available, the first one is used
        :param algorithm_name:              name of the algorithm to be displayed in the output updates
        :return: -


        Supported uses:

        - Train for max number of epochs with no validation nor early stopping:

            _train_with_early_stopping(epochs_max = 100,
                                        evaluator_object = None
                                        epochs_min,                 not used
                                        validation_every_n,         not used
                                        stop_on_validation,         not used
                                        validation_metric,          not used
                                        lower_validations_allowed,   not used
                                        )


        - Train for max number of epochs with validation but NOT early stopping:

            _train_with_early_stopping(epochs_max = 100,
                                        evaluator_object = evaluator
                                        stop_on_validation = False
                                        validation_every_n = int value
                                        validation_metric = metric name string
                                        epochs_min,                 not used
                                        lower_validations_allowed,   not used
                                        )


        - Train for max number of epochs with validation AND early stopping:

            _train_with_early_stopping(epochs_max = 100,
                                        epochs_min = int value
                                        evaluator_object = evaluator
                                        stop_on_validation = True
                                        validation_every_n = int value
                                        validation_metric = metric name string
                                        lower_validations_allowed = int value
                                        )



        """

        assert epochs_max > 0, "{}: Number of epochs_max must be > 0, passed was {}".format(
            algorithm_name, epochs_max)
        assert epochs_min >= 0, "{}: Number of epochs_min must be >= 0, passed was {}".format(
            algorithm_name, epochs_min)
        assert epochs_min <= epochs_max, "{}: epochs_min must be <= epochs_max, passed are epochs_min {}, epochs_max {}".format(
            algorithm_name, epochs_min, epochs_max)

        # Train for max number of epochs with no validation nor early stopping
        # OR Train for max number of epochs with validation but NOT early stopping
        # OR Train for max number of epochs with validation AND early stopping
        assert evaluator_object is None or \
               (evaluator_object is not None and not stop_on_validation and validation_every_n is not None and validation_metric is not None) or \
               (evaluator_object is not None and stop_on_validation and validation_every_n is not None and validation_metric is not None and lower_validations_allowed is not None), \
            "{}: Inconsistent parameters passed, please check the supported uses".format(algorithm_name)

        start_time = time.time()

        self.best_validation_metric = None
        lower_validations_count = 0
        convergence = False

        self.epochs_best = 0

        epochs_current = 0

        while epochs_current < epochs_max and not convergence:

            self._run_epoch(epochs_current)

            # If no validation required, always keep the latest
            if evaluator_object is None:

                self.epochs_best = epochs_current

            # Determine whether a validation step is required
            elif (epochs_current + 1) % validation_every_n == 0:

                print("{}: Validation begins...".format(algorithm_name))

                self._prepare_model_for_validation()

                # If the evaluator validation has multiple cutoffs, choose the first one
                results_run, results_run_string = evaluator_object.evaluateRecommender(
                    self)
                results_run = results_run[list(results_run.keys())[0]]

                print("{}: {}".format(algorithm_name, results_run_string))

                # Update optimal model
                current_metric_value = results_run[validation_metric]

                if self.best_validation_metric is None or self.best_validation_metric < current_metric_value:

                    print("{}: New best model found! Updating.".format(
                        algorithm_name))

                    self.best_validation_metric = current_metric_value

                    self._update_best_model()

                    self.epochs_best = epochs_current + 1
                    lower_validations_count = 0

                else:
                    lower_validations_count += 1

                if stop_on_validation and lower_validations_count >= lower_validations_allowed and epochs_current >= epochs_min:
                    convergence = True

                    elapsed_time = time.time() - start_time
                    new_time_value, new_time_unit = seconds_to_biggest_unit(
                        elapsed_time)

                    print(
                        "{}: Convergence reached! Terminating at epoch {}. Best value for '{}' at epoch {} is {:.4f}. Elapsed time {:.2f} {}"
                        .format(algorithm_name, epochs_current + 1,
                                validation_metric, self.epochs_best,
                                self.best_validation_metric, new_time_value,
                                new_time_unit))

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            print("{}: Epoch {} of {}. Elapsed time {:.2f} {}".format(
                algorithm_name, epochs_current + 1, epochs_max, new_time_value,
                new_time_unit))

            epochs_current += 1

            sys.stdout.flush()
            sys.stderr.flush()

        # If no validation required, keep the latest
        if evaluator_object is None:
            self._prepare_model_for_validation()
            self._update_best_model()

        # Stop when max epochs reached and not early-stopping
        if not convergence:
            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if evaluator_object is not None:
                print(
                    "{}: Terminating at epoch {}. Best value for '{}' at epoch {} is {:.4f}. Elapsed time {:.2f} {}"
                    .format(algorithm_name, epochs_current, validation_metric,
                            self.epochs_best, self.best_validation_metric,
                            new_time_value, new_time_unit))
            else:
                print("{}: Terminating at epoch {}. Elapsed time {:.2f} {}".
                      format(algorithm_name, epochs_current, new_time_value,
                             new_time_unit))
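_train_with_early_stopping assumes the host class provides three hooks that are not shown here. A minimal sketch of what a subclass is expected to implement, with hook names taken from the calls above (the class name and parameter name are hypothetical):

# Sketch only: in the codebase this class would also inherit
# Incremental_Training_Early_Stopping, which supplies the training loop.
class MySGDRecommender:
    def _run_epoch(self, num_epoch):
        """Run one training epoch, e.g. one pass of SGD updates."""

    def _prepare_model_for_validation(self):
        """Materialize whatever the evaluator needs, e.g. rebuild W_sparse."""

    def _update_best_model(self):
        """Snapshot the current parameters as the best model found so far."""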
Code example #10
    def fit(self, alpha=1., beta=0.6, min_rating=0, topK=100, implicit=False, normalize_similarity=True):

        self.alpha = alpha
        self.beta = beta
        self.min_rating = min_rating
        self.topK = topK
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        
        # if X.dtype != np.float32:
        #     print("RP3beta fit: For memory usage reasons, we suggest to use np.float32 as dtype for the dataset")

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size, dtype=np.float32)

        # Pui is the row-normalized URM
        Pui = normalize(self.URM_train, norm='l1', axis=1)

        # Piu is the column-normalized, "boolean" URM, transposed
        X_bool = self.URM_train.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)

        # Taking the degree of each item to penalize top popular
        # Some rows might be zero, make sure their degree remains zero
        X_bool_sum = np.array(X_bool.sum(axis=1)).ravel()

        degree = np.zeros(self.URM_train.shape[1])

        nonZeroMask = X_bool_sum!=0.0

        degree[nonZeroMask] = np.power(X_bool_sum[nonZeroMask], -self.beta)

        # ATTENTION: axis is still 1 because the matrix was transposed before normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del(X_bool)

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200
        d_t = Piu


        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0


        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[current_block_start_row:current_block_start_row + block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = np.multiply(similarity_block[row_in_block, :], degree)
                row_data[current_block_start_row + row_in_block] = 0

                best = row_data.argsort()[::-1][:self.topK]

                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                for index in range(len(values_to_add)):

                    if numCells == len(rows):
                        rows = np.concatenate((rows, np.zeros(dataBlock, dtype=np.int32)))
                        cols = np.concatenate((cols, np.zeros(dataBlock, dtype=np.int32)))
                        values = np.concatenate((values, np.zeros(dataBlock, dtype=np.float32)))


                    rows[numCells] = current_block_start_row + row_in_block
                    cols[numCells] = cols_to_add[index]
                    values[numCells] = values_to_add[index]

                    numCells += 1


            if time.time() - start_time_printBatch > 300:
                new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)

                self._print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                     current_block_start_row + block_dim,
                    100.0 * float( current_block_start_row + block_dim) / Pui.shape[1],
                    float( current_block_start_row + block_dim) / (time.time() - start_time),
                    new_time_value, new_time_unit))


                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()


        self.W_sparse = sps.csr_matrix((values[:numCells], (rows[:numCells], cols[:numCells])), shape=(Pui.shape[1], Pui.shape[1]))

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)


        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)


        self.W_sparse = check_matrix(self.W_sparse, format='csr')
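The blocked loop above is a memory-friendly way of computing a single matrix product. On a small toy problem the same RP3beta similarity can be written directly; a sketch under the assumption that everything fits in memory (toy data, no topK pruning):

import numpy as np
import scipy.sparse as sps
from sklearn.preprocessing import normalize

# Toy URM, hypothetical data.
URM = sps.random(40, 25, density=0.15, format="csr", random_state=1)
alpha, beta = 1.0, 0.6

Pui = normalize(URM, norm="l1", axis=1)        # user -> item transition probabilities
X_bool = URM.T.tocsr(copy=True)
X_bool.data = np.ones_like(X_bool.data)
Piu = normalize(X_bool, norm="l1", axis=1)     # item -> user transition probabilities

# Item degree, used to penalize popular items; zero-degree items stay zero.
item_degree = np.asarray(X_bool.sum(axis=1)).ravel()
penalty = np.zeros_like(item_degree)
penalty[item_degree != 0] = np.power(item_degree[item_degree != 0], -beta)

# Two-step random walk, with each column penalized by item popularity^beta.
W = (Piu.power(alpha) @ Pui.power(alpha)).toarray() * penalty
np.fill_diagonal(W, 0.0)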
Code example #11
    def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100):

        assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficients of the ElasticNet model
            # keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster than plain np.argsort for a large number of items
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (
                -nonzero_model_coef_value
            ).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(
                -nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[
                relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate(
                        (rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate(
                        (cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate(
                        (values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if (time.time() - start_time_printBatch > 300
                    or currentItem == n_items - 1):
                self._print(
                    "Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}"
                    .format(currentItem + 1,
                            100.0 * float(currentItem + 1) / n_items,
                            new_time_value, new_time_unit,
                            float(currentItem) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix
        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(n_items, n_items),
            dtype=np.float32)
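Stripped of the pre-allocated buffers and progress printing, the core of the SLIM ElasticNet loop above is: for each item, regress its column on all the others with that column zeroed out. A compact sketch on toy data (sizes and hyperparameters are illustrative):

import numpy as np
import scipy.sparse as sps
from sklearn.linear_model import ElasticNet

# Toy URM in CSC format, hypothetical data.
URM = sps.random(60, 8, density=0.3, format="csc", random_state=2)
model = ElasticNet(alpha=1.0, l1_ratio=0.1, positive=True,
                   fit_intercept=False, max_iter=100)

W = np.zeros((URM.shape[1], URM.shape[1]), dtype=np.float32)

for j in range(URM.shape[1]):
    y = URM[:, j].toarray().ravel()
    # Zero out column j so the item cannot predict itself.
    backup = URM.data[URM.indptr[j]:URM.indptr[j + 1]].copy()
    URM.data[URM.indptr[j]:URM.indptr[j + 1]] = 0.0
    model.fit(URM, y)
    W[:, j] = model.coef_
    # Restore the original column values.
    URM.data[URM.indptr[j]:URM.indptr[j + 1]] = backup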
Code example #12
def print_time_statistics_latex_table(result_folder_path,
                                      dataset_name,
                                      results_file_prefix_name,
                                      other_algorithm_list=list(),
                                      n_validation_users=None,
                                      n_test_users=None,
                                      KNN_similarity_to_report_list=list(
                                          ["cosine"]),
                                      ICM_names_to_report_list=list(),
                                      n_decimals=4):

    import pickle
    import numpy as np

    results_file_root_name = "{}_{}".format(results_file_prefix_name,
                                            dataset_name)

    results_file = open(
        result_folder_path + "..//" + results_file_root_name +
        "_latex_time.txt", "w")

    def mean_and_stdd_of_list(data_list):

        data_list = np.array(data_list)

        mean = np.mean(data_list)

        if len(data_list) == 1:
            stddev = 0.0
        else:
            stddev = np.std(data_list, ddof=1)

        return mean, stddev

    # Write columns
    columns_datasets_list = "\t&"
    columns_metrics_list = "\t\t&"

    column_name_list = [
        "\\begin{tabular}{@{}c@{}}Train time\\end{tabular}",
        #"validation_time",
        #"\\begin{tabular}{@{}c@{}}validation \\ usr/sec\end{tabular}",
        "\\begin{tabular}{@{}c@{}}Recommendation\\end{tabular}",
        "\\begin{tabular}{@{}c@{}}Recommendation \\\\ {[usr/s]}\\end{tabular}"
    ]

    for dataset_name in [dataset_name]:

        colum_width = len(column_name_list)

        columns_datasets_list += " \\multicolumn{{{}}}{{c}}{{{}}}  \t".format(
            colum_width, dataset_name)

        if dataset_name != [dataset_name][-1]:
            columns_datasets_list += "&"

        for column_name in column_name_list:

            columns_metrics_list += " {} \t".format(column_name)

            if column_name != column_name_list[-1]:
                columns_metrics_list += "\n\t&"

    columns_datasets_list += "\\\\ \n"
    columns_metrics_list += "\\\\ \n"
    results_file.write(columns_datasets_list)
    results_file.write(columns_metrics_list)
    results_file.flush()

    algorithm_data_to_print_list = get_algorithm_data_to_print_list(
        KNN_similarity_to_report_list=KNN_similarity_to_report_list,
        ICM_names_to_report_list=ICM_names_to_report_list,
        other_algorithm_list=other_algorithm_list)

    for row_index in range(len(algorithm_data_to_print_list)):

        algorithm = algorithm_data_to_print_list[row_index]["algorithm"]
        algorithm_row_label = algorithm_data_to_print_list[row_index][
            "algorithm_row_label"]
        algorithm_file_name = algorithm_data_to_print_list[row_index][
            "algorithm_file_name"]

        result_row_string = algorithm_row_label + "\t&"

        for experiment_subfolder in [result_folder_path]:

            try:
                result_dict = pickle.load(
                    open(
                        experiment_subfolder + algorithm_file_name +
                        "_metadata", "rb"))
            except:
                result_dict = None

            if result_dict is not None:

                data_list = result_dict["train_time_list"]
                data_list = np.array(data_list)

                data_list_not_none_mask = np.array(
                    [val is not None for val in data_list])
                data_list = data_list[data_list_not_none_mask]

                mean, stddev = mean_and_stdd_of_list(data_list)

                if len(result_dict["train_time_list"]) > 1:
                    result_row_string += "{:.{n_decimals}f} $\\pm$ {:.{n_decimals}f} [s]\t&".format(
                        mean, stddev, n_decimals=n_decimals)
                else:
                    new_time_value, new_time_unit = seconds_to_biggest_unit(
                        mean)

                    if new_time_unit == "s":
                        result_row_string += "{:.{n_decimals}f} [{}] \t&".format(
                            mean, "s", n_decimals=n_decimals)
                    else:
                        result_row_string += "{:.{n_decimals}f} [{}] / {:.{n_decimals}f} [{}]\t&".format(
                            mean,
                            "s",
                            new_time_value,
                            new_time_unit,
                            n_decimals=n_decimals)

                # mean, stddev = mean_and_stdd_of_list(result_dict["evaluation_time_list"])
                # result_row_string+= "{:.{n_decimals}f} $\pm$ {:.{n_decimals}f}\t&".format(mean, stddev, n_decimals=n_decimals)
                #
                # if n_validation_users is not None:
                #     evaluation_time = result_dict["evaluation_time_list"][-1]
                #     result_row_string+= "{:.0f}\t&".format(n_validation_users/evaluation_time)
                # else:
                #     result_row_string+=" - \t&"

                non_nan_test_time = []
                for value in result_dict["evaluation_test_time_list"]:
                    if value is not None:
                        non_nan_test_time.append(value)

                mean, stddev = mean_and_stdd_of_list(non_nan_test_time)

                if len(non_nan_test_time) > 1:
                    result_row_string += "{:.{n_decimals}f} $\\pm$ {:.{n_decimals}f}  [s]\t&".format(
                        mean, stddev, n_decimals=n_decimals)
                else:
                    new_time_value, new_time_unit = seconds_to_biggest_unit(
                        mean)

                    # result_row_string+= "{:.{n_decimals}f} \t&".format(mean, n_decimals=n_decimals)
                    if new_time_unit == "s":
                        result_row_string += "{:.{n_decimals}f} [{}] \t&".format(
                            mean, "s", n_decimals=n_decimals)
                    else:
                        result_row_string += "{:.{n_decimals}f} [{}] / {:.{n_decimals}f} [{}]\t&".format(
                            mean,
                            "s",
                            new_time_value,
                            new_time_unit,
                            n_decimals=n_decimals)

                if n_test_users is not None:
                    evaluation_time = non_nan_test_time[-1]
                    result_row_string += "{:.0f}\t".format(n_test_users /
                                                           evaluation_time)
                else:
                    result_row_string += " - \t"

            else:
                result_row_string += " - \t& - \t& - "

        result_row_string += "\\\\ \n"
        results_file.write(result_row_string)
        results_file.flush()

    results_file.close()
Code example #13
File: Evaluator.py Project: caroprese/CMN_CNR
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a BaseRecommender subclass
        :param URM_test_list: list of URMs to test the recommender against, or a single URM object
        :param cutoff_list: list of cutoffs to be use to report the scores, or a single cutoff
        """

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_users_evaluated = 0

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)

        for test_user in self.usersToEvaluate:

            # Since the URM is CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)
            relevant_items_rating = self.get_user_test_ratings(test_user)

            n_users_evaluated += 1

            items_to_compute = self._get_user_specific_items_to_compute(
                test_user)

            recommended_items, all_items_predicted_ratings = recommender_object.recommend(
                np.atleast_1d(test_user),
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                items_to_compute=items_to_compute,
                remove_CustomItems_flag=self.ignore_items_flag,
                return_scores=True)

            assert len(
                recommended_items
            ) == 1, "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

            assert all_items_predicted_ratings.shape[
                0] == 1, "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                    self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0],
                    1)

            assert all_items_predicted_ratings.shape[
                1] == self.n_items, "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                    self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1],
                    self.n_items)

            recommended_items = np.array(recommended_items[0])
            user_rmse = rmse(all_items_predicted_ratings[0], relevant_items,
                             relevant_items_rating)

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            # (CNR) -------------------------------------------------
            weighted_hits = np.zeros(len(recommended_items))
            log_weighted_hits = np.zeros(len(recommended_items))

            pos_weighted_hits = np.zeros(len(recommended_items))
            pos_log_weighted_hits = np.zeros(len(recommended_items))
            '''
            alpha, beta, scale, pi = 100, 0.03, 1 / 15, np.pi
            percentile = get_percentile(a, 45)

            f = 1 / (beta * np.sqrt(2 * pi))

            y_a = np.tanh(alpha * a) + scale * f * np.exp(-1 / (2 * (beta ** 2)) * (a - percentile) ** 2)
            y_a = y_a / max(y_a)
            '''

            # e.g. recommended_items = [2, 7, 10, 70, 5464]
            # e.g. is_relevant       = [0, 0,  1,  0,    0]
            for i in range(len(recommended_items)):
                if is_relevant[i]:
                    weighted_hits[i] = 1 / (
                        1 + Settings.popularity[recommended_items[i]])
                    # e.g. weighted_hits = [1, 7, 10, 70, 5464]
                    pos_weighted_hits[i] = 1 / (
                        1 + i + Settings.popularity[recommended_items[i]])

                    log_weighted_hits[i] = 1 / (1 + math.log(
                        1 + Settings.popularity[recommended_items[i]]))
                    pos_log_weighted_hits[i] = 1 / (1 + i + math.log(
                        1 + Settings.popularity[recommended_items[i]]))
                    # -------------------------------------------------------

            number_of_guessed_items = 0

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                # This array is fundamental.
                # Given a user, results_current_cutoff has the form [0, 1, 0, 0, 0].
                # Its length equals the cutoff.
                # In the example above, the vector tells us that the item
                # recommended in second position is a hit, i.e. it appears in the test set.
                # Careful though: the items considered are those in the test set plus the
                # negative ones (which we know the user does not like).

                is_relevant_current_cutoff = is_relevant[0:cutoff]

                # (CNR) -------------------------------------------------------
                custom_hits = np.zeros(len(recommended_items))

                for i in range(len(recommended_items)):
                    if is_relevant[i]:
                        # print('Luciano > Computing custom weight. Parameters (pop, pos, cutoff):', Settings.popularity[recommended_items[i]], i, cutoff)
                        custom_hits[i] = y_custom(
                            Settings.popularity[recommended_items[i]], i,
                            cutoff)

                        if custom_hits[i] > 1:
                            print(
                                '=============================================================='
                            )
                            print(
                                'Luciano > WARNING! custom_hits[{}]={}'.format(
                                    i, custom_hits[i]))
                            print(
                                '=============================================================='
                            )

                weighted_hits_current_cutoff = weighted_hits[0:cutoff]
                log_weighted_hits_current_cutoff = log_weighted_hits[0:cutoff]
                pos_weighted_hits_current_cutoff = pos_weighted_hits[0:cutoff]
                pos_log_weighted_hits_current_cutoff = pos_log_weighted_hits[
                    0:cutoff]
                custom_hits_current_cutoff = custom_hits[0:cutoff]
                # -------------------------------------------------------------

                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.
                    value] += precision_recall_min_denominator(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                number_of_guessed_items = is_relevant_current_cutoff.sum()
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()

                # (CNR) -------------------------------------------------
                verbose = False
                if verbose and is_relevant_current_cutoff.sum() > 1:
                    print(
                        '============================================================================='
                    )
                    print('user:', test_user)
                    print('is_relevant_current_cutoff:',
                          is_relevant_current_cutoff)
                    print('recommended_items_current_cutoff:',
                          recommended_items_current_cutoff)
                    print('Warning! is_relevant_current_cutoff.sum()>1:',
                          is_relevant_current_cutoff.sum())
                    print('relevant_items:', relevant_items)
                    print('relevant_items_rating:', relevant_items_rating)
                    print('items_to_compute:', items_to_compute)
                    print(
                        '============================================================================='
                    )

                results_current_cutoff[
                    EvaluatorMetrics.WEIGHTED_HIT_RATE.
                    value] += weighted_hits_current_cutoff.sum()
                results_current_cutoff[
                    EvaluatorMetrics.LOG_WEIGHTED_HIT_RATE.
                    value] += log_weighted_hits_current_cutoff.sum()

                results_current_cutoff[
                    EvaluatorMetrics.POS_WEIGHTED_HIT_RATE.
                    value] += pos_weighted_hits_current_cutoff.sum()
                results_current_cutoff[
                    EvaluatorMetrics.POS_LOG_WEIGHTED_HIT_RATE.
                    value] += pos_log_weighted_hits_current_cutoff.sum()

                results_current_cutoff[
                    EvaluatorMetrics.CUSTOM_HIT_RATE.
                    value] += custom_hits_current_cutoff.sum()
                # -------------------------------------------------------

                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.RMSE.value] += user_rmse

                results_current_cutoff[
                    EvaluatorMetrics.MRR.value].add_recommendations(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.MAP.value].add_recommendations(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            verbose = False
            if verbose and test_user % 1000 == 0:
                if number_of_guessed_items > 0:
                    print(
                        'Test ======================================================='
                    )
                    print('user:', test_user)
                    print('relevant items:', relevant_items)
                    print('relevant items rating:', relevant_items_rating)
                    print('items_to_compute:\n', items_to_compute)
                    print('len(items_to_compute):', len(items_to_compute))
                    print('recommended_items:', recommended_items)
                    print('is_relevant:', is_relevant)
                    print('number_of_guessed_items:', number_of_guessed_items)
                    print(
                        '============================================================'
                    )
                else:
                    print('.')

            if time.time() - start_time_print > 30 or n_users_evaluated == len(
                    self.usersToEvaluate):
                elapsed_time = time.time() - start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(
                    elapsed_time)

                print(
                    "{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}"
                    .format(
                        self.EVALUATOR_NAME, n_users_evaluated, 100.0 *
                        float(n_users_evaluated) / len(self.usersToEvaluate),
                        new_time_value, new_time_unit,
                        float(n_users_evaluated) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        if n_users_evaluated > 0:

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value / n_users_evaluated

                precision_ = results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (
                        precision_ * recall_) / (precision_ + recall_)

        else:
            print(
                "WARNING: No users had a sufficient number of relevant items")

        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()

        results_run_string = get_result_string(results_dict)

        return (results_dict, results_run_string)
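The (CNR) blocks above discount each hit by the popularity of the recommended item, so hits on long-tail items count more. A small self-contained sketch of that weighting (the popularity values and recommendation list are made up):

import numpy as np

# Hypothetical item popularity counts and one recommendation list.
popularity = np.array([120, 3, 45, 8, 600])
recommended_items = np.array([0, 2, 4])
is_relevant = np.array([False, True, True])

hit_pop = popularity[recommended_items]
weighted_hits = np.where(is_relevant, 1.0 / (1.0 + hit_pop), 0.0)
log_weighted_hits = np.where(is_relevant, 1.0 / (1.0 + np.log1p(hit_pop)), 0.0)

# A hit on the niche item 2 (popularity 45) contributes more than a hit
# on the blockbuster item 4 (popularity 600).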
Code example #14
    def _compute_metrics_on_recommendation_list(self, test_user_batch_array,
                                                recommended_items_batch_list,
                                                scores_batch, results_dict):

        assert len(recommended_items_batch_list) == len(
            test_user_batch_array
        ), "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
            self.EVALUATOR_NAME, len(recommended_items_batch_list),
            len(test_user_batch_array))

        assert scores_batch.shape[0] == len(
            test_user_batch_array
        ), "{}: scores_batch contained scores for {} users, expected was {}".format(
            self.EVALUATOR_NAME, scores_batch.shape[0],
            len(test_user_batch_array))

        assert scores_batch.shape[
            1] == self.n_items, "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)

        # Compute recommendation quality for each user in batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            test_user = test_user_batch_array[batch_user_index]

            #print(f"Evaluating user: {test_user}")

            relevant_items = self.get_user_relevant_items(test_user)

            # Add the RMSE to the global object, no need to loop through the various cutoffs
            # This repository is not designed to ensure proper RMSE optimization
            # relevant_items_rating = self.get_user_test_ratings(test_user)
            #
            # all_items_predicted_ratings = scores_batch[batch_user_index]
            # global_RMSE_object = results_dict[self.cutoff_list[0]][EvaluatorMetrics.RMSE.value]
            # global_RMSE_object.add_recommendations(all_items_predicted_ratings, relevant_items, relevant_items_rating)

            # Since the URM is CSR, the indices are the non-zero column indexes
            recommended_items = recommended_items_batch_list[batch_user_index]

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            #print(f"Is Relevant: {is_relevant}")

            self._n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.
                    value] += precision_recall_min_denominator(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.MRR.value].add_recommendations(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.MAP.value].add_recommendations(
                        is_relevant_current_cutoff, relevant_items)
                #print(results_current_cutoff[EvaluatorMetrics.MAP.value])
                #print("----------------------------------------------------------------------------------------------------")
                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM_CORRECT.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff,
                                           is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER_CORRECT.
                                       value].add_recommendations(
                                           is_relevant_current_cutoff,
                                           test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

        if time.time() - self._start_time_print > 30 or self._n_users_evaluated == len(self.users_to_evaluate):

            elapsed_time = time.time() - self._start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            self._print("Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                self._n_users_evaluated,
                100.0 * float(self._n_users_evaluated) / len(self.users_to_evaluate),
                new_time_value, new_time_unit,
                float(self._n_users_evaluated) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            self._start_time_print = time.time()

        return results_dict

    def compute_similarity(self, start_col=None, end_col=None, block_size=100):
        """
        Compute the similarity for the given dataset
        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :param block_size: number of columns to process in each vectorized block
        :return: the similarity matrix as a sparse csr_matrix
        """

        values = []
        rows = []
        cols = []

        start_time = time.time()
        start_time_print_batch = start_time
        processed_items = 0

        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col > 0 and start_col < self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
            end_col_local = end_col

        # Compute sum of squared values
        item_distance_initial = np.array(
            self.dataMatrix.power(2).sum(axis=0)).ravel()
        sumOfSquared = np.sqrt(item_distance_initial)

        start_col_block = start_col_local

        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            # Compute block first and last column
            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block - start_col_block

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray()

            # Compute item similarities
            if self.use_row_weights:
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
            else:
                this_block_weights = self.dataMatrix.T.dot(item_data)

            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights.ravel()
                else:
                    this_column_weights = this_block_weights[:, col_index_in_block]

                columnIndex = col_index_in_block + start_col_block

                # (a-b)^2 = a^2 + b^2 - 2ab
                item_distance = item_distance_initial.copy()
                item_distance += item_distance_initial[columnIndex]

                item_distance -= 2 * this_column_weights
                item_distance[columnIndex] = 0.0

                if self.use_row_weights:
                    item_distance = np.multiply(item_distance,
                                                self.row_weights)

                if self.normalize:
                    denominator = sumOfSquared[columnIndex] * sumOfSquared
                    item_distance[denominator != 0.0] /= denominator[denominator != 0.0]

                if self.normalize_avg_row:
                    item_distance /= self.n_rows

                nonzero_distance_mask = item_distance > 0.0
                item_distance[nonzero_distance_mask] = np.sqrt(
                    item_distance[nonzero_distance_mask])

                if self.similarity_is_exp:
                    item_similarity = 1 / (np.exp(item_distance) +
                                           self.shrink + 1e-9)

                elif self.similarity_is_lin:
                    item_similarity = 1 / (item_distance + self.shrink + 1e-9)

                elif self.similarity_is_log:
                    item_similarity = 1 / (np.log(item_distance + 1) +
                                           self.shrink + 1e-9)

                else:
                    raise ValueError("Unsupported distance-to-similarity transformation, expected 'exp', 'lin' or 'log'")

                item_similarity[columnIndex] = 0.0
                this_column_weights = item_similarity

                # Sort indices and select TopK
                # Sorting is done in three steps. Faster than a plain np.argsort when the number of items is large
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.TopK - 1)[0:self.TopK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                values.extend(this_column_weights[top_k_idx][notZerosMask])
                rows.extend(top_k_idx[notZerosMask])
                cols.extend(np.ones(numNotZeros) * columnIndex)

            # Add previous block size
            start_col_block += this_block_size
            processed_items += this_block_size

            if time.time() - start_time_print_batch >= 300 or end_col_block == end_col_local:
                column_per_sec = processed_items / (time.time() - start_time +
                                                    1e-9)
                new_time_value, new_time_unit = seconds_to_biggest_unit(
                    time.time() - start_time)

                print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                    processed_items,
                    processed_items / (end_col_local - start_col_local) * 100,
                    column_per_sec, new_time_value, new_time_unit))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()

        # End while on columns

        W_sparse = sps.csr_matrix((values, (rows, cols)),
                                  shape=(self.n_columns, self.n_columns),
                                  dtype=np.float32)

        return W_sparse
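The distance computation above leans on the identity (a-b)^2 = a^2 + b^2 - 2ab, which turns all pairwise squared Euclidean distances into a single matrix product. A minimal stand-alone sketch of the same trick on a dense toy matrix (the variable names here are illustrative, not part of the class above):

import numpy as np

X = np.random.rand(5, 4)                 # toy data: 5 users x 4 items

sq_norms = (X ** 2).sum(axis=0)          # a^2 terms, one per item column
dot_products = X.T @ X                   # all "ab" terms in one product

# (a - b)^2 = a^2 + b^2 - 2ab, vectorized over every item pair
sq_distances = sq_norms[:, None] + sq_norms[None, :] - 2 * dot_products
sq_distances = np.maximum(sq_distances, 0.0)   # guard against tiny negative rounding errors

# Verify against the direct (much slower) computation
direct = ((X[:, :, None] - X[:, None, :]) ** 2).sum(axis=0)
assert np.allclose(sq_distances, direct)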
Code example #16
    def _compute_metrics_on_recommendation_list(self, test_user_batch_array,
                                                recommended_items_batch_list,
                                                scores_batch, results_dict):

        assert len(recommended_items_batch_list) == len(test_user_batch_array), \
            "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items_batch_list), len(test_user_batch_array))

        assert scores_batch.shape[0] == len(test_user_batch_array), \
            "{}: scores_batch contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[0], len(test_user_batch_array))

        assert scores_batch.shape[1] == self.n_items, \
            "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)

        # Compute recommendation quality for each user in batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            test_user = test_user_batch_array[batch_user_index]

            relevant_items = self.get_user_relevant_items(test_user)

            # Being the URM CSR, the indices are the non-zero column indexes
            recommended_items = recommended_items_batch_list[batch_user_index]
            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            self._n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr_all_hits(is_relevant_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP_MIN_DEN.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value].add_recommendations(is_relevant_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM_HIT.value].add_recommendations(recommended_items_current_cutoff, is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER_HIT.value].add_recommendations(is_relevant_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.RATIO_SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RATIO_DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RATIO_DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RATIO_NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RATIO_AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        if time.time() - self._start_time_print > 300 or self._n_users_evaluated == len(self.users_to_evaluate):

            elapsed_time = time.time() - self._start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            self._print("Processed {} ({:4.1f}%) in {:.2f} {}. Users per second: {:.0f}".format(
                self._n_users_evaluated,
                100.0 * float(self._n_users_evaluated) / len(self.users_to_evaluate),
                new_time_value, new_time_unit,
                float(self._n_users_evaluated) / elapsed_time if elapsed_time > 0.0 else np.nan))

            sys.stdout.flush()
            sys.stderr.flush()

            self._start_time_print = time.time()

        return results_dict
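Two accumulation styles coexist in the loop above: plain scalar metrics (precision, recall, NDCG) are summed per user with += and averaged once all users are processed, while the stateful metrics expose an add_recommendations() method and compute their value on demand. A minimal sketch of the pattern with a hypothetical stand-in class (not the framework's actual metric objects):

import numpy as np

class MeanAveragePrecisionSketch:
    """Stateful metric: accumulates per-user average precision, returns the mean."""
    def __init__(self):
        self.cumulative_ap = 0.0
        self.n_users = 0

    def add_recommendations(self, is_relevant, relevant_items):
        ranks = np.arange(1, len(is_relevant) + 1)
        precision_at_k = np.cumsum(is_relevant) / ranks
        denominator = min(len(relevant_items), len(is_relevant))
        self.cumulative_ap += (precision_at_k * is_relevant).sum() / denominator
        self.n_users += 1

    def get_metric_value(self):
        return self.cumulative_ap / self.n_users if self.n_users else 0.0

results = {"PRECISION": 0.0, "MAP": MeanAveragePrecisionSketch()}

is_relevant = np.array([True, False, True])        # hits at ranks 1 and 3
results["PRECISION"] += is_relevant.mean()         # scalar style: summed, averaged later
results["MAP"].add_recommendations(is_relevant, relevant_items=[10, 42])

n_users_evaluated = 1
print(results["PRECISION"] / n_users_evaluated)    # scalar metrics divided at the end
print(results["MAP"].get_metric_value())           # stateful metrics compute their own mean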
Code example #17
    def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size = None):

        if block_size is None:
            block_size = min(1000, int(1e8/self.n_items))

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                             recommender_object.get_URM_train(),
                                                             self.ignore_items_ID,
                                                             self.ignore_users_ID,
                                                             cutoff,
                                                             self.diversity_object)

        n_users_evaluated = 0

        # Process the users in blocks of block_size
        user_batch_start = 0
        user_batch_end = 0

        while user_batch_start < len(usersToEvaluate):

            user_batch_end = user_batch_start + block_size
            user_batch_end = min(user_batch_end, len(usersToEvaluate))

            test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
            user_batch_start = user_batch_end

            # Compute predictions for a batch of users using vectorization, much more efficient than computing it one at a time
            recommended_items_batch_list, scores_batch = recommender_object.recommend(test_user_batch_array,
                                                                      remove_seen_flag=self.exclude_seen,
                                                                      cutoff = self.max_cutoff,
                                                                      remove_top_pop_flag=False,
                                                                      remove_CustomItems_flag=self.ignore_items_flag,
                                                                      return_scores = True
                                                                     )


            assert len(recommended_items_batch_list) == len(test_user_batch_array), "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items_batch_list), len(test_user_batch_array))

            assert scores_batch.shape[0] == len(test_user_batch_array), "{}: scores_batch contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[0], len(test_user_batch_array))

            assert scores_batch.shape[1] == self.n_items, "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)


            # Compute recommendation quality for each user in batch
            for batch_user_index in range(len(recommended_items_batch_list)):

                test_user = test_user_batch_array[batch_user_index]

                relevant_items = self.get_user_relevant_items(test_user)
                relevant_items_rating = self.get_user_test_ratings(test_user)

                all_items_predicted_ratings = scores_batch[batch_user_index]
                user_rmse = rmse(all_items_predicted_ratings, relevant_items, relevant_items_rating)

                # Being the URM CSR, the indices are the non-zero column indexes
                recommended_items = recommended_items_batch_list[batch_user_index]
                is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

                n_users_evaluated += 1

                for cutoff in self.cutoff_list:

                    results_current_cutoff = results_dict[cutoff]

                    is_relevant_current_cutoff = is_relevant[0:cutoff]
                    recommended_items_current_cutoff = recommended_items[0:cutoff]

                    results_current_cutoff[EvaluatorMetrics.ROC_AUC.value]              += roc_auc(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.PRECISION.value]            += precision(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value]   += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                    results_current_cutoff[EvaluatorMetrics.RECALL.value]               += recall(is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.NDCG.value]                 += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                    results_current_cutoff[EvaluatorMetrics.HIT_RATE.value]             += is_relevant_current_cutoff.sum()
                    results_current_cutoff[EvaluatorMetrics.ARHR.value]                 += arhr(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.RMSE.value]                 += user_rmse

                    results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                    if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                        results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)


                if time.time() - start_time_print > 30 or n_users_evaluated == len(usersToEvaluate):

                    elapsed_time = time.time() - start_time
                    new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                    print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                                  self.EVALUATOR_NAME,
                                  n_users_evaluated,
                                  100.0 * float(n_users_evaluated) / len(usersToEvaluate),
                                  new_time_value, new_time_unit,
                                  float(n_users_evaluated) / elapsed_time))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print = time.time()



        return results_dict, n_users_evaluated
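This variant additionally accumulates a per-user RMSE between the predicted scores and the known test ratings of the relevant items. The rmse helper itself comes from the framework's metrics module; a rough stand-alone equivalent, under that assumption, could look like this:

import numpy as np

def rmse_sketch(all_items_predicted_ratings, relevant_items, relevant_items_rating):
    # Compare predicted scores against the test ratings, restricted to the relevant items
    predicted = np.asarray(all_items_predicted_ratings)[relevant_items]
    true_ratings = np.asarray(relevant_items_rating, dtype=float)
    return np.sqrt(np.mean((predicted - true_ratings) ** 2))

scores = np.array([0.2, 4.1, 1.0, 3.4, 0.3])    # one predicted score per item
print(rmse_sketch(scores, relevant_items=[1, 3], relevant_items_rating=[4.0, 3.0]))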
Code example #18
    def run_fit(self):
        # Display ConvergenceWarning only once and not for every item it occurs
        warnings.simplefilter("once", category=ConvergenceWarning)

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=1e-4,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        # Use array as it reduces memory requirements compared to lists
        dataBlock = 10000000

        rows = np.zeros(dataBlock, dtype=np.int32)
        cols = np.zeros(dataBlock, dtype=np.int32)
        values = np.zeros(dataBlock, dtype=np.float32)

        numCells = 0

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            if y.sum() == 0.0:
                continue

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            for index in range(len(ranking)):

                if numCells == len(rows):
                    rows = np.concatenate(
                        (rows, np.zeros(dataBlock, dtype=np.int32)))
                    cols = np.concatenate(
                        (cols, np.zeros(dataBlock, dtype=np.int32)))
                    values = np.concatenate(
                        (values, np.zeros(dataBlock, dtype=np.float32)))

                rows[numCells] = nonzero_model_coef_index[ranking[index]]
                cols[numCells] = currentItem
                values[numCells] = nonzero_model_coef_value[ranking[index]]

                numCells += 1

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
                print(
                    "Processed {} ( {:.2f}% ) in {:.2f} {}. Items per second: {:.2f}"
                    .format(currentItem + 1,
                            100.0 * float(currentItem + 1) / n_items,
                            new_time_value, new_time_unit,
                            float(currentItem + 1) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        # generate the sparse weight matrix

        self.W_sparse = sps.csr_matrix(
            (values[:numCells], (rows[:numCells], cols[:numCells])),
            shape=(n_items, n_items),
            dtype=np.float32)
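The run_fit above is the classic SLIM-ElasticNet scheme: one regression per item column, with the target column temporarily zeroed so an item cannot predict itself. A compact, self-contained sketch of that scheme on a random toy URM (hyperparameter values here are illustrative only):

import numpy as np
import scipy.sparse as sps
from sklearn.linear_model import ElasticNet

URM = sps.random(50, 20, density=0.2, format="csc", dtype=np.float32, random_state=42)

model = ElasticNet(alpha=1e-4, l1_ratio=0.1, positive=True, fit_intercept=False,
                   copy_X=False, selection="random", max_iter=100, tol=1e-4)

n_items = URM.shape[1]
W = np.zeros((n_items, n_items), dtype=np.float32)

for item in range(n_items):
    y = URM[:, item].toarray().ravel()

    # Zero out the target column so the item cannot be used to predict itself
    start, end = URM.indptr[item], URM.indptr[item + 1]
    backup = URM.data[start:end].copy()
    URM.data[start:end] = 0.0

    model.fit(URM, y)
    W[:, item] = model.coef_

    URM.data[start:end] = backup    # restore the column

print("nonzero weights:", np.count_nonzero(W))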
Code example #19
    def search(
        self,
        recommender_input_args,
        hyperparameter_search_space,
        metric_to_optimize=None,
        cutoff_to_optimize=None,
        n_cases=None,
        n_random_starts=None,
        output_folder_path=None,
        output_file_name_root=None,
        save_model="best",
        save_metadata=True,
        resume_from_saved=False,
        recommender_input_args_last_test=None,
        evaluate_on_test="best",
        max_total_time=None,
        terminate_on_memory_error=True,
    ):
        """

        :param recommender_input_args:
        :param hyperparameter_search_space:
        :param metric_to_optimize:
        :param cutoff_to_optimize:
        :param n_cases:
        :param n_random_starts:
        :param output_folder_path:
        :param output_file_name_root:
        :param save_model:          "no"    don't save anything
                                    "all"   save every model
                                    "best"  save the best model trained on train data alone and on last, if present
                                    "last"  save only last, if present
        :param save_metadata:
        :param recommender_input_args_last_test:
        :return:
        """

        ### default hyperparameters for BayesianSkopt are set here
        self._set_skopt_params()

        self._set_search_attributes(
            recommender_input_args, recommender_input_args_last_test,
            hyperparameter_search_space.keys(), metric_to_optimize,
            cutoff_to_optimize, output_folder_path, output_file_name_root,
            resume_from_saved, save_metadata, save_model, evaluate_on_test,
            n_cases, terminate_on_memory_error)

        self.n_random_starts = n_random_starts
        self.n_calls = n_cases
        self.n_jobs = 1
        self.n_loaded_counter = 0

        self.max_total_time = max_total_time

        if self.max_total_time is not None:
            total_time_value, total_time_unit = seconds_to_biggest_unit(
                self.max_total_time)
            self._print(
                "{}: The search has a maximum allotted time of {:.2f} {}".
                format(self.ALGORITHM_NAME, total_time_value, total_time_unit))

        self.hyperparams = dict()
        self.hyperparams_names = list()
        self.hyperparams_values = list()

        skopt_types = [Real, Integer, Categorical]

        for name, hyperparam in hyperparameter_search_space.items():

            if any(
                    isinstance(hyperparam, sko_type)
                    for sko_type in skopt_types):
                self.hyperparams_names.append(name)
                self.hyperparams_values.append(hyperparam)
                self.hyperparams[name] = hyperparam

            else:
                raise ValueError(
                    "{}: Unexpected hyperparameter type: {} - {}".format(
                        self.ALGORITHM_NAME, str(name), str(hyperparam)))

        try:
            if self.resume_from_saved:
                hyperparameters_list_input, result_on_validation_list_saved = self._resume_from_saved()
                self.x0 = hyperparameters_list_input
                self.y0 = result_on_validation_list_saved

                self.n_loaded_counter = self.model_counter

            if self.n_calls - self.model_counter > 0:
                # When resuming an incomplete search the gp_minimize will continue to tell you "Evaluating function at random point" instead
                # of "Searching for the next optimal point". This may be due to a bug in the print rather than the underlying process
                # https://github.com/scikit-optimize/scikit-optimize/issues/949
                self.result = gp_minimize(
                    self._objective_function_list_input,
                    self.hyperparams_values,
                    base_estimator=None,
                    n_calls=max(0, self.n_calls - self.model_counter),
                    n_initial_points=max(
                        0, self.n_random_starts - self.model_counter),
                    initial_point_generator="random",
                    acq_func=self.acq_func,
                    acq_optimizer=self.acq_optimizer,
                    x0=self.x0,
                    y0=self.y0,
                    random_state=self.random_state,
                    verbose=self.verbose,
                    callback=None,
                    n_points=self.n_point,
                    n_restarts_optimizer=self.n_restarts_optimizer,
                    xi=self.xi,
                    kappa=self.kappa,
                    noise=self.noise,
                    n_jobs=self.n_jobs)

        except ValueError as e:
            self._write_log(
                "{}: Search interrupted due to ValueError. The evaluated configurations may have had all the same value.\n"
                .format(self.ALGORITHM_NAME))
            return

        except NoValidConfigError as e:
            self._write_log("{}: Search interrupted. {}\n".format(
                self.ALGORITHM_NAME, e))
            return

        except TimeoutError as e:
            # When in TimeoutError, stop search but continue to train the _last model, if requested
            self._write_log("{}: Search interrupted. {}\n".format(
                self.ALGORITHM_NAME, e))

        if self.n_loaded_counter < self.model_counter:
            self._write_log(
                "{}: Search complete. Best config is {}: {}\n".format(
                    self.ALGORITHM_NAME,
                    self.metadata_dict["hyperparameters_best_index"],
                    self.metadata_dict["hyperparameters_best"]))

        if self.recommender_input_args_last_test is not None:
            self._evaluate_on_test_with_data_last()
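The search above ultimately delegates to scikit-optimize's gp_minimize. A minimal sketch of that call on a toy objective, assuming scikit-optimize is installed (the objective here is a stand-in for "train the recommender and return the negated validation metric"):

from skopt import gp_minimize
from skopt.space import Real, Integer

search_space = [Real(1e-5, 1e-1, prior="log-uniform", name="learning_rate"),
                Integer(5, 200, name="topK")]

def objective(hyperparams):
    learning_rate, topK = hyperparams
    # Stand-in for the real evaluation: smaller is better
    return (learning_rate - 1e-3) ** 2 + abs(topK - 50) / 1000.0

result = gp_minimize(objective,
                     search_space,
                     n_calls=15,            # total configurations, like n_cases
                     n_initial_points=5,    # random exploration, like n_random_starts
                     random_state=42)

print("best value:", result.fun)
print("best hyperparameters:", result.x)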
Code example #20
    def compute_similarity(self, start_col=None, end_col=None, block_size = 100):
        """
        Compute the similarity for the given dataset
        :param self:
        :param start_col: column to begin with
        :param end_col: column to stop before, end_col is excluded
        :return:
        """

        similarity_builder = Incremental_Similarity_Builder(self.n_columns, initial_data_block=self.n_columns*self.topK, dtype = np.float32)

        start_time = time.time()
        start_time_print_batch = start_time
        processed_items = 0


        if self.adjusted_cosine:
            self.applyAdjustedCosine()

        elif self.pearson_correlation:
            self.applyPearsonCorrelation()

        elif self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient:
            self.useOnlyBooleanInteractions()


        # We explore the matrix column-wise
        self.dataMatrix = check_matrix(self.dataMatrix, 'csc')


        # Compute sum of squared values to be used in normalization
        sum_of_squared = np.array(self.dataMatrix.power(2).sum(axis=0)).ravel()

        # Tanimoto does not require the square root to be applied
        if not (self.tanimoto_coefficient or self.dice_coefficient or self.tversky_coefficient):
            sum_of_squared = np.sqrt(sum_of_squared)


        if self.asymmetric_cosine:
            sum_of_squared_to_alpha = np.power(sum_of_squared + 1e-6, 2 * self.asymmetric_alpha)
            sum_of_squared_to_1_minus_alpha = np.power(sum_of_squared + 1e-6, 2 * (1 - self.asymmetric_alpha))


        start_col_local = 0
        end_col_local = self.n_columns

        if start_col is not None and start_col > 0 and start_col < self.n_columns:
            start_col_local = start_col

        if end_col is not None and end_col > start_col_local and end_col < self.n_columns:
            end_col_local = end_col

        start_col_block = start_col_local

        this_block_size = 0

        # Compute all similarities for each item using vectorization
        while start_col_block < end_col_local:

            # Compute block first and last column
            end_col_block = min(start_col_block + block_size, end_col_local)
            this_block_size = end_col_block-start_col_block

            # All data points for a given item
            item_data = self.dataMatrix[:, start_col_block:end_col_block]
            item_data = item_data.toarray()

            # Compute item similarities
            if self.use_row_weights:
                this_block_weights = self.dataMatrix_weighted.T.dot(item_data)
            else:
                this_block_weights = self.dataMatrix.T.dot(item_data)


            for col_index_in_block in range(this_block_size):

                if this_block_size == 1:
                    this_column_weights = this_block_weights.ravel()
                else:
                    this_column_weights = this_block_weights[:,col_index_in_block]


                columnIndex = col_index_in_block + start_col_block
                this_column_weights[columnIndex] = 0.0

                # Apply normalization and shrinkage, ensure denominator != 0
                if self.normalize:

                    if self.asymmetric_cosine:
                        denominator = sum_of_squared_to_alpha[columnIndex] * sum_of_squared_to_1_minus_alpha + self.shrink + 1e-6
                    else:
                        denominator = sum_of_squared[columnIndex] * sum_of_squared + self.shrink + 1e-6

                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)


                # Apply the specific denominator for Tanimoto
                elif self.tanimoto_coefficient:
                    denominator = sum_of_squared[columnIndex] + sum_of_squared - this_column_weights + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.dice_coefficient:
                    denominator = sum_of_squared[columnIndex] + sum_of_squared + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                elif self.tversky_coefficient:
                    denominator = this_column_weights + \
                                  (sum_of_squared[columnIndex] - this_column_weights)*self.tversky_alpha + \
                                  (sum_of_squared - this_column_weights)*self.tversky_beta + self.shrink + 1e-6
                    this_column_weights = np.multiply(this_column_weights, 1 / denominator)

                # If no normalization or tanimoto is selected, apply only shrink
                elif self.shrink != 0:
                    this_column_weights = this_column_weights/self.shrink



                # Sort indices and select topK
                # Sorting is done in three steps. Faster than a plain np.argsort when the number of items is large
                # - Partition the data to extract the set of relevant items
                # - Sort only the relevant items
                # - Get the original item index
                relevant_items_partition = (-this_column_weights).argpartition(self.topK - 1)[0:self.topK]
                relevant_items_partition_sorting = np.argsort(-this_column_weights[relevant_items_partition])
                top_k_idx = relevant_items_partition[relevant_items_partition_sorting]

                # Incrementally build sparse matrix, do not add zeros
                notZerosMask = this_column_weights[top_k_idx] != 0.0
                numNotZeros = np.sum(notZerosMask)

                similarity_builder.add_data_lists(row_list_to_add=top_k_idx[notZerosMask],
                                                  col_list_to_add=np.ones(numNotZeros) * columnIndex,
                                                  data_list_to_add=this_column_weights[top_k_idx][notZerosMask])

            # Add previous block size
            start_col_block += this_block_size
            processed_items += this_block_size

            if time.time() - start_time_print_batch >= 300 or end_col_block==end_col_local:
                column_per_sec = processed_items / (time.time() - start_time + 1e-9)
                new_time_value, new_time_unit = seconds_to_biggest_unit(time.time() - start_time)

                print("Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}".format(
                    processed_items, processed_items / (end_col_local - start_col_local) * 100, column_per_sec, new_time_value, new_time_unit))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print_batch = time.time()


        # End while on columns
        W_sparse = similarity_builder.get_SparseMatrix()

        return W_sparse
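In the asymmetric-cosine branch above the dot product is divided by ||i||^(2*alpha) * ||j||^(2*(1-alpha)); with alpha = 0.5 this reduces to the standard cosine. A small numeric check of that relationship (illustrative only, on dense toy data):

import numpy as np

X = np.random.rand(10, 4)                    # 10 users x 4 items
norms = np.sqrt((X ** 2).sum(axis=0))        # the sum_of_squared of the method above
dot = X.T @ X

alpha = 0.5
denominator = np.power(norms[:, None], 2 * alpha) * np.power(norms[None, :], 2 * (1 - alpha))
asymmetric_cosine = dot / denominator

standard_cosine = dot / (norms[:, None] * norms[None, :])
assert np.allclose(asymmetric_cosine, standard_cosine)   # alpha = 0.5 is plain cosine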
Code example #21
    def _train_with_early_stopping(self, epochs_max, epochs_min=0,
                                   validation_every_n=None, stop_on_validation=False,
                                   validation_metric=None, lower_validations_allowed=None, evaluator_object=None,
                                   algorithm_name="Incremental_Training_Early_Stopping"):

        start_time = time.time()

        self.best_validation_metric = None
        lower_validations_count = 0
        convergence = False

        self.epochs_best = 0

        epochs_current = 0

        while epochs_current < epochs_max and not convergence:

            self._run_epoch(epochs_current)

            # If no validation required, always keep the latest
            if evaluator_object is None:

                self.epochs_best = epochs_current

            # Determine whether a validation step is required
            elif (epochs_current + 1) % validation_every_n == 0:

                print("{}: Validation begins...".format(algorithm_name))

                self._prepare_model_for_validation()

                # If the evaluator validation has multiple cutoffs, choose the first one
                results_run, results_run_string = evaluator_object.evaluateRecommender(self)
                results_run = results_run[list(results_run.keys())[0]]

                print("{}: {}".format(algorithm_name, results_run_string))

                # Update optimal model
                current_metric_value = results_run[validation_metric]

                if not np.isfinite(current_metric_value):
                    if isinstance(self, BaseTempFolder):
                        # If the recommender uses BaseTempFolder, clean the temp folder
                        self._clean_temp_folder(temp_file_folder=self.temp_file_folder)

                    assert False, "{}: metric value is not a finite number, terminating!".format(RECOMMENDER_NAME)


                if self.best_validation_metric is None or self.best_validation_metric < current_metric_value:

                    print("{}: New best model found! Updating.".format(algorithm_name))

                    self.best_validation_metric = current_metric_value

                    self._update_best_model()

                    self.epochs_best = epochs_current + 1
                    lower_validations_count = 0

                else:
                    lower_validations_count += 1


                if stop_on_validation and lower_validations_count >= lower_validations_allowed and epochs_current >= epochs_min:
                    convergence = True

                    elapsed_time = time.time() - start_time
                    new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                    print("{}: Convergence reached! Terminating at epoch {}. Best value for '{}' at epoch {} is {:.4f}. Elapsed time {:.2f} {}".format(
                        algorithm_name, epochs_current+1, validation_metric, self.epochs_best, self.best_validation_metric, new_time_value, new_time_unit))


            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)


            print("{}: Epoch {} of {}. Elapsed time {:.2f} {}".format(
                algorithm_name, epochs_current+1, epochs_max, new_time_value, new_time_unit))


            epochs_current += 1

            sys.stdout.flush()
            sys.stderr.flush()

        # If no validation required, keep the latest
        if evaluator_object is None:

            self._prepare_model_for_validation()
            self._update_best_model()

        # Stop when max epochs reached and not early-stopping
        if not convergence:
            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

            if evaluator_object is not None and self.best_validation_metric is not None:
                print("{}: Terminating at epoch {}. Best value for '{}' at epoch {} is {:.4f}. Elapsed time {:.2f} {}".format(
                    algorithm_name, epochs_current, validation_metric, self.epochs_best, self.best_validation_metric, new_time_value, new_time_unit))
            else:
                print("{}: Terminating at epoch {}. Elapsed time {:.2f} {}".format(
                    algorithm_name, epochs_current, new_time_value, new_time_unit))
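Stripped of the model-specific pieces, the loop above is a patience-based early-stopping scheme. A minimal skeleton of the same control flow (train_one_epoch and evaluate are hypothetical placeholders, not framework functions):

def fit_with_early_stopping(train_one_epoch, evaluate, epochs_max=100, epochs_min=0,
                            validation_every_n=5, lower_validations_allowed=3):
    best_metric, best_epoch, worse_count = None, 0, 0

    for epoch in range(epochs_max):
        train_one_epoch(epoch)

        if (epoch + 1) % validation_every_n != 0:
            continue

        metric = evaluate()
        if best_metric is None or metric > best_metric:
            best_metric, best_epoch, worse_count = metric, epoch + 1, 0
            # the real code snapshots the best model here
        else:
            worse_count += 1

        if worse_count >= lower_validations_allowed and epoch >= epochs_min:
            break    # convergence reached

    return best_epoch, best_metric

validation_history = iter([0.10, 0.20, 0.25, 0.24, 0.23, 0.22, 0.21])
print(fit_with_early_stopping(lambda epoch: None, lambda: next(validation_history),
                              epochs_max=7, validation_every_n=1))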
Code example #22
    def fit(self, l1_ratio=0.1, alpha=1.0, positive_only=True, topK=100):

        assert l1_ratio >= 0 and l1_ratio <= 1, "{}: l1_ratio must be between 0 and 1, provided value was {}".format(
            self.RECOMMENDER_NAME, l1_ratio)

        self.l1_ratio = l1_ratio
        self.positive_only = positive_only
        self.topK = topK

        # initialize the ElasticNet model
        self.model = ElasticNet(alpha=alpha,
                                l1_ratio=self.l1_ratio,
                                positive=self.positive_only,
                                fit_intercept=False,
                                copy_X=False,
                                precompute=True,
                                selection='random',
                                max_iter=100,
                                tol=1e-4)

        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]

        similarity_builder = Incremental_Similarity_Builder(
            self.n_items,
            initial_data_block=self.n_items * self.topK,
            dtype=np.float32)

        start_time = time.time()
        start_time_printBatch = start_time

        # fit each item's factors sequentially (not in parallel)
        for currentItem in range(n_items):

            # get the target column
            y = URM_train[:, currentItem].toarray()

            # set the j-th column of X to zero
            start_pos = URM_train.indptr[currentItem]
            end_pos = URM_train.indptr[currentItem + 1]

            current_item_data_backup = URM_train.data[start_pos:end_pos].copy()
            URM_train.data[start_pos:end_pos] = 0.0

            # fit one ElasticNet model per column
            self.model.fit(URM_train, y)

            # self.model.coef_ contains the coefficient of the ElasticNet model
            # let's keep only the non-zero values

            # Select topK values
            # Sorting is done in three steps. Faster than a plain np.argsort when the number of items is large
            # - Partition the data to extract the set of relevant items
            # - Sort only the relevant items
            # - Get the original item index

            nonzero_model_coef_index = self.model.sparse_coef_.indices
            nonzero_model_coef_value = self.model.sparse_coef_.data

            local_topK = min(len(nonzero_model_coef_value) - 1, self.topK)

            relevant_items_partition = (-nonzero_model_coef_value).argpartition(local_topK)[0:local_topK]
            relevant_items_partition_sorting = np.argsort(-nonzero_model_coef_value[relevant_items_partition])
            ranking = relevant_items_partition[relevant_items_partition_sorting]

            # NOTE: the three lists must have the same length, hence len(ranking)
            # rather than the length of the full nonzero coefficient list
            similarity_builder.add_data_lists(
                row_list_to_add=nonzero_model_coef_index[ranking],
                col_list_to_add=np.ones(len(ranking), dtype=np.int32) * currentItem,
                data_list_to_add=nonzero_model_coef_value[ranking])

            # finally, replace the original values of the j-th column
            URM_train.data[start_pos:end_pos] = current_item_data_backup

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            if time.time() - start_time_printBatch > 300 or currentItem == n_items - 1:
                self._print(
                    "Processed {} ({:4.1f}%) in {:.2f} {}. Items per second: {:.2f}"
                    .format(currentItem + 1,
                            100.0 * float(currentItem + 1) / n_items,
                            new_time_value, new_time_unit,
                            float(currentItem + 1) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        self.W_sparse = similarity_builder.get_SparseMatrix()
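Several of the snippets in this collection select the top-K entries with the same three-step pattern: partition, sort only the K candidates, then map back to the original indices. A stand-alone sketch showing the steps agree with a full argsort:

import numpy as np

scores = np.random.rand(100000)
topK = 10

# Step 1: partition so the K largest values occupy the first K slots (in arbitrary order)
candidates = (-scores).argpartition(topK - 1)[0:topK]
# Step 2: sort only those K candidates
order = np.argsort(-scores[candidates])
# Step 3: map back to the original indices
top_k_idx = candidates[order]

assert np.array_equal(top_k_idx, np.argsort(-scores)[0:topK])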
Code example #23
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a BaseRecommender subclass
        :param URM_test_list: list of URMs to test the recommender against, or a single URM object
        :param cutoff_list: list of cutoffs to be use to report the scores, or a single cutoff
        """



        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                             recommender_object.URM_train,
                                                             self.ignore_items_ID,
                                                             self.ignore_users_ID,
                                                             cutoff,
                                                             self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_users_evaluated = 0

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)


        for test_user in self.usersToEvaluate:

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)
            relevant_items_rating = self.get_user_test_ratings(test_user)

            n_users_evaluated += 1

            items_to_compute = self._get_user_specific_items_to_compute(test_user)

            recommended_items, all_items_predicted_ratings = recommender_object.recommend(np.atleast_1d(test_user),
                                                              remove_seen_flag=self.exclude_seen,
                                                              cutoff = self.max_cutoff,
                                                              remove_top_pop_flag=False,
                                                              items_to_compute = items_to_compute,
                                                              remove_CustomItems_flag=self.ignore_items_flag,
                                                              return_scores = True
                                                             )


            assert len(recommended_items) == 1, "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

            assert all_items_predicted_ratings.shape[0] == 1, "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0], 1)

            assert all_items_predicted_ratings.shape[1] == self.n_items, "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1], self.n_items)



            recommended_items = np.array(recommended_items[0])
            user_rmse = rmse(all_items_predicted_ratings[0], relevant_items, relevant_items_rating)

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)



            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value]              += roc_auc(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value]            += precision(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value]   += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value]               += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value]                 += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value]             += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value]                 += arhr(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RMSE.value]                 += user_rmse

                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)



            if time.time() - start_time_print > 30 or n_users_evaluated==len(self.usersToEvaluate):
                elapsed_time = time.time()-start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                              self.EVALUATOR_NAME,
                              n_users_evaluated,
                              100.0* float(n_users_evaluated)/len(self.usersToEvaluate),
                              new_time_value, new_time_unit,
                              float(n_users_evaluated)/elapsed_time))


                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()


        if n_users_evaluated > 0:

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value/n_users_evaluated

                precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)


        else:
            print("WARNING: No users had a sufficient number of relevant items")


        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()



        results_run_string = get_result_string(results_dict)

        return (results_dict, results_run_string)
Code example #24
    def fit(self,
            topK=100,
            alpha=1.,
            min_rating=0,
            implicit=False,
            normalize_similarity=False):

        self.topK = topK
        self.alpha = alpha
        self.min_rating = min_rating
        self.implicit = implicit
        self.normalize_similarity = normalize_similarity

        # For memory usage reasons np.float32 is the suggested dtype for the dataset

        if self.min_rating > 0:
            self.URM_train.data[self.URM_train.data < self.min_rating] = 0
            self.URM_train.eliminate_zeros()
            if self.implicit:
                self.URM_train.data = np.ones(self.URM_train.data.size,
                                              dtype=np.float32)

        # Pui is the row-normalized URM
        Pui = normalize(self.URM_train, norm='l1', axis=1)

        # Piu is the column-normalized, "boolean" URM transposed
        X_bool = self.URM_train.transpose(copy=True)
        X_bool.data = np.ones(X_bool.data.size, np.float32)
        # NOTE: axis is still 1 because the matrix was transposed before the normalization
        Piu = normalize(X_bool, norm='l1', axis=1)
        del X_bool

        # Alpha power
        if self.alpha != 1.:
            Pui = Pui.power(self.alpha)
            Piu = Piu.power(self.alpha)

        # Final matrix is computed as Pui * Piu * Pui
        # Multiplication unpacked for memory usage reasons
        block_dim = 200
        d_t = Piu

        similarity_builder = Incremental_Similarity_Builder(
            Pui.shape[1],
            initial_data_block=Pui.shape[1] * self.topK,
            dtype=np.float32)

        start_time = time.time()
        start_time_printBatch = start_time

        for current_block_start_row in range(0, Pui.shape[1], block_dim):

            if current_block_start_row + block_dim > Pui.shape[1]:
                block_dim = Pui.shape[1] - current_block_start_row

            similarity_block = d_t[
                current_block_start_row:current_block_start_row +
                block_dim, :] * Pui
            similarity_block = similarity_block.toarray()

            for row_in_block in range(block_dim):
                row_data = similarity_block[row_in_block, :]
                row_data[current_block_start_row + row_in_block] = 0

                best = row_data.argsort()[::-1][:self.topK]

                notZerosMask = row_data[best] != 0.0

                values_to_add = row_data[best][notZerosMask]
                cols_to_add = best[notZerosMask]

                similarity_builder.add_data_lists(
                    row_list_to_add=np.ones(len(values_to_add)) *
                    (current_block_start_row + row_in_block),
                    col_list_to_add=cols_to_add,
                    data_list_to_add=values_to_add)

            if time.time() - start_time_printBatch > 300:
                new_time_value, new_time_unit = seconds_to_biggest_unit(
                    time.time() - start_time)

                self._print(
                    "Similarity column {} ({:4.1f}%), {:.2f} column/sec. Elapsed time {:.2f} {}"
                    .format(
                        current_block_start_row + block_dim,
                        100.0 * float(current_block_start_row + block_dim) /
                        Pui.shape[1],
                        float(current_block_start_row + block_dim) /
                        (time.time() - start_time), new_time_value,
                        new_time_unit))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_printBatch = time.time()

        self.W_sparse = similarity_builder.get_SparseMatrix()

        if self.normalize_similarity:
            self.W_sparse = normalize(self.W_sparse, norm='l1', axis=1)

        if self.topK != False:
            self.W_sparse = similarityMatrixTopK(self.W_sparse, k=self.topK)

        self.W_sparse = check_matrix(self.W_sparse, format='csr')
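For reference, the blocked product above computes the P3alpha item-item similarity Piu * Pui, i.e. the item -> user -> item transition probabilities of a random walk on the bipartite interaction graph. A toy-scale dense sketch of the same computation (illustrative, outside the class):

import numpy as np
import scipy.sparse as sps
from sklearn.preprocessing import normalize

URM = sps.random(30, 12, density=0.3, format="csr", random_state=0)

# User -> item step: row-normalized URM
Pui = normalize(URM, norm='l1', axis=1)

# Item -> user step: row-normalized transpose of the binarized URM
X_bool = URM.transpose(copy=True).tocsr()
X_bool.data = np.ones_like(X_bool.data)
Piu = normalize(X_bool, norm='l1', axis=1)

W = (Piu @ Pui).toarray()     # item x item similarity, 12 x 12
np.fill_diagonal(W, 0.0)      # the blocked loop above also zeroes the diagonal

print(W.shape)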