def build_similarity_matrix(self, df_responses, agg_strategy, filter_sample_method, mapping_matrix):
        """
        It builds the similarity matrix by using a dataframe with all the samples collected from the solver in the
        fit function.

        The samples obtained from the solver are post-processed with a filtering operation (i.e. filter_strategy) and
        an aggregation operation (i.e. agg_strategy). At the end of this pipeline, it outputs a single list containing
        a column of the similarity matrix.

        :param df_responses: a dataframe containing the samples collected from the solver
        :param agg_strategy: the post-processing aggregation to be used on the samples
        :param filter_sample_method: the filter technique used before the post-processing aggregation
        :param mapping_matrix: list of np.ndarray containing the mapping of the samples variables into the original
                               variables for each item problem
        :return: the similarity matrix built from the dataframe given
        """
        n_items = self.URM_train.shape[1]
        if mapping_matrix is None:
            mapping_matrix = np.repeat(np.reshape(np.arange(0, n_items), newshape=(1, n_items)), repeats=n_items, axis=0)
        matrix_builder = IncrementalSparseMatrix(n_rows=n_items, n_cols=n_items)

        for currentItem in range(n_items):
            response_df = df_responses[df_responses.item_id == currentItem].copy()
            self.add_sample_responses_to_matrix_builder(matrix_builder, agg_strategy, filter_sample_method, response_df,
                                                        currentItem, mapping_matrix[currentItem])

        return sps.csr_matrix(matrix_builder.get_SparseMatrix())
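# A minimal usage sketch (not part of the original source): once W_sparse has been built, recommendations are
# typically produced by multiplying a user's interaction row by the item-item similarity matrix and masking the
# items the user has already seen. The toy URM and W below are made up purely for illustration.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.csr_matrix(np.array([[1, 0, 1, 0],
                                   [0, 1, 0, 1]], dtype=np.float32))
W_toy = sps.csr_matrix(np.array([[0, 0, 1, 0],
                                 [0, 0, 0, 1],
                                 [1, 0, 0, 0],
                                 [0, 1, 0, 0]], dtype=np.float32))

user_profile = URM_toy[0]                                   # interactions of user 0
item_scores = user_profile.dot(W_toy).toarray().ravel()
item_scores[user_profile.indices] = -np.inf                 # never recommend already-seen items
recommended_items = np.argsort(-item_scores)[:2]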
def apply_feature_engineering_ICM(ICM_dict: dict, URM, UCM_dict: dict,
                                  ICM_names_to_count: list,
                                  UCM_names_to_list: list):
    if not np.all(np.in1d(ICM_names_to_count, list(ICM_dict.keys()))):
        raise KeyError("Mapper contains wrong ICM names")

    if not np.all(np.in1d(UCM_names_to_list, list(UCM_dict.keys()))):
        raise KeyError("Mapper contains wrong UCM names")

    for ICM_name in ICM_names_to_count:
        ICM_object: sps.csr_matrix = ICM_dict[ICM_name]
        column = ICM_object.tocoo().col
        uniques, counts = np.unique(column, return_counts=True)

        new_ICM_name = "{}_count".format(ICM_name)
        new_row = np.array(ICM_object.tocoo().row, dtype=int)
        new_col = np.array([0] * len(new_row), dtype=int)
        # NOTE: counts[column] assumes feature ids are contiguous integers starting at 0,
        # so that counts[f] is the number of items having feature f
        new_data = np.array(counts[column], dtype=np.float32)

        ICM_builder = IncrementalSparseMatrix()
        ICM_builder.add_data_lists(new_row, new_col, new_data)
        ICM_dict[new_ICM_name] = ICM_builder.get_SparseMatrix()

    for UCM_name in UCM_names_to_list:
        UCM_object = UCM_dict[UCM_name]
        UCM_suffix_name = UCM_name.replace("UCM", "")

        new_ICM = URM.T.dot(UCM_object)
        new_ICM_name = "ICM{}".format(UCM_suffix_name)

        ICM_dict[new_ICM_name] = new_ICM.tocsr()
    return ICM_dict
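# A small standalone illustration (not from the original source) of the "<ICM_name>_count" feature built above:
# every (item, feature) entry is replaced by the number of items having that feature, and all entries of an item
# are collapsed (summed) into a single column. As in the function above, feature ids are assumed to be contiguous
# integers starting at 0, so that counts[f] is the popularity of feature f.
import numpy as np
import scipy.sparse as sps

ICM_toy = sps.csr_matrix(np.array([[1, 0, 0],    # item 0 has feature 0
                                   [1, 1, 0],    # item 1 has features 0 and 1
                                   [0, 0, 1]],   # item 2 has feature 2
                                  dtype=np.float32))
coo = ICM_toy.tocoo()
uniques, counts = np.unique(coo.col, return_counts=True)          # counts = [2, 1, 1]
ICM_count = sps.csr_matrix((counts[coo.col].astype(np.float32),
                            (coo.row, np.zeros_like(coo.row))),
                           shape=(ICM_toy.shape[0], 1))
print(ICM_count.toarray())   # [[2.], [3.], [1.]] -- item 1 sums the popularity of its two features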
    def test_IncrementalSparseMatrix_add_rows(self):

        import numpy as np

        n_rows = 100
        n_cols = 200

        randomMatrix = sps.random(n_rows, n_cols, density=0.01, format='csr')

        incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)


        for row in range(n_rows):

            row_data = randomMatrix.indices[randomMatrix.indptr[row]:randomMatrix.indptr[row+1]]

            incrementalMatrix.add_single_row(row,
                                             row_data,
                                             5.0)


        randomMatrix.data = np.ones_like(randomMatrix.data)*5.0

        randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

        assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
def apply_feature_entropy_UCM(UCM_dict: dict, UCM_names_to_entropy: list):
    if not np.all(np.in1d(UCM_names_to_entropy, list(UCM_dict.keys()))):
        raise KeyError("Mapper contains wrong UCM names")

    for UCM_name in UCM_names_to_entropy:
        UCM_object: sps.csr_matrix = UCM_dict[UCM_name]

        total_interactions_each_row = np.array(
            UCM_object.sum(axis=1)).squeeze()
        interactions_each_row = UCM_object.indptr[1:] - UCM_object.indptr[:-1]
        total_interactions = np.repeat(total_interactions_each_row,
                                       interactions_each_row)
        UCM_object.data = UCM_object.data / total_interactions

        log_UCM_object = UCM_object.copy()
        log_UCM_object.data = np.log2(log_UCM_object.data)

        # NOTE: the value below is sum_f p_f * log2(p_f), i.e. the negative of the Shannon entropy of each row
        entropy_matrix = UCM_object.multiply(log_UCM_object)
        entropy = np.array(entropy_matrix.sum(axis=1)).squeeze()

        new_UCM_name = "{}_entropy".format(UCM_name)
        new_row = np.arange(UCM_object.shape[0])
        new_col = np.array([0] * len(new_row), dtype=int)
        new_data = entropy

        UCM_builder = IncrementalSparseMatrix()
        UCM_builder.add_data_lists(new_row, new_col, new_data)
        UCM_dict[new_UCM_name] = UCM_builder.get_SparseMatrix()

    return UCM_dict
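# A small worked example (not from the original source) of the entropy feature computed above: for a user whose
# UCM row is [1, 1, 2] the normalised probabilities are [0.25, 0.25, 0.5] and
# sum_f p_f * log2(p_f) = 0.25*(-2) + 0.25*(-2) + 0.5*(-1) = -1.5,
# i.e. the negative of the Shannon entropy (the function keeps this sign).
import numpy as np
import scipy.sparse as sps

UCM_row = sps.csr_matrix(np.array([[1.0, 1.0, 2.0]]))
p = UCM_row.data / UCM_row.sum()
print(float(np.sum(p * np.log2(p))))   # -1.5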
def transform_numerical_to_discretized_sparse_matrix(row: np.ndarray,
                                                     data: np.ndarray,
                                                     bins=20):
    """
    Transform a numerical data array into a discretized sparse matrix with a certain amount of bins

    :param data: array of numerical values
    :param bins: number of labels in the output
    :return: sparse matrix containing the discretized values
    """
    if row.size != data.size:
        raise ValueError("Row size has to be the same of data size")

    eps = 10e-6
    norm_x = (data - data.min()) / (data.max() - data.min() + eps) * 100
    step = (norm_x.max() / bins)
    bins_list = [i * step for i in range(bins)]

    sparse_matrix_builder = IncrementalSparseMatrix(n_rows=np.max(row) + 1,
                                                    n_cols=bins + 1)
    for i, x in enumerate(norm_x):
        x_dist = np.random.normal(loc=x, scale=step, size=1000)
        label_x_dist = np.digitize(x_dist, bins_list, right=True)
        unique_label_x_dist, unique_counts = np.unique(label_x_dist,
                                                       return_counts=True)
        unique_counts = unique_counts / np.max(
            unique_counts)  # Normalize unique counts
        size = unique_label_x_dist.size

        sparse_matrix_builder.add_data_lists([row[i]] * size,
                                             unique_label_x_dist,
                                             unique_counts)
    sparse_matrix = sparse_matrix_builder.get_SparseMatrix()
    return sparse_matrix
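# A usage sketch (not from the original source) of the function above: each numerical value is smeared over
# neighbouring bins by sampling a Gaussian around it, so every row becomes a soft one-hot encoding over the bins.
# The toy ages and user ids below are made up for illustration.
import numpy as np

ages = np.array([18.0, 25.0, 25.0, 64.0])
user_ids = np.arange(len(ages))
UCM_age_discretized = transform_numerical_to_discretized_sparse_matrix(row=user_ids, data=ages, bins=10)
print(UCM_age_discretized.shape)   # (4, 11): one row per user, bins + 1 possible labels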
def sample_negative_interactions_uniformly(negative_sample_size,
                                           URM,
                                           batch_size=10000):
    n_users = URM.shape[0]
    n_items = URM.shape[1]

    invalid_users = np.array(URM.tocoo().row, dtype=np.uint64)
    invalid_items = np.array(URM.tocoo().col, dtype=np.uint64)

    # Convert each (user, item) pair into a single unique integer key
    shifted_invalid_items = np.left_shift(invalid_items,
                                          np.uint64(np.log2(n_users) + 1))
    invalid_tuples = np.bitwise_or(invalid_users, shifted_invalid_items)
    negative_URM_builder = IncrementalSparseMatrix(n_rows=n_users,
                                                   n_cols=n_items)
    with tqdm(desc="Sampling negative interactions",
              total=negative_sample_size) as p_bar:
        sampled = 0
        while sampled < negative_sample_size:
            # Sample a batch of users and items
            users = np.random.randint(low=0,
                                      high=n_users,
                                      size=batch_size,
                                      dtype=np.uint64)
            items = np.random.randint(low=0,
                                      high=n_items,
                                      size=batch_size,
                                      dtype=np.uint64)

            # Convert into unique integers
            shifted_items = np.left_shift(items,
                                          np.uint64(np.log2(n_users) + 1))
            tuples = np.bitwise_or(users, shifted_items)
            unique_tuples, indices = np.unique(tuples, return_index=True)

            # Remove user-item pairs that already appear among the invalid ones (positives or previously sampled)
            invalid_tuples_mask = np.in1d(unique_tuples,
                                          invalid_tuples,
                                          assume_unique=True)
            valid_indices = indices[~invalid_tuples_mask]
            valid_users = users[valid_indices]
            valid_items = items[valid_indices]

            # Cap the last batch so we do not exceed the requested sample size
            if sampled + len(valid_users) > negative_sample_size:
                remaining_sample_size = negative_sample_size - sampled
                valid_users = valid_users[:remaining_sample_size]
                valid_items = valid_items[:remaining_sample_size]

            # Update builder, sampled elements and progress bar
            negative_URM_builder.add_data_lists(valid_users, valid_items,
                                                np.ones(len(valid_users)))
            sampled += len(valid_users)
            p_bar.update(len(valid_users))

            # Update invalid users and items
            invalid_tuples = np.concatenate(
                [invalid_tuples, tuples[valid_indices]])
    return negative_URM_builder.get_SparseMatrix().tocsr()
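# A usage sketch (not from the original source) of the negative sampler above: user-item pairs are drawn
# uniformly at random and kept only if they do not appear in the URM, until negative_sample_size pairs
# have been collected.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.random(100, 50, density=0.05, format="csr", random_state=42)
negative_URM = sample_negative_interactions_uniformly(negative_sample_size=200, URM=URM_toy, batch_size=1000)

assert negative_URM.nnz == 200
assert URM_toy.multiply(negative_URM).nnz == 0   # no sampled negative is an existing positive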
def format_URM_slice_uncompressed(users, items_per_users, max_user_id, n_cols):
    # Builds a factorization-machine-style matrix: one row per (user, item) pair with two non-zeros,
    # one in the user's column and one in the item's column shifted by max_user_id
    fm_matrix_builder = IncrementalSparseMatrix(n_cols=n_cols)
    row_list = np.repeat(np.arange(items_per_users.shape[0] *
                                   items_per_users.shape[1]),
                         repeats=2)
    col_list = np.zeros(shape=items_per_users.shape[0] *
                        items_per_users.shape[1] * 2)
    user_col_list = np.repeat(users, repeats=items_per_users.shape[1])
    items_col_list = np.array(items_per_users).flatten() + max_user_id
    col_list[np.arange(items_per_users.shape[0] * items_per_users.shape[1]) *
             2] = user_col_list
    col_list[np.arange(items_per_users.shape[0] * items_per_users.shape[1]) * 2
             + 1] = items_col_list
    fm_matrix_builder.add_data_lists(row_list_to_add=row_list,
                                     col_list_to_add=col_list,
                                     data_list_to_add=np.ones(len(row_list)))
    return fm_matrix_builder.get_SparseMatrix()
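# A small worked example (not from the original source) of the layout produced above, assuming the function is
# importable: every (user, item) pair becomes one row with exactly two ones, one in the user's column and one in
# the item's column shifted by max_user_id.
import numpy as np

users_toy = np.array([0, 1])
items_per_users_toy = np.array([[2, 3],
                                [1, 2]])
fm_matrix = format_URM_slice_uncompressed(users_toy, items_per_users_toy, max_user_id=2, n_cols=6)
print(fm_matrix.toarray())
# [[1. 0. 0. 0. 1. 0.]     row for (user 0, item 2) -> columns 0 and 2 + 2
#  [1. 0. 0. 0. 0. 1.]     row for (user 0, item 3) -> columns 0 and 3 + 2
#  [0. 1. 0. 1. 0. 0.]     row for (user 1, item 1) -> columns 1 and 1 + 2
#  [0. 1. 0. 0. 1. 0.]]    row for (user 1, item 2) -> columns 1 and 2 + 2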
    def test_IncrementalSparseMatrix_add_lists(self):

        n_rows = 100
        n_cols = 200

        randomMatrix = sps.random(n_rows, n_cols, density=0.01, format='coo')

        incrementalMatrix = IncrementalSparseMatrix(n_rows=n_rows, n_cols=n_cols)


        incrementalMatrix.add_data_lists(randomMatrix.row.copy(),
                                         randomMatrix.col.copy(),
                                         randomMatrix.data.copy())


        randomMatrix_incremental = incrementalMatrix.get_SparseMatrix()

        assert sparse_are_equals(randomMatrix, randomMatrix_incremental)
def apply_discretization_UCM(UCM_dict, UCM_name_to_bins_mapper: dict):
    if not np.all(np.in1d(list(UCM_name_to_bins_mapper.keys()), list(UCM_dict.keys()))):
        raise KeyError("Mapper contains wrong UCM names")

    for UCM_name, bins in UCM_name_to_bins_mapper.items():
        UCM_object: sps.csr_matrix = UCM_dict[UCM_name]
        if UCM_object.shape[1] != 1:
            raise KeyError(
                "The given UCM does not contain a single feature column, thus it cannot be discretized"
            )

        x = np.array(UCM_object.data)
        labelled_x = transform_numerical_to_label(x, bins)

        UCM_builder = IncrementalSparseMatrix(n_rows=UCM_object.shape[0])
        UCM_builder.add_data_lists(UCM_object.tocoo().row, labelled_x,
                                   np.ones(len(labelled_x), dtype=np.float32))

        UCM_dict[UCM_name] = UCM_builder.get_SparseMatrix()
    return UCM_dict
def load_CSV_into_SparseBuilder(filePath, header=False, separator="::"):

    matrixBuilder = IncrementalSparseMatrix(auto_create_col_mapper=True, auto_create_row_mapper=True)

    numCells = 0

    with open(filePath, "r") as fileHandle:

        if header:
            fileHandle.readline()

        for line in fileHandle:
            numCells += 1
            if numCells % 1000000 == 0:
                print("Processed {} cells".format(numCells))

            if len(line) > 1:
                line = line.split(separator)
                line[-1] = line[-1].replace("\n", "")

                try:
                    user_id = line[0]
                    item_id = line[1]

                    try:
                        value = float(line[2])

                        if value != 0.0:
                            matrixBuilder.add_data_lists([user_id], [item_id], [value])

                    except ValueError:
                        print("load_CSV_into_SparseBuilder: Cannot parse as float value '{}'".format(line[2]))

                except IndexError:
                    print("load_CSV_into_SparseBuilder: Index out of bound in line '{}'".format(line))

    return matrixBuilder.get_SparseMatrix(), matrixBuilder.get_column_token_to_id_mapper(), matrixBuilder.get_row_token_to_id_mapper()
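# A usage sketch (not from the original source), assuming the loader above is importable: it returns the sparse
# matrix plus the token-to-index mappers for columns (items) and rows (users). The toy file below is made up.
with open("ratings_toy.dat", "w") as f:
    f.write("1::10::5.0\n")
    f.write("1::20::3.0\n")
    f.write("2::10::4.0\n")

URM, item_token_to_id, user_token_to_id = load_CSV_into_SparseBuilder("ratings_toy.dat",
                                                                      header=False,
                                                                      separator="::")
print(URM.shape)              # (2, 2): two distinct users, two distinct items
print(user_token_to_id)       # e.g. {'1': 0, '2': 1}
print(item_token_to_id)       # e.g. {'10': 0, '20': 1}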
    def fit(self, agg_strategy="FIRST", filter_sample_method="NONE", topK=5, alpha_multiplier=0,
            constraint_multiplier=1, chain_multiplier=1, filter_items_method="NONE", filter_items_n=100,
            num_reads=100, **filter_items_parameters):
        """
        It fits the data (i.e. URM_train) by solving an optimization problem for each item. Each optimization problem
        is generated from the URM_train without the target column and the target column by means of transformation to
        a QUBO based on "transform_fn" with some regulators; then it is solved by a solver given at the initialization
        of the class.

        Then by using the samples collected from the solver, it builds the item-similarity matrix.

        :param agg_strategy: the post-processing aggregation to be used on the samples
        :param filter_sample_method: the filter technique used before the post-processing aggregation
        :param topK: a regulator number that indicates the number of selected variables forced during the optimization
        :param alpha_multiplier: a multiplier number applied on the constraint of the sparsity regulator term
        :param constraint_multiplier: a multiplier number applied on the constraint strength of the variable
                                      selection regulator
        :param chain_multiplier: a multiplier number applied on the chain strength of the embedding
        :param filter_items_method: name of the filtering method to select a set of items for the resolution of the
                                    optimization problem
        :param filter_items_n: number of items to be selected by the filtering method
        :param num_reads: number of samples to compute from the solver
        :param filter_items_parameters: other parameters regarding the filter items method
        """
        self._check_fit_parameters(agg_strategy, filter_items_method, filter_sample_method)
        if filter_items_method == "COSINE":
            self.FILTER_ITEMS_METHODS["COSINE"] = ItemSelectorByCosineSimilarity(**filter_items_parameters)
        URM_train = check_matrix(self.URM_train, 'csc', dtype=np.float32)

        n_items = URM_train.shape[1]
        item_pop = np.array((URM_train > 0).sum(axis=0)).flatten()

        # Label the variables so that they sort in the same order as the item indices 0..n_items-1,
        # padding with leading zeros up to the number of digits of the largest index
        leading_zeros = len(str(n_items - 1))
        variables = ["a{:0{}d}".format(i, leading_zeros) for i in range(n_items)]

        if self.to_resume:
            start_item = self.df_responses[self.ITEM_ID_COLUMN_NAME].max()
        else:
            self.df_responses = pd.DataFrame()
            start_item = 0

        self.FILTER_ITEMS_METHODS[filter_items_method].precompute_best_item_indices(URM_train)
        matrix_builder = IncrementalSparseMatrix(n_rows=n_items, n_cols=n_items)

        for curr_item in tqdm(range(start_item, n_items), desc="%s: Computing W_sparse matrix" % self.RECOMMENDER_NAME):
            # get the target column
            target_column = URM_train[:, curr_item].toarray()

            # set the "curr_item"-th column of URM_train to zero
            start_pos = URM_train.indptr[curr_item]
            end_pos = URM_train.indptr[curr_item + 1]
            current_item_data_backup = URM_train.data[start_pos: end_pos].copy()
            URM_train.data[start_pos: end_pos] = 0.0

            # select items to be used in the QUBO optimization problem
            URM = URM_train.copy()
            URM, mapping_array = self.FILTER_ITEMS_METHODS[filter_items_method].filter_items(URM, target_column,
                                                                                             curr_item,
                                                                                             filter_items_n)
            n_variables = len(mapping_array)

            # get BQM/QUBO problem for the current item
            qubo = self.LOSSES[self.obj_function].get_qubo_problem(URM, target_column)
            qubo = qubo + (np.log1p(item_pop[curr_item]) ** 2 + 1) * alpha_multiplier * (np.max(qubo) - np.min(qubo)) \
                   * np.identity(n_variables)
            if topK > -1:
                constraint_strength = max(self.MIN_CONSTRAINT_STRENGTH,
                                          constraint_multiplier * (np.max(qubo) - np.min(qubo)))
                # avoid using the "combinations" function of dimod in order to speed up the computation
                qubo += -2 * constraint_strength * topK * np.identity(n_variables) + constraint_strength * np.ones(
                    (n_variables, n_variables))
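                # The statement above adds the penalty constraint_strength * (sum_i x_i - topK)^2 (dropping the
                # constant topK^2 term): expanding it with x_i^2 = x_i gives a coefficient (1 - 2*topK) on the
                # diagonal and +1 on every ordered off-diagonal entry, i.e. -2*topK*identity plus the all-ones
                # matrix, both scaled by constraint_strength.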

            # Build the BQM from the QUBO matrix manually: in a performance test with 2000 items this proved
            # faster than dimod's from_numpy_matrix function
            bqm = dimod.BinaryQuadraticModel.empty(dimod.BINARY)
            bqm.add_variables_from(dict(zip(variables, np.diag(qubo))))

            for i in range(n_variables):
                values = np.array(qubo[i, i + 1:]).flatten() + np.array(qubo[i + 1:, i]).flatten()
                keys = [(variables[i], variables[j]) for j in range(i + 1, n_variables)]
                bqm.add_interactions_from(dict(zip(keys, values)))

            self._print("The BQM for item {} is {}".format(curr_item, bqm))

            # solve the problem with the solver
            try:
                if ("child_properties" in self.solver.properties and
                    self.solver.properties["child_properties"]["category"] == "qpu") \
                        or "qpu_properties" in self.solver.properties:
                    chain_strength = max(self.MIN_CONSTRAINT_STRENGTH,
                                         chain_multiplier * (np.max(qubo) - np.min(qubo)))
                    response = self.solver.sample(bqm, chain_strength=chain_strength, num_reads=num_reads)
                    self._print("Break chain percentage of item {} is {}"
                                .format(curr_item, list(response.data(fields=["chain_break_fraction"]))))
                    self._print("Timing of QPU is %s" % response.info["timing"])
                else:
                    response = self.solver.sample(bqm, num_reads=num_reads)

                self._print("The response for item {} is {}".format(curr_item, response.aggregate()))
            except OSError as err:
                traceback.print_exc()
                raise err

            # save the response in self.df_responses if self.do_save_responses is True; otherwise apply the
            # post-processing and put the results directly in the matrix builder
            response_df = response.to_pandas_dataframe()
            response_df[self.ITEM_ID_COLUMN_NAME] = curr_item
            if self.do_save_responses:
                self.df_responses = pd.concat([self.df_responses, response_df], ignore_index=True)
                self.mapping_matrix.append(mapping_array)
            else:
                self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1)
                self.add_sample_responses_to_matrix_builder(matrix_builder, agg_strategy, filter_sample_method,
                                                            response_df, curr_item, mapping_array)

            # restore URM_train
            URM_train.data[start_pos:end_pos] = current_item_data_backup

        if self.do_save_responses:
            self.df_responses = self.df_responses.reindex(sorted(self.df_responses.columns), axis=1)
            self.W_sparse = self.build_similarity_matrix(self.df_responses, agg_strategy, filter_sample_method,
                                                         self.mapping_matrix)
        else:
            self.W_sparse = matrix_builder.get_SparseMatrix()
def split_train_in_two_percentage_user_wise(URM_train,
                                            train_percentage=0.1,
                                            verbose=False):
    """
    The function splits an URM in two matrices selecting the number of interactions one user at a time
    :param URM_train:
    :param train_percentage:
    :param verbose:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(
        train_percentage)

    from course_lib.Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    # ensure a CSR matrix, otherwise the indptr-based slicing below would fail
    URM_train = URM_train.tocsr()

    num_users, num_items = URM_train.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                n_cols=num_items,
                                                auto_create_col_mapper=False,
                                                auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(
        n_rows=num_users,
        n_cols=num_items,
        auto_create_col_mapper=False,
        auto_create_row_mapper=False)

    user_no_item_train = 0
    user_no_item_validation = 0

    for user_id in range(URM_train.shape[0]):

        start_pos = URM_train.indptr[user_id]
        end_pos = URM_train.indptr[user_id + 1]

        user_profile_items = URM_train.indices[start_pos:end_pos]
        user_profile_ratings = URM_train.data[start_pos:end_pos]
        user_profile_length = len(user_profile_items)

        n_train_items = round(user_profile_length * train_percentage)

        if n_train_items == len(user_profile_items) and n_train_items > 1:
            n_train_items -= 1

        indices_for_sampling = np.arange(0, user_profile_length, dtype=int)
        np.random.shuffle(indices_for_sampling)

        train_items = user_profile_items[indices_for_sampling[0:n_train_items]]
        train_ratings = user_profile_ratings[
            indices_for_sampling[0:n_train_items]]

        validation_items = user_profile_items[
            indices_for_sampling[n_train_items:]]
        validation_ratings = user_profile_ratings[
            indices_for_sampling[n_train_items:]]

        if len(train_items) == 0:
            if verbose: print("User {} has 0 train items".format(user_id))
            user_no_item_train += 1

        if len(validation_items) == 0:
            if verbose: print("User {} has 0 validation items".format(user_id))
            user_no_item_validation += 1

        URM_train_builder.add_data_lists([user_id] * len(train_items),
                                         train_items, train_ratings)
        URM_validation_builder.add_data_lists(
            [user_id] * len(validation_items), validation_items,
            validation_ratings)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train / num_users * 100,
            num_users))
    if user_no_item_validation != 0:
        print(
            "Warning: {} ({:.2f} %) of {} users have no sampled items".format(
                user_no_item_validation,
                user_no_item_validation / num_users * 100, num_users))

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    return URM_train, URM_validation
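# A usage sketch (not from the original source) of the user-wise split above: roughly train_percentage of each
# user's interactions end up in the train matrix and the rest in the validation matrix, so the two outputs always
# partition the non-zeros of the input URM.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.random(1000, 500, density=0.02, format="csr", random_state=1)
URM_toy.data = np.ones_like(URM_toy.data)

URM_tr, URM_val = split_train_in_two_percentage_user_wise(URM_toy, train_percentage=0.8)
assert URM_tr.nnz + URM_val.nnz == URM_toy.nnz
assert sps.csr_matrix(URM_tr).multiply(sps.csr_matrix(URM_val)).nnz == 0   # the splits are disjoint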
def split_train_in_two_percentage_global_sample(URM_all, train_percentage=0.1):
    """
    The function splits an URM in two matrices selecting the number of interactions globally
    :param URM_all:
    :param train_percentage:
    :param verbose:
    :return:
    """

    assert train_percentage >= 0.0 and train_percentage <= 1.0, "train_percentage must be a value between 0.0 and 1.0, provided was '{}'".format(
        train_percentage)

    from course_lib.Data_manager.IncrementalSparseMatrix import IncrementalSparseMatrix

    num_users, num_items = URM_all.shape

    URM_train_builder = IncrementalSparseMatrix(n_rows=num_users,
                                                n_cols=num_items,
                                                auto_create_col_mapper=False,
                                                auto_create_row_mapper=False)
    URM_validation_builder = IncrementalSparseMatrix(
        n_rows=num_users,
        n_cols=num_items,
        auto_create_col_mapper=False,
        auto_create_row_mapper=False)

    URM_train = sps.coo_matrix(URM_all)

    indices_for_sampling = np.arange(0, URM_all.nnz, dtype=int)
    np.random.shuffle(indices_for_sampling)

    n_train_interactions = round(URM_all.nnz * train_percentage)

    indices_for_train = indices_for_sampling[0:n_train_interactions]
    indices_for_validation = indices_for_sampling[n_train_interactions:]

    URM_train_builder.add_data_lists(URM_train.row[indices_for_train],
                                     URM_train.col[indices_for_train],
                                     URM_train.data[indices_for_train])

    URM_validation_builder.add_data_lists(
        URM_train.row[indices_for_validation],
        URM_train.col[indices_for_validation],
        URM_train.data[indices_for_validation])

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_validation = URM_validation_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    URM_validation = sps.csr_matrix(URM_validation)

    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)
    user_no_item_validation = np.sum(np.ediff1d(URM_validation.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no train items".format(
            user_no_item_train, user_no_item_train / num_users * 100,
            num_users))
    if user_no_item_validation != 0:
        print(
            "Warning: {} ({:.2f} %) of {} users have no sampled items".format(
                user_no_item_validation,
                user_no_item_validation / num_users * 100, num_users))

    return URM_train, URM_validation
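# A usage sketch (not from the original source) of the global split above: interactions are assigned to the train
# split at random regardless of the user they belong to, so some users may be left without train interactions,
# which the warnings above report.
import scipy.sparse as sps

URM_toy = sps.random(1000, 500, density=0.02, format="csr", random_state=2)
URM_tr, URM_val = split_train_in_two_percentage_global_sample(URM_toy, train_percentage=0.8)
assert URM_tr.nnz + URM_val.nnz == URM_toy.nnz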
def split_train_leave_k_out_user_wise(URM,
                                      k_out=1,
                                      use_validation_set=True,
                                      leave_random_out=True):
    """
    The function splits an URM in two matrices selecting the k_out interactions one user at a time
    :param URM:
    :param k_out:
    :param use_validation_set:
    :param leave_random_out:
    :return:
    """

    assert k_out > 0, "k_out must be a value greater than 0, provided was '{}'".format(
        k_out)

    URM = sps.csr_matrix(URM)
    n_users, n_items = URM.shape

    URM_train_builder = IncrementalSparseMatrix(auto_create_row_mapper=False,
                                                n_rows=n_users,
                                                auto_create_col_mapper=False,
                                                n_cols=n_items)

    URM_test_builder = IncrementalSparseMatrix(auto_create_row_mapper=False,
                                               n_rows=n_users,
                                               auto_create_col_mapper=False,
                                               n_cols=n_items)

    if use_validation_set:
        URM_validation_builder = IncrementalSparseMatrix(
            auto_create_row_mapper=False,
            n_rows=n_users,
            auto_create_col_mapper=False,
            n_cols=n_items)

    for user_id in range(n_users):

        start_user_position = URM.indptr[user_id]
        end_user_position = URM.indptr[user_id + 1]

        user_profile = URM.indices[start_user_position:end_user_position]

        if leave_random_out:
            indices_to_shuffle = np.arange(len(user_profile), dtype=int)

            np.random.shuffle(indices_to_shuffle)

            user_interaction_items = user_profile[indices_to_shuffle]
            user_interaction_data = URM.data[
                start_user_position:end_user_position][indices_to_shuffle]

        else:

            # Sort by decreasing data value so that the last (e.g. most recent) interactions come first
            # and are the ones held out for test/validation
            interaction_position = URM.data[
                start_user_position:end_user_position]

            sort_interaction_index = np.argsort(-interaction_position)

            user_interaction_items = user_profile[sort_interaction_index]
            user_interaction_data = URM.data[
                start_user_position:end_user_position][sort_interaction_index]

        # Test interactions
        user_interaction_items_test = user_interaction_items[0:k_out]
        user_interaction_data_test = user_interaction_data[0:k_out]

        URM_test_builder.add_data_lists(
            [user_id] * len(user_interaction_items_test),
            user_interaction_items_test, user_interaction_data_test)

        # Validation interactions
        if use_validation_set:
            user_interaction_items_validation = user_interaction_items[
                k_out:k_out * 2]
            user_interaction_data_validation = user_interaction_data[
                k_out:k_out * 2]

            URM_validation_builder.add_data_lists(
                [user_id] * len(user_interaction_items_validation),
                user_interaction_items_validation,
                user_interaction_data_validation)

        # Train interactions
        train_limit = k_out * 2 if use_validation_set else k_out
        user_interaction_items_train = user_interaction_items[train_limit:]
        user_interaction_data_train = user_interaction_data[train_limit:]

        URM_train_builder.add_data_lists(
            [user_id] * len(user_interaction_items_train),
            user_interaction_items_train, user_interaction_data_train)

    URM_train = URM_train_builder.get_SparseMatrix()
    URM_test = URM_test_builder.get_SparseMatrix()

    URM_train = sps.csr_matrix(URM_train)
    user_no_item_train = np.sum(np.ediff1d(URM_train.indptr) == 0)

    if user_no_item_train != 0:
        print("Warning: {} ({:.2f} %) of {} users have no Train items".format(
            user_no_item_train, user_no_item_train / n_users * 100, n_users))

    if use_validation_set:
        URM_validation = URM_validation_builder.get_SparseMatrix()

        URM_validation = sps.csr_matrix(URM_validation)
        user_no_item_validation = np.sum(
            np.ediff1d(URM_validation.indptr) == 0)

        if user_no_item_validation != 0:
            print(
                "Warning: {} ({:.2f} %) of {} users have no Validation items".
                format(user_no_item_validation,
                       user_no_item_validation / n_users * 100, n_users))

        return URM_train, URM_validation, URM_test

    return URM_train, URM_test
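# A usage sketch (not from the original source) of the leave-k-out split above: with k_out=1 every user with at
# least one interaction contributes exactly one interaction to the test set (and one more to the validation set
# when use_validation_set=True), while everything else stays in the train set.
import numpy as np
import scipy.sparse as sps

URM_toy = sps.random(1000, 500, density=0.02, format="csr", random_state=3)
URM_toy.data = np.ones_like(URM_toy.data)

URM_tr, URM_val, URM_test = split_train_leave_k_out_user_wise(URM_toy, k_out=1,
                                                              use_validation_set=True,
                                                              leave_random_out=True)
assert np.all(np.ediff1d(sps.csr_matrix(URM_test).indptr) <= 1)   # at most one test interaction per user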