Ejemplo n.º 1
0
    def get_statistics_URM(self):
        """
        Print a summary of the URM splits stored in SPLIT_URM_DICT.

        The report contains the dataset name returned by the data reader, the number of
        items and users (both read from the shape of URM_train) and, for every split,
        the number of stored interactions (nnz) together with its density.
        The validation row is added only when use_validation_set is truthy.

        The report is emitted through self._print, then print("\\n") is called as separator.
        """

        self._assert_is_initialized()

        URM_train = self.SPLIT_URM_DICT["URM_train"]
        n_users, n_items = URM_train.shape

        # Header and train row are always part of the report
        report_parts = [
            "DataReader: {}\n".format(self.dataReader_object._get_dataset_name()),
            "\tNum items: {}\n".format(n_items),
            "\tNum users: {}\n".format(n_users),
            "\tTrain \t\tinteractions {}, \tdensity {:.2E}\n".format(URM_train.nnz, compute_density(URM_train)),
        ]

        # The validation split is optional
        if self.use_validation_set:
            URM_validation = self.SPLIT_URM_DICT["URM_validation"]
            report_parts.append("\tValidation \tinteractions {}, \tdensity {:.2E}\n".format(
                URM_validation.nnz, compute_density(URM_validation)))

        URM_test = self.SPLIT_URM_DICT["URM_test"]
        report_parts.append("\tTest \t\tinteractions {}, \tdensity {:.2E}\n".format(
            URM_test.nnz, compute_density(URM_test)))

        self._print("".join(report_parts))

        print("\n")
    def get_statistics_UCM(self):
        """
        Print one summary line for every UCM stored in SPLIT_UCM_DICT.

        Nothing is printed when the data reader reports no loaded UCM names.
        Each line shows the UCM name, the number of features (second dimension of the
        matrix), the number of stored entries (nnz) and the density.
        """

        self._assert_is_initialized()

        # Guard clause: no UCM has been loaded, so there is nothing to report
        if len(self.dataReader_object.get_loaded_UCM_names()) == 0:
            return

        summary_template = "\tUCM name: {}, Num features: {}, feature occurrences: {}, density {:.2E}"

        for UCM_name, UCM_object in self.SPLIT_UCM_DICT.items():

            # Only the second dimension of the shape is reported
            _, n_features = UCM_object.shape

            print(summary_template.format(UCM_name, n_features, UCM_object.nnz, compute_density(UCM_object)))
Ejemplo n.º 3
0
    def print_statistics(self):
        """
        Print descriptive statistics of URM_all and, if present, of every available ICM.

        For URM_all the report contains: number of items, users and interactions,
        the min/max of the explicitly stored values, the density, the min/avg/max
        number of interactions per user and per item, and the Gini index computed
        on the per-user interaction counts.
        The dataset is identified in the report by self.__class__.

        When _HAS_ICM is truthy, one line per ICM in AVAILABLE_ICM is printed with the
        range of its stored values, the number of features, nnz and density.
        """

        self._assert_is_initialized()

        URM_all = self.get_URM_all()

        n_users, n_items = URM_all.shape

        # nnz is read from the matrix as returned, before any format conversion
        n_interactions = URM_all.nnz

        # In CSR the differences of consecutive indptr entries are the row (user) lengths
        URM_csr = sps.csr_matrix(URM_all)
        user_profile_length = np.ediff1d(URM_csr.indptr)

        max_interactions_per_user = user_profile_length.max()
        avg_interactions_per_user = n_interactions / n_users
        min_interactions_per_user = user_profile_length.min()

        # In CSC the same operation gives the column (item) lengths
        URM_csc = sps.csc_matrix(URM_csr)
        item_profile_length = np.ediff1d(URM_csc.indptr)

        max_interactions_per_item = item_profile_length.max()
        avg_interactions_per_item = n_interactions / n_items
        min_interactions_per_item = item_profile_length.min()

        report_template = ("DataReader: current dataset is: {}\n"
                           "\tNumber of items: {}\n"
                           "\tNumber of users: {}\n"
                           "\tNumber of interactions in URM_all: {}\n"
                           "\tValue range in URM_all: {:.2f}-{:.2f}\n"
                           "\tInteraction density: {:.2E}\n"
                           "\tInteractions per user:\n"
                           "\t\t Min: {:.2E}\n"
                           "\t\t Avg: {:.2E}\n"
                           "\t\t Max: {:.2E}\n"
                           "\tInteractions per item:\n"
                           "\t\t Min: {:.2E}\n"
                           "\t\t Avg: {:.2E}\n"
                           "\t\t Max: {:.2E}\n"
                           "\tGini Index: {:.2f}\n")

        # The value range only covers the explicitly stored entries (.data)
        report_values = (
            self.__class__,
            n_items,
            n_users,
            n_interactions,
            np.min(URM_csc.data),
            np.max(URM_csc.data),
            compute_density(URM_csc),
            min_interactions_per_user,
            avg_interactions_per_user,
            max_interactions_per_user,
            min_interactions_per_item,
            avg_interactions_per_item,
            max_interactions_per_item,
            gini_index(user_profile_length),
        )

        print(report_template.format(*report_values))

        if not self._HAS_ICM:
            return

        ICM_template = "\tICM name: {}, Value range: {:.{format_string}} / {:.{format_string}}, Num features: {}, feature occurrences: {}, density {:.2E}"

        for ICM_name, ICM_object in self.AVAILABLE_ICM.items():

            # Only the number of features is reported for the ICM
            _, n_features = ICM_object.shape

            min_value = np.min(ICM_object.data)
            max_value = np.max(ICM_object.data)

            # Large magnitudes are shown in scientific notation, small ones as fixed point
            largest_magnitude = np.max([np.abs(min_value), np.abs(max_value)])

            if largest_magnitude > 100:
                value_format = "2E"
            else:
                value_format = "2f"

            print(ICM_template.format(
                ICM_name,
                min_value,
                max_value,
                n_features,
                ICM_object.nnz,
                compute_density(ICM_object),
                format_string=value_format))

        print("\n")
Ejemplo n.º 4
0
    def get_statistics_URM(self):
        """
        Print a summary of the train, validation and test URM splits with their quotas.

        The report contains the dataset name returned by the data reader, the number of
        items and users (both read from the shape of URM_train) and one row per split.
        Each row shows the value from input_split_interaction_quota_list followed, in
        parentheses, by the one from actual_split_interaction_quota_list, then the number
        of stored interactions (nnz) and the density of the split.

        The report is emitted through self._print, then print("\\n") is called as separator.
        """

        self._assert_is_initialized()

        n_users, n_items = self.SPLIT_URM_DICT["URM_train"].shape

        report_parts = [
            "DataReader: {}\n".format(self.dataReader_object._get_dataset_name()),
            "\tNum items: {}\n".format(n_items),
            "\tNum users: {}\n".format(n_users),
        ]

        # The position of each pair is also the index of the split in the two quota lists
        split_rows = (
            ("\tTrain \t\tquota {:.2f} ({:.2f}), \tinteractions {}, \tdensity {:.2E}\n", "URM_train"),
            ("\tValidation \tquota {:.2f} ({:.2f}), \tinteractions {}, \tdensity {:.2E}\n", "URM_validation"),
            ("\tTest \t\tquota {:.2f} ({:.2f}), \tinteractions {}, \tdensity {:.2E}\n", "URM_test"),
        )

        for split_index, (row_template, URM_name) in enumerate(split_rows):

            URM_split = self.SPLIT_URM_DICT[URM_name]

            report_parts.append(row_template.format(
                self.input_split_interaction_quota_list[split_index],
                self.actual_split_interaction_quota_list[split_index],
                URM_split.nnz,
                compute_density(URM_split)))

        self._print("".join(report_parts))

        print("\n")