def get_statistics_URM(self):
    """Print a summary of the URM splits: item/user counts, then interaction
    count and density for train, (optional) validation, and test."""
    self._assert_is_initialized()

    URM_train = self.SPLIT_URM_DICT["URM_train"]
    n_users, n_items = URM_train.shape

    # Header plus the mandatory train-split line.
    report = "DataReader: {}\n" \
             "\tNum items: {}\n" \
             "\tNum users: {}\n" \
             "\tTrain \t\tinteractions {}, \tdensity {:.2E}\n".format(
        self.dataReader_object._get_dataset_name(),
        n_items,
        n_users,
        URM_train.nnz,
        compute_density(URM_train))

    # Validation line only when a validation split was produced.
    if self.use_validation_set:
        URM_validation = self.SPLIT_URM_DICT["URM_validation"]
        report += "\tValidation \tinteractions {}, \tdensity {:.2E}\n".format(
            URM_validation.nnz,
            compute_density(URM_validation))

    URM_test = self.SPLIT_URM_DICT["URM_test"]
    report += "\tTest \t\tinteractions {}, \tdensity {:.2E}\n".format(
        URM_test.nnz,
        compute_density(URM_test))

    self._print(report)
    print("\n")
def get_statistics_UCM(self):
    """Print one summary line per UCM in the split: feature count,
    feature occurrences (nnz) and density."""
    self._assert_is_initialized()

    # Nothing to report if the reader loaded no UCMs.
    if len(self.dataReader_object.get_loaded_UCM_names()) == 0:
        return

    for UCM_name, UCM_object in self.SPLIT_UCM_DICT.items():
        _, n_features = UCM_object.shape
        print("\tUCM name: {}, Num features: {}, feature occurrences: {}, density {:.2E}".format(
            UCM_name,
            n_features,
            UCM_object.nnz,
            compute_density(UCM_object)))
def print_statistics(self):
    """Print global statistics of URM_all and, when available, of each ICM.

    Reports item/user/interaction counts, value range, density, per-user and
    per-item interaction distributions (min/avg/max) and the Gini index of the
    user profile lengths; then one summary line per ICM when self._HAS_ICM.

    Side effects: writes the report to stdout. Requires prior initialization
    (enforced by _assert_is_initialized).
    """
    self._assert_is_initialized()

    def _profile_length_extremes(indptr):
        # Row (CSR) or column (CSC) interaction counts come straight from the
        # compressed index pointer; returns (lengths, min, max).
        profile_length = np.ediff1d(indptr)
        return profile_length, profile_length.min(), profile_length.max()

    def _print_ICM_statistics(ICM_name, ICM_object):
        # One line per ICM; wide value ranges (>100 in magnitude) switch the
        # value formatting to scientific notation via a nested format spec.
        _, n_features = ICM_object.shape
        min_value = np.min(ICM_object.data)
        max_value = np.max(ICM_object.data)

        format_string = "2E" if np.max(
            [np.abs(min_value), np.abs(max_value)]) > 100 else "2f"

        print("\tICM name: {}, Value range: {:.{format_string}} / {:.{format_string}}, Num features: {}, feature occurrences: {}, density {:.2E}".format(
            ICM_name, min_value, max_value, n_features, ICM_object.nnz,
            compute_density(ICM_object),
            format_string=format_string))

    URM_all = self.get_URM_all()
    n_users, n_items = URM_all.shape
    n_interactions = URM_all.nnz

    # Per-user statistics from the CSR row pointer.
    URM_all = sps.csr_matrix(URM_all)
    user_profile_length, min_interactions_per_user, max_interactions_per_user = \
        _profile_length_extremes(URM_all.indptr)
    avg_interactions_per_user = n_interactions / n_users

    # Per-item statistics from the CSC column pointer.
    URM_all = sps.csc_matrix(URM_all)
    _, min_interactions_per_item, max_interactions_per_item = \
        _profile_length_extremes(URM_all.indptr)
    avg_interactions_per_item = n_interactions / n_items

    # NOTE(review): this prints the class object itself, not a dataset name —
    # confirm that is intended.
    print("DataReader: current dataset is: {}\n"
          "\tNumber of items: {}\n"
          "\tNumber of users: {}\n"
          "\tNumber of interactions in URM_all: {}\n"
          "\tValue range in URM_all: {:.2f}-{:.2f}\n"
          "\tInteraction density: {:.2E}\n"
          "\tInteractions per user:\n"
          "\t\t Min: {:.2E}\n"
          "\t\t Avg: {:.2E}\n"
          "\t\t Max: {:.2E}\n"
          "\tInteractions per item:\n"
          "\t\t Min: {:.2E}\n"
          "\t\t Avg: {:.2E}\n"
          "\t\t Max: {:.2E}\n"
          "\tGini Index: {:.2f}\n".format(
        self.__class__,
        n_items,
        n_users,
        n_interactions,
        np.min(URM_all.data), np.max(URM_all.data),
        compute_density(URM_all),
        min_interactions_per_user, avg_interactions_per_user, max_interactions_per_user,
        min_interactions_per_item, avg_interactions_per_item, max_interactions_per_item,
        gini_index(user_profile_length),
    ))

    if self._HAS_ICM:
        for ICM_name, ICM_object in self.AVAILABLE_ICM.items():
            _print_ICM_statistics(ICM_name, ICM_object)

    print("\n")
def get_statistics_URM(self):
    """Print, for each of the train/validation/test URM splits, the requested
    and actual interaction quotas plus interaction count and density."""
    self._assert_is_initialized()

    n_users, n_items = self.SPLIT_URM_DICT["URM_train"].shape

    # Collect the format arguments: header values first, then one
    # (input quota, actual quota, nnz, density) group per split, in order.
    format_args = [self.dataReader_object._get_dataset_name(), n_items, n_users]

    for split_index, split_name in enumerate(["URM_train", "URM_validation", "URM_test"]):
        URM_split = self.SPLIT_URM_DICT[split_name]
        format_args.extend([
            self.input_split_interaction_quota_list[split_index],
            self.actual_split_interaction_quota_list[split_index],
            URM_split.nnz,
            compute_density(URM_split),
        ])

    statistics_string = ("DataReader: {}\n"
                         "\tNum items: {}\n"
                         "\tNum users: {}\n"
                         "\tTrain \t\tquota {:.2f} ({:.2f}), \tinteractions {}, \tdensity {:.2E}\n"
                         "\tValidation \tquota {:.2f} ({:.2f}), \tinteractions {}, \tdensity {:.2E}\n"
                         "\tTest \t\tquota {:.2f} ({:.2f}), \tinteractions {}, \tdensity {:.2E}\n").format(*format_args)

    self._print(statistics_string)
    print("\n")