def _validate_preprocessed_dataset(user_features: FeatureDataset, item_features: FeatureDataset):
    if user_features is not None and any(user_features.df.duplicated(subset=user_features.ids.name)):
        ErrorMapping.throw(DuplicateFeatureDefinitionError())
    if item_features is not None and any(item_features.df.duplicated(subset=item_features.ids.name)):
        ErrorMapping.throw(DuplicateFeatureDefinitionError())
def _validate_features_type(dataset: FeatureDataset):
    for col in dataset.columns:
        if dataset.get_column_type(col) == ColumnTypeName.NAN:
            ErrorMapping.throw(
                InvalidColumnTypeError(col_type=dataset.get_column_type(col),
                                       col_name=col,
                                       arg_name=dataset.name))
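# Illustrative sketch only (not part of the module): the NAN column type rejected
# above corresponds to a column with no usable values. With plain pandas, the same
# condition can be detected as an all-NaN column; the names below are hypothetical.
import pandas as pd

def _find_all_nan_columns(df: pd.DataFrame):
    """Return the columns that contain no valid (non-NaN) values."""
    return [col for col in df.columns if df[col].isna().all()]

# Example: the 'age' column has no valid entries, so it would be rejected.
_example_df = pd.DataFrame({"user_id": [1, 2], "age": [float("nan"), float("nan")]})
assert _find_all_nan_columns(_example_df) == ["age"]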
def _check_features(self, features: FeatureDataset):
    """Check compatibility between the recorded features and the given features.

    The two feature datasets are compatible if:
    1. The new feature dataset contains all feature names in the old feature dataset.
    2. The same feature in the two datasets is of the same type.
    """
    common_logger.info("Check features compatibility with existing feature metas")
    for _, feature_meta in self.feature_metas.items():
        name = feature_meta.name
        if name not in features.features:
            ErrorMapping.throw(ColumnNotFoundError(column_id=name, arg_name_missing_column=features.name))
        column_type = features.get_column_type(name)
        if column_type != feature_meta.type_:
            ErrorMapping.verify_element_type(type_=column_type,
                                             expected_type=feature_meta.type_,
                                             column_name=name,
                                             arg_name=features.name)
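# Illustrative sketch only (not part of the module): the compatibility rule in
# _check_features, rephrased with plain pandas dtypes. The recorded metas are
# modeled as a name -> dtype mapping; the helper names here are hypothetical.
import pandas as pd

def _check_feature_compatibility(recorded_metas: dict, features: pd.DataFrame):
    """Raise if a recorded feature is missing or has a different dtype."""
    for name, expected_dtype in recorded_metas.items():
        if name not in features.columns:
            raise ValueError(f"Column '{name}' is missing from the new feature dataset")
        if features[name].dtype != expected_dtype:
            raise TypeError(f"Column '{name}' has dtype {features[name].dtype}, expected {expected_dtype}")

# Example: a float 'age' column is incompatible with a recorded int64 meta.
_old_metas = {"age": pd.Series([1, 2]).dtype}       # int64
_new_features = pd.DataFrame({"age": [1.0, 2.0]})   # float64
try:
    _check_feature_compatibility(_old_metas, _new_features)
except TypeError:
    pass  # mismatch detected, as expected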
def _validate_preprocessed_dataset(transactions: TransactionDataset, user_features: FeatureDataset,
                                   item_features: FeatureDataset):
    if transactions.row_size <= 0:
        ErrorMapping.throw(
            InvalidDatasetError(dataset1=transactions.name,
                                reason="dataset does not have any valid samples"))
    if transactions.df.duplicated(
            subset=transactions.columns[[TRANSACTIONS_USER_COL, TRANSACTIONS_ITEM_COL]]).any():
        ErrorMapping.throw(MoreThanOneRatingError())
    if user_features is not None and any(user_features.df.duplicated(subset=user_features.ids.name)):
        ErrorMapping.throw(DuplicateFeatureDefinitionError())
    if item_features is not None and any(item_features.df.duplicated(subset=item_features.ids.name)):
        ErrorMapping.throw(DuplicateFeatureDefinitionError())
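# Illustrative sketch only (not part of the module): how the pandas duplicated()
# calls above flag repeated (user, item) pairs and repeated feature ids, shown on
# toy data with hypothetical column names.
import pandas as pd

_transactions = pd.DataFrame({"user": [1, 1, 2], "item": [10, 10, 10], "rating": [3, 5, 4]})
# User 1 rated item 10 twice, so the validator would raise MoreThanOneRatingError.
assert _transactions.duplicated(subset=["user", "item"]).any()

_user_features = pd.DataFrame({"user": [1, 1], "age": [25, 30]})
# User 1 has two feature rows, so DuplicateFeatureDefinitionError would be raised.
assert _user_features.duplicated(subset="user").any()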