def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    """Recommend items for each test user, using every item known to the learner as a candidate.

    Candidate pairs are produced by ``build_user_item_cartesian_pairs`` from the
    unique users of ``test_transactions`` and the learner's item id vocabulary.

    Args:
        learner: Trained model; its ``item_feature_builder.id_vocab`` supplies the items.
        test_transactions: Transactions whose user column supplies the users.
        user_features: Optional user features forwarded to ``_recommend``.
        item_features: Optional item features forwarded to ``_recommend``.
        **kwargs: Must contain ``max_recommended_item_count`` and ``return_ratings``.

    Returns:
        Whatever ``_format_recommendations`` produces for the recommendations.
    """
    module_logger.info("Recommendation task: Recommend items from all item.")
    super().score(learner, test_transactions, user_features, item_features, **kwargs)

    top_k = kwargs["max_recommended_item_count"]
    with_ratings = kwargs["return_ratings"]

    candidate_items = learner.item_feature_builder.id_vocab
    source_df = test_transactions.df
    unique_users = source_df.iloc[:, TRANSACTIONS_USER_COL].unique()
    module_logger.info(f"Get {len(unique_users)} unique users, and {len(candidate_items)} unique items.")

    with TimeProfile("Building complete user item transactions dataset"):
        pairs_df = self.build_user_item_cartesian_pairs(users=unique_users, items=candidate_items)

    candidate_pairs = TransactionDataset(pairs_df)
    recommended = self._recommend(
        learner,
        transactions=candidate_pairs,
        K=top_k,
        user_features=user_features,
        item_features=item_features,
    )
    return self._format_recommendations(
        recommended,
        with_ratings,
        K=top_k,
        score_column_names_build_method=build_ranking_column_names,
    )
def _init_mpi_support(self):
    """Load and initialise Horovod, then record this process's rank and the world size.

    The module is imported lazily and stored in the module-level ``_HVD_LIB``.
    ``CUDA_VISIBLE_DEVICES`` is set to the Horovod local rank, and ``save_dir``
    is cleared on every rank other than 0.
    """
    global _HVD_LIB
    _HVD_LIB = importlib.import_module("horovod.tensorflow")
    _HVD_LIB.init()

    self.hvd_rank = _HVD_LIB.rank()
    self.hvd_size = _HVD_LIB.size()

    # Expose only the device whose index equals this process's local rank.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(_HVD_LIB.local_rank())
    module_logger.info(f"Set GPU {_HVD_LIB.local_rank()} GPU as visible devices.")

    if self.hvd_rank == 0:
        return
    # Non-zero ranks get no save directory.
    self.save_dir = None
def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    """Recommend items for each test user, excluding pairs present in the training transactions.

    Candidate pairs come from ``build_user_item_cartesian_pairs`` over the unique
    test users and the learner's item id vocabulary. Pairs that also appear in
    the training transactions (matched on user and item) are removed through a
    left merge with an indicator column.

    Args:
        learner: Trained model; its ``item_feature_builder.id_vocab`` supplies the items.
        test_transactions: Transactions whose user column supplies the users.
        user_features: Optional user features forwarded to ``_recommend``.
        item_features: Optional item features forwarded to ``_recommend``.
        **kwargs: Must contain ``max_recommended_item_count``, ``return_ratings``
            and ``training_transactions``.

    Returns:
        Whatever ``_format_recommendations`` produces for the recommendations.
    """
    module_logger.info("Recommendation task: Recommend items from unrated item.")
    super().score(learner, test_transactions, user_features, item_features, **kwargs)

    top_k = kwargs["max_recommended_item_count"]
    with_ratings = kwargs["return_ratings"]
    training_transactions = kwargs["training_transactions"]

    candidate_items = learner.item_feature_builder.id_vocab

    # Give the training user/item columns the shared names so the merge keys line up.
    seen_df = training_transactions.df
    column_renames = {
        seen_df.columns[TRANSACTIONS_USER_COL]: USER_COLUMN,
        seen_df.columns[TRANSACTIONS_ITEM_COL]: ITEM_COLUMN,
    }
    seen_df = seen_df.rename(columns=column_renames)

    unique_users = test_transactions.df.iloc[:, TRANSACTIONS_USER_COL].unique()
    module_logger.info(f"Get {len(unique_users)} unique users, and {len(candidate_items)} unique items.")

    with TimeProfile("Building complete user item transactions dataset"):
        pairs_df = self.build_user_item_cartesian_pairs(users=unique_users, items=candidate_items)
        merged_df = pd.merge(
            pairs_df,
            seen_df,
            how='left',
            on=[USER_COLUMN, ITEM_COLUMN],
            indicator=True,
        )
        # 'left_only' marks candidate pairs with no match in the training data.
        unseen_mask = merged_df['_merge'] == 'left_only'
        pairs_df = merged_df[unseen_mask].drop(columns=['_merge'])

    candidate_pairs = TransactionDataset(pairs_df)
    recommended = self._recommend(
        learner,
        transactions=candidate_pairs,
        K=top_k,
        user_features=user_features,
        item_features=item_features,
    )
    return self._format_recommendations(
        recommended,
        with_ratings,
        K=top_k,
        score_column_names_build_method=build_ranking_column_names,
    )
def predict(self, transactions: TransactionDataset):
    """Predict one value per transaction with the checkpointed estimator.

    The model is rebuilt with ``load_checkpoints=True`` before predicting.
    Progress (elapsed and estimated remaining time) is logged at an interval of
    about one fifth of the instance count.

    Args:
        transactions: Dataset to score; ``row_size`` gives the number of instances.

    Returns:
        A ``pd.Series`` holding the first element of each prediction's
        ``"predictions"`` entry, in the order the estimator yields them. When
        there are no instances an empty float64 Series is returned.
    """
    instances_count = transactions.row_size
    if instances_count == 0:
        # An explicit dtype keeps the empty result numeric: a bare pd.Series()
        # warns on pandas 1.x and becomes object dtype on pandas 2.x.
        return pd.Series(dtype="float64")

    # instances_count >= 1 here, so the interval is never zero.
    log_every_n_instances = instances_count // 5 if instances_count >= 5 else instances_count
    module_logger.info(f"Get {instances_count} test instances")
    module_logger.info(f"Rebuild model:\n {self.hyper_params}")
    self.build_model(load_checkpoints=True)
    input_fn = self.get_input_fn(transactions=transactions,
                                 batch_size=self.hyper_params.batch_size)

    predictions = []
    start_time = time()
    with TimeProfile("Making predictions for user-item pairs"):
        for p in self.estimator.predict(input_fn=input_fn):
            done = len(predictions)
            if done > 0 and done % log_every_n_instances == 0:
                cost_seconds = time() - start_time
                # Linear extrapolation from the average time per finished instance.
                remaining_seconds = cost_seconds / done * (instances_count - done)
                module_logger.info(
                    f"Finished {done} instance predictions, "
                    f"cost time: {datetime.timedelta(seconds=cost_seconds)}. "
                    f"Remaining time: {datetime.timedelta(seconds=remaining_seconds)}")
            predictions.append(p["predictions"][0])

    module_logger.info(
        f"Finished {len(predictions)} instance predictions. "
        f"Cost time: {datetime.timedelta(seconds=(time() - start_time))}")
    return pd.Series(predictions)
def _check_feature_columns(self):
    """Verify that every basic feature behind the wide and deep columns is available.

    The available keys are the feature-meta keys of the user and item feature
    builders plus both builders' id keys.

    Raises:
        RuntimeError: For the first basic feature whose key is not available.
    """
    required_features = parse_basic_features(
        feature_columns=[*self.wide_columns, *self.deep_columns])
    module_logger.info(
        f"Model is expected to be fed with features: {[f.key for f in required_features]}")

    available_keys = set(self.user_feature_builder.feature_metas.keys())
    available_keys.update(self.item_feature_builder.feature_metas.keys())
    available_keys.add(self.user_feature_builder.id_key)
    available_keys.add(self.item_feature_builder.id_key)

    for feature in required_features:
        if feature.key in available_keys:
            continue
        raise RuntimeError(f"feature {feature.key} not found in feature datasets.")
def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    """Recommend items for each user from the user/item pairs present in the test transactions.

    The leading columns of the test transactions (those before the rating
    column) are de-duplicated, and users with fewer than
    ``min_recommendation_pool_size`` remaining pairs are dropped before ranking.

    Args:
        learner: Trained model forwarded to ``_recommend``.
        test_transactions: Transactions supplying the candidate pairs.
        user_features: Optional user features forwarded to ``_recommend``.
        item_features: Optional item features forwarded to ``_recommend``.
        **kwargs: Must contain ``max_recommended_item_count``,
            ``min_recommendation_pool_size`` and ``return_ratings``.

    Returns:
        Whatever ``_format_recommendations`` produces for the recommendations.
    """
    module_logger.info("Recommendation task: Recommend items from rated item.")
    super().score(learner, test_transactions, user_features, item_features, **kwargs)

    top_k = kwargs["max_recommended_item_count"]
    min_pool_size = kwargs["min_recommendation_pool_size"]
    with_ratings = kwargs["return_ratings"]

    with TimeProfile(f"Filter users with less than {min_pool_size} transactions"):
        # Keep only the columns that precede the rating column.
        pairs_df = test_transactions.df.iloc[:, :TRANSACTIONS_RATING_COL]
        first_occurrence = (~pairs_df.duplicated()).values
        pairs_df = pairs_df.iloc[first_occurrence, :]

        shared_names = dict(zip(pairs_df.columns, [USER_COLUMN, ITEM_COLUMN]))
        pairs_df = pairs_df.rename(columns=shared_names)

        # After count(), the item column holds the number of pairs per user.
        per_user_counts = pairs_df.groupby(USER_COLUMN, as_index=False).count()
        large_enough = per_user_counts[ITEM_COLUMN] >= min_pool_size
        eligible_users_df = per_user_counts[[USER_COLUMN]][large_enough]

        pairs_df = pd.merge(left=pairs_df, right=eligible_users_df, how='inner')

    candidate_pairs = TransactionDataset(pairs_df)
    recommended = self._recommend(
        learner,
        transactions=candidate_pairs,
        K=top_k,
        user_features=user_features,
        item_features=item_features,
    )
    return self._format_recommendations(
        recommended,
        with_ratings,
        K=top_k,
        score_column_names_build_method=build_rated_ranking_column_names,
    )
def train(self, transactions: TransactionDataset):
    """Train the estimator on the given transactions.

    A ``RunConfig`` is built so that the loss is logged once per epoch and a
    single checkpoint is kept, written after ``batches_per_epoch * epochs``
    steps. When ``self.mpi_support`` is set, a Horovod hook broadcasting global
    variables from rank 0 is added.

    Args:
        transactions: Training data; ``row_size`` gives the number of instances.

    Raises:
        NanLossDuringTrainingError: When the estimator reports a NaN loss; the
            TensorFlow exception is chained as the cause.
    """
    instances_count = transactions.row_size
    # np.ceil returns a float; the step counts handed to RunConfig are integer
    # quantities, so convert once here.
    batches_count = int(np.ceil(instances_count / self.hyper_params.batch_size))
    module_logger.info(
        f"Get {instances_count} training instances, and {batches_count} batches per epoch.")

    run_config = tf.estimator.RunConfig(
        tf_random_seed=self.random_seed,
        log_step_count_steps=batches_count,  # log loss after each epoch
        save_checkpoints_steps=batches_count * self.hyper_params.epochs,
        keep_checkpoint_max=1)
    module_logger.info(f"Build model:\n{self.hyper_params}")
    self.build_model(run_config=run_config)

    input_fn = self.get_input_fn(transactions=transactions,
                                 batch_size=self.hyper_params.batch_size,
                                 epochs=self.get_epochs(),
                                 shuffle=True)

    hooks = []
    if self.mpi_support:
        # Broadcast the global variables from rank 0 to the other workers.
        hooks.append(_HVD_LIB.BroadcastGlobalVariablesHook(0))

    try:
        with TimeProfile("Training Wide & Deep recommendation model"):
            module_logger.info(f"Start to train model, rank {self.hvd_rank}")
            self.estimator.train(input_fn=input_fn, hooks=hooks)
    except tf.estimator.NanLossDuringTrainingError as e:
        raise NanLossDuringTrainingError from e
def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    """Predict a rating for each distinct user-item pair in the test transactions.

    Only the columns before the rating column are kept, duplicate rows are
    removed, and the result columns are renamed with
    ``build_regression_column_names``.

    Args:
        learner: Trained model forwarded to ``_predict``.
        test_transactions: Transactions supplying the pairs to score; its name
            is carried over to the de-duplicated dataset.
        user_features: Optional user features forwarded to ``_predict``.
        item_features: Optional item features forwarded to ``_predict``.
        **kwargs: Forwarded to the parent ``score``.

    Returns:
        The frame returned by ``_predict`` with regression column names applied.
    """
    module_logger.info("Recommendation task: Predict rating for user-item pairs.")
    super().score(learner, test_transactions, user_features, item_features, **kwargs)

    # Copy so the caller's dataframe is left untouched.
    pairs_df = test_transactions.df.iloc[:, :TRANSACTIONS_RATING_COL].copy()
    first_occurrence = (~pairs_df.duplicated()).values
    pairs_df = pairs_df.iloc[first_occurrence, :]

    distinct_pairs = TransactionDataset(pairs_df, name=test_transactions.name)
    predicted_df = self._predict(
        learner,
        distinct_pairs,
        user_features=user_features,
        item_features=item_features,
    )
    predicted_df.columns = build_regression_column_names()
    return predicted_df
parser.add_argument( '--boolean-parameter', type=str, help='A boolean parameter.', ) parser.add_argument( '--enum-parameter', type=str, help='A enum parameter.', ) parser.add_argument( '--output-path', help='The output directory.', ) args, _ = parser.parse_known_args() logger.info(f"Hello world MPI from {PACKAGE_NAME} {VERSION}") comm = MPI.COMM_WORLD size = comm.Get_size() rank = comm.Get_rank() str_param = args.string_parameter int_param = args.int_parameter bool_param = args.boolean_parameter enum_param = args.enum_parameter logger.debug(f"Received parameters:") logger.debug(f" {str_param}") logger.debug(f" {int_param}") logger.debug(f" {bool_param}") logger.debug(f" {enum_param}")
'--ranking-metric', type=str, help='The metric of ranking used in item recommendation') parser.add_argument('--top-k', type=int, help='The number of top items to recommend.') parser.add_argument('--sort-top-k', type=str, help='Sort top k results.') parser.add_argument( '--remove-seen-items', type=str, help='Remove items seen in training from recommendation') parser.add_argument('--score-result', help='Ratings or items to output') args, _ = parser.parse_known_args() logger.info(f"Arguments: {args}") sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None remove_seen_items = strtobool( args.remove_seen_items) if args.remove_seen_items else None normalize = strtobool(args.normalize) if args.normalize else None sar_model = load_model_from_directory(args.trained_model, model_loader=joblib_loader).data dataset_to_score = load_data_frame_from_directory( args.dataset_to_score).data logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}") score_sar_module = ScoreSARModule(model=sar_model, input_data=dataset_to_score) score_type = ScoreType(args.score_type)
def main():
    """Parse the command-line arguments, log them, and pass them to ``invoke``.

    Unrecognised arguments are ignored because ``parse_known_args`` is used.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input-path', help='Input Dataframe path')
    parser.add_argument('--detect-mode',
                        choices=['AnomalyOnly', 'AnomalyAndMargin'],
                        help='Specify the detect mode.')
    parser.add_argument('--timestamp-column',
                        help='Choose the column that contains timestamps.')
    parser.add_argument('--value-column',
                        help='Choose the column that contains values.')
    parser.add_argument(
        '--batch-size',
        type=int,
        help='This parameter specifies the size of each batch that the detection is performed.')
    parser.add_argument(
        '--threshold',
        type=float,
        help='This parameter specifies the threshold anomaly score that a point is judged as anomaly.')
    parser.add_argument(
        '--sensitivity',
        type=float,
        help='This parameter is used in AnomalyAndMargin mode to control the width of margin.')
    # The original help text here was a copy of the --sensitivity description.
    parser.add_argument(
        '--append-mode',
        type=str2bool,
        default=False,
        help='Whether to enable append mode.')
    parser.add_argument(
        '--compute-stats-in-visualization',
        type=str2bool,
        default=False,
        help='Enable this parameter to get stats visualization.')
    parser.add_argument('--output-path', help='Output Dataframe path')

    args, _ = parser.parse_known_args()

    logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")
    logger.debug("Received parameters:")
    logger.debug(f"input: {args.input_path}")
    logger.debug(f"detect mode: {args.detect_mode}")
    logger.debug(f"timestamp column: {args.timestamp_column}")
    logger.debug(f"value column: {args.value_column}")
    logger.debug(f"batch size: {args.batch_size}")
    logger.debug(f"threshold: {args.threshold}")
    logger.debug(f"sensitivity: {args.sensitivity}")
    logger.debug(f"appendMode: {args.append_mode}")
    # Previously mislabelled as "appendMode", which made the two lines indistinguishable.
    logger.debug(f"compute stats in visualization: {args.compute_stats_in_visualization}")
    logger.debug(f"output path: {args.output_path}")

    invoke(args.input_path, args.detect_mode, args.timestamp_column,
           args.value_column, args.batch_size, args.threshold,
           args.sensitivity, args.append_mode,
           args.compute_stats_in_visualization, args.output_path)