def get_input_fn(self, transactions: TransactionDataset, batch_size, epochs=1, shuffle=False):
    user_ids = transactions.users
    item_ids = transactions.items
    with TimeProfile("Build features for users"):
        user_features_df = self.user_feature_builder.build(ids=user_ids)
    with TimeProfile("Build features for items"):
        item_features_df = self.item_feature_builder.build(ids=item_ids)
    x_df = pd.concat([user_features_df, item_features_df],
                     axis=1).reset_index(drop=True)
    y_sr = transactions.ratings
    if y_sr is not None:
        y_sr = y_sr.reset_index(drop=True)
    # Shuffle here with a fixed seed so training results are reproducible,
    # then pass shuffle=False to pandas_input_fn below.
    if shuffle:
        x_df = x_df.sample(frac=1, random_state=self.random_seed)
        if y_sr is not None:
            y_sr = y_sr[x_df.index].reset_index(drop=True)
        x_df = x_df.reset_index(drop=True)
    return tf.compat.v1.estimator.inputs.pandas_input_fn(x=x_df,
                                                         y=y_sr,
                                                         batch_size=batch_size,
                                                         num_epochs=epochs,
                                                         shuffle=False)
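# Usage sketch (hypothetical call site; the argument values are illustrative):
# the returned callable is handed directly to the tf.estimator API.
#
#   train_input_fn = model.get_input_fn(transactions=train_set,
#                                       batch_size=256,
#                                       epochs=10,
#                                       shuffle=True)
#   model.estimator.train(input_fn=train_input_fn)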
def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    module_logger.info("Recommendation task: Recommend items from all items.")
    super().score(learner, test_transactions, user_features, item_features,
                  **kwargs)
    max_recommended_item_count = kwargs["max_recommended_item_count"]
    return_ratings = kwargs["return_ratings"]
    all_items = learner.item_feature_builder.id_vocab
    test_transactions_df = test_transactions.df
    users = test_transactions_df.iloc[:, TRANSACTIONS_USER_COL].unique()
    module_logger.info(
        f"Got {len(users)} unique users and {len(all_items)} unique items.")
    with TimeProfile("Building complete user-item transactions dataset"):
        transactions_df = self.build_user_item_cartesian_pairs(
            users=users, items=all_items)
    transactions = TransactionDataset(transactions_df)
    recommendations = self._recommend(learner,
                                      transactions=transactions,
                                      K=max_recommended_item_count,
                                      user_features=user_features,
                                      item_features=item_features)
    return self._format_recommendations(
        recommendations,
        return_ratings,
        K=max_recommended_item_count,
        score_column_names_build_method=build_ranking_column_names)
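# build_user_item_cartesian_pairs is defined elsewhere; a minimal sketch of
# the behavior assumed here (the cross product of the test users with the
# full item vocabulary) would be:
#
#   import itertools
#
#   def build_user_item_cartesian_pairs(users, items):
#       return pd.DataFrame(list(itertools.product(users, items)),
#                           columns=[USER_COLUMN, ITEM_COLUMN])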
def save(self, save_to: str, overwrite_if_exists=True):
    with TimeProfile("Saving Wide & Deep recommendation model"):
        self.estimator = None
        for feature_column in [*self.wide_columns, *self.deep_columns]:
            feature_column.reset()
        checkpoints_save_dir = os.path.join(save_to, self.rel_checkpoints_dir)
        checkpoints_exist = tf.train.latest_checkpoint(
            checkpoints_save_dir) is not None
        model_save_path = os.path.join(save_to, MODEL_SAVE_FILE)
        model_exist = os.path.exists(model_save_path)
        # If both the checkpoints and the model file already exist and
        # overwriting is disabled, there is nothing to do.
        if checkpoints_exist and model_exist and not overwrite_if_exists:
            return
        # Copy checkpoints from the current temp dir to the save path.
        # TODO: remove the copy logic as soon as DS supports writing to the
        # save path directly.
        if os.path.exists(checkpoints_save_dir):
            shutil.rmtree(checkpoints_save_dir)
        shutil.copytree(src=self.checkpoints_dir, dst=checkpoints_save_dir)
        # Reset MPI-related attributes so they are not pickled.
        self.hvd_rank = None
        self.hvd_size = None
        # Dump the model itself.
        with open(model_save_path, "wb") as f:
            pickle.dump(self, f)
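# Loading is assumed to mirror save(); a minimal sketch, assuming the same
# MODEL_SAVE_FILE layout (the real loader may differ):
#
#   with open(os.path.join(load_from, MODEL_SAVE_FILE), "rb") as f:
#       model = pickle.load(f)
#   # The estimator was set to None before pickling, so it is rebuilt from
#   # the copied checkpoints, e.g. via build_model(load_checkpoints=True).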
def predict(self, transactions: TransactionDataset):
    if transactions.row_size == 0:
        return pd.Series()
    instances_count = transactions.row_size
    log_every_n_instances = (instances_count // 5
                             if instances_count >= 5 else instances_count)
    module_logger.info(f"Got {instances_count} test instances")
    module_logger.info(f"Rebuild model:\n{self.hyper_params}")
    self.build_model(load_checkpoints=True)
    input_fn = self.get_input_fn(transactions=transactions,
                                 batch_size=self.hyper_params.batch_size)
    predictions = []
    start_time = time()
    with TimeProfile("Making predictions for user-item pairs"):
        for p in self.estimator.predict(input_fn=input_fn):
            # Log progress roughly every fifth of the instances, with a
            # linear estimate of the remaining time.
            if len(predictions) % log_every_n_instances == 0 and len(
                    predictions) > 0:
                cost_seconds = time() - start_time
                remaining_seconds = cost_seconds / len(predictions) * (
                    instances_count - len(predictions))
                module_logger.info(
                    f"Finished {len(predictions)} instance predictions, "
                    f"cost time: {datetime.timedelta(seconds=cost_seconds)}. "
                    f"Remaining time: {datetime.timedelta(seconds=remaining_seconds)}"
                )
            predictions.append(p["predictions"][0])
    module_logger.info(
        f"Finished {len(predictions)} instance predictions. "
        f"Cost time: {datetime.timedelta(seconds=(time() - start_time))}")
    return pd.Series(predictions)
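# ETA arithmetic illustrated: if the first 2,000 of 10,000 instances took
# 40 seconds, the remaining-time estimate is 40 / 2000 * (10000 - 2000) = 160s.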
def train(self, transactions: TransactionDataset):
    instances_count = transactions.row_size
    # Cast to int: np.ceil returns a float, and RunConfig expects integer
    # step counts.
    batches_count = int(
        np.ceil(instances_count / self.hyper_params.batch_size))
    module_logger.info(
        f"Got {instances_count} training instances and {batches_count} batches per epoch."
    )
    run_config = tf.estimator.RunConfig(
        tf_random_seed=self.random_seed,
        log_step_count_steps=batches_count,  # log loss after each epoch
        save_checkpoints_steps=batches_count * self.hyper_params.epochs,
        keep_checkpoint_max=1)
    module_logger.info(f"Build model:\n{self.hyper_params}")
    self.build_model(run_config=run_config)
    input_fn = self.get_input_fn(transactions=transactions,
                                 batch_size=self.hyper_params.batch_size,
                                 epochs=self.get_epochs(),
                                 shuffle=True)
    hooks = []
    if self.mpi_support:
        hooks.append(_HVD_LIB.BroadcastGlobalVariablesHook(0))
    try:
        with TimeProfile("Training Wide & Deep recommendation model"):
            module_logger.info(f"Start to train model, rank {self.hvd_rank}")
            self.estimator.train(input_fn=input_fn, hooks=hooks)
    except tf.estimator.NanLossDuringTrainingError as e:
        raise NanLossDuringTrainingError from e
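# Checkpoint cadence illustrated: with 100 batches per epoch and 10 epochs,
# save_checkpoints_steps = 100 * 10 = 1000, so a single checkpoint lands at
# the final training step, and keep_checkpoint_max=1 retains only that one.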
def _recommend(self,
               learner: WideNDeepModel,
               transactions: TransactionDataset,
               K: int,
               user_features: FeatureDataset = None,
               item_features: FeatureDataset = None):
    if transactions.row_size == 0:
        return pd.DataFrame(columns=[USER_COLUMN, ITEM_COLUMN, SCORED_RATING])
    predict_df = self._predict(learner,
                               transactions,
                               user_features=user_features,
                               item_features=item_features)
    with TimeProfile(f"Get top {K} items for each user"):
        predict_df = (predict_df.groupby(by=[USER_COLUMN]).apply(
            lambda x: x.nlargest(columns=SCORED_RATING, n=K)).reset_index(
                drop=True))
        # Pad each user's list to exactly K entries: missing items become
        # None, missing ratings become 0.
        topK_items = predict_df.groupby(USER_COLUMN)[ITEM_COLUMN].apply(
            lambda x: (list(x) + [None] * K)[:K])
        topK_ratings = predict_df.groupby(USER_COLUMN)[SCORED_RATING].apply(
            lambda x: (list(x) + [0] * K)[:K])
    return pd.DataFrame({
        USER_COLUMN: topK_items.index,
        ITEM_COLUMN: topK_items.values,
        SCORED_RATING: topK_ratings.values
    })
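# Padding illustrated (plain Python): with K = 3 and a user who has only two
# scored items,
#   (["a", "b"] + [None] * 3)[:3]  ->  ["a", "b", None]
# so every user gets exactly K item slots; ratings pad with 0 instead of None.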
def entrance(input_path='../image_dataset/', output_path='../image_dir/'):
    logger.info('Start!')
    with TimeProfile(f"Mount/Download dataset to {input_path}"):
        print_dir_hierarchy_to_log(input_path)
    # Case 1: input path is torchvision ImageFolder
    # TODO: Case 2: input path is custom image format
    loader_dir = FolderBasedImageDirectory.load_organized(input_path)
    loader_dir.dump(output_path)
    logger.info('Finished.')
def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    module_logger.info(
        "Recommendation task: Recommend items from unrated items.")
    super().score(learner, test_transactions, user_features, item_features,
                  **kwargs)
    max_recommended_item_count = kwargs["max_recommended_item_count"]
    return_ratings = kwargs["return_ratings"]
    training_transactions = kwargs["training_transactions"]
    all_items = learner.item_feature_builder.id_vocab
    training_transactions_df = training_transactions.df
    training_transactions_df = training_transactions_df.rename(
        columns={
            training_transactions_df.columns[TRANSACTIONS_USER_COL]:
            USER_COLUMN,
            training_transactions_df.columns[TRANSACTIONS_ITEM_COL]:
            ITEM_COLUMN
        })
    users = test_transactions.df.iloc[:, TRANSACTIONS_USER_COL].unique()
    module_logger.info(
        f"Got {len(users)} unique users and {len(all_items)} unique items.")
    with TimeProfile("Building complete user-item transactions dataset"):
        transactions_df = self.build_user_item_cartesian_pairs(
            users=users, items=all_items)
        # Anti-join against the training transactions: keep only the
        # user-item pairs that were never rated during training.
        transactions_df = pd.merge(transactions_df,
                                   training_transactions_df,
                                   how='left',
                                   on=[USER_COLUMN, ITEM_COLUMN],
                                   indicator=True)
        transactions_df = transactions_df[transactions_df['_merge'] ==
                                          'left_only']
        transactions_df = transactions_df.drop(columns=['_merge'])
    transactions = TransactionDataset(transactions_df)
    recommendations = self._recommend(learner,
                                      transactions=transactions,
                                      K=max_recommended_item_count,
                                      user_features=user_features,
                                      item_features=item_features)
    return self._format_recommendations(
        recommendations,
        return_ratings,
        K=max_recommended_item_count,
        score_column_names_build_method=build_ranking_column_names)
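# The merge-with-indicator above is the standard pandas anti-join. A minimal
# standalone sketch (illustrative data only):
#
#   left  = pd.DataFrame({USER_COLUMN: ["u1", "u1"], ITEM_COLUMN: ["i1", "i2"]})
#   right = pd.DataFrame({USER_COLUMN: ["u1"], ITEM_COLUMN: ["i1"]})
#   merged = pd.merge(left, right, how='left',
#                     on=[USER_COLUMN, ITEM_COLUMN], indicator=True)
#   merged[merged['_merge'] == 'left_only']   # -> only ("u1", "i2") survives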
def score(self,
          learner: WideNDeepModel,
          test_transactions: TransactionDataset,
          user_features: FeatureDataset = None,
          item_features: FeatureDataset = None,
          **kwargs):
    module_logger.info(
        "Recommendation task: Recommend items from rated items.")
    super().score(learner, test_transactions, user_features, item_features,
                  **kwargs)
    max_recommended_item_count = kwargs["max_recommended_item_count"]
    min_recommendation_pool_size = kwargs["min_recommendation_pool_size"]
    return_ratings = kwargs["return_ratings"]
    with TimeProfile(
            f"Filter users with fewer than {min_recommendation_pool_size} transactions"
    ):
        # Keep only the user and item columns, then drop duplicate pairs.
        transactions_df = test_transactions.df.iloc[:, :
                                                    TRANSACTIONS_RATING_COL]
        transactions_df = transactions_df.iloc[(
            ~transactions_df.duplicated()).values, :]
        transactions_df = transactions_df.rename(columns=dict(
            zip(transactions_df.columns, [USER_COLUMN, ITEM_COLUMN])))
        # Keep only users whose rated-item pool is large enough to rank.
        user_group_size = transactions_df.groupby(USER_COLUMN,
                                                  as_index=False).count()
        valid_users_df = user_group_size[[USER_COLUMN]][
            user_group_size[ITEM_COLUMN] >= min_recommendation_pool_size]
        transactions_df = pd.merge(left=transactions_df,
                                   right=valid_users_df,
                                   how='inner')
    transactions = TransactionDataset(transactions_df)
    recommendations = self._recommend(learner,
                                      transactions=transactions,
                                      K=max_recommended_item_count,
                                      user_features=user_features,
                                      item_features=item_features)
    return self._format_recommendations(
        recommendations,
        return_ratings,
        K=max_recommended_item_count,
        score_column_names_build_method=build_rated_ranking_column_names)
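# Filter illustrated: with min_recommendation_pool_size = 2 and de-duplicated
# pairs (u1, i1), (u1, i2), (u2, i1), user u2 has only one rated item, so all
# of u2's rows are dropped and only u1's two rows reach _recommend.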
def update_feature_builders(self, user_features: FeatureDataset,
                            item_features: FeatureDataset):
    with TimeProfile("Update features for users"):
        self.user_feature_builder.update(features=user_features)
    with TimeProfile("Update features for items"):
        self.item_feature_builder.update(features=item_features)