Code example #1
    def get_input_fn(self,
                     transactions: TransactionDataset,
                     batch_size,
                     epochs=1,
                     shuffle=False):
        user_ids = transactions.users
        item_ids = transactions.items
        with TimeProfile("Build features for users"):
            user_features_df = self.user_feature_builder.build(ids=user_ids)
        with TimeProfile("Build features for items"):
            item_features_df = self.item_feature_builder.build(ids=item_ids)

        x_df = pd.concat([user_features_df, item_features_df],
                         axis=1).reset_index(drop=True)
        y_sr = transactions.ratings
        if y_sr is not None:
            y_sr = y_sr.reset_index(drop=True)

        # Shuffle with a fixed seed so training results stay reproducible
        if shuffle:
            x_df = x_df.sample(frac=1, random_state=self.random_seed)
            if y_sr is not None:
                y_sr = y_sr[x_df.index].reset_index(drop=True)
            x_df = x_df.reset_index(drop=True)

        return tf.compat.v1.estimator.inputs.pandas_input_fn(
            x=x_df,
            y=y_sr,
            batch_size=batch_size,
            num_epochs=epochs,
            shuffle=False)
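For reference, the `pandas_input_fn` wrapper used above turns a DataFrame/Series pair into a TF1-style `input_fn`. A minimal, self-contained sketch of that underlying API (the toy feature data below is made up for illustration, and the returned tensors need a TF1 graph session to evaluate):

    import pandas as pd
    import tensorflow as tf

    # Toy data standing in for the user/item feature frames built above
    x_df = pd.DataFrame({"user_age": [23, 41, 35], "item_price": [9.9, 4.5, 7.0]})
    y_sr = pd.Series([1.0, 0.0, 1.0])

    input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=x_df, y=y_sr, batch_size=2, num_epochs=1, shuffle=False)
    features, labels = input_fn()  # dict of feature tensors and a label tensor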
Code example #2
    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Recommend items from all item.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
        max_recommended_item_count = kwargs["max_recommended_item_count"]
        return_ratings = kwargs["return_ratings"]
        all_items = learner.item_feature_builder.id_vocab
        test_transactions_df = test_transactions.df
        users = test_transactions_df.iloc[:, TRANSACTIONS_USER_COL].unique()
        module_logger.info(
            f"Get {len(users)} unique users, and {len(all_items)} unique items."
        )

        with TimeProfile("Building complete user item transactions dataset"):
            transactions_df = self.build_user_item_cartesian_pairs(
                users=users, items=all_items)
        transactions = TransactionDataset(transactions_df)
        recommendations = self._recommend(learner,
                                          transactions=transactions,
                                          K=max_recommended_item_count,
                                          user_features=user_features,
                                          item_features=item_features)
        return self._format_recommendations(
            recommendations,
            return_ratings,
            K=max_recommended_item_count,
            score_column_names_build_method=build_ranking_column_names)
Code example #3
    def save(self, save_to: str, overwrite_if_exists=True):
        with TimeProfile("Saving Wide & Deep recommendation model"):
            self.estimator = None
            for feature_column in [*self.wide_columns, *self.deep_columns]:
                feature_column.reset()

            checkpoints_save_dir = os.path.join(save_to,
                                                self.rel_checkpoints_dir)
            checkpoints_exist = tf.train.latest_checkpoint(
                checkpoints_save_dir) is not None
            model_save_path = os.path.join(save_to, MODEL_SAVE_FILE)
            model_exist = os.path.exists(model_save_path)

            # If both the checkpoints and the model already exist and overwriting is disabled, return early
            if checkpoints_exist and model_exist and not overwrite_if_exists:
                return

            # Copy checkpoints from the current temp dir to the save path
            # TODO: remove this copy logic once DS supports writing to the save_to path directly
            if os.path.exists(checkpoints_save_dir):
                shutil.rmtree(checkpoints_save_dir)
            shutil.copytree(src=self.checkpoints_dir, dst=checkpoints_save_dir)

            # Reset MPI-related attributes
            self.hvd_rank = None
            self.hvd_size = None

            # Pickle the model object itself
            with open(model_save_path, "wb") as f:
                pickle.dump(self, f)
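The `save` method above implies a matching load path; a hypothetical sketch of what it could look like, assuming `MODEL_SAVE_FILE` and the attributes used in `save` (this loader is not part of the original code):

    import os
    import pickle

    def load_wide_n_deep(load_from: str):
        # Hypothetical counterpart to save(): unpickle the model object and
        # point it back at the checkpoints directory that save() copied over
        with open(os.path.join(load_from, MODEL_SAVE_FILE), "rb") as f:
            model = pickle.load(f)
        model.checkpoints_dir = os.path.join(load_from, model.rel_checkpoints_dir)
        return model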
Code example #4
    def predict(self, transactions: TransactionDataset):
        if transactions.row_size == 0:
            return pd.Series(dtype=float)

        instances_count = transactions.row_size
        log_every_n_instances = instances_count // 5 if instances_count >= 5 else instances_count
        module_logger.info(f"Get {instances_count} test instances")
        module_logger.info(f"Rebuild model:\n {self.hyper_params}")
        self.build_model(load_checkpoints=True)
        input_fn = self.get_input_fn(transactions=transactions,
                                     batch_size=self.hyper_params.batch_size)
        predictions = []
        start_time = time()

        with TimeProfile("Making predictions for user-item pairs"):
            for p in self.estimator.predict(input_fn=input_fn):
                if len(predictions) % log_every_n_instances == 0 and len(
                        predictions) > 0:
                    cost_seconds = time() - start_time
                    remaining_seconds = cost_seconds / len(predictions) * (
                        instances_count - len(predictions))
                    module_logger.info(
                        f"Finished {len(predictions)} instance predictions, "
                        f"cost time: {datetime.timedelta(seconds=cost_seconds)}."
                        f"Remaining time: {datetime.timedelta(seconds=remaining_seconds)}"
                    )
                predictions.append(p["predictions"][0])
            module_logger.info(
                f"Finished {len(predictions)} instance predictions. "
                f"Cost time: {datetime.timedelta(seconds=(time() - start_time))}"
            )

        predictions = pd.Series(predictions)

        return predictions
Code example #5
    def train(self, transactions: TransactionDataset):
        instances_count = transactions.row_size
        # np.ceil returns a float; the RunConfig step arguments below need ints
        batches_count = int(
            np.ceil(instances_count / self.hyper_params.batch_size))
        module_logger.info(
            f"Got {instances_count} training instances, and {batches_count} batches per epoch."
        )
        run_config = tf.estimator.RunConfig(
            tf_random_seed=self.random_seed,
            log_step_count_steps=batches_count,  # log loss after each epoch
            save_checkpoints_steps=batches_count * self.hyper_params.epochs,
            keep_checkpoint_max=1)
        module_logger.info(f"Build model:\n{self.hyper_params}")
        self.build_model(run_config=run_config)
        input_fn = self.get_input_fn(transactions=transactions,
                                     batch_size=self.hyper_params.batch_size,
                                     epochs=self.get_epochs(),
                                     shuffle=True)
        hooks = []
        if self.mpi_support:
            hooks.append(_HVD_LIB.BroadcastGlobalVariablesHook(0))
        try:
            with TimeProfile("Training Wide & Deep recommendation model"):
                module_logger.info(
                    f"Start training model, rank {self.hvd_rank}")
                self.estimator.train(input_fn=input_fn, hooks=hooks)
        except tf.estimator.NanLossDuringTrainingError as e:
            raise NanLossDuringTrainingError from e
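`BroadcastGlobalVariablesHook(0)` is the standard Horovod hook for synchronizing workers at startup. A minimal sketch of the bootstrap that `_HVD_LIB` presumably wraps (this initialization is an assumption about the surrounding setup, not code from this module):

    import horovod.tensorflow as hvd

    # Initialize MPI, then broadcast rank 0's initial variables so every
    # worker starts training from identical weights
    hvd.init()
    rank, size = hvd.rank(), hvd.size()  # analogous to hvd_rank / hvd_size
    hooks = [hvd.BroadcastGlobalVariablesHook(0)]  # root_rank=0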
Code example #6
    def _recommend(self,
                   learner: WideNDeepModel,
                   transactions: TransactionDataset,
                   K: int,
                   user_features: FeatureDataset = None,
                   item_features: FeatureDataset = None):
        if transactions.row_size == 0:
            return pd.DataFrame(
                columns=[USER_COLUMN, ITEM_COLUMN, SCORED_RATING])
        predict_df = self._predict(learner,
                                   transactions,
                                   user_features=user_features,
                                   item_features=item_features)
        with TimeProfile(f"Get top {K} items for each user"):
            # Keep each user's K highest-scored rows, then pad short lists
            # with None items / 0 ratings so every user gets exactly K slots
            predict_df = (predict_df.groupby(by=[USER_COLUMN]).apply(
                lambda x: x.nlargest(columns=SCORED_RATING, n=K)).reset_index(
                    drop=True))
            topK_items = predict_df.groupby(USER_COLUMN)[ITEM_COLUMN].apply(
                lambda x: (list(x) + [None] * K)[:K])
            topK_ratings = predict_df.groupby(USER_COLUMN)[
                SCORED_RATING].apply(lambda x: (list(x) + [0] * K)[:K])
        return pd.DataFrame({
            USER_COLUMN: topK_items.index,
            ITEM_COLUMN: topK_items.values,
            SCORED_RATING: topK_ratings.values
        })
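The `(list(x) + [None] * K)[:K]` expression pads each user's list to length K and then truncates, so users with fewer than K scored items still yield exactly K slots. A toy illustration (values made up):

    K = 3
    items = ["a", "b"]  # a user with fewer than K recommended items
    padded = (list(items) + [None] * K)[:K]
    print(padded)  # ['a', 'b', None]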
Code example #7
def entrance(input_path='../image_dataset/', output_path='../image_dir/'):
    logger.info('Start!')
    with TimeProfile(f"Mount/Download dataset to {input_path}"):
        print_dir_hierarchy_to_log(input_path)
    # Case 1: input path is torchvision ImageFolder
    # TODO: Case 2: input path is custom image format
    loader_dir = FolderBasedImageDirectory.load_organized(input_path)
    loader_dir.dump(output_path)
    logger.info('Finished.')
Code example #8
    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Recommend items from unrated item.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
        max_recommended_item_count = kwargs["max_recommended_item_count"]
        return_ratings = kwargs["return_ratings"]
        training_transactions = kwargs["training_transactions"]

        all_items = learner.item_feature_builder.id_vocab
        training_transactions_df = training_transactions.df
        training_transactions_df = training_transactions_df.rename(
            columns={
                training_transactions_df.columns[TRANSACTIONS_USER_COL]:
                USER_COLUMN,
                training_transactions_df.columns[TRANSACTIONS_ITEM_COL]:
                ITEM_COLUMN
            })
        users = test_transactions.df.iloc[:, TRANSACTIONS_USER_COL].unique()
        module_logger.info(
            f"Get {len(users)} unique users, and {len(all_items)} unique items."
        )

        with TimeProfile("Building complete user item transactions dataset"):
            transactions_df = self.build_user_item_cartesian_pairs(
                users=users, items=all_items)
            transactions_df = pd.merge(transactions_df,
                                       training_transactions_df,
                                       how='left',
                                       on=[USER_COLUMN, ITEM_COLUMN],
                                       indicator=True)
            transactions_df = transactions_df[transactions_df['_merge'] ==
                                              'left_only']
            transactions_df = transactions_df.drop(columns=['_merge'])
            transactions = TransactionDataset(transactions_df)

        recommendations = self._recommend(learner,
                                          transactions=transactions,
                                          K=max_recommended_item_count,
                                          user_features=user_features,
                                          item_features=item_features)
        return self._format_recommendations(
            recommendations,
            return_ratings,
            K=max_recommended_item_count,
            score_column_names_build_method=build_ranking_column_names)
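The merge with `indicator=True` followed by keeping the `'left_only'` rows is a pandas left anti-join: it drops every user-item pair that already appears in the training transactions. A self-contained toy example of the idiom (data made up):

    import pandas as pd

    candidates = pd.DataFrame({"user": [1, 1, 2], "item": ["a", "b", "a"]})
    training = pd.DataFrame({"user": [1], "item": ["a"]})
    merged = pd.merge(candidates, training, how="left",
                      on=["user", "item"], indicator=True)
    # Keep only pairs absent from training, i.e. the unrated candidates
    unrated = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
    print(unrated)  # rows (1, "b") and (2, "a") remain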
Code example #9
    def score(self,
              learner: WideNDeepModel,
              test_transactions: TransactionDataset,
              user_features: FeatureDataset = None,
              item_features: FeatureDataset = None,
              **kwargs):
        module_logger.info(
            "Recommendation task: Recommend items from rated item.")
        super().score(learner, test_transactions, user_features, item_features,
                      **kwargs)
        max_recommended_item_count = kwargs["max_recommended_item_count"]
        min_recommendation_pool_size = kwargs["min_recommendation_pool_size"]
        return_ratings = kwargs["return_ratings"]

        with TimeProfile(
                f"Filter users with less than {min_recommendation_pool_size} transactions"
        ):
            # Keep only the user and item columns, and drop duplicate pairs
            transactions_df = test_transactions.df.iloc[:, :TRANSACTIONS_RATING_COL]
            transactions_df = transactions_df[~transactions_df.duplicated()]
            transactions_df = transactions_df.rename(columns=dict(
                zip(transactions_df.columns, [USER_COLUMN, ITEM_COLUMN])))
            user_group_size = transactions_df.groupby(USER_COLUMN,
                                                      as_index=False).count()
            valid_users_df = user_group_size[[
                USER_COLUMN
            ]][user_group_size[ITEM_COLUMN] >= min_recommendation_pool_size]
            transactions_df = pd.merge(left=transactions_df,
                                       right=valid_users_df,
                                       how='inner')
            transactions = TransactionDataset(transactions_df)

        recommendations = self._recommend(learner,
                                          transactions=transactions,
                                          K=max_recommended_item_count,
                                          user_features=user_features,
                                          item_features=item_features)
        return self._format_recommendations(
            recommendations,
            return_ratings,
            K=max_recommended_item_count,
            score_column_names_build_method=build_rated_ranking_column_names)
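The group-count-then-inner-merge filter above keeps only users with a large enough recommendation pool; the same filter can be written with a single `transform`. A sketch of the equivalent (column names and data are made up):

    import pandas as pd

    df = pd.DataFrame({"user": [1, 1, 2], "item": ["a", "b", "c"]})
    min_pool_size = 2
    # Keep rows whose user has at least min_pool_size transactions
    filtered = df[df.groupby("user")["item"].transform("size") >= min_pool_size]
    print(filtered)  # only user 1's rows survive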
Code example #10
    def update_feature_builders(self, user_features: FeatureDataset,
                                item_features: FeatureDataset):
        with TimeProfile("Update features for users"):
            self.user_feature_builder.update(features=user_features)
        with TimeProfile("Update features for items"):
            self.item_feature_builder.update(features=item_features)