Code Example #1
def filter_col_isin(df, col, inclusion_set):
    if len(inclusion_set) and col in df.columns:
        logger.info('Filtering on {} in {}'.format(col, inclusion_set))
        return df.loc[df[col].isin(inclusion_set)]
    else:
        logger.info('Nothing to filter on')
        return df
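
A minimal usage sketch for `filter_col_isin`, assuming pandas is imported and a module-level `logger` is configured; the dataframe and column names are illustrative only.

import pandas as pd

df = pd.DataFrame({'user': ['a', 'b', 'c'], 'item': [1, 2, 3]})

# Keep only rows whose 'user' value is in the inclusion set
filtered = filter_col_isin(df, 'user', {'a', 'c'})

# An empty set (or a column not present in df) returns the dataframe unchanged
unchanged = filter_col_isin(df, 'user', set())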
Code Example #2
File: data.py  Project: kiminh/tophat
    def load(self, force_reload=False):
        if not (force_reload or self.data is None):
            logger.info('Already loaded')
        else:
            feat_df = self.load_fn(self.path, **self.load_kwargs)
            if hasattr(feat_df, 'compute'):  # can't `.isin` on a dask dataframe; materialize to pandas
                feat_df = feat_df.compute()
            if self.index_col:
                feat_df.set_index(self.index_col, inplace=True)
                duplicates = feat_df.index.duplicated(keep='last')
                feat_df = feat_df.loc[~duplicates]
            if self.use_cols:
                self.data = feat_df[self.use_cols]
            elif self.use_cols is not None and self.use_cols == []:
                # Empty dataframe (rely on {user|item}-specific features)
                self.data = pd.DataFrame(index=feat_df.index)
            else:
                # Use entire dataframe
                self.data = feat_df

            if self.concat_cols is not None:
                self.data = combine_cols(df=self.data,
                                         cols_seq=self.concat_cols)

            if self.drop_cols:
                self.data.drop(list(set(self.drop_cols)), axis=1, inplace=True)

            if self.force_str:
                self.data = self.data.astype(str)
                self.data.index = self.data.index.astype(str)

        return self
Code Example #3
def log_shape_or_npartitions(df, name: str = '') -> None:
    """
    df : dataframe to log shape or npartitions
    name : optional name of dataframe as extra info
    """
    if hasattr(df, 'compute'):  # if dask dataframe
        logger.info(f'{name} npartitions:\t({df.npartitions})')
    else:
        logger.info(f'{name} shape:\t({df.shape[0]},{df.shape[1]})')
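
A usage sketch for `log_shape_or_npartitions`, assuming pandas, a configured `logger`, and (for the commented-out part) dask.dataframe.

import pandas as pd

pdf = pd.DataFrame({'x': range(10), 'y': range(10)})
log_shape_or_npartitions(pdf, name='features')    # logs "features shape: (10,2)"

# With a dask dataframe the partition count is logged instead:
# import dask.dataframe as dd
# ddf = dd.from_pandas(pdf, npartitions=4)
# log_shape_or_npartitions(ddf, name='features')  # logs "features npartitions: (4)"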
Code Example #4
File: data.py  Project: kiminh/tophat
    def load(self):
        if self.data is not None:
            logger.info('Already loaded')
        else:
            interactions_df = self.load_fn(self.path, **self.load_kwargs)
            if 'value' in interactions_df.columns \
                    and self.item_col not in interactions_df.columns:
                interactions_df = interactions_df.rename(
                    columns={'value': self.item_col})
            if self.activity_col and self.activity_filter_set:
                interactions_df = filter_col_isin(
                    interactions_df,
                    self.activity_col, self.activity_filter_set)
            if hasattr(interactions_df, 'compute'):
                interactions_df = interactions_df.compute()
            if self.force_str:
                for col in [self.user_col, self.item_col]:
                    interactions_df[col] = interactions_df[col].astype(str)

            self.data = interactions_df
        return self
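
The `load` methods in Code Examples #2 and #4 follow the same load-once pattern: cache the result in `self.data` and return `self`. Below is a minimal standalone sketch of that pattern; the `LazyLoader` name is hypothetical and the `print` call stands in for `logger.info`.

class LazyLoader:
    """Illustrative only; not tophat code."""

    def __init__(self, load_fn):
        self.load_fn = load_fn
        self.data = None

    def load(self, force_reload=False):
        if not (force_reload or self.data is None):
            print('Already loaded')         # skip the expensive read
        else:
            self.data = self.load_fn()      # load and post-process once
        return self                         # enables chaining: loader.load().data

records = LazyLoader(lambda: [1, 2, 3]).load().data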
Code Example #5
File: evaluation.py  Project: kiminh/tophat
    def __init__(
        self,
        interactions_val_src: InteractionsSource,
        parent_task_wrapper: FactorizationTaskWrapper,
        limit_items=-1,
        n_users_eval=200,
        include_cold=True,
        cold_only=False,
        n_xns_as_cold=5,
        features_srcs: Optional[FeatureSourceDictType] = None,
        specific_feature: Optional[Dict[FGroup, bool]] = None,
        seed: int = 0,
        name: Optional[str] = None,
    ):

        self.name = name or ''
        self.parent_task_wrapper = parent_task_wrapper
        train_data_loader = parent_task_wrapper.data_loader
        self.model_ref: Optional[FactorizationTask] = None
        self.rand = np.random.RandomState(seed)

        self.user_col_val = interactions_val_src.user_col
        self.item_col_val = interactions_val_src.item_col
        self.n_users_eval = n_users_eval

        # Allocate processed interactions and features
        self.cats_d = None
        self.cat_codes_dfs = None
        self.num_feats_dfs = None
        self.num_meta = None
        self.interactions_df = None
        self.zero_init_rows = None

        # Allocate Operations
        self.input_fwd_d: Optional[Dict[str, tf.Tensor]] = None

        self.metric_ops_d = None
        self.reset_metrics_op = None
        self.eval_ph_d = None

        # Allocate dataset stuff
        self.ds = None
        self.input_iter = None
        self.input_batch = None

        # If an item occurs in training less than `n_xns_as_cold` times,
        # it is considered a cold item; otherwise, warm
        train_item_counts = train_data_loader.interactions_df\
            .groupby(train_data_loader.item_col, observed=True).size()
        warm_items = set(
            train_item_counts.loc[train_item_counts >= n_xns_as_cold].index)

        if include_cold:
            self.init_cold(train_data_loader, interactions_val_src, warm_items,
                           features_srcs, specific_feature, cold_only)

        else:
            self.init_warm(train_data_loader, interactions_val_src, warm_items)

        self.user_ids_val = np.array(
            list(
                set(self.interactions_df[self.user_col_val]).intersection(
                    set(self.cat_codes_dfs[FGroup.USER].index))))

        # TODO: could be less sketchy (esp considering the cold stuff above^)
        # self.item_ids = self.cats_d[self.item_col_val].copy()
        self.item_ids = self.cat_codes_dfs[FGroup.ITEM].index.tolist()

        if limit_items >= 0:
            # This will consider all "new"/validation items
            #   plus a limited selection of "old"/training items
            #   (no special logic to handle overlapping sets)
            self.rand.shuffle(self.item_ids)

            val_item_ids = list(
                self.interactions_df[self.item_col_val].unique().to_dense())
            self.item_ids = list(
                set(self.item_ids[:limit_items] + val_item_ids))

        # Re-encode items to only the catalog we care about
        self.interactions_df['item_reenc'] = self.interactions_df\
            [self.item_col_val].cat.set_categories(self.item_ids)

        logger.info(f'Evaluating on {len(self.item_ids)} items')

        self.rand.shuffle(self.user_ids_val)
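
The warm/cold split in the constructor above hinges on counting training interactions per item; the following small pandas sketch mirrors that computation (the data and the `item_id` column name are illustrative).

import pandas as pd

train = pd.DataFrame({'item_id': ['a', 'a', 'a', 'b', 'b', 'c']})
n_xns_as_cold = 3

item_counts = train.groupby('item_id').size()
warm_items = set(item_counts.loc[item_counts >= n_xns_as_cold].index)
# warm_items == {'a'}; 'b' and 'c' have fewer than 3 training interactions, so they are cold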
Code Example #6
File: evaluation.py  Project: kiminh/tophat
    def run_val(self, sess, summary_writer=None, step=None, macro=False):
        """

        Args:
            sess:
            summary_writer:
            step: step for summary writer
            macro: Macro average across users, else, micro average across
                interactions

        Returns:

        """
        if self.metric_ops_d is None:
            logger.info('ops missing, making them now via `self.make_ops`')
            self.make_ops()

        if self.n_users_eval < 0:
            n_users_eval = len(self.user_ids_val)
        else:
            n_users_eval = min(self.n_users_eval, len(self.user_ids_val))

        metrics_per_user = defaultdict(list)
        sess.run(tf.local_variables_initializer())
        sess.run(self.input_iter.initializer)
        sess.run(self.reset_metrics_op)
        metric_vals = [np.nan] * len(self.metric_ops_d)  # will overwrite
        for _ in tqdm(range(n_users_eval)):

            # Run updates
            sess.run([tup[1] for tup in self.metric_ops_d.values()])
            # Run and store aggregation
            metric_vals = sess.run(
                [tup[0] for tup in self.metric_ops_d.values()])

            if macro:
                for m, v in zip(self.metric_ops_d.keys(), metric_vals):
                    metrics_per_user[m].append(v)

                # Reset for each user for macro metrics
                sess.run(self.reset_metrics_op)
        # Micro agg will just be the last updated value (without resets)
        micro_metrics = dict(zip(self.metric_ops_d.keys(), metric_vals))

        ret_d = {}
        for m in self.metric_ops_d.keys():
            if macro:
                vals = metrics_per_user[m]
                metric_score = np.mean(vals)
                metric_score_std = np.std(vals)
                logger.info(
                    f'(val){m} = {metric_score} +/- {metric_score_std}')
            else:
                metric_score = micro_metrics[m]
                logger.info(f'(val){m} = {metric_score}')

            metric_val_summary = tf.Summary(value=[
                tf.Summary.Value(tag=f'{self.name}/{m}_val',
                                 simple_value=metric_score)
            ])
            if summary_writer is not None:
                summary_writer.add_summary(metric_val_summary, step)

            ret_d[m] = metric_score

        return ret_d
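
In `run_val`, macro averaging resets the streaming metrics after each user and averages the collected per-user values, while micro averaging reports the final value of the never-reset streaming metrics. A toy numpy illustration of the macro aggregation that ends up in the log (the scores are hypothetical):

import numpy as np

per_user_scores = [0.2, 0.5, 0.8]         # hypothetical per-user metric values
macro_mean = np.mean(per_user_scores)     # 0.5, logged as the metric score
macro_std = np.std(per_user_scores)       # spread reported after the "+/-"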