def filter_col_isin(df, col, inclusion_set):
    if len(inclusion_set) and col in df.columns:
        logger.info('Filtering on {} in {}'.format(col, inclusion_set))
        return df.loc[df[col].isin(inclusion_set)]
    else:
        logger.info('Nothing to filter on')
        return df
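
# Example usage (sketch; the toy dataframe below is illustrative, not part of the library):
#     df = pd.DataFrame({'user': ['a', 'b', 'c'], 'item': [1, 2, 3]})
#     filter_col_isin(df, 'user', {'a', 'b'})  # keeps only rows whose 'user' is in the set
#     filter_col_isin(df, 'user', set())       # empty set (or missing col) -> returns df unchanged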
def load(self, force_reload=False):
    if not (force_reload or self.data is None):
        logger.info('Already loaded')
    else:
        feat_df = self.load_fn(self.path, **self.load_kwargs)
        if hasattr(feat_df, 'compute'):
            # Materialize dask dataframe (can't use `.isin` on dask)
            feat_df = feat_df.compute()
        if self.index_col:
            feat_df.set_index(self.index_col, inplace=True)
            # Drop duplicated index entries, keeping the last occurrence
            duplicates = feat_df.index.duplicated(keep='last')
            feat_df = feat_df.loc[~duplicates]
        if self.use_cols:
            self.data = feat_df[self.use_cols]
        elif self.use_cols is not None and self.use_cols == []:
            # Empty dataframe (rely on {user|item}-specific features)
            self.data = pd.DataFrame(index=feat_df.index)
        else:
            # Use entire dataframe
            self.data = feat_df
        if self.concat_cols is not None:
            self.data = combine_cols(df=self.data, cols_seq=self.concat_cols)
        if self.drop_cols:
            self.data.drop(list(set(self.drop_cols)), axis=1, inplace=True)
        if self.force_str:
            self.data = self.data.astype(str)
            self.data.index = self.data.index.astype(str)
    return self
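
# Example usage (sketch; `feat_src` is a hypothetical instance of this feature source):
#     feat_src.load()                    # no-op if `data` is already populated
#     feat_src.load(force_reload=True)   # re-reads from `self.path` via `self.load_fn`
#     feat_src.data                      # processed dataframe, indexed by `index_col`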
def log_shape_or_npartitions(df, name: str = '') -> None:
    """Log the shape of a pandas dataframe, or the number of partitions
    of a dask dataframe.

    df : dataframe to log shape or npartitions of
    name : optional name of the dataframe, included as extra info
    """
    if hasattr(df, 'compute'):
        # dask dataframe
        logger.info(f'{name} npartitions:\t({df.npartitions})')
    else:
        logger.info(f'{name} shape:\t({df.shape[0]},{df.shape[1]})')
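
# Example usage (sketch; the toy frame is illustrative, not part of the library):
#     log_shape_or_npartitions(pd.DataFrame({'x': [1, 2]}), name='feats')
#     # -> logs "feats shape:\t(2,1)"
#     # For a dask dataframe, it logs "<name> npartitions:\t(<npartitions>)" instead.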
def load(self):
    if self.data is not None:
        logger.info('Already loaded')
    else:
        interactions_df = self.load_fn(self.path, **self.load_kwargs)
        if 'value' in interactions_df.columns \
                and self.item_col not in interactions_df.columns:
            interactions_df = interactions_df.rename(
                columns={'value': self.item_col})
        if self.activity_col and self.activity_filter_set:
            interactions_df = filter_col_isin(
                interactions_df, self.activity_col, self.activity_filter_set)
        if hasattr(interactions_df, 'compute'):
            # Materialize dask dataframe
            interactions_df = interactions_df.compute()
        if self.force_str:
            for col in [self.user_col, self.item_col]:
                interactions_df[col] = interactions_df[col].astype(str)
        self.data = interactions_df
    return self
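
# Example usage (sketch; `xn_src` is a hypothetical instance of this interactions source):
#     xn_src.load()  # no-op if already loaded
#     xn_src.data[[xn_src.user_col, xn_src.item_col]].head()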
def __init__(
        self,
        interactions_val_src: InteractionsSource,
        parent_task_wrapper: FactorizationTaskWrapper,
        limit_items=-1,
        n_users_eval=200,
        include_cold=True,
        cold_only=False,
        n_xns_as_cold=5,
        features_srcs: Optional[FeatureSourceDictType] = None,
        specific_feature: Optional[Dict[FGroup, bool]] = None,
        seed: int = 0,
        name: Optional[str] = None,
):
    self.name = name or ''
    self.parent_task_wrapper = parent_task_wrapper
    train_data_loader = parent_task_wrapper.data_loader
    self.model_ref: FactorizationTask = None
    self.rand = np.random.RandomState(seed)
    self.user_col_val = interactions_val_src.user_col
    self.item_col_val = interactions_val_src.item_col
    self.n_users_eval = n_users_eval

    # Allocate processed interactions and features
    self.cats_d = None
    self.cat_codes_dfs = None
    self.num_feats_dfs = None
    self.num_meta = None
    self.interactions_df = None
    self.zero_init_rows = None

    # Allocate operations
    self.input_fwd_d: Dict[str, tf.Tensor] = None
    self.metric_ops_d = None
    self.reset_metrics_op = None
    self.eval_ph_d = None

    # Allocate dataset stuff
    self.ds = None
    self.input_iter = None
    self.input_batch = None

    # If an item occurs in training fewer than `n_xns_as_cold` times,
    # it is considered a cold item; otherwise, warm
    train_item_counts = train_data_loader.interactions_df \
        .groupby(train_data_loader.item_col, observed=True).size()
    warm_items = set(
        train_item_counts.loc[train_item_counts >= n_xns_as_cold].index)

    if include_cold:
        self.init_cold(train_data_loader, interactions_val_src,
                       warm_items,
                       features_srcs, specific_feature,
                       cold_only)
    else:
        self.init_warm(train_data_loader, interactions_val_src,
                       warm_items)

    self.user_ids_val = np.array(list(
        set(self.interactions_df[self.user_col_val]).intersection(
            set(self.cat_codes_dfs[FGroup.USER].index))))

    # TODO: could be less sketchy (esp considering the cold stuff above^)
    # self.item_ids = self.cats_d[self.item_col_val].copy()
    self.item_ids = self.cat_codes_dfs[FGroup.ITEM].index.tolist()

    if limit_items >= 0:
        # This will consider all "new"/validation items
        # plus a limited selection of "old"/training items
        # (no special logic to handle overlapping sets)
        self.rand.shuffle(self.item_ids)

        val_item_ids = list(
            self.interactions_df[self.item_col_val].unique().to_dense())

        self.item_ids = list(
            set(self.item_ids[:limit_items] + val_item_ids))

    # Re-encode items to only the catalog we care about
    self.interactions_df['item_reenc'] = self.interactions_df[
        self.item_col_val].cat.set_categories(self.item_ids)

    logger.info(f'Evaluating on {len(self.item_ids)} items')

    self.rand.shuffle(self.user_ids_val)
def run_val(self, sess, summary_writer=None, step=None, macro=False):
    """
    Args:
        sess: TensorFlow session to run the evaluation ops in
        summary_writer: optional summary writer for TensorBoard logging
        step: global step recorded with each summary
        macro: if True, macro-average metrics across users;
            otherwise, micro-average across interactions

    Returns:
        Dict mapping metric name to its aggregated score
    """
    if self.metric_ops_d is None:
        logger.info('ops missing, making them now via `self.make_ops`')
        self.make_ops()

    if self.n_users_eval < 0:
        n_users_eval = len(self.user_ids_val)
    else:
        n_users_eval = min(self.n_users_eval, len(self.user_ids_val))

    metrics_per_user = defaultdict(lambda: [])

    sess.run(tf.local_variables_initializer())
    sess.run(self.input_iter.initializer)
    sess.run(self.reset_metrics_op)
    metric_vals = [np.nan] * len(self.metric_ops_d)  # will be overwritten
    for _ in tqdm(range(n_users_eval)):
        # Run updates
        sess.run([tup[1] for tup in self.metric_ops_d.values()])
        # Run and store aggregation
        metric_vals = sess.run(
            [tup[0] for tup in self.metric_ops_d.values()])
        if macro:
            for m, v in zip(self.metric_ops_d.keys(), metric_vals):
                metrics_per_user[m].append(v)
            # Reset per user for macro metrics
            sess.run(self.reset_metrics_op)

    # Micro agg will just be the last updated value (without resets)
    micro_metrics = dict(zip(self.metric_ops_d.keys(), metric_vals))

    ret_d = {}
    for m in self.metric_ops_d.keys():
        if macro:
            vals = metrics_per_user[m]
            metric_score = np.mean(vals)
            metric_score_std = np.std(vals)
            logger.info(
                f'(val){m} = {metric_score} +/- {metric_score_std}')
        else:
            metric_score = micro_metrics[m]
            logger.info(f'(val){m} = {metric_score}')

        metric_val_summary = tf.Summary(value=[
            tf.Summary.Value(tag=f'{self.name}/{m}_val',
                             simple_value=metric_score)])
        if summary_writer is not None:
            summary_writer.add_summary(metric_val_summary, step)
        ret_d[m] = metric_score

    return ret_d
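
# Example usage (sketch; `validator`, `sess`, `writer`, and `global_step` are hypothetical):
#     scores = validator.run_val(sess, summary_writer=writer, step=global_step, macro=True)
#     # `scores` keys come from `self.metric_ops_d`, e.g. scores.get('ndcg')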