Beispiel #1
0
            def inner(*args, **kwargs):
                """Cache-aware wrapper around `func` (a DataFrame producer).

                Consumes these keyword arguments before delegating:
                    read_from_cache (bool): try to serve a valid cached pickle first.
                    save_to_cache (bool): persist a freshly computed result.
                    cache_valid_days (int | None): max cache age forwarded to
                        `is_cache_valid`; None uses that method's default.

                Cache location comes from the enclosing `path_func` closure
                variable, falling back to `self.cache_filepath`.
                """
                nonlocal self, path_func
                read_from_cache = kwargs.pop('read_from_cache', False)
                save_to_cache = kwargs.pop('save_to_cache', True)
                cache_valid_days = kwargs.pop('cache_valid_days', None)

                if not read_from_cache and not save_to_cache:
                    # short circuit everything if cache not requested
                    return func(*args, **kwargs)

                # fall back to the instance's default cache-path builder
                path_func = path_func or self.cache_filepath
                cache_path = path_func(*args, **kwargs)
                cache_valid = self.is_cache_valid(cache_path, valid_days=cache_valid_days)

                read_cache_attempt = read_from_cache and cache_valid

                # using pickle here because pickling stores the dataframe more reliably
                # (data types and other information may have changed or lost during write/read of csv)

                if read_cache_attempt:
                    # df = pd.read_csv(cache_path, keep_default_na=False, na_values=NA_VALUES)
                    df = pd.read_pickle(cache_path)
                    logger.info(f'Read cache file from {cache_path}')
                else:
                    if read_from_cache:
                        logger.warning(f'Cache file not found/valid, attempting to create ({cache_path})')
                    df = func(*args, **kwargs)

                if save_to_cache and cache_path and not read_cache_attempt:
                    # df.to_csv(cache_path, index=None)
                    df.to_pickle(cache_path)

                return df
Beispiel #2
0
 def remote_to_local(self, remote_path, local_path, overwrite=True):
     """Download a remote (S3) file to *local_path*.

     Does nothing when the local file already exists and *overwrite* is
     False; otherwise creates the parent directory if needed and writes
     the remote content in binary mode.
     """
     if os.path.exists(local_path) and not overwrite:
         return
     parent_dir = os.path.dirname(local_path)
     os.makedirs(parent_dir, exist_ok=True)
     logger.info('S3: copying from %s to %s' %
                 (remote_path, local_path))
     with open(local_path, 'wb') as target:
         target.write(self.read(remote_path))
Beispiel #3
0
    def test_c_als_recommender(self):
        # Smoke-test the implicit-ALS recommender: fit on the shared train
        # observations, evaluate by ranking, and run the common checks.
        from ml_recsys_tools.recommenders.implib_recommenders import ALSRecommender

        recommender = ALSRecommender()
        recommender.fit(self.state.train_obs)
        report = recommender.eval_on_test_by_ranking(self.state.test_obs, prefix='als ')
        logger.info(report)
        self._test_recommender(recommender)
Beispiel #4
0
    def test_d_comb_rank_ens(self):
        # Build a rank-combination ensemble from the previously fitted
        # recommenders and verify it evaluates and recommends.
        from ml_recsys_tools.recommenders.combination_ensembles import CombinedRankEnsemble

        ensemble = CombinedRankEnsemble(
            recommenders=[self.state.lfm_rec, self.state.item_cooc_rec])
        report = ensemble.eval_on_test_by_ranking(self.state.test_obs, prefix='combined ranks ')
        logger.info(report)
        self._test_recommender(ensemble)
Beispiel #5
0
    def test_d_comb_simil_ens(self):
        # Build a similarity-combination ensemble, fit it (unlike the rank
        # ensemble it needs its own fit), evaluate, and run common checks.
        from ml_recsys_tools.recommenders.combination_ensembles import CombinedSimilRecoEns

        ensemble = CombinedSimilRecoEns(
            recommenders=[self.state.lfm_rec, self.state.item_cooc_rec])
        ensemble.fit(self.state.train_obs)
        report = ensemble.eval_on_test_by_ranking(self.state.test_obs, prefix='combined simils ')
        logger.info(report)
        self._test_recommender(ensemble)
Beispiel #6
0
 def _block_until_first_load_loop(self):
     """Poll once per second until the first model is loaded (or reloading
     is switched off), logging progress at start and every 10 seconds."""
     seconds_waited = 0
     while self.keep_reloading and (self.model is None):
         should_log = seconds_waited == 0 or seconds_waited % 10 == 0
         if should_log:
             logger.info(
                 'Blocking until first model is loaded (%d seconds already).'
                 % seconds_waited)
         time.sleep(1)
         seconds_waited += 1
 def _filter_array(array, encoder, message_prefix='', message_suffix=''):
     array = np.array(array).astype(str)
     new_labels_mask = encoder.find_new_labels(array)
     n_discard = np.sum(new_labels_mask)
     if n_discard > 0:
         logger.info(
             '%s Discarding %d (out of %d) %s' %
             (message_prefix, int(n_discard), len(array), message_suffix))
     return array[~new_labels_mask]
Beispiel #8
0
    def test_c_cooc_recommender(self):
        # Smoke-test the item co-occurrence recommender and stash the fitted
        # instance on the shared state for the later ensemble tests.
        from ml_recsys_tools.recommenders.similarity_recommenders import ItemCoocRecommender

        recommender = ItemCoocRecommender()
        recommender.fit(self.state.train_obs)
        report = recommender.eval_on_test_by_ranking(self.state.test_obs, prefix='item cooccurrence ')
        logger.info(report)
        self._test_recommender(recommender)
        self.state.item_cooc_rec = recommender
Beispiel #9
0
 def _download_through_disk(self, remote_path, local_fileobj):
     """Download a remote object via a temporary file, transparently
     decompressing gzip content.

     The object is first streamed to a named temp file; we then try to
     read it as gzip and, on any failure, fall back to copying the raw
     bytes. Returns whatever `_stream_to_file` returns.
     """
     with tempfile.NamedTemporaryFile(delete=True) as temp:
         self._stream_obj_to_file(remote_path=remote_path, fileobj=temp)
         try:
             # optimistic path: assume the object is gzip-compressed
             with gzip.open(temp) as gzipfile:
                 with io.BufferedReader(gzipfile) as gzipbuffered:
                     return self._stream_to_file(gzipbuffered, local_fileobj)
         except Exception as e:
             # NOTE(review): a failure mid-stream could leave partial gzip
             # output in local_fileobj before the raw-bytes retry — confirm
             # failures only occur on open, or that callers tolerate this.
             logger.info('_download_through_disk: failed gzip read, assuming regular binary')
             temp.seek(0)
             return self._stream_to_file(temp, local_fileobj)
Beispiel #10
0
    def test_c_spotlight_implicit_recommender(self):
        # Smoke-test the Spotlight embedding-factors recommender with a
        # deliberately small configuration (balances flakiness vs. speed).
        from ml_recsys_tools.recommenders.spotlight_recommenders import EmbeddingFactorsRecommender

        recommender = EmbeddingFactorsRecommender()
        recommender.set_params(embedding_dim=32, batch_size=1<<10,
                               num_negative_samples=10, n_iter=5)
        recommender.fit(self.state.train_obs)
        ranking_report = recommender.eval_on_test_by_ranking(self.state.test_obs, prefix='spot ')
        logger.info(ranking_report)
        self._test_recommender(recommender)
Beispiel #11
0
    def test_c_features_simil_recommender(self):
        # Smoke-test the feature-similarity (cosine) recommender.
        from ml_recsys_tools.recommenders.similarity_recommenders import FeaturesSimilRecommender

        recommender = FeaturesSimilRecommender()
        recommender.fit(self.state.train_obs)
        report = recommender.eval_on_test_by_ranking(self.state.test_obs, prefix='cosine ')
        logger.info(report)

        # _test_recommender is skipped on purpose: this recommender fails
        # the fake-data and exclusion checks, so run only the safe subset.
        self._test_get_recommendations(recommender)
        self._test_get_similar_items(recommender)
        self._test_predict_for_user(recommender)
Beispiel #12
0
    def _test_predictions_on_fake_data(self, rec):
        """Sanity checks on synthetic data: each fake user's matching item
        must be recommended, and the fake items must be mutually similar."""
        # a user's planted "missing interaction" should be in its top-10 recs
        for user in self.TESTING_USER_IDS:
            recos = rec.get_recommendations(user_ids=[user], n_rec=10).iloc[0][rec._item_col]
            logger.info(f'{user} {recos}')
            self.assertTrue(user.replace('user', 'item') in recos)

        # each fake item should have >= 3 other fake items among its top-10
        for item in self.TESTING_ITEM_IDS:
            simils = rec.get_similar_items(item_ids=[item], n_simil=10).iloc[0][rec._item_col]
            logger.info(f'{item} {simils}')
            overlap = set(simils).intersection(set(self.TESTING_ITEM_IDS))
            self.assertTrue(len(overlap) >= 3)
Beispiel #13
0
def early_stopping_runner(
        score_func, check_point_func,
        epochs_start=0, epochs_max=200, epochs_step=10,
        stop_patience=10, decline_threshold=0.05,
        plot_graph=True):
    """Train incrementally until the validation score stops improving.

    Parameters:
        score_func: callable(cur_epoch, step) -> float; trains `step` more
            epochs and returns the current validation score.
        check_point_func: callable() invoked whenever a new best score is
            reached (e.g. to snapshot the model).
        epochs_start: extra epochs folded into the first training chunk.
        epochs_max: hard upper bound on total epochs.
        epochs_step: epochs trained per iteration after the first.
        stop_patience: epochs of sustained decline tolerated before stopping.
        decline_threshold: relative drop from the best score that counts as
            a decline.
        plot_graph: if True, plot score vs. epochs with pyplot.

    Returns:
        The epoch count at which the best score was observed.
    """
    res_list = []
    max_score = 0
    decline_counter = 0
    cur_epoch = 0
    epochs_list = []
    max_epoch = 0
    # find optimal number of epochs on validation data
    while cur_epoch <= epochs_max:

        # the first chunk also covers the warm-up epochs before epochs_start
        cur_step = epochs_start + epochs_step if cur_epoch == 0 else epochs_step

        simple_logger.info('Training epochs %d - %d.' %
                           (cur_epoch, cur_epoch + cur_step))

        cur_epoch += cur_step
        epochs_list.append(cur_epoch)

        cur_score = score_func(cur_epoch, cur_step)
        res_list.append(cur_score)

        # early stopping logic: accumulate epochs spent below the decline
        # threshold relative to the best score seen so far
        if max_score * (1 - decline_threshold) > cur_score:
            decline_counter += cur_step
            if decline_counter >= stop_patience:
                break
        else:
            decline_counter = 0

        if cur_score > max_score:
            max_score = cur_score
            max_epoch = cur_epoch
            check_point_func()

    # print logging info
    # BUGFIX: guard the percentage against division by zero when no
    # positive score was ever observed (max_score never left 0)
    score_denom = max_score if max_score else 1
    scores_str = ','.join(['%d%%(%d)' % (int(100 * s / score_denom), e)
                           for s, e in zip(res_list, epochs_list)])

    simple_logger.info('Early stopping: stopped fit after %d '
                       'epochs (max validation score: %f (@%d), all scores: %s)'
                       % (cur_epoch, max_score, max_epoch, scores_str))

    if plot_graph:
        pyplot.figure()
        pyplot.plot(epochs_list, res_list)

    return max_epoch
 def remove_unseen_labels(self, df):
     """Return *df* without rows whose user or item id was unseen by the
     fitted encoders.

     When nothing needs removing, the original frame is returned as-is;
     otherwise the drop is logged and a filtered copy is returned.
     """
     unseen_users = self.uid_encoder.find_new_labels(df[self.uid_source_col])
     unseen_items = self.iid_encoder.find_new_labels(df[self.iid_source_col])
     if np.mean(unseen_users) > 0.0 or np.mean(unseen_items) > 0.0:
         logger.info(
             'Discarding %.1f%% samples with unseen '
             'users(%d) / unseen items(%d) from DF(len: %s).' % \
             (100 * np.mean(unseen_users | unseen_items),
              np.sum(unseen_users), np.sum(unseen_items), len(df)))
         return df[~unseen_users & ~unseen_items].copy()
     return df
Beispiel #15
0
    def _filter_relevant_obs_and_items(self, stage=''):
        items_ids = self.df_items[self.item_id_col].unique().astype(str)
        obs_ids = self.df_obs[self.iid_col].unique().astype(str)

        obs_filt = self.df_obs[self.iid_col].astype(str).isin(items_ids)
        item_filt = self.df_items[self.item_id_col].astype(str).isin(obs_ids)

        self.df_obs = self.df_obs[obs_filt].copy()
        self.df_items = self.df_items[item_filt].copy()

        n_dropped_obs = (~obs_filt).sum()
        n_dropped_items = (~item_filt).sum()
        if n_dropped_obs + n_dropped_items:
            logger.info('ObsWithFeatures:_filter_relevant_obs_and_items:%s '
                        'dropped %d observations, %d items' % (stage, n_dropped_obs, n_dropped_items))
Beispiel #16
0
    def _test_predict_for_user(self, rec):
        """Exercise `predict_for_user` and verify its output contract:
        column format, descending sort, original-order combination,
        demotion of training and unknown items, uniform scores for an
        unknown user, and runtime budget."""
        user = rec.all_users[0]
        items = rec.all_items[:50]

        ts = time.time()
        preds_1 = rec.predict_for_user(user_id=user, item_ids=items)
        elapsed = time.time() - ts
        scores = preds_1[rec._prediction_col].tolist()

        # test format
        # columns
        self.assertListEqual(preds_1.columns.tolist(),
                             [rec._user_col, rec._item_col, rec._prediction_col])
        # length
        self.assertEqual(len(preds_1), len(items))

        # test sorted descending
        self.assertTrue(scores[::-1] == sorted(scores))

        # test combine with original order makes first item in original order higher in results
        preds_2 = rec.predict_for_user(user_id=user, item_ids=items, combine_original_order=True)
        # rank of `item` within a prediction frame (first matching position)
        ind_item = lambda item, preds: np.argmax(preds[rec._item_col].values == item)
        ind_diffs = np.array([ind_item(item, preds_1) - ind_item(item, preds_2)
                              for item in items])
        # rank shifts must cancel out overall, but items early in the
        # original order should gain rank when combination is on
        self.assertEqual(ind_diffs.sum(), 0)
        self.assertGreater(ind_diffs[:(len(ind_diffs) // 2)].sum(), 0)  # first items rank higher

        # test training items predictions are last
        # (pick one item this user interacted with in the training matrix)
        train_item = rec.item_ids([rec.train_mat[rec.user_inds([user])[0],:].indices[0]])
        preds_3 = rec.predict_for_user(user_id=user, item_ids=np.concatenate([items, train_item]))
        train_preds = preds_3[preds_3[rec._item_col] == train_item[0]][rec._prediction_col]
        self.assertTrue(all(train_preds == preds_3[rec._prediction_col].min()))

        # test unknown items are last
        new_items = 'new_item'
        preds_4 = rec.predict_for_user(user_id=user, item_ids=np.concatenate([items, [new_items]]))
        new_preds = preds_4[preds_4[rec._item_col] == new_items][rec._prediction_col]
        self.assertTrue(all(new_preds == preds_4[rec._prediction_col].min()))

        # test for unknown user all predictions are the same
        preds_5 = rec.predict_for_user(user_id='new_user', item_ids=items)
        self.assertEqual(preds_5[rec._prediction_col].min(), preds_5[rec._prediction_col].max())

        # test doesn't take more than 0.05 second
        logger.info(f'predict_for_user for {rec} took {elapsed:.3f} seconds.')
        self.assertGreater(0.06 * (1 + 2 * int(DEBUG_ON)), elapsed)  #  allow more time if debugging
Beispiel #17
0
    def _rank_items_for_user(cls,
                             model: BaseDFSparseRecommender,
                             user_id,
                             item_ids,
                             mode,
                             rank_training_last=True,
                             min_score=None):
        """Rank *item_ids* for *user_id* with *model* according to *mode*.

        When mode is disabled the items are returned in their given order
        with None scores. Otherwise the model's per-user predictions are
        used; if *min_score* is set, scores below it are clamped to it and
        counted as "unknown". Returns a dict with keys 'user_id',
        'ranked_items' and 'scores'; timing is logged.
        """
        start_time = time.time()
        n_unknowns = 0

        if mode == cls.mode_disabled:
            scores = [None] * len(item_ids)
        else:
            pred_df = model.predict_for_user(
                user_id=user_id,
                item_ids=item_ids,
                rank_training_last=rank_training_last,
                sort=True,
                combine_original_order=cls._combine_original_order(mode),
            )
            item_ids = pred_df[model._item_col].tolist()
            raw_scores = pred_df[model._prediction_col].values
            if min_score is not None:
                # clamp low-confidence scores and count them as unknown
                below_threshold = raw_scores < min_score
                n_unknowns = below_threshold.sum()  # is a numpy array
                raw_scores[below_threshold] = min_score
            scores = raw_scores.tolist()

        result = {
            'user_id': user_id,
            'ranked_items': item_ids,
            'scores': scores
        }

        logger.info(
            'Ran ranking for user %s (%d items, %d unknown) in %.3f seconds for mode %s.'
            % (str(user_id), len(scores), n_unknowns, time.time() - start_time,
               str(mode)))
        return result
Beispiel #18
0
 def _model_reloading_loop(self):
     """Background loop that periodically loads the newest model from S3.

     Skips the reload when the latest S3 path matches the current one.
     A failed update is logged and tolerated — unless no model was ever
     loaded, in which case startup fails hard with EnvironmentError.
     """
     # jitter spreads out the S3 requests of concurrently started workers
     time.sleep(self._time_jitter())
     while self.keep_reloading:
         try:
             new_model_s3_path = self._latest_s3_model_path()
             if new_model_s3_path == self._current_model_path:
                 logger.info('Model path unchanged, not reloading. %s' %
                             new_model_s3_path)
             else:
                 updated_model = S3FileIO(
                     self._s3_bucket).unpickle(new_model_s3_path)
                 # validate before swapping in, so a bad artifact never
                 # becomes the serving model
                 self._test_loaded_model(updated_model)
                 self.model = updated_model
                 self._current_model_path = new_model_s3_path
                 logger.info('Loaded updated model from S3. %s' %
                             new_model_s3_path)
         except Exception as e:
             logger.error('Failed model update. %s' % str(e))
             logger.exception(e)
             if self.model is None:
                 raise EnvironmentError('Could not load model on startup.')
         time.sleep(self._update_interval_seconds + self._time_jitter())
Beispiel #19
0
    def fit_with_early_stop(self, train_obs, valid_ratio=0.04, refit_on_all=False, metric='AUC',
                            epochs_start=0, epochs_max=200, epochs_step=10, stop_patience=10,
                            plot_convergence=True, decline_threshold=0.05, k=10, valid_split_time_col=None):
        """Fit the model with early stopping on a held-out validation split.

        Parameters (beyond those forwarded to `early_stopping_runner`):
            train_obs: full training observations; a validation split is
                carved out of it.
            valid_ratio: fraction of interactions used for validation.
            refit_on_all: if True, refit on all data for the best epoch
                count instead of restoring the checkpointed model.
            metric: column of the ranking report used as the early-stop score.
            k: ranking cutoff for evaluation.
            valid_split_time_col: optional column name for a time-based split.

        Returns:
            self (fitted), with `early_stop_metrics_df` populated.
        """
        # split validation data; for a random split both the user and the
        # interaction ratios are sqrt(valid_ratio) so their product is valid_ratio
        train_obs_internal, valid_obs = train_obs.split_train_test(
            ratio=valid_ratio ** 0.5 if valid_split_time_col is None else valid_ratio,
            users_ratio=valid_ratio ** 0.5 if valid_split_time_col is None else 1,
            time_split_column=valid_split_time_col,
            random_state=RANDOM_STATE)

        self.model = None
        self.model_checkpoint = None
        all_metrics = pd.DataFrame()

        def update_full_metrics_df(cur_epoch, report_df):
            # accumulate one evaluation report row per epoch checkpoint
            nonlocal all_metrics
            # FIX: DataFrame.append is deprecated and removed in pandas 2.0;
            # pd.concat is the supported equivalent
            all_metrics = pd.concat(
                [all_metrics, report_df.rename(index={'test': cur_epoch})],
                sort=False)

        def check_point_func():
            # snapshot the best model unless we will refit from scratch anyway
            if not refit_on_all:
                self.model_checkpoint = deepcopy(self.model)

        def score_func(cur_epoch, step):
            # train `step` more epochs and score on the validation split
            self.fit_partial(train_obs_internal, epochs=step)
            lfm_report = self.eval_on_test_by_ranking(
                valid_obs.df_obs, include_train=False, prefix='', k=k)
            cur_score = float(lfm_report.loc['test', metric])
            update_full_metrics_df(cur_epoch, lfm_report)
            return cur_score

        best_epoch = early_stopping_runner(
            score_func=score_func,
            check_point_func=check_point_func,
            epochs_start=epochs_start,
            epochs_max=epochs_max,
            epochs_step=epochs_step,
            stop_patience=stop_patience,
            decline_threshold=decline_threshold,
            plot_graph=plot_convergence
        )
        simple_logger.info('Early stop, all_metrics:\n' + str(all_metrics))

        if plot_convergence:
            # normalize each metric to its max so all curves share one scale
            all_metrics = all_metrics.divide(all_metrics.max())
            all_metrics.plot()
        self.early_stop_metrics_df = all_metrics

        self._set_epochs(epochs=best_epoch)

        if refit_on_all:
            simple_logger.info('Refitting on whole train data for %d epochs' % best_epoch)
            self.fit(train_obs)
        else:
            simple_logger.info('Loading best model from checkpoint at %d epochs' % best_epoch)
            self.model, self.model_checkpoint = self.model_checkpoint, None

        return self
Beispiel #20
0
 def __call__(self, result):
     """Log progress for one hyperparameter-search iteration: flag a new
     best result and print the latest loss with its parameters."""
     latest_loss = result.func_vals[-1]
     if result.fun >= latest_loss:
         # the latest point matches or beats the best loss so far
         simple_logger.info('best params, iteration %d' % len(result.func_vals))
     simple_logger.info('params for loss=%f:' % latest_loss)
     latest_values = result.x_iters[-1]
     simple_logger.info(self.search_inst.values_to_dict(latest_values))
Beispiel #21
0
    def test_b_2_lfm_rec_evaluation(self):
        """Compare the exact ranking evaluation with the approximate one and
        require the reports to agree within per-metric tolerances."""
        k = self.k

        rep_exact = self.state.lfm_rec.eval_on_test_by_ranking_exact(
            self.state.test_obs.df_obs, prefix='lfm regular exact ', k=k)
        logger.info(rep_exact)

        rep_reg = self.state.lfm_rec.eval_on_test_by_ranking(
            self.state.test_obs.df_obs, prefix='lfm regular ', n_rec=200, k=k)
        logger.info(rep_reg)

        self.assertListEqual(list(rep_reg.columns), list(rep_exact.columns))

        # test that those fields are almost equal for the two test methods
        logger.info('deviations from exact evaluation')
        for col in rep_reg.columns:
            deviations = abs(1 - (rep_exact[col].values / rep_reg[col].values))
            logger.info(f'{col}: {deviations}')
            # tolerance depends on the metric family
            if 'AUC' in col:
                tolerance = 0.1
            elif 'coverage' in col:
                tolerance = 0.03
            else:
                tolerance = 0.01
            self.assertTrue(all(deviations < tolerance))
Beispiel #22
0
    def compare_similarity_results(self,
                                   item_id,
                                   items_lists,
                                   scores_lists=None,
                                   names=None,
                                   print_data=True):
        """Visualize several similarity result lists around a source item.

        Builds a combined dataframe of the source item plus each result list
        (tagged with a variant name and score), optionally logs it, and
        returns a mapper with per-variant markers and heatmaps plus a black
        marker for the source item.

        Parameters:
            item_id: id of the source item.
            items_lists: list of item-id lists, one per variant.
            scores_lists: optional matching lists of similarity scores;
                defaults to a constant score of 5 per item.
            names: optional variant names; defaults to "0", "1", ...
            print_data: if True, log the combined dataframe.

        FIX: removed an unreachable second `if names is None` check — `names`
        is always assigned a default at the top of the method.
        """
        df_item_source = self.items_filtered_by_ids([item_id])
        items_dfs = [self.items_filtered_by_ids(l) for l in items_lists]

        # add variant and scores field (defaults applied first)
        if names is None:
            names = [str(i) for i in range(len(items_lists))]
        if scores_lists is None:
            scores_lists = [[5] * len(df) for df in items_dfs]
        all_lists = [
            df.assign(variant=name, score=scores[:len(df)])
            for df, name, scores in zip(items_dfs + [df_item_source], names +
                                        ['source'], scores_lists + [[0]])
        ]
        all_data = pd.concat(all_lists, sort=False)

        # add counts
        all_data = all_data.join(
            all_data[self.item_id_col].value_counts().to_frame('count'),
            on=self.item_id_col)
        all_data = all_data.set_index(self.item_id_col)

        if print_data:
            logger.info('\n' + str(all_data))

        mapper = self.mapper_class(all_data)  # for view init

        colors = self.mapper_class.get_n_spaced_colors(len(items_dfs))

        # poor man's legend
        logger.info("Poor man's legend: " + str(list(zip(colors, names))))

        for i in range(len(items_lists)):

            if scores_lists[i] is None:
                size = 5
            else:
                # filter simil scores only for those items that we have data for
                listings_with_data = items_dfs[i][self.item_id_col].values
                simil_scores = [
                    score for listing_id, score in zip(items_lists[i],
                                                       scores_lists[i])
                    if listing_id in listings_with_data
                ]
                # cubic scaling exaggerates marker-size differences
                size = 3 + (10 * np.array(simil_scores)**3).astype(np.int32)
                size = [int(el) for el in size]

            mapper.add_markers(items_dfs[i], size=size, color=colors[i])
            mapper.add_heatmap(items_dfs[i],
                               sensitivity=1,
                               opacity=0.4,
                               spread=50,
                               color=colors[i])

        mapper.add_markers(df_item_source, color='black', size=7)

        return mapper
Beispiel #23
0
 def pickle(self, obj, remote_path, compress=True):
     """Serialize *obj* with pickle and upload the bytes to *remote_path*."""
     logger.info('S3: pickling to %s' % remote_path)
     payload = pickle.dumps(obj)
     return self.write_binary(payload,
                              remote_path,
                              compress=compress)
Beispiel #24
0
 def unpickle(self, remote_path):
     """Download *remote_path* from S3 and unpickle its contents.

     NOTE: pickle deserialization is only safe for trusted data.
     """
     logger.info('S3: unpickling from %s' % remote_path)
     raw = self.read(remote_path)
     return pickle.loads(raw)
Beispiel #25
0
 def local_to_remote(self, local_path, remote_path, compress=True):
     """Upload the bytes of a local file to *remote_path* on S3."""
     logger.info('S3: copying from %s to %s' % (local_path, remote_path))
     with open(local_path, 'rb') as source:
         self.write_binary(source.read(), remote_path, compress=compress)