Example #1
    def _create_csvs():
        print('creating CSV...')

        # create no_cluster/full
        path = 'dataset/preprocessed/no_cluster'
        full = data.full_df()
        train_len = data.read_config()[data.TRAIN_LEN_KEY]

        train = full.iloc[0:train_len]
        test = full.iloc[train_len:len(full)]
        target_indices = get_target_indices(test)

        check_folder('dataset/preprocessed/no_cluster/full')
        train.to_csv(os.path.join(path, 'full/train.csv'))
        test.to_csv(os.path.join(path, 'full/test.csv'))
        np.save(os.path.join(path, 'full/train_indices'), train.index)
        np.save(os.path.join(path, 'full/test_indices'), test.index)
        np.save(os.path.join(path, 'full/target_indices'), target_indices)

        no_of_rows_in_small = int(
            input('How many rows do you want in small.csv? '))
        train_small = get_small_dataset(train,
                                        maximum_rows=no_of_rows_in_small)
        check_folder('dataset/preprocessed/no_cluster/small')
        split(train_small, os.path.join(path, 'small'))

        check_folder('dataset/preprocessed/no_cluster/local')
        split(train, os.path.join(path, 'local'))

        # create item_metadata in preprocess folder
        original_item_metadata = data.accomodations_original_df()
        original_item_metadata.to_csv(data.ITEMS_PATH)

        # append missing accommodations to item metadata
        append_missing_accomodations('full')
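
These snippets rely on a check_folder helper that is not shown. Judging from its call sites (sometimes a directory, sometimes a file path right before a to_csv), a minimal sketch of plausible behavior, not the repo's actual implementation, could be:

    import os

    def check_folder(path):
        # assumed behavior: ensure the directory (or the parent directory
        # of a file path) exists before writing into it
        folder = path if os.path.splitext(path)[1] == '' else os.path.dirname(path)
        if folder and not os.path.exists(folder):
            os.makedirs(folder)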
def score_fn(model, X, y):
    # global counter tracking how many times the scorer has been invoked
    global i
    print(i)
    i += 1
    t0 = time.time()

    target_indices = data.target_indices('local')
    full_impressions = data.full_df()

    scores = list(model.xg.predict(X))

    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(
            map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count = count + len(impressions)

    mrr = model.compute_MRR(final_predictions)
    print('Done in', time.time() - t0)
    print()

    return mrr
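
Since score_fn follows scikit-learn's callable-scorer signature (estimator, X, y), it could plausibly be handed to the cross-validation utilities. A hypothetical sketch, assuming model is a scikit-learn-compatible wrapper exposing .xg and .compute_MRR, and that X and y are already built:

    from sklearn.model_selection import cross_val_score

    i = 0  # the global counter read and incremented by score_fn
    fold_mrrs = cross_val_score(model, X, y, scoring=score_fn, cv=3)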
Example #3
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        # keep only the clickout interactions
        res_df = df[df.action_type == 'clickout item'][['user_id', 'session_id', 'prices']]

        # expand the prices as vector
        expanded_prices = res_df.prices.str.split('|', expand=True).fillna(0).astype('int')

        # log-scale the prices
        log_prices = np.log(expanded_prices + 1)

        # min-max normalize over the whole price matrix
        max_price = log_prices.values.max()
        min_price = log_prices.values.min()

        log_prices = (log_prices - min_price) / (max_price - min_price)

        # add the prices to the resulting df
        for i in range(25):
            res_df['price_{}'.format(i)] = log_prices.loc[:, i]
        
        return res_df.drop(['user_id','session_id','prices'], axis=1)
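
For context, str.split('|', expand=True) pads shorter rows with None, which is exactly why the fillna(0) is needed before the cast to int:

    import pandas as pd

    s = pd.Series(['10|25|40', '15'])
    print(s.str.split('|', expand=True))
    #     0     1     2
    # 0  10    25    40
    # 1  15  None  None   <- padded, hence fillna(0)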
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()
        df = df.sort_values(['user_id', 'session_id', 'timestamp',
                             'step']).reset_index()

        # find the last clickout rows
        last_clickout_idxs = find_last_clickout_indices(df)
        clickout_rows = df.loc[
            last_clickout_idxs,
            ['user_id', 'session_id', 'impressions', 'index']]
        clickout_rows['impressions_count'] = clickout_rows.impressions.str.split('|').str.len()
        clickout_rows = clickout_rows.drop('impressions', axis=1)

        # multi-hot the counts
        one_hot_counts = np.zeros((clickout_rows.shape[0], 25), dtype=np.int8)
        for i, c in tqdm(enumerate(clickout_rows.impressions_count.values)):
            one_hot_counts[i, 0:c] = 1

        # add to the clickouts
        for i in range(25):
            clickout_rows['impr_c{}'.format(i)] = one_hot_counts[:, i]

        return clickout_rows.drop('impressions_count', axis=1).set_index('index')
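
The multi-hot trick encodes an impressions count c as c leading ones, so a model can see which of the 25 impression slots were actually filled. A toy illustration:

    import numpy as np

    counts = [3, 1, 25]
    one_hot = np.zeros((len(counts), 25), dtype=np.int8)
    for i, c in enumerate(counts):
        one_hot[i, 0:c] = 1
    # one_hot[0] -> [1 1 1 0 0 ... 0]: an impressions list of length 3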
    def get_scores_cv(self, x, groups, test_indices):
        """ Return scores for a fold """
        x_val = x[test_indices]
        # feature 0 of the last row of each sample holds the original full_df index
        indices = x_val[:, :, 0][:, self.dataset.rows_per_sample - 1]

        predictions = self.model.predict(x_val[:, :, 1:])

        # take the target rows
        res_df = data.full_df()[['user_id', 'session_id',
                                 'impressions']].loc[indices].copy()
        res_df['impressions'] = res_df['impressions'].str.split('|')
        # add the scores as a new column
        res_df['scores'] = list(predictions)
        # trim the scores to the real number of impressions
        # (otherwise every row would carry the fixed 25 scores)
        res_df['length'] = res_df['impressions'].str.len()
        res_df['scores'] = res_df.apply(lambda x: x['scores'][:x['length']],
                                        axis=1)
        res_df.drop('length', axis=1, inplace=True)

        # expand the df to have a row for each item_id - score
        res_df = pd.DataFrame({
            col: np.repeat(res_df[col], res_df['scores'].str.len())
            for col in res_df.columns.drop(['impressions', 'scores'])
        }).assign(
            **{
                'item_id': np.concatenate(res_df['impressions'].values),
                'score': np.concatenate(res_df['scores'].values),
            })

        return res_df
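
The repeat/assign idiom expands the two list columns into one row per (item, score) pair. On pandas 1.3 or later the same expansion can be written with a multi-column explode; a sketch, relying on the two lists having equal length per row:

    # equivalent on pandas >= 1.3
    res_df = (res_df.explode(['impressions', 'scores'])
                    .rename(columns={'impressions': 'item_id', 'scores': 'score'}))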
Example #6
    def create_feature(self):
        # load dataset and indices
        train, train_indices = self.dataset.load_Xtrain(return_indices=True)
        test, test_indices = self.dataset.load_Xtest()
        # make predictions
        train_test = np.concatenate([train, test])
        del train
        del test
        predictions = self.model.predict(train_test).flatten()
        # build feature df
        concat_indices = np.concatenate([train_indices, test_indices])
        del train_indices
        del test_indices
        users_sessions = data.full_df().loc[concat_indices]
        feature_df = pd.DataFrame(
            {
                'user_id': users_sessions['user_id'],
                'session_id': users_sessions['session_id'],
                'rnn_binary_preds': predictions
            },
            index=concat_indices)

        path = 'dataset/preprocessed/no_cluster/{}/feature/rnn_binary_preds/features.csv'.format(
            self.mode)
        check_folder(path)
        feature_df.to_csv(path)

        return feature_df
    def get_r_hat(self):
        """
        Return the r_hat matrix as: R^ = R•S or R^ = S•R
        """
        R = self.urm
        target_indices_urm = []
        for ind in self.target_indices:
            if self.type == 'user':
                target_indices_urm.append(
                    self.dict_row[data.full_df().loc[ind]['user_id']])
            if self.type == 'session':
                target_indices_urm.append(self.dict_row[tuple(
                    data.full_df().loc[ind][['user_id', 'session_id']])])
        if self._matrix_mul_order == 'inverse':
            return self._sim_matrix.tocsr()[target_indices_urm].dot(R)
        else:
            return R[target_indices_urm].dot(self._sim_matrix)
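
The multiplication order only depends on which side the similarity matrix lives on: S·R when the similarity is between users/sessions ('inverse'), R·S when it is between items. A toy illustration with scipy.sparse:

    import scipy.sparse as sps

    R = sps.random(4, 6, density=0.5, format='csr')  # toy URM: 4 users x 6 items
    S = sps.random(4, 4, density=0.5, format='csr')  # toy user-user similarity
    r_hat_rows = S[[0, 2]].dot(R)                    # estimated ratings for users 0 and 2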
    def recommend_batch(self):
        # load full df
        print('loading df_full')
        full_df = data.full_df()
        icm = data.icm().tocsr()  # alternative: sim.normalization.bm25(data.icm().tocsr(), axis=1)

        predictions_batch = []
        self.scores_batch = []

        count = 0
        predicted_count = 0
        skipped_count = 0
        for index in tqdm(self.target_indices):

            # get the impressions of the clickout to predict
            impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
            # get the row index of the icm to predict
            icm_rows = []
            for i in impr:
                icm_rows.append(self.dict_col[i])
            temp = list(zip(impr, icm_rows))
            temp.sort(key=lambda tup: tup[1])
            list_impr_icmrows = list(zip(*temp))
            impr_sorted = list(list_impr_icmrows[0])
            icm_rows = list(list_impr_icmrows[1])

            icm_filtered = icm[icm_rows]
            r_hat_row = self.user_features_matrix[count] * icm_filtered.T

            l = list(zip(impr_sorted, r_hat_row.todense().tolist()[0]))
            l_scores = l.copy()
            l.sort(key=lambda tup: tup[1], reverse=True)
            l_scores.sort(key=lambda tup: tup[0], reverse=True)

            count += 1

            if l[0][1] == 0:
                skipped_count += 1
                self.scores_batch.append((index, [], []))
                continue
            else:
                predicted_count += 1
                p = [e[0] for e in l]
                print(f'impr: {impr}\n rec: {p}')
                predictions_batch.append((index, p))

                scores = [e[1] for e in l]
                self.scores_batch.append((index, p, scores))

                print(scores)

        print(
            f'predicted percentage: {predicted_count/len(self.target_indices)}\n skipped percentage: {skipped_count/len(self.target_indices)}'
        )
        print('prediction created !!!')
        return predictions_batch
    def fit(self):

        urm = data.urm(self.mode, self.cluster, self.type, self.urm_name)
        icm = data.icm().tocsr()

        # computing target indices_urm
        target_indices_urm = []
        if self.type == 'user':
            for ind in self.target_indices:
                target_indices_urm.append(
                    self.dict_row[data.full_df().loc[ind]['user_id']])
        if self.type == 'session':
            for ind in self.target_indices:
                target_indices_urm.append(self.dict_row[tuple(
                    data.full_df().loc[ind][['user_id', 'session_id']])])

        # note: 'sps.normalize' behaves like sklearn.preprocessing.normalize here
        # (scipy.sparse itself has no normalize), an assumption based on usage
        self.user_features_matrix = sps.normalize(urm[target_indices_urm] * icm,
                                                  norm='l2', axis=0)
def _reinsert_clickout(df):
    # take the row of the missing clickout
    clickout_rows_df = df[(df['action_type'] == 'clickout item')
                          & df['reference'].isnull()]
    # check if it exists
    if len(clickout_rows_df) > 0:
        # retrieve from the full_df the clickout
        missing_click = data.full_df().loc[
            clickout_rows_df.index[0]]['reference']
        # reinsert the clickout on the df
        df.at[clickout_rows_df.index[0], 'reference'] = missing_click
    return df
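
_reinsert_clickout expects a per-session slice of the dataframe. The actual call site is not shown; a plausible usage would be:

    # hypothetical: repair every session of a split dataframe
    df = df.groupby(['user_id', 'session_id'], group_keys=False).apply(_reinsert_clickout)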
Example #11
    def get_scores_cv(self, x, groups, test_indices):
        """ Return scores for a fold """
        x_val = x[test_indices]
        indices = x_val[:, :, 0][:, self.dataset.rows_per_sample - 1]

        predictions = self.model.predict(x_val[:, :, 1:])

        # take the target rows
        res_df = data.full_df()[['user_id', 'session_id']].loc[indices].copy()

        # add the scores as a new column
        res_df['scores'] = predictions

        return res_df
Example #12
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        df = df.sort_index()
        # find the clickout rows
        clickout_rows = df[[
            'user_id', 'session_id', 'action_type', 'impressions'
        ]][df.action_type == 'clickout item']
        clickout_rows['impressions_count'] = clickout_rows.impressions.str.split('|').str.len()
        # prepare the resulting dataframe
        res_df = df[['user_id', 'session_id']].copy()
        res_df['impressions_count'] = 0

        # iterate over the sorted reference_rows and clickout_rows
        j = 0
        clickout_indices = clickout_rows.index.values

        ck_idx = clickout_indices[0]
        next_clickout_user_id = clickout_rows.at[ck_idx, 'user_id']
        next_clickout_sess_id = clickout_rows.at[ck_idx, 'session_id']
        for idx, row in tqdm(res_df.iterrows()):
            # if the current index is over the last clickout, break
            if idx > clickout_indices[-1]:
                break
            # find the next clickout index
            while idx > clickout_indices[j]:
                j += 1
                ck_idx = clickout_indices[j]
                next_clickout_user_id = clickout_rows.at[ck_idx, 'user_id']
                next_clickout_sess_id = clickout_rows.at[ck_idx, 'session_id']

            # check if row and next_clickout are in the same session
            if row.user_id == next_clickout_user_id and row.session_id == next_clickout_sess_id:
                res_df.at[idx, 'impressions_count'] = clickout_rows.at[
                    ck_idx, 'impressions_count']

        # create the 25 categories
        one_hot_counts = np.zeros((res_df.shape[0], 25), dtype=np.int8)
        for i, c in enumerate(res_df.impressions_count.values):
            one_hot_counts[i, 0:c] = 1

        for i in range(25):
            res_df['impr_c{}'.format(i)] = one_hot_counts[:, i]

        return res_df.drop(['user_id', 'session_id', 'impressions_count'],
                           axis=1)
Example #13
        def get_y_true(clickout_indices):
            df = data.full_df().loc[clickout_indices]

            def add_label(row):
                impress = list(map(int, row['impressions'].split('|')))
                ref = row['reference']

                if ref in impress:
                    return 1 if impress[0] == ref else 0
                else:
                    return 0

            df = df.astype({'reference': 'int'})
            df['label'] = df.progress_apply(add_label, axis=1)
            return df['label']
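
So the label is 1 only when the clicked reference sits in the first position of the impressions list. For example:

    row = {'impressions': '91|17|55', 'reference': 91}
    impress = list(map(int, row['impressions'].split('|')))
    label = 1 if row['reference'] in impress and impress[0] == row['reference'] else 0  # -> 1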
Example #14
    def get_scores_batch(self):
        final_predictions = []

        count = 0
        for index in tqdm(self.target_indices):
            impr = list(
                map(int,
                    data.full_df().loc[index]['impressions'].split('|')))
            pred = self.predictions[count][0:len(impr)]
            couples = list(zip(pred, impr))
            # print(couples)
            couples.sort(key=lambda x: x[0], reverse=True)
            scores, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr), list(scores)))
            count += 1
        return final_predictions
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        # count the popularity ('== True' also maps NaN references to False)
        #cnt = Counter(df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)].reference.values.astype(int))
        pop_df = df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)] \
                    [['reference','frequence']].astype('int').groupby('reference').sum()
        cnt = pop_df.to_dict()['frequence']

        # find the clickout rows
        clickout_rows = df[df.action_type == 'clickout item'][[
            'reference', 'impressions'
        ]]
        clickout_rows = clickout_rows.fillna(-1).astype({'reference': 'int'})
        clickout_rows['impressions'] = clickout_rows.apply(
            lambda x: list(map(int, x.impressions.split('|'))), axis=1)

        # build the resulting matrix
        matrix = np.zeros((clickout_rows.shape[0], 25), dtype=int)

        i = 0
        for impression_list in tqdm(clickout_rows.impressions):
            for j, impr in enumerate(impression_list):
                ## OLD version
                #popularity = cnt[impr] if impr in cnt else 0
                #if popularity == row.reference:
                #    popularity -= 1

                ## NEW ! (subtract 1 from all references)
                popularity = cnt[impr] - 1 if impr in cnt else 0
                matrix[i, j] = popularity
            i += 1

        # scale log and min-max
        min_pop = np.log((pop_df['frequence'] - 1).clip(0).min() + 1)
        max_pop = np.log((pop_df['frequence'] - 1).clip(0).max() + 1)

        matrix = (np.log(matrix + 1) - min_pop) / (max_pop - min_pop)

        # add the columns to the resulting dataframe
        for i in range(25):
            clickout_rows['impr_pop{}'.format(i)] = matrix[:, i]

        return clickout_rows.drop(['reference', 'impressions'], axis=1)
    def compute_MRR(self, predictions):
        """
        :param predictions:
        :return: MRR computed on just the sessions where the clickout is not on the first impression
        """
        #assert (self.mode == 'local' or self.mode == 'small')
        #train_df = pd.read_csv('dataset/preprocessed/no_cluster/full/train.csv'.format(
        #        self.cluster), usecols=['reference', 'impressions'])

        if self.mode == 'full':
            train_df = data.full_df()
        else:
            train_df = data.train_df('full')

        target_indices, recs = zip(*predictions)
        target_indices = list(target_indices)
        correct_clickouts = train_df.loc[target_indices].reference.values
        impression = train_df.loc[target_indices].impressions.values
        len_rec = len(recs)
        count = 0

        RR = 0
        print("Calculating MRR (hoping for a 0.99)")
        for i in tqdm(range(len_rec)):
            if correct_clickouts[i] not in impression[i].split('|'):
                print(f'Reference {correct_clickouts[i]} not in impression')
                continue
            if impression[i].split('|').index(
                    correct_clickouts[i]) != 0 or not self.class_weights:
                correct_clickout = int(correct_clickouts[i])
                if correct_clickout in predictions[i][1]:
                    rank_pos = recs[i].index(correct_clickout) + 1
                    if rank_pos <= 25:
                        RR += 1 / rank_pos
                count += 1
            else:
                print('skipping because:')
                print(impression[i])
                print(correct_clickouts[i])

        MRR = RR / count
        print(f'MRR: {MRR}')

        return MRR
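
For reference, MRR averages the reciprocal rank of the correct item over the counted sessions, MRR = (1/N) * sum_i 1/rank_i. A toy check:

    ranks = [1, 3, 2]  # rank of the clicked item in each counted session
    mrr = sum(1 / r for r in ranks) / len(ranks)  # (1 + 1/3 + 1/2) / 3 ~ 0.611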
Example #17
    def recommend_batch(self, target_indices):
        X, indices = self.dataset.load_Xtest()

        # predict the references
        predictions = self.model.predict(X)

        # flatten the predictions and the indices to be 2-dimensional
        predictions = predictions.reshape((-1, predictions.shape[-1]))
        indices = indices.flatten()

        # take only the target predictions
        pred_df = pd.DataFrame(predictions)
        pred_df['orig_index'] = indices
        pred_df = pred_df.set_index('orig_index')
        predictions = pred_df.loc[target_indices].sort_index().values
        del pred_df

        assert len(predictions) == len(target_indices)

        full_df = data.full_df()
        accomodations1hot_df = data.accomodations_one_hot()

        result_predictions = []
        for k, index in tqdm(enumerate(target_indices)):
            # get the impressions of the clickout to predict
            impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
            # get the true labels from the accomodations one-hot
            true_labels = accomodations1hot_df.loc[impr].values
            # build a list of (impression, l2norm distance)
            prediction_impressions_distances = [
                (impr[j], L2Norm(true_labels[j] - predictions[k]))
                for j in range(len(impr))
            ]
            # order the list based on the l2norm (smaller distance is better)
            prediction_impressions_distances.sort(key=lambda tup: tup[1])
            # get only the impressions ids
            ordered_impressions = list(
                map(lambda x: x[0], prediction_impressions_distances))
            # append the couple (index, reranked impressions)
            result_predictions.append((index, ordered_impressions))

        print('prediction created !!!')

        return result_predictions
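
L2Norm is an external helper not shown here; given that it is applied to the difference of two feature vectors, it is presumably just the Euclidean norm (an assumption):

    import numpy as np

    def L2Norm(v):
        # assumed helper: Euclidean (L2) norm of a vector
        return np.linalg.norm(v)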
Example #18
    def __init__(self,
                 mode,
                 cluster,
                 urm_name,
                 factors=100,
                 regularization=0.01,
                 iterations=10,
                 alpha=25):
        os.environ['MKL_NUM_THREADS'] = '1'  # avoid MKL thread oversubscription during ALS training
        name = 'ALS urm_name: {}\n factors: {}\n regularization: {}\n ' \
                    'iterations: {}\n alpha: {}'.format(urm_name, factors, regularization, iterations, alpha)
        super(AlternatingLeastSquare, self).__init__(mode, cluster, name)

        self.factors = int(factors)
        self.regularization = regularization
        self.iterations = int(iterations)
        self.alpha = int(alpha)

        self.target_indices = data.target_indices(mode, cluster)

        self.dict_row = data.dictionary_row(mode, cluster)
        self.target_indices_urm = []
        for ind in self.target_indices:
            self.target_indices_urm.append(self.dict_row[tuple(
                data.full_df().loc[ind][['session_id', 'user_id']])])

        self.urm = data.urm(mode=mode, cluster=cluster, urm_name=urm_name)
        self.user_vecs = None
        self.item_vecs = None
        self._model = None

        self.fixed_params_dict = {
            'mode': mode,
            'urm_name': urm_name,
            'cluster': cluster
        }

        self.hyperparameters_dict = {
            'factors': (50, 200),
            'regularization': (0, 1),
            'iterations': (1, 250),
            'alpha': (15, 45)
        }
Example #19
def _merge_sessions():
    print("Merging similar sessions (same user_id and city)")
    print("Loading full_df")
    full_df = data.full_df()
    print("Sorting, grouping, and other awesome things")
    grouped = full_df.sort_values(["user_id", "timestamp"],
                                  ascending=[True, True]).groupby(["user_id", "city"])
    new_col = np.array(["" for _ in range(len(full_df))], dtype=object)
    print("Now I'm really merging...")
    for name, g in tqdm(grouped):
        s_id = g.iloc[0]["session_id"]
        new_col[g.index.values] = s_id
    print("Writing on the df")
    full_df["unified_session_id"] = pd.Series(new_col)
    print("Saving new df to file")
    with open(data.FULL_PATH, 'w', encoding='utf-8') as f:
        full_df.to_csv(f)
    data.refresh_full_df()
    def extract_feature(self):
        df = data.full_df()

        # find the clickout rows
        clickout_rows = df[df.prices.notnull()][['impressions', 'prices']]
        # cast the impressions and the prices to lists
        clickout_rows['impressions'] = clickout_rows.impressions.str.split('|')
        clickout_rows['prices'] = clickout_rows.prices.str.split('|')

        clickout_rows = pd.DataFrame({col:np.concatenate(clickout_rows[col].values) \
                                    for col in clickout_rows.columns }).astype('int')
        # compute mean and standard deviation
        res_df = clickout_rows.groupby('impressions').agg(['mean', 'std']).reset_index()
        res_df.columns = ['_'.join(x) for x in res_df.columns.ravel()]
        res_df = res_df.rename(columns={'impressions_': 'item_id'})
        res_df['prices_std'] = res_df['prices_std'].fillna(0)

        return res_df
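
On recent pandas the flattening of the two-level columns can be avoided with named aggregation; an equivalent sketch:

    # equivalent mean/std per item with named aggregation (pandas >= 0.25)
    res_df = (clickout_rows.groupby('impressions')['prices']
              .agg(prices_mean='mean', prices_std='std')
              .reset_index()
              .rename(columns={'impressions': 'item_id'}))
    res_df['prices_std'] = res_df['prices_std'].fillna(0)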
Example #21
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        df = df.sort_index()
        # find the clickout rows
        clickout_rows = df[['user_id','session_id','action_type','impressions','prices']][df.action_type == 'clickout item']
        # cast the impressions and the prices to lists
        clickout_rows['impression_list'] = clickout_rows.impressions.str.split('|')
        clickout_rows['price_list'] = clickout_rows.prices.str.split('|').apply(lambda x: list(map(int,x)))
        # order the prices
        clickout_rows['sorted_price_list'] = clickout_rows.price_list.apply(lambda x: sorted(x))
        clickout_rows = clickout_rows.drop('prices', axis=1)
        # find the interaction with numeric reference
        reference_rows = df[['user_id','session_id','reference','action_type']]
        reference_rows = reference_rows[(df.reference.str.isnumeric() == True) & (df.action_type != 'clickout item')]
        reference_rows = reference_rows.drop('action_type',axis=1)
        reference_rows['price_pos'] = -1
        reference_rows = reference_rows.sort_index()

        # iterate over the sorted reference_rows and clickout_rows
        j = 0
        clickout_indices = clickout_rows.index.values
        for idx,row in tqdm(reference_rows.iterrows()):
            # if the current index is over the last clickout, break
            if idx >= clickout_indices[-1]:
                break
            # find the next clickout index
            while idx > clickout_indices[j]:
                j += 1
            next_clickout = clickout_rows.loc[clickout_indices[j]]

            # check if row and next_clickout are in the same session
            if row.user_id == next_clickout.user_id and row.session_id == next_clickout.session_id:
                try:
                    ref_idx = next_clickout.impression_list.index(row.reference)
                    ref_price = int(next_clickout.price_list[ref_idx])
                    reference_rows.at[idx, 'price_pos'] = next_clickout.sorted_price_list.index(ref_price)
                except ValueError:
                    # the reference is not among the impressions of the next clickout
                    pass
        
        return reference_rows.drop('reference', axis=1)
Example #22
    def __init__(self,
                 mode='local',
                 learning_rate=0.3,
                 min_child_weight=1,
                 n_estimators=300,
                 max_depth=3,
                 subsample=1,
                 colsample_bytree=1,
                 reg_lambda=1.0,
                 reg_alpha=0):
        name = 'gbdt_hybrid'
        cluster = 'no_cluster'
        super(Gbdt_Hybrid, self).__init__(mode, cluster, name)

        self.current_directory = Path(__file__).absolute().parent
        self.data_directory = self.current_directory.joinpath(
            '..', '..', 'submissions/hybrid')
        #self.gt_csv = self.data_directory.joinpath('ground_truth.csv')
        self.mode = mode
        self.full = data.full_df()

        self.local_target_indices = data.target_indices(mode='local',
                                                        cluster='no_cluster')
        self.full_target_indices = data.target_indices(mode='full',
                                                       cluster='no_cluster')

        directory = self.data_directory.joinpath('local')

        full_dir = self.data_directory.joinpath('full')

        self.xg = xgb.XGBRanker(learning_rate=learning_rate,
                                min_child_weight=min_child_weight,
                                max_depth=math.ceil(max_depth),
                                n_estimators=math.ceil(n_estimators),
                                subsample=subsample,
                                colsample_bytree=colsample_bytree,
                                reg_lambda=reg_lambda,
                                reg_alpha=reg_alpha,
                                n_jobs=-1,
                                objective='rank:ndcg')

        self.cv_path = self.data_directory.joinpath('cross_validation')
    def extract_feature(self):
        df = data.full_df()

        # count the numeric references (skipping NaN in the test)
        res_df = df[(df.action_type == 'clickout item') & (df.reference.str.isnumeric() == True)]
        res_df = res_df[['reference','frequence']].astype('int').groupby('reference').sum()
        res_df['frequence'] -= 1
        res_df = res_df[res_df['frequence'] > 0]

        # scale log and min-max
        min_pop = res_df['frequence'].values.min()
        max_pop = res_df['frequence'].values.max()

        min_pop = np.log(min_pop + 1)
        max_pop = np.log(max_pop + 1)

        res_df['frequence'] = (np.log(res_df['frequence'].values + 1) - min_pop) / (max_pop - min_pop)

        res_df = res_df.reset_index()
        return res_df.rename(columns={'reference': 'item_id', 'frequence': 'glob_clickout_popularity'})
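
The log-then-min-max pattern recurs across several of these popularity and price features; factored out, it would look like this (a sketch, not a helper taken from the repo):

    import numpy as np

    def log_minmax(values):
        # log-scale, then min-max normalize to [0, 1]
        logv = np.log(np.asarray(values) + 1)
        return (logv - logv.min()) / (logv.max() - logv.min())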
    def recommend_batch(self, target_indices):
        X, indices = self.dataset.load_Xtest()

        # predict the references
        predictions = self.model.predict(X)

        # take only the last index for each session (target row) and flatten
        #predictions = predictions.reshape((-1, predictions.shape[-1]))
        #indices = indices[:,-1].flatten()

        # take only the target predictions
        pred_df = pd.DataFrame(predictions)
        pred_df['orig_index'] = indices
        pred_df = pred_df.set_index('orig_index')
        predictions = pred_df.loc[target_indices]
        del pred_df

        assert len(predictions) == len(target_indices)

        full_df = data.full_df()

        result_predictions = []
        for index in tqdm(target_indices):
            # get the impressions of the clickout to predict
            impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
            # build a list of (impression, score)
            prediction_impressions_distances = [(impr[j], predictions.at[index, j])
                                                for j in range(len(impr))]
            # order the list based on scores (greater is better)
            prediction_impressions_distances.sort(key=lambda tup: tup[1],
                                                  reverse=True)
            # get only the impressions ids
            ordered_impressions = list(
                map(lambda x: x[0], prediction_impressions_distances))
            # append the couple (index, reranked impressions)
            result_predictions.append((index, ordered_impressions))

        print('prediction created !!!')

        return result_predictions
    def get_scores_batch(self, scores_type='test'):
        assert scores_type in ['train', 'test']

        if scores_type == 'test':
            X, indices = self.dataset.load_Xtest()
        else:
            X, indices = self.dataset.load_Xtrain(return_indices=True)

        predictions = self.model.predict(X)

        full_df = data.full_df()

        result_predictions = []
        for i, index in tqdm(enumerate(indices)):
            # get the impressions of the clickout to predict
            impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
            scores = predictions[i]
            # append the triple (index, impressions, scores)
            result_predictions.append((index, impr, scores))

        return result_predictions
    def recommend_batch(self):
        X_test, _, _, _ = data.dataset_xgboost_test(mode=self.mode,
                                                    cluster=self.cluster,
                                                    kind=self.kind)
        target_indices = data.target_indices(self.mode, self.cluster)
        full_impressions = data.full_df()
        print('data for test ready')
        scores = list(self.xg.predict(X_test))
        final_predictions = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(
                map(int,
                    full_impressions.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            _, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            count = count + len(impressions)
        return final_predictions
Example #27
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()

        # mapping and encoding
        change_sort_filters = {'sort by price', 'best value',
                               'sort by rating', 'focus on rating',
                               'sort by popularity'}
        sof_classes = ['sort_rating', 'sort_pop', 'sort_price']
        mapping = {
            'sort by price':      [0,0,1],
            'best value':         [0,1,1],
            'sort by rating':     [1,0,0],
            'focus on rating':    [1,1,0],
            'sort by popularity': [0,1,0],
        }

        rows = df[(df.action_type == 'clickout item') & df.current_filters.notnull()]
        rows = rows[['current_filters']]
        # filter the filters by the sort filters and re-cast to list
        rows['filters_list'] = rows['current_filters'].str.lower().str.split('|')\
                                        .progress_apply(lambda x: list(set(x) & change_sort_filters))
        rows = rows.drop(['current_filters'], axis=1)
        rows = rows[rows['filters_list'].str.len() > 0]
        # keep a single active sort filter (the set intersection has arbitrary order)
        rows['filters_list'] = rows['filters_list'].apply(lambda x: x[0])

        # iterate over the interactions
        print('Total interactions:', rows.shape[0])
        matrix = np.zeros((rows.shape[0], len(sof_classes)), dtype='int8')
        k = 0
        for fl in tqdm(rows['filters_list'].values):
            matrix[k,:] = mapping[fl]
            k += 1

        # add the 3 new columns
        for i,col_name in enumerate(sof_classes):
            rows[col_name] = matrix[:,i]

        return rows.drop('filters_list', axis=1)
Example #28
    def recommend_batch(self):

        final_predictions = []
        scores_batch = []

        count = 0
        for index in tqdm(self.target_indices):
            impr = list(
                map(int,
                    data.full_df().loc[index]['impressions'].split('|')))
            pred = self.predictions[count][0:len(impr)]
            couples = list(zip(pred, impr))
            #print(couples)
            couples.sort(key=lambda x: x[0], reverse=True)
            scores, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            scores_batch.append((index, list(sorted_impr), list(scores)))
            count += 1
        if self.mode != 'small':
            cf.check_folder('scores')
            # dtype=object is needed to save these ragged (index, list, list) tuples
            np.save(f'scores/{self.name}', np.array(scores_batch, dtype=object))
        return final_predictions
Example #29
    def __init__(self,
                 mode,
                 cluster,
                 dataset_name,
                 pred_name,
                 predict_train=False):
        """
        the dataset name is used to load the prediction created by the tensorflow ranking class

        :param dataset_name: dataset name passed to the CreateDataset() method
        """

        name = 'tf_ranking'
        super(TensorflowRankig, self).__init__(mode=mode,
                                               cluster=cluster,
                                               name=name)

        self.dataset_name = dataset_name

        # the path where the PREDICTION are stored
        _BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{self.dataset_name}'

        _PREDICTION_PATH = f'{_BASE_PATH}/{pred_name}.npy'

        # check if the PREDICTION have been made
        exists_path_predictions = os.path.isfile(_PREDICTION_PATH)

        if not exists_path_predictions:
            print(
                f'the prediction for the \ndataset: {self.dataset_name}\n mode:{mode}\n '
                f'cluster:{cluster}\n have not been made')
            exit(0)

        self.predictions = np.load(_PREDICTION_PATH)
        print('predictions loaded')
        if not predict_train:
            self.target_indices = data.target_indices(mode, cluster)
        else:
            self.target_indices = sorted(find_last_clickouts(data.full_df()))
    def save(self, mode='full', add_unused_clickouts_to_test=True):
        """
        makes use of fit to create the dataset for a specific cluster. in particular it take cares
        to create a folder at the same level of base_split with the specified name and the
        folders structure inside 
        """
        print('Creating {} cluster...'.format(mode), end=' ', flush=True)
        self._fit(mode)

        # create cluster root folder
        path = f'dataset/preprocessed/{self.name}'
        check_folder(path)

        # create full and local folders
        full_path = os.path.join(path, mode)
        check_folder(full_path)

        train = data.train_df(mode).loc[self.train_indices]
        train.to_csv(os.path.join(full_path, 'train.csv'))
        del train

        # if some target_indices are specified, also keep the clickouts that are not to be predicted
        if add_unused_clickouts_to_test and len(self.target_indices) > 0:
            indices_from_full = list(set(self.test_indices) - set(self.target_indices))
            indices_from_test = self.target_indices
            test = pd.concat([data.test_df(mode).loc[indices_from_test], data.full_df().loc[indices_from_full]])
        else:
            test = data.test_df(mode).loc[self.test_indices]

        test.to_csv(os.path.join(full_path, 'test.csv'))

        if len(self.target_indices) > 0:
            np.save(os.path.join(full_path, 'target_indices'), self.target_indices)
        else:
            trgt_indices = preprocess.get_target_indices(test)
            np.save(os.path.join(full_path, 'target_indices'), trgt_indices)
        del test

        print('Done!')