Example #1
        def create_user_feature_dict(df):

            idxs_clicks = find_last_clickout_indices(df)
            df = df.drop(idxs_clicks)

            # retrieve the icm (item content matrix)
            icm = data.accomodations_one_hot()

            # filter on the columns of interest
            temp_df = df[['user_id', 'session_id', 'reference']].dropna()

            # keep only the rows with a numeric reference
            temp_df = temp_df[temp_df['reference'].str.isnumeric()]
            temp_df = temp_df.drop_duplicates()

            # retrieve the (user, session) pairs and the items associated to them
            users_idxs = temp_df['user_id'].tolist()
            sessions_idxs = temp_df['session_id'].tolist()
            users_sess = list(zip(users_idxs, sessions_idxs))
            items_idxs = list(map(int, temp_df['reference']))

            # create a dict with tuples ('user_id', 'session_id') as keys and, as value, the
            # array representing the user as an accommodation, summing up the features of all
            # the accommodations he has interacted with during the session
            user_session_dict = {}
            for user_session, item_idx in zip(users_sess, items_idxs):
                user_items = icm.loc[item_idx].values
                if user_session in user_session_dict:
                    user_session_dict[user_session] += user_items
                else:
                    # copy to avoid accumulating in place into the icm's own buffer
                    user_session_dict[user_session] = user_items.copy()

            return user_session_dict
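
A minimal sketch of the accumulation above on toy data, assuming data.accomodations_one_hot() returns a DataFrame with one row per item and one column per one-hot feature:

import numpy as np
import pandas as pd

# toy ICM: 3 items x 2 features
icm = pd.DataFrame([[1, 0], [0, 1], [1, 1]],
                   index=[10, 20, 30], columns=['3 Stars', 'Free WiFi'])
users_sess = [('u1', 's1'), ('u1', 's1'), ('u2', 's2')]
items_idxs = [10, 30, 20]

user_session_dict = {}
for user_session, item_idx in zip(users_sess, items_idxs):
    user_items = icm.loc[item_idx].values
    if user_session in user_session_dict:
        user_session_dict[user_session] += user_items
    else:
        user_session_dict[user_session] = user_items.copy()

print(user_session_dict[('u1', 's1')])  # [2 1]: per-feature counts for the session
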
Example #2
        def retrieve_pd_dataframe_score(df):

            icm = data.accomodations_one_hot().sort_index()

            sess_user_dict = create_user_feature_dict(df)
            idxs_clicks = find_last_clickout_indices(df)

            scores = []
            # iterate over the indices of the target clickouts and, at each iteration,
            # append one tuple per impression to the final list
            print('computing the distances...')
            for idx in tqdm(idxs_clicks):

                # retrieve the user sess and impressions of the click
                user = df.at[idx, 'user_id']
                sess = df.at[idx, 'session_id']
                impressions = list(map(int, df.at[idx, 'impressions'].split('|')))

                # retrieve the feature vector of the user-session pair; if it isn't in the
                # dictionary, it means there were no numeric actions in the session, so
                # initialize it with a zero vector
                us_tuple = (user, sess)

                if us_tuple in sess_user_dict:
                    user_feature_vec = sess_user_dict[(user, sess)]
                else:
                    user_feature_vec = np.zeros(icm.shape[1])

                # retrieve the features of the impression selected
                features_imp = icm.loc[impressions].values

                # create the variants of the user vector: CLIPPED, THRESHOLDED
                clipped_user_feature_vec = np.clip(user_feature_vec, 0, 1)

                thresholded_user_feature_vec = user_feature_vec.copy()
                if np.sum(user_feature_vec) > 0:
                    threshold_limit = np.mean(user_feature_vec[user_feature_vec > 0])
                    thresholded_user_feature_vec[thresholded_user_feature_vec < threshold_limit] = 0

                # compute the distance between the two vectors
                _scores_manhattan = manhattan_distances(user_feature_vec.reshape(1, -1), features_imp)
                _scores_cosine = cosine_similarity(user_feature_vec.reshape(1, -1), features_imp)
                _scores_jaccard_no_norm = np.dot(user_feature_vec, features_imp.T)

                _scores_manhattan_clip = manhattan_distances(clipped_user_feature_vec.reshape(1, -1), features_imp)
                _scores_cosine_clip = cosine_similarity(clipped_user_feature_vec.reshape(1, -1), features_imp)
                _scores_jaccard_no_norm_clip = np.dot(clipped_user_feature_vec, features_imp.T)

                _scores_manhattan_tr = manhattan_distances(thresholded_user_feature_vec.reshape(1, -1), features_imp)
                _scores_cosine_tr = cosine_similarity(thresholded_user_feature_vec.reshape(1, -1), features_imp)
                _scores_jaccard_no_norm_tr = np.dot(thresholded_user_feature_vec, features_imp.T)

                # append one tuple per impression to the final list
                for i in range(len(impressions)):
                    scores.append((user, sess, impressions[i],
                                   _scores_cosine[0][i], _scores_manhattan[0][i],_scores_jaccard_no_norm[i],
                                   _scores_cosine_clip[0][i], _scores_manhattan_clip[0][i],_scores_jaccard_no_norm_clip[i],
                                   _scores_cosine_tr[0][i], _scores_manhattan_tr[0][i],_scores_jaccard_no_norm_tr[i]))
            return pd.DataFrame(scores, columns=['user_id', 'session_id', 'item_id',
                                                 'scores_cosine', 'scores_manhattan', 'scores_jaccard_no_norm',
                                                 'scores_cosine_clip', 'scores_manhattan_clip', 'scores_jaccard_no_norm_clip',
                                                 'scores_cosine_tr', 'scores_manhattan_tr', 'scores_jaccard_no_norm_tr'])
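
A minimal sketch of the scoring step on toy vectors, clarifying why the user vector is reshaped to 2-D and why the results are indexed with [0][i]:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances

user_vec = np.array([2., 0., 1.])
features_imp = np.array([[1., 0., 1.],   # one row of one-hot features
                         [0., 1., 0.],   # per impression
                         [1., 1., 1.]])

# the sklearn pairwise functions expect 2-D inputs, hence reshape(1, -1);
# both return a (1, n_impressions) matrix, hence the [0][i] indexing above
cos = cosine_similarity(user_vec.reshape(1, -1), features_imp)[0]
man = manhattan_distances(user_vec.reshape(1, -1), features_imp)[0]
dot = np.dot(user_vec, features_imp.T)   # the 'jaccard_no_norm' score
print(cos, man, dot)
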
def merge_reference_features(df, pad_sessions_length):
    res_df = df.copy()
    # set the non-numeric references to 0 and cast to int
    res_df.loc[res_df.reference.str.isnumeric() != True, 'reference'] = 0
    res_df = res_df.astype({'reference': 'int'})
    # join
    res_df = res_df.merge(data.accomodations_one_hot(),
                          how='left',
                          left_on='reference',
                          right_index=True)
    # set to 0 the features of the non-joined rows
    features_cols = data.accomodations_one_hot().columns
    col_start = list(res_df.columns).index(features_cols[0])
    col_end = list(res_df.columns).index(features_cols[-1])
    res_df.loc[:, features_cols] = res_df.loc[:, features_cols].fillna(0)
    # remove the item features for the last clickout of each session
    # TODO: the clickout may not be the last action of the session
    res_df.iloc[np.arange(-1, len(res_df), pad_sessions_length)[1:],
                col_start:col_end + 1] = 0
    return res_df
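
The iloc trick above relies on the fixed session length; a small sketch of which row indices it selects:

import numpy as np

# with sessions padded to a fixed length, the last interaction of session i
# sits at row (i + 1) * pad_sessions_length - 1
pad_sessions_length = 5
n_rows = 15  # 3 padded sessions
print(np.arange(-1, n_rows, pad_sessions_length)[1:])  # [ 4  9 14]
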
def add_accomodations_features(df, path_to_save, logic='skip', row_indices=[]):
    """
    Add the features (one-hot) to the dataframe that match the 'reference' and save the resulting dataframe.
    It is possible to specify a list of rows to skip (logic='skip'), or to join only for some rows (logic='subset').
    Return the target columns and the one-hot columns that have been added to the dataframe
    """
    # save the reference series, then set the reference to NaN to skip the join on those rows
    join_data = dict()
    join_data['backup_reference_series'] = df.reference.values.copy()
    if len(row_indices) > 0:
        if logic == 'skip':
            # set to NaN the rows to be skipped
            df.loc[row_indices, 'reference'] = np.nan
        if logic == 'subset':
            # set to NaN all rows, except for the specified rows
            backup_serie = df.loc[row_indices].reference.copy()
            df.reference = np.nan
            df.loc[row_indices, 'reference'] = backup_serie

    # cast the reference column to numeric, coercing the string values to NaN
    df.reference = pd.to_numeric(df.reference, errors='coerce')

    # one-hot encoding of the accomodations features
    attributes_df = data.accomodations_one_hot()

    def post_join(chunk_df, data):
        # drop the reference column, no longer needed after the join
        return chunk_df.drop('reference', axis=1)

    sparsedf.left_join_in_chunks(df,
                                 attributes_df,
                                 left_on='reference',
                                 right_on=None,
                                 right_index=True,
                                 post_join_fn=post_join,
                                 data=join_data,
                                 path_to_save=path_to_save)
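
A minimal sketch of the 'skip' / 'subset' masking logic on a toy reference column (the project helpers used above, e.g. sparsedf.left_join_in_chunks, are not reproduced here):

import numpy as np
import pandas as pd

df = pd.DataFrame({'reference': ['11', '22', '33', '44']})
row_indices = [1, 3]

# logic='skip': NaN out the given rows so they are excluded from the join
skip = df.copy()
skip.loc[row_indices, 'reference'] = np.nan

# logic='subset': NaN out every row *except* the given ones
subset = df.copy()
subset.loc[~subset.index.isin(row_indices), 'reference'] = np.nan

print(skip.reference.tolist())    # ['11', nan, '33', nan]
print(subset.reference.tolist())  # [nan, '22', nan, '44']
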
Example #5
    def recommend_batch(self, target_indices):
        X, indices = self.dataset.load_Xtest()

        # predict the references
        predictions = self.model.predict(X)

        # reshape the predictions to 2-D and flatten the indices to 1-D
        predictions = predictions.reshape((-1, predictions.shape[-1]))
        indices = indices.flatten()

        # take only the target predictions
        pred_df = pd.DataFrame(predictions)
        pred_df['orig_index'] = indices
        pred_df = pred_df.set_index('orig_index')
        predictions = pred_df.loc[target_indices].sort_index().values
        del pred_df

        assert len(predictions) == len(target_indices)

        full_df = data.full_df()
        accomodations1hot_df = data.accomodations_one_hot()

        result_predictions = []
        for k, index in tqdm(enumerate(target_indices)):
            # get the impressions of the clickout to predict
            impr = list(map(int, full_df.loc[index]['impressions'].split('|')))
            # get the true labels from the accomodations one-hot
            true_labels = accomodations1hot_df.loc[impr].values
            # build a list of (impression, l2norm distance)
            prediction_impressions_distances = [
                (impr[j], L2Norm(true_labels[j] - predictions[k]))
                for j in range(len(impr))
            ]
            # order the list based on the l2norm (smaller distance is better)
            prediction_impressions_distances.sort(key=lambda tup: tup[1])
            # get only the impressions ids
            ordered_impressions = list(
                map(lambda x: x[0], prediction_impressions_distances))
            # append the couple (index, reranked impressions)
            result_predictions.append((index, ordered_impressions))

        print('predictions created!')

        return result_predictions
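
A minimal sketch of the reranking step on toy arrays; L2Norm is assumed to behave like np.linalg.norm (the project helper is not shown here):

import numpy as np

prediction = np.array([0.9, 0.1, 0.8])   # predicted feature vector
impr = [111, 222, 333]                   # impression ids
true_labels = np.array([[1, 0, 1],       # one-hot features per impression
                        [0, 1, 0],
                        [1, 1, 1]])

distances = [(impr[j], np.linalg.norm(true_labels[j] - prediction))
             for j in range(len(impr))]
distances.sort(key=lambda tup: tup[1])   # smaller distance ranks first
print([item for item, _ in distances])   # [111, 333, 222]
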
    def get_user_favorite_filters(full_df, users):
        """
        I want a structure that for every user in the train gives
        an one_hot_encoded structures for all possible parameters of hotels clicked by that user
        ex. parameter: 3 Stars
        """

        # get the clickouts of the given users, keeping only numeric references
        train_df = full_df[full_df["user_id"].isin(users)]
        train_df = train_df[(train_df["action_type"] == "clickout item") & (
            pd.to_numeric(train_df['reference'], errors='coerce').notnull())]

        train_df.drop([
            "session_id", "timestamp", "step", "action_type", "platform",
            "city", "device", "current_filters", "impressions", "prices"
        ],
                      axis=1,
                      inplace=True)

        # merge the one-hot metadata and drop the join columns
        metadata_one_hot = data.accomodations_one_hot().reset_index()

        train_df['reference'] = train_df['reference'].astype(int)
        metadata_one_hot['item_id'] = metadata_one_hot['item_id'].astype(int)
        train_df = pd.merge(train_df,
                            metadata_one_hot,
                            how='outer',
                            left_on='reference',
                            right_on='item_id')

        train_df = train_df.drop(["reference", "item_id"], axis=1)

        print("Finished binarizing, now summing to get each user's favorite hotel properties...")

        out_df = train_df.groupby('user_id')[train_df.columns[2:]].sum()
        return out_df
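
The final groupby-sum is what turns the one-hot rows into per-user property counts; a toy illustration:

import pandas as pd

df = pd.DataFrame({'user_id': ['u1', 'u1', 'u2'],
                   '3 Stars': [1, 0, 1],
                   'Free WiFi': [1, 1, 0]})
print(df.groupby('user_id').sum())
#          3 Stars  Free WiFi
# user_id
# u1             1          2
# u2             1          0
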
Example #7
    def extract_feature(self):
        tqdm.pandas()

        df = data.full_df()
        #df = data.train_df('small')
        accom_df = data.accomodations_one_hot()

        # these filters are of the 'change of sort order' type: they have a
        # particular meaning and must be handled in a separate feature
        change_sort_filters = {'sort by price', 'sort by distance', 'sort by rating', 'sort by popularity',
                               'focus on rating', 'focus on distance', 'best value'}
        
        # find the clickout rows
        clickouts = df[(df.action_type == 'clickout item')] # & df.current_filters.notnull()]
        clickouts = clickouts[['user_id','session_id','current_filters','impressions']]
        # split the filters and the impressions
        clickouts['filters_list'] = clickouts['current_filters'].str.lower().str.split('|').fillna('')
        clickouts['impress_list'] = clickouts['impressions'].str.split('|')
        clickouts = clickouts.drop(['impressions','current_filters'], axis=1)
        # cast the impressions to int
        clickouts['impress_list'] = clickouts['impress_list'].apply(lambda x: list(map(int, x)))
        
        # create the binarizer with the same classes as the accomodations one-hot
        mlb = MultiLabelBinarizer(classes=accom_df.columns.str.lower())
        
        # iterate over the clickouts and one-hot the filters
        print('Total interactions:', clickouts.shape[0])
        # 25 is the maximum number of impressions shown for a clickout
        result = np.zeros((clickouts.shape[0], 25), dtype='float')
        k = 0
        for idx in tqdm(clickouts.index):
            filters = clickouts.at[idx, 'filters_list']
            impressions = clickouts.at[idx, 'impress_list']
            # do not consider change-of-sort filters
            filters = set(filters).difference(change_sort_filters)
            # fix some wrong filter names
            if 'gay friendly' in filters:
                filters.remove('gay friendly')
                filters.add('gay-friendly')
            if 'internet (rooms)' in filters:
                filters.remove('internet (rooms)')
                filters.add('free wifi (rooms)')
            filters = list(filters)
            filters_len = len(filters)
            if filters_len > 0:
                # one-hot the filters
                filters_one_hot = mlb.fit_transform([filters])[0]
                # take the one-hot of the impressions tags
                impressions_features_one_hot = accom_df.loc[impressions].values

                satisfaction_percentage = np.sum( np.bitwise_and(filters_one_hot, impressions_features_one_hot), axis=1) / filters_len
                result[k, 0:len(impressions)] = satisfaction_percentage
            else:
                # there are only change-of-sort filters
                result[k, 0:len(impressions)] = 1
            k += 1
        
        result = result.round(4)

        # add the 25 new columns
        for i in range(25):
            clickouts['satisf_perc_{}'.format(i)] = result[:,i]

        return clickouts.drop(['user_id','session_id','filters_list','impress_list'], axis=1)
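
A minimal sketch of the satisfaction computation with toy classes, showing how the fixed-classes MultiLabelBinarizer lines the filter one-hot up with the accommodation features:

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

classes = ['3 stars', 'free wifi (rooms)', 'pet friendly']
mlb = MultiLabelBinarizer(classes=classes)

filters = ['3 stars', 'free wifi (rooms)']
filters_one_hot = mlb.fit_transform([filters])[0]  # [1 1 0]

impressions_one_hot = np.array([[1, 1, 1],   # impression satisfying both filters
                                [1, 0, 0]])  # impression satisfying one of two

# fraction of the active filters that each impression satisfies
satisfaction = np.sum(np.bitwise_and(filters_one_hot, impressions_one_hot),
                      axis=1) / len(filters)
print(satisfaction)  # [1.  0.5]
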
    def extract_feature(self):
        tqdm.pandas()

        tr = data.train_df(self.mode, cluster=self.cluster)
        te = data.test_df(self.mode, cluster=self.cluster)
        df = pd.concat([tr, te])
        accom_df = data.accomodations_one_hot()

        # these filters are of the 'change of sort order' type: they have a
        # particular meaning and must be handled in a separate feature
        change_sort_filters = {
            'sort by price', 'sort by distance', 'sort by rating',
            'sort by popularity', 'focus on rating', 'focus on distance',
            'best value'
        }

        # find the clickout rows
        last_clk = find(df)
        clickouts = df.loc[last_clk]
        clickouts = clickouts[[
            'user_id', 'session_id', 'current_filters', 'impressions'
        ]]
        # split the filters and the impressions
        clickouts['filters_list'] = clickouts['current_filters'].str.lower().str.split('|').fillna('')
        clickouts['impress_list'] = clickouts['impressions'].str.split('|')
        clickouts = clickouts.drop(['impressions', 'current_filters'], axis=1)
        # cast the impressions to int
        clickouts['impress_list'] = clickouts['impress_list'].apply(lambda x: list(map(int, x)))

        # create the binarizer with the same classes as the accomodations one-hot
        mlb = MultiLabelBinarizer(classes=accom_df.columns.str.lower())

        # iterate over the clickouts and one-hot the filters
        print('Total interactions:', clickouts.shape[0])
        satisfaction_percentage = []
        for idx in tqdm(clickouts.index):
            filters = clickouts.at[idx, 'filters_list']
            impressions = clickouts.at[idx, 'impress_list']
            # do not consider change-of-sort filters
            filters = set(filters).difference(change_sort_filters)
            # fix some wrong filter names
            if 'gay friendly' in filters:
                filters.remove('gay friendly')
                filters.add('gay-friendly')
            if 'internet (rooms)' in filters:
                filters.remove('internet (rooms)')
                filters.add('free wifi (rooms)')
            filters = list(filters)
            filters_len = len(filters)
            if filters_len > 0:
                # one-hot the filters
                filters_one_hot = mlb.fit_transform([filters])[0]
                # take the one-hot of the impressions tags
                impressions_features_one_hot = accom_df.loc[impressions].values
                impression_satisfaction = list(
                    np.sum(np.bitwise_and(filters_one_hot,
                                          impressions_features_one_hot),
                           axis=1) / filters_len)
                satisfaction_percentage.append(impression_satisfaction)

            else:
                # there are only change-of-sort filters
                satisfaction_percentage.append(list(np.ones(len(impressions))))

        clickouts['satisfaction_percentage'] = satisfaction_percentage

        clickouts = clickouts.drop(['filters_list'], axis=1)

        # expand the list columns: one row per (clickout, impression) pair
        keep_cols = clickouts.columns.drop(['impress_list', 'satisfaction_percentage'])
        expanded = pd.DataFrame({
            col: np.repeat(clickouts[col].values, clickouts.satisfaction_percentage.str.len())
            for col in keep_cols
        }).assign(item_id=np.concatenate(clickouts.impress_list.values),
                  satisfaction_percentage=np.concatenate(clickouts.satisfaction_percentage.values))

        return expanded
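
A toy illustration of the repeat/concatenate pattern used to build expanded, turning one row per clickout (with list columns) into one row per impression; on recent pandas the same result can be obtained with DataFrame.explode on both list columns:

import numpy as np
import pandas as pd

clickouts = pd.DataFrame({
    'session_id': ['s1', 's2'],
    'impress_list': [[10, 20], [30]],
    'satisfaction_percentage': [[1.0, 0.5], [0.0]],
})
expanded = pd.DataFrame({
    'session_id': np.repeat(clickouts['session_id'].values,
                            clickouts['satisfaction_percentage'].str.len()),
    'item_id': np.concatenate(clickouts['impress_list'].values),
    'satisfaction_percentage': np.concatenate(clickouts['satisfaction_percentage'].values),
})
print(expanded)  # 3 rows: (s1, 10, 1.0), (s1, 20, 0.5), (s2, 30, 0.0)
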
def create_dataset_for_regression(mode, cluster, pad_sessions_length, add_item_features=True, save_X_Y=True):
    """
    pad_sessions_length (int): final length of sessions after padding/truncating
    add_item_features (bool): whether to add the one-hot accomodations features to the training data
    save_X_Y (bool): whether to save the train data into 2 separate files (X_train, Y_train) or in a unique file (train_vec)
    """
    train_df = data.train_df(mode, cluster)
    test_df = data.test_df(mode, cluster)

    path = f'dataset/preprocessed/{cluster}/{mode}/dataset_regression'
    check_folder(path)

    devices_classes = ['mobile', 'desktop', 'tablet']
    actions_classes = ['show_impression', 'clickout item', 'interaction item rating', 'interaction item info',
           'interaction item image', 'interaction item deals', 'change of sort order', 'filter selection',
           'search for item', 'search for destination', 'search for poi']
    
    ## ======== TRAIN ======== ##
    # add the impressions as new interactions
    print('Adding impressions as new actions...')
    train_df, final_new_index = sess2vec.add_impressions_as_new_actions(train_df)
    print('Done!\n')

    # pad the sessions
    if pad_sessions_length > 0:
        print('Padding/truncating sessions...')
        train_df = sess2vec.pad_sessions(train_df, max_session_length=pad_sessions_length)
        print('Done!\n')

    if save_X_Y:
        print('Getting the last clickout of each session...')
        train_clickouts_df = sess2vec.get_last_clickout(train_df, index_name='index', rename_index='orig_index')
        train_clickouts_indices = train_clickouts_df.orig_index.values
        train_clickouts_indices.sort()
        print('Done!\n')

    # add the one-hot of the device
    print('Adding one-hot columns of device...', end=' ', flush=True)
    train_df = sess2vec.one_hot_df_column(train_df, 'device', classes=devices_classes)
    print('Done!\n')

    # add the one-hot of the action-type
    print('Adding one-hot columns of action_type...', end=' ', flush=True)
    train_df = sess2vec.one_hot_df_column(train_df, 'action_type', classes=actions_classes)
    print('Done!\n')

    TRAIN_LEN = train_df.shape[0]
    TRAIN_NAME = ''

    if save_X_Y:
        # set the columns to be placed in the labels file
        Y_COLUMNS = ['user_id','session_id','timestamp','step','reference']
    
        # join the accomodations one-hot features
        X_train_path = os.path.join(path, 'X_train.csv')
        if add_item_features:
            print('Joining the accomodations features...')
            sess2vec.add_accomodations_features(train_df.copy(), X_train_path, logic='skip', row_indices=train_clickouts_indices)
        else:
            # set the last clickouts to NaN and save the X dataframe
            backup_ref_serie = train_df.reference.values.copy()
            train_df.loc[train_clickouts_indices, 'reference'] = np.nan
            train_df.to_csv(X_train_path, index_label='orig_index', float_format='%.4f')
            train_df.reference = backup_ref_serie
            del backup_ref_serie
        
        Y_train_path = os.path.join(path, 'Y_train.csv')
        train_df = train_df[Y_COLUMNS]
        if add_item_features:
            sess2vec.add_accomodations_features(train_df.copy(), Y_train_path, logic='subset', row_indices=train_clickouts_indices)
        else:
            # set all clickouts to NaN except for the last clickouts and save the Y dataframe
            backup_ref_serie = train_df.loc[train_clickouts_indices].reference.copy()
            train_df.reference = np.nan
            train_df.loc[train_clickouts_indices, 'reference'] = backup_ref_serie
            train_df.to_csv(Y_train_path, index_label='orig_index', float_format='%.4f')

        # clean ram
        del train_clickouts_df
        del train_clickouts_indices
    else:
        TRAIN_NAME = 'train_vec.csv'
        train_path = os.path.join(path, TRAIN_NAME)
        train_df.to_csv(train_path, index_label='orig_index', float_format='%.4f')

    del train_df

    ## ======== TEST ======== ##
    print('Adding impressions as new actions...')
    test_df, _ = sess2vec.add_impressions_as_new_actions(test_df, final_new_index)
    print('Done!\n')

    # pad the sessions
    if pad_sessions_length > 0:
        print('Padding/truncating sessions...')
        test_df = sess2vec.pad_sessions(test_df, max_session_length=pad_sessions_length)
        print('Done!\n')

    print('Getting the last clickout of each session...')
    test_clickouts_df = sess2vec.get_last_clickout(test_df, index_name='index', rename_index='orig_index')
    test_clickouts_indices = test_clickouts_df.orig_index.values
    test_clickouts_indices.sort()
    print('Done!\n')

    # add the one-hot of the device
    print('Adding one-hot columns of device...', end=' ', flush=True)
    test_df = sess2vec.one_hot_df_column(test_df, 'device', classes=devices_classes)
    print('Done!\n')

    # add the one-hot of the action-type
    print('Adding one-hot columns of action_type...', end=' ', flush=True)
    test_df = sess2vec.one_hot_df_column(test_df, 'action_type', classes=actions_classes)
    print('Done!\n')

    TEST_LEN = test_df.shape[0]

    # join the accomodations one-hot features
    X_test_path = os.path.join(path, 'X_test.csv')
    if add_item_features:
        print('Joining the accomodations features...')
        sess2vec.add_accomodations_features(test_df.copy(), X_test_path, logic='skip', row_indices=test_clickouts_indices)
    else:
        # set the last clickouts to NaN and save the X dataframe
        backup_ref_serie = test_df.reference.values.copy()
        test_df.loc[test_clickouts_indices, 'reference'] = np.nan
        test_df.to_csv(X_test_path, index_label='orig_index', float_format='%.4f')
        #test_df.reference = backup_ref_serie
        del backup_ref_serie
    
    ## ======== CONFIG ======== ##
    # save the dataset config file that stores dataset length and the list of sparse columns
    features_cols = list(data.accomodations_one_hot().columns) if add_item_features else []    
    x_sparse_cols = devices_classes + actions_classes + features_cols
    datasetconfig.save_config(path, mode, cluster, TRAIN_LEN, TEST_LEN, train_name=TRAIN_NAME,
                            rows_per_sample=pad_sessions_length,
                            X_sparse_cols=x_sparse_cols, Y_sparse_cols=features_cols)
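
A hypothetical invocation (the mode/cluster values and the padding length here are assumptions, not taken from the source):

# hypothetical usage: build the regression dataset with sessions padded to 70 steps
create_dataset_for_regression(mode='small', cluster='no_cluster',
                              pad_sessions_length=70,
                              add_item_features=True, save_X_Y=True)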