Example #1
    def _fit(self, mode):
        """
        has target indices equal to the 10% of the session with:
        - no num ref
        - more than 1 step
        but anyway we train on all of them ;)
        """

        def RepresentsInt(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        train = data.train_df('small')

        test = data.test_df('small')
        tgt_indices = data.target_indices('small')

        real_test_to_keep = []
        for idx in tgt_indices:
            usr_sess_indices = []
            theres_int = False
            a_user = test.at[idx, 'user_id']
            a_sess = test.at[idx, 'session_id']
            usr_sess_indices.append(idx)
            j = idx-1
            pos_moved = 0
            while j >= 0:
                try:
                    new_user = test.at[j, 'user_id']
                    new_sess = test.at[j, 'session_id']
                    if new_user == a_user and new_sess == a_sess:
                        usr_sess_indices.append(j)
                        reference = test.at[j, 'reference']
                        if RepresentsInt(reference):
                            theres_int = True
                        j -= 1
                        pos_moved += 1
                    else:
                        if not (pos_moved == 0 or theres_int):
                            real_test_to_keep += sorted(usr_sess_indices)
                        break
                except KeyError:
                    if j < test.index.values[0]:
                        if not (pos_moved == 0 or theres_int):
                            real_test_to_keep += sorted(usr_sess_indices)
                        break
                    else:
                        j -= 1
        
        self.train_indices = train.index.values
        real_test_indices = retrieve_real_test_indices(mode, 'no_cluster')
        all_test_indices = data.test_df(mode).index.values
        self.test_indices = sorted(list(set(all_test_indices) - set(real_test_indices)) + real_test_to_keep)
        self.target_indices = sorted(list(set(self.test_indices) & set(tgt_indices)))
    def fit(self):

        df_test = data.test_df(self.mode, cluster=self.cluster)

        print("{}: creating grouped sessions with interaction lists".format(self.name))
        session_groups = self.get_groupby_sessions_references(data.test_df(self.mode, cluster=self.cluster))

        # Getting target sessions
        target_indices = data.target_indices(self.mode, self.cluster)

        df_test_target = df_test[df_test.index.isin(target_indices)]

        # I must work with session_ids, since I'm interested in the last interactions of each session
        df_test_target = df_test_target.set_index("session_id")

        # then I create a dictionary to re-map sessions to indices
        if len(df_test_target.index) != len(target_indices):
            print("Indices not same lenght of sessions, go get some coffee...")
            return

        self.dictionary_indices = dict(zip(df_test_target.index, target_indices))

        list_sessions = session_groups.index

        recs_tuples = []

        print("{}: fitting the model".format(self.name))
        for i in tqdm(df_test_target.index):
            # Check if it is a session without interactions
            if i not in list_sessions:
                recs_tuples.append((self.dictionary_indices.get(i), []))
            else:
                # Get the interacted elements of the session, with no duplicates
                interacted_elements = np.asarray(session_groups.at[i, "sequence"])

                interacted_elements = np.asarray(self._set_no_reordering(x for x in interacted_elements))

                impressions = np.asarray(df_test_target.at[i, "impressions"].split("|"))

                # First I want to be sure the impressions contain all the interacted elements (if not, they must be cut off from the relevant items)
                mask_only_in_impression = np.in1d(interacted_elements, impressions, assume_unique=True)

                interacted_elements = interacted_elements[mask_only_in_impression]

                # I append the last interacted elements first (so I invert the order of relevant_elements!)
                real_recommended = np.flipud(interacted_elements)

                real_recommended = real_recommended.astype(int)

                recs_tuples.append(
                    (self.dictionary_indices.get(i), list(real_recommended)[:self.k_first_only_to_recommend]))

        self.recs_batch = recs_tuples
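
The masking and reversal above is the core of this recommender; a minimal sketch with toy arrays (all names here are illustrative, not from the codebase):

import numpy as np

# interacted items in chronological order, duplicates already removed
interacted = np.asarray([11, 42, 7])
impressions = np.asarray([42, 7, 99, 11])

# keep only interacted items that also appear among the impressions
kept = interacted[np.in1d(interacted, impressions, assume_unique=True)]

# most recent interaction first
recs = list(np.flipud(kept).astype(int))
print(recs)  # [7, 42, 11]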
    def extract_feature(self):

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        temp = df[df.action_type == "clickout item"]
        temp = temp.drop_duplicates("session_id", keep="last")
        temp = temp[["user_id", "session_id", "step", "impressions"]]
        actions = list()
        for index, row in tqdm(temp.iterrows(), desc="Scanning clickouts"):
            if index > 0:
                if int(row.step) == 1:
                    actions.append(0)
                else:
                    impression = list(map(int, row.impressions.split("|")))[0]
                    reference = df.loc[index - 1, "reference"]
                    if isinstance(reference, str) and reference.isdigit() and int(reference) == impression:
                        actions.append(1)
                    else:
                        actions.append(0)
            else:
                actions.append(0)

        temp = temp.drop(["step", "impressions"], axis=1)
        temp["last_action_involving_first_impression"] = actions
        return temp
Example #4
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # remove last clickouts and the last part of each session
        new_df = remove_last_part_of_clk_sessions(df)
        new_df = new_df.drop(find(new_df))
        no_last_clks_numeric = new_df[
            new_df.reference.str.isnumeric() == True][[
                'user_id', 'session_id', 'action_type', 'reference'
            ]]
        # we want to make it fast, avoid any loops...
        # simply drop duplicates and keep the last occurrence
        # of the tuple user-session-item :D
        last_actions = no_last_clks_numeric.drop_duplicates(
            ['user_id', 'session_id', 'reference'], keep='last')
        last_actions = last_actions.rename(
            columns={
                'reference': 'item_id',
                'action_type': 'last_action_involving_impression'
            })
        last_actions.item_id = last_actions.item_id.astype(int)
        # get last clickouts and expand
        last_clk = df.loc[find(df)]
        clk_expanded = expand_impressions(last_clk)[[
            'user_id', 'session_id', 'item_id'
        ]]
        # now simply merge and fill NaNs with 'no_action' as in the original feature
        feature = pd.merge(clk_expanded,
                           last_actions,
                           how='left',
                           on=['user_id', 'session_id', 'item_id'])
        feature.last_action_involving_impression = feature.last_action_involving_impression.astype(
            object).fillna('no_action')
        return feature
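
The loop-free trick in the comments relies on drop_duplicates keeping row order and the last occurrence per key; a minimal sketch on a toy frame:

import pandas as pd

df = pd.DataFrame({
    'user_id':     ['u1', 'u1', 'u1', 'u2'],
    'session_id':  ['s1', 's1', 's1', 's2'],
    'reference':   ['42', '42', '7', '42'],
    'action_type': ['interaction item image', 'clickout item',
                    'interaction item info', 'clickout item'],
})

# keep only the last action per (user, session, item)
last = df.drop_duplicates(['user_id', 'session_id', 'reference'], keep='last')
print(last.action_type.tolist())
# ['clickout item', 'interaction item info', 'clickout item']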
    def _fit(self, mode):
        self.train_indices = data.train_df(mode).index.values

        df = data.test_df(mode)
        self.test_indices = df.index.values

        just_missing_refs = df[df['reference'].isnull()]
        just_missing_refs = just_missing_refs[just_missing_refs['action_type']
                                              == 'clickout item']
        idx_last_ref_numeric = []
        for idx, row in just_missing_refs.iterrows():
            sess = row['session_id']
            i = 1
            while True:
                if not self.existsIndex(df, idx - i):
                    break
                prev_row = df.loc[idx - i]
                if prev_row['session_id'] != sess:
                    break
                if self.RepresentsInt(prev_row['reference']):
                    if i == 1:
                        idx_last_ref_numeric.append(idx)
                        break
                    else:
                        break
                else:
                    i += 1

        self.target_indices = idx_last_ref_numeric
Example #6
    def fit(self):
        """
        Create list of tuples for recommendations ordering them by impressions
        """

        df_test = data.test_df(self.mode)

        target_indices = data.target_indices(self.mode, self.cluster)

        df_test_target = df_test[df_test.index.isin(target_indices)]

        # Initializing list of recs
        recs_tuples = []

        for i in tqdm(df_test_target.index):

            impressions = df_test_target.at[i, "impressions"]
            impressions = list(map(int, impressions.split('|')))

            prices = df_test_target.at[i, "prices"]
            prices = list(map(int, prices.split('|')))

            temp_dict = {}

            for j in range(len(impressions)):
                temp_dict[impressions[j]] = int(prices[j])

            ordered_recs = sorted(temp_dict, key=temp_dict.__getitem__)

            recs_tuples.append((i, ordered_recs))

        self.recs_batch = recs_tuples

        return recs_tuples
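
Stripped of the DataFrame plumbing, the ordering step is just a sort of impression ids by price; a standalone sketch with toy values:

# toy impressions and prices, as parsed from the '|'-separated columns
impressions = [901, 902, 903]
prices = [120, 80, 95]

price_of = dict(zip(impressions, prices))
ordered_recs = sorted(price_of, key=price_of.__getitem__)  # cheapest first
print(ordered_recs)  # [902, 903, 901]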
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        target_indices = list(
            df[df.action_type == "clickout item"].drop_duplicates(
                "session_id", keep="last").index)

        temp = df.loc[target_indices]
        avg_prices = list()
        min_prices = list()
        max_prices = list()
        for t in tqdm(zip(temp["session_id"], temp["prices"]),
                      desc="Retreiving first impression's price per clickout"):
            prices = list(map(int, t[1].split("|")))
            prices = prices[1:]
            if len(prices) > 0:
                avg_prices.append(mean(prices))
                min_prices.append(min(prices))
                max_prices.append(max(prices))
            else:
                avg_prices.append(0)
                min_prices.append(0)
                max_prices.append(0)

        if len(target_indices) != len(avg_prices):
            print("Something went wrong, blame Piccio")
            exit(69)

        temp = df.loc[target_indices, ["user_id", "session_id"]]
        temp["avg_prices_other_impressions"] = avg_prices
        temp["min_price"] = min_prices
        temp["max_price"] = max_prices
        return temp
    def extract_feature(self):
        self.current_directory = Path(__file__).absolute().parent
        self.data_dir = self.current_directory.joinpath(
            '..', '..', 'stacking', self.mode)

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_indices = find(df)

        # extract scores
        self.train_dir = self.data_dir.joinpath('test')
        for file in glob.glob(str(self.train_dir) + '/rnn*'):
            rnn = np.load(file)
            rnn = pd.DataFrame(
                rnn, columns=['index', 'item_recommendations', 'scores'])
            rnn = rnn.astype({'index': int})
            rnn = rnn[rnn['index'].isin(last_indices)]

        rnn_idx = list(rnn['index'])
        print(f'Rnn indices are : {len(set(rnn_idx))}')
        print(f'Last indices are : {len(last_indices)}')
        common = set(rnn_idx) & set(last_indices)
        print(f'In common : {len(common)}')

        t = assign_score(rnn, 'rnn')
        t = t.sort_values(by='index')

        df['index'] = df.index.values
        df = df[['user_id', 'session_id', 'index']]
        df = pd.merge(t, df, how='left', on=['index'])
        num_idx = len(set(df['index'].values))
        print(num_idx)
        return df[['user_id', 'session_id', 'item_id', 'score_rnn']]
    def extract_feature(self):

        feature = TopPopPerImpression(mode=self.mode, cluster=self.cluster).read_feature()
        items = dict()
        for t in tqdm(zip(feature.item_id, feature.top_pop_per_impression), desc="Creating item dict..."):
            items[t[0]] = t[1]
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        target_indices = list(df[df.action_type == "clickout item"].drop_duplicates("session_id", keep="last").index)
        temp = df[df.index.isin(target_indices)]
        first_pop = list()
        max_pop_in_impressions = list()

        for t in tqdm(temp.impressions):
            impressions = list(map(int, t.split("|")))
            fi = impressions[0]
            if fi in items:
                fi_pop = items[fi]
            else:
                fi_pop = 0
            first_pop.append(fi_pop)
            max_pop = fi_pop
            for i in impressions[1:]:
                if i in items:
                    pop = items[i]
                    max_pop = max(pop, max_pop)
            max_pop_in_impressions.append(max_pop)

        temp = temp[["user_id", "session_id"]]
        temp["pop_first_impression"] = first_pop
        temp["max_pop_in_impressions"] = max_pop_in_impressions
        return temp
Example #10
    def extract_feature(self):
        def func(x):
            change_of_sort_order_actions = x[x['action_type'] ==
                                             'change of sort order']
            if len(change_of_sort_order_actions) > 0:
                y = x[(x['action_type'] == 'clickout item')]
                if len(y) > 0:
                    clk = y.tail(1)
                    head_index = x.head(1).index
                    x = x.loc[head_index.values[0]:clk.index.values[0] - 1]
                    change_of_sort_order_actions = x[x['action_type'] ==
                                                     'change of sort order']
                    if len(change_of_sort_order_actions) > 0:
                        change_of_sort_order_actions = change_of_sort_order_actions.tail(
                            1)
                        return change_of_sort_order_actions[
                            'reference'].values[0]
            return 'our recommendations'

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        s = df.groupby(['user_id', 'session_id']).progress_apply(func)
        return pd.DataFrame({
            'user_id': [x[0] for x in s.index.values],
            'session_id': [x[1] for x in s.index.values],
            'sort_order_active_when_clickout': s.values
        })
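
Because func returns one scalar per group, the groupby result is a Series indexed by (user_id, session_id) tuples; the final DataFrame is rebuilt by unpacking that index. A toy version of the pattern:

import pandas as pd

df = pd.DataFrame({
    'user_id':    ['u1', 'u1', 'u2'],
    'session_id': ['s1', 's1', 's2'],
    'action_type': ['change of sort order', 'clickout item', 'clickout item'],
})

s = df.groupby(['user_id', 'session_id']).apply(lambda g: g.shape[0])
flat = pd.DataFrame({
    'user_id':    [k[0] for k in s.index.values],
    'session_id': [k[1] for k in s.index.values],
    'value':      s.values,
})
print(flat)  # one row per (user, session) pair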
def supersampling(mode):
    print("Supersampling for mode: {}".format(mode))
    train = data.train_df(mode)
    class_to_sessions = get_class_to_sessions_dict(train)
    session_to_indices = get_session_to_indices_dict(train)
    sessions_to_be_resampled = resample_session(class_to_sessions.copy(),
                                                train)
    new = duplicate_sessions(sessions_to_be_resampled.copy(), train,
                             session_to_indices)
    test = data.test_df(mode)
    max_test_index = max(test.index)
    max_train_index = max(train.index)
    max_index = max(max_test_index, max_train_index)
    new.index += max_index + 1
    new = pd.concat([train, new])
    train_len = len(new)
    old_starting_index = test.index[0]
    new = pd.concat([new, test])
    print("Supersampling ended for mode={}, saving df".format(mode))
    new_train = new.iloc[:train_len]
    new_test = new.iloc[train_len:]
    #    new_starting_index = new_test.index[0]
    #    offset = new_starting_index - old_starting_index
    #    target_indices += offset
    target_indices = data.target_indices(mode, "no_cluster")
    np.save(path + "/" + mode + "/target_indices", target_indices)
    new_train.to_csv(path + "/" + mode + "/train.csv", index=True)
    new_test.to_csv(path + "/" + mode + "/test.csv", index=True)
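
The index arithmetic is what keeps the duplicated rows from colliding with existing labels before the concat; the essential pattern in isolation, on toy frames:

import pandas as pd

train = pd.DataFrame({'x': [1, 2]}, index=[0, 1])
test = pd.DataFrame({'x': [3]}, index=[2])
dup = train.iloc[[1]].copy()   # a resampled row, index still 1

# shift past every existing label so indices stay unique
dup.index += max(train.index.max(), test.index.max()) + 1
combined = pd.concat([train, dup])
print(combined.index.tolist())  # [0, 1, 4] -- no collisions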
def preprocess_cv(mode='full'):
    def save_folds(df, user_session_df, train_index, test_index, count, mode):
        u_s_train = list(
            user_session_df.loc[train_index]['user_session'].values)
        u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

        path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
        check_folder(path)

        train = df[df['user_session'].isin(u_s_train)]
        train = train.drop(['user_session'], axis=1)
        train.to_csv(os.path.join(path, 'train.csv'))
        train_indices = train.index.values
        np.save(os.path.join(path, 'train_indices'), train_indices)

        test = df[df['user_session'].isin(u_s_test)]
        target_indices = sorted(find(test))
        test.loc[target_indices, 'reference'] = np.nan
        test = test.drop(['user_session'], axis=1)
        test.to_csv(os.path.join(path, 'test.csv'))
        test_indices = test.index.values
        np.save(os.path.join(path, 'test_indices'), test_indices)
        np.save(os.path.join(path, 'target_indices'), target_indices)

        print(f'Train shape : {train.shape} , Test shape : {test.shape}')
        print(f'Last clickout indices : {len(target_indices)}')

    train_df = data.train_df(mode=mode, cluster='no_cluster')
    train_df['user_session'] = train_df['user_id'].values + '_' + train_df[
        'session_id'].values

    test_df = data.test_df(mode=mode, cluster='no_cluster')
    test_df['user_session'] = test_df['user_id'].values + '_' + test_df[
        'session_id'].values

    df = pd.concat([train_df, test_df])

    # extract user_session referring to target_indices
    target_indices = data.target_indices(mode=mode, cluster='no_cluster')
    test_target_u_s = test_df.loc[target_indices].drop_duplicates(
        'user_session')['user_session'].to_list()
    print(f'Number of user_session in target_indices : {len(test_target_u_s)}')

    # remove those sessions from df
    df = df[~df['user_session'].isin(test_target_u_s)]

    #df['user_session'] = df['user_id'].values + '_' + df['session_id'].values
    user_session_df = df.drop_duplicates('user_session')
    user_session_df = user_session_df.reset_index(drop=True)
    print(
        f'Number of user_session NOT in target_indices : {user_session_df.shape[0]}'
    )

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for i, (train_index, test_index) in enumerate(kf.split(user_session_df)):
        print(
            f' train indices : {len(train_index)}, test indices : {len(test_index)}'
        )
        save_folds(df, user_session_df, train_index, test_index, i, mode)
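
The fold split runs over unique user_session keys rather than raw interaction rows, so a session can never straddle two folds; the essential pattern, reduced to toy data:

import pandas as pd
from sklearn.model_selection import KFold

df = pd.DataFrame({
    'user_session': ['a_1', 'a_1', 'b_1', 'b_1', 'c_1', 'd_1'],
    'step':         [1, 2, 1, 2, 1, 1],
})
keys = df.drop_duplicates('user_session').reset_index(drop=True)

kf = KFold(n_splits=2, shuffle=True, random_state=42)
for train_index, test_index in kf.split(keys):
    fold_train = df[df['user_session'].isin(keys.loc[train_index, 'user_session'])]
    # every row of a session lands on the same side of the split
    print(sorted(fold_train['user_session'].unique()))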
Example #13
    def extract_feature(self):

        def func(x):

            def last_important_steps(x):

                y = x[x.action_type == 'filter selection'].tail(1)
                i = x[x.action_type == 'search for item'].tail(1)
                d = x[x.action_type == 'search for destination'].tail(1)
                p = x[x.action_type == 'search for poi'].tail(1)
                steps = [y, i, d, p]
                _from = 1
                _from_serie = x.head(1)
                for s in steps:
                    if not s.step.empty:
                        if s.step.values[0] > _from:
                            _from = s.step.values[0]
                            _from_serie = s
                return pd.Series({'session_length_timestamp': int(x.tail(1)['timestamp'].values[0]) -
                                                              int(_from_serie['timestamp'].values[0]),
                                  'session_length_step': int(x.tail(1).step) - int(_from) + 1})

            _important_steps = x.groupby(['user_id', 'session_id']).progress_apply(last_important_steps)
            return pd.DataFrame(_important_steps).reset_index()

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        s = func(df)
        return s
Example #14
    def _fit(self, mode):
        """
        Cluster and predict for the test sessions without any numerical reference interactions

        self.train_indices: will contain all the train interactions
        self.test_indices: will contain all the test interactions
        self.target_indices: will contain the test interactions of sessions without
                            any other numerical reference interaction
        """
        # use only train of only cluster
        train = data.train_df(mode)
        train_groups = train.groupby(['session_id', 'user_id'],
                                     as_index=False).progress_apply(
                                         self.func_remove_steps_over_clk)

        self.train_indices = [x[1] for x in train_groups.index.values]

        # These are the train groups I need; now keep only the last clickout as part of each session

        test = data.test_df(mode)

        test_df = test.groupby(['session_id', 'user_id'])

        test_df = test_df.progress_apply(self.func_remove_steps_over_clk_test)

        if test_df.shape[0] > 0:
            self.target_indices = test_df[test_df.action_type ==
                                          'clickout item'].index.values
            # test_df has only those indices belonging to desired sessions cluster
            self.test_indices = list(list(zip(*test_df.index.values))[2])
def create_weights_position(train_df, mode,cluster):
    train = data.train_df(mode, cluster)
    test = data.test_df(mode, cluster)
    df = pd.concat([train, test])
    # get for each user-session the position of the clicked item
    df_clks = df[(df['reference'].str.isnumeric()==True)&(df['action_type']=='clickout item')][['user_id','session_id','reference','impressions']]
    df_clks.impressions = df_clks.impressions.str.split('|')
    new_col = []
    for t in tqdm(zip(df_clks.reference, df_clks.impressions)):
        if t[0] in t[1]:
            new_col.append(t[1].index(t[0])+1)
        else:
            new_col.append(-1)
    df_clks['pos_clicked'] = new_col
    pos_clicked_list = df_clks.pos_clicked.tolist()
    # create dictionary {pos:score}
    dict_pos_score = {}
    for i in tqdm(range(1,26)):
        dict_pos_score[i] = 1-(pos_clicked_list.count(i)/len(pos_clicked_list)) # the function is 1-(#pos/tot_rows)
    # group per user-session
    group = train_df.drop_duplicates(['user_id','session_id'])[['user_id','session_id']].reset_index(drop=True)
    # assign weight
    gr = train_df[train_df.label==1][['user_id','session_id','impression_position']]
    new_col = []
    for p in gr.impression_position:
        if p not in range(1,26):
            new_col.append(0)
        else:
            new_col.append(dict_pos_score[p])
    gr['weight'] = new_col
    final = pd.merge(group, gr, how='left', on=['user_id','session_id']).fillna(0)
    sample_weights = final['weight'].values
    return sample_weights
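
The per-position weight is one minus the empirical click share of that position, so heavily clicked positions (typically the first) get down-weighted; a worked example of the formula on toy data:

pos_clicked_list = [1, 1, 1, 2, 3]   # observed clicked positions
total = len(pos_clicked_list)

dict_pos_score = {i: 1 - pos_clicked_list.count(i) / total
                  for i in range(1, 26)}
print(dict_pos_score[1])   # 1 - 3/5 = 0.4
print(dict_pos_score[2])   # 1 - 1/5 = 0.8
print(dict_pos_score[9])   # 1 - 0/5 = 1.0 (never clicked)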
    def extract_feature(self):
        o = ImpressionFeature(self.mode)
        f = o.read_feature()
        f = f.drop(['properties'], axis=1)
        f['popularity'] = 0
        pop = dict(zip(f.item_id.values, f.popularity.values))

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        df_dropped_last_clickouts = df.drop(last_clickout_indices)
        df_no_last_clickouts = df_dropped_last_clickouts[
            (df_dropped_last_clickouts.action_type == 'clickout item')
            & ~(df_dropped_last_clickouts.reference.isnull())]
        references = df_no_last_clickouts.reference.values

        for r in references:
            pop[int(r)] += 1

        final_df = pd.DataFrame(
            list(pop.items()),
            columns=['item_id', 'top_pop_interaction_clickout_per_impression'])

        return final_df
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','action_type','impressions']]
        reference_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type == 'clickout item')]

        df_item_clicks = (
            reference_rows
            .groupby(["reference"])
            .size()
            .reset_index(name="n_interactions_per_item")
        )
        df_item_clicks = df_item_clicks.rename(columns={'reference':'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)

        clk_expanded = expand_impressions(clickout_rows)
        final_feature = pd.merge(clk_expanded, df_item_clicks, how='left', on=['item_id']).fillna(0)
        final_feature.n_interactions_per_item = final_feature.n_interactions_per_item.astype(int)
        final_feature = final_feature.drop(['index'], axis=1)

        final_feature.reference = final_feature.reference.astype(int)
        new_column = []
        for t in zip(final_feature.item_id, final_feature.reference, final_feature.n_interactions_per_item):
            if t[0] == t[1]:
                new_column.append(int(t[2]-1))
            else:
                new_column.append(int(t[2]))
        final_feature['personalized_popularity'] = new_column

        final_feature_reduced = final_feature[['user_id','session_id','item_id','personalized_popularity']]

        return final_feature_reduced
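
The decrement at the end removes the session's own clickout from the item's global count, so an item gets no credit from the very click being predicted; the rule in isolation, with toy counts:

n_interactions = {42: 5, 7: 2}   # global clickout counts per item
clicked_reference = 42

for item_id in [42, 7, 99]:
    count = n_interactions.get(item_id, 0)
    if item_id == clicked_reference:
        count -= 1               # exclude the clickout we are predicting
    print(item_id, count)        # 42 -> 4, 7 -> 2, 99 -> 0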
    def recommend_batch(self):

        svm_filename = 'svmlight_test.txt'
        _path = self.data_dir.joinpath(svm_filename)

        X_test, y_test = load_svmlight_file(str(_path))

        target_indices = data.target_indices(self.mode, self.cluster)
        target_indices.sort()

        test = data.test_df('small', 'no_cluster')
        print('data for test ready')

        scores = list(self.model.predict(X_test))

        final_predictions = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(map(int, test.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            _, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            count = count + len(impressions)

        return final_predictions
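
The model emits one score per (clickout, impression) row of the svmlight file, so the flat score list is consumed in chunks sized by each impression list; a toy version of the slicing-and-sorting step:

scores = [0.2, 0.9, 0.5, 0.7, 0.1]          # flat, row-aligned scores
impressions_per_clickout = [[11, 22, 33], [44, 55]]

count = 0
final_predictions = []
for index, impressions in enumerate(impressions_per_clickout):
    chunk = scores[count:count + len(impressions)]
    couples = sorted(zip(chunk, impressions), key=lambda x: x[0], reverse=True)
    _, sorted_impr = zip(*couples)
    final_predictions.append((index, list(sorted_impr)))
    count += len(impressions)

print(final_predictions)  # [(0, [22, 33, 11]), (1, [44, 55])]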
Example #19
    def extract_feature(self):
        def count_freq(x):
            r = []
            y = x[x['action_type'] == 'clickout item']
            if len(y) > 0:
                clk = y.tail(1)
                x = x[x['step'] < int(clk['step'].values[0])]
                list_impressions = list(
                    x[~x.impressions.isnull()].impressions.values)
                impressions = ('|'.join(list_impressions)).split('|')
                impr = clk.impressions.values[0].split('|')
                for i in impr:
                    r.append((i, impressions.count(i) + 1))
            return r

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        df = df.drop([
            'timestamp', 'reference', 'platform', 'city', 'device',
            'current_filters', 'prices'
        ],
                     axis=1)
        s = df.groupby(['user_id', 'session_id']).progress_apply(count_freq)
        s = s.apply(pd.Series).reset_index().melt(
            id_vars=['user_id', 'session_id'], value_name='tuple').sort_values(
                by=['user_id', 'session_id']).dropna()
        s[['item_id', 'times_impression_appeared_in_clickouts_session'
           ]] = pd.DataFrame(s['tuple'].tolist(), index=s.index)
        s = s.drop(['variable', 'tuple'], axis=1)
        s = s.reset_index(drop=True)
        return s
Example #20
def merge_consecutive_equal_actions():
    tqdm.pandas()
    test = data.test_df('full')
    test_grouped_by_session_id = test.groupby('session_id')
    merged = test_grouped_by_session_id.progress_apply(
        _merge_consecutive_equal_actions)
    cf.check_folder('dataset/cleaned_csv')
    merged.to_csv('dataset/cleaned_csv/test.csv')
def train_indices(mode='local', cluster='no_cluster'):
    df_train = data.train_df(mode=mode, cluster=cluster)
    df_test = data.test_df(mode=mode, cluster=cluster)
    target_indices = data.target_indices(mode=mode, cluster=cluster)
    df = pd.concat([df_train, df_test])
    idx = find_last_clickout_indices(df)
    train_idx = set(idx) - set(target_indices)
    return train_idx
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        idxs_click = find_last_clickout_indices(df)
        temp = df[['user_id', 'session_id', 'step', 'timestamp']]
        session_id_l = []
        length_step_l = []
        length_timestamp_l = []
        timestamp_last_action_l = []
        final_timestamp_l = []
        user_id_l = []
        for i in tqdm(idxs_click):
            user_id = temp.at[i, 'user_id']
            session_id = temp.at[i, 'session_id']
            step = temp.at[i, 'step']
            f_timestamp = temp.at[i, 'timestamp']
            i_timestamp = temp.at[i - (step - 1), 'timestamp']
            if step > 1:
                timestamp_last_action = temp.at[i - 1, 'timestamp']
            else:
                timestamp_last_action = f_timestamp

            user_id_l.append(user_id)
            session_id_l.append(session_id)
            length_step_l.append(int(step))
            length_timestamp_l.append(int(f_timestamp - i_timestamp))
            timestamp_last_action_l.append(int(timestamp_last_action))
            final_timestamp_l.append(int(f_timestamp))
        final_df = pd.DataFrame({
            'user_id': user_id_l,
            'session_id': session_id_l,
            'length_step': length_step_l,
            'length_timestamp': length_timestamp_l,
            'timestamp_last_action': timestamp_last_action_l,
            'final_timestamp': final_timestamp_l
        })
        final_df['mean_time_action'] = final_df['length_timestamp'] / final_df[
            'length_step']

        final_df['elapsed_last_action_click'] = final_df[
            'final_timestamp'] - final_df['timestamp_last_action']

        final_df['elapsed_last_action_click_log'] = np.log(
            final_df['elapsed_last_action_click'] + 1)

        final_df['variance_last_action'] = (
            final_df['elapsed_last_action_click'] -
            final_df['mean_time_action'])**2

        final_df['std_last_action'] = abs(
            final_df['elapsed_last_action_click'] -
            final_df['mean_time_action'])

        final_df.drop(['timestamp_last_action', 'final_timestamp', 'mean_time_action', \
                       'length_step', 'length_timestamp', 'elapsed_last_action_click'], axis=1, inplace=True)
        return final_df
    def fit_predict(self, multithreading=True, save_folder='scores/'):

        if multithreading:
            self.scores = Parallel(backend='multiprocessing',
                                   n_jobs=-1,
                                   max_nbytes=None)(delayed(self._fit_model)(i)
                                                    for i in range(5))

            print(len(self.scores))
        else:
            self.scores = [self._fit_model(i) for i in range(5)]
            print(len(self.scores))

        model = self.model_class(mode=self.mode,
                                 cluster='no_cluster',
                                 **self.init_params)
        model.fit()
        scores_test = model.get_scores_batch()
        self.scores.append(scores_test)

        self.scores = [item for sublist in self.scores for item in sublist]
        scores = pd.DataFrame(
            self.scores, columns=['index', 'item_recommendations', 'scores'])
        scores = scores.sort_values(by='index')
        print(scores)
        idx_scores = set(scores['index'].values)

        train_full = data.train_df(mode='full', cluster='no_cluster')
        test_full = data.test_df(mode='full', cluster='no_cluster')
        full = pd.concat([train_full, test_full])
        full = full[['user_id', 'session_id', 'action_type']]

        last_clk_full = full.loc[idx_scores]

        # checking that all rows are clickouts
        num_not_clk_row = last_clk_full[
            last_clk_full['action_type'] != 'clickout item'].shape[0]
        print(f'Number of not clickout rows is : {num_not_clk_row}')
        if num_not_clk_row != 0:
            print("Error, some indices are not clickouts")

        last_clk_full = last_clk_full.drop(['action_type'], axis=1)

        last_clk_full['index'] = last_clk_full.index
        merged = last_clk_full.merge(scores, on=['index'])
        model_name = model.name
        df = assign_score(merged, self.model_name)
        df = df.drop(['index'], axis=1)

        if save_folder is not None:
            check_folder(save_folder)
            filepath = os.path.join(save_folder, model_name + '.csv.gz')
            print('Saving scores to', filepath, end=' ', flush=True)
            df.to_csv(filepath, index=False, compression='gzip')
            print('Done!', flush=True)

        return df
Example #24
    def extract_feature(self):

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        temp = df.fillna('0')
        idxs_click = sorted(find_last_clickout_indices(temp))
        idxs_numeric_reference = temp[temp['reference'].str.isnumeric() == True].index

        count = 0
        last_click = idxs_click[0]

        impr_features = {}
        impr_feature = []
        for i in tqdm(sorted(idxs_numeric_reference)):
            if i == last_click:
                impressions = list(map(int, temp.at[i, 'impressions'].split('|')))
                click_timestamp = temp.at[i, 'timestamp']
                click_step = temp.at[i, 'step']
                for impr in impressions:
                    if impr not in impr_features:
                        impr_feature.append({'num_interactions_impr': 0, 'step_from_last_interaction': -1,
                                             'timestamp_from_last_interaction': -1,
                                             'last_action_type_with_impr': 'None'})
                    else:
                        impr_features[impr]['timestamp_from_last_interaction'] = click_timestamp - impr_features[impr][
                            'timestamp_from_last_interaction']
                        impr_features[impr]['step_from_last_interaction'] = click_step - impr_features[impr][
                            'step_from_last_interaction']
                        impr_feature.append(impr_features[impr])
                impr_features = {}
                count += 1
                if count < len(idxs_click):
                    last_click = idxs_click[count]
                continue
            ref = int(temp.at[i, 'reference'])
            if ref in impr_features:
                impr_features[ref]['num_interactions_impr'] += 1
                impr_features[ref]['step_from_last_interaction'] = df.at[i, 'step']
                impr_features[ref]['timestamp_from_last_interaction'] = df.at[i, 'timestamp']
                impr_features[ref]['last_action_type_with_impr'] = df.at[i, 'action_type']
            else:
                impr_features[ref] = {'num_interactions_impr': 1, 'step_from_last_interaction': df.at[i, 'step'],
                                      'timestamp_from_last_interaction': df.at[i, 'timestamp'],
                                      'last_action_type_with_impr': df.at[i, 'action_type']}

        final_df = expand_impressions(temp[['user_id', 'session_id', 'impressions']].loc[idxs_click])
        print(len(final_df))
        print(len(impr_feature))
        final_df['dict'] = impr_feature

        features_df = pd.DataFrame(final_df.progress_apply(lambda x: tuple(x['dict'].values()), axis=1).tolist(),
                                   columns=list(final_df.iloc[0].dict.keys()))
        final_df_ = pd.concat([final_df, features_df], axis=1).drop('dict', axis=1)
        final_df_ = final_df_.drop(['num_interactions_impr', 'last_action_type_with_impr'], axis=1)
        return final_df_
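
Between clickouts the dictionary stores the absolute step/timestamp of the latest interaction per item; at the clickout these become distances from the click. The bookkeeping, reduced to a single toy item:

# state accumulated while scanning numeric-reference rows
impr_features = {42: {'num_interactions_impr': 2,
                      'step_from_last_interaction': 3,
                      'timestamp_from_last_interaction': 1000}}

# at the clickout row, absolute values turn into distances from the click
click_step, click_timestamp = 5, 1060
feat = impr_features[42]
feat['step_from_last_interaction'] = click_step - feat['step_from_last_interaction']
feat['timestamp_from_last_interaction'] = click_timestamp - feat['timestamp_from_last_interaction']
print(feat)  # 2 steps and 60 seconds since the last interaction with item 42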
    def extract_feature(self):

        list_of_sorting_filters_wout_pop = [
            'Sort by Price', 'Sort by Distance', 'Sort by Rating',
            'Best Value', 'Focus on Rating', 'Focus on Distance'
        ]

        list_of_sorting_filters = [
            'Sort by Price', 'Sort by Distance', 'Sort by Rating',
            'Best Value', 'Focus on Rating', 'Focus on Distance',
            'Sort by Popularity'
        ]

        def mask_sorting(x):
            if np.isin(x, list_of_sorting_filters_wout_pop).any():
                return x
            else:
                return ['Sort by Popularity']

        start = time.time()
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        indices_last_clks = find(df)
        d = df[df.action_type == 'clickout item'].drop(indices_last_clks)
        d_splitted = d.current_filters.progress_apply(
            lambda x: str(x).split('|'))
        md = d_splitted.progress_apply(mask_sorting)
        df_f = df.loc[md.index]
        df_ref = df_f.reference
        dict_ref_to_filters = dict(
            zip(df_ref.unique(), [dict(zip(list_of_sorting_filters, np.zeros(len(list_of_sorting_filters))))\
                                     for i in range(len(df_ref.unique()))]))

        for index, row in tqdm(df_f.iterrows(), total=df_f.shape[0]):
            for i in md.loc[index]:
                if i in list_of_sorting_filters:
                    dict_ref_to_filters[row.reference][i] += 1
        df_feature = pd.DataFrame.from_dict(dict_ref_to_filters,
                                            orient='index')
        df_feature = df_feature.astype(int).reset_index().rename(
            index=str, columns={"index": "item_id"})
        set_of_not_clicked_items = set(data.accomodations_df().item_id) - set(
            df_feature.item_id)
        extension = pd.DataFrame(data=sorted(
            [i for i in set_of_not_clicked_items]),
                                 columns=['item_id'])
        extd = pd.concat([df_feature, extension], ignore_index=True, sort=True)
        f = extd.fillna(0).reset_index().drop(columns=['index'])
        feature = f[np.insert(f.columns[:-1].values, 0,
                              f.columns[-1])].astype(int)

        _time = time.time() - start
        elapsed = time.strftime('%Mm %Ss', time.gmtime(_time))
        print(f"elapsed in: {elapsed}")
        return feature
Example #26
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get all the cities
        cities = df['city'].unique().tolist()
        # get clickout rows (WITHOUT last clk)
        last_indices = find(df)
        df_non_last_clk = df.drop(last_indices)
        df_clickout = df_non_last_clk[(df_non_last_clk['action_type']=='clickout item')][['reference','city']]
        df_clickout = df_clickout.rename(columns={'reference':'item_id'})
        df_clickout = df_clickout.dropna() # remove NaNs, which should not be there anyway
        df_clickout.item_id = df_clickout.item_id.astype(int)
        # open impressions df
        o = ImpressionFeature(mode=self.mode)
        df_accomodations = o.read_feature(True)
        df_accomodations = df_accomodations.drop(['properties1 Star', 'properties2 Star', 'properties3 Star', 'properties4 Star', 'properties5 Star'], axis=1)
        # get all clicks properties
        df_clicks_properties = pd.merge(df_clickout, df_accomodations, how='left', on=['item_id'])
        df_clicks_properties = df_clicks_properties.sort_values(by=['city'])
        df_clicks_properties = df_clicks_properties.drop('item_id', axis=1)
        # sum all properties per city
        grouped_by_city = df_clicks_properties.groupby('city').sum()
        # create df with city:array_of_features
        df_city_features = pd.DataFrame(columns=['city','properties_array'])
        df_city_features.city = grouped_by_city.index
        df_city_features.properties_array = grouped_by_city.values.tolist()
        # now take last clk df
        clickout_rows = df.loc[last_indices]
        clickout_rows = clickout_rows[clickout_rows.action_type == 'clickout item'][
            ['user_id','session_id','city','action_type','impressions']]
        clk_expanded = expand_impressions(clickout_rows)
        clk_expanded_wt_city_feat = pd.merge(clk_expanded, df_city_features, how='left', on=['city'])
        # create df with item:array_of_features
        array = df_accomodations.drop(['item_id'],axis=1).values
        df_item_features = pd.DataFrame(columns=['item_id','features_array'])
        df_item_features['item_id'] = df_accomodations['item_id'].values
        df_item_features['features_array'] = list(array)
        final_feature = pd.merge(clk_expanded_wt_city_feat, df_item_features, how='left', on=['item_id'])
        for n in tqdm(final_feature[final_feature['properties_array'].isnull()].index.tolist()):
            final_feature.at[n,'properties_array'] = [0]*152
        # cast list to numpy array to use the cosine (it's written for doubles)
        final_feature.properties_array = final_feature.properties_array.progress_apply(lambda x: np.asarray(x))
        # create new column
        new_col =[]
        if self.metric == 'cosine':
            shrink = 0 # TRY ME
            for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
                new_col.append(cosine_similarity(t[0].astype(np.double), t[1].astype(np.double),shrink))
        if self.metric == 'euclidean':
            for t in tqdm(zip(final_feature.properties_array, final_feature.features_array)):
                new_col.append(np.linalg.norm(t[0]-t[1]))
        # final feature
        new_feature = final_feature[['user_id','session_id','item_id']]
        new_feature['city_similarity'] = new_col

        return new_feature
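
Note that cosine_similarity here is a project helper that takes a shrink term, not the sklearn function. A plausible minimal equivalent, purely an assumption for illustration:

import numpy as np

def cosine_similarity(a, b, shrink=0.0):
    # hypothetical re-implementation: the shrink term damps similarities
    # computed from weak (low-norm) property profiles
    denom = np.linalg.norm(a) * np.linalg.norm(b) + shrink
    return float(np.dot(a, b) / denom) if denom > 0 else 0.0

city_props = np.array([3.0, 0.0, 1.0])
item_props = np.array([1.0, 0.0, 1.0])
print(cosine_similarity(city_props, item_props, shrink=0))  # ~0.894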
Example #27
    def __init__(self, filepaths, cluster):
        self.filepaths = filepaths
        self.cluster = cluster

        self.target_sessions = list(data.test_df("full", "no_cluster")
                                    .iloc[data.target_indices("full", "no_cluster")].session_id.values)

        #TODO Check if filepaths exist

        self.absolute_path = 'submission/'
Example #28
    def extract_feature(self):
        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        # get ALL clickouts
        reference_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type =='clickout item')][['user_id','session_id','reference','impressions']]
        # get last clickout
        last_clickout_indices = find(df)
        clickout_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','impressions']]
        clk_expanded = expand_impressions(clickout_rows)

        # get the impressions
        impression_lists = reference_rows.impressions.str.split('|').tolist()
        big_list = [x for l in impression_lists for x in l]
        c = dict(Counter(big_list))

        df_times_in_impressions = pd.DataFrame.from_dict(c, orient='index',columns=['number_of_times_in_impr'])
        df_times_in_impressions['item_id'] = df_times_in_impressions.index.astype(int)
        df_times_in_impressions = df_times_in_impressions.reindex(columns = ['item_id', 'number_of_times_in_impr'])

        feature_times_per_imp = pd.merge(clk_expanded, df_times_in_impressions, how='left', on=['item_id']).fillna(0)
        feature_times_per_imp.number_of_times_in_impr = feature_times_per_imp.number_of_times_in_impr.astype(int)
        feature_times_per_imp = feature_times_per_imp[['user_id', 'session_id','item_id','number_of_times_in_impr']]

        df_item_clicks = (
            reference_rows
            .groupby(["reference"])
            .size()
            .reset_index(name="n_clickouts")
        )
        df_item_clicks = df_item_clicks.rename(columns={'reference':'item_id'})
        df_item_clicks['item_id'] = df_item_clicks['item_id'].astype(int)
        merged = pd.merge(df_times_in_impressions, df_item_clicks, how='left', on=['item_id']).fillna(0)
        merged.n_clickouts = merged.n_clickouts.astype(int)

        final_feature = pd.merge(clk_expanded, merged, how='left', on=['item_id']).fillna(0)
        new_col = []
        final_feature.reference = final_feature.reference.astype(int)
        final_feature.item_id = final_feature.item_id.astype(int)
        for t in tqdm(zip(final_feature.reference, final_feature.item_id,
                     final_feature.number_of_times_in_impr, final_feature.n_clickouts)):
            if t[0]==t[1]: # same reference, so decrement both #clicks and #impressions by 1
                if t[2]!=1:
                    new_col.append(round(((t[3]-1)*100)/(t[2]-1),5))
                else:
                    new_col.append(0)
            else:
                if 0 not in [t[2],t[3]] and t[2]!=1:
                    new_col.append(round(((t[3])*100)/(t[2]-1),5))
                else:
                    new_col.append(0)
        final_feature['adj_perc_click_appeared'] = new_col
        final_feature = final_feature[['user_id','session_id','item_id','adj_perc_click_appeared']]

        return final_feature
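
The adjustment mirrors the personalized popularity above: for the row of the item that was actually clicked, both the click count and the appearance count lose the current clickout, i.e. adj% = 100*(clicks-1)/(appearances-1); a worked toy example:

appearances, clicks = 5, 3   # item shown 5 times, clicked 3 times overall

# row of the item that was clicked in this session
adj_clicked = round((clicks - 1) * 100 / (appearances - 1), 5)  # 50.0

# row of any other impression
adj_other = round(clicks * 100 / (appearances - 1), 5)          # 75.0

print(adj_clicked, adj_other)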
    def extract_feature(self):

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])

        idxs_click = find_last_clickout_indices(df)
        df = df.loc[idxs_click][[
            'user_id', 'session_id', 'impressions', 'prices'
        ]]

        impression_price_position_list = []
        fraction_pos_price_list = []
        for i in tqdm(df.index):
            impr = list(map(int, df.at[i, 'impressions'].split('|')))
            prices = list(map(int, df.at[i, 'prices'].split('|')))

            impression_position = np.arange(len(impr)) + 1

            couples = zip(prices, impression_position, impr)
            couples = sorted(couples, key=lambda a: a[0])

            prices_ordered, position, impressions_ordered = zip(*couples)

            _, price_pos = list(
                zip(*sorted(list(zip(position, impression_position)),
                            key=lambda a: a[0])))
            fraction_pos_price = list(impression_position / price_pos)

            fraction_pos_price_list.append(np.array(fraction_pos_price))
            impression_price_position_list.append(np.array(price_pos))
        df['impression_pos_price'] = impression_price_position_list

        df['impressions'] = df['impressions'].str.split('|')
        df['prices'] = df['prices'].str.split('|')

        final_df = pd.DataFrame({
            col: np.repeat(df[col], df['impressions'].str.len())
            for col in df.columns.drop(['impressions', 'prices'])
        }).assign(
            **{
                'item_id':
                np.concatenate(df['impressions'].values),
                'price':
                np.concatenate(df['prices'].values),
                'impression_pos_price':
                np.concatenate(df['impression_pos_price'].values)
            })

        final_df['item_id'] = pd.to_numeric(final_df['item_id'])
        final_df['impression_pos_price'] = pd.to_numeric(
            final_df['impression_pos_price'])
        final_df['price'] = pd.to_numeric(final_df['price'])

        return final_df
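
impression_pos_price is the price rank of each display slot: sort by price, remember which slot each price came from, then invert that permutation back to display order. A toy walk-through of the double sort:

import numpy as np

prices = [120, 80, 95]
display_pos = np.arange(len(prices)) + 1          # [1, 2, 3]

# sort by price, remembering each price's display slot
_, slot = zip(*sorted(zip(prices, display_pos), key=lambda a: a[0]))  # (2, 3, 1)

# invert the permutation: price rank of each display slot
_, price_rank = zip(*sorted(zip(slot, display_pos), key=lambda a: a[0]))
print(price_rank)                                  # (3, 1, 2)
print(list(display_pos / np.array(price_rank)))    # position / price-rank fractions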
    def extract_feature(self):

        def get_pos(item, rec):
            res = np.empty(item.shape)
            for i in tqdm(range(len(item))):
                if str(item[i]) in rec[i]:
                    res[i] = rec[i].index(str(item[i])) + 1
                else:
                    res[i] = -1
            return res.astype(int)

        train = data.train_df(mode=self.mode, cluster=self.cluster)
        test = data.test_df(mode=self.mode, cluster=self.cluster)
        df = pd.concat([train, test])
        last_clickout_indices = find(df)
        all_clk_rows = df[(df.reference.str.isnumeric() == True) & (df.action_type == 'clickout item')]
        all_clk_rows = all_clk_rows[['user_id','session_id','reference','impressions']]

        all_clk_rows.impressions = all_clk_rows.impressions.str.split('|')
        pos_col = get_pos(all_clk_rows.reference.values,all_clk_rows.impressions.values)
        all_clk_rows = all_clk_rows.drop('impressions', axis=1)
        all_clk_rows['position'] = pos_col
        all_clk_rows_after_1 = all_clk_rows[all_clk_rows.position>1]

        df_clicks_after_1 = (
            all_clk_rows_after_1
            .groupby(["reference"])
            .size()
            .reset_index(name="n_clicks_per_item")
        )
        df_clicks_after_1.reference = df_clicks_after_1.reference.astype(int)
        df_clicks_after_1 = df_clicks_after_1.rename(columns={'reference':'item_id'})

        last_clk_rows = df.loc[last_clickout_indices, ['user_id','session_id','reference','impressions']]
        last_clk_rows['imp_list'] = last_clk_rows.impressions.str.split('|')
        clk_expanded = expand_impressions(last_clk_rows)
        clk_expanded = clk_expanded.drop('index', axis=1)

        pos_col = get_pos(clk_expanded.item_id.values,clk_expanded.imp_list.values)
        clk_expanded['position'] = pos_col
        clk_expanded = clk_expanded.drop('imp_list', axis=1)

        merged = pd.merge(clk_expanded, df_clicks_after_1, how='left',on='item_id').fillna(0)
        new_col = []
        merged.item_id = merged.item_id.astype(int)
        merged.reference = merged.reference.astype(int)
        for t in tqdm(zip(merged.item_id, merged.reference, merged.position, merged.n_clicks_per_item)):
            if t[0]==t[1] and t[2]>1:
                new_col.append(int(t[3]-1))
            else:
                new_col.append(int(t[3]))

        merged['n_clicks_after_first_pos'] = new_col
        feature = merged[['user_id','session_id','item_id','n_clicks_after_first_pos']]
        return feature