Example #1
    def _fit(self, mode):
        self.test_indices = data.test_indices(mode)
        self.train_indices = data.train_indices(mode)

        total_target_set = set(
            data.target_indices(mode=mode, cluster=data.SPLIT_USED))
        covered_target_list = []
        for i in self.clusters:
            covered_target_list += list(data.target_indices(mode, cluster=i))
        covered_target_set = set(covered_target_list)
        self.target_indices = list(total_target_set - covered_target_set)
    def fit(self):
        """
        Fit the model on the data.
        Creates a list list_scores of 100m lines initialized to 0 representing id of each hotel

        For every Recommender matrix, for every triple, the corresponding
        cell of list_scores will be summed by the score

        """

        self.dict_scores = {}

        target_indices = data.target_indices(self.mode, self.cluster)

        # Initialize a dict of impression scores for each target clickout
        for s_target in target_indices:
            self.dict_scores[s_target] = {}

        for i in range(len(self.impression_scores_matrices)):
            print("Getting scores from recommender number {} ...".format(i))

            self.matrices_array = self.impression_scores_matrices[i].copy()
            self._normalization(self.normalization_mode)

            for k in tqdm(range(len(self.impression_scores_matrices[i]))):
                triple = self.impression_scores_matrices[i][k]

                # updating scores multiplied by corresponding weight
                for j in range(len(triple[2])):
                    if triple[1][j] in self.dict_scores[triple[0]]:
                        self.dict_scores[triple[0]][triple[1][j]] += self.normalized_matrices_array[k][j] * \
                                                                     self.weights_array[i]
                    else:
                        self.dict_scores[triple[0]][triple[1][j]] = self.normalized_matrices_array[k][j] * \
                                                                    self.weights_array[i]
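The fit above merges the per-recommender impression scores into dict_scores. Below is a minimal, self-contained sketch of that weighted accumulation with made-up triples and weights (the (clickout_index, impressions, scores) layout is an assumption inferred from how the triples are read):

# hedged sketch: invented data, same dict-of-dicts accumulation as fit() above
dict_scores = {}
weights = [0.7, 0.3]                       # one weight per recommender (hypothetical)
recommenders = [                           # each triple: (clickout_index, impressions, scores)
    [(42, [111, 222, 333], [0.9, 0.5, 0.1])],
    [(42, [222, 111, 333], [0.8, 0.4, 0.2])],
]
for rec_idx, triples in enumerate(recommenders):
    for clickout_idx, impressions, scores in triples:
        slot = dict_scores.setdefault(clickout_idx, {})
        for item, score in zip(impressions, scores):
            slot[item] = slot.get(item, 0.0) + score * weights[rec_idx]
print(dict_scores)   # {42: {111: 0.75, 222: 0.59, 333: 0.13}}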
def preprocess_cv(mode='full'):
    def save_folds(df, user_session_df, train_index, test_index, count, mode):
        u_s_train = list(
            user_session_df.loc[train_index]['user_session'].values)
        u_s_test = list(user_session_df.loc[test_index]['user_session'].values)

        path = 'dataset/preprocessed/{}/{}'.format('fold_' + str(count), mode)
        check_folder(path)

        train = df[df['user_session'].isin(u_s_train)]
        train = train.drop(['user_session'], axis=1)
        train.to_csv(os.path.join(path, 'train.csv'))
        train_indices = train.index.values
        np.save(os.path.join(path, 'train_indices'), train_indices)

        test = df[df['user_session'].isin(u_s_test)]
        target_indices = sorted(find(test))
        test.loc[target_indices, 'reference'] = np.nan
        test = test.drop(['user_session'], axis=1)
        test.to_csv(os.path.join(path, 'test.csv'))
        test_indices = test.index.values
        np.save(os.path.join(path, 'test_indices'), test_indices)
        np.save(os.path.join(path, 'target_indices'), target_indices)

        print(f'Train shape : {train.shape} , Test shape : {test.shape}')
        print(f'Last clickout indices : {len(target_indices)}')

    train_df = data.train_df(mode=mode, cluster='no_cluster')
    train_df['user_session'] = train_df['user_id'].values + '_' + train_df[
        'session_id'].values

    test_df = data.test_df(mode=mode, cluster='no_cluster')
    test_df['user_session'] = test_df['user_id'].values + '_' + test_df[
        'session_id'].values

    df = pd.concat([train_df, test_df])

    # extract user_session referring to target_indices
    target_indices = data.target_indices(mode=mode, cluster='no_cluster')
    test_target_u_s = test_df.loc[target_indices].drop_duplicates(
        'user_session')['user_session'].to_list()
    print(f'Number of user_session in target_indices : {len(test_target_u_s)}')

    # remove those sessions from df
    df = df[~df['user_session'].isin(test_target_u_s)]

    #df['user_session'] = df['user_id'].values + '_' + df['session_id'].values
    user_session_df = df.drop_duplicates('user_session')
    user_session_df = user_session_df.reset_index(drop=True)
    print(
        f'Number of user_session NOT in target_indices : {user_session_df.shape[0]}'
    )

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for i, (train_index, test_index) in enumerate(kf.split(user_session_df)):
        print(
            f' train indices : {len(train_index)}, test indices : {len(test_index)}'
        )
        save_folds(df, user_session_df, train_index, test_index, i, mode)
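preprocess_cv writes, for each fold, a train.csv, test.csv and the index arrays under dataset/preprocessed/fold_<k>/<mode>. Here is a hedged sketch of reading one fold back (fold_0 and mode='local' are illustrative values; the column layout is whatever the original DataFrames contained):

import os
import numpy as np
import pandas as pd

fold_path = 'dataset/preprocessed/fold_0/local'          # illustrative fold and mode
train = pd.read_csv(os.path.join(fold_path, 'train.csv'), index_col=0)
test = pd.read_csv(os.path.join(fold_path, 'test.csv'), index_col=0)
target_indices = np.load(os.path.join(fold_path, 'target_indices.npy'))
print(train.shape, test.shape, len(target_indices))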
    def recommend_batch(self):

        svm_filename = 'svmlight_test.txt'
        _path = self.data_dir.joinpath(svm_filename)

        X_test, y_test = load_svmlight_file(str(_path))

        target_indices = data.target_indices(self.mode, self.cluster)
        target_indices.sort()

        test = data.test_df('small', 'no_cluster')
        print('data for test ready')

        scores = list(self.model.predict(X_test))

        final_predictions = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(map(int, test.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            _, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            count = count + len(impressions)

        return final_predictions
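The zip/sort/unzip idiom above (repeated in several examples below) simply reorders each clickout's impressions by descending predicted score. A tiny worked example with invented values:

impressions = [111, 222, 333]
scores = [0.2, 0.9, 0.5]
couples = sorted(zip(scores, impressions), reverse=True)   # highest score first
_, sorted_impr = zip(*couples)
print(list(sorted_impr))   # [222, 333, 111]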
def supersampling(mode):
    print("Supersampling for mode: {}".format(mode))
    train = data.train_df(mode)
    class_to_sessions = get_class_to_sessions_dict(train)
    session_to_indices = get_session_to_indices_dict(train)
    sessions_to_be_resampled = resample_session(class_to_sessions.copy(),
                                                train)
    new = duplicate_sessions(sessions_to_be_resampled.copy(), train,
                             session_to_indices)
    test = data.test_df(mode)
    max_test_index = max(test.index)
    max_train_index = max(train.index)
    max_index = max(max_test_index, max_train_index)
    new.index += max_index + 1
    new = pd.concat([train, new])
    train_len = len(new)
    old_starting_index = test.index[0]
    new = pd.concat([new, test])
    print("Supersampling ended for mode={}, saving df".format(mode))
    new_train = new.iloc[:train_len]
    new_test = new.iloc[train_len:]
    #    new_starting_index = new_test.index[0]
    #    offset = new_starting_index - old_starting_index
    #    target_indices += offset
    target_indices = data.target_indices(mode, "no_cluster")
    np.save(path + "/" + mode + "/target_indices", target_indices)
    new_train.to_csv(path + "/" + mode + "/train.csv", index=True)
    new_test.to_csv(path + "/" + mode + "/test.csv", index=True)
def score_fn(model, X, y):
    global i
    print(i)
    i += 1
    t0 = time.time()

    target_indices = data.target_indices('local')
    full_impressions = data.full_df()

    scores = list(model.xg.predict(X))

    final_predictions = []
    count = 0
    for index in tqdm(target_indices):
        impressions = list(
            map(int, full_impressions.loc[index]['impressions'].split('|')))
        predictions = scores[count:count + len(impressions)]
        couples = list(zip(predictions, impressions))
        couples.sort(key=lambda x: x[0], reverse=True)
        _, sorted_impr = zip(*couples)
        final_predictions.append((index, list(sorted_impr)))
        count = count + len(impressions)

    mrr = model.compute_MRR(final_predictions)
    print('Done in', time.time() - t0)
    print()

    return mrr
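compute_MRR is project code; for reference, this is a hedged sketch of the metric itself: the mean of the reciprocal rank of the clicked item inside each recommended list (the function name and the ground-truth format are illustrative):

def mean_reciprocal_rank(predictions, ground_truth):
    """predictions: list of (index, ranked_items); ground_truth: dict index -> clicked item."""
    rr_sum = 0.0
    for index, ranked_items in predictions:
        clicked = ground_truth[index]
        rr_sum += 1.0 / (ranked_items.index(clicked) + 1) if clicked in ranked_items else 0.0
    return rr_sum / len(predictions)

print(mean_reciprocal_rank([(1, [222, 111])], {1: 111}))   # 0.5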
Example #7
    def recommend_batch(self):
        test = pd.read_csv(
            str(self.data_directory.joinpath('..', 'gbdt_test.csv')))
        target_indices = data.target_indices(mode='full')
        #target_indices = sorted(target_indices)

        # Take indices in common between test and target_indices
        test_idx = set(list(test['index'].values))
        print(f'Length test_idx : {len(test_idx)}')
        print(f'Length target_indices : {len(target_indices)}')
        target_indices = set(target_indices) & test_idx
        print(f'Length indices in common: {len(target_indices)}')
        target_indices = sorted(target_indices)
        #target_indices = sorted(test_idx)

        X_test = test.sort_values(by=['index', 'impression_position'])
        X_test = X_test.drop(['item_id', 'label', 'index'], axis=1)
        X_test = X_test.astype(np.float64)
        # full_impressions = data.full_df()
        full_impressions = self.full
        print('data for test ready')
        scores = list(self.xg.predict(X_test))
        final_predictions = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(
                map(int,
                    full_impressions.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            _, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            count = count + len(impressions)
        return final_predictions
Example #8
    def fit(self):
        """
        Create list of tuples for recommendations ordering them by impressions
        """

        df_test = data.test_df(self.mode)

        target_indices = data.target_indices(self.mode, self.cluster)

        df_test_target = df_test[df_test.index.isin(target_indices)]

        # Initializing list of recs
        recs_tuples = []

        for i in tqdm(df_test_target.index):

            impressions = df_test_target.at[i, "impressions"]
            impressions = list(map(int, impressions.split('|')))

            prices = df_test_target.at[i, "prices"]
            prices = list(map(int, prices.split('|')))

            temp_dict = {}

            for j in range(len(impressions)):
                temp_dict[impressions[j]] = int(prices[j])

            ordered_recs = sorted(temp_dict, key=temp_dict.__getitem__)

            recs_tuples.append((i, ordered_recs))

        self.recs_batch = recs_tuples

        return recs_tuples
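The same price-ascending ordering can be obtained without the temporary dict; a small sketch with invented values, useful as a cross-check of the loop above:

impressions = [111, 222, 333]
prices = [120, 80, 95]
ordered_recs = [item for _, item in sorted(zip(prices, impressions))]
print(ordered_recs)   # [222, 333, 111]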
def train_indices(mode='local', cluster='no_cluster'):
    df_train = data.train_df(mode=mode, cluster=cluster)
    df_test = data.test_df(mode=mode, cluster=cluster)
    target_indices = data.target_indices(mode=mode, cluster=cluster)
    df = pd.concat([df_train, df_test])
    idx = find_last_clickout_indices(df)
    train_idx = set(idx) - set(target_indices)
    return train_idx
Example #10
    def _fit(self, mode):
        """
        Target indices are the roughly 10% of sessions with:
        - no numeric reference
        - more than 1 step
        but anyway we train on all of them ;)
        """

        def RepresentsInt(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        train = data.train_df('small')

        test = data.test_df('small')
        tgt_indices = data.target_indices('small')

        real_test_to_keep = []
        for idx in tgt_indices:
            usr_sess_indices = []
            theres_int = False
            a_user = test.at[idx, 'user_id']
            a_sess = test.at[idx, 'session_id']
            usr_sess_indices.append(idx)
            j = idx-1
            pos_moved = 0
            while j >= 0:
                try:
                    new_user = test.at[j, 'user_id']
                    new_sess = test.at[j, 'session_id']
                    if new_user == a_user and new_sess == a_sess:
                        usr_sess_indices.append(j)
                        reference = test.at[j, 'reference']
                        if RepresentsInt(reference):
                            theres_int = True
                        j -= 1
                        pos_moved += 1
                    else:
                        if not (pos_moved == 0 or theres_int):
                            real_test_to_keep += sorted(usr_sess_indices)
                        break
                except:
                    if j < test.index.values[0]:
                        if not (pos_moved == 0 or theres_int):
                            real_test_to_keep += sorted(usr_sess_indices)
                        break
                    else:
                        j -= 1
        
        self.train_indices = train.index.values
        real_test_indices = retrieve_real_test_indices(mode, 'no_cluster')
        all_test_indices = data.test_df(mode).index.values
        self.test_indices = sorted(list(set(all_test_indices) - set(real_test_indices)) + real_test_to_keep)
        self.target_indices = sorted(list(set(self.test_indices) & set(tgt_indices)))
Example #11
    def __init__(self, filepaths, cluster):
        self.filepaths = filepaths
        self.cluster = cluster

        self.target_sessions = list(data.test_df("full", "no_cluster")
                                    .iloc[data.target_indices("full", "no_cluster")].session_id.values)

        #TODO Check if filepaths exists

        self.absolute_path = 'submission/'
Example #12
    def __init__(self,
                 mode='local',
                 learning_rate=0.3,
                 min_child_weight=1,
                 n_estimators=300,
                 max_depth=3,
                 subsample=1,
                 colsample_bytree=1,
                 reg_lambda=1.0,
                 reg_alpha=0):
        name = 'gbdt_hybrid'
        cluster = 'no_cluster'
        super(Gbdt_Hybrid, self).__init__(mode, cluster, name)

        self.current_directory = Path(__file__).absolute().parent
        self.data_directory = self.current_directory.joinpath(
            '..', '..', 'submissions/hybrid')
        #self.gt_csv = self.data_directory.joinpath('ground_truth.csv')
        self.mode = mode
        self.full = data.full_df()

        self.local_target_indices = data.target_indices(mode='local',
                                                        cluster='no_cluster')
        self.full_target_indices = data.target_indices(mode='full',
                                                       cluster='no_cluster')

        directory = self.data_directory.joinpath('local')

        full_dir = self.data_directory.joinpath('full')

        self.xg = xgb.XGBRanker(learning_rate=learning_rate,
                                min_child_weight=min_child_weight,
                                max_depth=math.ceil(max_depth),
                                n_estimators=math.ceil(n_estimators),
                                subsample=subsample,
                                colsample_bytree=colsample_bytree,
                                reg_lambda=reg_lambda,
                                reg_alpha=reg_alpha,
                                n_jobs=-1,
                                objective='rank:ndcg')

        self.cv_path = self.data_directory.joinpath('cross_validation')
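For context, a hedged sketch of how an XGBRanker like self.xg is typically trained: one row per (clickout, impression) pair, a 0/1 click label, and a group vector giving how many consecutive rows belong to each clickout (all values below are made up; this is not the project's training code):

import numpy as np
import xgboost as xgb

X = np.random.rand(6, 4)              # 6 impression rows, 4 features (invented)
y = np.array([0, 1, 0, 0, 0, 1])      # clicked impression within each group
group = [3, 3]                        # two clickouts, 3 impressions each

ranker = xgb.XGBRanker(objective='rank:ndcg', n_estimators=10)
ranker.fit(X, y, group=group)
scores = ranker.predict(X)            # one relevance score per impression row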
Example #13
    def get_preprocessed_dataset(self, mode):
        """
        Apply preprocessing steps to dataset
        - add id to identify groups
        - keep only useful columns
        - join eventual one-hotted categorical features
        :param mode:
        :return:
        """

        if mode == 'train':
            classification_df = data.dataset_catboost_train(
                self.mode, self.cluster).copy()
        elif mode == 'test':
            classification_df = data.dataset_catboost_test(
                self.mode, self.cluster).copy()
        else:
            print('Wrong mode given in get_preprocessed_dataset!')
            return

        if len(self.features_to_drop) > 0:
            classification_df.drop(self.features_to_drop, axis=1, inplace=True)

        print('Length of dataset {} is {}, number of features is {}'.format(
            mode, classification_df.shape[0], classification_df.shape[1]))

        target_indices = data.target_indices(mode=self.mode,
                                             cluster=self.cluster)

        if mode == 'test':
            self.test_df = data.test_df(self.mode, self.cluster)

            sessi_target = self.test_df.loc[target_indices].session_id.values
            self.dict_session_id = dict(zip(sessi_target, target_indices))

            classification_df['id'] = classification_df.apply(
                lambda row: self.dict_session_id.get(row.session_id), axis=1)

        else:
            # Create a unique id for each user_id / session_id pair
            classification_df = classification_df.sort_values(
                by=['user_id', 'session_id'])
            classification_df = classification_df.assign(id=(
                classification_df['user_id'] + '_' +
                classification_df['session_id']).astype('category').cat.codes)

        if self.features_to_one_hot is not None:
            for f in self.features_to_one_hot:
                one_hot = pd.get_dummies(classification_df[f])
                classification_df = classification_df.drop([f], axis=1)
                classification_df = classification_df.join(one_hot)

        return classification_df
    def _fit(self, mode):
        """
        Train, test and target indices are restricted to sessions that have:
        - no numeric reference
        - more than 1 step
        """
        def RepresentsInt(s):
            try:
                int(s)
                return True
            except ValueError:
                return False

        train = data.train_df(mode)
        train_index = train.index.values
        test = data.test_df(mode)
        test_index = test.index.values
        tgt_indices = data.target_indices(mode)
        df = pd.concat([train, test])
        del train
        del test
        lst_clk_indices = sorted(find(df))

        to_return = []
        for idx in lst_clk_indices:
            usr_sess_indices = []
            try:
                a_user = df.at[idx, 'user_id']
                a_sess = df.at[idx, 'session_id']
                usr_sess_indices.append(idx)
            except:
                continue
            j = idx - 1
            while j >= 0:
                try:
                    new_user = df.at[j, 'user_id']
                    new_sess = df.at[j, 'session_id']
                    if new_user == a_user and new_sess == a_sess:
                        usr_sess_indices.append(j)
                        reference = df.at[j, 'reference']
                        if RepresentsInt(reference):
                            break
                        j -= 1
                    else:
                        if idx - j >= 2:
                            to_return += usr_sess_indices
                        break
                except:
                    j -= 1

        self.train_indices = sorted(list(set(train_index) & set(to_return)))
        self.test_indices = sorted(list(set(test_index) & set(to_return)))
        self.target_indices = sorted(list(set(tgt_indices) & set(to_return)))
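The 'id' column built in get_preprocessed_dataset is what ties each row to its ranking group. A tiny illustration (invented rows) of the cat.codes trick used for the train split:

import pandas as pd

df = pd.DataFrame({'user_id': ['u1', 'u1', 'u2'],
                   'session_id': ['s1', 's1', 's9']})
df['id'] = (df['user_id'] + '_' + df['session_id']).astype('category').cat.codes
print(df['id'].tolist())   # [0, 0, 1]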
    def fit(self, recs):
        # `recs` is assumed here to be the list of (index, recommendations) tuples to merge;
        # the original snippet referenced it without defining it
        all_recs = {}
        self.recs = []
        for e in recs:
            if e[0] in all_recs:
                raise Exception('Recommendation ids shouldn\'t be overlapped')
            all_recs[e[0]] = e[1]
        target_ids = data.target_indices(self.mode, self.cluster)
        for i in target_ids:
            if i not in all_recs:
                raise Exception('Recommendation ids should cover all the ids of the actual mode used')
            self.recs.append((i, all_recs[i]))
    def fit(self):

        df_test = data.test_df(self.mode, cluster=self.cluster)

        print("{}: creating grouped sessions with interaction lists".format(self.name))
        session_groups = self.get_groupby_sessions_references(data.test_df(self.mode, cluster=self.cluster))

        # Getting target sessions
        target_indices = data.target_indices(self.mode, self.cluster)

        df_test_target = df_test[df_test.index.isin(target_indices)]

        # I must reason with session_ids since I'm interested in getting the last interactions of the same session
        df_test_target = df_test_target.set_index("session_id")

        # then I create a dictionary for re-mapping sessions into indices
        if len(df_test_target.index) != len(target_indices):
            print("Indices not same length as sessions, go get some coffee...")
            return

        self.dictionary_indices = dict(zip(df_test_target.index, target_indices))

        list_sessions = session_groups.index

        recs_tuples = []

        print("{}: fitting the model".format(self.name))
        for i in tqdm(df_test_target.index):
            # Check if it is a session without interactions
            if i not in list_sessions:
                recs_tuples.append((self.dictionary_indices.get(i), []))
            else:
                # Get interacted element of session with no duplicates
                interacted_elements = np.asarray(session_groups.at[i, "sequence"])

                interacted_elements = np.asarray(self._set_no_reordering(x for x in interacted_elements))

                impressions = np.asarray(df_test_target.at[i, "impressions"].split("|"))

                # First I want to be sure the impressions contain all the interacted elements (if not, they must be cut off from the relevant items)
                mask_only_in_impression = np.in1d(interacted_elements, impressions, assume_unique=True)

                interacted_elements = interacted_elements[mask_only_in_impression]

                # I append the last interacted elements first (so I invert the order of relevant_elements!)
                real_recommended = np.flipud(interacted_elements)

                real_recommended = real_recommended.astype(int)

                recs_tuples.append(
                    (self.dictionary_indices.get(i), list(real_recommended)[:self.k_first_only_to_recommend]))

        self.recs_batch = recs_tuples
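A compact, self-contained illustration (invented values) of the core of the fit above: de-duplicate the interacted elements preserving order, keep only those that appear among the impressions, then recommend the most recent first:

interacted = ['5', '7', '5', '9']
impressions = ['7', '5', '8']
deduped = list(dict.fromkeys(interacted))          # order-preserving de-dup
kept = [e for e in deduped if e in impressions]
print(list(reversed(kept)))                        # ['7', '5']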
Example #17
    def recommend_batch(self):
        test = data.test_df(self.mode, self.cluster)
        target_indices = data.target_indices(self.mode, self.cluster)
        target_indices.sort()

        if self.include_test:
            test_with_weights = self.test_with_weights
        else:
            dataset_test = self.get_preprocessed_dataset(mode='test')

            print('data for test ready')

            test_feat_df = dataset_test.drop(
                ['user_id', 'session_id', 'item_id', 'label', 'id'], axis=1)

            if list(test_feat_df.columns.values) != list(self.train_features):
                print(
                    'Training columns are different from test columns! Check')
                print(self.train_features)
                print(test_feat_df.columns.values)
                exit(0)

            X_test = test_feat_df.values

            group_id = dataset_test.id.values

            test_with_weights = Pool(data=X_test,
                                     label=None,
                                     group_id=group_id,
                                     cat_features=self.categorical_features)

        scores = self.ctb.predict(test_with_weights)

        self.predictions = []
        self.scores_batch = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(
                map(int, test.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            if len(couples) > 0:
                scores_impr, sorted_impr = zip(*couples)
                count = count + len(impressions)

                self.predictions.append((index, list(sorted_impr)))
                self.scores_batch.append(
                    (index, list(sorted_impr), scores_impr))

        return self.predictions
    def __init__(self,
                 matrix,
                 _type,
                 mode='full',
                 cluster='no_cluster',
                 name='distancebased',
                 urm_name='urm_clickout',
                 k=100,
                 distance='cosine',
                 shrink=0,
                 threshold=0,
                 implicit=False,
                 alpha=0.5,
                 beta=0.5,
                 l=0.5,
                 c=0.5,
                 urm=None,
                 matrix_mul_order='standard'):
        super(DistanceBasedRecommender, self).__init__(mode=mode,
                                                       cluster=cluster,
                                                       name=name)
        self.type = _type
        self.urm_name = urm_name
        self._sim_matrix = None
        self.target_indices = data.target_indices(mode, cluster)
        self._matrix_mul_order = matrix_mul_order  # 'standard' if you want R•R', or 'inverse' if you want to compute S•R

        self.dict_row = data.dictionary_row(mode=self.mode,
                                            cluster=self.cluster,
                                            urm_name=self.urm_name,
                                            type=self.type)
        self.dict_col = data.dictionary_col(mode=self.mode,
                                            cluster=self.cluster,
                                            urm_name=self.urm_name,
                                            type=self.type)

        self.matrix = matrix
        self.k = int(k)
        self.distance = distance
        self.shrink = shrink
        self.threshold = threshold
        self.implicit = implicit
        self.alpha = alpha
        self.beta = beta
        self.l = l
        self.c = c
        self.urm = urm
        self.scores_batch = None
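As a note on matrix_mul_order: the scores of a distance-based recommender come from a sparse matrix product between the rating matrix and the similarity matrix, and the flag only picks the multiplication order. A tiny sketch with invented matrices (not the project's URM):

import numpy as np
import scipy.sparse as sps

R = sps.csr_matrix(np.array([[1, 0, 1],
                             [0, 1, 0]], dtype=float))    # 2 users x 3 items (invented)
S = sps.csr_matrix(np.array([[1.0, 0.2, 0.5],
                             [0.2, 1.0, 0.1],
                             [0.5, 0.1, 1.0]]))           # 3 x 3 item-item similarity

scores = R.dot(S)            # 'standard' order: one row of item scores per user
print(scores.toarray())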
Example #19
    def fit(self):
        """
        Create list of tuples for recommendations ordering them by impressions
        """

        df_test = data.test_df(self.mode)

        target_indices = data.target_indices(self.mode, self.cluster)

        df_test_target = df_test[df_test.index.isin(target_indices)]

        # Initializing list of recs
        recs_tuples = []
        for i in tqdm(df_test_target.index):
            impressions = df_test_target.at[i, "impressions"]
            impressions = list(map(int, impressions.split('|')))
            recs_tuples.append((i, impressions))

        self.recs_batch = recs_tuples
    def train():
        mode = menu.mode_selection()
        # fit the model
        model = interactive_model(mode)
        model.fit(epochs=10000,
                  early_stopping_patience=25,
                  early_stopping_on='val_mrr',
                  mode='max')
        print('\nFit completed!')

        # recommend
        target_indices = data.target_indices(mode, 'cluster_recurrent')
        print('Recommending...')
        recommendations = model.recommend_batch(target_indices)
        print('Recommendation count: ', len(recommendations))
        mrr = model.compute_MRR(recommendations)

        model.save(folderpath='saved_models/',
                   suffix='_{}'.format(round(mrr, 5)).replace('.', ''))
    def __init__(self, mode, urm_name, _type, cluster='no_cluster'):
        name = 'content_based_remastered'

        super(ContentBased, self).__init__(mode, cluster, name)
        self.type = _type
        self.urm_name = urm_name
        self.dict_row = data.dictionary_row(mode=mode,
                                            urm_name=urm_name,
                                            cluster=cluster,
                                            type=self.type)
        self.dict_col = data.dictionary_col(mode=mode,
                                            urm_name=urm_name,
                                            cluster=cluster,
                                            type=self.type)
        self.user_features_matrix = None
        self.scores_batch = None

        # load target indices
        self.target_indices = data.target_indices(mode, cluster)
    def __init__(self,
                 mode,
                 cluster,
                 urm_name,
                 factors=100,
                 regularization=0.01,
                 iterations=10,
                 alpha=25):
        os.environ['MKL_NUM_THREADS'] = '1'
        name = 'ALS urm_name: {}\n factors: {}\n regularization: {}\n ' \
                    'iterations: {}\n alpha: {}'.format(urm_name, factors, regularization, iterations, alpha)
        super(AlternatingLeastSquare, self).__init__(mode, cluster, name)

        self.factors = int(factors)
        self.regularization = regularization
        self.iterations = int(iterations)
        self.alpha = int(alpha)

        self.target_indices = data.target_indices(mode, cluster)

        self.dict_row = data.dictionary_row(mode, cluster)
        self.target_indices_urm = []
        for ind in self.target_indices:
            self.target_indices_urm.append(self.dict_row[tuple(
                data.full_df().loc[ind][['session_id', 'user_id']])])

        self.urm = data.urm(mode=mode, cluster=cluster, urm_name=urm_name)
        self.user_vecs = None
        self.item_vecs = None
        self._model = None

        self.fixed_params_dict = {
            'mode': mode,
            'urm_name': urm_name,
            'cluster': cluster
        }

        self.hyperparameters_dict = {
            'factors': (50, 200),
            'regularization': (0, 1),
            'iterations': (1, 250),
            'alpha': (15, 45)
        }
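A hedged sketch of the usual `implicit` workflow behind a recommender like this one (the tiny matrix is invented; note that fit() expects the item-user orientation in older versions of the library and user-item in newer ones, so check the installed version):

import numpy as np
import scipy.sparse as sps
from implicit.als import AlternatingLeastSquares

urm = sps.csr_matrix(np.array([[1, 0, 2],
                               [0, 3, 0]], dtype=np.float32))   # users x items (invented)

model = AlternatingLeastSquares(factors=16, regularization=0.01, iterations=10)
model.fit(urm * 25)                       # confidence scaling with alpha = 25

scores = model.user_factors @ model.item_factors.T   # dense user-item score matrix
print(scores.shape)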
    def load_Xtest(self):
        test_df = data.dataset_catboost_test(mode=self.mode,
                                             cluster=self.cluster)

        target_indices = data.target_indices(self.mode, self.cluster)
        sessi_target = data.test_df(
            self.mode, self.cluster).loc[target_indices].session_id.values

        dict_session_trg_idx = dict(zip(sessi_target, target_indices))

        test_df['id'] = test_df.apply(
            lambda row: dict_session_trg_idx.get(row.session_id), axis=1)

        print('data for test ready')

        test_feat_df = test_df.drop(['user_id', 'session_id', 'item_id'],
                                    axis=1)

        return test_feat_df
    def recommend_batch(self):
        X_test, _, _, _ = data.dataset_xgboost_test(mode=self.mode,
                                                    cluster=self.cluster,
                                                    kind=self.kind)
        target_indices = data.target_indices(self.mode, self.cluster)
        full_impressions = data.full_df()
        print('data for test ready')
        scores = list(self.xg.predict(X_test))
        final_predictions = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(
                map(int,
                    full_impressions.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            _, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            count = count + len(impressions)
        return final_predictions
Example #25
    def __init__(self,
                 mode,
                 cluster,
                 dataset_name,
                 pred_name,
                 predict_train=False):
        """
        the dataset name is used to load the prediction created by the tensorflow ranking class

        :param dataset_name: dataset name passed to the CreateDataset() method
        """

        name = 'tf_ranking'
        super(TensorflowRankig, self).__init__(mode=mode,
                                               cluster=cluster,
                                               name=name)

        self.dataset_name = dataset_name

        # the path where the PREDICTIONS are stored
        _BASE_PATH = f'dataset/preprocessed/tf_ranking/{cluster}/{mode}/{self.dataset_name}'

        _PREDICTION_PATH = f'{_BASE_PATH}/{pred_name}.npy'

        # check if the PREDICTIONS have been made
        exists_path_predictions = os.path.isfile(_PREDICTION_PATH)

        if not exists_path_predictions:
            print(
                f'the predictions for the \ndataset: {self.dataset_name}\n mode:{mode}\n '
                f'cluster:{cluster}\n have not been made')
            exit(0)

        self.predictions = np.load(_PREDICTION_PATH)
        print('predictions loaded')
        if not predict_train:
            self.target_indices = data.target_indices(mode, cluster)
        else:
            self.target_indices = sorted(find_last_clickouts(data.full_df()))
Example #26
    def recommend_batch(self):
        print('loading target indices')
        target_indices = data.target_indices(mode=self.mode,
                                             cluster=self.cluster)
        print('done\n')

        full_impressions = data.full_df()

        print('retriving predictions')
        scores = self.model.predict(self.x_vali)
        final_predictions = []
        count = 0
        for index in tqdm(target_indices):
            impressions = list(
                map(int,
                    full_impressions.loc[index]['impressions'].split('|')))
            predictions = scores[count:count + len(impressions)]
            couples = list(zip(predictions, impressions))
            couples.sort(key=lambda x: x[0], reverse=True)
            _, sorted_impr = zip(*couples)
            final_predictions.append((index, list(sorted_impr)))
            count = count + len(impressions)
        return final_predictions
Example #27
    def fit(self):
        df_test = data.test_df(self.mode, cluster=self.cluster)

        # Getting target sessions
        target_indices = data.target_indices(self.mode, self.cluster)

        df_test_target = df_test[df_test.index.isin(target_indices)]
        count = 0
        oneshot = 0
        resorted = 0
        recs_tuples = []
        print("Fitting...")
        for index, row in tqdm(df_test_target.iterrows()):
            impressions = list(map(int, row["impressions"].split("|")))
            if int(row.step) == 1:
                # This means that the clickout is the first interaction of the session --> we are in a one shot session
                oneshot += 1
                recs_tuples.append((index, impressions))
            else:
                previous_row = df_test.loc[index - 1]
                last_interaction_ref = previous_row["reference"]
                if type(last_interaction_ref) == str and last_interaction_ref.isdigit():
                    last_interaction_ref = int(last_interaction_ref)
                if last_interaction_ref and last_interaction_ref in impressions:
                    i = impressions.index(last_interaction_ref)
                    sorted_impressions = impressions[i:] + impressions[:i]
                    count += 1
                    recs_tuples.append((index, sorted_impressions))
                else:
                    resorted += 1
                    recs_tuples.append((index, impressions))

        print("{} % of resorted session".format(round(resorted / len(df_test_target) * 100, 2)))
        print("{} % of oneshot session".format(round(oneshot / len(df_test_target) * 100, 2)))
        print("{} % of lazy session".format(round(count / len(df_test_target) * 100, 2)))
        self.recs = recs_tuples
        print("Fitting completed!")
    def submission():
        mode = 'full'
        model = interactive_model(mode)
        sub_suffix = input('Insert submission suffix: ')

        checkpoint_path = menu.checkpoint_selection(
            checkpoints_dir='saved_models')

        print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
        model.load(checkpoint_path)
        print('Done!', flush=True)

        # recommend
        target_indices = data.target_indices(mode, 'cluster_recurrent')
        print('Recommending...')
        recommendations = model.recommend_batch(target_indices)

        # create and send sub
        sub_name = f'{model.name}_{sub_suffix}'
        sub_path = out.create_sub(recommendations, submission_name=sub_name)
        print('Done')
        sub.send(sub_path,
                 username='******',
                 password='******')
import utils.menu as menu
from recommenders.recurrent.RNNClassificationRecommender import RNNClassificationRecommender
from utils.dataset import SequenceDatasetForClassification
# the script uses data.* and out.* below; module paths assumed from the rest of the project
import data
import out

if __name__ == "__main__":
    mode = menu.mode_selection()

    dataset = SequenceDatasetForClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification'
    )

    model = RNNClassificationRecommender(dataset,
                                         cell_type='gru',
                                         num_recurrent_units=128,
                                         num_recurrent_layers=3,
                                         num_dense_layers=2,
                                         class_weights=[])

    model.load(
        '/Users/federico/Desktop/rnn_GRU_3layers_128units_2dense_class_06316.h5'
    )
    #model.load('gru.h5')

    target_indices = data.target_indices(mode, 'cluster_recurrent')
    recommendations = model.recommend_batch(target_indices)

    if mode != 'full':
        model.compute_MRR(recommendations)

    out.create_sub(recommendations, submission_name=model.name + '_06316')
def build_user_prop(mode, cluster='no_cluster'):
    def func(x):

        y = x[(x['action_type'] == 'clickout item')]

        # features
        features = {
            'avg price': -1,
            'avg cheap position': -1,
            'avg time per step': 0,
            'session avg length': 0,
            'session avg steps': 0,
            'session num': 0,
            'mobile perc': 0,
            'tablet perc': 0,
            'desktop perc': 0,
            'filters_during_session': '',
            'num change of sort order session': 0,
            'num clickout item session': 0,
            'num filter selection session': 0,
            'num interaction item deals session': 0,
            'num interaction item image session': 0,
            'num interaction item info session': 0,
            'num interaction item rating session': 0,
            'num search for destination session': 0,
            'num search for item session': 0,
            'num search for poi session': 0
        }

        # Compute avg length of sessions in seconds (NB: not considering the session ending at the last clickout!)
        session_grouped = x.groupby("session_id")
        get_length_sum = 0
        step_count = 0

        for name, group in session_grouped:
            tail_temp = group.tail(1)
            get_length_sum += int(tail_temp['timestamp'].values[0]) - int(
                group.head(1)['timestamp'].values[0])
            step_count += int(tail_temp['step'])

        # Compute avg steps per session (NB: not considering the session ending at the last clickout!)
        user_sessions = set(x['session_id'].values)
        avg_steps = round(step_count / len(user_sessions), 2)
        features['session avg steps'] = avg_steps
        features['session num'] = len(user_sessions)
        avg_length = round(get_length_sum / len(user_sessions), 2)
        features['session avg length'] = avg_length
        features['avg time per step'] = round(avg_length / avg_steps, 2)

        # Computing types of non_numeric actions performed by that user in the past
        actions = list(x['action_type'].values)

        for ind in range(len(actions)):
            if ('num ' + actions[ind] + ' session') in features:
                features['num ' + actions[ind] + ' session'] += 1

        # Remove duplicates:
        x.drop(['timestamp', 'step'], axis=1, inplace=True)
        x = x.drop_duplicates()

        if len(y) > 0:
            # Build a record of the prices of interacted items, only when available:
            impressions_prices_available = y[y['impressions'].notnull()][[
                "impressions", "prices"
            ]].drop_duplicates()
            # [13, 43, 4352, 543, 345, 3523] impressions
            # [45, 34, 54, 54, 56, 54] prices
            # -> [(13,45), (43,34), ...]
            # Then create dict
            # {13: 45, 43: 34, ... }

            tuples_impr_prices = []
            tuples_impr_price_pos_asc = []
            for i in impressions_prices_available.index:
                impr = impressions_prices_available.at[i, 'impressions'].split(
                    '|')
                prices = impressions_prices_available.at[i,
                                                         'prices'].split('|')
                tuples_impr_prices += list(zip(impr, prices))

                # rank this clickout's impressions by ascending price
                sorted_impr_prices = sorted(zip(impr, prices),
                                            key=lambda t: float(t[1]))
                tuples_impr_price_pos_asc += [
                    (item, pos)
                    for pos, (item, _) in enumerate(sorted_impr_prices, start=1)
                ]

            tuples_impr_prices = list(set(tuples_impr_prices))
            dict_impr_price = dict(tuples_impr_prices)

            # Create dict for getting position wrt clicked impression based on cheapest item
            tuples_impr_price_pos_asc = list(set(tuples_impr_price_pos_asc))
            dict_impr_price_pos = dict(tuples_impr_price_pos_asc)

            sum_price = 0
            sum_pos_price = 0
            count_interacted = 0

            # IMPORTANT: impressions and clickouts are considered separately;
            # if an impression is also clicked, its price counts double
            df_only_numeric = x[pd.to_numeric(x['reference'],
                                              errors='coerce').notnull()][[
                                                  "reference", "impressions",
                                                  "action_type"
                                              ]].drop_duplicates()

            # Not considering last clickout in the train sessions
            clks_num_reference = df_only_numeric[df_only_numeric['action_type']
                                                 == 'clickout item']
            if len(clks_num_reference) == len(y):  # is it a train session?
                idx_last_clk = y.tail(1).index.values[0]
                df_only_numeric = df_only_numeric.drop(idx_last_clk)

            for idx, row in df_only_numeric.iterrows():
                reference = row.reference
                if reference in dict_impr_price.keys():
                    if row.action_type == "clickout item":
                        sum_price += int(dict_impr_price[reference]) * 2
                        sum_pos_price += int(
                            dict_impr_price_pos[reference]) * 2
                        count_interacted += 2
                    else:
                        sum_price += int(dict_impr_price[reference])
                        sum_pos_price += int(dict_impr_price_pos[reference])
                        count_interacted += 1

            if count_interacted > 0:
                features['avg price'] = round(sum_price / count_interacted, 2)
                features['avg cheap position'] = round(
                    sum_pos_price / count_interacted, 2)
            else:
                features['avg price'] = -1
                features['avg cheap position'] = -1

            # Device percentages features
            tot_clks = len(y)
            features['mobile perc'] = round(
                y[y.device == "mobile"].shape[0] / tot_clks, 2)
            features['tablet perc'] = round(
                y[y.device == "tablet"].shape[0] / tot_clks, 2)
            features['desktop perc'] = round(
                y[y.device == "desktop"].shape[0] / tot_clks, 2)

            # Getting used filters during past clickouts (except during clickout to predict!), then they will be one_hotted
            y_filters = y[(y.current_filters.notnull()) & (y.reference.notnull())]
            for i in y_filters.index:
                features['filters_during_session'] += str(
                    y_filters.at[i, 'current_filters']) + "|"

            x_activating_filters = x[(x.action_type == "filter selection")]
            for i in x_activating_filters.index:
                features['filters_during_session'] += str(
                    x_activating_filters.at[i, 'reference']) + "|"

        return pd.DataFrame.from_records([features])

    def construct_features(df):
        dataset = df.groupby(['user_id']).progress_apply(func)

        one_hot = dataset['filters_during_session'].astype(
            str).str.get_dummies()

        missing = poss_filters - set(one_hot.columns)

        to_drop = set(one_hot.columns) - poss_filters

        for e in missing:
            one_hot[e] = 0
        for e in to_drop:
            one_hot = one_hot.drop([e], axis=1)
        dataset = dataset.drop(['filters_during_session'], axis=1)
        dataset = dataset.join(one_hot)

        return dataset

    def get_user_favorite_filters(full_df, users):
        """
        For every user in the train set, build a one-hot-encoded structure over all
        the properties of the hotels clicked by that user
        (e.g. property: 3 Stars)
        """

        # get clickout of train and merge metadata of the hotel
        train_df = full_df[full_df["user_id"].isin(users)]
        train_df = train_df[(train_df["action_type"] == "clickout item") & (
            pd.to_numeric(train_df['reference'], errors='coerce').notnull())]

        train_df.drop([
            "session_id", "timestamp", "step", "action_type", "platform",
            "city", "device", "current_filters", "impressions", "prices"
        ],
                      axis=1,
                      inplace=True)

        # Merge & eliminate column
        metatadata_one_hot = data.accomodations_one_hot().reset_index()

        train_df['reference'] = train_df['reference'].astype(int)
        metatadata_one_hot['item_id'] = metatadata_one_hot['item_id'].astype(
            int)
        train_df = pd.merge(train_df,
                            metatadata_one_hot,
                            how='outer',
                            left_on='reference',
                            right_on='item_id')

        train_df = train_df.drop(["reference", "item_id"], axis=1)

        print(
            "Finished binarizing, now summing to get each user's favorite hotel properties..."
        )

        out_df = train_df.groupby('user_id')[train_df.columns[2:]].sum()
        return out_df

    # Start trying to compute dataset
    train = data.train_df(mode=mode, cluster=cluster)

    test = data.test_df(mode=mode, cluster=cluster)
    target_indices = data.target_indices(mode=mode, cluster=cluster)
    target_user_id = test.loc[target_indices]['user_id'].values

    full = pd.concat([train, test])
    del train
    del test

    poss_filters = []
    for f in full[~full['current_filters'].isnull()]['current_filters'].values:
        poss_filters += f.split('|')
    poss_filters = set(poss_filters)

    user_fav_filters = get_user_favorite_filters(full, target_user_id)

    # Add suffix in order to distinguish hotel properties from user filters
    user_fav_filters.columns = [
        str(col) + '_hotel' for col in user_fav_filters.columns
    ]

    user_fav_filters.reset_index(inplace=True)

    # Remove duplicates before processing
    full.drop_duplicates(
        subset=["user_id", "session_id", "action_type", "reference"],
        inplace=True)

    # build in chunk
    count_chunk = 0
    chunk_size = 10000000000

    print(
        "{}: Started chunk processing".format("Build Dataset Classification"))
    groups = full.groupby(np.arange(len(full)) // chunk_size)
    for idxs, gr in groups:
        features = construct_features(gr)

        features.reset_index(inplace=True)

        outcome = pd.merge(features,
                           user_fav_filters,
                           how='outer',
                           left_on="user_id",
                           right_on="user_id")

        outcome.drop(["level_1", outcome.columns.values[-1]],
                     axis=1,
                     inplace=True)

        # Convert floats to int
        outcome.iloc[:, -user_fav_filters.
                     shape[1]:] = outcome.iloc[:, -user_fav_filters.
                                               shape[1]:].fillna(0).astype(int)

        if count_chunk == 0:
            outcome.to_csv(
                'dataset/preprocessed/{}/{}/user_properties.csv'.format(
                    cluster, mode))
        else:
            with open(
                    'dataset/preprocessed/{}/{}/user_properties.csv'.format(
                        cluster, mode), 'a') as f:
                outcome.to_csv(f, header=False)

        count_chunk += 1
        print('chunk {} over {} completed'.format(count_chunk, len(groups)))
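Finally, a small, self-contained illustration of the get_dummies step used in construct_features above (the '|'-separated filter strings are invented; Series.str.get_dummies splits on '|' by default):

import pandas as pd

s = pd.Series(['Sort by Price|Free WiFi', 'Free WiFi'])
print(s.str.get_dummies())
#    Free WiFi  Sort by Price
# 0          1              1
# 1          1              0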