def read_feature(self, one_hot=False, create_not_existing_features=True):
        """
        it reads a feature from disk and returns it.
        if one_hot = False, it returns it as was saved.
        if one_hot = True, returns the onehot of the categorical columns, by means of self.columns_to_onehot
        """
        path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
            self.cluster, self.mode, self.name)
        if not os.path.exists(path):

            if create_not_existing_features:
                choice = 'y'
                print('Missing feature: creating')
            else:
                choice = yesno_choice(
                    'The feature \'{}\' does not exist. Do you want to create it?'.format(
                        self.name))
            if choice == 'y':
                self.save_feature()
            else:
                return

        index_col = 0 if self.save_index else None
        df = pd.read_csv(path, index_col=index_col)
        #df = df.drop('Unnamed: 0', axis=1)

        print('{} feature read'.format(self.name))

        # then proceed with one hot
        if one_hot:
            for t in self.columns_to_onehot:
                col = df[t[0]]
                one_hot_prefix = t[2] if len(t) == 3 else t[0]
                if t[1] == 'single':
                    oh = pd.get_dummies(col, prefix=one_hot_prefix)
                elif t[1] == 'binary':
                    ce = BinaryEncoder(cols=t[0])
                    oh = ce.fit_transform(col)
                else:
                    mid = col.apply(lambda x: x.split('|')
                                    if isinstance(x, str) else x)
                    mid.fillna(value='', inplace=True)
                    mlb = MultiLabelBinarizer()
                    oh = mlb.fit_transform(mid)
                    oh = pd.DataFrame(oh, columns=mlb.classes_)
                    oh = oh.astype(np.uint8)
                    oh = oh.add_prefix(one_hot_prefix)

                df = df.drop([t[0]], axis=1)
                df = pd.concat([df, oh], axis=1)

            print('{} onehot completed'.format(self.name))

        df = self.post_loading(df)
        return df
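    # Hedged sketch of the expected self.columns_to_onehot layout (the column names
    # below are illustrative, not taken from the repo). Each entry is a tuple
    # (column_name, kind[, prefix]): kind 'single' uses pd.get_dummies, 'binary' uses
    # category_encoders.BinaryEncoder, and anything else is treated as a pipe-separated
    # multilabel column encoded with MultiLabelBinarizer:
    #
    #   self.columns_to_onehot = [
    #       ('device', 'single'),
    #       ('platform', 'binary', 'plat'),
    #       ('properties', 'multilabel'),
    #   ]
    #   df = feature.read_feature(one_hot=True)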
    def fit(self):
        check_folder('models')
        if self.ask_to_load:
            if os.path.isfile('models/{}.model'.format(self.name)):
                if yesno_choice(
                        'The exact same model has already been created. Want to load it?'
                ) == 'y':
                    self.xg.load_model('models/{}.model'.format(self.name))
                    return

        if self.class_weights:
            X_train, y_train, group, _, weights, _ = data.dataset_xgboost_train(
                mode=self.mode,
                cluster=self.cluster,
                class_weights=self.class_weights,
                kind=self.kind)
        else:
            X_train, y_train, group, _, _ = data.dataset_xgboost_train(
                mode=self.mode,
                cluster=self.cluster,
                class_weights=self.class_weights,
                kind=self.kind)
        print('data for train ready')

        if self.class_weights:
            self.xg.fit(X_train, y_train, group, sample_weight=weights)
        elif self.weights_position:
            bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(
                self.cluster, self.mode, self.kind)
            w = np.load(os.path.join(bp, 'weights_position.npy'))
            # for ranking, weights are per query group: w.size should match the number of groups
            print(w.size)
            print(group.shape)
            self.xg.fit(X_train, y_train, group, sample_weight=w)
        elif self.log_weights:
            bp = 'dataset/preprocessed/{}/{}/xgboost/{}/'.format(
                self.cluster, self.mode, self.kind)
            w = np.load(os.path.join(bp, 'log_weights.npy'))
            print(w.size)
            print(group.shape)
            self.xg.fit(X_train, y_train, group, sample_weight=w)
        else:
            self.xg.fit(X_train, y_train, group)

        print('fit done')
        self.xg.save_model('models/{}.model'.format(self.name))
        print('model saved')
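    # Hedged note: with the XGBoost ranking API, when a `group` array is passed,
    # `sample_weight` is interpreted as one weight per query group, so its length
    # must equal the number of groups (hence the w.size / group.shape prints above).
    # Minimal illustrative example (values are hypothetical):
    #
    #   group = np.array([25, 25, 25])   # three query groups of 25 rows each
    #   w = np.ones(len(group))          # one weight per group
    #   self.xg.fit(X_train, y_train, group, sample_weight=w)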
    def save_feature(self, overwrite_if_exists=None):
        """
        overwrite_if_exists: if True, overwrite without asking; if False, do not overwrite;
        if None, ask before overwriting.
        """
        path = 'dataset/preprocessed/{}/{}/feature/{}/features.csv'.format(
            self.cluster, self.mode, self.name)
        if os.path.exists(path):
            if overwrite_if_exists is None:
                choice = yesno_choice(
                    'The feature \'{}\' already exists. Want to recreate it?'.
                    format(self.name))
                if choice == 'n':
                    return
            elif not overwrite_if_exists:
                return
        df = self.extract_feature()
        check_folder(path)
        df.to_csv(path, index=self.save_index)
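    # Hedged usage sketch: a hypothetical subclass wiring extract_feature() to the
    # save/read round trip above (class and column names are illustrative only):
    #
    #   class SessionLength(FeatureBase):
    #       def extract_feature(self):
    #           return pd.DataFrame({'session_length': [3, 7, 1]})
    #
    #   f = SessionLength(...)
    #   f.save_feature(overwrite_if_exists=True)   # overwrite silently if present
    #   df = f.read_feature()                      # read features.csv back from disk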
    def interactive_model(mode):
        cell_type = menu.single_choice(
            'Choose a network architecture:',
            ['LSTM', 'GRU', 'default architecture'],
            [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
        print()
        if cell_type == 'auto':
            cell_type = 'GRU'
            rec_layers = 1
            dense_layers = 2
            units = 4
            weights = True
        else:
            rec_layers = int(input('Insert number of recurrent layers: '))
            units = int(input('Insert number of units per layer: '))
            dense_layers = int(input('Insert number of dense layers: '))
            weights = menu.yesno_choice('Do you want to use sample weights?',
                                        lambda: True, lambda: None)
            #tb_path = menu.yesno_choice('Do you want to enable Tensorboard?', lambda: 'recommenders/tensorboard', lambda: None)

        pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'],
                                 [lambda: 6, lambda: 12])
        dataset = SequenceDatasetForClassification(
            f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification_p{pad}'
        )

        if weights is not None:
            weights = dataset.get_sample_weights()

        model = RNNClassificationRecommender(
            dataset,
            use_generator=False,
            cell_type=cell_type,
            input_shape=(dataset.rows_per_sample, 168),
            num_recurrent_layers=rec_layers,
            num_recurrent_units=units,
            optimizer='adam',
            num_dense_layers=dense_layers,
            #class_weights=weights
            sample_weights=weights)

        return model
    train_len = len(new)
    old_starting_index = test.index[0]
    new = pd.concat([new, test])
    print("Supersampling ended for mode={}, saving df".format(mode))
    # the first train_len rows of the concatenated df form the (supersampled) train set
    new_train = new.iloc[:train_len]
    new_test = new.iloc[train_len:]
    #    new_starting_index = new_test.index[0]
    #    offset = new_starting_index - old_starting_index
    #    target_indices += offset
    target_indices = data.target_indices(mode, "no_cluster")
    np.save(path + "/" + mode + "/target_indices", target_indices)
    new_train.to_csv(path + "/" + mode + "/train.csv", index=True)
    new_test.to_csv(path + "/" + mode + "/test.csv", index=True)


if __name__ == '__main__':
    if already_existing():
        answer = menu.yesno_choice(
            title='Another supersampling detected, do you want to recreate it?',
            callback_yes=lambda: True,
            callback_no=lambda: False)
        if answer:
            rmtree(path)
        else:
            exit(0)
    os.mkdir(path)
    modes = ["small", "local", "full"]
    for m in modes:
        os.mkdir(path + "/" + m)
        supersampling(m)
# Example 6
        TimePerImpression,
        TimesUserInteractedWithImpression,
        TimingFromLastInteractionImpression,
        TopPopInteractionClickoutPerImpression,
        TopPopPerImpression,
        User2Item,
        #UserFeature
    ]
    



    assert features_array[0] == ImpressionLabel, 'first feature must be the label!'


    choice = menu.yesno_choice('Want the scores?')
    if choice == 'y':
        base_path_stacking = 'scores_stacking'
        stacking_scores_path = ['xgboost_nobias.csv.gz', 'catboost_rank.csv.gz',
                                'rnn_GRU_2layers_64units_2dense_class_nobias_05952.csv.gz',
                                'scores_pairwise_soft_zero_one_loss.csv.gz']
        stacking_scores_path = [f'{base_path_stacking}/{a}' for a in stacking_scores_path]
    else:
        stacking_scores_path = []


    mode = menu.mode_selection()
    cluster = menu.cluster_selection()
    dataset_name = input('Insert dataset name: ')

    choice = menu.single_choice('Select mode', ['normal', 'cv'])
    if choice == 'cv':
# Example 7
def create_full_df(custom_preprocess_function):
    """
    Save the dataframe containing train.csv and test.csv contiguosly with reset indexes. Also save the config file
    containing the number of rows in the original train.csv (max_train_idx). This is used to know which indices
    indicates train rows (idx < max_train_idx) and test rows (idx >= max_train_idx).

    pass a custom preprocess function to personalize the original train and test df from which the creation
    of the full df starts
    """

    train_df, test_df = custom_preprocess_function(data.original_train_df().reset_index(drop=True), \
                                                        data.original_test_df().reset_index(drop=True))

    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # remember the original train length: it is used later to shift the test indices
    len_original_train = train_df.shape[0]
    compressed = menu.yesno_choice(
        title='Do you want the compressed version? (no for the original full)',
        callback_yes=lambda: True,
        callback_no=lambda: False)

    # TRAIN; FIXING DUPLICATED SESSION_ID <-> STEP PAIRS
    print('Fixing wrong duplicated steps in train...')
    train_df = reset_step_for_duplicated_sessions(train_df)

    # TRAIN; MERGING DUPLICATES
    if compressed:
        train_df = merge_duplicates(train_df)
    else:
        train_df["frequence"] = 1

    len_train = train_df.shape[0]
    train_df.to_csv(data.FULL_PATH)
    del train_df

    # save config file
    data.save_config(data.TRAIN_LEN_KEY, len_train)

    # TEST
    with open(data.FULL_PATH, 'a', encoding='utf-8') as f:

        # restore index summing the len of the original train (to be the same as without merging)
        test_df.index += len_original_train

        # TEST; FIXING DUPLICATED SESSION_ID <-> STEP PAIRS
        print('Fixing wrong duplicated steps in test...')
        test_df = reset_step_for_duplicated_sessions(test_df)

        # TEST; MERGING DUPLICATES
        if compressed:
            test_df = merge_duplicates(test_df)
        else:
            test_df["frequence"] = 1

        # TEST; DELETING UNINFORMATIVE INTERACTIONS
        mask = (test_df["action_type"] !=
                "clickout item") & (test_df["reference"].isnull())
        test_df = test_df.drop(test_df[mask].index)

        test_df.to_csv(f, header=False)
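# Hedged sketch of a custom preprocess function accepted by create_full_df.
# The name is illustrative; the repo's own variants (e.g. no_custom_preprocess_function,
# unroll_custom_preprocess_function) follow the same (train_df, test_df) -> (train_df, test_df) signature:
#
#   def identity_preprocess_function(train_df, test_df):
#       # return the original train and test dataframes untouched
#       return train_df, test_df
#
#   create_full_df(identity_preprocess_function)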
# Example 8
def preprocess():
    """
    Preprocess menu

    NOTE: it is required to have the original CSV files in the folder dataset/original
    """
    def _create_csvs():
        print('creating CSV...')

        # create no_cluster/full
        path = 'dataset/preprocessed/no_cluster'
        full = data.full_df()
        train_len = data.read_config()[data.TRAIN_LEN_KEY]

        train = full.iloc[0:train_len]
        test = full.iloc[train_len:len(full)]
        target_indices = get_target_indices(test)

        check_folder('dataset/preprocessed/no_cluster/full')
        train.to_csv(os.path.join(path, 'full/train.csv'))
        test.to_csv(os.path.join(path, 'full/test.csv'))
        np.save(os.path.join(path, 'full/train_indices'), train.index)
        np.save(os.path.join(path, 'full/test_indices'), test.index)
        np.save(os.path.join(path, 'full/target_indices'), target_indices)

        no_of_rows_in_small = int(
            input('How many rows do you want in small.csv? '))
        train_small = get_small_dataset(train,
                                        maximum_rows=no_of_rows_in_small)
        check_folder('dataset/preprocessed/no_cluster/small')
        split(train_small, os.path.join(path, 'small'))

        check_folder('dataset/preprocessed/no_cluster/local')
        split(train, os.path.join(path, 'local'))

        # create item_metadata in preprocess folder
        original_item_metadata = data.accomodations_original_df()
        original_item_metadata.to_csv(data.ITEMS_PATH)

        # append missing accommodations to item metadata
        append_missing_accomodations('full')

    def _preprocess_item_metadata():
        # interactively enable preprocessing function
        labels = ['Remove \'From n stars\' attributes']
        pre_processing_f = [remove_from_stars_features]
        menu_title = 'Choose the preprocessing function(s) to apply to the accommodations.\nPress numbers to enable/disable the options, press X to confirm.'
        activated_prefns = menu.options(pre_processing_f,
                                        labels,
                                        title=menu_title,
                                        custom_exit_label='Confirm')

        # preprocess accommodations dataframe
        preprocess_accomodations_df(activated_prefns)

    def _create_urm_session_aware():
        """
        NOTE: CHANGE THE PARAMETERS OF THE SEQUENCE AWARE URM HERE !!!!
        """
        create_urm.urm_session_aware(mode, cluster, time_weight='lin')

    def _create_urm_clickout():
        """
        NOTE: CHANGE THE PARAMETERS OF THE CLICKOUT_ONLY URM HERE !!!!
        """
        create_urm.urm(mode, cluster, clickout_score=5, impressions_score=1)

    def _merge_sessions():
        print("Merging similar sessions (same user_id and city)")
        print("Loading full_df")
        full_df = data.full_df()
        print("Sorting, grouping, and other awesome things")
        grouped = full_df.sort_values(["user_id", "timestamp"],
                                      ascending=[True, True]).groupby(
                                          ["user_id", "city"])
        new_col = np.array(["" for _ in range(len(full_df))], dtype=object)
        print("Now I'm really merging...")
        for name, g in tqdm(grouped):
            s_id = g.iloc[0]["session_id"]
            new_col[g.index.values] = s_id
        print("Writing on the df")
        full_df["unified_session_id"] = pd.Series(new_col)
        print("Saving new df to file")
        with open(data.FULL_PATH, 'w', encoding='utf-8') as f:
            full_df.to_csv(f)
        data.refresh_full_df()

    print("Hello buddy... Copenaghen is waiting...")
    print()

    # create full_df.csv
    # pick your custom preprocessing function

    # original
    # funct = no_custom_preprocess_function

    # unroll
    funct = unroll_custom_preprocess_function

    check_folder(data.FULL_PATH)
    if os.path.isfile(data.FULL_PATH):
        menu.yesno_choice('An old full dataframe has been found. Do you want to delete it and recreate it?', \
            callback_yes=(lambda: create_full_df(funct)))
    else:
        print('The full dataframe (index master) is missing. Creating it...',
              end=' ',
              flush=True)
        create_full_df(funct)
        print('Done!')

    # merge similar sessions
    menu.yesno_choice(
        title=
        'Do you want to merge similar sessions (adding unified_session_id)?',
        callback_yes=_merge_sessions)

    # create CSV files
    menu.yesno_choice(title='Do you want to create the CSV files?',
                      callback_yes=_create_csvs)

    # preprocess item_metadata
    menu.yesno_choice(title='Do you want to preprocess the item metadata?',
                      callback_yes=_preprocess_item_metadata)

    # create ICM
    menu.yesno_choice(title='Do you want to create the ICM matrix files?',
                      callback_yes=create_icm.create_ICM)

    # create URM
    lbls = [
        'Create URM from LOCAL dataset', 'Create URM from FULL dataset',
        'Create URM from SMALL dataset', 'Skip URM creation'
    ]
    callbacks = [lambda: 'local', lambda: 'full', lambda: 'small', lambda: 0]
    res = menu.single_choice(title='What do you want to do?',
                             labels=lbls,
                             callbacks=callbacks)

    if res is None:
        exit(0)

    if res != 0:
        # initialize the train and test dataframes
        mode = res

        # get the cluster
        print('for which cluster do you want to create the URM ???')
        cluster = input()
        callbacks = [_create_urm_session_aware, _create_urm_clickout]
        menu.single_choice(title='Which URM do you want to create, buddy?',
                           labels=['Sequence-aware URM', 'Clickout URM'],
                           callbacks=callbacks)

    return
if __name__ == "__main__":

    mode = menu.mode_selection()
    #cluster_name = 'cluster_recurrent'
    cluster = menu.single_choice(
        'Which cluster?',
        ['cluster recurrent', 'cluster len <= 6', 'cluster len > 6'],
        callbacks=[
            lambda: ClusterRecurrent, lambda: ClusterUpToLen6,
            lambda: ClusterOverLen6
        ])
    c = cluster()

    # create the cluster
    cluster_choice = menu.yesno_choice('Do you want to create the cluster?',
                                       lambda: True, lambda: False)
    if cluster_choice:
        c.save(mode)
        print()

    only_test = False
    if mode != 'small':
        only_test = menu.yesno_choice(
            'Do you want to create only the test dataset?', lambda: True,
            lambda: False)

    binary_dataset = menu.single_choice('Which dataset?',
                                        ['Standard', 'Binary'], [False, True])

    sess_length = int(
        input(
# Example 10
        [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
    print()
    if cell_type == 'auto':
        cell_type = 'GRU'
        epochs = 1
        rec_layers = 1
        dense_layers = 2
        units = 4
        tb_path = None
    else:
        epochs = int(input('Insert number of epochs: '))
        rec_layers = int(input('Insert number of recurrent layers: '))
        units = int(input('Insert number of units per layer: '))
        dense_layers = int(input('Insert number of dense layers: '))
        tb_path = menu.yesno_choice('Do you want to enable Tensorboard?',
                                    lambda: 'recommenders/tensorboard',
                                    lambda: None)

    dataset = SequenceDatasetForRegression(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_regression')

    model = RNNRegressionRecommender(dataset,
                                     use_generator=False,
                                     cell_type=cell_type,
                                     num_recurrent_layers=rec_layers,
                                     num_recurrent_units=units,
                                     num_dense_layers=dense_layers)
    model.fit(epochs=epochs)

    print('\nFit completed!')