Example #1
    def train():
        mode = menu.mode_selection()

        # build the model
        opt = menu.single_choice('Optimizer?', ['Adam', 'RMSProp'],
                                 ['adam', 'rmsprop'])
        lr = menu.single_choice('Learning rate?', ['e-3', 'e-4', 'e-5'],
                                [1e-3, 1e-4, 1e-5])
        if opt == 'adam':
            optim = keras.optimizers.Adam(lr=lr)
        else:
            optim = keras.optimizers.RMSprop(lr=lr)

        model = interactive_model(mode, optim=optim)

        # fit the model
        model.fit(epochs=10000)
        print('\nFit completed!')

        best_accuracy = np.max(model.history.history['val_acc'])

        model.save(folderpath='saved_models/',
                   suffix='_{}'.format(round(best_accuracy,
                                             5)).replace('.', ''))

        # evaluate
        report = model.evaluate()
        bot.send_message(report, account='parro')

        print('Opt: {}'.format(opt))
        print('Lr: {}'.format(lr))
Example #2
    def interactive_model(mode):
        cell_type = menu.single_choice(
            'Choose a network architecture:',
            ['LSTM', 'GRU', 'default architecture'],
            [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
        print()
        if cell_type == 'auto':
            cell_type = 'GRU'
            rec_layers = 1
            dense_layers = 2
            units = 4
            weights = True
        else:
            rec_layers = int(input('Insert number of recurrent layers: '))
            units = int(input('Insert number of units per layer: '))
            dense_layers = int(input('Insert number of dense layers: '))
            weights = menu.yesno_choice('Do you want to use sample weights?',
                                        lambda: True, lambda: None)
            #tb_path = menu.yesno_choice('Do you want to enable Tensorboard?', lambda: 'recommenders/tensorboard', lambda: None)

        pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'],
                                 [lambda: 6, lambda: 12])
        dataset = SequenceDatasetForClassification(
            f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_classification_p{pad}'
        )

        if weights is not None:
            weights = dataset.get_sample_weights()

        model = RNNClassificationRecommender(
            dataset,
            use_generator=False,
            cell_type=cell_type,
            input_shape=(dataset.rows_per_sample, 168),
            num_recurrent_layers=rec_layers,
            num_recurrent_units=units,
            optimizer='adam',
            num_dense_layers=dense_layers,
            #class_weights=weights
            sample_weights=weights)

        return model
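All of these snippets lean on the interactive helpers in utils.menu, whose source is not included on this page. The sketch below is a hypothetical reconstruction of the API they appear to assume, inferred purely from the call sites above (the real utils.menu may differ): single_choice returns the chosen label or, when a callbacks list is given, the chosen entry, invoking it first if it is callable; yesno_choice returns 'y'/'n' or the result of the chosen callback.

    # Hypothetical reconstruction of utils.menu, inferred from usage only.
    def single_choice(title, labels, callbacks=None):
        print(title)
        for i, label in enumerate(labels):
            print('{}) {}'.format(i + 1, label))
        idx = int(input('> ')) - 1
        if callbacks is None:
            return labels[idx]
        chosen = callbacks[idx]
        # callbacks may be plain values (e.g. 'adam', 1e-3) or lambdas
        return chosen() if callable(chosen) else chosen

    def yesno_choice(title, callback_yes=None, callback_no=None):
        answer = input('{} (y/n): '.format(title)).strip().lower()
        if answer == 'y':
            return callback_yes() if callback_yes is not None else 'y'
        return callback_no() if callback_no is not None else 'n'

    def mode_selection():
        # the three modes seen across these examples
        return single_choice('Which mode?', ['full', 'local', 'small'])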
Example #3
    def create_feature():
        mode = 'full'
        model = interactive_model(mode)

        model_checkpoints = os.listdir('saved_models')
        checkpoint_path = menu.single_choice('Choose the model checkpoint:',
                                             model_checkpoints)
        checkpoint_path = os.path.join('saved_models', checkpoint_path)

        print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
        model.load(checkpoint_path)
        print('Done!', flush=True)

        print('Creating feature for {}...'.format(mode))
        model.create_feature()
Example #4
    def interactive_model(mode, optim='adam'):
        pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'],
                                 [lambda: 6, lambda: 12])
        dataset = SequenceDatasetForBinaryClassification(
            f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_binary_classification_p{pad}'
        )

        weights = dataset.get_class_weights()

        model = RNNBinaryClassificator(dataset,
                                       input_shape=(dataset.rows_per_sample,
                                                    168),
                                       cell_type='gru',
                                       num_recurrent_layers=2,
                                       num_recurrent_units=64,
                                       num_dense_layers=2,
                                       class_weights=weights,
                                       optimizer=optim)

        return model
Example #5
        TopPopPerImpression,
        User2Item,
        #UserFeature
    ]

    assert features_array[0] == ImpressionLabel, 'first feature must be the label!'

    choice = menu.yesno_choice('want the scores?')
    if choice == 'y':
        base_path_stacking = 'scores_stacking'
        stacking_scores_path = ['xgboost_nobias.csv.gz',
                                'catboost_rank.csv.gz',
                                'rnn_GRU_2layers_64units_2dense_class_nobias_05952.csv.gz',
                                'scores_pairwise_soft_zero_one_loss.csv.gz']
        stacking_scores_path = [f'{base_path_stacking}/{a}' for a in stacking_scores_path]
    else:
        stacking_scores_path = []


    mode = menu.mode_selection()
    cluster = menu.cluster_selection()
    dataset_name = input('insert dataset name\n')

    choice = menu.single_choice('select mode', ['normal', 'cv'])
    if choice == 'cv':
        create_dataset_cv(mode, cluster, features_array, dataset_name, k=5)
    else:
        create_dataset(mode, cluster, features_array, dataset_name, stacking_scores_path)
Example #6
        model.save(folderpath='saved_models/',
                   suffix='_{}'.format(round(best_accuracy,
                                             5)).replace('.', ''))

        # evaluate
        report = model.evaluate()
        bot.send_message(report, account='parro')

        print('Opt: {}'.format(opt))
        print('Lr: {}'.format(lr))

    def create_feature():
        mode = 'full'
        model = interactive_model(mode)

        model_checkpoints = os.listdir('saved_models')
        checkpoint_path = menu.single_choice('Choose the model checkpoint:',
                                             model_checkpoints)
        checkpoint_path = os.path.join('saved_models', checkpoint_path)

        print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
        model.load(checkpoint_path)
        print('Done!', flush=True)

        print('Creating feature for {}...'.format(mode))
        model.create_feature()

    activity = menu.single_choice('What do you want to do?',
                                  ['Train', 'Create feature'],
                                  [train, create_feature])
Example #7
            #'sample_weights': dataset.get_sample_weights()
        }
        fit_params = {'epochs': int(input('Insert number of epochs: '))}

        kfscorer = KFoldScorer(model_class=RNNClassificationRecommender, init_params=init_params, k=5)

        kfscorer.fit_predict(dataset, multithreading=True, fit_params=fit_params)

    def scores_bin():
        dataset = DatasetScoresBinaryClassification(f'dataset/preprocessed/cluster_recurrent/small/dataset_binary_classification_p6')

        init_params = {
            'dataset': dataset,
            'input_shape': (6,168),
            'cell_type': 'gru',
            'num_recurrent_layers': 2,
            'num_recurrent_units': 64,
            'num_dense_layers': 2,
            'optimizer': 'adam',
            #'class_weights': dataset.get_class_weights(),
            'sample_weights': dataset.get_sample_weights()
        }
        fit_params = {'epochs': int(input('Insert number of epochs: '))}

        kfscorer = KFoldScorer(model_class=RNNBinaryClassificator, init_params=init_params, k=5)

        kfscorer.fit_predict(dataset, multithreading=True, fit_params=fit_params)

    menu.single_choice('Which model?', ['RNN', 'RNN binary'], [scores_rnn, scores_bin])
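KFoldScorer itself is project code and is not shown on this page. Conceptually it computes out-of-fold scores: the data is split into k folds, a fresh model is fit on k-1 of them and scores the held-out fold, so every sample is scored by a model that never saw it during training. A rough sketch of that idea follows, with hypothetical model and dataset interfaces (model_class/init_params/fit_params mirror the snippet above; X, y and predict are assumptions); scores built this way are presumably how stacking files like those in Example #5 are produced:

    import numpy as np
    from sklearn.model_selection import KFold

    def kfold_out_of_fold_scores(model_class, init_params, X, y, k=5, fit_params=None):
        fit_params = fit_params or {}
        scores = np.zeros(len(X))
        for train_idx, valid_idx in KFold(n_splits=k, shuffle=True).split(X):
            model = model_class(**init_params)   # fresh model per fold
            model.fit(X[train_idx], y[train_idx], **fit_params)
            scores[valid_idx] = np.ravel(model.predict(X[valid_idx]))
        return scores   # out-of-fold scores, safe to use as stacking features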
Example #8
            result_predictions.append((index, ordered_impressions))

        print('Predictions created!')

        return result_predictions

    def get_scores_batch(self):
        return None


if __name__ == "__main__":
    import utils.menu as menu

    mode = menu.mode_selection()
    cell_type = menu.single_choice(
        'Choose a network architecture:',
        ['LSTM', 'GRU', 'default architecture'],
        [lambda: 'LSTM', lambda: 'GRU', lambda: 'auto'])
    print()
    if cell_type == 'auto':
        cell_type = 'GRU'
        epochs = 1
        rec_layers = 1
        dense_layers = 2
        units = 4
        tb_path = None
    else:
        epochs = int(input('Insert number of epochs: '))
        rec_layers = int(input('Insert number of recurrent layers: '))
        units = int(input('Insert number of units per layer: '))
        dense_layers = int(input('Insert number of dense layers: '))
        tb_path = menu.yesno_choice('Do you want to enable Tensorboard?',
                                    lambda: 'recommenders/tensorboard',
                                    lambda: None)
Example #9
def preprocess():
    """
    Preprocess menu

    NOTE: it is required to have the original CSV files in the folder dataset/original
    """
    def _create_csvs():
        print('creating CSV...')

        # create no_cluster/full
        path = 'dataset/preprocessed/no_cluster'
        full = data.full_df()
        train_len = data.read_config()[data.TRAIN_LEN_KEY]

        train = full.iloc[0:train_len]
        test = full.iloc[train_len:len(full)]
        target_indices = get_target_indices(test)

        check_folder('dataset/preprocessed/no_cluster/full')
        train.to_csv(os.path.join(path, 'full/train.csv'))
        test.to_csv(os.path.join(path, 'full/test.csv'))
        np.save(os.path.join(path, 'full/train_indices'), train.index)
        np.save(os.path.join(path, 'full/test_indices'), test.index)
        np.save(os.path.join(path, 'full/target_indices'), target_indices)

        no_of_rows_in_small = int(
            input('How many rows do you want in small.csv? '))
        train_small = get_small_dataset(train,
                                        maximum_rows=no_of_rows_in_small)
        check_folder('dataset/preprocessed/no_cluster/small')
        split(train_small, os.path.join(path, 'small'))

        check_folder('dataset/preprocessed/no_cluster/local')
        split(train, os.path.join(path, 'local'))

        # create item_metadata in preprocess folder
        original_item_metadata = data.accomodations_original_df()
        original_item_metadata.to_csv(data.ITEMS_PATH)

        # append missing accomodations to item metadata
        append_missing_accomodations('full')

    def _preprocess_item_metadata():
        # interactively enable preprocessing function
        labels = ['Remove \'From n stars\' attributes']
        pre_processing_f = [remove_from_stars_features]
        menu_title = 'Choose the preprocessing function(s) to apply to the accomodations.\nPress numbers to enable/disable the options, press X to confirm.'
        activated_prefns = menu.options(pre_processing_f,
                                        labels,
                                        title=menu_title,
                                        custom_exit_label='Confirm')

        # preprocess accomodations dataframe
        preprocess_accomodations_df(activated_prefns)

    def _create_urm_session_aware():
        """
        NOTE: CHANGE THE PARAMETERS OF THE SEQUENCE AWARE URM HERE !!!!
        """
        create_urm.urm_session_aware(mode, cluster, time_weight='lin')

    def _create_urm_clickout():
        """
        NOTE: CHANGE THE PARAMETERS OF THE CLICKOUT_ONLY URM HERE !!!!
        """
        create_urm.urm(mode, cluster, clickout_score=5, impressions_score=1)

    def _merge_sessions():
        print("Merging similar sessions (same user_id and city)")
        print("Loading full_df")
        full_df = data.full_df()
        print("Sorting, grouping, and other awesome things")
        grouped = full_df.sort_values(["user_id", "timestamp"],
                                      ascending=[True, True]).groupby(
                                          ["user_id", "city"])
        new_col = np.array(["" for _ in range(len(full_df))], dtype=object)
        print("Now I'm really merging...")
        for name, g in tqdm(grouped):
            s_id = g.iloc[0]["session_id"]
            new_col[g.index.values] = s_id
        print("Writing on the df")
        full_df["unified_session_id"] = pd.Series(new_col)
        print("Saving new df to file")
        with open(data.FULL_PATH, 'w', encoding='utf-8') as f:
            full_df.to_csv(f)
        data.refresh_full_df()

    print("Hello buddy... Copenaghen is waiting...")
    print()

    # create full_df.csv
    # pick your custom preprocessing function

    # original
    # funct = no_custom_preprocess_function

    # unroll
    funct = unroll_custom_preprocess_function

    check_folder(data.FULL_PATH)
    if os.path.isfile(data.FULL_PATH):
        menu.yesno_choice('An old full dataframe has been found. Do you want to delete it and create again?', \
            callback_yes=(lambda: create_full_df(funct)))
    else:
        print('The full dataframe (index master) is missing. Creating it...',
              end=' ',
              flush=True)
        create_full_df(funct)
        print('Done!')

    # create CSV files
    menu.yesno_choice(
        title=
        'Do you want to merge similar sessions (adding unified_session_id)?',
        callback_yes=_merge_sessions)

    # create CSV files
    menu.yesno_choice(title='Do you want to create the CSV files?',
                      callback_yes=_create_csvs)

    # preprocess item_metadata
    menu.yesno_choice(title='Do you want to preprocess the item metadata?',
                      callback_yes=_preprocess_item_metadata)

    # create ICM
    menu.yesno_choice(title='Do you want to create the ICM matrix files?',
                      callback_yes=create_icm.create_ICM)

    # create URM
    lbls = [
        'Create URM from LOCAL dataset', 'Create URM from FULL dataset',
        'Create URM from SMALL dataset', 'Skip URM creation'
    ]
    callbacks = [lambda: 'local', lambda: 'full', lambda: 'small', lambda: 0]
    res = menu.single_choice(title='What do you want to do?',
                             labels=lbls,
                             callbacks=callbacks)

    if res is None:
        exit(0)

    if res != 0:
        # initialize the train and test dataframes
        mode = res

        # get the cluster
        print('For which cluster do you want to create the URM?')
        cluster = input()
        callbacks = [_create_urm_session_aware, _create_urm_clickout]
        menu.single_choice(title='Which URM do you want to create, buddy?',
                           labels=['Sequence-aware URM', 'Clickout URM'],
                           callbacks=callbacks)

    return
Example #10
                              mode,
                              cluster,
                              TRAIN_LEN,
                              TEST_LEN,
                              rows_per_sample=pad_sessions_length)
    #X_sparse_cols=x_sparse_cols, Y_sparse_cols=ref_classes)


if __name__ == "__main__":

    mode = menu.mode_selection()
    #cluster_name = 'cluster_recurrent'
    cluster = menu.single_choice(
        'Which cluster?',
        ['cluster recurrent', 'cluster len <= 6', 'cluster len > 6'],
        callbacks=[
            lambda: ClusterRecurrent, lambda: ClusterUpToLen6,
            lambda: ClusterOverLen6
        ])
    c = cluster()

    # create the cluster
    cluster_choice = menu.yesno_choice('Do you want to create the cluster?',
                                       lambda: True, lambda: False)
    if cluster_choice:
        c.save(mode)
        print()

    only_test = False
    if mode != 'small':
        only_test = menu.yesno_choice(
            'Do you want to create only the test dataset?', lambda: True,
            lambda: False)
Example #11
from recommenders.lightGBM import lightGBM
from skopt import gp_minimize
from skopt import dummy_minimize
from skopt.space import Real, Integer, Categorical
from skopt.utils import use_named_args
import data
from utils.menu import single_choice

class OptimizerWrapper:
    def __init__(self, recommender_class, mode, cluster, dataset_name):
        self.space, self.objective = recommender_class.get_optimize_params(mode, cluster, dataset_name)

    def optimize_bayesian(self):
        best_param = gp_minimize(self.objective, self.space, n_random_starts=10, n_calls=100)
        print(best_param)

    def optimize_random(self):
        best_param = dummy_minimize(self.objective, self.space, n_calls=1000)
        print(best_param)


if __name__ == '__main__':
    opt_technique = single_choice('optimization technique', ['bayesian', 'random'])
    mode = single_choice('insert mode:', ['local', 'small'])
    cluster = single_choice('insert cluster', ['no_cluster'])
    dataset_name = input('insert the dataset name: ')
    opt = OptimizerWrapper(lightGBM, mode=mode, cluster=cluster, dataset_name=dataset_name)
    if opt_technique == 'bayesian':
        opt.optimize_bayesian()
    else:
        opt.optimize_random()
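
# get_optimize_params is defined on the recommender class and is not shown
# here; the Real/Integer/Categorical and use_named_args imports hint at its
# shape. A standalone sketch of how a skopt space and objective are usually
# wired together (dummy objective and parameter names, not the project's):
space = [
    Real(0.01, 0.3, name='learning_rate', prior='log-uniform'),
    Integer(16, 256, name='num_leaves'),
    Categorical(['gbdt', 'dart'], name='boosting_type'),
]

@use_named_args(space)
def objective(learning_rate, num_leaves, boosting_type):
    # stand-in for the real fit + validate step: return the value to
    # minimize (e.g. -MRR, since gp_minimize minimizes)
    return (learning_rate - 0.1) ** 2 + abs(num_leaves - 64) / 1000.0

best = gp_minimize(objective, space, n_random_starts=10, n_calls=30)
print(best.x, best.fun)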

Example #12
    # save the dataset config file that stores dataset length and the list of sparse columns
    features_cols = list(data.accomodations_one_hot().columns) if add_item_features else []    
    x_sparse_cols = devices_classes + actions_classes + features_cols
    datasetconfig.save_config(path, mode, cluster, TRAIN_LEN, TEST_LEN, train_name=TRAIN_NAME,
                            rows_per_sample=pad_sessions_length,
                            X_sparse_cols=x_sparse_cols, Y_sparse_cols=features_cols)


if __name__ == "__main__":
    mode = menu.mode_selection()
    #cluster_name = 'cluster_recurrent'
    cluster = menu.single_choice('Which cluster?', ['cluster recurrent','cluster recurrent len <= 6'],
                                    callbacks=[lambda: ClusterRecurrent, lambda: ClusterRecurrentUpToLen6])
    c = cluster()

    # create the cluster
    cluster_choice = menu.yesno_choice('Do you want to create the cluster?', lambda: True, lambda: False)
    if cluster_choice:
        print('Creating the cluster...')
        c.save(mode)
        print()

    only_test = False
    if mode != 'small':
        only_test = menu.yesno_choice('Do you want to create only the test dataset?', lambda: True, lambda: False)
    
    sess_length = int(input('Insert the desired session length (-1 to not pad/truncate the sessions): '))
Example #13
                 username='******',
                 password='******')

    def scores():
        mode = 'full'
        model = interactive_model(mode)

        checkpoint_path = menu.checkpoint_selection(
            checkpoints_dir='saved_models')

        print('Loading {}...'.format(checkpoint_path), end='\r', flush=True)
        model.load(checkpoint_path)
        print('Done!', flush=True)

        # get scores for train and test and save them
        scores_folder = 'scores'
        check_folder(scores_folder)

        for st in ['train', 'test']:
            print('Building scores for {} {}...'.format(st, mode))
            scores = model.get_scores_batch(scores_type=st)

            print('Saving scores for {} {}...'.format(st, mode))
            scores_filename = '{}_scores_{}'.format(model.name, st)
            np.save(os.path.join(scores_folder, scores_filename),
                    np.array(scores))

    activity = menu.single_choice('What do you want to do?',
                                  ['Train', 'Submission', 'Scores'],
                                  [train, submission, scores])
Example #14
def _mrr(y_pred, dtrain):
    # XGBoost custom-eval signature is (preds, dtrain): the first argument
    # holds the model scores, the second the DMatrix carrying the labels.
    y_true = dtrain.get_label()
    l = memoryview(np.array(y_true, dtype=np.int32))
    p = memoryview(np.array(y_pred, dtype=np.float32))
    g = memoryview(np.array(_group_t, dtype=np.int32))
    mrr = mrr_cython(l, p, g, len(_group_t))
    return 'MRR', -mrr
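
# mrr_cython is a compiled helper that is not part of this listing. For
# reference, the metric it evaluates (mean reciprocal rank over query groups)
# can be sketched in plain numpy as below, assuming exactly one positive
# label per group:
def mean_reciprocal_rank(labels, preds, group_sizes):
    # labels: 1 for the clicked impression, 0 otherwise
    # preds: model scores, higher means ranked earlier
    # group_sizes: number of impressions per session/query
    reciprocal_ranks = []
    start = 0
    for size in group_sizes:
        l = np.asarray(labels[start:start + size])
        s = np.asarray(preds[start:start + size])
        order = np.argsort(-s)                          # sort descending by score
        rank = int(np.where(l[order] == 1)[0][0]) + 1   # 1-based rank of the positive
        reciprocal_ranks.append(1.0 / rank)
        start += size
    return float(np.mean(reciprocal_ranks))
# _mrr above returns -mrr, presumably so the training loop can treat the
# metric as a quantity to minimize.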


if __name__ == '__main__':
    from utils.menu import mode_selection
    from utils.menu import cluster_selection
    from utils.menu import single_choice
    from utils.menu import options

    modality = single_choice('smart evaluate or normal recommender?',
                             ['smart evaluate', 'normal recommender'])

    if modality == 'normal recommender':
        kind = input('pick the kind: ')
        mode = mode_selection()
        cluster = cluster_selection()
        sel = options(['evaluate', 'export the sub', 'export the scores'],
                      ['evaluate', 'export the sub', 'export the scores'],
                      'what do you want to do after model fitting and the recommendations?')

        model = XGBoostWrapper(mode=mode, cluster=cluster, kind=kind)
        if 'export the sub' in sel and 'export the scores' in sel:
            model.run(export_sub=True, export_scores=True)
        elif 'export the sub' in sel and 'export the scores' not in sel:
            model.run(export_sub=True, export_scores=False)
Example #15
        ##PlatformReferencePercentageOfInteractions,
        PriceQuality,
        # RefPopAfterFirstPosition,
        SessionActionNumRefDiffFromImpressions,
        SessionDevice,
        SessionFilterActiveWhenClickout,
        SessionLength,
        # SessionNumClickouts,
        # SessionNumFilterSel,
        # SessionNumInterItemImage,
        # SessionNumNotNumeric,
        SessionSortOrderWhenClickout,
        StatisticsTimeFromLastAction,
        TimePerImpression,
        TimesUserInteractedWithImpression,
        TimingFromLastInteractionImpression,
        TopPopInteractionClickoutPerImpression,
        TopPopPerImpression,
        User2Item,
        #UserFeature
    ]

    mode = single_choice('select mode:', ['full', 'local', 'small'])
    cluster = single_choice('select cluster:', ['no_cluster'])
    #dataset_name=single_choice('select dataset name:',['prova', 'dataset1', 'dataset2', 'old'])
    dataset_name = input('insert the dataset name:\n')
    create_lightGBM_dataset(mode=mode,
                            cluster=cluster,
                            features_array=features_array,
                            dataset_name=dataset_name)
Example #16
    _OUTPUT_DIR = f'{_BASE_PATH}/output_dir_{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M")}'
    cf.check_folder(_OUTPUT_DIR)

    flags_dict['output_dir'] = _OUTPUT_DIR
    # let the user choose the params
    train_batch_size = int(input('insert train batch size: '))
    learning_rate = float(input('insert learning rate: '))
    dropout_rate = float(input('insert dropout rate: '))
    hidden_layer_dims = input(
        'insert hidden layer dims as numbers separated by spaces: ').split(' ')
    loss = menu.single_choice('select the loss', [
        'pairwise_hinge_loss', 'pairwise_logistic_loss',
        'pairwise_soft_zero_one_loss', 'softmax_loss',
        'sigmoid_cross_entropy_loss', 'mean_squared_loss', 'list_mle_loss',
        'approx_ndcg_loss'
    ])

    group_size = int(input('insert the group_size:\n'))

    # update flag dict
    flags_dict['train_batch_size'] = train_batch_size

    flags_dict['learning_rate'] = learning_rate
    flags_dict['dropout_rate'] = dropout_rate
    flags_dict['hidden_layer_dims'] = hidden_layer_dims
    flags_dict['group_size'] = group_size
    flags_dict['loss'] = loss

    if _MODE == 'full':
Example #17
import numpy as np
from numpy.linalg import norm as L2Norm
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from tqdm import tqdm

import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, GRU, Embedding, Dropout, TimeDistributed
from keras.callbacks import EarlyStopping

if __name__ == "__main__":
    import utils.menu as menu
    tqdm.pandas()

    mode = menu.mode_selection()
    opt = menu.single_choice('Optimizer?', ['Adam', 'RMSProp'],
                             ['adam', 'rmsprop'])
    lr = menu.single_choice('Learning rate?', ['e-3', 'e-4', 'e-5'],
                            [1e-3, 1e-4, 1e-5])

    pad = menu.single_choice('Which dataset?', ['Padded 6', 'Padded 12'],
                             [lambda: 6, lambda: 12])
    dataset = SequenceDatasetForBinaryClassification(
        f'dataset/preprocessed/cluster_recurrent/{mode}/dataset_binary_classification_p{pad}'
    )

    print('Loading data...')
    x, y = dataset.load_Xtrain(), dataset.load_Ytrain()
    x, y = shuffle(x, y)
    print()

    perc = np.sum(y) / len(y)
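
    # perc is the fraction of positive samples, i.e. a measure of class
    # imbalance. A hand-rolled 'balanced' weighting (a sketch following
    # sklearn's n_samples / (n_classes * n_in_class) heuristic, not the
    # project's dataset.get_class_weights) would look like:
    n_pos = int(np.sum(y))
    class_weights = {0: len(y) / (2.0 * (len(y) - n_pos)),
                     1: len(y) / (2.0 * n_pos)}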