Example 1
def main():
    for d_name in args.d_name:
        dataset = loaddata_utils.load_data(d_name)
        X, y, problem = dataset['full']['X'], dataset['full']['y'], dataset['problem']

        if 'xgb' in args.model_name:
            model = get_xgb_model(args.model_name, problem)
        else:
            model = get_ebm_model(args.model_name, problem, random_state=1377)

        # search the hyperparameters below with 5-fold stratified shuffle splits
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.15, random_state=1377)

        param_grid = [
            {'min_child_weight': [0., 0.5, 1., 2., 3.]},
            {'learning_rate': [0.5, 0.2, 0.1, 0.05]},
            {'reg_lambda': [1.0, 0.1, 0.01, 0.]},
        ]

        # with parallel_backend('threading'):
        cv_model = GridSearchCV(model, param_grid=param_grid, n_jobs=5, scoring='roc_auc', cv=cv, refit=False)
        cv_model.fit(X, y)
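Since the grid search above runs with refit=False, GridSearchCV exposes no best_estimator_; a minimal sketch of reading the winning setting out of cv_results_ (assuming numpy is imported as np, as in the other examples) might look like this:

# cv_model is the fitted GridSearchCV from the example above; each dict in
# param_grid is searched independently, so cv_results_ holds one row per candidate.
best_idx = np.argmax(cv_model.cv_results_['mean_test_score'])
print('best params:', cv_model.cv_results_['params'][best_idx])
print('mean ROC-AUC: %.4f' % cv_model.cv_results_['mean_test_score'][best_idx])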
Example 2
def convert_old_model(model, d_name):
    if hasattr(model, 'cat_columns'):
        return model

    from loaddata_utils import load_data
    X = load_data(d_name)['full']['X']
                    
    if isinstance(model, (MyFLAMClassifier, MyFLAMRegressor,
                          MyRSplineClassifier, MyRSplineRegressor)):
        # get the old dataframe, transform it and set it back
        df = model.get_GAM_plot_dataframe()
        model.GAM_plot_dataframe = model.revert_dataframe(df)

    if isinstance(model, (MyBaggingRegressor, MyBaggingClassifier)):
        model.not_revert = True

    model.cat_columns = X.columns[X.dtypes == object].values.tolist()

    if isinstance(model, LabelEncodingFitMixin):
        raise Exception('Should just discard these models!')
    return model
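A possible way to call convert_old_model on a previously saved model; the pickle path and the dataset name below are illustrative assumptions, not taken from the original code:

import pickle

# Hypothetical file and dataset name, used only to illustrate the call.
with open('results/adult-flam-0.pkl', 'rb') as fp:
    old_model = pickle.load(fp)

new_model = convert_old_model(old_model, 'adult')

with open('results/adult-flam-0.pkl', 'wb') as fp:
    pickle.dump(new_model, fp)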
Example 3
def main():
    if not os.path.exists(os.path.join(args.output_dir, args.identifier)):
        os.mkdir(os.path.join(args.output_dir, args.identifier))

    csv_path = os.path.join('./results/', '%s.csv' % args.identifier)
    if not os.path.exists('./results'):
        os.mkdir('./results/')

    curr_content_lookup = None
    if os.path.exists(csv_path):
        curr_content_lookup = pd.read_csv(csv_path).set_index(
            ['d_name', 'model_name', 'split_idx']).sort_index()

    for d_name in args.d_name:
        global dataset  # to make it accessible in the function get_model()
        dataset = load_data(d_name)
        print(d_name)

        # Handle the spline lam parameters. Reset for every dataset.
        args.lam = None

        X, y, problem = dataset['full']['X'], dataset['full']['y'], dataset['problem']
        test_size = args.test_size

        args.split_cls = StratifiedShuffleSplit if problem == 'classification' else ShuffleSplit
        train_test_ss = args.split_cls(n_splits=args.n_splits,
                                       test_size=test_size,
                                       random_state=args.random_state)

        idxes_generator = train_test_ss.split(X, y)
        for split_idx, (train_idx, test_idx) in enumerate(idxes_generator):
            if split_idx < args.start_split:
                continue

            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
            # print('y_train mean:', np.mean(y_train), 'y_test mean:', np.mean(y_test))
            # print('y_train shape:', y_train.shape, 'y_test shape:', y_test.shape)

            for model_name in args.model_name:
                if args.check_in_records and curr_content_lookup is not None \
                    and (d_name, model_name, split_idx) in curr_content_lookup.index:
                    print('Found in the record. Skip! "%s %s %d"' %
                          (d_name, model_name, split_idx))
                    continue

                print('Start running "%s %s %d"' %
                      (d_name, model_name, split_idx))
                start_time = time.time()

                additional_model_args = {}
                # Set the range of hyperparameter to search for each dataset to save time
                if model_name.startswith('spline') and 'search_lam' in dataset:
                    additional_model_args['search_lam'] = dataset['search_lam']
                if model_name.startswith('spline') and 'n_splines' in dataset:
                    additional_model_args['n_splines'] = dataset['n_splines']
                if model_name.startswith('rspline') and 'discrete' in dataset:
                    additional_model_args['discrete'] = dataset['discrete']
                if model_name.startswith('rspline') and 'maxk' in dataset:
                    additional_model_args['maxk'] = dataset['maxk']

                # Look up the experiment-mode function (e.g. get_<exp_mode>) by name
                exp_mode_fn = globals()['get_%s' % args.exp_mode]
                experiment_result = exp_mode_fn(X_train, y_train, X_test,
                                                y_test, problem, d_name,
                                                model_name, split_idx,
                                                **additional_model_args)
                if experiment_result is None:
                    continue

                record = OrderedDict()
                record['d_name'] = d_name
                record['model_name'] = model_name
                record['split_idx'] = split_idx
                record['n_splits'] = args.n_splits
                record['random_state'] = args.random_state
                record['fit_time'] = float(time.time() - start_time)
                record['test_size'] = test_size

                record.update(experiment_result)

                # Follow the column order
                output_csv(csv_path, record)

                print('Finished %s %s split %d/%d with model %s in %.1fs' %
                      (args.exp_mode, d_name, split_idx, args.n_splits,
                       model_name, float(time.time() - start_time)))
                import gc
                gc.collect()
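Both this script and Example 5 persist results through an output_csv helper that is not shown in these examples; one plausible sketch of such a helper (an assumption, not the project's actual implementation) appends one row per record and writes the header only when the file is new:

import csv
import os

def output_csv(csv_path, record, delimiter=','):
    # Write the header on the first call, then append one row per record.
    write_header = not os.path.exists(csv_path)
    with open(csv_path, 'a', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=list(record.keys()), delimiter=delimiter)
        if write_header:
            writer.writeheader()
        writer.writerow(record)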
Example 4
def main():
    if not os.path.exists(args.data_path):
        exit('Exit! File does not exist: %s' % args.data_path)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    data_path_filename = args.data_path.split('/')[-1].split('.')[0]
    output_dir = os.path.join(
        args.output_dir, '%s-%s-df' % (args.identifier, data_path_filename))
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    records_df = pd.read_csv(args.data_path)
    if args.d_name is None:
        args.d_name = records_df.d_name.unique()
    if args.model_name is None:
        args.model_name = ['gnd_truth'] + records_df.model_name.unique().tolist()

    for d_name in args.d_name:
        output_path = os.path.join(output_dir, '%s.pkl' % (d_name))

        result_dict = {}
        if os.path.exists(output_path):
            with open(output_path, 'rb') as fp:
                result_dict = pickle.load(fp)

        with Timer(d_name, remove_start_msg=False):
            # Handle ss baseline
            df2 = records_df[records_df.d_name == d_name]
            if len(df2) == 0:
                print('No record found for this dataset %s' % (d_name))
                continue

            dset = load_data(d_name)
            default_x_values_lookup, default_x_counts = {}, {}
            for feat_name in dset['full']['X']:
                X_uni, X_counts = np.unique(dset['full']['X'][feat_name],
                                            return_counts=True)

                default_x_values_lookup[feat_name] = X_uni
                default_x_counts[feat_name] = X_counts

            # To compute importance, cache a per-feature map from each unique value to its count
            X_map_dict = {}
            for feat_name in dset['full']['X']:
                X_map = pd.Series(default_x_counts[feat_name],
                                  index=default_x_values_lookup[feat_name])
                X_map_dict[feat_name] = X_map

            for model_name in args.model_name:
                # Handle gnd_truth model class in ss experiments
                if model_name == 'gnd_truth' and d_name.startswith('ss'):
                    if 'gnd_truth' not in result_dict or args.overwrite:
                        gnd_truth_models = mypickle_load(dset['models_path'])

                        result_dict['gnd_truth'] = get_GAM_plot_dataframe_by_models(
                            gnd_truth_models, default_x_values_lookup)

                        # X_values_counts = dset['full']['X'].apply(lambda x: x.value_counts().sort_index().to_dict(), axis=0)
                        # result_dict['gnd_truth']['sample_weights'] = result_dict['gnd_truth'].feat_name.apply(
                        #     lambda x: np.array(list(X_values_counts[x].values()), dtype=np.int) if x != 'offset' else None)
                        result_dict['gnd_truth']['sample_weights'] = result_dict['gnd_truth'].apply(
                            lambda row: None
                                if row.feat_name == 'offset' \
                                else default_x_counts[row.feat_name]
                        , axis=1)

                        result_dict['gnd_truth']['importance'] = result_dict['gnd_truth'].apply(
                            lambda row: -1
                                if row.feat_name == 'offset' \
                                else np.average(np.abs(row.y), weights=row.sample_weights)
                        , axis=1)

                        with open(output_path, 'wb') as fp:
                            pickle.dump(result_dict, fp)
                        print('Finished gnd_truth for %s' % d_name)
                    else:
                        print('Already finished %s %s' % (d_name, model_name))
                    continue

                df = df2[df2.model_name == model_name]

                if len(df) == 0:
                    print('No record found for this model %s in dataset %s' %
                          (model_name, d_name))
                    continue

                if not args.overwrite and model_name in result_dict:
                    print('Already finished %s %s' % (d_name, model_name))
                    continue

                if 'rf' in model_name or 'xgb-d3' in model_name or 'skgbt-d3' in model_name:
                    print(model_name, 'is not a GAM. Skip!')
                    continue

                with Timer('loading %s to check if it is a GAM' % model_name):
                    model = mypickle_load(df.model_path.iloc[0])
                    if not hasattr(model, 'is_GAM') or not model.is_GAM:
                        print(model_name, 'is not a GAM. Skip!')
                        continue

                with Timer(d_name + ' ' + model_name):
                    # use a generator to save memory of loading each model to get df
                    models = model_generator(df.model_path.tolist())
                    result_dict[model_name] = get_GAM_plot_dataframe_by_models(
                        models, default_x_values_lookup)
                    result_dict[model_name]['importance'] = result_dict[model_name].apply(
                        lambda row: -1
                            if row.feat_name == 'offset' \
                            else np.average(np.abs(row.y), weights=default_x_counts[row.feat_name])
                    , axis=1)

                    with open(output_path, 'wb') as fp:
                        pickle.dump(result_dict, fp)
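model_generator is only referenced above; a minimal sketch consistent with the memory-saving comment (loading one pickled model at a time through the same mypickle_load helper used in the example) could be:

def model_generator(model_paths):
    # Yield models lazily so that only one model is held in memory at a time.
    for path in model_paths:
        yield mypickle_load(path)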
Example 5
def main():
    if not os.path.exists(args.data_path):
        exit('Exit! File does not exist: %s' % args.data_path)

    # Read into the inputs
    records_df = pd.read_csv(args.data_path)
    if args.model_name is not None:
        records_df = records_df.loc[vector_in(records_df.model_name,
                                              args.model_name)]
    if args.d_name is not None:
        records_df = records_df.loc[vector_in(records_df.d_name, args.d_name)]
    if args.end_splits is not None:
        records_df = records_df.loc[records_df.split_idx < args.end_splits]

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    data_path_filename = args.data_path.split('/')[-1].split('.')[0]
    output_path = os.path.join(
        args.output_dir, '%s-fimp-%s-%s.tsv' %
        (args.identifier, args.exp_mode, data_path_filename))

    # Check the record if overwrite flag is 1!
    if args.overwrite and os.path.exists(output_path):
        if args.model_name is None and args.d_name is None:
            os.remove(output_path)
        else:
            records_df_index = records_df.set_index(['d_name',
                                                     'model_name']).index
            df = pd.read_csv(output_path, sep='\t')

            new_df = pd.DataFrame([
                row for r_idx, row in df.iterrows()
                if (row.d_name, row.model_name) not in records_df_index
            ])
            if len(new_df) == 0:
                os.remove(output_path)
            else:
                new_df.to_csv(output_path, sep='\t', index=None)

    curr_content_lookup = None
    if not args.overwrite and os.path.exists(output_path):
        curr_content_lookup = pd.read_csv(output_path, sep='\t') \
            .set_index(['d_name', 'model_name', 'split_idx', 'metric']).sort_index()

    print('Total df size: ', len(records_df))
    for d_name, df in records_df.groupby('d_name'):
        dataset = load_data(d_name)

        if dataset['problem'] == 'regression' and args.metric != 'mse':
            print(
                'Regression dataset only uses mse as the metric. Skip dataset %s for metric %s.'
                % (d_name, args.metric))
            continue

        for row_idx, (df_idx, record) in enumerate(df.iterrows()):
            if curr_content_lookup is not None and (
                    d_name, record.model_name, record.split_idx,
                    args.metric) in curr_content_lookup.index:
                print('Found in the record. Skip! "%s %s %d %s"' \
                    % (d_name, record.model_name, record.split_idx, args.metric))
                continue

            model = mypickle_load(record.model_path)
            if not hasattr(model, 'is_GAM') or not model.is_GAM:  # model is not a GAM
                continue

            with Timer(
                    'handling record dataset %s %s %d with idx %d of total %d (%d)'
                    % (d_name, record.model_name, record.split_idx, row_idx,
                       df.shape[0], df_idx)):
                # Reload the train and test set for that record
                X_train, X_test, y_train, y_test = \
                    load_train_test_data(dataset, record.split_idx,
                        record.n_splits, record.test_size, record.random_state)

                # Record the metadata
                the_result = OrderedDict()
                for k in [
                        'd_name', 'model_name', 'model_path', 'split_idx',
                        'n_splits', 'test_size', 'random_state'
                ]:
                    the_result[k] = record[k]
                the_result['metric'] = args.metric

                for mode_name, X_selection, y_selection, X_report, y_report in [
                        # ('train', X_train, y_train, None, None),
                        # ('test', X_test, y_test, None, None),
                        # ('train_test', X_train, y_train, X_test, y_test),
                    ('test_test', X_test.iloc[:int(X_test.shape[0] / 2)],
                     y_test.iloc[:int(X_test.shape[0] / 2)],
                     X_test.iloc[int(X_test.shape[0] / 2):],
                     y_test.iloc[int(X_test.shape[0] / 2):]),
                ]:
                    exp_obj = args.exp_cls(X_selection, y_selection,
                                           dataset['problem'], model,
                                           args.metric, args.n_features_limit,
                                           X_report, y_report)

                    exp_result = exp_obj.run_exp()
                    for k in exp_result:
                        the_result['%s_%s' % (mode_name, k)] = exp_result[k]

                output_csv(output_path, the_result, delimiter='\t')
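load_train_test_data is not shown in these examples; a sketch of how the train/test split could be reconstructed from the stored metadata, mirroring the splitting logic of Example 3 (an assumption, not the project's actual helper):

from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

def load_train_test_data(dataset, split_idx, n_splits, test_size, random_state):
    # Rebuild the same shuffle-split sequence used at training time and
    # return the fold with index split_idx.
    X, y = dataset['full']['X'], dataset['full']['y']
    split_cls = (StratifiedShuffleSplit if dataset['problem'] == 'classification'
                 else ShuffleSplit)
    ss = split_cls(n_splits=int(n_splits), test_size=test_size,
                   random_state=int(random_state))
    for idx, (train_idx, test_idx) in enumerate(ss.split(X, y)):
        if idx == split_idx:
            return X.iloc[train_idx], X.iloc[test_idx], y.iloc[train_idx], y.iloc[test_idx]
    raise ValueError('split_idx %d is out of range for n_splits=%d' % (split_idx, n_splits))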