Example #1
def run(model_fname, out_csv, num_iteration=-1):
    test_df = data2.load('test')
    click_ids = data2.load_click_ids()

    m = load_model(model_fname)
    predictors = m.feature_name()

    print('test_df: ', len(test_df))
    print('predictors: ', predictors)

    if num_iteration != -1 and num_iteration != m.best_iteration - 1:
        print('best iter: {}, specified: {}'.format(m.best_iteration, num_iteration))

    preds = m.predict(test_df[predictors], num_iteration=num_iteration)

    # generated using map_clickid.ipynb
    mapping = pd.read_csv('../cache/test_mapping.csv')
    print('len before: ', len(mapping))

    mapping = mapping.drop_duplicates(subset=['click_id'])
    print('len after duplicates removed: ', len(mapping))

    mapping = mapping.set_index(['click_id_v0'])
    print(mapping.head(10))

    preds_df = pd.DataFrame(preds, columns=['is_attributed'])
    preds_df['click_id_v0'] = click_ids
    print(preds_df[preds_df.click_id_v0 == 21290878])

    preds_df = preds_df.set_index(['click_id_v0'])
    preds_df = mapping.join(preds_df, how='left')

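    # re-join on click_id so the submission follows the row order of ../input/test.csv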
    subm = pd.read_csv('../input/test.csv', usecols=['click_id'])

    preds_df = preds_df.reset_index().set_index(['click_id'])
    subm = subm.set_index(['click_id'])
    subm = subm.join(preds_df, how='left')

    subm = subm.reset_index()
    subm[['click_id', 'is_attributed']].to_csv(out_csv, index=False)
    print('saved ', out_csv)
Example #2
def run_train(days, iterations, seed):
    trainval_df = data2.load('train')

    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'binip']

    excluded = ['click_time', 'ip', 'day']

    # I'm not convinced yet that restricting training days is good
    train_cond = (trainval_df.day.isin(days)) & (trainval_df.hour.isin(
        [4, 5, 9, 10, 13, 14]))
    train_df = trainval_df[train_cond]

    for column in excluded:
        del trainval_df[column]

    gc.collect()

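    # every remaining column except the target is used as a predictor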
    predictors = list(
        sorted([c for c in trainval_df.columns if c not in ['is_attributed']]))

    val_dfs = None
    iterations = 57  # hard-coded override of the function's iterations argument
    run(train_df, val_dfs, predictors, target, categorical, iterations, seed)
Example #3
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data, data_path=args.data_path, log=result, **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    split = StratifiedShuffleSplit(target, n_iter=1, test_size=args.test_size)
    train, test = next(iter(split))

    data = data.iloc[train, :]
    target = target.iloc[train]

    target = LabelEncoder().fit_transform(target)
    result['test_samples'] = test.tolist()

    data = ExpressionDiscretizer().fit(data).transform(data)
Example #4
from __future__ import division, print_function

from data2 import load

if __name__ == '__main__':

    data, factors = load('mdd_raw')

Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--tissue', type=lambda x: re.sub(r'[\"\']', '', x) if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data, data_path=args.data_path, log=result, **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })
        support_ = np.ones(n_features, dtype=np.bool)
        ranking_ = np.ones(n_features, dtype=np.int)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'
                      .format(datetime.now() - d0, np.sum(support_), np.sum(support_) - step))
            # Train with current subset
            pipeline = preprocess_steps + [('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': np.sum(support_),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)

            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worse features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2, separators=(',', ': '))

    if args.verbose:
        print('# OK')
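
The subset_sizes helper used above is not shown on this page. Below is a minimal sketch of one plausible implementation, reconstructed from the inline elimination schedule in Examples #9 and #12; the project's actual helper may differ.

import numpy as np


def subset_sizes(n_features, n_features_to_select):
    # Hypothetical reconstruction, not the original helper: yield how many
    # features to eliminate per step. First trim the remaining count down to a
    # multiple of the largest power of ten below it, then drop one such power
    # of ten per step, never overshooting n_features_to_select.
    remaining = n_features
    while remaining > n_features_to_select:
        step = 10 ** int(np.log10(remaining - 1))
        odd_step = remaining - step * (remaining // step)
        if odd_step > 0:
            step = odd_step
        step = min(step, remaining - n_features_to_select)
        yield step
        remaining -= step

# e.g. list(subset_sizes(237, 9)) -> [37, 100, 10, 10, 10, 10, 10, 10, 10, 10, 10, 1]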
Example #6
import pandas as pd
import numpy as np
import lightgbm as lgb

import data2
import train2

from dataset import *
from util import info

SEED = 0

from sklearn.preprocessing import RobustScaler

if __name__ == '__main__':
    trainval_df = data2.load('train')

    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'binip']

    # faster feedback
    train_cond = (trainval_df.day == 8) & (trainval_df.hour.isin(
        [4, 5, 9, 10, 13, 14]))
    train_df = trainval_df[train_cond]
    #train_df = trainval_df.iloc[:-VALID_ROWS]

    #info('shuffling train')
    #train_df = train_df.iloc[np.random.permutation(len(train_df))]

    # used to save memory only, since when building the lgbm dataset we
    # specify the columns to be used explicitly
Example #7
import argparse

import numpy as np
from sklearn.cross_validation import KFold  # pre-0.18 sklearn API, matching KFold(n, n_folds=...) below
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

import data

parser = argparse.ArgumentParser(description='XGBoost for BNP')
parser.add_argument('-f','--n_features', help='Number of features', type=int, default=1200)
parser.add_argument('-n','--n_rounds', help='Number of Boost iterations', type=int, default=2000)
parser.add_argument('-e','--eta', help='Learning rate', type=float, default=0.01)
parser.add_argument('-r','--r_seed', help='Set random seed', type=int, default=3)
parser.add_argument('-b','--minbin', help='Minimum categorical bin size', type=int, default=1)
parser.add_argument('-ct','--cat_trans', help='Category transformation method', type=str, default='std')
parser.add_argument('-cv','--cv', action='store_true')
parser.add_argument('-codetest','--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("Two Sigma: GBT Classifier...\n") 
clf =  GradientBoostingRegressor(learning_rate=0.02, n_estimators=500, subsample=0.6, min_samples_split=5, min_samples_leaf=1,  max_depth=4, random_state=1, verbose=1)

# do cross validation scoring
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=1)
scr = np.zeros([len(kf)])
oob_pred = np.zeros(X.shape[0])
sub_pred = np.zeros(X_sub.shape[0])
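# accumulate out-of-fold predictions on the validation folds and average the
# five fold models' predictions on the submission set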
for i, (tr_ix, val_ix) in enumerate(kf):
	clf.fit(X[tr_ix], y[tr_ix])
	pred = clf.predict(X[val_ix].toarray())
	oob_pred[val_ix] = np.array(pred)
	sub_pred += clf.predict(X_sub.toarray()) / 5
	scr[i] = mean_squared_error(y[val_ix], np.array(pred))
Example #8
    parser.add_argument('--test-size', type=float, default=0.1)
    # parser.add_argument('--tissue', type=lambda x: re.sub(r'[\"\']', '', x))
    parser.add_argument('--clf')
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--data-path', default='./bucket/data/')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    # if args.tissue is not None:
    #     condition = factors['source tissue'] == args.tissue
    #     if condition.sum() == 0:
    #         result['error'] = '{} is not a valid tissue.'.format(args.tissue)
    #         save_experiment(result, folder=args.results_path, filename=None, error=True,
    #                         verbose=args.verbose)
    #         sys.exit()
    #     data = data[condition]
    #     factors = factors[condition]

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    support_ = np.ones(n_features, dtype=np.bool)
    ranking_ = np.ones(n_features, dtype=np.int)
    # Elimination
    t0 = time()
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
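        # step size: trim the remaining count down to a multiple of the largest
        # power of ten below it, then eliminate one such power of ten per pass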
        step = 10**int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'.format(
                datetime.now() - d0,
                np.sum(support_) - step))
        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)

            estimator.fit(data.iloc[train, features], target.iloc[train])
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(
                estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        result['selections'].append({
            'scores': test_scores,
            'n_features': np.sum(support_),
            'features': feature_names[support_].tolist()
        })

        with open(result_file, 'w') as f:
            json.dump(result,
                      f,
                      sort_keys=True,
                      indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
Example #10
    result['data'] = mrmr_results['data']
    result['tissue'] = mrmr_results['tissue']
    result['target'] = mrmr_results['target']
    result['test_samples'] = mrmr_results['test_samples']
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    load_params = {}
    if result['data'] == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(result['data'], data_path=args.data_path, log=result, **load_params)
    if result['tissue']:
        data = data[factors['source tissue'] == result['tissue']]
        factors = factors[factors['source tissue'] == result['tissue']]
    target = factors[result['target']]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

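    # rebuild the train/test split from the test-sample positions recorded by the mrmr run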
    train_samples = np.ones(data.shape[0], dtype=np.bool)
    train_samples[result['test_samples']] = False

    train_data = data.loc[train_samples, :]
    train_target = target.loc[train_samples]
    test_data = data.iloc[result['test_samples'], :]
    test_target = target.iloc[result['test_samples']]
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x) if x is not None else None)
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    parser.add_argument('--filter', default='anova')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data, data_path=args.data_path, log=result, **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    score_params = {}
    preprocessor = None
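    # pick the univariate scoring function; infogain_exp and chi2 discretize
    # the expression values with ExpressionDiscretizer first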
    if args.filter == 'anova':
        score_features = anova
    elif args.filter == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif args.filter == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif args.filter == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, '{}_{}.json'.format(args.filter, experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        if preprocessor:
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]

        scores_ = score_features(train_data, target_num[train], **score_params)
        result['experiments'].append({
            'iteration': i,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist()
        })
        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()

        result['experiments'][-1]['subsets'] = []
        current_size = n_features
        sorted_features = np.argsort(scores_)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(datetime.now() - d0, current_size))
                sys.stdout.flush()

            features = sorted_features[-current_size:]

            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])

            # Save results for current set of features
            result['experiments'][-1]['subsets'].append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(train_data.iloc[:, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(test_data.iloc[:, features]).tolist()
                }
            })

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2, separators=(',', ': '))

            current_size -= step

    if args.verbose:
        print('# OK')
        sys.stdout.flush()
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    support_ = np.ones(n_features, dtype=np.bool)
    ranking_ = np.ones(n_features, dtype=np.int)
    # Elimination
    t0 = time()
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        step = 10 ** int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'
                  .format(datetime.now() - d0, np.sum(support_) - step))
        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)

            estimator.fit(data.iloc[train, features], target.iloc[train])
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        result['selections'].append({
            'scores': test_scores,
            'n_features': np.sum(support_),
            'features': feature_names[support_].tolist()
        })

        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2, separators=(',', ': '))

    if args.verbose:
        print('# OK')
Example #13
    parser.add_argument('--test-size', type=float, default=0.1)
    # parser.add_argument('--tissue', type=lambda x: re.sub(r'[\"\']', '', x))
    parser.add_argument('--clf')
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--data-path', default='./bucket/data/')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    # if args.tissue is not None:
    #     condition = factors['source tissue'] == args.tissue
    #     if condition.sum() == 0:
    #         result['error'] = '{} is not a valid tissue.'.format(args.tissue)
    #         save_experiment(result, folder=args.results_path, filename=None, error=True,
    #                         verbose=args.verbose)
    #         sys.exit()
    #     data = data[condition]
    #     factors = factors[condition]

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    result['split'] = {
        'type': 'StratifiedShuffleSplit',
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    parser.add_argument('--filter', default='anova')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data,
                         data_path=args.data_path,
                         log=result,
                         **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    score_params = {}
    preprocessor = None
    if args.filter == 'anova':
        score_features = anova
    elif args.filter == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif args.filter == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif args.filter == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path,
                       '{}_{}.json'.format(args.filter, experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        if preprocessor:
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]

        scores_ = score_features(train_data, target_num[train], **score_params)
        result['experiments'].append({
            'iteration': i,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist()
        })
        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()

        result['experiments'][-1]['subsets'] = []
        current_size = n_features
        sorted_features = np.argsort(scores_)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(
                    datetime.now() - d0, current_size))
                sys.stdout.flush()

            features = sorted_features[-current_size:]

            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])

            # Save results for current set of features
            result['experiments'][-1]['subsets'].append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(train_data.iloc[:, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(test_data.iloc[:, features]).tolist()
                }
            })

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result,
                          f,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

            current_size -= step

    if args.verbose:
        print('# OK')
        sys.stdout.flush()
Example #15
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data,
                         data_path=args.data_path,
                         log=result,
                         **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })
        support_ = np.ones(n_features, dtype=np.bool)
        ranking_ = np.ones(n_features, dtype=np.int)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'.
                      format(datetime.now() - d0, np.sum(support_),
                             np.sum(support_) - step))
            # Train with current subset
            pipeline = preprocess_steps + [
                ('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))
            ]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': np.sum(support_),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)

            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worse features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result,
                          f,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

    if args.verbose:
        print('# OK')
Example #16
parser.add_argument('--radius', '-r', type=int, default=3,
                    help='Radius parameter of NFP')
parser.add_argument('--hidden-dim', '-H', type=int, default=128,
                    help='Number of hidden units of NFP')
parser.add_argument('--out-dim', '-O', type=int, default=128,
                    help='Number of output units of NFP')
args = parser.parse_args()


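# select the dataset loader, the NFP/GGNN variant and the batch converter
# according to args.mode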
if args.mode == 1:
    train, val, max_degree, atom2id, C = data.load()
    nfp = model.NFP(args.hidden_dim, args.out_dim, max_degree, len(atom2id),
                    args.radius, concat_hidden=True)
    converter = data.concat_example
elif args.mode == 2:
    train, val, max_degree, atom2id, C = data2.load()
    nfp = model2.NFP(args.hidden_dim, args.out_dim, max_degree, len(atom2id),
                    args.radius, concat_hidden=True)
    converter = chainer.dataset.concat_examples
elif args.mode == 3:
    train, val, max_degree, atom2id, C = data2.load()
    nfp = model3.NFP(args.hidden_dim, args.out_dim, max_degree, len(atom2id),
                     args.radius, concat_hidden=True)
    converter = chainer.dataset.concat_examples
else:
    train, val, max_degree, atom2id, C = data_ggnn.load()
    nfp = ggnn.GGNN(args.hidden_dim, args.out_dim, len(atom2id),
                    args.radius, concat_hidden=True)
    converter = chainer.dataset.concat_examples

print('data', max_degree, len(atom2id), C)