Ejemplo n.º 1
0
def main():
    """Run backward feature elimination and dump each round to JSON.

    The command-line options select the dataset (``--data``), the target
    factor (``--target``), the classifier (``--clf``) and the resampling
    settings. Starting with every feature enabled, each round fits a
    ``GridWithCoef`` on every train split, accumulates the squared
    coefficients over the splits and disables the features with the smallest
    accumulated value. After each round the test scores and the surviving
    feature names are appended to ``result['selections']`` and the whole
    ``result`` dict is rewritten to ``fs_<id>.json`` under ``--results-path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    # The parsed options are stored alongside the results.
    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    # The id mixes random numbers into the hashed text, so it differs
    # between runs even when the options are the same.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    # Builtin ``bool`` instead of ``np.bool``: the alias no longer exists in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)

    # Elimination
    d0 = datetime.now()
    # Kept as a plain Python int so it can be written to JSON below.
    n_remaining = int(np.sum(support_))
    while n_remaining > n_features_to_select:
        # Largest power of ten strictly below the remaining count; when the
        # count is not a multiple of it, drop only the remainder first.
        step = 10**int(np.log10(n_remaining - 1))
        odd_step = n_remaining % step
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'.format(
                datetime.now() - d0,
                n_remaining - step))
        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)

            estimator.fit(data.iloc[train, features], target.iloc[train])
            # Accumulate squared coefficients across the splits.
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(
                estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features (never going below the final size)
        threshold = min(step, n_remaining - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        n_remaining = int(np.sum(support_))

        result['selections'].append({
            'scores': test_scores,
            # Plain int: json cannot serialize numpy integer scalars.
            'n_features': n_remaining,
            'features': feature_names[support_].tolist()
        })

        # Rewrite the whole file each round so partial results survive.
        with open(result_file, 'w') as f:
            json.dump(result,
                      f,
                      sort_keys=True,
                      indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
Ejemplo n.º 2
0
    all_features = mrmr_results['subsets'][-1]['features']

    result['experiments'] = [{
        'iteration': 0,
        'subsets': []
    }]
    d0 = datetime.now()
    current_size = 10
    while current_size <= max_features:
        if args.verbose:
            print('[{}] Fitting with {} features.'.format(datetime.now() - d0, current_size))
            sys.stdout.flush()

        features = all_features[:current_size]

        grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
        grid.fit(train_data.iloc[:, features], train_target)

        # Save results for current set of features
        result['experiments'][-1]['subsets'].append({
            'n_features': current_size,
            'features': data.columns[features].tolist(),
            'best_params': grid.best_params_,
            'train': {
                'y_true': train_target.tolist(),
                'y_pred': grid.predict(train_data.iloc[:, features]).tolist()
            },
            'test': {
                'y_true': test_target.tolist(),
                'y_pred': grid.predict(test_data.iloc[:, features]).tolist()
            }
Ejemplo n.º 3
0
def main():
    """Score features with a filter and evaluate subsets of the best ones.

    Parses the command-line options, loads the dataset named by ``--data``
    (optionally keeping only the rows of one ``--tissue``) and, for every
    train/test split, scores all features on the training rows with the
    method chosen by ``--filter``. A ``GridWithCoef`` is then fitted on the
    highest-scoring features, with the subset size reduced by each step
    yielded by ``subset_sizes``. True and predicted labels for the train and
    test rows are recorded per subset, and the accumulated ``result`` dict
    is rewritten to ``<filter>_<id>.json`` under ``--results-path`` after
    every fit.

    Raises:
        ValueError: if ``--filter`` is not one of the known filter names.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--results-path', default='./bucket/results/')
    cli.add_argument('--data')
    cli.add_argument('--target')
    # Quote characters are removed from the tissue name.
    cli.add_argument(
        '--tissue',
        type=lambda raw: re.sub(r'[\"\']', '', raw) if raw is not None else None)
    cli.add_argument('--data-path', default='./bucket/data/')
    cli.add_argument('--verbose', '-v', action='count')
    cli.add_argument('--test-size', type=float, default=0.1)
    cli.add_argument('--n-iter', type=int, default=1)
    cli.add_argument('--n-folds', default=10, type=n_folds_parser)
    cli.add_argument('--clf')
    cli.add_argument('--filter', default='anova')
    args = cli.parse_args()

    # The result record starts out as a copy of the parsed options.
    result = dict(vars(args))
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for option, value in result.items():
            print('# {}: {}'.format(option, value))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    # The 'epi_ad' dataset is loaded with two extra loader flags.
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}
    else:
        load_params = {}

    data, factors = load(args.data,
                         data_path=args.data_path,
                         log=result,
                         **load_params)
    if args.tissue:
        tissue_mask = factors['source tissue'] == args.tissue
        data = data[tissue_mask]
        factors = factors[tissue_mask]
    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    # Pick the scoring function, its keyword arguments and an optional
    # preprocessor according to the requested filter.
    score_params = {}
    preprocessor = None
    filter_name = args.filter
    if filter_name == 'anova':
        score_features = anova
    elif filter_name == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif filter_name == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif filter_name == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    file_name = '{}_{}.json'.format(args.filter, experiment_id)
    result_file = join(args.results_path, file_name)
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    d0 = datetime.now()
    experiments = []
    result['experiments'] = experiments
    for iteration, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(iteration))

        # The preprocessor, when present, is fitted on the training rows only.
        if preprocessor:
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]

        scores_ = score_features(train_data, target_num[train], **score_params)
        experiment = {
            'iteration': iteration,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist(),
        }
        experiments.append(experiment)
        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()

        subsets = []
        experiment['subsets'] = subsets
        # Ascending order: the best-scoring features sit at the end.
        ranked_features = np.argsort(scores_)
        current_size = n_features
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(
                    datetime.now() - d0, current_size))
                sys.stdout.flush()

            features = ranked_features[-current_size:]

            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])

            # Save results for current set of features
            train_pred = grid.predict(train_data.iloc[:, features])
            test_pred = grid.predict(test_data.iloc[:, features])
            subsets.append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': train_pred.tolist(),
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': test_pred.tolist(),
                },
            })

            # Store results (the whole record is rewritten every time)
            with open(result_file, 'w') as out:
                json.dump(result,
                          out,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

            current_size -= step

    if args.verbose:
        print('# OK')
        sys.stdout.flush()
Ejemplo n.º 4
0
def main():
    """Run backward feature elimination and dump each round to JSON.

    The command-line options select the dataset (``--data``), the target
    factor (``--target``), the classifier (``--clf``) and the resampling
    settings. Starting with every feature enabled, each round fits a
    ``GridWithCoef`` on every train split, accumulates the squared
    coefficients over the splits and disables the features with the smallest
    accumulated value. After each round the test scores and the surviving
    feature names are appended to ``result['selections']`` and the whole
    ``result`` dict is rewritten to ``fs_<id>.json`` under ``--results-path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    # The parsed options are stored alongside the results.
    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    # The id mixes random numbers into the hashed text, so it differs
    # between runs even when the options are the same.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    # Builtin ``bool`` instead of ``np.bool``: the alias no longer exists in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)

    # Elimination
    d0 = datetime.now()
    # Kept as a plain Python int so it can be written to JSON below.
    n_remaining = int(np.sum(support_))
    while n_remaining > n_features_to_select:
        # Largest power of ten strictly below the remaining count; when the
        # count is not a multiple of it, drop only the remainder first.
        step = 10 ** int(np.log10(n_remaining - 1))
        odd_step = n_remaining % step
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'
                  .format(datetime.now() - d0, n_remaining - step))
        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)

            estimator.fit(data.iloc[train, features], target.iloc[train])
            # Accumulate squared coefficients across the splits.
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features (never going below the final size)
        threshold = min(step, n_remaining - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        n_remaining = int(np.sum(support_))

        result['selections'].append({
            'scores': test_scores,
            # Plain int: json cannot serialize numpy integer scalars.
            'n_features': n_remaining,
            'features': feature_names[support_].tolist()
        })

        # Rewrite the whole file each round so partial results survive.
        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2, separators=(',', ': '))

    if args.verbose:
        print('# OK')
Ejemplo n.º 5
0
def main():
    """Score features with a filter and evaluate subsets of the best ones.

    Parses the command-line options, loads the dataset named by ``--data``
    (optionally keeping only the rows of one ``--tissue``) and, for every
    train/test split, scores all features on the training rows with the
    method chosen by ``--filter``. A ``GridWithCoef`` is then fitted on the
    highest-scoring features, with the subset size reduced by each step
    yielded by ``subset_sizes``. True and predicted labels for the train and
    test rows are recorded per subset, and the accumulated ``result`` dict
    is rewritten to ``<filter>_<id>.json`` under ``--results-path`` after
    every fit.

    Raises:
        ValueError: if ``--filter`` is not one of the known filter names.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--results-path', default='./bucket/results/')
    cli.add_argument('--data')
    cli.add_argument('--target')
    # Quote characters are removed from the tissue name.
    cli.add_argument(
        '--tissue',
        type=lambda raw: re.sub(r'[\"\']', '', raw) if raw is not None else None)
    cli.add_argument('--data-path', default='./bucket/data/')
    cli.add_argument('--verbose', '-v', action='count')
    cli.add_argument('--test-size', type=float, default=0.1)
    cli.add_argument('--n-iter', type=int, default=1)
    cli.add_argument('--n-folds', default=10, type=n_folds_parser)
    cli.add_argument('--clf')
    cli.add_argument('--filter', default='anova')
    args = cli.parse_args()

    # The result record starts out as a copy of the parsed options.
    result = dict(vars(args))
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for option, value in result.items():
            print('# {}: {}'.format(option, value))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    # The 'epi_ad' dataset is loaded with two extra loader flags.
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}
    else:
        load_params = {}

    data, factors = load(args.data,
                         data_path=args.data_path,
                         log=result,
                         **load_params)
    if args.tissue:
        tissue_mask = factors['source tissue'] == args.tissue
        data = data[tissue_mask]
        factors = factors[tissue_mask]
    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    # Pick the scoring function, its keyword arguments and an optional
    # preprocessor according to the requested filter.
    score_params = {}
    preprocessor = None
    filter_name = args.filter
    if filter_name == 'anova':
        score_features = anova
    elif filter_name == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif filter_name == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif filter_name == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    file_name = '{}_{}.json'.format(args.filter, experiment_id)
    result_file = join(args.results_path, file_name)
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    d0 = datetime.now()
    experiments = []
    result['experiments'] = experiments
    for iteration, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(iteration))

        # The preprocessor, when present, is fitted on the training rows only.
        if preprocessor:
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]

        scores_ = score_features(train_data, target_num[train], **score_params)
        experiment = {
            'iteration': iteration,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist(),
        }
        experiments.append(experiment)
        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()

        subsets = []
        experiment['subsets'] = subsets
        # Ascending order: the best-scoring features sit at the end.
        ranked_features = np.argsort(scores_)
        current_size = n_features
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(
                    datetime.now() - d0, current_size))
                sys.stdout.flush()

            features = ranked_features[-current_size:]

            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])

            # Save results for current set of features
            train_pred = grid.predict(train_data.iloc[:, features])
            test_pred = grid.predict(test_data.iloc[:, features])
            subsets.append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': train_pred.tolist(),
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': test_pred.tolist(),
                },
            })

            # Store results (the whole record is rewritten every time)
            with open(result_file, 'w') as out:
                json.dump(result,
                          out,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

            current_size -= step

    if args.verbose:
        print('# OK')
        sys.stdout.flush()
Ejemplo n.º 6
0
def main():
    """Run recursive feature elimination per split and dump results to JSON.

    Parses the command-line options, loads the dataset named by ``--data``
    (optionally keeping only the rows of one ``--tissue``) and, for every
    train/test split, repeatedly fits a ``StandardScaler`` + ``GridWithCoef``
    pipeline on the currently enabled features. For each subset the true and
    predicted labels of the train and test rows are recorded, then the
    features with the smallest squared coefficients are disabled; the number
    removed per round comes from ``subset_sizes``. The accumulated ``result``
    dict is rewritten to ``rfe_<id>.json`` under ``--results-path`` after
    every round.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    # The parsed options are stored alongside the results.
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    # The id mixes random numbers into the hashed text, so it differs
    # between runs even when the options are the same.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data,
                         data_path=args.data_path,
                         log=result,
                         **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration':
            i,
            'train_samples':
            data.index[train].tolist(),
            'subsets': []
        })
        # Builtin ``bool`` instead of ``np.bool``: the alias no longer exists
        # in current NumPy releases.
        support_ = np.ones(n_features, dtype=bool)
        for step in subset_sizes(n_features, n_features_to_select):
            # Plain int: json cannot serialize numpy integer scalars.
            n_current = int(np.sum(support_))
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'.
                      format(datetime.now() - d0, n_current,
                             n_current - step))
            # Train with current subset
            pipeline = preprocess_steps + [
                ('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))
            ]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            # Predict through the pipeline, not the bare grid: the grid was
            # fitted on scaled data, so raw inputs must be scaled first.
            result['experiments'][-1]['subsets'].append({
                'n_features':
                n_current,
                'features':
                data.columns[features].tolist(),
                'best_params':
                grid.best_params_,
                'train': {
                    'y_true':
                    target.iloc[train].tolist(),
                    'y_pred':
                    pipeline.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true':
                    target.iloc[test].tolist(),
                    'y_pred':
                    pipeline.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)

            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worse features
            support_[features[ranks][:step]] = False

            # Store results (the whole record is rewritten every round)
            with open(result_file, 'w') as f:
                json.dump(result,
                          f,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

    if args.verbose:
        print('# OK')