Example 1
    def search_grid(self, X, y, param_grid, verbose):
        algorithm = param_grid.get('__algorithm', self.best_algorithm)

        if param_grid.get('__best_parameter'):
            self.param_base = self.best_param.copy()

        param_grid = ParameterGrid({
            p[0]: p[1]
            for p in param_grid.items() if not p[0].startswith('__')
        })
        for param in param_grid:
            trainer = crf.Trainer(verbose=verbose)
            param_train = self.param_base.copy()
            param_train.update(param)
            trainer.select(algorithm, self.graphical_model)
            trainer.set_params(param_train)

            if isinstance(self.cv, int):
                cv = KFold(n=len(X),
                           n_folds=self.cv,
                           shuffle=True,
                           random_state=None)
            else:
                # assume self.cv already yields (train_indices, test_indices) pairs
                cv = self.cv

            print('Parameter: (%s) %s' % (algorithm, param_train))
            cv_score = []
            for j, indices in enumerate(cv):
                X_train, y_train = X[indices[0]], y[indices[0]]
                X_test, y_test = X[indices[1]], y[indices[1]]

                for xseq, yseq in zip(X_train, y_train):
                    trainer.append(xseq, yseq)
                start = time.time()
                trainer.train('model')
                fit_elapsed_in_sec = time.time() - start
                trainer.clear()

                tagger = crf.Tagger()
                tagger.open('model')
                start = time.time()
                y_pred = [tagger.tag(xseq) for xseq in X_test]
                predict_elapsed_in_sec = time.time() - start
                tagger.close()
                score = self.scorer(y_pred, y_test)

                print(
                    '  cv(%i): score %.4f, train size %i, test size %i, train elapsed %.4f sec, test elapsed %.4f sec'
                    % (j, score, X_train.shape[0], X_test.shape[0],
                       fit_elapsed_in_sec, predict_elapsed_in_sec))
                cv_score.append(score)

            score = np.mean(cv_score)
            if self.best_score < score:
                self.best_score = score
                self.best_param = param_train
                self.best_algorithm = algorithm
            del cv_score[:]
Example 2
def test_parameter_grid():
    # Test basic properties of ParameterGrid.
    params1 = {"foo": [1, 2, 3]}
    grid1 = ParameterGrid(params1)
    assert_true(isinstance(grid1, Iterable))
    assert_true(isinstance(grid1, Sized))
    assert_equal(len(grid1), 3)
    assert_grid_iter_equals_getitem(grid1)

    params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]}
    grid2 = ParameterGrid(params2)
    assert_equal(len(grid2), 6)

    # loop to assert we can iterate over the grid multiple times
    for i in xrange(2):
        # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
        points = set(tuple(chain(*(sorted(p.items())))) for p in grid2)
        assert_equal(
            points,
            set(("bar", x, "foo", y)
                for x, y in product(params2["bar"], params2["foo"])))

    assert_grid_iter_equals_getitem(grid2)

    # Special case: empty grid (useful to get default estimator settings)
    empty = ParameterGrid({})
    assert_equal(len(empty), 1)
    assert_equal(list(empty), [{}])
    assert_grid_iter_equals_getitem(empty)
    assert_raises(IndexError, lambda: empty[1])

    has_empty = ParameterGrid([{'C': [1, 10]}, {}, {'C': [.5]}])
    assert_equal(len(has_empty), 4)
    assert_equal(list(has_empty), [{'C': 1}, {'C': 10}, {}, {'C': .5}])
    assert_grid_iter_equals_getitem(has_empty)
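The test above exercises ParameterGrid's core behaviour: iteration, len(), indexing, and the empty-grid special case. For orientation, a minimal sketch of the same API follows, assuming a recent scikit-learn where ParameterGrid lives in sklearn.model_selection (several examples in this collection still import it from the deprecated sklearn.grid_search):

# Minimal sketch, assuming scikit-learn >= 0.18, where ParameterGrid moved
# from sklearn.grid_search to sklearn.model_selection.
from sklearn.model_selection import ParameterGrid

grid = ParameterGrid({"kernel": ["linear", "rbf"], "C": [1, 10]})

print(len(grid))     # 4 candidate parameter settings
print(grid[0])       # grids support indexing; each item is a dict of parameters
for params in grid:  # ...and repeated iteration, one dict per combination
    print(params)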
Example 3
    def fit(self, X, y):

        self.folds = _generate_fold_indices(y, self.test_size, self.seed,
                                            self.n_folds)
        assert len(self.folds) == self.n_folds

        # adaptive mode: narrow each parameter to a 3-value window around the
        # previous best (assumes each parameter list has at least 3 values)
        if self.best_params is not None and self.adaptive:
            param_grid = {}
            for key, best_param in self.best_params.iteritems():
                i = self.param_grid[key].index(best_param)
                if i != 0 and i != len(self.param_grid[key]) - 1:
                    param_grid[key] = [
                        self.param_grid[key][j] for j in [i - 1, i, i + 1]
                    ]
                elif i == 0:
                    param_grid[key] = [
                        self.param_grid[key][j] for j in [i, i + 1, i + 2]
                    ]
                elif i == len(self.param_grid[key]) - 1:
                    param_grid[key] = [
                        self.param_grid[key][j] for j in [i - 2, i - 1, i]
                    ]

            self.param_list = list(ParameterGrid(param_grid))
        else:
            self.param_list = list(ParameterGrid(self.param_grid))

        self.results = [0 for _ in xrange(len(self.param_list))]

        for i, params in enumerate(self.param_list):
            scores = []

            for train_id, test_id in self.folds:
                model = self.base_model_cls(**params)
                model.fit(X[train_id], y[train_id])
                pred = model.predict(X[test_id])

                scores.append(self.score(y[test_id], pred))

            self.results[i] = np.mean(scores)

        self.best_params = self.param_list[np.argmax(self.results)]

        # refitting is required: the best model is always refit on the full data
        assert self.refit
        self.best_model = self.base_model_cls(**self.best_params)
        self.best_model.fit(X, y)

        return self.best_model
Example 4
def create_model_instances(config):

    print('Creating model instances...')

    models = []

    model_classes = {
        'ridge': RidgeClassifier,
        'perceptron': Perceptron,
        'passive_aggressive': PassiveAggressiveClassifier,
        'sgd': SGDClassifier,
        'nearest_centroid': NearestCentroid,
        'multinomial_nb': MultinomialNB,
        'linear_svc': LinearSVC,
        'svc': SVC,
        'dtree': DecisionTreeClassifier,
        'forest': RandomForestClassifier,
        'gbc': GradientBoostingClassifier,
        'extra': ExtraTreesClassifier
    }

    for model_name, param_grid in config.items():

        model_class = model_classes[model_name]
        print('    Building %s models' % model_class.__name__)

        models.extend([model_class(**p) for p in ParameterGrid(param_grid)])

    print('Created ' + str(len(models)) + ' model instances.')

    return models
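The function above expects a config mapping model aliases to parameter grids. A hedged usage sketch follows; the aliases come from model_classes above, but the parameter values are illustrative assumptions rather than settings from the original project:

# Illustrative config only: keys must match the aliases in model_classes above,
# and the values are grids over real constructor arguments of those estimators.
config = {
    'ridge': {'alpha': [0.1, 1.0]},
    'linear_svc': {'C': [0.1, 1.0, 10.0]},
}

models = create_model_instances(config)  # builds 2 + 3 = 5 unfitted estimators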
Example 5
def trainIngredient(model, grid, train, cv, refit=True, n_jobs=5):
    from joblib import Parallel, delayed
    from sklearn.grid_search import ParameterGrid
    from numpy import zeros
    from sklearn.metrics import accuracy_score
    pred = zeros((train.shape[0], train.cuisine.unique().shape[0]))
    best_score = 0
    for g in ParameterGrid(grid):
        model.set_params(**g)
        results = Parallel(n_jobs=n_jobs)(
            delayed(fitIngredients)(train, list(cv), i, model)
            for i in range(cv.n_folds))
        for i in results:
            pred[i['index'], :] = i['pred']
        score = accuracy_score(train.cuisine, pred.argmax(1))
        if score > best_score:
            best_score = score
            best_pred = pred.copy()
            best_grid = g
    print("Best Score: %0.5f" % best_score)
    print("Best Grid", best_grid)
    if refit:
        X2 = splitIngredients(train)
        model.set_params(**best_grid)
        model.fit(X2.ingredient, X2.cuisine)
    return best_pred, IngredientModel(model)
Example 6
def test_iforest_sparse():
    """Check IForest for various parameter settings on sparse input."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50],
                                                        boston.target[:50],
                                                        random_state=rng)
    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "bootstrap": [True, False]
    })

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        for params in grid:
            # Trained on sparse format
            sparse_classifier = IsolationForest(n_estimators=10,
                                                random_state=1,
                                                **params).fit(X_train_sparse)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_classifier = IsolationForest(n_estimators=10,
                                               random_state=1,
                                               **params).fit(X_train)
            dense_results = dense_classifier.predict(X_test)

            assert_array_equal(sparse_results, dense_results)
Example 7
def test_parameters_sampler_replacement():
    # raise error if n_iter too large
    params = {'first': [0, 1], 'second': ['a', 'b', 'c']}
    sampler = ParameterSampler(params, n_iter=7)
    assert_raises(ValueError, list, sampler)
    # degenerates to GridSearchCV if n_iter the same as grid_size
    sampler = ParameterSampler(params, n_iter=6)
    samples = list(sampler)
    assert_equal(len(samples), 6)
    for values in ParameterGrid(params):
        assert_true(values in samples)

    # test sampling without replacement in a large grid
    params = {'a': range(10), 'b': range(10), 'c': range(10)}
    sampler = ParameterSampler(params, n_iter=99, random_state=42)
    samples = list(sampler)
    assert_equal(len(samples), 99)
    hashable_samples = [
        "a%db%dc%d" % (p['a'], p['b'], p['c']) for p in samples
    ]
    assert_equal(len(set(hashable_samples)), 99)

    # doesn't go into infinite loops
    params_distribution = {'first': bernoulli(.5), 'second': ['a', 'b', 'c']}
    sampler = ParameterSampler(params_distribution, n_iter=7)
    samples = list(sampler)
    assert_equal(len(samples), 7)
Example 8
def test_spectral_biclustering():
    """Test Kluger methods on a checkerboard dataset."""
    param_grid = {'method': ['scale', 'bistochastic', 'log'],
                  'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [3],
                  'n_jobs': [1]}
    random_state = 0
    S, rows, cols = make_checkerboard((30, 30), 3, noise=0.5,
                                      random_state=random_state)
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralBiclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)

            if issparse(mat) and kwargs['method'] == 'log':
                # cannot take log of sparse matrix
                assert_raises(ValueError, model.fit, mat)
                continue
            else:
                model.fit(mat)

            assert_equal(model.rows_.shape, (9, 30))
            assert_equal(model.columns_.shape, (9, 30))
            assert_array_equal(model.rows_.sum(axis=0),
                               np.repeat(3, 30))
            assert_array_equal(model.columns_.sum(axis=0),
                               np.repeat(3, 30))
            assert_equal(consensus_score(model.biclusters_,
                                         (rows, cols)), 1)
Example 9
def tune_parameters(estimator, name, param_grid, X, y, cv):

    logging.info('Tuning parameters for %s model' % name)
    grid_iterable = ParameterGrid(param_grid)

    logging.info('Fitting {0} folds for each of {1} candidates, totalling '
                 '{2} fits'.format(len(cv), len(grid_iterable),
                                   len(cv) * len(grid_iterable)))

    best_score, best_params = None, None
    for grid in grid_iterable:
        estimator.set_params(**grid)
        logging.info('Params: %s' % grid)
        mean_score, opt_n_estimators = cross_validation(estimator,
                                                        X,
                                                        y,
                                                        cv,
                                                        use_watch_list=True)

        if isinstance(estimator, xgb.XGBRegressor):
            grid['n_estimators'] = opt_n_estimators
        if (best_score is None) or (best_score > mean_score):
            best_score, best_params = mean_score, grid

    logging.info('Best parameters: %s, best score: %.5f' %
                 (best_params, best_score))
    logging.info('Parameters are tuned for %s model' % name)

    return best_params, best_score
Example 10
def clf_loop(models_to_run, clfs, grid, X, y, y_lab):
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters',
                                       'weighted_roc_score'))
    for n in range(1, 2):
        # create training and valdation sets
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    model = clf.fit(X_train, y_train)
                    #predict_proba(X_test)[:,1]
                    y_pred_probs = model.predict_proba(X_test)
                    y_pred_probs = pd.DataFrame(
                        [row[:, 1] for row in y_pred_probs]).transpose()
                    #y_pred_probs = y_pred_probs.set_index(testing.vessel_number)
                    y_pred_probs.columns = y_lab
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(
                        *sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [
                        models_to_run[index], clf, p,
                        roc_auc_scorer(y_test, y_pred_probs)
                    ]
                except IndexError:
                    print('IndexError:')
                    continue
    return results_df
Example 11
def initialize_regressors(sgd_grid):
    l = SGDRegressor()
    regressions_to_fit = []
    for params in ParameterGrid(sgd_grid):
        l.set_params(**params)
        regressions_to_fit.append(clone(l))
    return regressions_to_fit
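A hedged usage sketch for the helper above, assuming SGDRegressor, clone, and ParameterGrid are imported in the helper's module; the sgd_grid values are illustrative (they are real SGDRegressor parameters, but not taken from the original source):

# Illustrative only: a small grid of genuine SGDRegressor parameters passed
# to the initialize_regressors helper defined above.
sgd_grid = {'alpha': [1e-4, 1e-3], 'penalty': ['l2', 'l1']}
regressors = initialize_regressors(sgd_grid)
print(len(regressors))  # 4 unfitted SGDRegressor clones, one per combination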
Example 12
def develop():

    param_grid = [
        {
            "n_estimators"             : [200],
            "criterion"                : ["gini"],
            "max_features"             : ["auto", "sqrt", "log2"],
            "max_depth"                : [None],
            "min_samples_split"        : [5, 10, 15, 20], #[25, 50],
            "min_samples_leaf"         : [5, 10, 15, 20], #[25, 30, 35, 40, 45, 50, 55],
            "min_weight_fraction_leaf" : [0.0],
            "max_leaf_nodes"           : [None],
            "bootstrap"                : [True],
            "oob_score"                : [False],
            "n_jobs"                   : [-1],
            "random_state"             : [None],
            "verbose"                  : [0],
            "warm_start"               : [False],
            "class_weight"             : [None],
        }
    ]
    param_list = list(ParameterGrid(param_grid))
    
    process_list = []
    for param_dict in param_list:
        process = Process(target=train_validate_test,
                          args=(param_dict, ))
        process_list.append(process)

    for process in process_list:
        process.start()

    for process in process_list:
        process.join()
Example 13
    def requires(self):
        # get models to use
        models_to_run = [x.strip() for x in self.models_used.split('-')[1:]]
        # Construct the union of all codes for computing its complement for
        # 'other' in RunML

        clfs, grid = run_reg_models.define_clfs_params()

        final_weight = float(self.final_weight)
        overunder = [float(i) for i in self.overunder.split()]

        station_names = []
        for item in self.stations.split():
            station_names.append('STA' + "%02d" % int(item))

        # Wrapper loop for running all models
        runs = []
        counter = 0
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                for station in station_names:
                    runs.append(
                        RunMLReg(p,
                                 clf,
                                 features=self.features,
                                 station=station,
                                 final_weight=final_weight,
                                 schedule=self.schedule,
                                 overunder=overunder,
                                 model_counter=counter,
                                 label=self.label))
                    counter += 1

        return runs
Example 14
def grid_search_cv(datafile, model, grid_params, chunk_size):
    print("Starting batch with chunk size of %d lines" % chunk_size)

    kfold_offsets = load_or_compute_kfold(datafile, CV_N_FOLDS, chunk_size,
                                          KFOLD_OFFSETS_CACHE)

    best_mcc = None
    best_params = None

    for param in list(ParameterGrid(grid_params)):

        model_obj = model()

        model_obj.set_params(**param)

        if best_params is None:
            best_params = model_obj.get_params()

        print("Evaluating model %s" % model_obj.get_params())

        scores = cross_validate(datafile, model_obj, kfold_offsets, chunk_size)
        avg_score = sum(scores) / len(scores)
        print("Average MCC %0.6f" % avg_score)

        if best_mcc is None:
            best_mcc = avg_score
        else:
            if avg_score > best_mcc:
                best_mcc = avg_score
                best_params = model_obj.get_params()

    return best_mcc, best_params
Example 15
    def __init__(self, conf, process_num, stage):
        self.conf = conf
        self.input_layer_dimension = 1024
        self.label_names = conf['label_names']
        self.EF_ratio_list = conf['enrichment_factor']['ratio_list']

        self.process_num = process_num
        self.stage = stage

        if self.stage == 0:
            self.label_names = [self.label_names[0]]

        cnt = 0
        for param in ParameterGrid(conf['params']):
            if cnt != self.process_num:
                cnt += 1
                continue

            self.param = param
            self.n_estimators = param['n_estimators']
            self.max_features = param['max_features']
            self.min_samples_leaf = param['min_samples_leaf']
            self.class_weight = param['class_weight']
            print('Testing set:', param)
            break

        if self.max_features == "None":
            self.max_features = None
        if self.class_weight == "None":
            self.class_weight = None

        self.model_dict = {}
        return
Example 16
def generateParams():
    params = {
        'max_features': 'sqrt',
        'n_estimators': 1000,
        'learning_rate': 0.01
    }
    #params          = {'kernel' : 'linear' }

    # Set the parameters by cross-validation
    parameters_grid = {
        'max_depth': [3, 4, 5, 6, 7, 8],
        'min_samples_split': [2, 3, 4, 5, 6, 7],
        'min_samples_leaf': [3, 2, 4, 5, 6, 7],
        'n_estimators': [50, 75, 100, 150, 200, 300, 250],
        'learning_rate': [0.005, 0.01, 0.02, 0.03, 0.04, 0.05]
    }
    # Set the parameters by cross-validation
    #parameters_grid    = {'C': [0.0000001, 0.001, 0.005, 0.008, 0.01, 0.02, 0.05, 0.07, 0.09, 0.1, 0.2, 0.3, 0.4, 0.5, 1, 10, 100, 0.004]};

    parameters_search = list(ParameterGrid(parameters_grid))

    parameters_to_try = []
    for ps in parameters_search:
        params = {'max_features': 'sqrt'}
        params.update(ps)
        parameters_to_try.append(copy.copy(params))

    return parameters_to_try
Example 17
def trainFeatureModel(train, target, model, grid, cv, n_jobs=-1):
    from sklearn.grid_search import ParameterGrid
    from sklearn.metrics import accuracy_score
    from joblib import Parallel, delayed
    from numpy import zeros
    pred = zeros((train.shape[0], target.unique().shape[0]))
    best_score = 0
    best_grid = {}
    for g in ParameterGrid(grid):
        model.set_params(**g)
        if any('nthread' in k or 'n_jobs' in k for k in g):
            results = [
                fitSklearn(train, target, list(cv), i, model, True)
                for i in range(cv.n_folds)
            ]
        else:
            results = Parallel(n_jobs=n_jobs)(
                delayed(fitSklearn)(train, target, list(cv), i, model, True)
                for i in range(cv.n_folds))
        for i in results:
            pred[i['index'], :] = i['pred']
        score = accuracy_score(target, pred.argmax(1))
        if score > best_score:
            best_score = score
            best_pred = pred.copy()
            best_grid = g
    print("Best Score: %0.5f" % best_score)
    print("Best Grid:", best_grid)
    model.set_params(**best_grid)
    model.fit(train, target)
    return best_pred, model
Example 18
def clf_loop(models_to_run, clfs, grid, X, y):
    results_df = pd.DataFrame(columns=('model_type', 'clf', 'parameters', 'run_time',
                                       'auc-roc', 'p_at_5', 'p_at_10', 'p_at_20'))
    for n in range(1, 2):
        # create training and valdation sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    start_time = time.time()
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
                    run_time = time.time() - start_time
                    results_df.loc[len(results_df)] = [models_to_run[index], clf, p, run_time,
                                                       roc_auc_score(y_test, y_pred_probs),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                                                       precision_at_k(y_test_sorted, y_pred_probs_sorted, 20.0)]
                    if NOTEBOOK == 1:
                        plot_precision_recall_n(y_test, y_pred_probs, clf)
                except (IndexError):
                    print('Error:')
                    continue
    return results_df
Example 19
def param_search(estimator, param_dict, n_iter=None, seed=None):
    """
    Generator for cloned copies of `estimator` set with parameters
    as specified by `param_dict`. `param_dict` can contain either lists
    of parameter values (grid search) or a scipy distribution function
    to be sampled from. If distributions, you must specify `n_iter`.

    Parameters:
    ___________

    estimator:
        sklearn-like estimator

    param_dict:
        dict of parameter name: values, where values can be an iterable
        or a distribution function

    n_iter:
        number of draws to take from parameter distributions
    """

    if n_iter is None:
        param_iter = ParameterGrid(param_dict)
    else:
        param_iter = ParameterSampler(param_dict, n_iter, random_state=seed)

    estimators = []
    for params in param_iter:
        new_estimator = sklearn.clone(estimator)
        new_estimator.set_params(**params)
        estimators.append(new_estimator)
    return estimators
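A hedged usage sketch of param_search; Ridge and the parameter values below are illustrative assumptions, not taken from the original source:

# Illustrative only: grid mode (lists) versus sampling mode (distributions).
from scipy.stats import uniform
from sklearn.linear_model import Ridge

# Lists: every combination is enumerated via ParameterGrid (4 estimators here).
grid_estimators = param_search(Ridge(), {'alpha': [0.1, 1.0],
                                         'fit_intercept': [True, False]})

# Distributions: n_iter draws are taken via ParameterSampler (5 estimators here).
sampled_estimators = param_search(Ridge(), {'alpha': uniform(0, 10)},
                                  n_iter=5, seed=0)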
Example 20
    def fit(self, X, y):
        assert (self.X_tune is not None)
        if self.verbose:
            print('grid searching...')
        #ret = list((self._evaluateParameters(X, y, params) for params in ParameterGrid(self.searchParameters)))
        ret = self.parallel(
            joblib.delayed(_evaluateParameters)
            (X, y, self.X_tune, self.y_tune, self.classifierType,
             self.parameters, params, self.scoring)
            for params in ParameterGrid(self.searchParameters))

        if len(ret):
            bestClassifier, bestScore = max(ret, key=lambda x: x[1])
            if self.verbose:
                for c, s in ret:
                    print(
                        {
                            i: c.get_params()[i]
                            for i in set(c.get_params())
                            & set(self.searchParameters)
                        }, s)
            print("Best: {} Score: {}".format(
                {
                    i: bestClassifier.get_params()[i]
                    for i in set(bestClassifier.get_params())
                    & set(self.searchParameters)
                }, bestScore))

            self.classifier = bestClassifier

            return bestClassifier

        return None
Example 21
def test_spectral_coclustering():
    """Test Dhillon's Spectral CoClustering on a simple problem."""
    param_grid = {'svd_method': ['randomized', 'arpack'],
                  'n_svd_vecs': [None, 20],
                  'mini_batch': [False, True],
                  'init': ['k-means++'],
                  'n_init': [10],
                  'n_jobs': [1]}
    random_state = 0
    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5,
                                    random_state=random_state)
    S -= S.min()  # needs to be nonnegative before making it sparse
    S = np.where(S < 1, 0, S)  # threshold some values
    for mat in (S, csr_matrix(S)):
        for kwargs in ParameterGrid(param_grid):
            model = SpectralCoclustering(n_clusters=3,
                                         random_state=random_state,
                                         **kwargs)
            model.fit(mat)

            assert_equal(model.rows_.shape, (3, 30))
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert_equal(consensus_score(model.biclusters_,
                                         (rows, cols)), 1)
Example 22
def test_classification():
    """Check classification for various parameter settings."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                        iris.target,
                                                        random_state=rng)
    grid = ParameterGrid({
        "max_samples": [0.5, 1.0],
        "max_features": [1, 2, 4],
        "bootstrap": [True, False],
        "bootstrap_features": [True, False]
    })

    for base_estimator in [
            None,
            DummyClassifier(),
            Perceptron(),
            DecisionTreeClassifier(),
            KNeighborsClassifier(),
            SVC()
    ]:
        for params in grid:
            BaggingClassifier(base_estimator=base_estimator,
                              random_state=rng,
                              **params).fit(X_train, y_train).predict(X_test)
Example 23
def grid_search(lb_view, model, cv_split_filenames, param_grid):
    """
        Parameters: 
            lb_view = A load-balanced IPython.parallel client.
            model = tuple containing in the [0] index the alias name for the classifier
            and in the [1] index the instantiation of the classifier itself.
            cv_split_filenames = list of cross-val dataset filenames. 
            param_grid = dictionary of all the hyper-parameters for the pipeline
            objects to be trained.
       
        Output:
            List of parameters and list of asynchronous client tasks handles 
    """
    all_tasks = []
    all_parameters = list(ParameterGrid(param_grid))

    for i, params in enumerate(all_parameters):
        task_for_params = []

        for j, cv_split_filename in enumerate(cv_split_filenames):
            t = lb_view.apply(compute_evaluation, cv_split_filename, model,
                              params)
            task_for_params.append(t)

        all_tasks.append(task_for_params)

    return all_parameters, all_tasks
Example 24
 def get_parameters(self):
     '''
     Returns a list of all possible combinations of parameters.
     :return: list of dictionaries mapping parameter name to value
     '''
     specific_params = list(ParameterGrid(self.parameters))
     return specific_params
Example 25
def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):

    columns = ['classifier', 'parameters', 'pat5']
    temp_results_df = pd.DataFrame(data=np.zeros((0, len(columns))),
                                   columns=columns)

    for index, clf in enumerate([clfs[x] for x in models_to_run]):
        parameter_values = grid[models_to_run[index]]
        for p in ParameterGrid(parameter_values):
            try:
                clf.set_params(**p)
                y_pred_probs = clf.fit(X_train,
                                       y_train).predict_proba(X_test)[:, 1]
                # need to change this to store model and also get feature importances

                # threshold = np.sort(y_pred_probs)[::-1][int(.05*len(y_pred_probs))]
                # print threshold
                # print precision_at_k(y_test,y_pred_probs,.05)
                # plot_precision_recall_n(y_test,y_pred_probs,clf)
                temp_results_df.loc[len(temp_results_df)] = [
                    models_to_run[index], p,
                    precision_at_k(y_test, y_pred_probs, .05)
                ]
            except IndexError as e:
                print('Error:', e)
                continue

    return temp_results_df
Example 26
def tune_parameters(estimator, name, param_grid, X, y, cv):

    info('Tuning parameters for %s model' % name)
    grid_iterable = ParameterGrid(param_grid)

    info('Fitting {0} folds for each of {1} candidates, totalling {2} '
         'fits'.format(len(cv), len(grid_iterable),
                       len(cv) * len(grid_iterable)))

    best_score, best_params = None, None
    for i, grid in enumerate(grid_iterable):
        estimator.set_params(**grid)
        info('Params: %s' % grid)
        mean_score, opt_n_estimators = cross_validation(
            estimator, X, y, cv, True)

        if isinstance(estimator, xgb.XGBClassifier):
            grid['n_estimators'] = opt_n_estimators
        if (best_score is None) or (best_score > mean_score):
            best_score, best_params = mean_score, grid

    info('Best parameters: %s, best score: %.5f' % (best_params, best_score))
    info('Parameters are tuned for %s model' % name)

    return best_params, best_score
Example 27
def magic_loop(models_to_run, clfs, grid, X, y):
    '''
    Takes a list of model names, dictionaries of classifiers and parameter
    grids, and data arrays X and y.
    Finds the ten models with the best precision at 5 percent recall.
    '''
    table = {}
    top = []
    for i in range(10):
        top.append((0, " "))
    heapq.heapify(top)
    k = 0.05
    for n in range(1, 2):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            for p in ParameterGrid(grid[models_to_run[index]]):
                try:
                    clf.set_params(**p)
                    print (clf)
                    y_pred_probs = clf.fit(X_train, y_train).predict_proba(X_test)[:,1]
                    plot_precision_recall_n(y_test, y_pred_probs, clf)
                    l = scoring(k, y_test, y_pred_probs)
                    lowest_precision, _ = top[0]
                    precision = l['precision']
                    if precision > lowest_precision:
                        heapq.heapreplace(top, (precision, clf))
                        table[str(clf)] = l
                except Exception as e:
                    print('Error:', e)
                    continue
    print (top)
    return top, table
Example 28
 def __init__(self,
              estimator,
              param_grid,
              scoring=None,
              cv=4,
              refit=True,
              verbose=False,
              population_size=50,
              mutation_prob=0.10,
              tournament_size=3,
              generations_number=10,
              n_jobs=1,
              iid=True,
              pre_dispatch='2*n_jobs',
              error_score='raise',
              fit_params=None):
     super(EvolutionaryAlgorithmSearchCV,
           self).__init__(estimator, scoring, fit_params, n_jobs, iid,
                          refit, cv, pre_dispatch, error_score)
     _check_param_grid(param_grid)
     self.param_grid = param_grid
     self.possible_params = list(ParameterGrid(self.param_grid))
     self.individual_size = int(ceil(log(len(self.possible_params), 2)))
     self.population_size = population_size
     self.generations_number = generations_number
     self.best_estimator_ = None
     self.best_score_ = None
     self.best_params_ = None
     self._individual_evals = {}
     self.mutation_prob = mutation_prob
     self.tournament_size = tournament_size
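One detail worth spelling out in the constructor above: individual_size = int(ceil(log(len(possible_params), 2))) is the number of bits needed to index any candidate parameter set, so each genetic-algorithm individual can be encoded as a fixed-length bit string. A small arithmetic check (the 12-candidate grid is an illustrative assumption):

from math import ceil, log

possible_params = list(range(12))  # pretend ParameterGrid produced 12 candidates
individual_size = int(ceil(log(len(possible_params), 2)))
print(individual_size)             # 4: four bits can address indices 0..11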
Example 29
 def __init__(self,
              params,
              log_files_path,
              output_dim=1,
              output_activation=None,
              batch_size=16,
              epoch_count=1,
              score_func='r2_score',
              patience=150,
              class_mode='categorical',
              dropout=0.5,
              loss='categorical_crossentropy',
              default_activation='relu',
              default_optimizer=Adadelta()):
     self.epoch_count = epoch_count
     self.batch_size = batch_size
     self.log_files_path = log_files_path
     self.createDirectoryIfNotExist(self.log_files_path)
     self.param_grid = ParameterGrid(params)
     self.patience = patience
     self.dropout = dropout
     self.score_func = score_func
     self.output_dim = output_dim
     self.output_activation = output_activation
     self.default_activation = default_activation
     self.default_optimizer = default_optimizer
     self.loss = loss
     self.class_mode = class_mode
Example 30
def grid_search_param_model(model_name, grid_search_param_dict, X, y,
                            data_process_param_dict):
    param_grid = list(ParameterGrid(grid_search_param_dict))
    if model_name == "tree":
        for param in param_grid:
            tree_clf = DecisionTreeClassifier(
                max_depth=param.get("max_depth", None))
            param.update({'model': model_name})
            print(param)
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(tree_clf, param, dataset=X, label=y)
    if model_name == "adaboosting":
        for param in param_grid:
            tree_clf = DecisionTreeClassifier(
                max_depth=param.get("max_depth", None))
            ada_boost_clf = AdaBoostClassifier(
                base_estimator=tree_clf,
                n_estimators=param.get("n_estimators", None),
                learning_rate=param.get("learning_rate", None))
            param.update({'model': model_name})
            print(param)
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(ada_boost_clf,
                                          param,
                                          dataset=X,
                                          label=y)
    elif model_name == 'linearsvc':
        for param in param_grid:
            print "%s" % param
            param.update({'model': model_name})

            linsvc_clf = LinearSVC(C=param.get("C", None))
            if "class_weight" in param.keys():
                class_weight = param['class_weight']
                class_weight_key = class_weight.keys()
                class_weight_key = [unicode(key) for key in class_weight_key]
                class_weight_value = class_weight.values()
                class_weight = dict(zip(class_weight_key, class_weight_value))
                param.update({'class_weight': class_weight})
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(linsvc_clf,
                                          param,
                                          dataset=X,
                                          label=y)
    elif model_name == "random_forest":
        for param in param_grid:
            print "%s" % param
            param.update({'model': model_name})
            rf_clf = RandomForestClassifier(n_estimators=param.get(
                "n_estimators", None),
                                            max_depth=param.get("max_depth"))
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(rf_clf, param, dataset=X, label=y)
    elif model_name == "svm":
        for param in param_grid:
            print "%s" % param
            param.update({'model': model_name})
            svm_clf = SVC(C=param.get("C"), gamma=param.get("gamma"))
            param.update(data_process_param_dict)
            cross_validate_model_param_v2(svm_clf, param, dataset=X, label=y)