def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test,
             training_dates, testing_dates):
    """Runs the loop using models_to_run, clfs, gridm and the data
    """
    results = []
    for n in range(1, 2):
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    model_fit = clf.fit(X_train, y_train)
                    y_pred_probs = model_fit.predict_proba(X_test)[:, 1]
                    # Store metrics and model info for comparison
                    y_pred_probs_sorted, y_test_sorted = zip(
                        *sorted(zip(y_pred_probs, y_test), reverse=True))
                    row = [
                        training_dates,
                        testing_dates,
                        models_to_run[index],
                        clf,
                        p,
                        baseline(y_test),
                        roc_auc_score(y_test, y_pred_probs),
                        accuracy_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                        accuracy_at_k(y_test_sorted, y_pred_probs_sorted,
                                      20.0),
                        accuracy_at_k(y_test_sorted, y_pred_probs_sorted,
                                      50.0),
                        f1_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                        f1_at_k(y_test_sorted, y_pred_probs_sorted, 20.0),
                        f1_at_k(y_test_sorted, y_pred_probs_sorted, 50.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       1.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       5.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       10.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       20.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       50.0),
                        recall_at_k(y_test_sorted, y_pred_probs_sorted, 1.0),
                        recall_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                        recall_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                        recall_at_k(y_test_sorted, y_pred_probs_sorted, 20.0),
                        recall_at_k(y_test_sorted, y_pred_probs_sorted, 50.0),
                    ]
                    results.append(row)
                    #plot_precision_recall_n(y_test,y_pred_probs,clf)
                except IndexError as e:
                    print('Error:', e)
                    continue

    return results
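A minimal, self-contained sketch of the same ParameterGrid loop pattern used above; the precision/recall/accuracy/f1 *_at_k helpers are assumed to live elsewhere in this module, so plain ROC AUC stands in here, and the data is synthetic.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid, train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = LogisticRegression(max_iter=1000)
rows = []
for p in ParameterGrid({'C': [0.1, 1.0, 10.0]}):
    clf.set_params(**p)
    probs = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    rows.append((p, roc_auc_score(y_test, probs)))

# best parameter setting by AUC
print(sorted(rows, key=lambda r: r[1], reverse=True)[0])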
Example #2
def best_classifier(X,Y,Xvs,Yvs):
    parameters = {'C':[3,13,67,330,1636,8103]}
    pg = ParameterGrid(parameters)
    clas = Parallel(n_jobs=4)(delayed(pfit)(p,X,Y,Xvs,Yvs) for p in pg)
    clas.sort(reverse=True)
    (sc,cla) = clas[0]
    print('-' * 20)
    print('best is', cla, sc)
    print('-' * 20)
    return cla,sc
Example #3
    def __get_param_iterable(self, param_grid):
        if self.ramdonized_search_enable:
            parameter_iterable = ParameterSampler(
                param_grid,
                self.randomized_search_n_iter,
                random_state=self.ramdonized_search_random_state)
        else:
            parameter_iterable = ParameterGrid(param_grid)

        return parameter_iterable
Example #4
 def fit(self, X, y):
 
     if self.verbose >= 1:
         print('Running GridSearchCV...')
             
     #p = Pool(processes=self.n_jobs)
     p = MyPool(processes=self.n_jobs)
     
     #Get parameters for multiprocessing task:
     #get parameters that belongs only to first estimator. The remaining belongs to the other estimators
     param_grid_mine, param_grid_others = self.split_params(self.param_grid, self.estimators.steps[0][0])
     params_vars = list(ParameterGrid(param_grid_mine)) #vary all parameters for this estimator only
     
     if False:  # set to True to make debugging the inner functions easier; it does not use multiprocessing in this case
         print('debug path: this will raise an error, remove this!')
         lst_scores, lst_params = self.est_var(self.estimators.steps, self.param_grid, X, y, params_vars=params_vars)
         
     params_pool =[]
 
     #divide the first estimator parameters among n_jobs
     for i in range(self.n_jobs):
         params_pool.append(params_vars[i::self.n_jobs])
         
     #for multiple arguments and python 3.3, use pool.starmap()        
     j=len(params_pool)
     results = p.map(unwrap_self_est_var, zip([self]*j, [self.estimators.steps]*j, [param_grid_others]*j, [X]*j, [y]*j, [None]*j, [None]*j, params_pool))
     
     p.close() #terminate process
     p.join()
     
     lst_scores = []
     lst_params = []
     for result in results:
         lst_scores.extend(result[0])
         lst_params.extend(result[1])
     
     #get the best score and best parameters
     best_score = np.asarray(lst_scores).min()
     idx_best = np.where(lst_scores == best_score)
     best_params = [lst_params[i] for i in idx_best[0]]
     
     self.best_score_ =best_score 
     self.best_params_ = best_params
     
     if self.refit:
         # fit the best estimator using the entire dataset
         self.set_params(self.best_params_[0])
         self.estimators= self.estimators.fit(X, y)
         
      if self.verbose >= 1:
          print('best_params = ')
          for best_param in best_params:
              print(best_param)
 
     return self
def test_parameter_grid():
    """Test basic properties of ParameterGrid."""
    params1 = {"foo": [1, 2, 3]}
    grid1 = ParameterGrid(params1)
    assert_true(isinstance(grid1, Iterable))
    assert_true(isinstance(grid1, Sized))
    assert_equal(len(grid1), 3)

    params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]}
    grid2 = ParameterGrid(params2)
    assert_equal(len(grid2), 6)

    # loop to assert we can iterate over the grid multiple times
    for i in range(2):
        # tuple + chain transforms {"a": 1, "b": 2} to ("a", 1, "b", 2)
        points = set(tuple(chain(*p.items())) for p in grid2)
        assert_equal(
            points,
            set(("foo", x, "bar", y)
                for x, y in product(params2["foo"], params2["bar"])))
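The same ParameterGrid properties checked above (length, repeatable iteration, distinct combinations), in a small standalone form against the current sklearn.model_selection import path:

from itertools import chain, product
from sklearn.model_selection import ParameterGrid

params2 = {"foo": [4, 2], "bar": ["ham", "spam", "eggs"]}
grid2 = ParameterGrid(params2)
assert len(grid2) == 6             # Sized: 2 * 3 combinations
for _ in range(2):                 # the grid can be iterated more than once
    points = {tuple(chain(*sorted(p.items()))) for p in grid2}
    assert points == {("bar", b, "foo", f)
                      for f, b in product(params2["foo"], params2["bar"])}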
Example #6
    def run(self):
        with self.input()[0] as i:
            graphIndex, GR = i.query()

        with self.input()[1] as i:
            R = i.query()

        train_index = np.array(self.train_index)
        test_index = list(range(len(GR)))
        test_index = [i for i in test_index if i not in train_index]

        y_bin = [None] * len(GR)
        for index, D in R.items():
            if index not in graphIndex:
                continue
            gI = graphIndex[index]
            score = D['score']
            time = evalTime(D['time_rank'])
            y_bin[gI] = (score['IUV'], score['ESBMC'], time)
        y_bin = np.array(y_bin)
        y_test = y_bin[test_index]
        y_train = y_bin[train_index]

        TGR = GR[train_index][:, train_index]
        EGR = GR[test_index][:, train_index]

        param_grid = {
            'C_tester': self.C_tester.value,
            'C_verifier': self.C_verifier.value,
            'C_time': self.C_time.value
        }
        max_mean = -math.inf
        max_param = None
        for params in ParameterGrid(param_grid):

            tick(self)

            mean, _ = self._k_fold_cv_gram(
                TGR, y_train,
                (params['C_tester'], params['C_verifier'], params['C_time']))
            if mean > max_mean:
                max_mean = mean
                max_param = params

        max_param['mean'] = max_mean
        max_param['h'] = self.h
        max_param['D'] = self.D
        max_param['train_matrix'] = TGR.tolist()
        max_param['train_y'] = y_train.tolist()
        max_param['test_matrix'] = EGR.tolist()
        max_param['test_y'] = y_test.tolist()

        with self.output() as o:
            o.emit(max_param)
def search_test_params(base_clf, cv_params, X, y, train, test, scoring):
    parameter_iterable = ParameterGrid(cv_params)
    grid_scores = Parallel(n_jobs=-1)(
        delayed(_fit_and_score)(clone(base_clf), X, y, scoring,
                                train, test, 0, parameters,
                                None, return_parameters=True)
            for parameters in parameter_iterable)
    # grid_scores = [_fit_and_score(clone(base_clf), X, y, scoring, train, test, 0, parameters, None, return_parameters=True) for parameters in parameter_iterable]
    grid_scores = sorted(grid_scores, key=lambda x: x[0], reverse=True)
    scores, _, _, parameters = grid_scores[0]
    return scores, parameters
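search_test_params relies on sklearn's private _fit_and_score helper, whose signature has changed across releases. A rough public-API sketch of the same parallel pattern, with a local fit_and_score stand-in (an assumption for illustration, not sklearn's helper):

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.svm import SVC

def fit_and_score(clf, X, y, train, test, params):
    # local stand-in: fit on the train indices, score on the test indices
    clf = clone(clf).set_params(**params)
    clf.fit(X[train], y[train])
    return accuracy_score(y[test], clf.predict(X[test])), params

X, y = make_classification(n_samples=300, random_state=0)
train, test = train_test_split(np.arange(len(y)), random_state=0)

grid_scores = Parallel(n_jobs=-1)(
    delayed(fit_and_score)(SVC(), X, y, train, test, params)
    for params in ParameterGrid({'C': [0.1, 1.0, 10.0]}))
best_score, best_params = max(grid_scores, key=lambda s: s[0])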
Example #8
 def generate_models(self, input_shape, output_dim):
     loss_type = self.grid.params_grid["loss"][0]
     for layers in self.create_network_structures(self.grid.params_grid["layers"], self.grid.params_grid["layer_nums"], input_shape):
         print "Current network: %s" % "->".join(layers)
         flat_params_grid = self.grid.create_flat_layers_grid(layers, input_shape, output_dim)
         for optimizer_name in self.grid.params_grid["optimizers"]:
             flat_grid = flat_params_grid.copy()
             flat_grid.update(self.grid.create_flat_optimizer_grid(optimizer_name))
             for params in ParameterGrid(flat_grid):
                 nn_params = self.grid.fold_params(params)
                 yield self.model_factory.create_model(layers, nn_params, loss_type)
    def _grid_search_params_iter(self, train_X, train_y):
        if callable(self.inner_cv):
            inner_cv = self.inner_cv(train_X, train_y)
        else:
            inner_cv = _check_cv(self.inner_cv, train_X, train_y, classifier=is_classifier(self.estimator))

        param_iter = ParameterGrid(self.param_grid)
        LOG.info("Performing grid search over %d configurations" % len(param_iter))

        for fold_id, (train_index, test_index) in enumerate(inner_cv):
            for parameters in param_iter:
                yield fold_id + 1, train_index, test_index, parameters
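A standalone sketch of the (fold, parameters) product this generator yields, with an ordinary KFold standing in for the class's inner_cv attribute:

from sklearn.datasets import load_iris
from sklearn.model_selection import KFold, ParameterGrid

def grid_search_params_iter(inner_cv, param_grid, X, y):
    # yield one (fold, train_idx, test_idx, params) tuple per fold/parameter pair
    param_iter = ParameterGrid(param_grid)
    print("Performing grid search over %d configurations" % len(param_iter))
    for fold_id, (train_index, test_index) in enumerate(inner_cv.split(X, y)):
        for parameters in param_iter:
            yield fold_id + 1, train_index, test_index, parameters

X, y = load_iris(return_X_y=True)
jobs = list(grid_search_params_iter(KFold(n_splits=3),
                                    {'C': [1, 10], 'gamma': [0.1, 1.0]}, X, y))
assert len(jobs) == 12             # 3 folds x 4 parameter settings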
Example #10
def cv_trials(X, y, folds, model, hyper):
    N = len(y)

    cv_scores = []
    predictions = {
        'pred': np.zeros(N, dtype=bool),
        'proba': np.zeros(N),
        'foldno': np.zeros(N, dtype=np.int32) - 1,
    }
    pg = list(ParameterGrid(hyper))
    for foldno, (train, val, test) in enumerate(folds):
        train_X, train_y = X[train], y[train]
        val_X, val_y = X[val], y[val]
        test_X, test_y = X[test], y[test]
        best_params = None
        best_val_f1 = None
        for these_params in pg:
            model.set_params(**these_params)
            model.fit(train_X, train_y)
            this_val_f1 = metrics.f1_score(val_y, model.predict(val_X), average="weighted")
            if not best_params or this_val_f1 > best_val_f1:
                best_params = these_params
                best_val_f1 = this_val_f1
        if len(pg) > 1:
            model.set_params(**best_params)
            model.fit(train_X, train_y)
        train_f1 = metrics.f1_score(train_y, model.predict(train_X), average="weighted")

        preds_y = model.predict(test_X)
        predictions['pred'][test] = preds_y

        predictions['foldno'][test] = foldno

        fold_eval = {'f1': metrics.f1_score(test_y, preds_y, average="weighted"),
                      'p': metrics.precision_score(test_y, preds_y, average="weighted"),
                      'r': metrics.recall_score(test_y, preds_y, average="weighted"),
                      'a': metrics.accuracy_score(test_y, preds_y)}
        print "[%02d] Best hyper [train %.3f -> val %.3f -> test %.3f] %s" % (foldno, train_f1, best_val_f1, fold_eval['f1'], best_params)


        cv_scores.append(fold_eval)
        np.set_printoptions(suppress=True)

    # now we want to compute global evaluations, and consolidate metrics
    cv_scores = consolidate(cv_scores)

    preds_y = predictions['pred']
    pooled_eval = {'f1': metrics.f1_score(y, preds_y, average="weighted"),
                    'p': metrics.precision_score(y, preds_y, average="weighted"),
                    'r': metrics.recall_score(y, preds_y, average="weighted"),
                    'a': metrics.accuracy_score(y, preds_y)}

    return pooled_eval, predictions, cv_scores
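cv_trials expects folds to be (train, val, test) index triples: each hyperparameter setting is fit on train, selected on val, and only the winner is evaluated on test. A hedged sketch of how such folds might be built and handed in; the three-way split below is an illustration, not part of the original code.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = make_classification(n_samples=300, random_state=0)

# Build (train, val, test) index triples: each KFold test block becomes the test
# split, and the last 20% of the remaining indices become the validation split.
folds = []
for rest, test in KFold(n_splits=5, shuffle=True, random_state=0).split(X):
    cut = int(0.8 * len(rest))
    folds.append((rest[:cut], rest[cut:], test))

hyper = {'C': [0.1, 1.0, 10.0]}
model = LogisticRegression(max_iter=1000)
# These would then be passed to cv_trials above (which also needs the module's
# consolidate helper):
# pooled_eval, predictions, cv_scores = cv_trials(X, y, folds, model, hyper)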
    def _create_batches(self):
        param_iter = ParameterGrid(self.param_grid)

        # divide work into batches equal to the communicator's size
        work_batches = [[] for _ in range(comm_size)]
        i = 0
        for fold_id, (train_index, test_index) in enumerate(self.cv_iter):
            for parameters in param_iter:
                work_batches[i % comm_size].append((fold_id + 1, train_index, test_index, parameters))
                i += 1

        return work_batches
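The round-robin batching above does not depend on MPI itself; a self-contained sketch of the same dealing-out logic, with comm_size replaced by a plain integer:

import numpy as np
from sklearn.model_selection import KFold, ParameterGrid

comm_size = 4                      # stand-in for the MPI communicator size
X = np.arange(20).reshape(10, 2)
param_iter = ParameterGrid({'alpha': [0.1, 1.0, 10.0]})

work_batches = [[] for _ in range(comm_size)]
i = 0
for fold_id, (train_index, test_index) in enumerate(KFold(n_splits=5).split(X)):
    for parameters in param_iter:
        # deal work items out to the workers in round-robin order
        work_batches[i % comm_size].append((fold_id + 1, train_index, test_index, parameters))
        i += 1

# 5 folds x 3 parameter settings = 15 items spread over 4 workers
assert sorted(len(b) for b in work_batches) == [3, 4, 4, 4]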
Example #12
def grid_generator(search_space):
    param_grid = ParameterGrid(search_space)
    all_params = []
    for p in param_grid:
        all_params.append(p)
    for key in search_space.keys():
        if (isinstance(search_space[key], dict)):
            new_params=[]
            for param in all_params:
                if (search_space[key][param[key]] is None):
                    new_params.append(param)
                else:
                    param_grid = ParameterGrid(search_space[key][param[key]])
                    add_params = [p for p in param_grid]
                    for aparam in add_params:
                        tparam = copy.copy(param)
                        tparam.update(aparam)
                        new_params.append(tparam)
            all_params = new_params
    for param in all_params:
        yield param
Example #13
def performGridSearchWithKx2CV(clf, X, y, params):

    grid = list(ParameterGrid(params))
    performances = []

    for p in grid:
        new_clf = base.clone(clf)
        new_clf.set_params(**p)
        performances.append(np.mean(getKx2CVScores(new_clf, X, y)[0]))
        print(performances[-1])

    return [grid, performances]
Example #14
 def fit(self, X, y=None):
     """Run fit with all sets of parameters.
     Parameters
     ----------
     X : array-like, shape = [n_samples, n_features]
         Training vector, where n_samples is the number of samples and
         n_features is the number of features.
     y : array-like, shape = [n_samples] or [n_samples, n_output], optional
         Target relative to X for classification or regression;
         None for unsupervised learning.
     """
     return self._fit(X, y, ParameterGrid(self.param_grid))
Example #15
def clf_loop(models_to_run, clfs, grid, X, y):
    results_df = pd.DataFrame(
        columns=('model_type', 'clf', 'parameters', 'time_used', 'auc-roc',
                 'p_at_5', 'recall_at_5', 'accuracy_at_5', 'f1_at_5',
                 'p_at_10', 'recall_at_10', 'accuracy_at_10', 'f1_at_10',
                 'p_at_20', 'recall_at_20', 'accuracy_at_20', 'f1_at_20'))
    for n in range(1, 2):
        # create training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=0)
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    start_time = time.time()
                    y_pred_probs = clf.fit(X_train,
                                           y_train).predict_proba(X_test)[:, 1]
                    end_time = time.time()
                    elapsed_time = end_time - start_time
                    # you can also store the model, feature importances, and prediction scores
                    # we're only storing the metrics for now
                    y_pred_probs_sorted, y_test_sorted = zip(
                        *sorted(zip(y_pred_probs, y_test), reverse=True))
                    results_df.loc[len(results_df)] = [
                        models_to_run[index], clf, p, elapsed_time,
                        roc_auc_score(y_test, y_pred_probs),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       5.0),
                        recall(y_test_sorted, y_pred_probs_sorted, 5.0),
                        accuracy(y_test_sorted, y_pred_probs_sorted, 5.0),
                        f1(y_test_sorted, y_pred_probs_sorted, 5.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       10.0),
                        recall(y_test_sorted, y_pred_probs_sorted, 10.0),
                        accuracy(y_test_sorted, y_pred_probs_sorted, 10.0),
                        f1(y_test_sorted, y_pred_probs_sorted, 10.0),
                        precision_at_k(y_test_sorted, y_pred_probs_sorted,
                                       20.0),
                        recall(y_test_sorted, y_pred_probs_sorted, 20.0),
                        accuracy(y_test_sorted, y_pred_probs_sorted, 20.0),
                        f1(y_test_sorted, y_pred_probs_sorted, 20.0)
                    ]
                    if NOTEBOOK == 1:
                        eval_model_precision_recall(y_test, y_pred_probs, clf)
                        #eval_model_roc(y_test,y_pred_probs,clf)
                except IndexError:
                    print("IndexError")
                    continue
    return results_df
Example #16
    def initialize_classifiers(self, model_setup):

        for clf_name, clf in self.valid_classifiers.items():

            if clf_name in model_setup:
                parameter_grid = model_setup[clf_name]
                for params in ParameterGrid(parameter_grid):
                    clf.set_params(**params)
                    self.classifiers_to_fit.append(clone(clf))
            else:
                continue
        assert self.classifiers_to_fit != [], "Could not find any valid classifiers"
Example #17
def trainSklearn(model,
                 grid,
                 train,
                 target,
                 cv,
                 refit=True,
                 n_jobs=5,
                 multi=False):
    """
	Train a sklearn pipeline or model using textual data as input.
	"""
    from joblib import Parallel, delayed
    from sklearn.grid_search import ParameterGrid
    from numpy import zeros
    if multi:
        pred = zeros((train.shape[0], target.unique().shape[0]))
        from sklearn.metrics import accuracy_score
        score_func = accuracy_score
    else:
        from sklearn.metrics import roc_auc_score
        score_func = roc_auc_score
        pred = zeros(train.shape[0])
    best_score = 0
    for g in ParameterGrid(grid):
        model.set_params(**g)
        if any('nthread' in x for x in g):
            results = [
                fitSklearn(train, target, list(cv), i, model, multi)
                for i in range(cv.n_folds)
            ]
        else:
            results = Parallel(n_jobs=n_jobs)(
                delayed(fitSklearn)(train, target, list(cv), i, model, multi)
                for i in range(cv.n_folds))
        if multi:
            for i in results:
                pred[i['index'], :] = i['pred']
            score = score_func(target, pred.argmax(1))
        else:
            for i in results:
                pred[i['index']] = i['pred']
            score = score_func(target, pred)
        if score > best_score:
            best_score = score
            best_pred = pred.copy()
            best_grid = g
    print("Best Score: %0.5f" % best_score)
    print("Best Grid", best_grid)
    if refit:
        model.set_params(**best_grid)
        model.fit(train, target)
    return best_pred, model
Example #18
def create_grid(mp_in=None, npoints=3, provided_keys=None, ga=None):
    '''
    Create a grid of evenly spaced samples in model-parameter space to search over.

    Inputs:
        npoints, type Integer: number of sample points per parameter.
        nparams, type Integer: number of parameters to use; conflicts with the next argument.
            nparams iterates through a list of parameters and assigns which parameters to use by simple counting.
        provided_keys: explicitly state the model parameters used to create the grid of samples,
            by keying into an existing dictionary of parameters.

    This method requires the caller to declare a dictionary of model parameters in the module
    neuronunit.optimization.model_parameters.

    Miscellaneous: once the grid created by this function has been evaluated using neuronunit,
    it can be used to inform a more refined, fine-grained second-pass grid.

    # smaller is a dictionary that is not necessarily as big
    # as the grid defined in the model_params file. It is not necessarily
    # a smaller dictionary; if it is smaller, it is reduced by reducing sampling
    # points.
    '''
    if mp_in is None:
        from neuronunit.models.NeuroML2 import model_parameters as modelp
        mp_in = OrderedDict(modelp.model_params)

    whole_p_set = OrderedDict(sample_points(copy.copy(mp_in), npoints=npoints))

    print(type(provided_keys), 'provided keys')
    if isinstance(provided_keys, dict):

        subset = OrderedDict(
            {k: whole_p_set[k]
             for k in list(provided_keys.keys())})

    elif len(provided_keys) == 1 or isinstance(provided_keys, str):
        subset = OrderedDict({provided_keys: whole_p_set[provided_keys]})

    else:
        subset = OrderedDict({k: whole_p_set[k] for k in provided_keys})

    maps = create_a_map(subset)
    if ga is not None:
        if npoints > 1:
            for k, v in subset.items():
                v[0] = v[0] * 1.0 / 3.0
                v[1] = v[1] * 2.0 / 3.0

    # The function of maps is to map floating-point sample spaces onto monochromatic matrix indices.
    grid = list(ParameterGrid(subset))
    return grid, maps
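create_grid ultimately hands an OrderedDict of per-parameter sample arrays to ParameterGrid. A toy illustration of that final step, with plain linspace values standing in for sample_points and the neuronunit parameter dictionary:

from collections import OrderedDict
import numpy as np
from sklearn.model_selection import ParameterGrid

# stand-in for sample_points(): three evenly spaced samples per parameter
subset = OrderedDict([('a', np.linspace(0.0, 1.0, 3)),
                      ('b', np.linspace(10.0, 20.0, 3))])
grid = list(ParameterGrid(subset))
assert len(grid) == 9              # 3 samples per parameter -> 3 x 3 grid points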
Example #19
def run_the_models(data, models_to_run, response, features):
    """
    This runs models and produces evaluation output:
    inputs:
        data: dataframe with data
        models_to_run: list of models to run 
        responce: column name of y variable
        features: list of column names for model features 
    returns:
        dataframe 
    """
    thresholds = [0.01, 0.02, 0.05, 0.10, 0.20, 0.30, 0.50]
    precision_cols = ["precision_at_{}".format(str(x)) for x in thresholds]
    recall_cols = ["recall_at_{}".format(str(x)) for x in thresholds]
    cols = [
        'model', 'parameters', 'train_start', 'train_end', 'test_start',
        'test_end', 'f1_score', 'auc'
    ] + precision_cols + recall_cols
    model_results = []
    splits = temporal_train_test_split(data, 'date_posted', freq='6M')
    for train, test in splits:
        X_train, X_test, y_train, y_test = split_data_by_time(
            data, 'date_posted', train, test, response, features)
        for m in models_to_run:
            if m not in MODELS:
                print(m, 'bad model')
                break
            clf = MODELS[m]
            parameter_grid = ParameterGrid(PARAMS[m])
            for p in parameter_grid:
                try:
                    # initialize list to keep track of results
                    res = [m, p, train[0], train[1], test[0], test[1]]
                    clf.set_params(**p)
                    clf.fit(X_train, y_train)
                    predicted_scores = clf.predict_proba(X_test)[:, 1]
                    predicted_vals = clf.predict(X_test)
                    true_labels = y_test
                    precise = calculate_precision_at_threshold_multi(
                        predicted_scores, true_labels, thresholds)
                    recall = calculate_recall_at_threshold_multi(
                        predicted_scores, true_labels, thresholds)
                    auc = sklearn.metrics.roc_auc_score(
                        true_labels, predicted_vals)
                    f1 = sklearn.metrics.f1_score(true_labels, predicted_vals)
                    # append metrics to list
                    res = res + [auc, f1] + precise + recall
                    model_results.append(res)
                except Exception as e:
                    print(e, m, p)
        df = pd.DataFrame(model_results, columns=cols)
    return df
Example #20
def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test,
             test_end_year, label):

    #results_df=pd.DataFrame(columns=('timestamp','model_type','clf','parameters','auc-roc','baseline','p_at_0.5','p_at_1','p_at_5','p_at_10','p_at_10'))
    index = 0
    for index, clf in enumerate([clfs[x] for x in models_to_run]):
        print('clf:', clf)
        print(models_to_run[index])
        clf_nm = models_to_run[index]
        parameter_values = grid[models_to_run[index]]
        for p in ParameterGrid(parameter_values):
            try:
                clf.set_params(**p)
                print(clf)
                y_pred_probs = clf.fit(X_train,
                                       y_train).predict_proba(X_test)[:, 1]
                print('Model fitted')
                # you can also store the model, feature importances, and prediction scores
                # we're only storing the metrics for now
                y_pred_probs_sorted, y_test_sorted = zip(
                    *sorted(zip(y_pred_probs, y_test), reverse=True))
                print('scoring')
                results_df = pd.DataFrame(
                    columns=('timestamp', 'model_type', 'clf', 'parameters',
                             'auc-roc', 'baseline', 'p_at_0.5', 'p_at_1',
                             'p_at_5', 'p_at_10', 'p_at_20'))

                results_df.loc[len(results_df)] = [
                    str(datetime.datetime.now()), models_to_run[index], clf, p,
                    roc_auc_score(y_test, y_pred_probs),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 100),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 0.5),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 1.0),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 20.0)
                ]
                filename = "results" + "_" + str(
                    test_end_year) + "_" + label + '.csv'
                results_df.to_csv(filename, index=True, mode='a')

                if NOTEBOOK == 1:
                    plot_precision_recall_n(y_test,
                                            y_pred_probs,
                                            clf,
                                            fname=clf_nm + '_' + str(index) +
                                            '_' + label + '_' +
                                            str(test_end_year) + '.png')
                    index += 1
            except IndexError as e:
                print('Error:', e)
                continue
Example #21
def create_misc_confs():
    from sklearn.grid_search import ParameterGrid
    params = {
        'break_width': [1.5, 2.0, 3.6, 5.0],
        'recognizer': ['probout', 'hmm'],
        'combine_hangoff': [.4, .6, .8],
        'postprocess': [True, False],
        'segmenter': ['experimental', 'stochastic'],
        'line_cluster_pos': ['top', 'center'],
    }
    grid = ParameterGrid(params)
    for pr in grid:
        Config(save_conf=True, **pr)
def parallel_grid_search(lb_view, clf, cv_split_filenames, param_grid):
    all_tasks = []
    all_parameters = list(ParameterGrid(param_grid))

    # iterate over parameter combinations
    for i, params in enumerate(all_parameters):
        task_for_params = []
        # iterate over the K folds
        for j, cv_split_filename in enumerate(cv_split_filenames):
            t = lb_view.apply( compute_evaluation, cv_split_filename, clf, params)
            task_for_params.append(t)
        all_tasks.append(task_for_params)
    return all_parameters, all_tasks
Example #23
def test_iforest():
    """Check Isolation Forest for various parameter settings."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    grid = ParameterGrid({"n_estimators": [3],
                          "max_samples": [0.5, 1.0, 3],
                          "bootstrap": [True, False]})

    with ignore_warnings():
        for params in grid:
            IsolationForest(random_state=rng,
                            **params).fit(X_train).predict(X_test)
Example #24
    def fit(self, X, y):
        """Actual fitting,  performing the search over parameters."""

        parameter_iterable = ParameterGrid(self.param_grid)

        estimator = self.estimator
        cv = self.cv

        n_samples = _num_samples(X)
        X, y = indexable(X, y)

        if y is not None:
            if len(y) != n_samples:
                raise ValueError('Target variable (y) has a different number '
                                 'of samples (%i) than data (X: %i samples)' %
                                 (len(y), n_samples))
        cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

        if self.verbose > 0:
            if isinstance(parameter_iterable, Sized):
                n_candidates = len(parameter_iterable)
                print("Fitting {0} folds for each of {1} candidates, totalling"
                      " {2} fits".format(len(cv), n_candidates,
                                         n_candidates * len(cv)))

        base_estimator = clone(self.estimator)

        pre_dispatch = self.pre_dispatch

        out = Parallel(
            n_jobs=self.n_jobs,
            verbose=self.verbose,
            pre_dispatch=pre_dispatch)(delayed(cv_fit_and_score)(
                clone(base_estimator), X, y, self.scoring, parameters, cv=cv)
                                       for parameters in parameter_iterable)

        best = sorted(out, key=lambda x: x[0])[-1]
        self.best_params_ = best[1]
        self.best_score_ = best[0]

        if self.refit:
            # fit the best estimator using the entire dataset
            # clone first to work around broken estimators
            best_estimator = clone(base_estimator).set_params(**best[1])
            if y is not None:
                best_estimator.fit(X, y, **self.fit_params)
            else:
                best_estimator.fit(X, **self.fit_params)
            self.best_estimator_ = best_estimator

        return self
Example #25
def clf_loop(models_to_run, clfs, grid, X_train, X_test, y_train, y_test):
    '''
    Runs the loop using models_to_run, clfs, grid, and the data
    '''
    results_df =  pd.DataFrame(columns=('model_type','clf', 'parameters', 'auc-roc', 'baseline',
                                        'p_at_1', 'p_at_2', 'p_at_5', 'p_at_10', 'p_at_20',
                                        'p_at_30', 'p_at_50',
                                        'r_at_1', 'r_at_2', 'r_at_5', 'r_at_10', 'r_at_20',
                                        'r_at_30', 'r_at_50',
                                        'f_at_1', 'f_at_2', 'f_at_5', 'f_at_10', 'f_at_20',
                                        'f_at_30', 'f_at_50'))
                                        
    for n in range(1, 2):
        for index,clf in enumerate([clfs[x] for x in models_to_run]):
            print(models_to_run[index])
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(X_train, y_train.values.ravel()).predict_proba(X_test)[:,1]
                    y_pred_probs_sorted, y_test_sorted = zip(*sorted(zip(y_pred_probs, y_test), reverse=True))
                    y_pred_probs_sorted = np.asarray(y_pred_probs_sorted)
                    results_df.loc[len(results_df)] = [models_to_run[index],clf, p,
                                                       roc_auc_score(y_test, y_pred_probs),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,100.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,1.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,2.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,5.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,10.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,20.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,30.0),
                                                       precision_at_k(y_test_sorted,y_pred_probs_sorted,50.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 1.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 2.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 5.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 10.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 20.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 30.0),
                                                       recall_at_k(y_test_sorted,y_pred_probs_sorted, 50.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 1.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 2.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 5.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 10.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 20.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 30.0),
                                                       F_score_at_k(y_test_sorted,y_pred_probs_sorted, 50.0)]
                    plot_precision_recall_n(y_test,y_pred_probs,clf)
                except IndexError as e:
                    print('Error:',e)
                    continue
    return results_df
def run_models():
    stream = open("generate_predictions.yml", 'r')
    docs = yaml.safe_load_all(stream)

    for doc in docs:
        train_df = pd.read_csv(doc['train_table_name'])
        test_df = pd.read_csv(doc['test_table_name'])
        models_to_run = doc['models_to_run'].replace(' ', '').split(',')
        prediction_var = doc['prediction_var']
        feats_to_use = doc['feats_to_use'].replace(' ', '').split(',')

    X_train = train_df[feats_to_use]
    X_test = test_df[feats_to_use]

    y_train = train_df[prediction_var]
    y_test = test_df[prediction_var]

    clfs, grid = define_clfs_params()

    for n in range(1, 2):
        for index, clf in enumerate([clfs[x] for x in models_to_run]):
            parameter_values = grid[models_to_run[index]]
            for p in ParameterGrid(parameter_values):
                try:
                    filename = models_to_run[index] + '-' + str(p).replace(
                        ' ', '').strip('{}').replace('\'', '').replace(
                            ',', '-').replace(
                                ':', '_') + '-' + '+'.join(feats_to_use)
                    if os.path.isfile("./model_output/" + filename + ".p"):
                        continue
                    print(clf)
                    clf.set_params(**p)
                    y_pred_probs = clf.fit(X_train,
                                           y_train).predict_proba(X_test)[:, 1]
                    try:
                        zipped_imps = sorted(zip(X_train.columns,
                                                 clf.feature_importances_),
                                             key=lambda x: x[1])
                        top_3_feats = [i[0] for i in zipped_imps[:3]]
                    except AttributeError:
                        top_3_feats = ['NA']
                    print "---------------"
                    result = pd.DataFrame()
                    result['true_val'] = y_test
                    result['score'] = y_pred_probs
                    pickle.dump([result, top_3_feats],
                                open("./model_output/" + filename + ".p",
                                     "wb"))
                except IndexError as e:
                    print('Error:', e)
                    continue
Example #27
def clf_loop(models_to_run, clfs, grid, X_train, y_train, X_test, y_test):
    results_df = pd.DataFrame(
        columns=('model_type', 'clf', 'parameters', 'time_used', 'auc-roc',
                 'accuracy', 'p_at_5', 'p_at_10', 'p_at_30', 'p_at_50',
                 'r_at_5', 'r_at_10', 'r_at_30', 'r_at_50', 'a_at_5',
                 'a_at_10', 'a_at_30', 'a_at_50', 'f1_at_5', 'f1_at_10',
                 'f1_at_30', 'f1_at_50', 'feature_importance',
                 'col_used_for_feat_importance'))

    for index, clf in enumerate([clfs[x] for x in models_to_run]):
        print(models_to_run[index])
        parameter_values = grid[models_to_run[index]]
        for p in ParameterGrid(parameter_values):
            try:
                clf.set_params(**p)
                start_time = time.time()
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)
                y_pred_probs = list(clf.predict_proba(X_test)[:, 1])
                end_time = time.time()
                elapsed_time = end_time - start_time
                y_pred_probs_sorted, y_test_sorted = zip(
                    *sorted(zip(y_pred_probs, y_test), reverse=True))
                results_df.loc[len(results_df)] = [
                    models_to_run[index], clf, p, elapsed_time,
                    roc_auc_score(y_test, y_pred_probs),
                    accuracy_score(y_test, y_pred),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                    precision_at_k(y_test_sorted, y_pred_probs_sorted, 50.0),
                    recall_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                    recall_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                    recall_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                    recall_at_k(y_test_sorted, y_pred_probs_sorted, 50.0),
                    accuracy_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                    accuracy_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                    accuracy_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                    accuracy_at_k(y_test_sorted, y_pred_probs_sorted, 50.0),
                    f1_at_k(y_test_sorted, y_pred_probs_sorted, 5.0),
                    f1_at_k(y_test_sorted, y_pred_probs_sorted, 10.0),
                    f1_at_k(y_test_sorted, y_pred_probs_sorted, 30.0),
                    f1_at_k(y_test_sorted, y_pred_probs_sorted, 50.0),
                    get_feature_importance(clf, models_to_run[index]),
                    list(X_train.columns.values)
                ]

            except IndexError:
                print("IndexError")
                continue
    return results_df
Example #28
    def optimize(self, data, search_space, val_data=None):
        """
        Parameters:
        -----------
        data: [X, Y] - arrays
            This data will be used for crossval training (considering 'train_test_split' parameter)

        search_space: dict
            Dict with parameters to optimize. E.g. 'units' : [[1000,1000,500], [2500,1000]]

        val_data: [X, Y] - arrays
            Default - None. If specified, then the optimizer metric will be evaluated on val_data. Also, if specified, the 'train_test_split' parameter will be ignored.

        """
        train_manager = self._create_train_manager(data, val_data,
                                                   search_space)
        param_grid = ParameterGrid(search_space)
        all_params = []
        for p in param_grid:
            all_params.append(p)
        for key in search_space.keys():
            if (isinstance(search_space[key], dict)):
                new_params = []
                for param in all_params:
                    if (search_space[key][param[key]] is None):
                        new_params.append(param)
                    else:
                        param_grid = ParameterGrid(
                            search_space[key][param[key]])
                        add_params = [p for p in param_grid]
                        for aparam in add_params:
                            tparam = copy.copy(param)
                            tparam.update(aparam)
                            new_params.append(tparam)
                all_params = new_params
        for param in all_params:
            param_to_pass = self._preprocess_params(param)
            train_manager.train(**param_to_pass)
Example #29
def generateParams():
    # Set the parameters by cross-validation
    parameters_grid = {'eta': [0.05], 'min_child_weight': [4], 'colsample_bytree': [0.8],
                       'subsample': [0.90], 'gamma': [0], 'max_depth': [12]}

    parameters_search = list(ParameterGrid(parameters_grid))

    parameters_to_try = []
    for ps in parameters_search:
        params = {'eval_metric': 'mlogloss', 'objective': 'multi:softprob',
                  'num_class': 9, 'nthread': 8}
        for param in ps.keys():
            params[str(param)] = ps[param]
        parameters_to_try.append(copy.copy(params))

    return parameters_to_try
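Assuming copy and ParameterGrid are imported in this module, as the snippet implies, a quick usage check of the expand-and-merge behaviour:

# With only one value per key in the grid above, a single merged dict comes back,
# carrying both the grid values and the fixed xgboost-style settings.
params_list = generateParams()
assert len(params_list) == 1
assert params_list[0]['eta'] == 0.05 and params_list[0]['num_class'] == 9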
 def pipeline(self):
     if self.param_dist:
         grid_size = len(ParameterGrid(self.param_dist))
         if grid_size < self.n_iter_search:
             classifier = GridSearchCV(self.classifier, self.param_dist)
         else:
             classifier = RandomizedSearchCV(
                 self.classifier,
                 param_distributions=self.param_dist,
                 n_iter=self.n_iter_search,
                 random_state=self.random_state)
     else:
         classifier = self.classifier
     return Pipeline(self.transformers + [('classifier', classifier)])
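The grid-size check above falls back to an exhaustive GridSearchCV whenever the full grid is smaller than the randomized-search budget; a self-contained sketch of that decision outside the class:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, ParameterGrid, RandomizedSearchCV

param_dist = {'C': [0.01, 0.1, 1.0, 10.0]}
n_iter_search = 10

grid_size = len(ParameterGrid(param_dist))
if grid_size < n_iter_search:
    # the exhaustive search is cheaper than the randomized budget
    search = GridSearchCV(LogisticRegression(), param_dist)
else:
    search = RandomizedSearchCV(LogisticRegression(), param_distributions=param_dist,
                                n_iter=n_iter_search, random_state=0)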