Example #1
def fine_tune_gradient_boosting_hyper_params(data_train_x, data_test_x, data_train_y, data_test_y):
    from scipy.stats import randint as sp_randint
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.grid_search import RandomizedSearchCV
    print "-- {} --".format("Fine-tuning Gradient Boosting Regression")
    rf = GradientBoostingRegressor(
        n_estimators=1000
    )
    param_dist = {
        "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
        "max_depth": sp_randint(1, 15),
        "min_samples_split": sp_randint(1, 15),
        "min_samples_leaf": sp_randint(1, 15),
        "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "max_features": sp_randint(1, 15)
    }
    n_iter_search = 300
    random_search = RandomizedSearchCV(
        rf,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        n_jobs=-1,
        cv=5,
        verbose=1
    )

    start = time()
    random_search.fit(data_train_x, data_train_y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
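
Example #1 is written against the pre-0.18 scikit-learn API (sklearn.grid_search, grid_scores_, the Python 2 print statement). For comparison, here is a minimal sketch of the same search on scikit-learn >= 0.18, where RandomizedSearchCV lives in sklearn.model_selection and per-candidate results are exposed through the cv_results_ dictionary; X_train and y_train are placeholders.

from scipy.stats import randint as sp_randint
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    "learning_rate": [0.01, 0.03, 0.05, 0.1, 0.2],
    "max_depth": sp_randint(1, 15),
    "subsample": [0.5, 0.8, 1.0],
}
search = RandomizedSearchCV(
    GradientBoostingRegressor(n_estimators=1000),
    param_distributions=param_dist,
    n_iter=300,
    cv=5,
    n_jobs=-1,
)
# search.fit(X_train, y_train)
# Per-candidate results: search.cv_results_["params"], search.cv_results_["mean_test_score"]
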
Example #2
def Decision_tree(Xtrain, Ytrain, Xtest):

    tuned_parameters = {
        'splitter': ['best', 'random'],
        "max_features": ["log2", "sqrt"],
        'min_samples_split': np.arange(30, 60, 5),
        'min_samples_leaf': np.arange(7, 14),
        'max_depth': np.arange(700, 1389, 10)
    }
    """Randomized optimizationSearch which used cross validation to optimized best parameters for the estimator. 
    In contrast to GridSearchCV, not all parameter values are tried out, 
    but rather a fixed number of parameter settings is sampled from the specified distributions.
    The number of parameter settings that are tried is given by n_iter.
    """
    Multreg = RandomizedSearchCV(DecisionTreeRegressor(random_state=0),
                                 param_distributions=tuned_parameters,
                                 cv=10,
                                 n_iter=int(args[1]),
                                 n_jobs=-1,
                                 random_state=0)

    #Fitting decision tree model
    Multreg.fit(Xtrain, Ytrain)
    #Predicting with unseen testing set
    YMultreg = Multreg.predict(Xtest)
    # save the model to disk
    filename = 'finalized_DC.sav'
    pickle.dump(Multreg, open(filename, 'wb'))
    return YMultreg
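
The docstring in Example #2 captures the key difference from GridSearchCV: only n_iter parameter settings are sampled instead of the full cross product. A minimal sketch of that contrast, assuming scikit-learn's model_selection module and placeholder arrays X and y.

from scipy.stats import randint
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

grid = {"max_depth": list(range(2, 12)), "min_samples_leaf": list(range(1, 11))}
# GridSearchCV evaluates every combination: 10 * 10 = 100 candidates.
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=0), grid, cv=5)

dist = {"max_depth": randint(2, 12), "min_samples_leaf": randint(1, 11)}
# RandomizedSearchCV draws only n_iter candidates from the distributions.
random_search = RandomizedSearchCV(DecisionTreeRegressor(random_state=0), dist,
                                   n_iter=20, cv=5, random_state=0)
# grid_search.fit(X, y); random_search.fit(X, y)
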
Example #3
def grid_search(symbol='MSFT'):
    """Find optimal SVC parameters"""
    from scipy.stats import randint as sp_randint
    X, y = build_data()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        X, y, test_size=0.4, random_state=0)

    param_grid = [
        {
            'C': [.1, 1, 10, 100, 1000],
            'gamma': [1e-2, 1e-3, 1e-4],
            'kernel': ['linear', 'rbf']
        },
        # {'C': [1, 10, 100, 1000], 'gamma': [.001, .0001], 'kernel': ['linear', 'rbf']}
    ]

    param_dist = {
        'C': [.001, .01, .1, 1, 10, 100],
        'gamma': [1e2, 1e-1, 1e-2, 1e-3, 1e-4],
        'kernel': ['linear', 'rbf']
    }
    n_iter_search = 20
    clf = svm.SVC()
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)

    #
    # clf = GridSearchCV(estimator=svm.SVC(C=1), param_grid=param_grid, cv=5)
    # clf.fit(X_train, y_train)
    # return clf

    random_search.fit(X_train, y_train)
    return random_search
Example #4
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=distributions.expon(scale=10), gamma=distributions.expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter, param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores), cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())), list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_, key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores if s.mean_validation_score == best_score]
    assert_true(
        search.best_params_ in tied_best_params,
        "best_params_={0} is not part of the" " tied best models: {1}".format(search.best_params_, tied_best_params),
    )
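
The comment above refers to scipy 0.12; later scipy releases let you seed a frozen distribution through its random_state attribute, which is what Example #24 further down does with randint.random_state = 42. A minimal sketch, assuming a reasonably recent scipy.

import numpy as np
from scipy import stats

# Freeze the distributions, then seed them so repeated searches draw the same candidates.
c_dist = stats.expon(scale=10)
c_dist.random_state = np.random.RandomState(0)
gamma_dist = stats.expon(scale=0.1)
gamma_dist.random_state = np.random.RandomState(0)

params = dict(C=c_dist, gamma=gamma_dist)
# params can then be passed as param_distributions to RandomizedSearchCV.
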
def evalModel(train_data, eval_data, train_labels, eval_labels, seed):
    joined_data = np.concatenate((train_data,eval_data),axis=0)
    joined_labels = np.concatenate((train_labels,eval_labels),axis=0)
    train_mask = np.zeros(train_data.shape[0]) - 1.0
    eval_mask = np.zeros(eval_data.shape[0])
    joined_mask = np.concatenate((train_mask,eval_mask),axis=0)
    ps = PredefinedSplit(test_fold=joined_mask)
    loss  = make_scorer(get_rmsle, greater_is_better=False)
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)
    
    clf = RandomForestRegressor(random_state=seed, verbose=1)
    #clf.fit(train_data, train_labels)
    #preds = clf.predict(eval_data)
    #print(get_rmsle(eval_labels, preds))
    ## achieves 0.263
    
    # specify parameters and distributions to sample from
    param_dist = {"n_estimators": sp_randint(300, 800),
                  "max_depth": sp_randint(10, 50),
                  "max_features": ['auto','sqrt','log2'],
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11)}
    
    # run randomized search
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=ps, scoring=loss,
                                       n_iter=n_iter_search,n_jobs=-1,pre_dispatch='n_jobs',verbose=2)
    
    start = time()
    random_search.fit(joined_data, joined_labels)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
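
The mask built above is the standard way to force a single, fixed train/validation split inside RandomizedSearchCV: PredefinedSplit treats rows whose test_fold value is -1 as train-only and assigns the remaining rows to test folds by value. A minimal sketch with placeholder sizes; in older scikit-learn PredefinedSplit lives in sklearn.cross_validation.

import numpy as np
from sklearn.model_selection import PredefinedSplit

train_rows, eval_rows = 8, 4
# -1 -> never used as a test sample; 0 -> belongs to test fold 0
test_fold = np.concatenate([np.full(train_rows, -1), np.zeros(eval_rows)])
ps = PredefinedSplit(test_fold=test_fold)

for train_idx, test_idx in ps.split():
    print(len(train_idx), len(test_idx))   # one split: 8 train rows, 4 eval rows
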
Example #6
def train_cv():
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv",index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ---------------------- train
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5,1)
    n_estimate_dist = sp_randint(1000,4800)
    max_depth_dist = sp_randint(1,10)
    param_dist = dict(loss=loss,
                    learning_rate=learning_rate,
                    n_estimators=n_estimate_dist,
                    max_depth=max_depth_dist)

    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist,n_iter=210,verbose=1,n_jobs=-1)

    print "--------------------- RandomizedSearchCV begins"
    searchcv.fit(Xtrain,ytrain)      
    print "--------------------- RandomizedSearchCV ends"
    print "best score: ",searchcv.best_score_                                  
    print "best parameters: ",searchcv.best_params_

    common.dump_predictor('gbdt-cv.pkl',searchcv.best_estimator_)
    print "--------------------- GBDT saved into file"
def scale_pca_rf_pipe_new_import():
  from h2o.estimators.pca import H2OPrincipalComponentAnalysisEstimator
  iris = h2o.import_file(path=pyunit_utils.locate("smalldata/iris/iris_wheader.csv"))

  # build transformation pipeline using sklearn's Pipeline and H2O transforms
  pipe = Pipeline([
                    ("standardize", H2OScaler()),
                    ("pca", H2OPrincipalComponentAnalysisEstimator().init_for_pipeline()),
                    ("rf", H2ORandomForestEstimator())
                  ])

  params = {"standardize__center":    [True, False],             # Parameters to test
            "standardize__scale":     [True, False],
            "pca__k":                 randint(2, iris[1:].shape[1]),
            "rf__ntrees":             randint(50,60),
            "rf__max_depth":          randint(4,8),
            "rf__min_rows":           randint(5,10),
            "pca__transform":         ["none", "standardize"],
            }

  custom_cv = H2OKFold(iris, n_folds=5, seed=42)
  random_search = RandomizedSearchCV(pipe, 
                                     params,
                                     n_iter=5,
                                     scoring=make_scorer(h2o_r2_score),
                                     cv=custom_cv,
                                     random_state=42,
                                     n_jobs=1)

  random_search.fit(iris[1:],iris[0])

  print(random_search.best_estimator_)
Example #8
def _compute_thresh(this_data, ch_type, cv=10):
    """ Compute the rejection threshold for one channel.

    Parameters
    ----------
    this_data: array (n_epochs, n_times)
        Data for one channel.
    ch_type: str
        'mag', 'grad' or 'eeg'.
    cv : iterator
        Iterator for cross-validation.
    """
    est = ChannelAutoReject()

    Limits = namedtuple('Limits', 'low high')
    limits = dict(eeg=Limits(low=20e-7, high=400e-6),
                  grad=Limits(low=400e-13, high=20000e-13),
                  mag=Limits(low=400e-15, high=20000e-15))

    param_dist = dict(
        thresh=uniform(limits[ch_type].low, limits[ch_type].high))
    rs = RandomizedSearchCV(
        est,  # XXX : is random really better than grid?
        param_distributions=param_dist,
        n_iter=20,
        cv=cv)
    rs.fit(this_data)
    best_thresh = rs.best_estimator_.thresh

    return best_thresh
Example #9
def optimized_classifier(X, y, classifier, distributions, scorer='f1_weighted', n_iter=30, cv=3):
    """
    Return best classifier and scores for X,y from a randomized search over parameters

    X             -- Features for each sample
    y             -- Class label for each sample
    classifier    -- An estimator class or pipeline from sklearn
    distributions -- The parameter distributions to search for that estimator
    scorer        -- Scoring function (e.g. accuracy or f1)
    n_iter        -- The number of random iterations to try
    """
    # Make a pipeline out of the classifier, to allow for feature scaling in the first step.

    # Add prefix to parameters to support use in pipeline
    class_name = classifier.__class__.__name__.lower()
    distributions = dict((class_name + "__" + key, val) for key, val in distributions.iteritems())

    # It is important to handle scaling here so we don't accidentally overfit some to the
    # test data by scaling using that information as well.
    classifier = make_pipeline(preprocessing.RobustScaler(), classifier)
    randomized_search = RandomizedSearchCV(
        classifier, param_distributions=distributions, n_iter=n_iter, scoring=scorer, cv=cv, n_jobs=1)
    randomized_search.fit(X, y)

    print randomized_search.best_estimator_
    print "Validation Score ({}): {:.2f}".format(scorer, randomized_search.best_score_)
    print ""
    return randomized_search.best_estimator_, randomized_search.best_score_
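
A hypothetical call to optimized_classifier, assuming an SVC and placeholder arrays X and y. The distribution keys are plain parameter names because the function prefixes them with the lower-cased class name ("svc__") before building the pipeline.

from scipy.stats import expon
from sklearn.svm import SVC

distributions = {
    "C": expon(scale=10),       # becomes "svc__C" inside the pipeline
    "gamma": expon(scale=0.1),  # becomes "svc__gamma"
}
# best_clf, best_score = optimized_classifier(X, y, SVC(kernel="rbf"),
#                                             distributions, scorer="f1_weighted")
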
Example #10
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
Example #11
def rf_cv(fv_train, target_train, fv_test, target_test):

    ####---- cross validation of train dataset, gridsearch the best parameters for random forest

    # Set the parameters by cross-validation
    tuned_parameters = {
        'n_estimators': [1000, 2000],
        "max_depth": [3, 6, 9, None],
        "max_features": ["auto", "log2", None],
        "class_weight": [None, 'balanced']
    }

    scores = ['recall_macro']

    n_iter_search = 20

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        mycv = StratifiedKFold(target_train, n_folds=5)

        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1),
                                 tuned_parameters,
                                 cv=mycv,
                                 n_iter=n_iter_search,
                                 scoring='%s' % score)

        clf.fit(fv_train, target_train)

        report_cv(clf, fv_test, target_test)
def best_ExtraTree(X, y):
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from sklearn.metrics import accuracy_score
    
    X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(),
                                                        random_state=42)
    
    clf = ExtraTreesClassifier(max_depth=None,
                                 bootstrap = False)

    grid = {'n_estimators': sp_randint(250, 400),
            'min_samples_leaf' : sp_randint(1, 12),
            'max_features' : sp_randint(5, 50)}

    clf_rfc = RandomizedSearchCV(clf, n_jobs=4, n_iter=10,
                                 param_distributions=grid,
                                 scoring='accuracy')

    y_hat = clf_rfc.fit(X_train,
                    y_train.ravel()).predict(X_test)

    print('Best Params: \n', clf_rfc.best_params_ )
    print("Accuracy with Extra Forest = %4.4f"  %
          accuracy_score(y_test.ravel(), y_hat))
    binarize_y_confustion_matrix(y_test.ravel(), y_hat)
    return(clf_rfc.best_params_) 
Example #13
def Cv3(X_train, y_train):
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted')
    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=4,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # labels = list(crf.classes_)
    # labels.remove('O')

    print('best params:', rs.best_params_)
    return rs.best_estimator_
Example #14
    def buildRandomForest(self, X_train, X_test, y_train, cv = 3, n_iter = 5, save = False):
        rf = RandomForestClassifier(random_state = 9)
        #Tune the model
        param_distributions = {
            'n_estimators': range(1,50,1),
            'max_depth': range(1,70,1),
            'max_features': range(6,15,1),
            'min_samples_split':[2,3,4],
            'min_samples_leaf':[1,2,3,4],
            'n_jobs':[-1]
        }

        rf_optimized = RandomizedSearchCV(
            estimator = rf,
            param_distributions = param_distributions,
            n_iter= n_iter,
            scoring = 'f1',
            cv = cv,
            random_state = 1
        )

        rf_optimized.fit(X_train, y_train)
        if save == True:
            joblib.dump(value = rf_optimized, filename = "rf_optimized.pkl", compress=1)

        print "Best parameter: %s"  %rf_optimized.best_params_
        print "Best average cross validated F1 score: %0.4f" %rf_optimized.best_score_
        print "--------------------------------------------"

        #predictions
        predicted_y_train = rf_optimized.predict(X_train)
        predicted_y_test = rf_optimized.predict(X_test)

        return predicted_y_train, predicted_y_test
Example #15
File: crf.py Project: hthuwal/nlp
def gridsearch():
    labels = ['T', 'D']
    # define fixed parameters and parameters to search
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=100,
                               all_possible_transitions=True)
    params_space = {
        #     'algorithm': ['lbfgs', 'l2sgd', 'ap', 'pa', 'arow'],
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='macro',
                            labels=labels)

    # search
    rs = RandomizedSearchCV(crf,
                            params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(train_data, train_labels)
    return rs.best_params_, rs.best_estimator_, rs.best_score_
def tuneSGD(data,labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit 
    from sklearn.linear_model import SGDClassifier
    sss = StratifiedShuffleSplit(labels, n_iter = 10, test_size = .2, random_state = 42) 
    clf = Pipeline([#('num_features',SelectPercentile(f_classif,percentile = 5)),
                    ('sgd', SGDClassifier(random_state = 11, penalty = 'elasticnet', n_jobs = 1, alpha = 10**-4))])
    param_grid = {
        #'num_features__percentile': list(range(1,101)),
        'sgd__loss':['modified_huber','squared_hinge'],#,'hinge','log'],
        'sgd__class_weight':['balanced',None],
        'sgd__l1_ratio': list(np.arange(0,1.0,.01)),
        'sgd__alpha': list(10.**np.arange(-6,-3,.1))

    }
    
    grid_search = RandomizedSearchCV(clf, 
                               param_grid,
                               n_iter = 250,
                               random_state = 42,
                               cv=sss,
                               scoring = 'roc_auc',#roc_score,
                               n_jobs= -2,
                               pre_dispatch = '2*n_jobs')
    grid_search.fit(data,labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print (p, best_parameters[p])
    
    return grid_search
    # plot_cs(grid_search)  # unreachable after return, kept for reference
Example #17
def searchBestModelParameters(algorithm, trainingData):
    if algorithm == 'multinomialnb':
        # model the data using multinomial naive bayes
        # define the parameter values that should be searched
        alpha = [0, 0.2, 0.4, 0.6, 0.8, 1]
        fitPrior = [True, False]
        # specify "parameter distributions" rather than a "parameter grid"
        paramDistribution = dict(alpha=alpha, fit_prior=fitPrior)
        model = MultinomialNB()

    bestRun = []
    for _ in range(1):
        rand = RandomizedSearchCV(model,
                                  paramDistribution,
                                  cv=10,
                                  scoring='precision',
                                  n_iter=5)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({
            'score': round(rand.best_score_, 3),
            'params': rand.best_params_
        })
    print(max(bestRun, key=lambda x: x['score']))
    return max(bestRun, key=lambda x: x['score'])
Example #18
def randomized_search_ksvm():
    clf = SVC(random_state=1)
    # specify parameters and distributions to sample from
    param_dist = {
        'clf__C': [0.01, 0.1, 1, 10, 100, 1000],
        'clf__gamma': [0.01, 0.1, 1, 10, 100, 1000],
        'clf__kernel': ['rbf', 'linear'],
    }
    steps = [('scl', StandardScaler()), ('clf', SVC())]
    pipeline = Pipeline(steps)

    # run randomized search
    n_iter_search = 50
    random_search = RandomizedSearchCV(pipeline,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(X_train, y_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings.\n" % ((time() - start), n_iter_search))
    print('best score: %f\n' % (random_search.best_score_))
    print('best estimator: %s\n' % (random_search.best_estimator_))
    print('best params: %s\n' % (random_search.best_params_))

    clf = random_search.best_estimator_
    clf.fit(X_train, y_train)
    print('Test accuracy: %.3f' % clf.score(X_test, y_test))
Example #19
def pick_best_features(df):
    """
    Grid search to find best features. TODO refactor
    :param train: train data
    :param test: test data
    :return:
    """

    #X = sample_data_random(df, .25)
    X = df[0:int(df.shape[0] * .25)]
    overfit_models = dict()
    for out in outputs:
        print out
        pipe_clf = CustomPipeline.get_transforms()

        clf = SGDClassifier(loss='log')

        tuned_parameters = {'alpha': sp_rand()}
        score = 'log_loss'
        tran_x = pipe_clf.fit_transform(X)
        grid = RandomizedSearchCV(clf, tuned_parameters, cv=5, scoring=score)
        grid.fit(tran_x, X[out])
        print grid.best_estimator_
        overfit_models[out] = grid.best_estimator_
    return overfit_models
def run_randomsearch(X, y, clf, param_dist, cv=5,
                     n_iter_search=20):
    """Run a random search for best Decision Tree parameters.
    
    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_dist -- [dict] list, distributions of parameters
                  to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.

    Returns
    -------
    top_params -- [dict] from report()
    """
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       cv=cv,
                                       n_iter=n_iter_search)
    
    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter "
           "settings.").format((time() - start),
                               n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return  top_params
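
A hypothetical call to run_randomsearch, assuming a DecisionTreeClassifier and placeholder arrays X and y; the distributions mirror the integer ranges used elsewhere in these examples.

from scipy.stats import randint as sp_randint
from sklearn.tree import DecisionTreeClassifier

param_dist = {
    "max_depth": sp_randint(1, 20),
    "min_samples_leaf": sp_randint(1, 11),
    "criterion": ["gini", "entropy"],
}
# top_params = run_randomsearch(X, y, DecisionTreeClassifier(random_state=0),
#                               param_dist, cv=5, n_iter_search=20)
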
Example #21
    def RandomFo(self):
        parameters_forest = {'n_estimators': randint(10, self.n_estimators_max),
                             "bootstrap": [True, False]}
        X_train, y_train = self.X_train, self.y_train
        forest_reg = RandomizedSearchCV(RandomForestRegressor(),
                                        param_distributions=parameters_forest,
                                        cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        forest_reg.fit(X_train, y_train)
        self.forest_reg = forest_reg.best_estimator_
    def find_best_parameters_and_get_fitted_model(self, **kwargs):
        
        """
        Finds the best set of hyperparameters for a Random Forest for the provided data. 
        The best hyperparameters are found by repeatedly drawing random samples from a distribution 
        of parameters and evaluating them by using cross validation.
        
        """
        
        # load data
        data = kwargs['data']
        X = data['features']
        y = data['targets']
        out_args = {}
        
        # we choose Random Fores Classifier as the Machine Learning algorithm for
        # this DPModel.
        rc = RandomForestClassifier()
        
        # here we define the space of parameters over which we want to perform the random search
        param_distributions = {}
        param_distributions["n_estimators"] = [50, 100, 150]

        # do random search
        random_search_outer = RandomizedSearchCV(rc, param_distributions=param_distributions,
            cv=5, n_iter=3)
        random_search_outer.fit(X, y)
            
        predictor = random_search_outer.best_estimator_

        return predictor, out_args
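
A hypothetical call, assuming the enclosing model object exposes find_best_parameters_and_get_fitted_model and that the feature matrix and targets are placeholder arrays.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 10)
y = rng.randint(0, 2, size=200)

# predictor, out_args = model.find_best_parameters_and_get_fitted_model(
#     data={"features": X, "targets": y})
# predictor is the best RandomForestClassifier, refit on all of X, y
# (RandomizedSearchCV refits the best candidate by default).
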
Example #23
    def Gradient(self):
        X_train, y_train = self.X_train, self.y_train
        parameters_boost = {'max_depth': randint(3, self.max_depth_max + 1),
                            'n_estimators': randint(80, 100 + self.n_estimators_max)}
        boost_reg = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                       param_distributions=parameters_boost,
                                       cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        boost_reg.fit(X_train, y_train)
        self.boost_reg = boost_reg.best_estimator_
Example #24
def test_sklearn_cv():

    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters
    randint = stats.randint(low=1, high=65)
    randint.random_state = 42
    gamma = stats.gamma(a=1.2, loc=0, scale=0.13)
    gamma.random_state = 42
    distr = {'no_components': randint, 'learning_rate': gamma}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model, param_distributions=distr,
                                n_iter=10, scoring=scorer, random_state=42,
                                cv=cv)
    search.fit(train)
    assert search.best_params_['no_components'] == 52
    def fit(self, drop_features=[], segments=["adopted", "sporadic", "low"]):
        """

        :param drop_features: list, which features to drop before fit
        :param segments: which user segments to consider while fitting
        :return:
        """
        if not self.transformed:
            self.transform_features()
        user_id, class_labels, features = self._prep_for_fit(
            drop_features, segments=["adopted", "sporadic", "low"])
        # this is a user based model, therefore we want to avoid including same user in Train and Test
        cv_strat = LabelKFold(user_id, n_folds=self.cv_params["folds"])
        # RandomSearch is vastly faster than GridCV with tolerable loss of optimization
        # NB: RandomForest doesn't generally require heavy parm optimization, this is somewhat for posterity here
        random_search = RandomizedSearchCV(self.clf,
                                           param_distributions=self.param_grid,
                                           n_iter=self.n_iter,
                                           cv=cv_strat,
                                           scoring=self.cv_params["scorer"],
                                           n_jobs=self.n_jobs)

        print("running random param search on {} ".format(
            self.clf.__class__.__name__))
        random_search.fit(features, class_labels)
        self._handle_result(random_search, list(features.columns))
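
The LabelKFold strategy above keeps all rows belonging to one user inside a single fold, so the same user never appears in both train and test. In scikit-learn >= 0.18 the same idea is spelled GroupKFold; a minimal sketch with placeholder data.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold, RandomizedSearchCV

rng = np.random.RandomState(0)
X = rng.rand(100, 5)
y = rng.randint(0, 2, size=100)
user_id = rng.randint(0, 20, size=100)   # several rows per user

search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            {"n_estimators": [50, 100, 200]},
                            n_iter=3, cv=GroupKFold(n_splits=5))
search.fit(X, y, groups=user_id)   # groups are routed to GroupKFold.split
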
Example #26
def optimize_hyperparameters(df):
    n_samples = df.shape[0]
    random_test = {
        'n_estimators': np.linspace(n_samples * 2, n_samples * 10,
                                    5).astype(int),
        'criterion': ['gini', 'entropy'],
        'max_features': [None, 'sqrt', 'log2'],
        'min_samples_split': np.linspace(2, n_samples / 50, 10).astype(int),
        'min_samples_leaf': np.linspace(1, n_samples / 200, 10).astype(int),
        'max_leaf_nodes': np.linspace(10, n_samples / 50, 10).astype(int)
    }
    clf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
    X = df.values[:, :-1]
    y = df.values[:, -1]
    random_search = RandomizedSearchCV(clf,
                                       random_test,
                                       n_jobs=-1,
                                       cv=10,
                                       n_iter=500,
                                       random_state=42)
    random_search.fit(X, y)
    best_params = report(random_search.grid_scores_, verbose=False)

    # save best hyperparameters to csv
    with open('./temp/best_params.csv', 'wt') as f:
        w = csv.DictWriter(f, best_params.keys())
        w.writeheader()
        w.writerow(best_params)
Example #27
class CVSearcher(SearcherBase):
    '''
    Cross validation searcher is not specific for time series
    '''

    def __init__(self, sklearn_model_class, params, scoring=None, method=None,
                 n_randomized_search=200, cv=5):
        super(CVSearcher, self).__init__(sklearn_model_class, params, method=method,
                                         n_randomized_search=n_randomized_search,
                                         cv=cv, scoring=scoring)

    def fit(self, X, Y):
        if self.method == 'Grid':
            self.__searcher = GridSearchCV(estimator=self.ml_class(), param_grid=self.search_space,
                                           scoring=self.scoring, cv=self.cv, refit=True)
        elif self.method == 'Randomized' or self.method is None:
            self.__searcher = RandomizedSearchCV(estimator=self.ml_class(), param_distributions=self.search_space,
                                                 scoring=self.scoring,
                                                 n_iter=self.n_randomized_search, cv=self.cv, refit=True)
        else:
            raise ValueError('CVSearcher only support GridSearch and RandomizedSearch')
        self.__searcher.fit(X, Y)
        print("Best: %s" % (self.__searcher.best_estimator_))
        return self

    def predict(self, X):
        return self.__searcher.predict(X)

    def get_scores(self):
        return self.__searcher.grid_scores_
def search_classifier(n_iter):
    assignments = load_structure()['ASS_ASSIGNMENT']
    features = load_featurized_training_set("files/train_featurized.pkl")

    # print(len(features.columns))
    X = features.drop(['DATE', 'n_calls'], axis=1).as_matrix().astype(float)
    y = (features.n_calls > 0).astype(int).as_matrix()
    calls = features.n_calls.as_matrix()

    X = StandardScaler().fit_transform(X)
    pipe = Pipeline([
        # ('scaler', StandardScaler()),
        # ('pca', RandomizedPCA()),
        ('clf', SGDClassifier())
    ])

    params = {
        # 'pca__n_components': [30, 50, 70, 86],
        'clf__class_weight': ['balanced'],
        'clf__loss': ['hinge'],
        'clf__penalty': ['l1'],
        'clf__alpha': st.uniform(0, 0.0003),
        'clf__fit_intercept': [False]
        # 'clf__alpha': [0.0001]
    }

    kf = KFold(len(X), n_folds=3, shuffle=True)
    grid_search = RandomizedSearchCV(pipe, params, scoring='accuracy', cv=kf, verbose=1000, n_iter=n_iter)
    grid_search.fit(X, y)

    print("\n")
    print(grid_search.best_params_)
    print(grid_search.best_score_)

    joblib.dump(grid_search.best_estimator_, "files/best_classifier.pkl")
def run(src_dir, mod, random_state=1234):

    if isinstance(src_dir, str):
        mat, labels_arr = load_mat_and_labels(src_dir, mod)
    else:
        mat, labels_arr = (src_dir, mod)

    masker = SimpleMaskerPipeline(threshold=.2)
    svc = SVC(kernel='linear')

    pipeline = Pipeline([('masker', masker),
                         ('anova', SelectKBest(k=500)),
                         ('svc', svc)])

    c_range = gamma.rvs(size=100, a=1.99, random_state=random_state)

    param_dist = {"svc__C": c_range}

    n_iter = 100
    cv = StratifiedShuffleSplit(labels_arr, n_iter=n_iter, test_size=1/6.0, random_state=random_state)

    total_runs = n_iter
    scorer = verbose_scorer(total_runs)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist, cv=cv, scoring=scorer,
                                random_state=random_state)
    search.fit(mat, labels_arr)

    return search
Example #30
def K_NN(Xtrain, Ytrain, Xtest):

    KNNoptparam = {
        "n_neighbors": np.arange(20, 200, 10),
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute']
        #,"leaf_size":np.arange(30,150,15)
        ,
        "p": [2, 3]
    }

    #Randomized search parameter optimization
    RF1 = RandomizedSearchCV(KNeighborsRegressor(),
                             param_distributions=KNNoptparam,
                             cv=10,
                             n_iter=int(args[1]),
                             n_jobs=-1,
                             random_state=0)

    RF1.fit(Xtrain, Ytrain)
    #Predicting using unseen data
    KNN_predict = RF1.predict(Xtest)
    # save the model to disk
    filename = 'finalized_KNN.sav'
    pickle.dump(RF1, open(filename, 'wb'))
    return KNN_predict
    def train_dataset_crf(self, X, Y, ratio):
        crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                   max_iterations=100,
                                   all_possible_transitions=True)

        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05)
        }

        import multiprocessing
        cpus = multiprocessing.cpu_count()
        rs = RandomizedSearchCV(crf,
                                params_space,
                                cv=3,
                                verbose=1,
                                n_jobs=cpus - 1,
                                n_iter=50)

        assert len(X) == len(Y)

        # subset of indexes to used in training
        r_indexes = randint(low=0,
                            high=len(X) - 1,
                            size=round(ratio * (len(X) - 1)))

        X_subset = [X[i] for i in r_indexes]
        Y_subset = [Y[i] for i in r_indexes]

        rs.fit(X_subset, Y_subset)

        return rs
def tune(data,labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit 
    sss = StratifiedShuffleSplit(labels, n_iter = 10, test_size = .1, random_state = 42) 
    clf = Pipeline([('num_features', 
               SelectKBest(f_classif,k=100)),
                    ('svm', svm.SVC(C=.01, kernel = 'linear', probability = True, random_state = 11))])
    param_grid = {
        'num_features__k':range(250,2500,250),
        'svm__C':10.**np.arange(-3,4),
        #'svm__loss':['hinge','squared_hinge'],
        'svm__class_weight':['balanced',None]
    }
    grid_search = RandomizedSearchCV(clf, 
                               param_grid,
                               n_iter = 100,
                               cv=sss,
                               scoring='f1',
                               n_jobs=-1,
                               pre_dispatch = '2*n_jobs',
                               random_state = 42)
    grid_search.fit(data,labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print (p, best_parameters[p])

    #plot_cs(grid_search)

    return grid_search
Example #33
def randomized_search_rfc(txt_lst, y):
	# build a classifier
	pipeline = Pipeline([
	('vect', CountVectorizer(stop_words='english', analyzer = analyzer, ngram_range=(1, 3)))
	,('tfidf', TfidfTransformer())
	, ('clf', RandomForestClassifier(n_estimators=100))
	])

	# specify parameters and distributions to sample from
	param_dist = {  
				'vect__ngram_range':[None, (1, 2), (1,3),(1,4)],
				"clf__max_depth": map(lambda x: int(x), np.logspace(1, 4, 10)), #sp.stats.randint(10,1000),
	              "clf__max_features": map(lambda x: int(x), np.logspace(0, 3, 10)),
	             "clf__min_samples_split": sp.stats.randint(1, 11),
	              "clf__min_samples_leaf": sp.stats.randint(2, 11),
	              "clf__criterion": ["gini", "entropy"]
	              }
	n_iter_search = 50
	grid_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
	                          verbose = 1, n_iter=n_iter_search, cv = 5, n_jobs=1, scoring = 'accuracy')
	start = time.time()
	grid_search.fit(txt_lst, y)
	print("RandomizedSearchCV took %.2f seconds for %d candidates"
	      " parameter settings." % ((time.time() - start), n_iter_search))
	report(grid_search.grid_scores_, n_top = 5)
	return grid_search
Example #34
 def doRandomSearch(self, clfName, clf, param_dist, X, Y):
     
     if self._custRandomSearchFlag == True:
         return self.doCustRandomSearch(clfName, clf, param_dist, X, Y)
     else:
         start = time.time()
         multiCores = -1
         if  clfName == "Logistic_Regression": 
             multiCores = 1
         if self._setXgboostTheradToOne == True and clfName =="Xgboost":
             multiCores = 1
             
         random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                n_iter=self._n_iter_search, n_jobs=multiCores, scoring='log_loss'
                                ,verbose=10)
         
         
         random_search.fit(X, Y)
         log(clfName + " randomized search cost: " , time.time() - start , " sec")
         self._bestClf[clfName] = random_search.best_estimator_
         #self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName], X, Y)
         self._bestLoglossDict[clfName] = self.validation(self._bestClf[clfName], X, Y, test_size=0.3)
         log("customize logloss: ",self._bestLoglossDict[clfName])
         self.report(random_search.grid_scores_, clfName)
         
         random_search.best_params_
         
         dumpModel(random_search.best_estimator_, clfName, self._expInfo, self._subFolderName)
         self._lastRandomSearchBestParam = random_search.best_params_
     
         return random_search.best_estimator_
Example #35
def auto_tune_paras_random_search(model,
                                  param_dist,
                                  x_input_train,
                                  y_input_train,
                                  n_iter_search=1,
                                  num_folds=5):
    """ Executing random search of the input model according to the param dictionary
    # Credit: source code adapted from SKLearn
    # Adapt from http://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py

    :param model: a sklearn model (an Estimator)
    :param param_dist: parameter dictionary
    :param x_input_train: A pandas data frame of input features for the train set
    :param y_input_train: A numpy array or pandas series of ground truth for the train set
    :param n_iter_search: number of iterations to search
    :param num_folds: number of folds to do cross validation
    :return: trained model from the cross validation
    """

    random_search_pipe = RandomizedSearchCV(model,
                                            param_distributions=param_dist,
                                            scoring=f1_scorer,
                                            n_iter=n_iter_search,
                                            verbose=10,
                                            cv=num_folds,
                                            random_state=0)
    start = time()
    random_search_pipe.fit(x_input_train, y_input_train)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report_search_scores(random_search_pipe.cv_results_)
    return (random_search_pipe)
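
A hypothetical call to auto_tune_paras_random_search, assuming a RandomForestClassifier and placeholder training data; f1_scorer and report_search_scores are names defined elsewhere in the original project.

from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

param_dist = {
    "n_estimators": sp_randint(100, 500),
    "max_depth": sp_randint(3, 15),
}
# fitted_search = auto_tune_paras_random_search(
#     RandomForestClassifier(random_state=0), param_dist,
#     x_input_train, y_input_train, n_iter_search=20, num_folds=5)
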
Example #36
def randomized_search_forest():
    clf = RandomForestClassifier(n_estimators=20)
    # specify parameters and distributions to sample from
    param_dist = {
        "max_depth": [3, None],
        "max_features": sp_randint(1, 7),
        "min_samples_split": sp_randint(1, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }

    # run randomized search
    n_iter_search = 100
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(X_train, y_train)

    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings.\n" % ((time() - start), n_iter_search))
    print('best score: %f\n' % (random_search.best_score_))
    print('best estimator: %s\n' % (random_search.best_estimator_))
    print('best parameters: %s\n' % (random_search.best_params_))

    clf = random_search.best_estimator_
    clf.fit(X_train, y_train)
    print('Test accuracy: %.3f' % clf.score(X_test, y_test))
Example #37
def train(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)


    random_state=random.randint(0, 1000000)
    rf = RandomForestClassifier(n_jobs=8)

    param_dist = {
            "n_estimators":sp_randint(100,300),
        "criterion": ["gini"],
        #"max_depth": sp_randint(3, 10000),
        #"min_samples_split": sp_randint(1, 300),
        #"min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state':sp_randint(1, 1000000),
        }

    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=50,cv=10,scoring='roc_auc')

    clf.fit(train_x, train_y)
    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions= clf.predict_proba(test_x)[:, 1]

    loss = roc_auc_score(valid_y,valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)
    data.saveData(valid_id,valid_predictions,"./valid_results/valid_"+str(model_id)+".csv")
    data.saveData(test_id,test_predictions,"./results/results_"+str(model_id)+".csv")
Example #38
def get_best_model(X_train, y_train, labels):
    '''

    :param X_train: Train features
    :param y_train: Train labels
    :param labels: list of all labels to be evaluated
    :return:
    '''
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=-1,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    return rs.best_estimator_
Example #39
def crf_tune_hyperparam(data,
                        index,
                        label,
                        word_set_suffix,
                        word_set_prefix,
                        max_iterations=500):
    train_data = [data[i] for i in index]
    X = [
        sent2features(s, word_set_suffix, word_set_prefix) for s in train_data
    ]
    y = [sent2labels(s) for s in train_data]
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=max_iterations,
                               all_possible_transitions=True)
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
    label.remove("O")
    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted',
                            labels=label)

    # search
    rs = RandomizedSearchCV(crf,
                            params_space,
                            cv=3,
                            verbose=1,
                            n_jobs=8,
                            n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X, y)
    return rs.best_params_['c1'], rs.best_params_['c2']
def run(full, target_col, random_state=1234, c_range_alpha=.05, c_range_size=100, normalize=False,
        score_fn=r2_score):

    svr = linearSVRPermuteCoefFactory()
    
    pipeline_steps = [('svr', svr)]

    pipeline = Pipeline(pipeline_steps)

    c_range = gamma.rvs(size=c_range_size, a=c_range_alpha, random_state=random_state)

    param_dist = {"svr__C": c_range}

    data, target = separate(full, target_col)
    
    if normalize:
        data = scale(data)

    n_iter = 100
    cv = ShuffleSplit(len(target), n_iter=n_iter, test_size=1/6.0, random_state=random_state)

    total_runs = n_iter
    scorer = verbose_scorer(total_runs, score_fn)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist, cv=cv, scoring=scorer,
                                random_state=random_state)

    search.fit(data, target)

    return search
    def optimize_parameter(self):

        self.console.output("[CTG] OPTIMIZATION START...", "\n")

        # Compute the cross-validation accuracy of the old (initial) model
        old_scores = cross_validation.cross_val_score(estimator=self.evaluator.pipeline, X=self.x_train, y=self.y_train,
                                                      scoring='accuracy',
                                                      cv=10, n_jobs=-1)
        old_score = np.mean(old_scores)

        # Find the best cross-validation accuracy among the candidate new models
        new_score = -1.0
        self.new_estimator = None
        for clf, param_grid in RandomParameterSettings.possible_models:
            self.console.output("[CTG] SEARCH MODEL:", str(clf) + "\n")
            estimator = Pipeline([('scl', StandardScaler()), ('pca', PCA()), ('clf', clf)])
            gs = RandomizedSearchCV(estimator=estimator, param_distributions=param_grid, scoring='accuracy', cv=10,
                                    n_jobs=-1)
            gs = gs.fit(self.x_train, self.y_train)
            if new_score < gs.best_score_:
                new_score = gs.best_score_
                self.new_estimator = gs.best_estimator_

        if new_score > old_score:
            self.label_tips.config(
                text='Found a new model with improvement: %.2f%%' % (100.0 * (new_score - old_score) / old_score))
            self.button_opt.config(text='Apply', command=self.apply_new_estimator)
        else:
            self.label_tips.config(text="No better model found.")

        self.console.output("[CTG] OPTIMIZATION COMPLETE !", "\n")
        self.console.output("[CTG] RESULT: ", "old_model_accuracy=%f, new_model_accuracy=%f, improvement=%.2f%%\n" % (
        old_score, new_score, (100.0 * (new_score - old_score) / old_score)) + "\n")
def run_randomsearch(X, y, clf, param_dist, cv=5, n_iter_search=20, njobs=4):
    """Run a random search for best Decision Tree parameters.

    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_dist -- [dict] list, distributions of parameters
                  to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.

    Returns
    -------
    top_params -- [dict] from report()
    """
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       cv=cv,
                                       n_iter=n_iter_search,
                                       n_jobs=njobs)

    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter "
           "settings.").format((time() - start), n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return top_params
def training_op(X_train, y_train):
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=10,
                               all_possible_transitions=True)
    crf.fit(X_train, y_train)
    filename = 'crf_withoutCV'
    pickle.dump(crf, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

    labels = list(crf.classes_)
    labels.remove('O')
    labels
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted',
                            labels=labels)
    # search
    rs = RandomizedSearchCV(crf,
                            params_space,
                            cv=5,
                            verbose=1,
                            n_jobs=1,
                            n_iter=5,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)
    crf = rs.best_estimator_
    filename = 'CRF_model'
    pickle.dump(crf, open(filename, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
Example #44
def searchBestModelParameters(algorithm, trainingData):
    #using randomforest
    if algorithm == 'rf':
        numTrees = range(10, 100, 10)
        numMinLeafSamples = range(2, 20, 2)
        numMinSamplesSplit = range(1, 20, 3)
        paramDistribution = dict(n_estimators = numTrees, min_samples_leaf = numMinLeafSamples, min_samples_split = numMinSamplesSplit)
        model = RandomForestClassifier()
    elif algorithm == 'knn':
        # model the data using knn
        # define the parameter values that should be searched
        k_range = range(1, 50)
        weight_options = ['uniform', 'distance']
        # specify "parameter distributions" rather than a "parameter grid"
        paramDistribution = dict(n_neighbors = k_range, weights = weight_options)
        model = KNeighborsClassifier()
    elif algorithm == 'logr':
        #model data using logistic regression
        model = LogisticRegression()
        get_ipython().magic("time print(np.sqrt(-cross_val_score(model, trainingData, trainingData['isSpam'], cv=10, scoring='mean_squared_error')).mean())")
        return
            
    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10, scoring = 'accuracy', n_iter = 10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({'score' : round(rand.best_score_,3), 'params' : rand.best_params_})
    print(max(bestRun, key=lambda x:x['score']))
    return max(bestRun, key=lambda x:x['score'])
Example #45
def randomSearch(classifier, parameters, XTr, yTr, cv, n_iter):
    print("***** Random Search *****")
    print("Cross-Validation:{0} and number of iterations:{1}".format(
        cv, n_iter))

    scores = ['accuracy']
    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()
        if (score == 'accuracy'):
            scoring_method = score
        else:
            scoring_method = score + '_micro'

        clf = RandomizedSearchCV(classifier,
                                 param_distributions=parameters,
                                 cv=cv,
                                 scoring=scoring_method,
                                 n_iter=n_iter)
        clf.fit(XTr, yTr)

        print("Best parameters and scores set found on development set:")
        # print(self.clf.best_estimator_)
        print(clf.best_params_)
        print(clf.best_score_)
        print()
        return clf.best_params_
Example #46
def _compute_thresh(this_data, ch_type, cv=10):
    """ Compute the rejection threshold for one channel.

    Parameters
    ----------
    this_data: array (n_epochs, n_times)
        Data for one channel.
    ch_type: str
        'mag', 'grad' or 'eeg'.
    cv : iterator
        Iterator for cross-validation.
    """
    est = ChannelAutoReject()

    Limits = namedtuple('Limits', 'low high')
    limits = dict(eeg=Limits(low=20e-7, high=400e-6),
                  grad=Limits(low=400e-13, high=20000e-13),
                  mag=Limits(low=400e-15, high=20000e-15))

    param_dist = dict(thresh=uniform(limits[ch_type].low,
                                     limits[ch_type].high))
    rs = RandomizedSearchCV(est,  # XXX : is random really better than grid?
                            param_distributions=param_dist,
                            n_iter=20, cv=cv)
    rs.fit(this_data)
    best_thresh = rs.best_estimator_.thresh

    return best_thresh
def rf_cv(fv_train,target_train,fv_test,target_test):

    ####---- cross validation of train dataset, gridsearch the best parameters for random forest

    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto","log2",None],
                        "class_weight": [None, 'balanced']}

    scores = ['recall_macro']

    n_iter_search   = 20

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        mycv = StratifiedKFold(target_train, n_folds = 5)

        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), tuned_parameters, cv=mycv, n_iter=n_iter_search,
                           scoring='%s' % score)

        clf.fit(fv_train, target_train)

        report_cv(clf,fv_test,target_test)
def makeRandomCV(dataset,dbtype='CATH',
                level=1,
                k_iters=10,
                minsamples=500,
                clf = ExtraTreesClassifier(n_estimators=5,class_weight='auto')):

    from scipy.stats import randint as sp_randint
    
    dataDict = dbParser(dataset,level=level,dbtype=dbtype,minsamples=minsamples)
    print dataDict

    labels = dataDict['target_names']

    param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

    n_iter_search = k_iters
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)


    random_search.fit(dataDict['vectors'], dataDict['target_names'])
    report(random_search.grid_scores_)
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0), DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])
def fitAlgo(clf, Xtrain, Ytrain, opt = False, param_dict = None, opt_metric = 'roc_auc', n_iter = 5):
    '''Return the fitted classifier
    Keyword arguments:
    clf - - base classifier
    Xtrain - - training feature matrix
    Ytrain - - training target array
    param_dict - - parameter distributions / grid space for the search; if opt == False, each entry should be a single fixed value
    opt_metric - - metric to optimize (scoring string passed to RandomizedSearchCV)
    opt - - whether to run the randomized search or fit the classifier directly
    n_iter - - number of parameter settings sampled by the randomized search
    '''
    if opt and (param_dict is not None):
        assert all(isinstance(param_dict[x], list) for x in param_dict)
        rs = RandomizedSearchCV(estimator = clf, n_iter = n_iter,
                                param_distributions = param_dict,
                                scoring = opt_metric,
                                refit = True,
                                n_jobs=-1, cv = 3, verbose = 3)

        rs.fit(Xtrain, Ytrain)
        imp = []
        if clf.__class__.__name__ == "RandomForestClassifier":
            imp = rs.best_estimator_.feature_importances_
        return rs.best_estimator_, rs.grid_scores_, imp
    else:
        if param_dict is not None:
            assert all(not isinstance(param_dict[x], list) for x in param_dict)
            for k in param_dict.keys():
                clf.set_params(**{k: param_dict[k]})
        clf.fit(Xtrain, Ytrain)
        return clf, [], []
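
A minimal usage sketch for fitAlgo (hypothetical data and parameter grid, not part of the
original example; assumes a scikit-learn RandomForestClassifier):

import numpy as np
from sklearn.ensemble import RandomForestClassifier

# hypothetical training data
Xtrain = np.random.rand(100, 5)
Ytrain = np.random.randint(0, 2, 100)
# when opt=True, every value must be a list to sample from
param_dict = {"n_estimators": [50, 100, 200],
              "max_depth": [3, 5, None]}
best_clf, scores, importances = fitAlgo(RandomForestClassifier(), Xtrain, Ytrain,
                                        opt=True, param_dict=param_dict,
                                        opt_metric='roc_auc', n_iter=5)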
Example #51
0
    def doRandomSearch(self, clfName, clf, param_dist, X, Y):
        start = time.time()
        multiCores = -1
        if clfName == "Logistic_Regression":
            multiCores = 1
        if self._setXgboostTheradToOne == True and clfName == "Xgboost":
            multiCores = 1

        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=self._n_iter_search,
                                           n_jobs=multiCores,
                                           scoring='log_loss')

        random_search.fit(X, Y)
        log(clfName + " randomized search cost: ", time.time() - start, " sec")
        self._bestClf[clfName] = random_search.best_estimator_
        self._bestLoglossDict[clfName] = self.getLogloss(
            self._bestClf[clfName], X, Y)
        self.report(random_search.grid_scores_, clfName,
                    self._bestLoglossDict[clfName])

        dumpModel(random_search.best_estimator_, clfName, self._expInfo,
                  self._subFolderName)

        return random_search.best_estimator_
Example #53
0
def fit_estimator(estimator, positive_data_matrix=None, negative_data_matrix=None, target=None, cv=10, n_jobs=-1, n_iter_search=40, random_state=1):
    # hyperparameter optimization
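    # note: scipy's uniform(loc, scale) draws from [loc, loc + scale]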
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    n_iter_search = n_iter_search
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross validation
    for scoring in ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(random_search.best_estimator_, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores), np.std(scores)))

    return random_search.best_estimator_
Example #54
0
def run_randomsearch(X, y, clf, param_dist, cv=5, n_iter_search=20):
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=cv)
    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter settings.").format((time() - start),
                                                             n_iter_search))
    top_params = report(random_search.grid_scores_, 3)
    return top_params
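
A possible call to run_randomsearch (hypothetical data, not part of the original example;
assumes a DecisionTreeClassifier and scipy's randint for the distributions):

from scipy.stats import randint as sp_randint
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
param_dist = {"max_depth": [3, None],
              "min_samples_leaf": sp_randint(1, 11)}
top_params = run_randomsearch(X, y, DecisionTreeClassifier(random_state=0),
                              param_dist, cv=5, n_iter_search=20)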
def main():
    NUM_TRAIN = bw_componentrecognition.NUM_TRAIN
    N_BINS = 23
    N_HU_MOMENTS = 7
    N_FEATURES = N_BINS + N_HU_MOMENTS

    X, y = bw_componentrecognition.Data.loadTrain(NUM_TRAIN, N_BINS)

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    
    clfs = [
        RandomForestClassifier(n_estimators=20),
    ]

    param_dists = [
        {"max_depth": [10, 5, 3, None],
         "max_features": sp_randint(1, 11),
         "min_samples_split": sp_randint(1, 11),
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False],
         "criterion": ["gini", "entropy"]},
    ]

    for clf, param_dist in zip(clfs, param_dists):
        # run randomized search
        n_iter_search = 25
        
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                           n_iter=n_iter_search)

        random_search.fit(X, y)

        report(random_search.grid_scores_)