Example #1
def fine_tune_gradient_boosting_hyper_params(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from time import time
    # report() is the grid_scores_ summary helper shown in Example #34 below
    print "-- {} --".format("Fine-tuning Gradient Boosting Regression")
    rf = GradientBoostingRegressor(
        n_estimators=1000
    )
    param_dist = {
        "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
        "max_depth": sp_randint(1, 15),
        "min_samples_split": sp_randint(1, 15),
        "min_samples_leaf": sp_randint(1, 15),
        "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "max_features": sp_randint(1, 15)
    }
    n_iter_search = 300
    random_search = RandomizedSearchCV(
        rf,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        n_jobs=-1,
        cv=5,
        verbose=1
    )

    start = time()
    random_search.fit(data_train_x, data_train_y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
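Example #1 targets the pre-0.18 scikit-learn API (sklearn.grid_search, grid_scores_) and Python 2 print statements. For comparison, a minimal sketch of the same search against the current sklearn.model_selection API; the make_regression stand-in data and the trimmed parameter ranges are assumptions, not part of the original example.

# Hedged sketch: the same gradient-boosting search on scikit-learn >= 0.18,
# where RandomizedSearchCV lives in sklearn.model_selection and results are in cv_results_.
from time import time
from scipy.stats import randint as sp_randint
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

X, y = make_regression(n_samples=200, n_features=15, random_state=0)  # stand-in data
param_dist = {
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth": sp_randint(1, 15),
    "min_samples_split": sp_randint(2, 15),  # must be >= 2 in recent scikit-learn
    "subsample": [0.5, 0.75, 1.0],
}
search = RandomizedSearchCV(GradientBoostingRegressor(n_estimators=100),
                            param_distributions=param_dist,
                            n_iter=20, cv=5, n_jobs=-1, random_state=0)
start = time()
search.fit(X, y)
print("best params: %s (%.1fs)" % (search.best_params_, time() - start))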
    def Gradient(self):
        X_train, y_train = self.X_train, self.y_train
        parameters_boost = {'max_depth': randint(3, self.max_depth_max + 1),
                            'n_estimators': randint(80, 100 + self.n_estimators_max)}
        boost_reg = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                       param_distributions=parameters_boost,
                                       cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        boost_reg.fit(X_train, y_train)
        self.boost_reg = boost_reg.best_estimator_
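The Gradient method above (like RandomFo further down) is excerpted from a wrapper class whose attributes are set elsewhere. A hedged sketch of what such a class could look like, assuming scikit-learn >= 1.0; only the attribute names are taken from the snippet, the class name and defaults are invented for illustration.

# Hypothetical wrapper class; attribute names mirror the snippet, defaults are assumptions.
from scipy.stats import randint
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

class RegressorTuner:
    def __init__(self, X_train, y_train, loss="squared_error",
                 max_depth_max=8, n_estimators_max=200, cv=3, n_iter=10):
        self.X_train, self.y_train = X_train, y_train
        self.loss = loss
        self.max_depth_max = max_depth_max
        self.n_estimators_max = n_estimators_max
        self.cv, self.n_iter = cv, n_iter

    def Gradient(self):
        params = {'max_depth': randint(3, self.max_depth_max + 1),
                  'n_estimators': randint(80, 100 + self.n_estimators_max)}
        search = RandomizedSearchCV(GradientBoostingRegressor(loss=self.loss),
                                    param_distributions=params,
                                    cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        search.fit(self.X_train, self.y_train)
        self.boost_reg = search.best_estimator_

# usage sketch (X_train, y_train are whatever training arrays you have):
# tuner = RegressorTuner(X_train, y_train)
# tuner.Gradient()
# best_model = tuner.boost_reg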
Example #3
def test_sklearn_cv():

    model = LightFM(loss='warp', random_state=42)

    # Set distributions for hyperparameters
    randint = stats.randint(low=1, high=65)
    randint.random_state = 42
    gamma = stats.gamma(a=1.2, loc=0, scale=0.13)
    gamma.random_state = 42
    distr = {'no_components': randint, 'learning_rate': gamma}

    # Custom score function
    def scorer(est, x, y=None):
        return precision_at_k(est, x).mean()

    # Custom CV which sets train_index = test_index
    class CV(KFold):
        def __iter__(self):
            ind = np.arange(self.n)
            for test_index in self._iter_test_masks():
                train_index = np.logical_not(test_index)
                train_index = ind[train_index]
                yield train_index, train_index

    cv = CV(n=train.shape[0], random_state=42)
    search = RandomizedSearchCV(estimator=model, param_distributions=distr,
                                n_iter=10, scoring=scorer, random_state=42,
                                cv=cv)
    search.fit(train)
    assert search.best_params_['no_components'] == 52
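The custom CV above subclasses the pre-0.18 KFold (constructed with n and iterated directly). A minimal sketch of the same "train index == test index" trick against the post-0.18 splitter API, where the splitter exposes split(X) instead of __iter__; the class name is an assumption.

# Hedged sketch: modern-API equivalent of the train == test splitter above.
from sklearn.model_selection import KFold

class TrainEqualsTestCV(KFold):
    def split(self, X, y=None, groups=None):
        for train_index, _ in super().split(X, y, groups):
            # hand the training indices back as the test indices as well
            yield train_index, train_index

# usage sketch: cv = TrainEqualsTestCV(n_splits=5)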
def run(src_dir, mod, random_state=1234):

    if isinstance(src_dir, str):
        mat, labels_arr = load_mat_and_labels(src_dir, mod)
    else:
        mat, labels_arr = (src_dir, mod)

    masker = SimpleMaskerPipeline(threshold=.2)
    svc = SVC(kernel='linear')

    pipeline = Pipeline([('masker', masker),
                         ('anova', SelectKBest(k=500)),
                         ('svc', svc)])

    c_range = gamma.rvs(size=100, a=1.99, random_state=random_state)

    param_dist = {"svc__C": c_range}

    n_iter = 100
    cv = StratifiedShuffleSplit(labels_arr, n_iter=n_iter, test_size=1/6.0, random_state=random_state)

    total_runs = n_iter
    scorer = verbose_scorer(total_runs)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist, cv=cv, scoring=scorer,
                                random_state=random_state)
    search.fit(mat, labels_arr)

    return search
def main():
    data = pd.read_csv(args.dataset)
    X = data.drop(['Id', 'Class'], axis=1)
    Y = data.loc[:, 'Class']
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
    estimator = [('reduce_dim', SelectFromModel(RandomForestClassifier())), ('classifier', XGBClassifier())]
    # transform the threshold to the quantile of median
    tmp = map(str, np.arange(args.threshold[0],args.threshold[1],args.threshold[2]))
    threshold = map(lambda x: x+'*median', tmp)
    clf = Pipeline(estimator)
    params = {}
    params['reduce_dim__estimator__n_estimators'] = list(np.arange(args.components[0], args.components[1], args.components[2]))
    params['reduce_dim__threshold'] = threshold
    params['classifier__n_estimators'] = list(np.arange(args.num_tree[0], args.num_tree[1], args.num_tree[2]))
    params['classifier__max_depth'] = list(np.arange(args.depths[0], args.depths[1], args.depths[2]))
    params['classifier__learning_rate'] = list(np.arange(args.lr[0], args.lr[1], args.lr[2]))
    params['classifier__subsample'] = list(np.arange(args.subsample[0], args.subsample[1], args.subsample[2]))
    params['classifier__colsample_bytree'] = list(np.arange(args.colsample[0], args.colsample[1], args.colsample[2]))
    # Cross_validation for grid search
    try:
        grid_search = RandomizedSearchCV(clf, param_distributions=params, n_iter=args.iter, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    except:
        grid_search = GridSearchCV(clf, param_grid=params, cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
    best_parameters, score, _ = max(grid_search.grid_scores_, key=lambda x: x[1])
    result = accuracy_score(y_test, grid_search.predict(X_test))
    print("Predict Accuracy: " + str(result))
    print("XGboost using raw pixel features:\n%s\n" % (metrics.classification_report(y_test, grid_search.predict(X_test))))
    print best_parameters
def fitAlgo(clf, Xtrain, Ytrain, opt = False, param_dict = None, opt_metric = 'roc_auc', n_iter = 5):
    '''Return the fitted classifier
    Keyword arguments:
    clf - - base classifier
    Xtrain - - training feature matrix
    Ytrain - - training target array
    param_dict - - the parameter distribution of param, grids space, if opt == False, every element should have length 1
    opt_metric - - optimization metric
    opt - - whether to do optimization or not
    '''
    if opt and (param_dict is not None):
        # every entry in param_dict should be a list/grid of candidate values
        assert all(isinstance(param_dict[x], list) for x in param_dict)
        rs = RandomizedSearchCV(estimator = clf, n_iter = n_iter,
                                param_distributions = param_dict,
                                scoring = opt_metric,
                                refit = True,
                                n_jobs=-1, cv = 3, verbose = 3)

        rs.fit(Xtrain, Ytrain)
        imp = []
        if clf.__class__.__name__ == "RandomForestClassifier":
            imp = rs.best_estimator_.feature_importances_
        return rs.best_estimator_, rs.grid_scores_, imp
    else:
        if param_dict is not None:
            # with opt == False every entry must be a single fixed value, not a list
            assert all(not isinstance(param_dict[x], list) for x in param_dict)
            for k in param_dict.keys():
                clf.set_params(**{k: param_dict[k]})
        clf.fit(Xtrain, Ytrain)
        return clf, [], []
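A hedged usage sketch for fitAlgo, assuming the legacy scikit-learn (< 0.20, which still exposes grid_scores_) that this snippet targets; the RandomForestClassifier, the toy data and the grid values are illustrative assumptions.

# Hypothetical call into fitAlgo defined above.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
param_dict = {"max_depth": [3, 5, None], "min_samples_leaf": [1, 2, 4]}
best_clf, cv_scores, importances = fitAlgo(RandomForestClassifier(), X, y,
                                           opt=True, param_dict=param_dict,
                                           opt_metric='roc_auc', n_iter=5)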
def test_randomized_search_grid_scores():
    # Make a dataset with a lot of noise to get various kind of prediction
    # errors across CV folds and parameter settings
    X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0)

    # XXX: as of today (scipy 0.12) it's not possible to set the random seed
    # of scipy.stats distributions: the assertions in this test should thus
    # not depend on the randomization
    params = dict(C=distributions.expon(scale=10), gamma=distributions.expon(scale=0.1))
    n_cv_iter = 3
    n_search_iter = 30
    search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_cv_iter, param_distributions=params, iid=False)
    search.fit(X, y)
    assert_equal(len(search.grid_scores_), n_search_iter)

    # Check consistency of the structure of each cv_score item
    for cv_score in search.grid_scores_:
        assert_equal(len(cv_score.cv_validation_scores), n_cv_iter)
        # Because we set iid to False, the mean_validation score is the
        # mean of the fold mean scores instead of the aggregate sample-wise
        # mean score
        assert_almost_equal(np.mean(cv_score.cv_validation_scores), cv_score.mean_validation_score)
        assert_equal(list(sorted(cv_score.parameters.keys())), list(sorted(params.keys())))

    # Check the consistency with the best_score_ and best_params_ attributes
    sorted_grid_scores = list(sorted(search.grid_scores_, key=lambda x: x.mean_validation_score))
    best_score = sorted_grid_scores[-1].mean_validation_score
    assert_equal(search.best_score_, best_score)

    tied_best_params = [s.parameters for s in sorted_grid_scores if s.mean_validation_score == best_score]
    assert_true(
        search.best_params_ in tied_best_params,
        "best_params_={0} is not part of the" " tied best models: {1}".format(search.best_params_, tied_best_params),
    )
def makeRandomCV(dataset,dbtype='CATH',
                level=1,
                k_iters=10,
                minsamples=500,
                clf = ExtraTreesClassifier(n_estimators=5,class_weight='auto')):

    from scipy.stats import randint as sp_randint
    
    dataDict = dbParser(dataset,level=level,dbtype=dbtype,minsamples=minsamples)
    print dataDict

    labels = dataDict['target_names']

    param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

    n_iter_search = k_iters
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)


    random_search.fit(dataDict['vectors'], dataDict['target_names'])
    report(random_search.grid_scores_)
    def find_best_parameters_and_get_fitted_model(self, **kwargs):
        
        """
        Finds the best set of hyperparameters for a Random Forest for the provided data. 
        The best hyperparameters are found by repeatedly drawing random samples from a distribution 
        of parameters and evaluating them by using cross validation.
        
        """
        
        # load data
        data = kwargs['data']
        X = data['features']
        y = data['targets']
        out_args = {}
        
        # we choose a Random Forest Classifier as the machine learning algorithm for
        # this DPModel.
        rc = RandomForestClassifier()
        
        # here we define the space of parameters over which we want to perform the random search
        param_distributions = {}
        param_distributions["n_estimators"] = [50, 100, 150]

        # do random search
        random_search_outer = RandomizedSearchCV(rc, param_distributions=param_distributions,
            cv=5, n_iter=3)
        random_search_outer.fit(X, y)
            
        predictor = random_search_outer.best_estimator_

        return predictor, out_args
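A hedged usage sketch for the method above; the data layout mirrors the kwargs it reads, while dp_model and the toy data are hypothetical stand-ins.

# Hypothetical call; dp_model is assumed to be an instance of the surrounding class (not shown).
from sklearn.datasets import make_classification

features, targets = make_classification(n_samples=200, n_features=8, random_state=0)
data = {'features': features, 'targets': targets}
predictor, out_args = dp_model.find_best_parameters_and_get_fitted_model(data=data)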
Example #10
def _compute_thresh(this_data, ch_type, cv=10):
    """ Compute the rejection threshold for one channel.

    Parameters
    ----------
    this_data: array (n_epochs, n_times)
        Data for one channel.
    ch_type: str
        'mag', 'grad' or 'eeg'.
    cv : iterator
        Iterator for cross-validation.
    """
    est = ChannelAutoReject()

    Limits = namedtuple('Limits', 'low high')
    limits = dict(eeg=Limits(low=20e-7, high=400e-6),
                  grad=Limits(low=400e-13, high=20000e-13),
                  mag=Limits(low=400e-15, high=20000e-15))

    param_dist = dict(thresh=uniform(limits[ch_type].low,
                                     limits[ch_type].high))
    rs = RandomizedSearchCV(est,  # XXX : is random really better than grid?
                            param_distributions=param_dist,
                            n_iter=20, cv=cv)
    rs.fit(this_data)
    best_thresh = rs.best_estimator_.thresh

    return best_thresh
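One subtlety worth flagging: scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so uniform(low, high) above actually covers roughly [low, low + high] rather than [low, high]. A short sketch of an explicit [low, high] interval for comparison; the 'eeg' numbers are copied from the snippet.

from scipy.stats import uniform

low, high = 20e-7, 400e-6                               # the 'eeg' limits above
interval_as_written = uniform(low, high)                # samples from [low, low + high]
explicit_interval = uniform(loc=low, scale=high - low)  # samples from [low, high]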
def train_cv():
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv",index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ---------------------- train
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5,1)
    n_estimate_dist = sp_randint(1000,4800)
    max_depth_dist = sp_randint(1,10)
    param_dist = dict(loss=loss,
                    learning_rate=learning_rate,
                    n_estimators=n_estimate_dist,
                    max_depth=max_depth_dist)

    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist,n_iter=210,verbose=1,n_jobs=-1)

    print "--------------------- RandomizedSearchCV begins"
    searchcv.fit(Xtrain,ytrain)      
    print "--------------------- RandomizedSearchCV ends"
    print "best score: ",searchcv.best_score_                                  
    print "best parameters: ",searchcv.best_params_

    common.dump_predictor('gbdt-cv.pkl',searchcv.best_estimator_)
    print "--------------------- GBDT saved into file"
def searchBestModelParameters(algorithm, trainingData):
    #using randomforest
    if algorithm == 'rf':
        numTrees = range(10, 100, 10)
        numMinLeafSamples = range(2, 20, 2)
        numMinSamplesSplit = range(1, 20, 3)
        paramDistribution = dict(n_estimators = numTrees, min_samples_leaf = numMinLeafSamples, min_samples_split = numMinSamplesSplit)
        model = RandomForestClassifier()
    elif algorithm == 'knn':
        # model the data using knn
        # define the parameter values that should be searched
        k_range = range(1, 50)
        weight_options = ['uniform', 'distance']
        # specify "parameter distributions" rather than a "parameter grid"
        paramDistribution = dict(n_neighbors = k_range, weights = weight_options)
        model = KNeighborsClassifier()
    elif algorithm == 'logr':
        #model data using logistic regression
        model = LogisticRegression()
        get_ipython().magic("time print(np.sqrt(-cross_val_score(model, trainingData, trainingData['isSpam'], cv=10, scoring='mean_squared_error')).mean())")
        return
            
    bestRun = []
    for _ in range(20):
        rand = RandomizedSearchCV(model, paramDistribution, cv=10, scoring = 'accuracy', n_iter = 10)
        rand.fit(trainingData, trainingData['isSpam'])
        # examine the best model
        bestRun.append({'score' : round(rand.best_score_,3), 'params' : rand.best_params_})
    print(max(bestRun, key=lambda x:x['score']))
    return max(bestRun, key=lambda x:x['score'])
def run(full, target_col, random_state=1234, c_range_alpha=.05, c_range_size=100, normalize=False,
        score_fn=r2_score):

    svr = linearSVRPermuteCoefFactory()
    
    pipeline_steps = [('svr', svr)]

    pipeline = Pipeline(pipeline_steps)

    c_range = gamma.rvs(size=c_range_size, a=c_range_alpha, random_state=random_state)

    param_dist = {"svr__C": c_range}

    data, target = separate(full, target_col)
    
    if normalize:
        data = scale(data)

    n_iter = 100
    cv = ShuffleSplit(len(target), n_iter=n_iter, test_size=1/6.0, random_state=random_state)

    total_runs = n_iter
    scorer = verbose_scorer(total_runs, score_fn)

    search = RandomizedSearchCV(pipeline, param_distributions=param_dist, cv=cv, scoring=scorer,
                                random_state=random_state)

    search.fit(data, target)

    return search
def run_randomsearch(X, y, clf, param_dist, cv=5,
                     n_iter_search=20):
    """Run a random search for best Decision Tree parameters.
    
    Args
    ----
    X -- features
    y -- targets (classes)
    clf -- scikit-learn Decision Tree
    param_dist -- [dict] list, distributions of parameters
                  to sample
    cv -- fold of cross-validation, default 5
    n_iter_search -- number of random parameter sets to try,
                     default 20.

    Returns
    -------
    top_params -- [dict] from report()
    """
    random_search = RandomizedSearchCV(clf,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv)
    
    start = time()
    random_search.fit(X, y)
    print(("\nRandomizedSearchCV took {:.2f} seconds "
           "for {:d} candidates parameter "
           "settings.").format((time() - start),
                               n_iter_search))

    top_params = report(random_search.grid_scores_, 3)
    return  top_params
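A hedged usage sketch for run_randomsearch, assuming the legacy scikit-learn (< 0.20), from time import time, and the report() helper used throughout these examples; the DecisionTreeClassifier, the iris data and the parameter ranges are illustrative assumptions.

# Hypothetical call into run_randomsearch defined above.
from scipy.stats import randint as sp_randint
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
param_dist = {"max_depth": [3, None],
              "min_samples_leaf": sp_randint(1, 10),
              "criterion": ["gini", "entropy"]}
top_params = run_randomsearch(iris.data, iris.target, DecisionTreeClassifier(),
                              param_dist, cv=5, n_iter_search=20)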
def tuneSGD(data,labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit 
    from sklearn.linear_model import SGDClassifier
    sss = StratifiedShuffleSplit(labels, n_iter = 10, test_size = .2, random_state = 42) 
    clf = Pipeline([#('num_features',SelectPercentile(f_classif,percentile = 5)),
                    ('sgd', SGDClassifier(random_state = 11, penalty = 'elasticnet', n_jobs = 1, alpha = 10**-4))])
    param_grid = {
        #'num_features__percentile': list(range(1,101)),
        'sgd__loss':['modified_huber','squared_hinge'],#,'hinge','log'],
        'sgd__class_weight':['balanced',None],
        'sgd__l1_ratio': list(np.arange(0,1.0,.01)),
        'sgd__alpha': list(10.**np.arange(-6,-3,.1))

    }
    
    grid_search = RandomizedSearchCV(clf, 
                               param_grid,
                               n_iter = 250,
                               random_state = 42,
                               cv=sss,
                               scoring = 'roc_auc',#roc_score,
                               n_jobs= -2,
                               pre_dispatch = '2*n_jobs')
    grid_search.fit(data,labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print (p, best_parameters[p])
    
    return grid_search
    # plot_cs(grid_search)  # unreachable after the return above
    def RandomFo(self):
        parameters_forest = {'n_estimators': randint(10, self.n_estimators_max),
                             "bootstrap": [True, False]}
        X_train, y_train = self.X_train, self.y_train
        forest_reg = RandomizedSearchCV(RandomForestRegressor(),
                                        param_distributions=parameters_forest,
                                        cv=self.cv, n_iter=self.n_iter, n_jobs=-1)
        forest_reg.fit(X_train, y_train)
        self.forest_reg = forest_reg.best_estimator_
Example #17
def svc_appr():
    """
    Best params: {'C': 0.022139881953014046}

    Submission:
    E_val:
    E_in:
    E_out:
    """
    from sklearn.svm import LinearSVC
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import expon

    X, y = dataset.load_train()

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    svc = LinearSVC(dual=False, class_weight='auto')
    rs = RandomizedSearchCV(svc, n_iter=50, scoring='roc_auc', n_jobs=-1,
                            cv=StratifiedKFold(y, 5), verbose=2,
                            param_distributions={'C': expon()})
    rs.fit(X_scaled, y)

    logger.debug('Got best SVC.')
    logger.debug('Best params: %s', rs.best_params_)
    logger.debug('Grid scores:')
    for i, grid_score in enumerate(rs.grid_scores_):
        print('\t%s' % grid_score)
    logger.debug('Best score (E_val): %s', rs.best_score_)
    logger.debug('E_in: %f', Util.auc_score(rs, X_scaled, y))
Example #18
def pick_best_features(df):
    """
    Grid search to find best features. TODO refactor
    :param train: train data
    :param test: test data
    :return:
    """

    #X = sample_data_random(df, .25)
    X = df[0:int(df.shape[0] * .25)]
    overfit_models = dict()
    for out in outputs:
        print out
        pipe_clf = CustomPipeline.get_transforms()

        clf = SGDClassifier(loss='log')

        tuned_parameters = {'alpha': sp_rand()}
        score = 'log_loss'
        tran_x = pipe_clf.fit_transform(X)
        grid = RandomizedSearchCV(clf, tuned_parameters, cv=5, scoring=score)
        grid.fit(tran_x, X[out])
        print grid.best_estimator_
        overfit_models[out] = grid.best_estimator_
    return overfit_models
Example #19
def optimized_classifier(X, y, classifier, distributions, scorer='f1_weighted', n_iter=30, cv=3):
    """
    Return best classifier and scores for X,y from a randomized search over parameters

    X             -- Features for each sample
    y             -- Class label for each sample
    classifier    -- An estimator class or pipeline from sklearn
    distributions -- The parameter distributions to search for that estimator
    scorer        -- Scoring function (e.g. accuracy or f1)
    n_iter        -- The number of random iterations to try
    """
    # Make a pipeline out of the classifier, to allow for feature scaling in the first step.

    # Add prefix to parameters to support use in pipeline
    class_name = classifier.__class__.__name__.lower()
    distributions = dict((class_name + "__" + key, val) for key, val in distributions.iteritems())

    # It is important to handle scaling here so we don't accidentally overfit some to the
    # test data by scaling using that information as well.
    classifier = make_pipeline(preprocessing.RobustScaler(), classifier)
    randomized_search = RandomizedSearchCV(
        classifier, param_distributions=distributions, n_iter=n_iter, scoring=scorer, cv=cv, n_jobs=1)
    randomized_search.fit(X, y)

    print randomized_search.best_estimator_
    print "Validation Score ({}): {:.2f}".format(scorer, randomized_search.best_score_)
    print ""
    return randomized_search.best_estimator_, randomized_search.best_score_
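A hedged usage sketch for optimized_classifier, assuming the Python 2 / legacy scikit-learn environment it is written for (iteritems, print statements); the SVC, its distributions and the toy data are assumptions.

# Hypothetical call into optimized_classifier defined above.
from scipy.stats import expon
from sklearn.datasets import make_classification
from sklearn.svm import SVC

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
distributions = {"C": expon(scale=10), "gamma": expon(scale=0.1)}
best_clf, best_score = optimized_classifier(X, y, SVC(), distributions,
                                            scorer='f1_weighted', n_iter=10, cv=3)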
def main():
    NUM_TRAIN = bw_componentrecognition.NUM_TRAIN
    N_BINS = 23
    N_HU_MOMENTS = 7
    N_FEATURES = N_BINS + N_HU_MOMENTS

    X, y = bw_componentrecognition.Data.loadTrain(NUM_TRAIN, N_BINS)

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    
    clfs = [
        RandomForestClassifier(n_estimators=20),
        ]
    
    param_dists = [
        {"max_depth": [10, 5, 3, None],
          "max_features": sp_randint(1, 11),
          "min_samples_split": sp_randint(1, 11),
          "min_samples_leaf": sp_randint(1, 11),
          "bootstrap": [True, False],
          "criterion": ["gini", "entropy"]},]
        
    
    for clf, param_dist in zip(clfs, param_dists):
        # run randomized search
        n_iter_search = 25
        
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                           n_iter=n_iter_search)

        random_search.fit(X, y)

        report(random_search.grid_scores_)
    def runGridSearch(self, model):
        logging.debug("run grid search on model: {}".format(model.__class__.__name__))
        logging.debug("cross validation strategy: {}".format(model.holdout_split))
        logging.debug("used features: {}".format(model.usedFeatures))
        logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))
        
        features,labels,cv = model.getFeaturesLabel()
        # do grid search
        if self.do_random_gridsearch:
            estimator = RandomizedSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs,
                       scoring=mean_absolute_percentage_error_scoring, verbose = 500, n_iter=self.n_iter_randomsearch)
        else:
            estimator = GridSearchCV(model.clf, model.getTunedParamterOptions(), cv=cv, n_jobs=self.n_jobs, 
                                     fit_params=model.get_fit_params(),
                       scoring=mean_absolute_percentage_error_scoring, verbose = 500)
        estimator.fit(features, labels)
        model.clf = estimator.best_estimator_
        model.save_final_model = True
        model.save_model()
        
#         model.dispFeatureImportance()
        logging.debug('estimator parameters: {}'.format(estimator.get_params))
        logging.debug('Best parameters: {}'.format(estimator.best_params_))
        logging.debug('Best Scores: {}'.format(-estimator.best_score_))
        logging.debug('Score grid: {}'.format(estimator.grid_scores_ ))
        for i in estimator.grid_scores_ :
            logging.debug('parameters: {}'.format(i.parameters ))
            logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
            logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores) ))

        
        
        return
def tune(data,labels, clf=None):
    from sklearn.cross_validation import StratifiedShuffleSplit 
    sss = StratifiedShuffleSplit(labels, n_iter = 10, test_size = .1, random_state = 42) 
    clf = Pipeline([('num_features', 
               SelectKBest(f_classif,k=100)),
                    ('svm', svm.SVC(C=.01, kernel = 'linear', probability = True, random_state = 11))])
    param_grid = {
        'num_features__k':range(250,2500,250),
        'svm__C':10.**np.arange(-3,4),
        #'svm__loss':['hinge','squared_hinge'],
        'svm__class_weight':['balanced',None]
    }
    grid_search = RandomizedSearchCV(clf, 
                               param_grid,
                               n_iter = 100,
                               cv=sss,
                               scoring='f1',
                               n_jobs=-1,
                               pre_dispatch = '2*n_jobs',
                               random_state = 42)
    grid_search.fit(data,labels)
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for p in param_grid.keys():
        print (p, best_parameters[p])

    #plot_cs(grid_search)

    return grid_search
Example #23
    def doRandomSearch(self, clfName, clf, param_dist, X, Y):

        if self._custRandomSearchFlag == True:
            return self.doCustRandomSearch(clfName, clf, param_dist, X, Y)
        else:
            start = time.time()
            multiCores = -1
            if clfName == "Logistic_Regression":
                multiCores = 1
            if self._setXgboostTheradToOne == True and clfName == "Xgboost":
                multiCores = 1

            random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                               n_iter=self._n_iter_search, n_jobs=multiCores,
                                               scoring='log_loss', verbose=10)

            random_search.fit(X, Y)
            log(clfName + " randomized search cost: ", time.time() - start, " sec")
            self._bestClf[clfName] = random_search.best_estimator_
            #self._bestLoglossDict[clfName] = self.getLogloss(self._bestClf[clfName], X, Y)
            self._bestLoglossDict[clfName] = self.validation(self._bestClf[clfName], X, Y, test_size=0.3)
            log("customize logloss: ", self._bestLoglossDict[clfName])
            self.report(random_search.grid_scores_, clfName)

            dumpModel(random_search.best_estimator_, clfName, self._expInfo, self._subFolderName)
            self._lastRandomSearchBestParam = random_search.best_params_

            return random_search.best_estimator_
    def buildRandomForest(self, X_train, X_test, y_train, cv = 3, n_iter = 5, save = False):
        rf = RandomForestClassifier(random_state = 9)
        #Tune the model
        param_distributions = {
            'n_estimators': range(1,50,1),
            'max_depth': range(1,70,1),
            'max_features': range(6,15,1),
            'min_samples_split':[2,3,4],
            'min_samples_leaf':[1,2,3,4],
            'n_jobs':[-1]
        }

        rf_optimized = RandomizedSearchCV(
            estimator = rf,
            param_distributions = param_distributions,
            n_iter= n_iter,
            scoring = 'f1',
            cv = cv,
            random_state = 1
        )

        rf_optimized.fit(X_train, y_train)
        if save == True:
            joblib.dump(value = rf_optimized, filename = "rf_optimized.pkl", compress=1)

        print "Best parameter: %s"  %rf_optimized.best_params_
        print "Best average cross validated F1 score: %0.4f" %rf_optimized.best_score_
        print "--------------------------------------------"

        #predictions
        predicted_y_train = rf_optimized.predict(X_train)
        predicted_y_test = rf_optimized.predict(X_test)

        return predicted_y_train, predicted_y_test
Example #25
def fit_estimator(estimator, positive_data_matrix=None, negative_data_matrix=None, target=None, cv=10, n_jobs=-1, n_iter_search=40, random_state=1):
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    n_iter_search = n_iter_search
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross validation
    for scoring in ['accuracy', 'precision', 'recall', 'f1', 'average_precision', 'roc_auc']:
        scores = cross_validation.cross_val_score(random_search.best_estimator_, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores), np.std(scores)))

    return random_search.best_estimator_
def search_classifier(n_iter):
    assignments = load_structure()['ASS_ASSIGNMENT']
    features = load_featurized_training_set("files/train_featurized.pkl")

    # print(len(features.columns))
    X = features.drop(['DATE', 'n_calls'], axis=1).as_matrix().astype(float)
    y = (features.n_calls > 0).astype(int).as_matrix()
    calls = features.n_calls.as_matrix()

    X = StandardScaler().fit_transform(X)
    pipe = Pipeline([
        # ('scaler', StandardScaler()),
        # ('pca', RandomizedPCA()),
        ('clf', SGDClassifier())
    ])

    params = {
        # 'pca__n_components': [30, 50, 70, 86],
        'clf__class_weight': ['balanced'],
        'clf__loss': ['hinge'],
        'clf__penalty': ['l1'],
        'clf__alpha': st.uniform(0, 0.0003),
        'clf__fit_intercept': [False]
        # 'clf__alpha': [0.0001]
    }

    kf = KFold(len(X), n_folds=3, shuffle=True)
    grid_search = RandomizedSearchCV(pipe, params, scoring='accuracy', cv=kf, verbose=1000, n_iter=n_iter)
    grid_search.fit(X, y)

    print("\n")
    print(grid_search.best_params_)
    print(grid_search.best_score_)

    joblib.dump(grid_search.best_estimator_, "files/best_classifier.pkl")
def test_grid_search_with_multioutput_data():
    # Test search with multi-output estimator

    X, y = make_multilabel_classification(random_state=0)

    est_parameters = {"max_depth": [1, 2, 3, 4]}
    cv = KFold(y.shape[0], random_state=0)

    estimators = [DecisionTreeRegressor(random_state=0), DecisionTreeClassifier(random_state=0)]

    # Test with grid search cv
    for est in estimators:
        grid_search = GridSearchCV(est, est_parameters, cv=cv)
        grid_search.fit(X, y)
        for parameters, _, cv_validation_scores in grid_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])

    # Test with a randomized search
    for est in estimators:
        random_search = RandomizedSearchCV(est, est_parameters, cv=cv, n_iter=3)
        random_search.fit(X, y)
        for parameters, _, cv_validation_scores in random_search.grid_scores_:
            est.set_params(**parameters)

            for i, (train, test) in enumerate(cv):
                est.fit(X[train], y[train])
                correct_score = est.score(X[test], y[test])
                assert_almost_equal(correct_score, cv_validation_scores[i])
def rf_cv(fv_train,target_train,fv_test,target_test):

    ####---- cross validation of train dataset, gridsearch the best parameters for random forest

    # Set the parameters by cross-validation
    tuned_parameters = {'n_estimators': [1000, 2000],
                        "max_depth": [3, 6, 9, None],
                        "max_features": ["auto","log2",None],
                        "class_weight": [None, 'balanced']}

    scores = ['recall_macro']

    n_iter_search   = 20

    for score in scores:
        print("# Tuning hyper-parameters for %s" % score)
        print()

        mycv = StratifiedKFold(target_train, n_folds = 5)

        clf = RandomizedSearchCV(RandomForestClassifier(n_jobs=-1), tuned_parameters, cv=mycv, n_iter=n_iter_search,
                           scoring='%s' % score)

        clf.fit(fv_train, target_train)

        report_cv(clf,fv_test,target_test)
Example #29
def train(model_id,train_x,train_y,valid_x,valid_y,test_x):
    train_x,train_y=shuffle(train_x,train_y)


    random_state=random.randint(0, 1000000)
    rf = RandomForestClassifier(n_jobs=8)

    param_dist = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        #"max_depth": sp_randint(3, 10000),
        #"min_samples_split": sp_randint(1, 300),
        #"min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }

    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=50,cv=10,scoring='roc_auc')

    clf.fit(train_x, train_y)
    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions= clf.predict_proba(test_x)[:, 1]

    loss = roc_auc_score(valid_y,valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)
    data.saveData(valid_id,valid_predictions,"./valid_results/valid_"+str(model_id)+".csv")
    data.saveData(test_id,test_predictions,"./results/results_"+str(model_id)+".csv")
Example #30
def randomized_search_rfc(txt_lst, y):
	# build a classifier
	pipeline = Pipeline([
	('vect', CountVectorizer(stop_words='english', analyzer = analyzer, ngram_range=(1, 3)))
	,('tfidf', TfidfTransformer())
	, ('clf', RandomForestClassifier(n_estimators=100))
	])

	# specify parameters and distributions to sample from
	param_dist = {  
				'vect__ngram_range':[None, (1, 2), (1,3),(1,4)],
				"clf__max_depth": map(lambda x: int(x), np.logspace(1, 4, 10)), #sp.stats.randint(10,1000),
	              "clf__max_features": map(lambda x: int(x), np.logspace(0, 3, 10)),
	             "clf__min_samples_split": sp.stats.randint(1, 11),
	              "clf__min_samples_leaf": sp.stats.randint(2, 11),
	              "clf__criterion": ["gini", "entropy"]
	              }
	n_iter_search = 50
	grid_search = RandomizedSearchCV(pipeline, param_distributions=param_dist,
	                          verbose = 1, n_iter=n_iter_search, cv = 5, n_jobs=1, scoring = 'accuracy')
	start = time.time()
	grid_search.fit(txt_lst, y)
	print("RandomizedSearchCV took %.2f seconds for %d candidates"
	      " parameter settings." % ((time.time() - start), n_iter_search))
	report(grid_search.grid_scores_, n_top = 5)
	return grid_search
Example #31
            'median_width': expon(scale=1, loc=median_w),
            'kernel_size': [2, 3, 4, 5, 6, 7, 8]
        }
        param_grid = []
        for i in xrange(N):
            param_grid.append(params)
        i = 0
        for params in param_grid:
            mkl = mkl_regressor()
            rs = RS(mkl,
                    param_distributions=params,
                    n_iter=20,
                    n_jobs=24,
                    cv=k,
                    scoring="mean_squared_error")  #"r2")
            rs.fit(data, labels)
            rs.best_estimator_.save(
                '/almac/ignacio/data/mkl_models/mkl_%d.model' % i)

            if args.estimate:  # If user wants to save estimates
                test_predict(data=data,
                             machine=rs.best_estimator_,
                             labels=labels,
                             out_file=out_file)
            if args.predict:  # If user wants to predict and save just after training.
                assert not args.X is None  # If test data is provided
                #preds = rs.best_estimator_.predict(data_t)
                if args.Y:  # Get performance if test labels are provided
                    test_predict(data=data_t,
                                 machine=rs.best_estimator_,
                                 labels=labels_t,
Example #32
param_distribs = {
    'n_estimators': randint(low=1, high=400),
    'learning_rate': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
    'max_features': randint(10, 115),
    'max_depth': randint(low=1, high=4)
}

boost_reg = GradientBoostingRegressor(random_state=42)
rnd_search = RandomizedSearchCV(boost_reg,
                                param_distributions=param_distribs,
                                n_iter=10,
                                cv=10,
                                scoring='mean_squared_error',
                                random_state=42)
rnd_search.fit(train_features, train_labels)

print('Best Params' + str(rnd_search.best_params_))
print('Best Estimator' + str(rnd_search.best_estimator_))
feature_importances = rnd_search.best_estimator_.feature_importances_
#print('Feature importance')
#sorted(zip(feature_importances, attributes), reverse=True)

# In[21]:

final_model = rnd_search.best_estimator_
print('Best Score ' + str(np.sqrt(-rnd_search.best_score_)))

final_predictions = final_model.predict(test_features)
final_mse = mean_squared_error(test_labels, final_predictions)
final_rmse = np.sqrt(final_mse)
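A hedged note on the scorer name: from scikit-learn 0.18 onwards 'mean_squared_error' is spelled 'neg_mean_squared_error'. The sign convention (larger is better, hence the negative best_score_) is unchanged, so the RMSE recovery via np.sqrt(-rnd_search.best_score_) above still applies. A short sketch that reuses the imports and param_distribs from the snippet above:

# Same search with the post-0.18 scorer spelling (assumes the objects defined above).
rnd_search_modern = RandomizedSearchCV(GradientBoostingRegressor(random_state=42),
                                       param_distributions=param_distribs,
                                       n_iter=10, cv=10,
                                       scoring='neg_mean_squared_error',
                                       random_state=42)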
Example #33
    'algorithm__min_samples_split': [3, 4, 5, 7, 9, 10, 15],
    'algorithm__min_samples_leaf': [2, 3, 5, 7, 10],
    'algorithm__max_leaf_nodes': [2, 4, 6, 8, 10],
    'algorithm__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'algorithm__criterion': ["gini", "entropy"],
}
scaler = MinMaxScaler()

algo = DecisionTreeClassifier()
#skb = SelectKBest(k='all')

pipeline = Pipeline(steps=[('scaler',
                            scaler), ("features",
                                      combined_features), ('algorithm', algo)])
cv = StratifiedShuffleSplit(labels, 5, test_size=0.3, random_state=42)

gs = RandomizedSearchCV(pipeline, param_grid, cv=cv, scoring='f1')

gs.fit(features, labels)
print "Best estimator", gs.best_estimator_
clf = gs.best_estimator_
# fit the optimal model
clf.fit(features_train, labels_train)
# predict based on the optimal model
pred = clf.predict(features_test)

#print "predicting  time:", round(time()-t1, 3), "s"
accuracy = accuracy_score(pred, labels_test)
print accuracy
print classification_report(labels_test, pred)
dump_classifier_and_data(clf, my_dataset, features_list)
Example #34

# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


param_dist = {
    "n_neighbors": randint(2, 400),
    "weights": ["uniform", "distance"]
}

# run randomized search
n_iter_search = 30
random_search = RandomizedSearchCV(neigh,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   scoring='roc_auc')  #or neigh

start = time()
random_search.fit(x_train, y_train)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates"
    " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)
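The report() helper above walks the legacy grid_scores_ list. A minimal sketch of the same top-n ranking written against the cv_results_ dict that replaced it in scikit-learn 0.18+; the function name is an assumption.

import numpy as np

def report_cv_results(cv_results, n_top=3):
    # cv_results is e.g. random_search.cv_results_ on scikit-learn >= 0.18
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(cv_results["rank_test_score"] == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                cv_results["mean_test_score"][idx],
                cv_results["std_test_score"][idx]))
            print("Parameters: {0}".format(cv_results["params"][idx]))
            print("")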
Example #35
paramDist = {'n_estimators': scipy.stats.randint(20,50),
             'learning_rate': [0.1],
             'max_features':['auto'],
             'max_depth': scipy.stats.expon(scale=7),
#             'min_samples_split':scipy.stats.expon(scale=2),
             'min_samples_leaf':[1]}



Rforest = RandomForestRegressor()
GBM = GradientBoostingRegressor()
grid_search = RandomizedSearchCV(Rforest,cv=3,param_distributions=paramDist,n_iter=40,n_jobs=4,scoring='mean_squared_error')
# note: the line below rebinds grid_search, so only the GBM search is actually fitted
grid_search = RandomizedSearchCV(GBM,cv=5,param_distributions=paramDist,n_iter=12,n_jobs=4,scoring='mean_squared_error')


grid_search.fit(X, Y)

scoresGrid = grid_search.grid_scores_
print grid_search.best_score_
print grid_search.best_estimator_
report(grid_search.grid_scores_)


cols = np.array(mat.drop(colRemoved,axis=1).columns)
importance = grid_search.best_estimator_.feature_importances_
featImport = pd.concat((pd.DataFrame(cols),pd.DataFrame(importance)),axis=1)
featImport.columns=['f','v']
featImport.sort('v',ascending=False,inplace=True)
featImport.set_index('f',inplace=True)
featImport.plot(kind='bar')
plt.subplots_adjust(bottom = 0.3)
        35,
        40,
        50,
        75,
        100,
        125,
        150,
        200,
    ]
},
                         verbose=1,
                         n_jobs=2,
                         cv=4,
                         scoring='roc_auc',
                         n_iter=1000)
clf.fit(train_data, outcome_train)
print('best clf score', clf.best_score_)
print('best params:', clf.best_params_)
bst = xgb.train(plst, dtrain, num_round, watchlist)
# this is prediction
preds = bst.predict(dcv)
pred_test = bst.predict(dtest)
labels = dcv.get_label()
print('error=%f' %
      (sum(1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i]) /
       float(len(preds))))

print('{0:<25} {1:>5}'.format('Feature', 'Importance'))
print("--------------------------------------")
for i in range(len(df_train_df.columns.values)):
    key = 'f' + str(i)
Example #37
X_train, X_test, y_train, y_test = train_test_split(transformed_data,
                                                    y,
                                                    test_size=0.1)

pipeline = Pipeline([('rf', RandomForestClassifier())])

param_grid = {
    'rf__max_depth': list(range(9, 20)),
    'rf__n_estimators': list(range(45, 70, 5)),
    'rf__criterion': ["gini", "entropy"],
    "rf__max_features": ["auto", None]
}

# searcher = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='accuracy')
n_iter_search = 20
searcher = RandomizedSearchCV(estimator=pipeline,
                              param_distributions=param_grid,
                              n_iter=n_iter_search)

searcher.fit(X_train, y_train)

print("Best hyper parameters")
print(searcher.best_params_)
# {'rf__max_depth': 19, 'rf__n_estimators': 55, 'rf__criterion': 'entropy', 'rf__max_features': None}

clf = searcher.best_estimator_
clf.fit(X_train, y_train)

print("Train accuracy: %.3f" % clf.score(X_train, y_train))
print("Test accuracy: %.3f" % clf.score(X_test, y_test))
Example #38
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted',
                        labels=labels)

# search
rs = RandomizedSearchCV(crf,
                        params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(
    metrics.flat_classification_report(y_test,
                                       y_pred,
                                       labels=sorted_labels,
                                       digits=3))

from collections import Counter
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(1, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dister,
                                   n_iter=n_iter_search,
                                   n_jobs=2)
start = time()
random_search.fit(X, y)

print("RandomizedSearchCV took %.2f s for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))

report(random_search.grid_scores_)

# Load the testing data
test_mat = genfromtxt(TRAINING_INPUT_DIRECTORY + '/testing_matrix.csv',
                      delimiter=',')

test_y = test_mat[:, 0]
test_x = test_mat[:, 1:]

y_true, y_pred = test_y, random_search.predict(test_x)
def load_dataset_and_analyse():
    iris = load_iris()
    X = iris.data
    y = iris.target
    knn1 = k_nearest(X, y, 1)
    X_new = [[3, 5, 4, 2], [5, 4, 3, 2]]
    knn1.predict(X_new)  # Returns 2,1

    knn5 = k_nearest(X, y, 5)
    knn5.predict(X_new)  # Returns 1, 1

    # logreg = logreg_prediciting(X,y)
    # logreg.predict(X_new)  # Returns 2,0

    # print (metrics.accuracy_score(y, knn5.predict(X)))  # Training accuracy
    # print (metrics.accuracy_score(y, knn1.predict(X)))  # Training accuracy

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=4)
    # 0.4 means the test size is 40% of the original data; 20-40% is a standard choice
    # random_state makes the split reproducible; without it, the data is split differently every time
    knn5 = k_nearest(X_train, y_train, 5)

    print(metrics.accuracy_score(y_test,
                                 knn5.predict(X_test)))  # Testing accuracy

    # Can we locate an even better value for K?
    scores = []
    for k in range(1, 26):  # Testing K = 1 to 25
        knn = k_nearest(X_train, y_train, k)
        scores.append(metrics.accuracy_score(y_test, knn.predict(X_test)))

    # Cross validation example
    # Simulate splitting a dataset of 25 observations into 5 folds
    kf = KFold(25, n_folds=5, shuffle=False)

    # 1. Dataset contains 25 observations (numbered 0 through 24)
    # 2. 5 fold cross validation, thus it runs for 5 iterations
    # 3. For each iteration, every observation is either in the
    # training set or testing set but not both
    # 4. Every observation is in the testing set exactly once

    # Print the contents of each training and testing set
    print('{} {:^61} {}'.format('Iteration', 'Training set observations',
                                'Testing set observations'))
    for iteration, data in enumerate(kf, start=1):
        print('{} {} {}'.format(iteration, data[0], data[1]))

    # 10 fold cross validation with k=5 for knn
    knn = KNeighborsClassifier(n_neighbors=5)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    # cv=10 means 10 fold cross validation
    # scoring='accuracy' classification accuracy as the evaluation metrics

    print(scores)

    # use average accuracy as an estimate of out of sample accuracy
    print(scores.mean())

    # Search for an optimal value of k for knn
    k_scores = []
    for k in range(1, 31):  # Testing K = 1 to 30
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
        k_scores.append(scores.mean())
    print(k_scores)

    plt.plot(range(1, 31), k_scores)
    plt.xlabel('Value of K for KNN')
    plt.ylabel('Cross-Validated Accuracy')
    plt.show()
    # K = 20 should be picked from this graph even though
    # K = 13, 18 and 20 have the same highest accuracy of 0.98.
    # This is because we want our models to be simple,
    # and a higher k means a less complex (smoother) model

    # 10 fold cross validation with logistics regression
    logreg = LogisticRegression()
    print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())
    # 0.95333
    # It means knn = 20 is a better fit than logreg

    # The above strategy of using a for loop to find the optimal value of K
    # can be done through GridSearchCV. It replaces the for loop and provides
    # additional functionality

    # Define the values that should be searched
    k_range = range(1, 31)

    # Create a param grid: map the parameter names to the values that should
    # be searched
    param_grid = dict(n_neighbors=k_range)
    knn = KNeighborsClassifier()
    # instantiate the grid
    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    # Set n_jobs = -1 to run computations in parallel
    # (if your computer and OS allows it)

    grid.fit(X,
             y)  # This step can take a while depending on the model and data

    # view the complete results (list of named tuples)
    grid.grid_scores_
    # [mean: 0.96, std: 0.0533, params: {'n_neighbors': 1},
    #  mean: 0.9533, std: 0.05207, params: {'n_neighbors': 2}, ..]

    grid.grid_scores_[0].parameters
    grid.grid_scores_[0].cv_validation_scores
    grid.grid_scores_[0].mean_validation_score

    grid_mean_scores = [
        result.mean_validation_score for result in grid.grid_scores_
    ]
    plt.plot(k_range, grid_mean_scores)
    plt.xlabel('Value of K for KNN')
    plt.ylabel('Cross-Validated Accuracy')
    plt.show()
    # plotting a graph isn't the most efficient way of finding the optimal k value

    # examine the best model
    print(grid.best_score_)  # best accuracy
    print(grid.best_params_)  # best param used for that accuracy
    print(grid.best_estimator_)  # best model used for the param

    weight_options = ['uniform', 'distance']
    # Another param of knn that can be tuned is the weights
    # The default value is 'uniform', which weights all k neighbours equally.
    # 'distance' is another option, where closer neighbours are
    # weighted more than further ones

    param_grid = dict(n_neighbors=k_range, weights=weight_options)

    grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
    grid.fit(X, y)

    # examine the best model
    print(grid.best_score_)  # 0.98
    print(grid.best_params_)  # {'n_neighbors': 13, 'weights' : 'uniform'}
    # Distance weighting on knn didn't improve over uniform

    # train your model using all the data and best known parameters
    knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
    knn.fit(X, y)
    knn.predict([[3, 5, 4, 2]])  # predict out of sample data (note the 2D input)

    # Shortcut: grid can do the prediction
    grid.predict([[3, 5, 4, 2]])

    # Reducing computational expense using RandomizedSearchCV
    # RandomizedSearchCV is a close cousin of GridSearchCV
    # RandomizedSearchCV searches a subset of the parameters
    # and you control the computational "budget"

    # Specify "parameter distn" rather than "parameter grid"
    param_dist = dict(n_neighbors=k_range, weights=weight_options)
    # Important: If one of your tuning parameters is continous, Specify
    # a continous distn rather than a list of values

    # n_iter controls the number of searches
    # random_state is there for the purpose of reproducibility
    rand = RandomizedSearchCV(knn,
                              param_dist,
                              cv=10,
                              scoring='accuracy',
                              n_iter=10,
                              random_state=5)
    rand.fit(X, y)
    rand.grid_scores_
    print(rand.best_score_)
    print(rand.best_params_)
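As flagged in the comments above, a continuous hyperparameter should be given as a scipy.stats distribution rather than a list. A short hedged sketch of that case, assuming a current scikit-learn and using LogisticRegression's C as the continuous parameter (an assumption; the function above only tunes discrete KNN parameters).

from scipy.stats import uniform
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

iris = load_iris()
param_dist = {'C': uniform(loc=0.01, scale=10)}   # continuous range [0.01, 10.01]
rand = RandomizedSearchCV(LogisticRegression(max_iter=1000), param_dist,
                          cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(iris.data, iris.target)
print(rand.best_params_)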
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(1, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": sp_randint(100, 600)
}

# In[4]:

search_GB = RandomizedSearchCV(model,
                               param_grid,
                               scoring='log_loss',
                               n_jobs=-1,
                               n_iter=n_iter,
                               cv=cv,
                               verbose=True)
search_GB.fit(X_train, y_train.flatten())

# In[5]:

log_model = search_GB.score(X_val, y_val.flatten())
print "Log loss = %s" % log_model
X_test = get_test()
y_pred = search_GB.predict_proba(X_test)
save_submission(model_name, log_model, y_pred)

# In[7]:

model_name
Example #42
    n_iter=25)

predictor1.fit(X, Y)

# Hyperparameters search space for a 1-hidden layer MLP
params = {
    'dropout_rate': sp.stats.uniform(0, 0.5),
    'hidden0__units': sp.stats.randint(10, 1000)
}

random_search1 = RandomizedSearchCV(predictor1,
                                    param_distributions=params,
                                    n_iter=n_iter_search_1,
                                    cv=CViterator,
                                    n_jobs=1)
random_search1.fit(X, Y)

## 2-layers

predictor2 = Classifier(layers=[
    Layer("Sigmoid", units=100, dropout=0),
    Layer("Sigmoid", units=100, dropout=0),
    Layer("Softmax", units=2)
],
                        learning_rate=0.001,
                        n_iter=25)

predictor2.fit(X, Y)

# Hyperparameters search space for a 2-hidden layers MLP
params = {
def train_classifier(x_train,
                     y_train,
                     clf_type='lr',
                     lr_regularization='l1',
                     svc_kernel='rbf',
                     optimize_params=True,
                     use_pca=False,
                     param_optimization_iter=100,
                     verbose=0):

    # Define classifiers
    if clf_type == 'lr':
        clf = LogisticRegression(penalty=lr_regularization)
        param_dist = {"clf__C": scipy.stats.expon(scale=100)}
        has_prob = True
    elif clf_type == 'svc':
        clf = SVC(kernel=svc_kernel)
        param_dist = {
            'clf__C': scipy.stats.expon(scale=100),
            'clf__gamma': scipy.stats.expon(scale=.1)
        }
        has_prob = False
    elif clf_type == 'rf':
        clf = RandomForestClassifier(n_estimators=20)
        param_dist = {
            "clf__max_depth": [3, None],
            "clf__max_features": scipy.stats.randint(1, 11),
            "clf__min_samples_split": scipy.stats.randint(1, 11),
            "clf__min_samples_leaf": scipy.stats.randint(1, 11),
            "clf__bootstrap": [True, False],
            "clf__criterion": ["gini", "entropy"]
        }
        has_prob = True

    else:
        print('Classifier type {} not found'.format(clf_type))
        return -1

    if use_pca:
        clf = Pipeline([('scale', sklearn.preprocessing.StandardScaler()),
                        ('pca', sklearn.decomposition.PCA(0.95)),
                        ('clf', clf)])
    else:
        clf = Pipeline([('scale', sklearn.preprocessing.StandardScaler()),
                        ('clf', clf)])
    # Run parameter optimization over training set
    if optimize_params:
        random_search = RandomizedSearchCV(clf,
                                           param_distributions=param_dist,
                                           n_iter=param_optimization_iter,
                                           scoring='roc_auc',
                                           verbose=verbose)
        random_search.fit(x_train, y_train)
        if verbose > 0:
            report(random_search.grid_scores_)
        params = random_search.best_params_
        clf.set_params(**params)

    # Train final model
    clf.fit(x_train, y_train)
    return clf, has_prob
Example #44
                                param_distributions=param_dist,
                                n_iter=n_iter_search,
                                scoring='mean_absolute_error')
    search = GridSearchCV(clf,
                          param_grid=param_dist,
                          scoring='mean_absolute_error')

    lle = manifold.LocallyLinearEmbedding(n_components=nfeats)
    for oidx, (train, test) in enumerate(cv):
        # print '=========\ncv %d/%d\n========='%(oidx+1,nfolds)
        X_train, X_test = X[train], X[test]
        y_train, y_test = y[train], y[test]

        # X_train = lle.fit_transform(X_train)
        # X_test = lle.transform(X_test)

        search.fit(X_train, y_train)

        clf = search.best_estimator_
        clf.fit(X_train, y_train)
        test_scores.append(mean_absolute_error(clf.predict(X_test), y_test))
        train_scores.append(mean_absolute_error(clf.predict(X_train), y_train))

        clf = DummyRegressor(strategy='median')
        clf.fit(X_train, y_train)
        dummy_scores.append(mean_absolute_error(clf.predict(X_test), y_test))
    print '\n', seed, b
    print 'dummy: %.3f' % np.median(dummy_scores)
    print 'test: %.3f' % np.median(test_scores)
    print 'train: %.3f' % np.median(train_scores)
Example #45
# k-NN
print("\n")
print("[INFO] evaluating raw pixel accuracy...")
knn1 = KNeighborsClassifier(n_neighbors=15)
knn1.fit(trainRI, trainRL)
acc = knn1.score(testRI, testRL)
#print("[INFO] k-NN classifier: k=%d" % args["neighbors"])
print("[INFO] raw pixel accuracy: {:.2f}%".format(acc * 100))
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']
# specify "parameter distributions" rather than a "parameter grid"
param_dist = dict(n_neighbors=k_range, weights=weight_options)
# n_iter controls the number of searches
rand = RandomizedSearchCV(knn1, param_dist, cv=3, scoring='accuracy', n_iter=10,n_jobs=-1, random_state=5)
rand.fit(rawImages, class_names)
rand.grid_scores_
# examine the best model
print(rand.best_score_)
print(rand.best_params_)
# run RandomizedSearchCV 20 times (with n_iter=10) and record the best score
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn1, param_dist, cv=3, scoring='accuracy', n_iter=10,n_jobs=-1)
    rand.fit(rawImages, class_names)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)



         X, y, test_size=0.1, random_state=42)


    # Create a Random Forest Classifier
    ## Run Randomized Search for Hyperparameter Optimization
    cv_call = StratifiedKFold(y_train,n_folds=10)
    # Specify cross-validation settings
    param_dist = {"n_estimators": randint(5, 500),
                 "class_weight": ["balanced","balanced_subsample"]}
    n_iter_search = 30
    clf = RandomForestClassifier(random_state=42,n_jobs=-1)
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search,cv=cv_call,
                                       scoring='f1')

    random_search = random_search.fit(X_train[feature_set], y_train)

    ## Retrieve Optimal Hyperparameter Values from Random Search
    best_parameters, score, _ = max(random_search.grid_scores_, key=lambda x: x[1])
    clf = RandomForestClassifier(random_state=42,n_jobs=-1,
                    n_estimators=187,#best_parameters["n_estimators"],
                    class_weight="balanced_subsample")#best_parameters["class_weight"])

    # best_parameters["n_estimators"]=187,best_parameters["class_weight"]="balanced_subsample"

    ## Run Model with Optimized Parameters on Entire Training Dataset
    clf = clf.fit(X[feature_set], y)

    # Join Test Datasets
    X_test = prepare_datasets(amazon,rot,test)
    preds_test = clf.predict(X_test[feature_set])