Example #1
def fine_tune_gradient_boosting_hyper_params(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.grid_search import RandomizedSearchCV  # legacy module; sklearn.model_selection in >= 0.18
    from scipy.stats import randint as sp_randint
    from time import time
    print("-- {} --".format("Fine-tuning Gradient Boosting Regression"))
    rf = GradientBoostingRegressor(
        n_estimators=1000
    )
    param_dist = {
        "learning_rate": [0.01, 0.03, 0.05, 0.07, 0.09, 0.1, 0.15, 0.2],
        "max_depth": sp_randint(1, 15),
        "min_samples_split": sp_randint(1, 15),
        "min_samples_leaf": sp_randint(1, 15),
        "subsample": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        "max_features": sp_randint(1, 15)
    }
    n_iter_search = 300
    random_search = RandomizedSearchCV(
        rf,
        param_distributions=param_dist,
        n_iter=n_iter_search,
        n_jobs=-1,
        cv=5,
        verbose=1
    )

    start = time()
    random_search.fit(data_train_x, data_train_y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
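# Note: report() is assumed to be the grid_scores_ reporting utility shown in
# Example #18 below; it is not defined in this snippet.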
Example #2
def sk_generate_params(method, columns=None):
    param_dist = {}
    if 'rf' in method:
        param_dist = {
            "max_features": sp_unif(0.01, 1.0),
            "n_estimators": sp_randint(32, 256),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"],
            "min_samples_split": sp_randint(2, 10),
            "min_samples_leaf": sp_randint(2, 10),
            "min_weight_fraction_leaf": sp_unif(0., 0.5),
            "class_weight": ['balanced', 'balanced_subsample']
        }
    elif 'svm' in method:
        param_dist = {
            "C": sp_unif(0.01, 20.),
            "kernel": ["linear"]
        }
    elif 'lr' in method:
        param_dist = {
            "C": sp_unif(0.01, 20.),
            "penalty": ["l1", "l2"],
        }
    if 'bagged' in method:
        _param_dist = {}
        for c in columns:
            for k, v in param_dist.items():
                _param_dist['{}__{}'.format(c, k)] = v
        _param_dist['weights'] = uniform_gen_5(0., 1.)
        return _param_dist
    else:
        return param_dist
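# Hedged usage sketch (not in the original): pairing the generated distributions
# with an estimator; 'bagged' methods produce '<column>__<param>' keys for a
# per-column ensemble.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(RandomForestClassifier(), sk_generate_params('rf'), n_iter=50)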
Example #3
    def getRandomForestClf(self, X, Y, param_list):
        clfName = "Random_Forest"
        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
        clf = rf(n_estimators=300, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True, oob_score=True)  # min_samples_split must be >= 2 in recent scikit-learn
            
        if self._gridSearchFlag:
            log(clfName + " start searching param...")
            tmpLowDepth = 8
            tmpHighDepth = 30
            
            
            param_dist = {
                          "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                          "max_features": sp_randf(0,1),
                          "min_samples_split": sp_randint(1, 11),
                          "min_samples_leaf": sp_randint(1, 11),
                          "criterion": ["gini", "entropy"], 
                          "n_estimators" : sp_randint(5, 12),
                          }
            
            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
        
        else:
            if param_list is not None:
                clf = rf()
                clf.set_params(**param_list)
            clf.fit(X, Y)
            
        return clf
Example #4
def train(model_id, train_x, train_y, valid_x, valid_y, test_x):
    train_x, train_y = shuffle(train_x, train_y)

    random_state = random.randint(0, 1000000)
    rf = RandomForestClassifier(n_jobs=8)

    param_dist = {
        "n_estimators": sp_randint(100, 300),
        "criterion": ["gini"],
        #"max_depth": sp_randint(3, 10000),
        #"min_samples_split": sp_randint(1, 300),
        #"min_samples_leaf": sp_randint(1, 300),
        "max_features": sp_randint(10, 26),
        "bootstrap": [True, False],
        'random_state': sp_randint(1, 1000000),
    }

    clf = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=50,cv=10,scoring='roc_auc')

    clf.fit(train_x, train_y)
    valid_predictions = clf.predict_proba(valid_x)[:, 1]
    test_predictions= clf.predict_proba(test_x)[:, 1]

    loss = roc_auc_score(valid_y,valid_predictions)
    print('loss:')
    print(loss)
    print(clf.best_estimator_)
    data.saveData(valid_id,valid_predictions,"./valid_results/valid_"+str(model_id)+".csv")
    data.saveData(test_id,test_predictions,"./results/results_"+str(model_id)+".csv")
Example #5
    def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
        super().__init__(X, Y, tune_parameters)
        if tune_parameters:
            self.param_dist_random = {'max_features': sp_randint(1, self.X.shape[1]),
                                      'n_estimators': sp_randint(1, 100)}
        self.clf = BaggingClassifier(self.classifier, n_estimators=self.estimators, n_jobs=8,
                                     max_samples=self.max_samples, max_features=self.max_features)
Example #6
def train_cv():
    # ---------------------- load the data
    train_df = pd.read_csv("train_processed.csv",index_col="PassengerId")
    Xtrain = train_df[feature_names]
    ytrain = train_df["Survived"]

    # ---------------------- train
    loss = ['deviance', 'exponential']
    learning_rate = np.logspace(-5,1)
    n_estimate_dist = sp_randint(1000,4800)
    max_depth_dist = sp_randint(1,10)
    param_dist = dict(loss=loss,
                    learning_rate=learning_rate,
                    n_estimators=n_estimate_dist,
                    max_depth=max_depth_dist)

    gbdt = GradientBoostingClassifier(verbose=1)
    searchcv = RandomizedSearchCV(estimator=gbdt, param_distributions=param_dist,n_iter=210,verbose=1,n_jobs=-1)

    print "--------------------- RandomizedSearchCV begins"
    searchcv.fit(Xtrain,ytrain)      
    print "--------------------- RandomizedSearchCV ends"
    print "best score: ",searchcv.best_score_                                  
    print "best parameters: ",searchcv.best_params_

    common.dump_predictor('gbdt-cv.pkl',searchcv.best_estimator_)
    print "--------------------- GBDT saved into file"
Example #7
def get_random_grid_CV_params():
    """Define the Random Grid Search parameters for each model."""
    logit_params = {"C": sp_expon(loc=0.001, scale=1),
                    "fit_intercept": [True, False],
                    "intercept_scaling": sp_randint(1, 5),
                    "warm_start": [False, True]
                    }
    rf_params = {"min_samples_split": sp_randint(1, 50),
                 "min_samples_leaf": sp_randint(1, 50),
                 "criterion": ["gini", "entropy"],
                 "class_weight": ['balanced', 'balanced_subsample']
                 }
    ada_dt_params = {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                     "algorithm": ['SAMME.R', 'SAMME']
                     }
    gbc_params = {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                  "subsample": sp_uniform(loc=0.2, scale=0.8),
                  "max_features": [None, 'auto'],
                  "max_depth": sp_randint(2, 6),
                  }
    svc_params = {"C": sp_expon(loc=0.001, scale=2),
                  "kernel": ['rbf', 'poly'],
                  "degree": sp_randint(2, 10),
                  "coef0": [0, 1, 2],
                  "shrinking": [True, False]
                  }
    rnd_CV_param_distributions = {'Logistic': logit_params,
                                  'RandomForest': rf_params,
                                  'AdaBoost_DT': ada_dt_params,
                                  'GBC': gbc_params,
                                  'SVC': svc_params
                                  }
    return rnd_CV_param_distributions
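# Hedged note: each entry above is presumably paired with its estimator elsewhere,
# e.g. rnd_CV_param_distributions['GBC'] feeding a RandomizedSearchCV over a
# GradientBoostingClassifier.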
Example #8
def makeRandomCV(dataset, dbtype='CATH',
                 level=1,
                 k_iters=10,
                 minsamples=500,
                 clf=ExtraTreesClassifier(n_estimators=5, class_weight='balanced')):  # 'auto' is a deprecated alias for 'balanced'

    from scipy.stats import randint as sp_randint
    
    dataDict = dbParser(dataset,level=level,dbtype=dbtype,minsamples=minsamples)
    print(dataDict)

    labels = dataDict['target_names']

    param_dist = {"max_depth": [3, None],
              "max_features": sp_randint(1, 11),
              "min_samples_split": sp_randint(1, 11),
              "min_samples_leaf": sp_randint(1, 11),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

    n_iter_search = k_iters
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)


    random_search.fit(dataDict['vectors'], dataDict['target_names'])  # note: 'target_names' looks like class names; per-sample labels were presumably intended
    report(random_search.grid_scores_)
Example #9
def best_ExtraTree(X, y):
    from sklearn.grid_search import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from sklearn.metrics import accuracy_score
    
    X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(),
                                                        random_state=42)
    
    clf = ExtraTreesClassifier(max_depth=None, bootstrap=False)

    grid = {'n_estimators': sp_randint(250, 400),
            'min_samples_leaf' : sp_randint(1, 12),
            'max_features' : sp_randint(5, 50)}

    clf_rfc = RandomizedSearchCV(clf, n_jobs=4, n_iter=10,
                                 param_distributions=grid,
                                 scoring='accuracy')

    y_hat = clf_rfc.fit(X_train,
                    y_train.ravel()).predict(X_test)

    print('Best Params: \n', clf_rfc.best_params_ )
    print("Accuracy with Extra Forest = %4.4f"  %
          accuracy_score(y_test.ravel(), y_hat))
    binarize_y_confustion_matrix(y_test.ravel(), y_hat)
    return(clf_rfc.best_params_) 
Example #10
def main():
    NUM_TRAIN = bw_componentrecognition.NUM_TRAIN
    N_BINS = 23
    N_HU_MOMENTS = 7
    N_FEATURES = N_BINS + N_HU_MOMENTS

    X, y = bw_componentrecognition.Data.loadTrain(NUM_TRAIN, N_BINS)

    scaler = preprocessing.StandardScaler().fit(X)
    X = scaler.transform(X)
    
    clfs = [
        RandomForestClassifier(n_estimators=20),
        ]
    
    param_dists = [
        {"max_depth": [10, 5, 3, None],
         "max_features": sp_randint(1, 11),
         "min_samples_split": sp_randint(2, 11),  # must be >= 2 in recent scikit-learn
         "min_samples_leaf": sp_randint(1, 11),
         "bootstrap": [True, False],
         "criterion": ["gini", "entropy"]},
        ]
        
    
    for clf, param_dist in zip(clfs, param_dists):
        # run randomized search
        n_iter_search = 25
        
        random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                           n_iter=n_iter_search)

        random_search.fit(X, y)

        report(random_search.grid_scores_)
Example #11
    def getKnnClf(self, X, Y):
        clfName = "K_NN"
        
        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        clf = KNeighborsClassifier(
                                n_neighbors=5,
                                weights='uniform',
                                algorithm='auto',
                                leaf_size=30,
                                p=2,
                                metric='minkowski',
                                metric_params=None,
                                )
        
        if self._gridSearchFlag:
            log(clfName + " start searching param...")

            param_dist = {
                          "n_neighbors": sp_randint(4, 8),
                          "weights": ['uniform'],  # the original listed 'uniform' twice; ['uniform', 'distance'] was presumably intended
                          "leaf_size": sp_randint(30, 60),
                          "algorithm": ['auto'],  # duplicate 'auto' entry collapsed
                          }
            
            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
            
        return clf
Example #12
    def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
        super().__init__(X, Y, tune_parameters)
        if tune_parameters:
            self.param_dist_random = {'max_depth': sp_randint(1, 100),
                                      'min_samples_leaf': sp_randint(1, 100),
                                      'max_features': sp_randint(1, self.X.shape[1] - 1),
                                      'criterion': ['entropy', 'gini']}
        self.clf = RandomForestClassifier(n_estimators=100, n_jobs=8)
Example #13
def deploy_07(train, test):
    """ Deploy 07: Ensemble modeling with two types of cross validation """
    from operator import itemgetter
    from scipy.stats import randint as sp_randint
    
    from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier

    predictors = ['PassengerId','Gender', 'AgeRange',
                  'Family', 'AdjFare']
    
    # Data Munging
    train, test = deploy_03_features(train, test)
    train = train.fillna(0)
    test = test.fillna(0)

    X = train[predictors]
    y = train["Survived"]

    # Algorithm specs

    clf = RandomForestClassifier(
        n_estimators=100
    )

    # specify parameters and distributions to sample from
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 5),
                  "min_samples_split": sp_randint(1, 5),
                  "min_samples_leaf": sp_randint(1, 5),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    
    # run randomized search
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)

    # use a full grid over all parameters
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 5],
                  "min_samples_split": [1, 3, 5],
                  "min_samples_leaf": [1, 3, 5],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run grid search
    grid_search = GridSearchCV(clf, param_grid=param_grid)
   
    # Using both with a Voting Classifier
    alg = VotingClassifier(estimators=[('gr', grid_search),
                                       ('rs', random_search)], voting='soft')

    # Make submission
    create_submission(alg, train, test, predictors,
                      "results/deploy-07.csv")
Example #14
    def learn_and_predict_xgb(self, dataset='train'):
        '''
        Use xgboost to do work
        '''
        #predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Titles", "FamilyId"]
        predictors = self.PREDICTORS
        if dataset == 'train':
            param_dist = {'max_depth': sp_randint(3, 10),
                          'learning_rate': [0.01, 0.03, 0.1, 0.3, 1.0],
                          'gamma': [0, 0.1, 0.2, 0.3],
                          'subsample': [.1, .2, .3, .4, .5],
                          'colsample_bytree': [.4, .5],
                          'objective': ['binary:logistic'],
                          'n_estimators': sp_randint(20, 150),
                          }

            clf = xgb.XGBClassifier()
            #random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=500, cv=3)
            #random_search.fit(self.train_df[predictors], self.train_df['Survived'])
            #report(random_search.grid_scores_)

            params = {'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.5,
                      'n_estimators': 54, 'subsample': .3, 'gamma': 0,
                      'objective': 'binary:logistic', 'eval_metric': 'auc'}  # 0.845, cv=3
            bst = xgb.train(params, self.DMatrix_train)
            predictions = pd.Series(bst.predict(self.DMatrix_train))
            predictions[predictions >= .5] = 1
            predictions[predictions < .5] = 0
            predictions = [int(x) for x in predictions.tolist()]

            train_model = pd.DataFrame({
                'PassengerId': self.train_df['PassengerId'],
                'Survived': predictions,
            })
            train_model.to_csv('./xgb_train.csv', index=False)

        else:
            params = {'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.5,
                      'n_estimators': 54, 'subsample': .3, 'gamma': 0,
                      'objective': 'binary:logistic', 'eval_metric': 'auc'}  # 0.845, cv=3
            bst = xgb.train(params, self.DMatrix_train)
            #clf = xgb.XGBClassifier(params)
            #clf.fit(self.train_df[predictors], self.train_df['Survived'], verbose=True)
            #print(self.test_df[predictors])
            predictions = pd.Series(bst.predict(self.DMatrix_test))
            predictions_proba = predictions.copy()
            predictions[predictions >= .5] = 1
            predictions[predictions < .5] = 0
            predictions = [int(x) for x in predictions.tolist()]
            print(predictions)
            submission = pd.DataFrame({
                'PassengerId': self.test_df['PassengerId'],
                'Survived': predictions
            })
            submission.to_csv("xgboost_845.csv", index=False)

            submission_proba = pd.DataFrame({
                'PassengerId': self.test_df['PassengerId'],
                'Survived': predictions_proba,
            })
            submission_proba.to_csv("xgboost_845_soft.csv", index=False)
Example #15
def build_sample(regressor, name):

	# print estimator.get_params().keys() : specify parameters and distributions to sample from
	param_dist = {"max_depth": [3, None],
		      "max_features": sp_randint(1, 11),
		      "min_samples_split": sp_randint(1, 11),
		      "min_samples_leaf": sp_randint(1, 11)}#,
		      #"bootstrap": [True, False],
		      #"criterion": ["mse", "entropy"]}

	

	# run randomized search
	n_iter_search = 20
	random_search = RandomizedSearchCV(regressor, param_distributions=param_dist,
		                           n_iter=n_iter_search)
	
	# time...
	start = time()
	# repeat the CV procedure 10 times to get more precise results
	n = 10  
	# for each iteration, randomly hold out 10% of the data as CV set
	for i in range(n):
		X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
		      sample_X, sample_y, test_size=.10, random_state=i*SEED)
		# train with rand...
		random_search.fit(X_train, y_train)
		# train...
		#regressor = regressor.fit(X_train, y_train)
		# save model
		#store_pkl(regressor, name + ".pkl")
		# predict on train
		preds = random_search.predict(X_cv)
		# print 
		#print preds
		# create DataFrame
		#preds = DataFrame(preds, columns = ["prime_tot_ttc_preds"])
		#print preds
		#print y_cv
		# mape
		mape_r = mape(y_cv, preds)
		# print
		print "MAPE of (fold %d/%d) of %s is : %f" % (i+1 , n, name, mape_r)
		# time...
		print("RandomizedSearchCV took %.2f seconds for %d candidates"
		      " parameter settings." % ((time() - start), n_iter_search))
		report(random_search.grid_scores_)
	# predict on test
	predict_res = random_search.predict(sample_t)
	preds_on_test = DataFrame(list(zip(sample_id, predict_res)), columns = ["ID", "CODIS"])
	preds_on_test['ID'].astype(int)
	# save predictions
	store_csv(preds_on_test, name + ".csv")
	return predict_res
Example #16
def build_nn(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a regression neural network model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: number of features, used to name the output pickle file
    :return: None
    """
    net = NeuralNet(layers=[('input', InputLayer),
                            ('hidden0', DenseLayer),
                            ('hidden1', DenseLayer),
                            ('output', DenseLayer)],
                    input_shape=(None, x_train.shape[1]),  # Number of i/p nodes = number of columns in x
                    hidden0_num_units=15,
                    hidden0_nonlinearity=lasagne.nonlinearities.softmax,
                    hidden1_num_units=17,
                    hidden1_nonlinearity=lasagne.nonlinearities.softmax,
                    output_num_units=1,  # Number of o/p nodes = number of columns in y
                    output_nonlinearity=lasagne.nonlinearities.softmax,
                    max_epochs=100,
                    update_learning_rate=0.01,
                    regression=True,
                    verbose=0)

    # Finding the optimal set of params for each variable in the training of the neural network
    param_dist = {'hidden0_num_units':sp_randint(3, 30), 'hidden1_num_units':sp_randint(3, 30)}
    clf = RandomizedSearchCV(estimator=net, param_distributions=param_dist,
                             n_iter=15, n_jobs=-1)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    with open('../trained_networks/nn_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(net, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
Example #17
    def best_RandomForest(self, df=pd.DataFrame(),
                          flag_interactions=False,
                          flag_clean_features=False,
                          impute_func=None,
                          fill_test_func=None):


        df = self.df
        if impute_func:
            print('imputing data...')
            df, self.df_X_realtest = self.impute_data(df,
                                                    self.df_X_realtest,
                                                    impute_func,
                                                    fill_test_func)

        print('get X, y from training set')
        (self.X, self.y) = self.ready_for_model_train(
                                    df, flag_interactions=flag_interactions,
                                    flag_clean_features=flag_clean_features)


        clf = RandomForestClassifier(bootstrap=False)

        grid = {'n_estimators': sp_randint(170, 350),
                'min_samples_leaf': sp_randint(1, 12),
                'max_features': sp_randint(2, 50),
                'max_depth': sp_randint(5, 30),
                'criterion': ['entropy','gini']}

        clf_rfc = RandomizedSearchCV(clf, n_jobs=4,
                                    n_iter=25, cv=6,
                                    param_distributions=grid,
                                    scoring='accuracy')

        print('Finding the best parameters...')
        clf_rfc.fit(self.X, self.y.ravel())

        print('preparing X, y from test set...')
        X_test, y_test = self.ready_for_model_test(
            self.df_X_realtest, flag_interactions)

        y_hat = clf_rfc.predict(X_test)


        print('Best Params: \n')
        for k, v in clf_rfc.best_params_.items():
            print(k, v)

        print("Accuracy with Random Forest = %4.4f"  %
            accuracy_score(y_test, y_hat))
        #binarize_y_confustion_matrix(y_test, y_hat)
        return(clf_rfc.best_params_)
Example #18
def randomized_search_and_grid_search_for_hyperparameter_estimation(train_data, labels):
    # build a classifier
    clf = RandomForestClassifier(n_estimators = 20)


    # Utility function to report best scores
    def report(grid_scores, n_top = 3):
        top_scores = sorted(grid_scores, key = itemgetter(1), reverse = True)[:n_top]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  score.mean_validation_score,
                  np.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print("")


    # specify parameters and distributions to sample from
    param_dist = {"max_depth": [3, None],
                  "max_features": sp_randint(1, 11),
                  "min_samples_split": sp_randint(1, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run randomized search
    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)

    start = time()
    random_search.fit(train_data, labels)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)

    # use a full grid over all parameters
    param_grid = {"max_depth": [3, None],
                  "max_features": [1, 3, 10],
                  "min_samples_split": [1, 3, 10],
                  "min_samples_leaf": [1, 3, 10],
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}

    # run grid search
    grid_search = GridSearchCV(clf, param_grid=param_grid)
    start = time()
    grid_search.fit(train_data, labels)

    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(grid_search.grid_scores_)))
    report(grid_search.grid_scores_)
Example #19
    def __init__(self, training_iters=1000, cv_folds=5, scoring_method="f1", n_jobs=4):
        super(AdoptionPredictor, self).__init__()
        self.clf = RandomForestClassifier(class_weight="balanced_subsample")
        self.cv_params = {"folds": cv_folds, "scorer": scoring_method}
        self.n_iter = training_iters
        self.n_jobs = n_jobs
        # define param search space for RandomizedSearchCV
        self.param_grid = {"max_depth": [3, 5, 7, 9, None],
                           "n_estimators": sp_randint(10, 100),
                           "max_features": ["sqrt", "log2"],
                           "min_samples_split": sp_randint(3, 10),
                           "min_samples_leaf": sp_randint(1, 10),
                           "criterion": ["gini", "entropy"]}
Example #20
    def __init__(self, X: np.array, Y: np.array, tune_parameters=False):
        super().__init__(X, Y, tune_parameters=False)  # note: hard-codes False; sibling classes pass tune_parameters through
        self.X = X.todense()  # TensorFlow/Skflow doesn't support sparse matrices
        output_layer = len(np.unique(Y))
        if tune_parameters:
            self.param_dist_random = {'learning_rate': np.random.random(100),  # 100 candidate rates; random.random(100) in the original is a TypeError
                                      'optimizer': ['Adam'],
                                      'hidden_units': [sp_randint(50, 500), sp_randint(50, 500)]}  # note: entries are sampled as-is; concrete architectures like [50, 100] were presumably intended

        self.clf = skflow.TensorFlowDNNClassifier(hidden_units=self.hidden_units,
                                                  n_classes=output_layer, steps=self.steps,
                                                  learning_rate=self.learning_rate, verbose=0,
                                                  optimizer=self.optimizer)
Example #21
def RFC_Classifier(Train_DS, y, Actual_DS, grid=True):
    print("***************Starting Random Forest Classifier***************")
    t0 = time()

    if grid:
       #used for checking the best performance for the model using hyper parameters
        print("Starting model fit with Grid Search")

        # specify parameters and distributions to sample from
        param_dist = {
                      "max_depth": [1, 2, 3, 4, 5, None],
                      "max_features": sp_randint(1, 11),
                      "min_samples_split": sp_randint(2, 11),  # must be >= 2 in recent scikit-learn
                      "min_samples_leaf": sp_randint(1, 11),
                      "bootstrap": [True, False]
                     }

        clf = RandomForestClassifier(n_estimators=100,n_jobs=1)

        # run randomized search
        n_iter_search = 20
        clf = RandomizedSearchCV(clf, param_distributions=param_dist,
                                 n_iter=n_iter_search, scoring='log_loss')  # 'neg_log_loss' in scikit-learn >= 0.18

        start = time()
        clf.fit(Train_DS, y)

        print("RandomizedSearchCV took %.2f seconds for %d candidates"
                " parameter settings." % ((time() - start), n_iter_search))
        report(clf.grid_scores_)

        print("Best estimator found by grid search:")
        print(clf.best_estimator_)
        print(clf.grid_scores_)
        print(clf.best_score_)
        print(clf.best_params_)
        print(clf.scorer_)
    else:
        clf = RandomForestClassifier(n_estimators=10,n_jobs=1)
        Kfold_score = Kfold_Cross_Valid(Train_DS, y, clf)
        clf.fit(Train_DS, y)

    # in case it is required for stacking
    pred_Train = clf.predict_proba(Train_DS)

    #Predict actual model
    pred_Actual = clf.predict_proba(Actual_DS)
    print("Actual Model predicted")

    print("***************Ending Random Forest Classifier***************")
    return pred_Train, pred_Actual
Example #22
    def best_XGboost(self, df=pd.DataFrame(),
                     flag_interactions=False,
                     flag_clean_features=False,
                     impute_func=None,
                     fill_test_func=None):


        df = self.df
        if impute_func:
            print('imputing data...')
            df, self.df_X_realtest = self.impute_data(df,
                                                    self.df_X_realtest,
                                                    impute_func,
                                                    fill_test_func)

        print('get X, y from training set')
        (self.X, self.y) = self.ready_for_model_train(
                                    df, flag_interactions=flag_interactions,
                                    flag_clean_features=flag_clean_features)


        clf = XGBClassifier()

        grid = {'n_estimators': sp_randint(100, 600),
                'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.3, 0.5],
                'max_depth': sp_randint(5, 30),
                'min_child_weight': sp_randint(1, 5)}

        clf_rfc = RandomizedSearchCV(clf, n_jobs=3,
                                    n_iter=15, cv=4,
                                    param_distributions=grid,
                                    scoring='accuracy')

        print('Finding the best parameters...')
        clf_rfc.fit(self.X, self.y.ravel())

        print('preparing X, y from test set...')
        X_test, y_test = self.ready_for_model_test(
            self.df_X_realtest, flag_interactions)

        y_hat = clf_rfc.predict(X_test)


        print('Best Params: \n')
        for k, v in clf_rfc.best_params_.items():
            print(k, v)

        print("Accuracy with Random Forest = %4.4f"  %
            accuracy_score(y_test, y_hat))
        #binarize_y_confustion_matrix(y_test, y_hat)
        return(clf_rfc.best_params_)
Example #23
def train(feature_names, estimators):
    traindf = pd.read_csv("train_extend.csv", index_col="datetime")

    param_dist = dict(learning_rate=[0.001, 0.005, 0.01, 0.02, 0.05, 0.1, 0.3],
                      n_estimators=sp_randint(100, 2000),
                      max_depth=sp_randint(3, 6),
                      min_samples_leaf=range(1, 10)
                      )
    n_iter = 300
    n_jobs = 6
    for estimator in estimators:
        best_cv_score = estimator.train(traindf, param_dist, n_iter, n_jobs)
        print("************* '%s' got best CV score: %f" % (estimator.target_column, best_cv_score))
        estimator.dump()
Example #24
    def __init__(self, X: csr_matrix, Y: np.array, tune_parameters=False):
        super().__init__(X, Y, tune_parameters)
        if tune_parameters:
            self.param_dist_random = {'shrinking': [True, False],
                                      'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                      'degree': sp_randint(2, 5)}
        self.clf = SVC(kernel='rbf', shrinking=True)
Example #25
def return_best_rf_regressor(df, target, num_trees_hyperparameter, num_trees_final_clf, num_iterations):
    print("entering return best rf regressor function")
    if df.shape[0] < 10000:
        num_samples = df.shape[0]
    else:
        num_samples = int(df.shape[0] * 0.7)

    print("Sample dataframe")
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples)

    # TODO: figure out a way to vary this
    """
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(1, 15),
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"]}
    """
    param_dist = {"max_depth": [5, None],
                  "max_features": sp_randint(1, df.shape[1]),
                  "min_samples_split": sp_randint(2, 15),  # must be >= 2 in recent scikit-learn
                  "min_samples_leaf": sp_randint(1, 15),
                  "bootstrap": [True]}

    clf = RandomForestRegressor(n_estimators=num_trees_hyperparameter)
    print("starting hyperparameter search")
    clf_best, best_params = hyperparameter_search_random(X, y, clf, param_dist, num_iterations)

    print("sample data for fitting model")
    # train a new regressor on the entire dataset
    X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples=df.shape[0])

    clf_final = RandomForestRegressor(n_estimators=num_trees_final_clf,
                                      max_depth=best_params["max_depth"],
                                      min_samples_leaf=best_params["min_samples_leaf"],
                                      min_samples_split=best_params["min_samples_split"],
                                      bootstrap=best_params["bootstrap"],
                                      max_features=best_params["max_features"])

    print("Fitting Random Forest Regressor")
    clf_final.fit(X, y)
    return clf_final, column_list_for_sampled
Example #26
    def grid_search(self, **kwargs):
        """Using grid search to find the best parameters."""
        n_jobs = kwargs.get('n_jobs', 1)
        n_iter = kwargs.get('n_iter', 5)
        col2fit = kwargs.get('features')
        bids_path = kwargs.get('bids_path', 'data/bids.csv')
        score = kwargs.get('score')

        # use a full grid over all parameters
        parameters = {"max_depth": sp_randint(1, 30),
                      "criterion": ["gini", "entropy"],
                      "max_features": [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
                      "min_samples_leaf": sp_randint(1, 25),
                      "min_samples_split": sp_randint(1, 25),
                      "bootstrap": [True, False],
                      "class_weight": [None, "auto", "subsample"]}

        if not self.iscleaned:
            print('Preparing the data...')
            self.prepare_data(bids_path, **kwargs)
        else:
            print('data frame is already cleaned...')
        train_values = self.df_train[col2fit].values
        target_values = self.df_train['outcome'].values

        pre_dispatch = '2*n_jobs'

        # Fit the grid
        print('fitting the grid with n_jobs = {}...'.format(n_jobs))
        start = time()
        self.set_model(**kwargs)
        rf_grid = grid_search.RandomizedSearchCV(self.learner,
                                                 parameters,
                                                 n_jobs=n_jobs, verbose=2,
                                                 pre_dispatch=pre_dispatch,
                                                 scoring=score,
                                                 error_score=0,
                                                 n_iter=n_iter)
        rf_grid.fit(train_values, target_values)
        print('Grid search finished')

        print("\n\nGridSearchCV took %.2f seconds for %d candidate parameter settings."
              % (time() - start, len(rf_grid.grid_scores_)))
        self.grid_report(rf_grid.grid_scores_, 15)

        print('\n\nBest score = {}'.format(rf_grid.best_score_))
        print('Best params = {}\n\n'.format(rf_grid.best_params_))
Example #27
    def getExtraTressClf(self, X, Y, param_list=-1):
        clfName = "Extra_Trees"
        
        ## http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
        clf = ExtraTreesClassifier(
                                    n_estimators=10, 
                                    criterion='gini', 
                                    max_depth=None, 
                                    min_samples_split=2, 
                                    min_samples_leaf=1, 
                                    min_weight_fraction_leaf=0.0, 
                                    max_features='auto', 
                                    max_leaf_nodes=None, 
                                    bootstrap=False, 
                                    oob_score=False, 
                                    n_jobs=1, 
                                    random_state=None, 
                                    verbose=0, 
                                    warm_start=False, 
                                    class_weight=None)
        
        
        if self._gridSearchFlag:
            log(clfName + " start searching param...")
            tmpLowDepth = int(len(X.columns) * 0.7)
            tmpHighDepth = int(len(X.columns) )
            
            param_dist = {
                          "max_depth": sp_randint(tmpLowDepth, tmpHighDepth),
                          "max_features": sp_randf(0, 1),
                          "min_samples_split": sp_randint(2, 11),  # must be >= 2 in recent scikit-learn
                          "min_samples_leaf": sp_randint(1, 11),
                          "bootstrap": [True],  # duplicate [True, True] collapsed; oob_score requires bootstrap=True
                          "criterion": ["gini", "entropy"],
                          "oob_score": [True],  # duplicate [True, True] collapsed
                          "n_estimators": sp_randint(800, 1200),
                          }
            
            clf = self.doRandomSearch(clfName, clf, param_dist, X, Y)
        else:
            if param_list != -1:
                clf = ExtraTreesClassifier()  # the original passed param_list positionally (as n_estimators); set_params below applies it correctly
                clf.set_params(**param_list)
            clf.fit(X, Y)
        
        return clf
Example #28
def evalModel(train_data, eval_data, train_labels, eval_labels, seed):
    joined_data = np.concatenate((train_data, eval_data), axis=0)
    joined_labels = np.concatenate((train_labels, eval_labels), axis=0)
    train_mask = np.zeros(train_data.shape[0]) - 1.0
    eval_mask = np.zeros(eval_data.shape[0])
    joined_mask = np.concatenate((train_mask, eval_mask), axis=0)
    ps = PredefinedSplit(test_fold=joined_mask)
    loss = make_scorer(get_rmsle, greater_is_better=False)
    train_data = sparse.csr_matrix(train_data)
    eval_data = sparse.csr_matrix(eval_data)

    clf = RandomForestRegressor(random_state=seed, verbose=1)
    # clf.fit(train_data, train_labels)
    # preds = clf.predict(eval_data)
    # print(get_rmsle(eval_labels, preds))
    ## achieves 0.263

    # specify parameters and distributions to sample from
    param_dist = {
        "n_estimators": sp_randint(300, 800),
        "max_depth": sp_randint(10, 50),
        "max_features": ["auto", "sqrt", "log2"],
        "min_samples_split": sp_randint(2, 11),  # must be >= 2 in recent scikit-learn
        "min_samples_leaf": sp_randint(1, 11),
    }

    # run randomized search
    n_iter_search = 60
    random_search = RandomizedSearchCV(
        clf,
        param_distributions=param_dist,
        cv=ps,
        scoring=loss,
        n_iter=n_iter_search,
        n_jobs=-1,
        pre_dispatch="n_jobs",
        verbose=2,
    )

    start = time()
    random_search.fit(joined_data, joined_labels)
    print(
        "RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time() - start), n_iter_search)
    )
    report(random_search.grid_scores_)
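    # Note on the PredefinedSplit above: test_fold entries of -1 keep those samples in
    # the training set for every split, so the single resulting fold trains on
    # train_data and validates on eval_data only.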
Example #29
def evalModel(data, labels):
    loss  = make_scorer(get_rmsle, greater_is_better=False)
    seed1 = 42
    clf = xgb.XGBRegressor(seed=seed1, silent=True)
    
    param_dist = { "learning_rate":sp_uniform(0.01,0.1),
                   "n_estimators":sp_randint(50,500),
                   "max_depth": sp_randint(2,6), 
                   "subsample": sp_uniform(0.5,0.4),
                   "max_delta_step": sp_uniform(1,2),
                   "min_child_weight":sp_uniform(1,6),
                   "colsample_bytree":sp_uniform(0.8,0.2)};    
    
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist, cv=5, scoring=loss,
                                       n_iter=n_iter_search,n_jobs=-1,pre_dispatch='n_jobs',verbose=2)
    report(random_search.grid_scores_,n_top=5)
Example #30
def createRandomSearch(clf, X, y):

    param_dist = {"max_depth": [3, None], \
                  "max_features": sp_randint(1, 11), \
                  "min_samples_split": sp_randint(1, 11), \
                  "min_samples_leaf": sp_randint(1, 11), \
                  "bootstrap": [True, False], \
                  "criterion": ["gini", "entropy"]
                 }

    n_iter_search = 20
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search)
    start = time()
    random_search.fit(X, y)

    print("RandomizedSearchCV took %.2f seconds for %d candidates parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
Example #31
# Split data into train/test by year
pickups_14_15['day_of_week'] = pickups_14_15['day_of_week'].apply(str)
pickups_14_15['hour'] = pickups_14_15['hour'].apply(str)
pickups_14_15['month'] = pickups_14_15['month'].apply(str)

# Create training data (2014)
X_train = pickups_14_15[pickups_14_15['year'] == 2014].copy()  # .copy() avoids SettingWithCopy warnings from the in-place drop below
Y_train = X_train['passenger_count']
X_train.drop(['date', 'passenger_count', 'year'], axis=1, inplace=True)
X_train = pd.get_dummies(X_train)

# Try random forest model, randomized search for optimal parameters
rf = RandomForestRegressor()
# Specify parameters and distributions to sample from
param_dist = {'n_estimators': sp_randint(1, 101),
              'max_depth': [1, 2, 3, None],
              'max_features': sp_randint(1, X_train.shape[1]),
              'min_samples_split': sp_randint(2, 11),  # must be >= 2 in recent scikit-learn
              'min_samples_leaf': sp_randint(1, 11),
              'bootstrap': [True, False]}

# Run randomized search
# Try it on subset of training data to speed up search
sample_index = X_train.sample(50000).index
n_iter_search = 50
random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=n_iter_search)
random_search.fit(X_train.loc[sample_index], Y_train.loc[sample_index])  # .ix was removed from pandas; .loc is the modern equivalent
random_search.best_params_

# Create test data (2015)
Example #32
# Import data
tweets = pd.read_csv('labeled_data.csv')
grouped_tweets = tweets.groupby(['author', 'class'])['text'].apply(' '.join).reset_index()

# Train_test_split
X = grouped_tweets['text']
Y = grouped_tweets['class']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .25, random_state = 0)

# Define the pipeline
tfidf = TfidfVectorizer(tokenizer = spacy_tokenizer, max_features = 5000)
rf_grid_pipe = Pipeline([('vect', tfidf),('fit', RandomForestClassifier())])

# Grid search
param_grid = {"fit__bootstrap" : [True, False],
              "fit__n_estimators" : sp_randint(50, 150),
              "fit__max_depth" : [10, 50, None],
              "fit__max_leaf_nodes" : sp_randint(10, 50)
              }
grid = RandomizedSearchCV(rf_grid_pipe, param_grid,
                          cv=3, n_iter=20, n_jobs=14, random_state=0)
grid.fit(X_train, y_train)
preds = grid.predict(X_test)

accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

# logs for comet_ml
Example #33
# ElasticNet

EN_gscv_param_grid = dict(ElasticNetReg__alpha=[.1, 0.5, 1., 5., 10.],
                          ElasticNetReg__l1_ratio=[.05, .1, .5, .7, .9, .95])

EN_param_distro = dict(
    ElasticNetReg__alpha=EN_gscv_param_grid["ElasticNetReg__alpha"],
    ElasticNetReg__l1_ratio=sp_exp(scale=1))

# KNeighborsReg

KNR_gscv_param_grid = dict(KNeighborsReg__weights=['uniform', 'distance'],
                           KNeighborsReg__n_neighbors=[5, 10, 30, 50])

KNR_param_distro = dict(KNeighborsReg__weights=['uniform', 'distance'],
                        KNeighborsReg__n_neighbors=sp_randint(5, 50))

# RandomForestReg

RFR_gscv_param_grid = dict(
    RandomForestReg__n_estimators=[10, 30, 50, 100, 200, 500, 1000],
    RandomForestReg__criterion=['mse', 'mae'],  # 'squared_error'/'absolute_error' in scikit-learn >= 1.0
    RandomForestReg__max_features=[None, .75, .5, 'sqrt'],
    # RandomForestReg__max_features=[0.25, 'auto', 'sqrt', 'log2'],
    RandomForestReg__min_samples_leaf=[1, 3, 5],
    RandomForestReg__max_depth=[3, 5, 7, 10])

RFR_param_distro = dict(RFR_gscv_param_grid)  # copy; assigning directly would mutate the grid above
RFR_param_distro['RandomForestReg__n_estimators'] = sp_randint(10, 500)  # 100, 1000
RFR_param_distro['RandomForestReg__max_depth'] = sp_randint(3, 10)
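# Hedged usage sketch (not in the original): the double-underscore keys above target
# named pipeline steps, e.g.:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline

pipe = Pipeline([('RandomForestReg', RandomForestRegressor())])
search = RandomizedSearchCV(pipe, RFR_param_distro, n_iter=25)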
Example #34
    # gskf = list(StatifiedGroupK_Fold.StratifiedGroupKfold(n_splits=5).split(X_resampled, Y_resampled, groups))
    sgkf = StatifiedGroupK_Fold.StratifiedGroupKfold(n_splits=5)

    params = {
        'colsample_bytree': 0.9009933084016689,
        'min_child_samples': 123,
        'min_child_weight': 0.001,
        'num_leaves': 40,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'subsample': 0.8426999443200605
    }

    # GRID
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.7),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
        #'feval': 'auc',
    }

    # This parameter defines the number of HP points to be tested
    n_HP_points_to_test = 100

    # n_estimators is set to a "large value". The actual number of trees build will depend on early
    # stopping and 5000 define only the absolute maximum
    # clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metrics='none', n_jobs=-1, n_estimators=5000, class_weight='balanced')
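    # Hedged sketch (not in the original) of the tuning step the comments above describe;
    # lgb, X_resampled, Y_resampled and groups are assumed from the surrounding script,
    # and scoring='roc_auc' is an assumption based on the commented 'feval' entry.
    clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True, metrics='none',
                             n_jobs=-1, n_estimators=5000, class_weight='balanced')
    gs = RandomizedSearchCV(estimator=clf, param_distributions=param_test,
                            n_iter=n_HP_points_to_test, scoring='roc_auc',
                            cv=sgkf.split(X_resampled, Y_resampled, groups),
                            refit=True, random_state=314, verbose=True)
    gs.fit(X_resampled, Y_resampled)
    print('Best score reached: {} with params: {}'.format(gs.best_score_, gs.best_params_))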
Example #35
X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)

pca = TruncatedSVD(n_components=20)
pca.fit(X_train_tfidf)

X_train_pca = pca.transform(X_train_tfidf)

# Test data transformations
X_test_count = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_count)
X_test_pca = pca.transform(X_test_tfidf)

clf = RandomForestClassifier()

parameters_rand = {
    "n_estimators": sp_randint(300, 2000),
    "max_depth": [3, None],
    "max_features": sp_randint(1, 11),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}

# run randomized search
# Accuracy should be comparable to grid search, but runs much much faster
n_iter_search = 20
random_search = RandomizedSearchCV(clf,
                                   param_distributions=parameters_rand,
                                   n_iter=n_iter_search,
                                   n_jobs=-1)
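# Hedged completion (not in the original): fitting the search on the PCA-reduced
# training matrix; y_train/y_test are assumed from the earlier train/test split.
random_search.fit(X_train_pca, y_train)
print(random_search.best_params_)
print("test accuracy: %.3f" % random_search.score(X_test_pca, y_test))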
Example #36
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
# param_dist = { "n_estimators": [50,100,150,200],
#               "learning_rate": [0.1,0.5,1,1.5,2],
#               }

param_dist = {
    "max_depth": sp_randint(1, 15),
    "n_estimators": [100, 150, 200, 400, 500],
    "min_samples_split": sp_randint(2, 10),
    "min_samples_leaf": sp_randint(1, 10)
}

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
Example #37

n_iter = 100
k_fold = 10
# cv = kfold
# initialize the classifier
X_train, X_val, y_train, y_val, cv = load_train_and_kfold(n_folds=k_fold)



model = KNeighborsClassifier()
model_name = model.__class__.__name__
param_grid = {
      "n_neighbors": sp_randint(4,400),
      "algorithm" : ["auto", "ball_tree", "kd_tree", "brute"],
}



search_GB = RandomizedSearchCV(model, param_grid, scoring='log_loss', n_jobs=-1,  # 'neg_log_loss' in scikit-learn >= 0.18
                               n_iter=n_iter, cv=cv, verbose=True)
search_GB.fit(X_train,y_train.flatten())


Example #38
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# specify parameters and distributions to sample from
param_dist = {
    "max_depth": [3, None],
    "max_features": sp_randint(1, 30),
    "min_samples_split": sp_randint(2, 11),  # must be >= 2 in recent scikit-learn
    "min_samples_leaf": sp_randint(1, 11),
    "n_estimators": sp_randint(1, X_train.shape[1]),
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)
start = time()
random_search.fit(X_train, y_train)
print(
    "RandomizedSearchCV took %.2f seconds for %d candidates"
    " parameter settings." % ((time() - start), n_iter_search))
        print("Parameters: {0}".format(score.parameters))
        print("")


print("Starting RandomizedSearchCV")

n_features = X_train.shape[1]
N_FOLDS = 10

model = xgb.XGBRegressor()
# specify parameters and distributions to sample from
param_dist = {
    "objective": ["reg:linear"],
    #              "booster" : ["gbtree"],
    #              "eta": [0.1, 0.3, 0.5, 0.7],
    "max_depth": sp_randint(10, 30),
    "subsample": sp_uniform(0.1, 0.9),
    "colsample_bytree": sp_uniform(0.1, 1.0),
    "silent": [1],
    "seed": [42]
}

# run randomized search
n_iter_search = 30
folds = cv.KFold(n=len(y_train),
                 n_folds=N_FOLDS,
                 shuffle=True,
                 random_state=42)
random_search = RandomizedSearchCV(model,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
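                                   # hedged completion: the original snippet is cut off here;
                                   # presumably the call closed with the KFold object built above
                                   cv=folds)
random_search.fit(X_train, y_train)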
Example #40
aid_mimic.load_candidates(mimic_files['candidates'])
# aid_mimic.compute_features(features='filtered')
# aid_mimic.save_features(mimic_files['feature_values'])
aid_mimic.load_features(mimic_files['feature_values'])
aid_mimic.load_y_true(mimic_files['y_true'])

# ____________________________________________________________________________________________________________________

sample_weights = compute_sample_weight('balanced', aid_mimic.y_true)

tuning_params = [
    {
        'name': "Nearest Neighbors",
        'predictor': make_sklearn_pipeline(KNeighborsClassifier()),
        'parameters': {
            'clf__n_neighbors': sp_randint(2, 20),
            'clf__weights': ['uniform', 'distance'],
        },
        'n_iter': 1000,
        'fit_params': None,
    },
    {
        'name':
        "Linear SVM",
        'predictor':
        make_sklearn_pipeline(
            SVC(kernel="linear", class_weight='balanced', random_state=1)),
        'parameters': {
            'clf__C': sp_expon(),
        },
        'n_iter':
Example #41
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
param_dist = {
    "n_estimators": [10, 20, 30, 40, 50],
    "max_depth": [3, 5, 7, 9, 10, None],
    "max_features": sp_randint(1, 20),
    "min_samples_split": sp_randint(2, 12),
    "min_samples_leaf": sp_randint(1, 12),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}
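# Hedged usage sketch (not in the original): running a randomized search over the
# distributions above; X and y stand in for whatever training data is available.
from sklearn.model_selection import RandomizedSearchCV

search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=20, cv=5)
# search.fit(X, y)
# print(search.best_params_)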
Example #42
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# RandomizedSearchCV/GridSearchCV accept parameter values as dictionaries.
# Below we construct a dictionary of the parameter values we want to try
# for a random forest model (this sets up the search space).

param_dist = {
    "n_estimators": [10, 100, 500, 700],
    "max_depth": [3, 5, None],
    "max_features": sp_randint(5, 11),
    "min_samples_split": sp_randint(5, 11),
    "min_samples_leaf": sp_randint(5, 11),
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"]
}
# sp_randint(5, 11) samples the 6 integers 5..10, so a full grid over the values above
# would have 4 x 3 x 6 x 6 x 6 x 2 x 2 = 10,368 combinations, which would take a long
# time to run; instead we randomly sample a subset of them.

# Run the randomized search, picking 100 combinations at random from the space above.
n_iter_search = 100

# The n_iter parameter of RandomizedSearchCV controls how many parameter combinations
# are tried; param_distributions is set to param_dist, which defines the search space.
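# Hedged sketch of the search the comments above describe; clf is assumed to be the
# RandomForestClassifier being tuned, and X_train/y_train the training data.
random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=n_iter_search)
# random_search.fit(X_train, y_train)
# report(random_search.grid_scores_)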
Example #43
if best_algo == 'SVR':
    algo = getattr(sklearn.svm, best_algo)(gamma='auto')

if best_algo == 'MLPRegressor':
    algo = getattr(sklearn.neural_network, best_algo)()

if best_algo == 'XGBRegressor':
    algo = getattr(xgboost, best_algo)()

if best_algo == 'KNeighborsRegressor':
    algo = getattr(sklearn.neighbors, best_algo)()

## Begin the tune by setting hyper parameter limits per algorithm

if best_algo == 'LinearRegression':
    hyperparameters = {"penalty": ["l1", "l2"], "C": sp_randint(1, 10)}  # note: penalty/C are LogisticRegression parameters; LinearRegression accepts neither
    scoring_metric = make_scorer(explained_variance_score)

if best_algo == 'SGDRegressor':
    hyperparameters = {
        'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
        "learning_rate": ["constant", "optimal", "invscaling", "adaptive"]
    }
    scoring_metric = make_scorer(roc_auc_score)  # note: ROC AUC is a classification metric; unusual for SGDRegressor

if (best_algo
        == 'RandomForestRegressor') or (best_algo == 'AdaBoostRegressor') or (
            best_algo
            == 'GradientBoostingRegressor') or (best_algo
                                                == 'BaggingRegressor'):
    hyperparameters = {"n_estimators": sp_randint(1, 1000)}
Example #44
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score, np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


xgbnew = XGBRegressor()
# specify parameters and distributions to sample from
one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 50)
param_dist = {
    "n_estimators": sp_randint(80, 120),
    "max_depth": sp_randint(2, 15),
    "learning_rate": st.uniform(0.05, 0.1),
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

# run randomized search
n_iter_search = 20
random_search = RandomizedSearchCV(xgbnew,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(x_train2num, y_train2num)
Example #45
    def test(self):
        im_ratio = [10, 100, 200]
        sep_condition = [1, 4, 8]

        params_dict1 = {
            'test__max_depth': [None],
            'test__max_features': sp_randint(1,
                                             3),  #Number of features per bag
            'test__min_samples_split':
            sp_randint(2, 100),  #Min number of samples in a leaf node split
            'test__min_samples_leaf':
            sp_randint(1, 100),  #Min number of samples in a leaf node
            'test__bootstrap':
            [True, False],  #Sample x with/without replacement
            'test__n_estimators': [1, 2, 5, 10, 50, 75, 100, 250, 500,
                                   1000],  #Number of trees in the forest
            "test__n_jobs": [-1],
            "test__class_weight": ["balanced", "balanced_subsample"]
        }

        params_dict2 = {
            'test__n_estimators': [1, 2, 5, 10, 50, 75, 100, 250, 500],
            "test__n_jobs": [-1],
            "test__replacement": [True, False]
        }

        params_dict3 = {
            'test__n_estimators': [1, 2, 5, 10, 50, 75, 100],
            "test__algorithm": ["SAMME", "SAMME.R"],
            "test__replacement": [True, False]
        }

        params_dict4 = {
            'test__n_estimators': [1, 2, 5, 10, 50, 75, 100, 250, 500, 1000],
            "test__n_jobs": [-1],
            "test__replacement": [True, False],
            "test__base_estimator": [
                SVC(C=1.0,
                    cache_size=200,
                    class_weight="balanced",
                    coef0=0.0,
                    decision_function_shape='ovr',
                    degree=3,
                    gamma='auto',
                    kernel='rbf',
                    max_iter=-1,
                    probability=False,
                    random_state=None,
                    shrinking=True,
                    tol=0.001,
                    verbose=False)
            ],
            "test__max_samples":
            sp_unif(0, 1),
            "test__max_features":
            sp_unif(0, 1)
        }

        ensembler = [("BalancedRandomForestClassifier",
                      BalancedRandomForestClassifier, params_dict1),
                     ("EasyEnsembleClassifier", EasyEnsembleClassifier,
                      params_dict2),
                     ("RUSBoostClassifier", RUSBoostClassifier, params_dict3),
                     ("BalancedBaggingClassifier", BalancedBaggingClassifier,
                      params_dict4)]

        # with open("Ensembler.txt", "a") as f:
        #     f.write("Strategy\timbalanced_ratio\tclass_separability\tk_J1\tu_J1\tcv_score\tcv_gms\treplace_score\treplace_gms\n")

        for classifier in ensembler:
            for sep in sep_condition:
                for ratio in im_ratio:
                    X_train, Y_train, X_test, Y_test = self.load_data(
                        ratio=ratio, sep=sep)

                    batch = len(X_train)

                    for i in range(1):

                        self.classifier = classifier[1]
                        self.params_dict = classifier[2]

                        self.X_train, self.Y_train, self.X_test, self.Y_test = X_train[
                            i], Y_train[i], X_test[i], Y_test[i]

                        k_J1, u_J1 = self.J1_estimate(self.X_train,
                                                      self.Y_train,
                                                      self.X_test, self.Y_test)

                        f1_cv, g_cv, f1_rp, g_rp = self.phase2(
                            self.X_train, self.Y_train)

                        with open("Ensembler.txt", "a") as f:
                            f.write(
                                "{}\t{}\t{}\t{:.2f}\t{:.2f}\t{:.4f}\t{:.4f}\t{:.4f}\t{:.4f}\n"
                                .format(classifier[0], ratio, sep, k_J1, u_J1,
                                        f1_cv, g_cv, f1_rp, g_rp))
Ejemplo n.º 46
0
    print("TRAIN:", train_index, "TEST:", test_index)
    doc_train = doc_data[train_index]
    doc_test = doc_data[test_index]

    X_train, y_train = utils.convert_docs_to_lines(doc_train)
    X_test, y_test = utils.convert_docs_to_lines(doc_test)
    argument_sets += [(X_train, X_test, y_train, y_test)]

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('pca', TruncatedSVD(n_components=20)),
                     ('clf', RandomForestClassifier(n_jobs=-1))])

param_distributions = {
    "vect__ngram_range": [(1, 3)],
    "pca__n_components": sp_randint(20, 400),
    "clf__n_estimators": sp_randint(100, 2000),
    "clf__max_features": sp_randint(1, 8),
    "clf__min_samples_leaf": sp_randint(1, 6),
    #  "clf__class_weight": [
    #  {0: 1, 1: 1.5, 2: 1.75},
    #  {0: 1, 1: 2, 2: 3},
    #  {0: 1, 1: 3, 2: 5},
    #  ],
    "clf__criterion": ["entropy", "gini"]
}

n_iter_search = 10
random_search = RandomizedSearchCV(text_clf,
                                   param_distributions=param_distributions,
                                   n_iter=n_iter_search)
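# The original snippet is truncated at the constructor; a hedged continuation,
# reusing the splits collected in argument_sets above (everything else is assumed):
X_train, X_test, y_train, y_test = argument_sets[0]
random_search.fit(X_train, y_train)
print(random_search.best_params_)
print(random_search.score(X_test, y_test))  # accuracy of the refit pipeline on the held-out fold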
Ejemplo n.º 47
0
trainData = df.iloc[:, :-1].values  # pandas removed .ix; use .iloc for positional indexing
trainLabels = df.iloc[:, -1].values
dataset = "output/fungi/googlenetwithcontrol-test.csv"
# Loading dataset
df = pd.read_csv(dataset)
testData = df.iloc[:, :-1].values
testLabels = df.iloc[:, -1].values

#================================================================================================================
print("MultiLayer Perceptron")
#================================================================================================================

from sklearn.neural_network import MLPClassifier
from scipy.stats import uniform as sp_uniform  # needed for the continuous alpha range
clf = MLPClassifier(random_state=84)
n_iter_search = 20
param_dist = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],  # 'lbgfs' was a typo
    'alpha': sp_uniform(0.0001, 1),  # sp_randint truncates floats; alpha is continuous
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'momentum': [0.9, 0.95, 0.99]
}
model = RandomizedSearchCV(clf,
                           param_distributions=param_dist,
                           n_iter=n_iter_search)

model.fit(trainData, trainLabels)
predictionsMLP = model.predict(testData)
print(classification_report(testLabels, predictionsMLP))
print(accuracy_score(testLabels, predictionsMLP))
Ejemplo n.º 48
0
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=42,
                                                    test_size=0.3)
print(x_train.shape, x_test.shape)

# RandomSearch
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint  # statistics package -> random values from an interval for the random search

# List of possible parameters:
# sp_randint(1, 15) -> random integer between 1 and 14 (upper bound exclusive); choose the interval carefully
# "weights": ["uniform", "distance"] -> random choice between uniform and distance
# "p": [1, 2] -> random choice between p = 1 and p = 2
param_dist = {
    "n_neighbors": sp_randint(1, 15),
    "weights": ["uniform", "distance"],
    "p": [1, 2]
}
n_iter_search = 20  # build 20 candidate models

neigh = KNeighborsClassifier()

clf = RandomizedSearchCV(
    neigh, param_distributions=param_dist, n_iter=n_iter_search, cv=3
)  # CV (cross-validation): for small datasets cv ~ 3; for large datasets (several thousand samples) cv = 5..10
clf.fit(x_train, y_train)

for key in clf.cv_results_.keys():  # list the result attributes that can be inspected
    print(key)
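# A short follow-up sketch: reading off the winning configuration (clf is the
# fitted RandomizedSearchCV from above; x_test/y_test come from the split):
print(clf.best_params_)           # e.g. {'n_neighbors': ..., 'p': ..., 'weights': ...}
print(clf.best_score_)            # mean cross-validated score of that configuration
print(clf.score(x_test, y_test))  # hold-out performance of the refit model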
Ejemplo n.º 49
0
    bestParams = []
    cv = StratifiedShuffleSplit(n_splits=10, test_size=0.6, random_state=0)
    for grp in all_train_data:

        print 'Working on group : %s' % (grp)

        # get some data
        X = all_train_data[grp]['features'].values.astype(np.float32)
        y = all_train_data[grp]['labels'].astype(np.int16)

        # build a classifier
        clf = RandomForestClassifier()

        # specify parameters and distributions to sample from
        param_dist = {
            "n_estimators": sp_randint(1, 1000),
            "max_depth": sp_randint(3, 303),
            "max_features": sp_randint(1, 350),
            "min_samples_split": sp_randint(2, 350),
            "min_samples_leaf": sp_randint(1, 350),
            "bootstrap": [True, False],
            "criterion": ["gini", "entropy"]
        }

        # run randomized search
        n_iter_search = 5000
        random_search = RandomizedSearchCV(clf,
                                           scoring='neg_log_loss',
                                           param_distributions=param_dist,
                                           n_iter=n_iter_search,
                                           cv=cv)
Ejemplo n.º 50
0
# Hence, the default parameters are:
# * criterion (default=‘mse’). The function to measure the quality of a split.
# * splitter (default=‘best’). The strategy used to choose the split at each node.
# * min_samples_leaf (default=1). The minimum number of samples required to be at a leaf node.
# * min_weight_fraction_leaf (default=0.). The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.
# * random_state (default=None). The random number generator is the RandomState instance used by np.random.
# * max_leaf_nodes (default=None). Unlimited number of nodes.
# * min_impurity_decrease (default=0.). A node will be split if the split induces a decrease of the impurity greater than or equal to this value.
# * min_impurity_split (default=1e-7). Threshold for early stopping in tree growth.

# In[10]:

from scipy.stats import randint as sp_randint

param_grid = {
    'max_depth': sp_randint(2, 20),
    'min_samples_split': sp_randint(2, 20)
}

from sklearn.model_selection import RandomizedSearchCV
rgt_grid = RandomizedSearchCV(rgt,
                              param_grid,
                              scoring='neg_mean_squared_error',
                              cv=tr_val_partition,
                              n_jobs=1,
                              verbose=1)

rgt_grid.fit(X_train_minmax, y_train)

y_test_pred_rgt_G = rgt_grid.predict(X_test_minmax)
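# To close the loop, the predictions would normally be scored against the held-out
# targets; a minimal sketch, assuming y_test matches X_test_minmax:
from sklearn.metrics import mean_squared_error

print("Best parameters:", rgt_grid.best_params_)
print("Test MSE: %.4f" % mean_squared_error(y_test, y_test_pred_rgt_G))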
Ejemplo n.º 51
0
class initialConfig:
    ## The following parameters correspond to the machine learning
    ## part of the framework.

    # Number of outer folds used for the k-fold cross-validation.
    kfold_parameter = 5
    kfold_inner_parameter = 4

    # Number of parallel jobs to be initiated:
    # -1: use all processors
    # int: no of processors to use
    n_jobs = -1

    test_dataset = './datasets/dataset-string-similarity_original_1k.csv'
    # test_dataset = './datasets/dataset-string-similarity_latin_EU_NA_1k.txt'
    # test_dataset = './datasets/dataset-string-similarity-100.csv'

    # the classification method used: basic, basic_sorted, lgm
    classification_method = 'lgm'

    # This parameter contains a list of the various classifiers
    # the results of which will be compared in the experiments.
    # classifiers = ['SVM', 'Decision Tree', 'Random Forest', 'AdaBoost',
    #                'Naive Bayes', 'MLP', 'Gaussian Process', 'Extra Trees']

    # Search Method to use for best hyperparameters: randomized, grid, hyperband - not yet implemented!!!
    hyperparams_search_method = 'randomized'

    # These are the parameters that constitute the search space for GridSearchCV
    # in our experiments.
    SVM_hyperparameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
         'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [300]},
        {'kernel': ['poly'], 'degree': [1, 2, 3, 4], 'gamma': ['scale'],
         'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [300]},
        {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': ['scale'], 'max_iter': [300]}
    ]
    DecisionTree_hyperparameters = {
        'max_depth': [i for i in range(1, 33)],
        'min_samples_split': list(np.linspace(0.1, 1, 10)),
        'min_samples_leaf': list(np.linspace(0.1, 0.5, 5)),
        'max_features': [i for i in range(1, 10)]
    }
    RandomForest_hyperparameters = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 100, None],
        'criterion': ['gini', 'entropy'],
        'max_features': ['log2', 'sqrt'],  # auto is equal to sqrt
        'min_samples_leaf': [1, 2, 4],
        'min_samples_split': [2, 5, 10],
        "n_estimators": [250, 500, 1000]
    }
    XGBoost_hyperparameters = {
        "n_estimators": [500, 1000, 3000],
        # 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # hyperparameters to avoid overfitting
        'eta': list(np.linspace(0.01, 0.2, 10)),  # 'learning_rate'
        'gamma': [0, 1, 5],
        'subsample': [0.8, 0.9, 1],
        'colsample_bytree': list(np.linspace(0.3, 1, 8)),
        'min_child_weight': [1, 5, 10],
    }
    MLP_hyperparameters = {
        'learning_rate_init': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
        'max_iter': [300, 500, 1000],
        'solver': ['sgd', 'adam']
    }

    # These are the parameters that constitute the search space for RandomizedSearchCV
    # in our experiments.
    SVM_hyperparameters_dist = {
        'C': expon(scale=100), 'gamma': expon(scale=.1), 'kernel': ['rbf'], 'class_weight': ['balanced', None]
    }
    DecisionTree_hyperparameters_dist = {
        'max_depth': sp_randint(10, 100),
        'min_samples_split': list(np.linspace(0.1, 1, 50)),
        'min_samples_leaf': list(np.linspace(0.1, 0.5, 25)),
        'max_features': sp_randint(1, 11),
    }
    RandomForest_hyperparameters_dist = {
        'bootstrap': [True, False],
        'max_depth': [10, 20, 30, 40, 50, 60, 100, None],
        'criterion': ['gini', 'entropy'],
        'max_features': ['sqrt', 'log2'],  # sp_randint(1, 11)
        'min_samples_leaf': sp_randint(1, 5),
        'min_samples_split': sp_randint(2, 11),
        "n_estimators": sp_randint(250, 1000),
    }
    XGBoost_hyperparameters_dist = {
        "n_estimators": sp_randint(500, 4000),
        # 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        # hyperparameters to avoid overfitting
        'eta': expon(loc=0.01, scale=0.1),  # 'learning_rate'
        'gamma': [0, 1, 5],
        'subsample': truncnorm(0.7, 1),
        'colsample_bytree': truncnorm(0, 1),
        'min_child_weight': [1, 5, 10],
    }
    MLP_hyperparameters_dist = {
        'learning_rate_init': expon(loc=0.0001, scale=0.1),
        'max_iter': [300, 500, 1000],
        'solver': ['sgd', 'adam']
    }

    max_iter = 250
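# A hedged sketch of how these settings might be consumed downstream; the
# dispatch below is hypothetical and not part of the original configuration class.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def build_search(estimator, clf_name, config=initialConfig):
    # Pick the search strategy and the matching parameter space, e.g.
    # clf_name='SVM' resolves to SVM_hyperparameters(_dist).
    if config.hyperparams_search_method == 'randomized':
        space = getattr(config, clf_name + '_hyperparameters_dist')
        return RandomizedSearchCV(estimator, space, n_iter=config.max_iter,
                                  cv=config.kfold_inner_parameter,
                                  n_jobs=config.n_jobs)
    space = getattr(config, clf_name + '_hyperparameters')
    return GridSearchCV(estimator, space,
                        cv=config.kfold_inner_parameter,
                        n_jobs=config.n_jobs)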
Ejemplo n.º 52
0
import psycopg2 as pg
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sqlalchemy import create_engine

ALERT_AGENT_THRESHOLD = .15
LOCK_ACCOUNT_THRESHOLD = .3

connection = pg.connect(
    host='localhost',
    port=54320,
    dbname='ht_db',
    user='******'
)
engine = create_engine('postgresql://*****:*****@127.0.0.1:54320/ht_db')

data_path = '../data'
model_path = 'artifacts'
schema_path = '../misc/schemas.yaml'

# LGBM RandomSearch parameters
param_test = {'num_leaves': sp_randint(6, 50),
              'subsample': sp_uniform(loc=0.2, scale=0.8),
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
              'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100],
              'min_data_in_leaf': sp_randint(100, 3000),
              'max_bin': sp_randint(150, 400),
              'scale_pos_weight': sp_randint(2, 90)}
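# These distributions are presumably fed to a RandomizedSearchCV over LightGBM's
# sklearn wrapper; a minimal sketch under that assumption (estimator, data and
# iteration count are illustrative, not from the original):
from lightgbm import LGBMClassifier
from sklearn.model_selection import RandomizedSearchCV

lgbm_search = RandomizedSearchCV(LGBMClassifier(),
                                 param_distributions=param_test,
                                 n_iter=50, scoring='roc_auc', cv=3, n_jobs=-1)
# lgbm_search.fit(X, y)  # X/y would come from the ht_db tables loaded via `engine`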
Ejemplo n.º 53
0
def ModelFit():
    global best_model

    #construct hyperparameter grid
    param_dist = {"max_depth": [3, 10, 20, 70, None],
                  "max_features": [2, 10, 41, 80, 'sqrt'],
                  "min_samples_split": sp_randint(2, 11),
                  "min_samples_leaf": sp_randint(1, 11),
                  #"bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "n_estimators": [100, 300, 500, 800, 1000]}
    pprint(param_dist)

    #define random forest classifier function
    rf = RandomForestClassifier(random_state = 120)

    #search across 1000 randomized combinations in the above grid
    estimator = RandomizedSearchCV(estimator = rf, param_distributions = param_dist, n_iter = 1000, cv = 10, verbose = 10, random_state = 12, scoring = 'roc_auc', n_jobs = -1)

    #fit the model
    grid_result = estimator.fit(X_train, y_train)

    #find and define best estimator based on grid search
    best_model = grid_result.best_estimator_
    print('\nbest_model:\n', best_model)

    #predict y based on test data
    y_pred = grid_result.predict(X_test)

    #accuracy score
    print('accuracy score:', accuracy_score(y_test, y_pred))

    #confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(tn,fp,fn,tp)

    #classification report
    print('\nclassification report:\n',classification_report(y_test, y_pred))

    #AUC and ROC curve
    y_pred_prob = grid_result.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, y_pred_prob)
    print('auc:', auc)

    false_positive, true_positive, _ = roc_curve(y_test, y_pred_prob)

    font = {'fontname':'Helvetica'}
    plt.figure()
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(false_positive, true_positive, color='black')
    plt.xlabel('False positive rate', **font)
    plt.ylabel('True positive rate', **font)
    plt.savefig('feces_roc.png', dpi=300)
    plt.show()
    
    # Save the model as a pickle in a file 
    joblib.dump(grid_result, 'campy_rf_feces.pkl')
    
    #determine best features
    feature_importances = grid_result.best_estimator_.feature_importances_
    column_names = list(feces)
    del column_names[0]  # note: [-0] indexes the first element, not the last
    importance = pd.DataFrame(feature_importances, index=column_names, columns=["Importance"])
    sort_importance = importance.sort_values(by=['Importance'], ascending = False)
    sort_column_names = sort_importance.index.values.tolist()
    mult = 100/(sort_importance['Importance'].iloc[0])
    sort_imp_mult = sort_importance * mult
    
    top_imp = sort_imp_mult['Importance'].iloc[0:15].tolist()
    top_column_names = sort_column_names[0:15]
    # the computed names are immediately replaced with hand-cleaned display labels:
    top_column_names =  ['AvgMaxGustSpeed1.6',
                         'AvgAverageHumidity1.7',
                         'AverageHumidityTwoDayBefore',
                         'AvgMaxGustSpeed1.3',
                         'AvgMaxGustSpeed1.5',
                         'AvgMinTemperature1.7',
                         'AvgMaxWindSpeed1.7',
                         'AvgMinHumidity1.4',
                         'AvgMaxHumidity1.3',
                         'AvgPrecipitation1.4',
                         'MaxGustSpeedOneDayBefore',
                         'AvgMaxGustSpeedS1.2',
                         'AvgMaxWindSpeed1.4',
                         'AvgAverageHumidity1.3',
                         'MaxGustSpeedTwoDayBefore']
    
    plt.rcParams.update(plt.rcParamsDefault)
    y_ticks = np.arange(0, len(top_column_names))
    fig, ax = plt.subplots()
    ax.barh(y_ticks, top_imp, color = "dimgray")
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(top_column_names, **font)  # set tick positions before labels so they line up
    plt.xlabel('Relative Importance', **font)
    fig.tight_layout()
    plt.gca().invert_yaxis()
    plt.savefig('feces_var.png', dpi=300)
    plt.show()
   
    return
Ejemplo n.º 54
0
                                         y,
                                         scoring=self.scoring,
                                         cv=self.cv).mean()
            print("Model with rank: {0}".format(i))
            print("Pred score: {0}".format(pred[idx]))
            print("Mean validation score: {0:.3f}".format(score_mean))
            print("Parameters: {0}".format(param))
            print("")


if __name__ == '__main__':
    # get some data
    digits = load_digits()
    X, y = digits.data, digits.target
    clf = RandomForestClassifier(n_estimators=20)
    param_dist = {
        "max_depth": sp_randint(1, 11),
        "max_features": sp_randint(1, 11),
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11),
        "bootstrap": [True, False],
        "criterion": ["gini", "entropy"]
    }
    rpt = RPTune(clf,
                 param_distributions=param_dist,
                 n_iter='auto',
                 n_jobs=-1,
                 random_state=42,
                 scoring='accuracy')
    rpt.fit(X, y)
Ejemplo n.º 55
0
    file.close()


CLASSIFIER_MAPPING = {
    # clf - classifier
    # prep - according prepare data method
    # dist - specify parameters and distributions to sample from
    # grid - use a full grid over all parameters
    'perceptron': {
        'clf': linear_model.Perceptron,
        'prep': prepare_two_class_data,
        'dist': {
            'penalty': [None, 'l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
            'fit_intercept': [True, False],
            'n_iter': sp_randint(1, 20),
            'shuffle': [True, False],
            'verbose': sp_randint(1, 5),
            'eta0': [1.0, 1.5, 2.0],
            'random_state': [0, None],
            'class_weight': ['balanced', None],
            'warm_start': [True, False]
        },
        'grid': {
            'penalty': [None, 'l2', 'l1', 'elasticnet'],
            'alpha': [0.0001, 0.0003, 0.0005, 0.0007, 0.0008, 0.0009],
            'fit_intercept': [True, False],
            'n_iter': [5, 10, 20, 30],
            'shuffle': [True, False],
            'verbose': [0, 2, 4],
            'eta0': [1.0, 1.5, 2.0],
Ejemplo n.º 56
0
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                results['mean_test_score'][candidate],
                results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")


# specify parameters and distributions to sample from
param_dist = {
    "svm__C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "xgb__max_depth": sp_randint(3, 25),
    "xgb__min_child_weight": sp_randint(1, 7),
    "xgb__subsample": [0.6, 0.7, 0.8, 0.9, 1.0],
    "xgb__reg_lambda": [0.01, 0.1, 1.0],
    "xgb__reg_alpha": [0, 0.1, 0.5, 1.0],
    "rf__n_estimators": [10, 50, 100, 150, 200, 300, 500],
    "rf__max_depth": [5, 8, 15, 25, 30, None],
    "rf__max_features": sp_randint(1, 11),
    "rf__min_samples_split": sp_randint(2, 100),
    "rf__min_samples_leaf": sp_randint(1, 11),
    "rf__bootstrap": [True, False],
    "rf__criterion": ["gini", "entropy"]
}

# run randomized search
n_iter_search = 10000
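# The svm__/xgb__/rf__ prefixes imply a composite estimator whose construction is
# not shown; one plausible wiring is sketched below (the VotingClassifier choice
# and all constructor arguments are assumptions):
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC
from xgboost import XGBClassifier

ensemble = VotingClassifier([('svm', SVC(probability=True)),
                             ('xgb', XGBClassifier()),
                             ('rf', RandomForestClassifier())],
                            voting='soft')
random_search = RandomizedSearchCV(ensemble,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   n_jobs=-1)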
Ejemplo n.º 57
0
    
    #switch between motions and particle sets for RandomizedSearchCV
    test_motions = False
    if test_motions:
        model = model_m
        X_train = X_train_m
        y_train = y_train_m
        print("RandomizedSearchCV testing the Motions dataset")
    else:
        model = model_p
        X_train = X_train_p
        y_train = y_train_p
        print("RandomizedSearchCV testing the Particles dataset")
              
    # specify parameters and distributions to sample from
    param_dist = {"min_samples_leaf": sp_randint(5, 3000),
              "max_leaf_nodes": sp_randint(5, 300),
              "max_depth": sp_randint(3, 100),
              "criterion": ['gini', 'entropy']
              }

    #https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html#sphx-glr-auto-examples-model-selection-plot-randomized-search-py
    n_iter_search = 1000
    random_search = RandomizedSearchCV(model, param_distributions = param_dist, n_iter=n_iter_search, cv = 5, scoring='accuracy', verbose=10)    
    random_search.fit(X_train, y_train)
    report(random_search.cv_results_)
    #scores = random_search.cv_results_['mean_test_score']


grid_cv = False
if grid_cv:
Ejemplo n.º 58
0
def finding_parameter():
    """
    Find hyperparameters for the selected machine-learning model.
    :return: best parameters for the chosen traditional machine-learning algorithm
    """
    def report(results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print("Model with rank: {0}".format(i))
                print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")

    df = pd.read_csv(args.raw_data)
    df_train_input_sc, df_train_target, df_test_input_sc, df_test_target = lib.clear_data(df, args)

    if args.algo == 'decisiontree':
        clf = tree.DecisionTreeClassifier()
        param_dist = {"max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
                      "max_features": sp_randint(1, 11),
                      "min_samples_split": sp_randint(2, 11),
                      "min_samples_leaf": [0.05, 0.1, 1],
                      "criterion": ["gini", "entropy"],
                      "splitter": ["best", "random"],
                      "class_weight": ["balanced", None]
                       }

    if args.algo == 'randomforest':
        print("Finding parameter for random forest")
        # build a classifier
        clf = RandomForestClassifier(n_estimators=1000)
        param_dist = {"max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
                      "max_features": sp_randint(1, 11),
                      "min_samples_split": sp_randint(2, 11),
                      "bootstrap": [True, False],
                      "criterion": ["gini", "entropy"]}
        # Utility function to report best scores

    if args.algo == "logisticregression":
        print("Finding parameter for logisticregression")
        clf = LogisticRegression()
        param_dist = {
            "penalty": ["l1", "l2"],
            "tol": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
            "C": [0.05, 0.1],
            "fit_intercept": [True, False],
            "intercept_scaling": [0.01, 0.1, 1],
            "max_iter": [10, 100, 1000]
        }

        # specify parameters and distributions to sample from

    # run randomized search
    n_iter_search = 10000
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, cv=5, iid=False)  # iid was deprecated in scikit-learn 0.22 and removed in 0.24

    start = time()
    random_search.fit(df_train_input_sc, df_train_target)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.cv_results_)
Ejemplo n.º 59
0
xgbm = XGBClassifier(**xparams)
lgbm = LGBMClassifier(**lparams)
cgbm = CatBoostClassifier(**cparams)
rdf = RandomForestClassifier()
classifiers = [rdf, xgbm, lgbm]
classifiers = [xgbm, lgbm, cgbm]
classifiers = [xgbm, lgbm]  # successive reassignments: only this final list is used
lr = LogisticRegression(C=0.1)
grid = StackingClassifier(classifiers=classifiers,
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

n_estimators = [100, 300]
n_estimators = sp_randint(250, 500)
max_depth = [2, 3]
subsample = [0.5, 0.7]
subsample = sp_rec(0.3, 0.8)  # sp_rec: project-local sampler, presumably uniform over [0.3, 0.8]
C = [0.01, 0.2]
C = sp_rec(0.01, 0.2)
learning_rate = [0.1, 0.4]
learning_rate = sp_rec(0.1, 0.4)
reg_lambda = [2, 6]
reg_lambda = sp_randint(2, 10)
reg_alpha = [0.1, 0.3]
reg_alpha = sp_rec(0.1, 0.8)
gamma = sp_rec(0.1, 0.8)
feature_fraction = sp_rec(0.3, 0.8)
bagging_fraction = sp_rec(0.3, 0.8)
bagging_freq = sp_randint(3, 8)
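# sp_rec is not a scipy name; it is presumably a project-local helper that samples
# uniformly from [low, high]. A sketch of that assumed helper, plus one way the
# distributions above could be assembled for a single booster (all hypothetical):
from scipy.stats import uniform

def sp_rec(low, high):
    # scipy's uniform takes (loc, scale) and samples from [loc, loc + scale]
    return uniform(loc=low, scale=high - low)

xgb_param_dist = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'subsample': subsample,
    'reg_lambda': reg_lambda,
    'reg_alpha': reg_alpha,
    'gamma': gamma,
}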
Ejemplo n.º 60
0
def tune_xgb_params_randomized(estimator_cls,
                               folds: Union[KFold, StratifiedKFold],
                               label: np.ndarray,
                               metric_sklearn: str,
                               n_jobs: int,
                               params: dict,
                               train: np.ndarray,
                               n_iter: int = 20,
                               verbosity_level: int = 10,
                               **kwargs):
    """
    :param estimator_cls:
        The class type of the estimator to instantiate - either an XGBClassifier or an XGBRegressor.
    :param folds:
        A KFold or StratifiedKFold object to cross validate the parameters.
    :param label:
        An array-like containing the labels of the classification or regression problem.
    :param metric_sklearn:
        The evaluation metric to be passed to scikit-learn's RandomizedSearchCV - see
        http://scikit-learn.org/stable/modules/model_evaluation.html
        for the options this can take - e.g. 'neg_mean_squared_error' for RMSE.
    :param n_jobs:
        The number of jobs to run in parallel - e.g. -1 to use all available CPUs.
    :param params:
        A dictionary of XGB parameters.
    :param train:
        An array-like containing the training input samples.
    :param n_iter:
        An optional parameter to control the number of parameter settings that are sampled.
    :param verbosity_level:
        An optional parameter to control the verbosity of the grid searching - defaults to the most verbose option.
    :param kwargs:
        Parameter distributions may be controlled through keyword arguments - e.g. to sample uniformly between 0.5 and 0.7 for
        colsample_bytree, supply colsample_bytree_loc=0.5 and colsample_bytree_scale=0.2.
    :return:
        A dictionary of tuned parameters and a list of the parameters found at each step with their respective scores.
    """
    params_copy = clean_params_for_sk(params)
    param_distributions = {
        'colsample_bytree':
        uniform(kwargs.get('colsample_bytree_loc', 0.2),
                kwargs.get('colsample_bytree_scale', 0.8)),
        'gamma':
        uniform(kwargs.get('gamma_loc', 0), kwargs.get('gamma_scale', 0.9)),
        'max_depth':
        sp_randint(kwargs.get('max_depth_low', 2),
                   kwargs.get('max_depth_high', 11)),
        'min_child_weight':
        sp_randint(kwargs.get('min_child_weight_low', 1),
                   kwargs.get('min_child_weight_high', 11)),
        'reg_alpha':
        halfnorm(kwargs.get('reg_alpha_loc', 0),
                 kwargs.get('reg_alpha_scale', 5)),
        'reg_lambda':
        halfnorm(kwargs.get('reg_lambda_loc', 0),
                 kwargs.get('reg_lambda_scale', 5)),
        'subsample':
        uniform(kwargs.get('subsample_loc', 0.2),
                kwargs.get('subsample_scale', 0.8))
    }

    rand_search = RandomizedSearchCV(cv=folds.split(train, label),
                                     estimator=estimator_cls(**params_copy),
                                     n_iter=n_iter,
                                     n_jobs=n_jobs,
                                     param_distributions=param_distributions,
                                     scoring=metric_sklearn,
                                     verbose=verbosity_level)
    rand_search.fit(train, label)
    return rand_search.best_params_, [(rand_search.best_params_,
                                       rand_search.best_score_)]
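# A usage sketch for the helper above; the data, the empty params dict and the
# metric are all illustrative (clean_params_for_sk is defined elsewhere in the source):
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor

X = np.random.rand(200, 10)  # illustrative training data
y = np.random.rand(200)

best_params, history = tune_xgb_params_randomized(
    XGBRegressor,
    folds=KFold(n_splits=5, shuffle=True, random_state=0),
    label=y,
    metric_sklearn='neg_mean_squared_error',
    n_jobs=-1,
    params={},
    train=X,
    n_iter=10)
print(best_params)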