Example #1

def readme():
    data = sklearn.datasets.load_digits()
    X = data["data"]
    y = data["target"]

    paramgrid = {
        "kernel": ["rbf"],
        "C": np.logspace(-9, 9, num=25, base=10),
        "gamma": np.logspace(-9, 9, num=25, base=10)
    }

    random.seed(1)

    cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                       params=paramgrid,
                                       scoring="accuracy",
                                       cv=StratifiedKFold(n_splits=4),
                                       verbose=1,
                                       population_size=10,
                                       gene_mutation_prob=0.10,
                                       gene_crossover_prob=0.5,
                                       tournament_size=3,
                                       generations_number=5)
    cv.fit(X, y)
    return cv
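
The snippet above omits its imports. A minimal preamble that should make it runnable (EvolutionaryAlgorithmSearchCV ships with the sklearn-deap package, imported as evolutionary_search):

import random

import numpy as np
import sklearn.datasets
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from evolutionary_search import EvolutionaryAlgorithmSearchCV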
Example #2

def main():

    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    #X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: search over p by varying its base-10 exponent x (p = 10**x)
    opt = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.py
        params={
            "kernel": ["rbf"],
            "C": np.logspace(1e-6, 1e+6, num=20, base=10),
            "gamma": np.logspace(3.0517578125e-05, 8, num=20, base=10),
            'decision_function_shape': ['ovo', 'ovr'],
            'degree': list(range(2, 5)),
            'coef0': np.logspace(-5, -1, num=20, base=10),
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        scoring="accuracy",
        verbose=True,
        population_size=50,
        gene_mutation_prob=0.10,
        tournament_size=3,
        generations_number=10,
    )

    opt.fit(X_train, y_train)
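
A note on the fix above: np.logspace takes exponents, not values, as its first two arguments, so a grid from 1e-6 to 1e+6 is written with endpoints -6 and 6. A quick sanity check:

import numpy as np

np.logspace(-6, 6, num=5, base=10)
# array([1.e-06, 1.e-03, 1.e+00, 1.e+03, 1.e+06])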
Example #3

def svm_ga(X, y, rfe=True, paramgrid=None):

    # feature selection
    fltr = RFE(ReliefF(), n_features_to_select=5,
               step=0.5) if rfe else ReliefF(n_features_to_select=5,
                                             n_neighbors=3)

    clf = SVC()

    param_grid = {
        "svc__kernel": ["rbf"],
        'svc__C': [1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5],
        'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 1.1]
    } if paramgrid is None else paramgrid

    # make pipeline
    pipe = make_pipeline(preprocessing.StandardScaler(), fltr, clf)

    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    cv = EvolutionaryAlgorithmSearchCV(estimator=pipe,
                                       params=param_grid,
                                       scoring="accuracy",
                                       cv=10,
                                       verbose=1,
                                       population_size=50,
                                       gene_mutation_prob=0.1,
                                       gene_crossover_prob=0.8,
                                       tournament_size=10,
                                       generations_number=25)
    cv.fit(X, y)

    print(cv.best_params_)
    print(cv.best_score_)
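
The svc__-prefixed keys follow scikit-learn's pipeline parameter convention: make_pipeline names each step after its lowercased class, and <step>__<param> routes a value to that step. A minimal sketch:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = make_pipeline(StandardScaler(), SVC())
print(pipe.get_params().keys())  # includes 'svc__C', 'svc__gamma', ...
pipe.set_params(svc__C=10.0)     # value is routed to the SVC step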
Example #4

def get_GeneticGridSearchCV(model, params, X, y):
    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    print("performing genetic grid search ...")
    grid = EvolutionaryAlgorithmSearchCV(estimator=model,
                                         params=params,
                                         scoring="r2",
#                                        cv=StratifiedKFold(n_splits=2),
                                         verbose=True,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         tournament_size=3,
                                         generations_number=10,
#                                        pmap = pool.map,
                                        )

    grid.fit(X, y.ravel()) # fit the model and parameters

    # best cross-validated score under the chosen metric (r2 here)
    print("Best Score: {}".format(grid.best_score_))

    # the best parameters that caused the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

    print(pd.DataFrame(grid.cv_results_).sort_values("mean_test_score", ascending=False).head())
Example #5

def evo_search(xtrain, xtest, ytrain, ytest):
    layers = [[a, a] for a in range(10, 500, 100)]
    print(layers)

    parameters = {
                   'activation': ['identity', 'logistic', 'tanh', 'relu'],
                  # 'solver': ['lbfgs', 'sgd', 'adam'],
                  # 'learning_rate': ['constant', 'invscaling', 'adaptive'],
                   'batch_size': [5, 10, 20, 50, 100],
                   'learning_rate_init': [0.0001, 0.001, 0.01, 0.1],
                  # 'hidden_layer_sizes': generate_networks(),
                    'hidden_layer_sizes': layers
                  }
    print(parameters)
    print('Starting evolutionary search')

    cv = EvolutionaryAlgorithmSearchCV(estimator=MLPClassifier(random_state=42, max_iter=20000),
                                       params=parameters,
                                       scoring=make_scorer(f1_score),
                                       #cv=StratifiedKFold(n_splits=4),
                                       verbose=10,
                                       population_size=20,
                                       gene_mutation_prob=0.10,
                                       gene_crossover_prob=0.5,
                                       tournament_size=3,
                                       generations_number=10,
                                       n_jobs=1)
    cv.fit(xtrain, ytrain.values.ravel())

    print_classifier_stats(cv.best_estimator_, xtrain, xtest, ytrain, ytest)
    print('Evo search done...')
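
For binary targets, make_scorer(f1_score) above behaves like scoring='f1'; wrapping the metric yourself becomes useful when it needs non-default arguments, e.g.:

from sklearn.metrics import f1_score, make_scorer

weighted_f1 = make_scorer(f1_score, average='weighted')  # multi-class variant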
Example #6

    def evolution_method(self):
        # this does not work, but we need to continue
        params = dict(epochs=[200], batch_size=[4, 8])
        # cv = [(slice(None), slice(None))]

        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        my_callbacks = [es, mc]

        fit_params = {
            "epochs": 300,
            "validation_data": (self.data_obj.x_validation,
                                self.data_obj.y_validation),
            "callbacks": my_callbacks,
        }

        self.evo = EvolutionaryAlgorithmSearchCV(
            estimator=self.keras_regressor,
            params=params,
            verbose=0,
            population_size=10,
            fit_params=fit_params)

        evo_hist = self.evo.fit(X=self.data_obj.x_train,
                                y=self.data_obj.y_train)
        return evo_hist
Example #7

def SVM(X_train_little,
        y_train_little,
        X_train_pca,
        X_test_pca,
        y_train,
        y_test,
        tune_only=False):
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_validate
    C_range = np.linspace(1, 10, 101)
    gamma_range = np.linspace(3000, 4000, 100)
    param_dist = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(estimator=SVC(max_iter=200),
                                         params=param_dist,
                                         scoring="f1",
                                         cv=cv,
                                         verbose=1,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         gene_crossover_prob=0.5,
                                         tournament_size=3,
                                         generations_number=6,
                                         n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("SVM_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('C: {}\n'.format(rnds.best_estimator_.C))
    file.write('gamma: {}\n'.format(rnds.best_estimator_.gamma))
    file.close()

    if not tune_only:
        # apply best parameters
        svc = SVC(max_iter=200,
                  C=rnds.best_estimator_.C,
                  gamma=rnds.best_estimator_.gamma,
                  random_state=SEED)
        svc.fit(X_train_pca, y_train)
        sc_tr = cross_validate(svc,
                               X_train_pca,
                               y_train,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        sc_ts = cross_validate(svc,
                               X_test_pca,
                               y_test,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        pred = svc.predict(X_test_pca)
        pred_train = svc.predict(X_train_pca)

        output_report("SVM", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
Example #8

def LR2(X_train_little,
        y_train_little,
        X_train_pca,
        X_test_pca,
        y_train,
        y_test,
        tune_only=False):
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate
    C_range = np.linspace(1, 50, 50)
    tol_range = np.linspace(0.001, 0.01, 50)
    param_dist = dict(tol=tol_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=LogisticRegression(penalty='l2'),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("LR2_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('C: {}\n'.format(rnds.best_estimator_.C))
    file.write('tol: {}\n'.format(rnds.best_estimator_.tol))
    file.close()

    if not tune_only:
        # apply best parameters
        l2r = LogisticRegression(C=rnds.best_estimator_.C,
                                 tol=rnds.best_estimator_.tol,
                                 random_state=SEED)
        l2r.fit(X_train_pca, y_train)
        sc_tr = cross_validate(l2r,
                               X_train_pca,
                               y_train,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        sc_ts = cross_validate(l2r,
                               X_test_pca,
                               y_test,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        pred = l2r.predict(X_test_pca)
        pred_train = l2r.predict(X_train_pca)

        output_report("LR2", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
Example #9

def sk_params_search_best(
    clf,
    X,
    y,
    param_grid={"alpha": np.linspace(0, 1, 5)},
    method="gridsearch",
    param_search={
        "scorename": "r2",
        "cv": 5,
        "population_size": 5,
        "generations_number": 3
    },
):
    """
   Genetic: population_size=5, ngene_mutation_prob=0.10,,gene_crossover_prob=0.5, tournament_size=3,  generations_number=3

    :param X:
    :param y:
    :param clf:
    :param param_grid:
    :param method:
    :param param_search:
    :return:
  """
    p = param_search
    myscore = sk_score_get(p["scorename"])

    if method == "gridsearch":
        from sklearn.model_selection import GridSearchCV

        grid = GridSearchCV(clf, param_grid, cv=p["cv"], scoring=myscore)
        grid.fit(X, y)
        return grid.best_score_, grid.best_params_

    if method == "genetic":
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        from sklearn.model_selection import StratifiedKFold

        # paramgrid = {"alpha":  np.linspace(0,1, 20) , "l1_ratio": np.linspace(0,1, 20) }
        cv = EvolutionaryAlgorithmSearchCV(
            estimator=clf,
            params=param_grid,
            scoring=myscore,
            cv=StratifiedKFold(n_splits=p["cv"]),
            verbose=True,
            population_size=p["population_size"],
            gene_mutation_prob=0.10,
            gene_crossover_prob=0.5,
            tournament_size=3,
            generations_number=p["generations_number"],
        )

        cv.fit(X, y)
        return cv.best_score_, cv.best_params_
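
A hypothetical call, given a feature matrix X and target y, and assuming the module's sk_score_get helper resolves "r2" to a scorer:

import numpy as np
from sklearn.linear_model import Ridge

best_score, best_params = sk_params_search_best(
    Ridge(), X, y,
    param_grid={"alpha": np.linspace(0.01, 1.0, 10)},
    method="gridsearch",
    param_search={"scorename": "r2", "cv": 5,
                  "population_size": 5, "generations_number": 3},
)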
Example #10

    def ev_tree(self):
        ev_params = dict(self.default_evparams)  # copy, so the shared defaults are not mutated
        ev_params['estimator'] = RandomForestClassifier()
        ev_params['params'] = self.frst_space

        cv = EvolutionaryAlgorithmSearchCV(**ev_params)
        cv.fit(self.X_insample, self.y_insample)

        clf = cv.best_estimator_

        self.frst_called = True
        self.opt_frst = clf
Example #11

    def ev_svm(self):
        ev_params = dict(self.default_evparams)  # copy, so the shared defaults are not mutated
        ev_params['estimator'] = SVC(probability=True)
        ev_params['params'] = self.svm_space

        cv = EvolutionaryAlgorithmSearchCV(**ev_params)
        cv.fit(self.X_insample, self.y_insample)

        clf = cv.best_estimator_

        self.svm_called = True
        self.opt_svm = clf
Example #12

def NB(X_train_little,
       y_train_little,
       X_train_pca,
       X_test_pca,
       y_train,
       y_test,
       tune_only=False):
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.model_selection import cross_validate
    alpha_range = np.linspace(0, 500, 500)
    param_dist = dict(alpha=alpha_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2)
    rnds = EvolutionaryAlgorithmSearchCV(estimator=BernoulliNB(),
                                         params=param_dist,
                                         scoring="f1",
                                         cv=cv,
                                         verbose=1,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         gene_crossover_prob=0.5,
                                         tournament_size=3,
                                         generations_number=6,
                                         n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("NB_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('alpha: {}\n'.format(rnds.best_estimator_.alpha))
    file.close()

    if not tune_only:
        # apply best parameters
        gnb = BernoulliNB(alpha=rnds.best_estimator_.alpha)
        gnb.fit(X_train_pca, y_train)
        sc_tr = cross_validate(gnb,
                               X_train_pca,
                               y_train,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        sc_ts = cross_validate(gnb,
                               X_test_pca,
                               y_test,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        pred = gnb.predict(X_test_pca)
        pred_train = gnb.predict(X_train_pca)

        output_report("NB", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
Example #13

    def geneticGridTest(self):
        print("performing Genetic grid search...")

        gridSearch = EvolutionaryAlgorithmSearchCV(estimator=self.classifier,
                                                   params=self.gridParams,
                                                   cv=self.kfold,
                                                   scoring='accuracy',
                                                   verbose=True,
                                                   iid=False,
                                                   n_jobs=4,
                                                   population_size=20,
                                                   gene_mutation_prob=0.30,
                                                   tournament_size=2,
                                                   generations_number=5)
        gridSearch.fit(self.X, self.y)
Example #14

    @classmethod
    def GA_tune_lgbm(cls, x, y):
        tuner = EvolutionaryAlgorithmSearchCV(
            estimator=LGBMClassifier(),
            params=cls.lgbm_paramgrid,
            scoring="accuracy",
            cv=TimeSeriesSplit(n_splits=4),
            verbose=1,
            population_size=50,
            gene_mutation_prob=0.2,
            gene_crossover_prob=0.5,
            tournament_size=3,
            generations_number=20,
        )
        tuner.fit(x, y)
        return tuner.best_params_
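
TimeSeriesSplit keeps folds in temporal order (each training window precedes its test window), which avoids look-ahead leakage when tuning on time series. cls.lgbm_paramgrid is defined elsewhere in that class; a plausible grid for LGBMClassifier (illustrative, not from the original) might be:

lgbm_paramgrid = {
    "num_leaves": [15, 31, 63],
    "learning_rate": [0.01, 0.05, 0.1],
    "n_estimators": [100, 200, 400],
}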
Example #15

def RF_DT(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.model_selection import cross_validate
	min_samples_leaf_range = np.round(np.linspace(1, 10, 10)).astype(int)
	max_depth_range 	   = np.round(np.linspace(1, 30, 30)).astype(int)
	param_dist 			   = dict(min_samples_leaf=min_samples_leaf_range, max_depth=max_depth_range)
	num_features		   = len(X_train_little[0])
	cv 					   = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
	rnds 				   = EvolutionaryAlgorithmSearchCV( estimator     	    = RandomForestClassifier(n_estimators=int((1+num_features/2))),
															params              = param_dist,
															scoring             = "f1",
															cv                  = cv,
															verbose				= 1,
															population_size	    = 50,
															gene_mutation_prob  = 0.10,
															gene_crossover_prob = 0.5,
															tournament_size		= 3,
															generations_number	= 6,
															n_jobs				= 4)
	rnds.fit(X_train_little, y_train_little)
	# summarize the results of the random parameter search
	file = open("RF_DT_best_parameters.txt","w")
	file.write("{}\n".format(rnds.best_score_))
	file.write('min_samples_leaf: {}\n'.format(rnds.best_estimator_.min_samples_leaf))
	file.write('max_depth: {}\n'.format(rnds.best_estimator_.max_depth))
	file.close()

	if not tune_only:
		# apply best parameters RF
		rfc = RandomForestClassifier(n_estimators    = int((1+num_features/2)), 
									min_samples_leaf = rnds.best_estimator_.min_samples_leaf, 
									max_depth        = rnds.best_estimator_.max_depth,
									random_state     = SEED)
		rfc.fit(X_train_pca,y_train)
		sc_tr      = cross_validate(rfc, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
		sc_ts      = cross_validate(rfc, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
		pred       = rfc.predict(X_test_pca)
		pred_train = rfc.predict(X_train_pca)

		output_report("RF", y_train, pred_train, y_test, pred, sc_tr, sc_ts)

	return pred, pred_train
Example #16

def tune(model, X, y, cv):

    C = np.round(np.linspace(1, 10, 10)).astype(int)
    param_dist = dict(C=C, )
    #num_features		   = len(X[0])

    best_model = EvolutionaryAlgorithmSearchCV(estimator=model,
                                               params=param_dist,
                                               scoring="f1_weighted",
                                               cv=cv,
                                               verbose=1,
                                               population_size=50,
                                               gene_mutation_prob=0.10,
                                               gene_crossover_prob=0.5,
                                               tournament_size=3,
                                               generations_number=6,
                                               n_jobs=4)
    best_model.fit(X, y)

    return best_model
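
A hypothetical call, given X and y: any estimator exposing a C parameter fits the integer grid above, e.g.:

from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold

search = tune(LinearSVC(), X, y, StratifiedKFold(n_splits=5))
print(search.best_params_, search.best_score_)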
Example #17

def main():
    # data and grid as in the sklearn-deap README example
    data = sklearn.datasets.load_digits()
    X = data["data"]
    y = data["target"]

    paramgrid = {"kernel": ["rbf"],
                 "C": np.logspace(-9, 9, num=25, base=10),
                 "gamma": np.logspace(-9, 9, num=25, base=10)}

    random.seed(1)

    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                   params=paramgrid,
                                   scoring="accuracy",
                                   cv=StratifiedKFold(n_splits=4),
                                   verbose=1,
                                   population_size=5,
                                   gene_mutation_prob=0.10,
                                   gene_crossover_prob=0.5,
                                   tournament_size=3,
                                   generations_number=5,
                                   n_jobs=4)
    cv.fit(X, y)
Example #18

def tune(model, X, y, cv):

	min_samples_leaf_range = np.round(np.linspace(1, 10, 10)).astype(int)
	max_depth_range 	   = np.round(np.linspace(1, 30, 30)).astype(int)
	param_dist 			   = dict(min_samples_leaf=min_samples_leaf_range, max_depth=max_depth_range)

	best_model 			   = EvolutionaryAlgorithmSearchCV( estimator     	    = model,
															params              = param_dist,
															scoring             = "f1_weighted",
															cv                  = cv,
															verbose				= 1,
															population_size	    = 50,
															gene_mutation_prob  = 0.10,
															gene_crossover_prob = 0.5,
															tournament_size		= 3,
															generations_number	= 6,
															n_jobs				= 4)
	best_model.fit(X, y)

	return best_model
Example #19

def get_search(type, pipeline_configuration, hp_metric):
    fu_pl, clf_pl = pipeline_configuration.pipelines()
    if type == 'grid':
        return GridSearchCV(
            clf_pl,
            pipeline_configuration.parameters('grid'),
            scoring=hp_metric,
            cv=pipeline_configuration.pipeline_parameters_grid_n_splits,
            refit=False,
            n_jobs=-1,
            verbose=1), fu_pl
    elif type == 'randomized':
        return RandomizedSearchCV(
            clf_pl,
            pipeline_configuration.parameters('randomized'),
            scoring=hp_metric,
            random_state=pipeline_configuration.
            pipeline_parameters_randomized_random_state,
            n_iter=pipeline_configuration.
            pipeline_parameters_randomized_n_iter,
            cv=pipeline_configuration.pipeline_parameters_randomized_n_splits,
            refit=False,
            n_jobs=3,
            #pre_dispatch=6,
            verbose=1), fu_pl
    elif type == 'evolutionary':
        from random import seed
        seed(pipeline_configuration.
             pipeline_parameters_evolutionary_random_seed)
        return EvolutionaryAlgorithmSearchCV(
            clf_pl,
            pipeline_configuration.parameters('evolutionary'),
            scoring=hp_metric,
            cv=pipeline_configuration.
            pipeline_parameters_evolutionary_n_splits,
            population_size=pipeline_configuration.
            pipeline_parameters_evolutionary_population_size,
            gene_mutation_prob=pipeline_configuration.
            pipeline_parameters_evolutionary_gene_mutation_prob,
            gene_crossover_prob=pipeline_configuration.
            pipeline_parameters_evolutionary_gene_crossover_prob,
            tournament_size=pipeline_configuration.
            pipeline_parameters_evolutionary_tournament_size,
            generations_number=pipeline_configuration.
            pipeline_parameters_evolutionary_generations_number,
            refit=False,
            n_jobs=-1,
            verbose=1), fu_pl
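
get_search only relies on the duck-typed interface visible above (a pipelines() pair, a parameters(type) grid, and the pipeline_parameters_* attributes). A minimal stand-in configuration, purely illustrative, for the 'evolutionary' branch:

from sklearn.svm import SVC

class MinimalConfig:
    pipeline_parameters_evolutionary_random_seed = 1
    pipeline_parameters_evolutionary_n_splits = 4
    pipeline_parameters_evolutionary_population_size = 50
    pipeline_parameters_evolutionary_gene_mutation_prob = 0.10
    pipeline_parameters_evolutionary_gene_crossover_prob = 0.5
    pipeline_parameters_evolutionary_tournament_size = 3
    pipeline_parameters_evolutionary_generations_number = 5

    def pipelines(self):
        return None, SVC()  # (feature-union pipeline, classifier pipeline)

    def parameters(self, type):
        return {"C": [0.1, 1, 10]}

search, fu_pl = get_search('evolutionary', MinimalConfig(), 'accuracy')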
Example #20

def grid_search(clf, params, scoring, search_method='ev'):
    if search_method == 'grid':
        gs = GridSearchCV(clf, param_grid=params, scoring=scoring, cv=2, refit=True, n_jobs=2, verbose=2)
    elif search_method == 'ev':
        gs = EvolutionaryAlgorithmSearchCV(clf, params=params, scoring=scoring, cv=2, refit=True, n_jobs=2, verbose=2,
                                           population_size=8,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=4
                                           )
    else:
        gs = clf

    gs.fit(train_x,train_y)
    return gs
Example #21

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, evo=None, population_size=5):
    if score_func:
        if evo:
            gs = EvolutionaryAlgorithmSearchCV(clf, params=parameters, scoring=score_func, n_jobs=n_jobs, population_size=population_size)
        else:
            gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        if evo:
            gs = EvolutionaryAlgorithmSearchCV(clf, params=parameters, scoring=None, verbose=True, n_jobs=n_jobs, population_size=population_size)
        else:
            gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    print("BEST", gs.best_params_, gs.best_score_)
    best = gs.best_estimator_
    return best
Example #22

    for element in columnsSJ:
        del train_SJ[element]
        del test_SJ[element]
    for element in columnsIQ:
        del train_IQ[element]
        del test_IQ[element]

    paramgrid = {"max_depth": range(1, 10), "criterion": ["mae"]}
    paramgrid2 = {"max_depth": range(1, 10), "criterion": ["mae"]}

    rtreeForSJ = EvolutionaryAlgorithmSearchCV(
        estimator=RandomForestRegressor(),
        params=paramgrid,
        scoring="neg_mean_absolute_error",
        cv=KFold(n_splits=4),  # regression target, so KFold rather than StratifiedKFold
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=5,
        n_jobs=1)
    rtreeForIQ = EvolutionaryAlgorithmSearchCV(
        estimator=RandomForestRegressor(),
        params=paramgrid2,
        scoring="neg_mean_absolute_error",
        cv=KFold(n_splits=4),  # regression target, so KFold rather than StratifiedKFold
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=5,
        n_jobs=1)
Example #23

def main():

    rand_st = 42

    classes = ["A", "B", "C", "D", "E", "F", "G", "H"]

    from itertools import combinations

    subsets = []

    for subset in combinations(classes, 2):
        subsets.append(subset)
        try:
            os.makedirs(PREPROCESS_PATH + str(subset[0] + subset[1]))
        except OSError:
            pass  # directory already exists

    for sub in subsets:

        PATH = "/home/bruno/base-wipo/preprocess-artigo/" + str(sub[0]) + str(
            sub[1]) + "/"

        print(" --------------------------" + str(sub[0]) + str(sub[1]) +
              "--------------------------------- ")
        treinamento = "treinamento.csv"

        y = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                     PATH + treinamento),
                        header=0,
                        delimiter=";",
                        usecols=["section"],
                        quoting=3)
        '''
        X = pd.read_csv(os.path.join(os.path.dirname(__file__),PATH+treinamento),
                            header=0,delimiter=";",usecols=["data"], quoting=3)

        X = X["data"].tolist()
        '''

        X = TideneIterCSVGA(PATH + treinamento)

        tfidf_transformer = TfidfVectorizer()

        n = len(y)

        random.seed(1)
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        '''
        #--------------------------------- GA-RF ---------------------------------
        from evolutionary_search import EvolutionaryAlgorithmSearchCV

        clf_RF_gs = RandomForestClassifier(random_state=rand_st, n_jobs=-1)
        clf_RF_pg = [{
            'max_depth': np.logspace(0.3,4,num = 10 ,base=10,dtype='int'), #[1, 5, 13, 34, 87, 226, 584, 1505, 3880, 10000]
            'n_estimators' : np.logspace(0.1,3,num = 10 ,base=10,dtype='int'), #[1, 2, 5, 11, 24, 51, 107, 226, 476, 1000]
            'min_samples_split' : np.logspace(0.4, 1, num=5, base=10, dtype='int'), #[2, 3, 5, 7, 10]
            'min_samples_leaf' : np.logspace(0.1,1,num = 4 ,base=9,dtype='int'), #[1, 2, 4, 9]
            'max_features' : ['auto', None]
                      }]


        model_name = "100features_40minwords_10context"


        model = gensim.models.Word2Vec.load(model_name)


        w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}



        cv = EvolutionaryAlgorithmSearchCV(estimator=clf_RF_gs,
                                           params=clf_RF_pg,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=4),
                                           verbose=1,
                                           population_size=10,
                                           gene_mutation_prob=0.05,
                                           gene_crossover_prob=0.25,
                                           tournament_size=2,
                                           generations_number=3,
                                           n_jobs=2)


        #print(model.wv.most_similar('sensitive'))
        cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
        #cv.fit(MeanEmbeddingVectorizer(w2v).transform(X), y['section'].tolist())


        '''

        #--------------------------------- GA-SVC ---------------------------------

        paramgrid = {"C": np.logspace(-9, 9, num=25, base=10)}

        cv = EvolutionaryAlgorithmSearchCV(estimator=LinearSVC(),
                                           params=paramgrid,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=4),
                                           verbose=1,
                                           population_size=50,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=5,
                                           n_jobs=4)

        out = cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
Example #24

#learner = input("Option? ")
learner = True

if learner:
    # SVM
    from sklearn.svm import SVC
    C_range = np.linspace(1, 10, 100)
    gamma_range = np.linspace(3000, 4000, 100)
    param_dist = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
    rnds = EvolutionaryAlgorithmSearchCV(estimator=SVC(max_iter=200),
                                         params=param_dist,
                                         scoring="f1",
                                         cv=cv,
                                         verbose=1,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         gene_crossover_prob=0.5,
                                         tournament_size=3,
                                         generations_number=6,
                                         n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    print(rnds.best_score_)
    print('\nC: ')
    print(rnds.best_estimator_.C)
    print('\ngamma: ')
    print(rnds.best_estimator_.gamma)
    # apply best parameters
    svc = SVC(max_iter=400,
              C=rnds.best_estimator_.C,
              gamma=rnds.best_estimator_.gamma)
Example #25

def extract_features(data, limit=None):
    # signature and preallocation inferred from the calls below:
    # each image yields a flattened 3x256 per-channel colour histogram
    n = limit if limit is not None else len(data)
    X = np.zeros((n, 3 * 256))
    for index, row in data.iterrows():  # iterate over csv file
        if index == limit:
            break
        img = cv2.imread(root_path + '/images/' + str(row['image_id']) + '.jpg')
        histogram = np.zeros((3, 256))
        for i in range(3):  # calc hist for each channel
            histogram[i] = cv2.calcHist([img], [i], None, [256], [0, 256]).ravel()
        X[index] = histogram.ravel()  # to 1d array
    return X
    
X = extract_features(train_data, 1500)
y = train_data['image_label'].values[:X.shape[0]].ravel()
    
grid = {
    'knn__n_neighbors': [1, 10, 20, 30, 40, 60, 75, 100, 120, 160, 200],
    'knn__metric': ['euclidean', 'manhattan', 'chebyshev'],
    'knn__weights': ['uniform', 'distance'],
    'preprocess__norm': ['l1', 'l2', 'max']
}
pipeline = Pipeline(steps=[
    ('preprocess', preprocessing.Normalizer()),
    ('knn', neighbors.KNeighborsClassifier())
])

model = EvolutionaryAlgorithmSearchCV(pipeline, grid, scoring='roc_auc', verbose=True, n_jobs=4, population_size=10)
model.fit(X, y)

preds = model.predict_proba(extract_features(test_data))[:, 1]
test_data = test_data.drop('image_url', axis=1)
test_data['image_label'] = preds
test_data.to_csv(root_path + '/res.csv', index=False)
Example #26

#######################################################################

paramgrid = {"kernel":["poly"],
             "C":C_range,
             "gamma":gamma_range,
             "degree":[3]            
             }

random.seed(1)
 
cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                    params=paramgrid,
                                    scoring="accuracy",
                                    cv=StratifiedKFold(n_splits=10),
                                    verbose=True,
                                    population_size=50,
                                    gene_mutation_prob=0.10,
                                    tournament_size=10,
                                    generations_number=5,
                                    n_jobs = 5)
cv.fit(train, trainlabel)
 
print(cv.best_score_, cv.best_params_)
##############################################


#ff = mysvc.training_manCV()



#ff.train_gene(train, trainlabel, 'poly', Cmin=-10, Cmax=10, numC=21, rmin=-10, rmax=10, numr=21, degree = 3)
Example #27

		pass
	def score(self, Xt, yt):
		return roc_auc_score(yt, outdet.IsolationForest(Xt, contamination=0.1, behaviour='new', return_scores = True, **self.params))
	
params = {
	'max_samples': list(range(100, 1000)),
	'n_estimators': list(range(50,200)),
	'max_features': list(range(1, data.shape[1])) }

cv = EvolutionaryAlgorithmSearchCV (
	estimator = odIfEstimator(),
	params = params,
	gene_type = [2, 2, 2],
	verbose = 1,
	population_size = 80,
	gene_mutation_prob = .1,
	gene_crossover_prob = .5,
	tournament_size = 3,
	generations_number = 8,
	# this is already validation set, no need for cross validation
	cv = ShuffleSplit(test_size=0.99, n_splits=1),
	n_jobs = 40)
	
cv.fit(data, labels)


params = {
	'k': list(range(100,1000)),
	'x': list(range(3,30)),
	'qv': [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5] }
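
The cv=ShuffleSplit(test_size=0.99, n_splits=1) trick above scores each candidate on a single near-complete split instead of full cross-validation, as the inline comment notes:

import numpy as np
from sklearn.model_selection import ShuffleSplit

ss = ShuffleSplit(test_size=0.99, n_splits=1, random_state=0)
train_idx, test_idx = next(ss.split(np.zeros((1000, 1))))
print(len(train_idx), len(test_idx))  # 10 990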
	
Example #28

# use a full grid over all parameters

# run grid search
grid_search = GridSearchCV(clf, param_grid=param_grid)
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings." %
      (time() - start, len(grid_search.cv_results_['params'])))
#report(grid_search.cv_results_)
print(grid_search.best_score_)

# run evolutionary search
evolution_search = EvolutionaryAlgorithmSearchCV(
    estimator=clf,
    params=param_grid,
    #scoring="accuracy",
    verbose=1,
    population_size=50,
    gene_mutation_prob=0.10,
    gene_crossover_prob=0.5,
    tournament_size=3,
    generations_number=4,
)

start = time()
evolution_search.fit(X, y)
print("evolution_searchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
print(evolution_search.cv_results_)
Example #29

# pip install sklearn-deap
from evolutionary_search import EvolutionaryAlgorithmSearchCV


parameters = {'xg__learning_rate':  [0.03, 0.05],
              'xg__n_estimators':   [200,  300],
              'xg__max_depth':      [4,6],
              'pca__n_components' : [25,30]}


clf2 = EvolutionaryAlgorithmSearchCV(
                                   estimator=pipe_xg,  # pipeline whose hyper-parameters are tuned
                                   params=parameters,  # parameter ranges
                                   scoring="accuracy", # evaluation metric
                                   cv=2,               # number of folds
                                   verbose=True,
                                   population_size=50,
                                   gene_mutation_prob=0.10,
                                   tournament_size=3,
                                   generations_number=10
                                   )


start = time.time()
clf2.fit(X_train, y_train)   # took roughly 1 hr 2 min
end = time.time()
print((end - start) / 60)    # elapsed minutes

print(clf2.best_params_)
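
The 'pca__' / 'xg__' prefixes imply pipe_xg is a Pipeline with steps named 'pca' and 'xg'; a plausible construction (assumed, not shown in the original):

from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

pipe_xg = Pipeline([
    ('pca', PCA()),           # tuned via 'pca__n_components'
    ('xg', XGBClassifier()),  # tuned via 'xg__learning_rate' etc.
])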
Example #31

class Wrap:
    """use GridSearchCV, RandomizedSearchCV and Evolutionary Search with this class.

    Methods
    -------
    grid_method(self)
        uses the GridSearchCV object and .fit()
        :returns grid_history = grid.fit()

    random_method(self)
        uses the RandomizedSearchCV object and .fit()
        :returns rand_history = rand.fit()

    evolution_method(self)
        uses the EvolutionaryAlgorithmSearchCV object and .fit()
        :returns evo_hist = evo.fit()

    Attributes
    ----------
    data_obj : (DataManager)
        passed in object from DataManager class
    network_obj: (NeuralArch)
        architecture of the neural network, so we can use it in KerasRegressor
    keras_regressor: (KerasRegressor)
        a KerasRegressor object with build_fn=network_obj.build_nn"""
    def __init__(self, data_obj, patience=10):
        # TODO: explain why the class holds these different objects
        self.data_obj = data_obj
        network_obj = Net(self.data_obj)
        self.keras_regressor = KerasRegressor(build_fn=network_obj.build_nn)
        self.callback = MyCallbacks(patience=patience)

    def grid_method(self):
        """grid_method(self)
            uses the GridSearchCV object and .fit()
            :returns grid_history = grid.fit()"""

        params = dict(epochs=[200], batch_size=[4, 8])
        cv = [(slice(None), slice(None))]  # single split over all data; why have I written this over and over??

        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        csv_log = self.callback.csv_log
        my_callbacks = [es, mc, csv_log]

        self.grid = GridSearchCV(estimator=self.keras_regressor,
                                 param_grid=params,
                                 cv=cv)
        grid_history = self.grid.fit(
            X=self.data_obj.x_train,
            y=self.data_obj.y_train,
            validation_data=(self.data_obj.x_validation,
                             self.data_obj.y_validation),
            verbose=0,
            callbacks=my_callbacks)
        return grid_history

    def random_method(self):
        """grid_method(self)
            uses the GridSearchCV object and .fit()
            returns grid_history = grid.fit()"""

        params = dict(epochs=[100],
                      batch_size=[2, 4, 8, 12, 16, 20, 24, 32, 36])
        cv = [(slice(None), slice(None))]

        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        my_callbacks = [es, mc]

        self.rand = RandomizedSearchCV(estimator=self.keras_regressor,
                                       param_distributions=params,
                                       n_iter=8)
        rand_history = self.rand.fit(
            X=self.data_obj.x_train,
            y=self.data_obj.y_train,
            validation_data=(self.data_obj.x_validation,
                             self.data_obj.y_validation),
            verbose=0,
            callbacks=my_callbacks)

        return rand_history

    def evolution_method(self):
        # this does not work, but we need to continue
        params = dict(epochs=[200], batch_size=[4, 8])
        # cv = [(slice(None), slice(None))]

        es = self.callback.es
        mc = self.callback.mc
        tb = self.callback.tb
        my_callbacks = [es, mc]

        fit_params = {
            "epochs": 300,
            "validation_data": (self.data_obj.x_validation,
                                self.data_obj.y_validation),
            "callbacks": my_callbacks,
        }

        self.evo = EvolutionaryAlgorithmSearchCV(
            estimator=self.keras_regressor,
            params=params,
            verbose=0,
            population_size=10,
            fit_params=fit_params)

        evo_hist = self.evo.fit(X=self.data_obj.x_train,
                                y=self.data_obj.y_train)
        return evo_hist
Example #32

sizes = ['10', '50', '100', '150', '200', '250']
methods = ['MRMR', 'JMI', 'JMIM']
targets = np.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))

for method in methods:
    for size in sizes:
        random.seed(1)
        X = X_original
        indices = joblib.load(method + ' PICKLES/selected_indices_' + method + '.joblib.pkl')
        X = np.array(X)[:, indices]
        indices = joblib.load(method + ' PICKLES/' + size + '-' + method + '.joblib.pkl')
        X = np.array(X)[:, indices]
        f = open('genetic/' + method + '-' + size + '.txt', 'w')
        print(size)
        print(method)
        print("svm.SVC")
        f.write("svm.SVC\n")
        cv = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                           params=paramgrid,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=10),
                                           verbose=1,
                                           population_size=50,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=5,
                                           n_jobs=-1)
        cv.fit(X, targets)
        f.write('\n=======================\n')
Example #33

def NN(X_train_little,
       y_train_little,
       X_train_pca,
       X_test_pca,
       y_train,
       y_test,
       tune_only=False):
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import cross_validate
    num_features = len(X_train_little[0])
    # prepare parameter grid
    alpha_range = np.linspace(0.005, 0.015, 50)
    learning_rate_range = np.linspace(0.01, 0.07, 50)
    epsilon_range = np.logspace(-9, -6, 50)
    beta_1_range = np.linspace(0.3, 0.7, 50)
    beta_2_range = np.linspace(0.3, 0.7, 50)
    a = int((num_features + 1) / 2)
    b = int((num_features + 1) / 2 + 10)
    med_layer_range = np.arange(a, b)
    param_dist = dict(alpha=alpha_range,
                      hidden_layer_sizes=[(num_features, m, 1)
                                          for m in med_layer_range],
                      learning_rate_init=learning_rate_range,
                      epsilon=epsilon_range,
                      beta_1=beta_1_range,
                      beta_2=beta_2_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=MLPClassifier(early_stopping=True),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("MLP_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('alpha: {}\n'.format(rnds.best_estimator_.alpha))
    file.write('hidden_layer_sizes: {}\n'.format(
        rnds.best_estimator_.hidden_layer_sizes))
    file.write('learning_rate_init: {}\n'.format(
        rnds.best_estimator_.learning_rate_init))
    file.write('epsilon: {}\n'.format(rnds.best_estimator_.epsilon))
    file.write('beta_1: {}\n'.format(rnds.best_estimator_.beta_1))
    file.write('beta_2: {}\n'.format(rnds.best_estimator_.beta_2))
    file.close()

    if not tune_only:
        # apply best parameters
        mlp = MLPClassifier(
            hidden_layer_sizes=rnds.best_estimator_.hidden_layer_sizes,
            early_stopping=True,
            alpha=rnds.best_estimator_.alpha,
            learning_rate_init=rnds.best_estimator_.learning_rate_init,
            epsilon=rnds.best_estimator_.epsilon,
            beta_1=rnds.best_estimator_.beta_1,
            beta_2=rnds.best_estimator_.beta_2,
            random_state=SEED)
        mlp.fit(X_train_pca, y_train)
        sc_tr = cross_validate(mlp,
                               X_train_pca,
                               y_train,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        sc_ts = cross_validate(mlp,
                               X_test_pca,
                               y_test,
                               scoring=SCORING,
                               cv=5,
                               return_train_score=False)
        pred = mlp.predict(X_test_pca)
        pred_train = mlp.predict(X_train_pca)

        output_report("MLP", y_train, pred_train, y_test, pred, sc_tr, sc_ts)