def main():
    """Tune an RBF-SVC on the training CSV with an evolutionary search.

    Reads the train/test CSVs (label in column 0, features from column 2
    onward) and runs EvolutionaryAlgorithmSearchCV over an SVC grid.
    """
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')
    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    #X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values
    # log-uniform: understand as search over p = exp(x) by varying x
    opt = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.py
        params={
            "kernel": ["rbf"],
            # BUG FIX: np.logspace takes *exponents*, but the intended bound
            # values (1e-6, 1e+6) were passed directly, overflowing to inf.
            # 10**-6 .. 10**6 is the auto-sklearn C range.
            "C": np.logspace(-6, 6, num=20, base=10),
            # Same fix for gamma: log-uniform over 3.0517578125e-05 .. 8.
            "gamma": np.logspace(np.log10(3.0517578125e-05), np.log10(8),
                                 num=20, base=10),
            'decision_function_shape': ['ovo', 'ovr'],
            'degree': list(range(2, 5)),
            # BUG FIX: 'coef0' appeared twice in this dict, so the first grid
            # was silently discarded; keep a single entry covering the second
            # (effective) intent, a log grid over 1e-5 .. 1e-1.
            'coef0': np.logspace(-5, -1, num=20, base=10),
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        scoring="accuracy",
        verbose=True,
        population_size=50,
        gene_mutation_prob=0.10,
        tournament_size=3,
        generations_number=10,
    )
    opt.fit(X_train, y_train)
def evolution_method(self):
    """Run an evolutionary hyper-parameter search over the Keras regressor.

    Builds a small epochs/batch_size grid, collects callbacks from
    ``self.callback``, and fits EvolutionaryAlgorithmSearchCV on the training
    split, forwarding the validation split through ``fit_params``.

    Returns whatever ``EvolutionaryAlgorithmSearchCV.fit`` returns; the
    fitted search itself is kept on ``self.evo``.
    """
    # this does not work, but we need to continue
    params = dict(epochs=[200], batch_size=[4, 8])
    # cv = [(slice(None), slice(None))]
    es = self.callback.es  # presumably early stopping -- confirm
    mc = self.callback.mc  # presumably model checkpoint -- confirm
    tb = self.callback.tb  # fetched but NOT included in my_callbacks below
    my_callbacks = [es, mc]
    # NOTE(review): fit_params sets epochs=300 while the search grid also
    # carries epochs=[200]; which one wins depends on the Keras wrapper --
    # TODO confirm.
    fit_params = {
        "epochs": 300,
        "validation_data": (self.data_obj.x_validation, self.data_obj.y_validation),
        "callbacks": my_callbacks
    }
    self.evo = EvolutionaryAlgorithmSearchCV(
        estimator=self.keras_regressor,
        params=params,
        verbose=0,
        population_size=10,
        fit_params=fit_params)
    evo_hist = self.evo.fit(X=self.data_obj.x_train, y=self.data_obj.y_train)
    return evo_hist
def readme():
    """Reproduce the sklearn-deap README example on the digits dataset.

    Runs a seeded evolutionary search over an RBF-SVC C/gamma grid with
    4-fold stratified CV and returns the fitted search object.
    """
    digits = sklearn.datasets.load_digits()
    features = digits["data"]
    labels = digits["target"]
    search_grid = {
        "kernel": ["rbf"],
        "C": np.logspace(-9, 9, num=25, base=10),
        "gamma": np.logspace(-9, 9, num=25, base=10),
    }
    random.seed(1)
    search = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                           params=search_grid,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=4),
                                           verbose=1,
                                           population_size=10,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=5)
    search.fit(features, labels)
    return search
def svm_ga(X, y, rfe=True, paramgrid=None):
    """Tune a scaler -> feature-selection -> SVC pipeline with a genetic search.

    Parameters
    ----------
    X, y : training data and labels.
    rfe : when True, wrap ReliefF in RFE; otherwise use ReliefF alone.
    paramgrid : optional search grid; a default C/gamma grid is used when None.

    Returns
    -------
    The fitted EvolutionaryAlgorithmSearchCV (previously the results were only
    printed and the search object was discarded).
    """
    # feature selection
    fltr = RFE(ReliefF(), n_features_to_select=5, step=0.5) if rfe else ReliefF(n_features_to_select=5, n_neighbors=3)
    clf = SVC()
    # NOTE(review): 10e-2 == 0.1 and 10e1 == 100, so this default grid jumps
    # 0.1, 1, 10, 100, ... and was probably meant as 1e-2 .. 1e4 -- confirm
    # intent before changing the values.
    param_grid = {
        "svc__kernel": ["rbf"],
        'svc__C': [10e-2, 10e-1, 10, 10e1, 10e2, 10e3, 10e4],
        'svc__gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 1, 1.1]
    } if paramgrid is None else paramgrid
    # make pipeline
    pipe = make_pipeline(preprocessing.StandardScaler(), fltr, clf)
    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    cv = EvolutionaryAlgorithmSearchCV(estimator=pipe,
                                       params=param_grid,
                                       scoring="accuracy",
                                       cv=10,
                                       verbose=1,
                                       population_size=50,
                                       gene_mutation_prob=0.1,
                                       gene_crossover_prob=0.8,
                                       tournament_size=10,
                                       generations_number=25)
    cv.fit(X, y)
    print(cv.best_params_)
    print(cv.best_score_)
    # Return the fitted search so callers can use best_estimator_ directly.
    return cv
def get_GeneticGridSearchCV(model, params, X, y):
    """Run an evolutionary (genetic) hyper-parameter search and report results.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid for the search.
    X, y : training data; ``y`` is flattened with ``ravel`` before fitting.

    Returns
    -------
    The fitted EvolutionaryAlgorithmSearchCV (previously discarded after
    printing).
    """
    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    print("performing genetic grid search ...")
    grid = EvolutionaryAlgorithmSearchCV(estimator=model,
                                         params=params,
                                         scoring="r2",
                                         # cv=StratifiedKFold(n_splits=2),
                                         verbose=True,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         tournament_size=3,
                                         generations_number=10,
                                         # pmap = pool.map,
                                         )
    grid.fit(X, y.ravel())  # fit the model and parameters
    # BUG FIX: the label claimed "Accuracy" but scoring above is "r2".
    print("Best Score: {}".format(grid.best_score_))
    # the best parameters that caused the best score
    print("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))
    print(pd.DataFrame(grid.cv_results_).sort_values("mean_test_score", ascending=False).head())
    return grid
def evo_search(xtrain, xtest, ytrain, ytest):
    """Evolutionary hyper-parameter search for an MLP classifier.

    Prints the search space, tunes MLPClassifier (f1-scored) on the training
    data, then reports statistics for the best estimator found.
    """
    hidden_sizes = []
    for width in range(10, 500, 100):
        hidden_sizes.append([width, width])
    print(hidden_sizes)
    search_space = {
        'activation': ['identity', 'logistic', 'tanh', 'relu'],
        'batch_size': [5, 10, 20, 50, 100],
        'learning_rate_init': [0.0001, 0.001, 0.01, 0.1],
        'hidden_layer_sizes': hidden_sizes,
    }
    print(search_space)
    print('Starting evolutionary search')
    search = EvolutionaryAlgorithmSearchCV(
        estimator=MLPClassifier(random_state=42, max_iter=20000),
        params=search_space,
        scoring=make_scorer(f1_score),
        verbose=10,
        population_size=20,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=10,
        n_jobs=1)
    search.fit(xtrain, ytrain.values.ravel())
    print_classifier_stats(search.best_estimator_, xtrain, xtest, ytrain, ytest)
    print('Evo search done...')
def SVM(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """Tune C/gamma for an SVC with an evolutionary search, then optionally
    train and evaluate on the PCA-transformed data.

    Writes the best score and parameters to ``SVM_best_parameters.txt``.
    When ``tune_only`` is False, refits an SVC with the best C/gamma on
    ``X_train_pca`` and emits a report via ``output_report``.
    """
    from sklearn.svm import SVC
    from sklearn.model_selection import cross_validate
    # Search C in [1, 10] and gamma in [3000, 4000]; presumably narrowed from
    # an earlier coarse search -- confirm.
    C_range = np.linspace(1, 10, 101)
    gamma_range = np.linspace(3000, 4000, 100)
    param_dist = dict(gamma=gamma_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(estimator=SVC(max_iter=200),
                                         params=param_dist,
                                         scoring="f1",
                                         cv=cv,
                                         verbose=1,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         gene_crossover_prob=0.5,
                                         tournament_size=3,
                                         generations_number=6,
                                         n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("SVM_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('C: {}\n'.format(rnds.best_estimator_.C))
    file.write('gamma: {}\n'.format(rnds.best_estimator_.gamma))
    file.close()
    if not tune_only:
        # apply best parameters (note: refit uses max_iter=200, matching the
        # tuning estimator)
        svc = SVC(max_iter=200, C=rnds.best_estimator_.C, gamma=rnds.best_estimator_.gamma, random_state=SEED)
        svc.fit(X_train_pca, y_train)
        sc_tr = cross_validate(svc, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(svc, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = svc.predict(X_test_pca)
        pred_train = svc.predict(X_train_pca)
        output_report("SVM", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
def LR2(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """Tune C and tol for L2 logistic regression with an evolutionary search,
    then optionally train and evaluate on the PCA-transformed data.

    Writes the best score and parameters to ``LR2_best_parameters.txt``.
    When ``tune_only`` is False, refits LogisticRegression with the best
    C/tol and emits a report via ``output_report``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate
    C_range = np.linspace(1, 50, 50)
    tol_range = np.linspace(0.001, 0.01, 50)
    param_dist = dict(tol=tol_range, C=C_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=LogisticRegression(penalty='l2'),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("LR2_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('C: {}\n'.format(rnds.best_estimator_.C))
    file.write('tol: {}\n'.format(rnds.best_estimator_.tol))
    file.close()
    if not tune_only:
        # apply best parameters
        # NOTE(review): the refit omits penalty='l2' (sklearn's default
        # anyway) -- confirm intentional.
        l2r = LogisticRegression(C=rnds.best_estimator_.C, tol=rnds.best_estimator_.tol, random_state=SEED)
        l2r.fit(X_train_pca, y_train)
        sc_tr = cross_validate(l2r, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(l2r, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = l2r.predict(X_test_pca)
        pred_train = l2r.predict(X_train_pca)
        output_report("LR2", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
def sk_params_search_best(
    clf,
    X,
    y,
    param_grid=None,
    method="gridsearch",
    param_search=None,
):
    """Search the best hyper-parameters for ``clf`` on (X, y).

    :param clf: estimator to tune.
    :param X: features.
    :param y: labels.
    :param param_grid: search grid; defaults to ``{"alpha": linspace(0, 1, 5)}``.
    :param method: "gridsearch" (exhaustive) or "genetic" (evolutionary).
    :param param_search: settings dict with keys ``scorename``, ``cv``,
        ``population_size``, ``generations_number``; defaults are used
        when None.
    :return: ``(best_score_, best_params_)`` of the chosen search; implicitly
        None for an unknown ``method``.

    Genetic defaults: population_size=5, gene_mutation_prob=0.10,
    gene_crossover_prob=0.5, tournament_size=3, generations_number=3.
    """
    # BUG FIX: the original used mutable default arguments (a dict and a
    # numpy array shared across calls); build fresh defaults per call.
    if param_grid is None:
        param_grid = {"alpha": np.linspace(0, 1, 5)}
    if param_search is None:
        param_search = {
            "scorename": "r2",
            "cv": 5,
            "population_size": 5,
            "generations_number": 3,
        }
    p = param_search
    myscore = sk_score_get(p["scorename"])

    if method == "gridsearch":
        from sklearn.model_selection import GridSearchCV

        grid = GridSearchCV(clf, param_grid, cv=p["cv"], scoring=myscore)
        grid.fit(X, y)
        return grid.best_score_, grid.best_params_

    if method == "genetic":
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        from sklearn.model_selection import StratifiedKFold

        # paramgrid = {"alpha": np.linspace(0,1, 20) , "l1_ratio": np.linspace(0,1, 20) }
        # BUG FIX: StratifiedKFold(y) passed the label array where n_splits
        # belongs (pre-0.18 sklearn API); use n_splits=p["cv"], matching the
        # gridsearch branch above.
        cv = EvolutionaryAlgorithmSearchCV(
            estimator=clf,
            params=param_grid,
            scoring=myscore,
            cv=StratifiedKFold(n_splits=p["cv"]),
            verbose=True,
            population_size=p["population_size"],
            gene_mutation_prob=0.10,
            gene_crossover_prob=0.5,
            tournament_size=3,
            generations_number=p["generations_number"],
        )
        cv.fit(X, y)
        return cv.best_score_, cv.best_params_
def ev_tree(self):
    """Tune a random forest over ``self.frst_space`` with the evolutionary
    search and store the best estimator on ``self.opt_frst``.
    """
    # BUG FIX: the original aliased self.default_evparams and then mutated
    # it, leaking this call's estimator/params into every other user of the
    # shared dict (e.g. ev_svm); work on a shallow copy instead.
    ev_params = dict(self.default_evparams)
    ev_params['estimator'] = RandomForestClassifier()
    ev_params['params'] = self.frst_space
    cv = EvolutionaryAlgorithmSearchCV(**ev_params)
    cv.fit(self.X_insample, self.y_insample)
    clf = cv.best_estimator_
    self.frst_called = True
    self.opt_frst = clf
def ev_svm(self):
    """Tune a probabilistic SVC over ``self.svm_space`` with the evolutionary
    search and store the best estimator on ``self.opt_svm``.
    """
    # BUG FIX: the original aliased self.default_evparams and then mutated
    # it, leaking this call's estimator/params into every other user of the
    # shared dict (e.g. ev_tree); work on a shallow copy instead.
    ev_params = dict(self.default_evparams)
    ev_params['estimator'] = SVC(probability=True)
    ev_params['params'] = self.svm_space
    cv = EvolutionaryAlgorithmSearchCV(**ev_params)
    cv.fit(self.X_insample, self.y_insample)
    clf = cv.best_estimator_
    self.svm_called = True
    self.opt_svm = clf
def NB(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """Tune BernoulliNB's alpha with an evolutionary search, then optionally
    train and evaluate on the PCA-transformed data.

    Writes the best score and alpha to ``NB_best_parameters.txt``. When
    ``tune_only`` is False, refits BernoulliNB with the best alpha and emits
    a report via ``output_report``.
    """
    from sklearn.naive_bayes import BernoulliNB
    from sklearn.model_selection import cross_validate
    # NOTE(review): linspace(0, 500, 500) includes alpha == 0 -- confirm the
    # lower bound shouldn't be strictly positive.
    alpha_range = np.linspace(0, 500, 500)
    param_dist = dict(alpha=alpha_range)
    # CONSISTENCY FIX: the sibling tuners (SVM, LR2, RF_DT, NN) seed this
    # split with random_state=SEED; do the same here so runs are reproducible.
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(estimator=BernoulliNB(),
                                         params=param_dist,
                                         scoring="f1",
                                         cv=cv,
                                         verbose=1,
                                         population_size=50,
                                         gene_mutation_prob=0.10,
                                         gene_crossover_prob=0.5,
                                         tournament_size=3,
                                         generations_number=6,
                                         n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("NB_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('alpha: {}\n'.format(rnds.best_estimator_.alpha))
    file.close()
    if not tune_only:
        # apply best parameters
        gnb = BernoulliNB(alpha=rnds.best_estimator_.alpha)
        gnb.fit(X_train_pca, y_train)
        sc_tr = cross_validate(gnb, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(gnb, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = gnb.predict(X_test_pca)
        pred_train = gnb.predict(X_train_pca)
        output_report("NB", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
def GA_tune_lgbm(cls, x, y):
    """Tune an LGBMClassifier on (x, y) with a genetic search over
    ``cls.lgbm_paramgrid`` using time-series CV; return the best params.
    """
    ga_search = EvolutionaryAlgorithmSearchCV(
        estimator=LGBMClassifier(),
        params=cls.lgbm_paramgrid,
        scoring="accuracy",
        cv=TimeSeriesSplit(n_splits=4),
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.2,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=20,
    )
    ga_search.fit(x, y)
    best = ga_search.best_params_
    return best
def geneticGridTest(self):
    """Run a genetic grid search over ``self.gridParams`` for
    ``self.classifier`` using the configured k-fold splitter.
    """
    print("performing Genetic grid search...")
    gridSearch = EvolutionaryAlgorithmSearchCV(estimator=self.classifier,
                                               params=self.gridParams,
                                               cv=self.kfold,
                                               scoring='accuracy',
                                               verbose=True,
                                               # BUG FIX: iid was passed as
                                               # the *string* 'False', which
                                               # is truthy; pass the boolean.
                                               iid=False,
                                               n_jobs=4,
                                               population_size=20,
                                               gene_mutation_prob=0.30,
                                               tournament_size=2,
                                               generations_number=5)
    gridSearch.fit(self.X, self.y)
def get_search(type, pipeline_configuration, hp_metric):
    """Construct a hyper-parameter search for the classifier pipeline.

    :param type: search strategy -- 'grid', 'randomized', or 'evolutionary'
        (note: the parameter name shadows the builtin ``type``).
    :param pipeline_configuration: supplies the feature-union and classifier
        pipelines plus all per-strategy settings (splits, seeds, sizes, ...).
    :param hp_metric: scoring metric handed to the search object.
    :return: ``(search, fu_pl)`` -- the unfitted search over the classifier
        pipeline and the feature-union pipeline; implicitly None for an
        unrecognized ``type``.
    """
    fu_pl, clf_pl = pipeline_configuration.pipelines()
    if type == 'grid':
        return GridSearchCV(
            clf_pl,
            pipeline_configuration.parameters('grid'),
            scoring=hp_metric,
            cv=pipeline_configuration.pipeline_parameters_grid_n_splits,
            refit=False,
            n_jobs=-1,
            verbose=1), fu_pl
    elif type == 'randomized':
        return RandomizedSearchCV(
            clf_pl,
            pipeline_configuration.parameters('randomized'),
            scoring=hp_metric,
            random_state=pipeline_configuration.
            pipeline_parameters_randomized_random_state,
            n_iter=pipeline_configuration.
            pipeline_parameters_randomized_n_iter,
            cv=pipeline_configuration.pipeline_parameters_randomized_n_splits,
            refit=False,
            n_jobs=3,
            #pre_dispatch=6,
            verbose=1), fu_pl
    elif type == 'evolutionary':
        # seed the stdlib RNG so the evolutionary run is reproducible
        from random import seed
        seed(pipeline_configuration.
             pipeline_parameters_evolutionary_random_seed)
        return EvolutionaryAlgorithmSearchCV(
            clf_pl,
            pipeline_configuration.parameters('evolutionary'),
            scoring=hp_metric,
            cv=pipeline_configuration.
            pipeline_parameters_evolutionary_n_splits,
            population_size=pipeline_configuration.
            pipeline_parameters_evolutionary_population_size,
            gene_mutation_prob=pipeline_configuration.
            pipeline_parameters_evolutionary_gene_mutation_prob,
            gene_crossover_prob=pipeline_configuration.
            pipeline_parameters_evolutionary_gene_crossover_prob,
            tournament_size=pipeline_configuration.
            pipeline_parameters_evolutionary_tournament_size,
            generations_number=pipeline_configuration.
            pipeline_parameters_evolutionary_generations_number,
            refit=False,
            n_jobs=-1,
            verbose=1), fu_pl
def grid_search(clf, params, scoring, search_method='ev'):
    """Fit ``clf`` on the module-level train_x/train_y, optionally tuning it.

    search_method 'grid' uses GridSearchCV, 'ev' the evolutionary search;
    anything else fits the classifier as-is. Returns the fitted object.
    """
    if search_method == 'grid':
        searcher = GridSearchCV(clf, param_grid=params, scoring=scoring,
                                cv=2, refit=True, n_jobs=2, verbose=2)
    elif search_method == 'ev':
        searcher = EvolutionaryAlgorithmSearchCV(clf,
                                                 params=params,
                                                 scoring=scoring,
                                                 cv=2,
                                                 refit=True,
                                                 n_jobs=2,
                                                 verbose=2,
                                                 population_size=8,
                                                 gene_mutation_prob=0.10,
                                                 gene_crossover_prob=0.5,
                                                 tournament_size=3,
                                                 generations_number=4)
    else:
        searcher = clf
    searcher.fit(train_x, train_y)
    return searcher
def RF_DT(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """Tune min_samples_leaf/max_depth for a random forest with an
    evolutionary search, then optionally train and evaluate on the
    PCA-transformed data.

    The forest size is tied to the feature count (1 + num_features/2).
    Writes the best score and parameters to ``RF_DT_best_parameters.txt``.
    When ``tune_only`` is False, refits the forest with the best settings,
    emits a report via ``output_report``, and returns ``(pred, pred_train)``;
    otherwise returns None.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier  # imported but unused here
    from sklearn.model_selection import cross_validate
    min_samples_leaf_range = np.round(np.linspace(1, 10, 10)).astype(int)
    max_depth_range = np.round(np.linspace(1, 30, 30)).astype(int)
    param_dist = dict(min_samples_leaf=min_samples_leaf_range, max_depth=max_depth_range)
    num_features = len(X_train_little[0])
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator = RandomForestClassifier(n_estimators=int((1+num_features/2))),
        params = param_dist,
        scoring = "f1",
        cv = cv,
        verbose = 1,
        population_size = 50,
        gene_mutation_prob = 0.10,
        gene_crossover_prob = 0.5,
        tournament_size = 3,
        generations_number = 6,
        n_jobs = 4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("RF_DT_best_parameters.txt","w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('min_samples_leaf: {}\n'.format(rnds.best_estimator_.min_samples_leaf))
    file.write('max_depth: {}\n'.format(rnds.best_estimator_.max_depth))
    file.close()
    if not tune_only:
        # apply best parameters RF
        rfc = RandomForestClassifier(n_estimators = int((1+num_features/2)),
                                     min_samples_leaf = rnds.best_estimator_.min_samples_leaf,
                                     max_depth = rnds.best_estimator_.max_depth,
                                     random_state = SEED)
        rfc.fit(X_train_pca,y_train)
        sc_tr = cross_validate(rfc, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(rfc, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = rfc.predict(X_test_pca)
        pred_train = rfc.predict(X_train_pca)
        output_report("RF", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
        return pred, pred_train
def tune(model, X, y, cv):
    """Tune min_samples_leaf (1-10) and max_depth (1-30) for a tree-based
    ``model`` with an evolutionary search; return the fitted search object.
    """
    leaf_candidates = np.round(np.linspace(1, 10, 10)).astype(int)
    depth_candidates = np.round(np.linspace(1, 30, 30)).astype(int)
    search_space = dict(min_samples_leaf=leaf_candidates,
                        max_depth=depth_candidates)
    searcher = EvolutionaryAlgorithmSearchCV(estimator=model,
                                             params=search_space,
                                             scoring="f1_weighted",
                                             cv=cv,
                                             verbose=1,
                                             population_size=50,
                                             gene_mutation_prob=0.10,
                                             gene_crossover_prob=0.5,
                                             tournament_size=3,
                                             generations_number=6,
                                             n_jobs=4)
    searcher.fit(X, y)
    return searcher
def main():
    """Run the sklearn-deap SVC example: a seeded evolutionary C/gamma
    search over the module-level X, y with 4-fold stratified CV.
    """
    search_grid = {
        "kernel": ["rbf"],
        "C": np.logspace(-9, 9, num=25, base=10),
        "gamma": np.logspace(-9, 9, num=25, base=10),
    }
    random.seed(1)
    from evolutionary_search import EvolutionaryAlgorithmSearchCV
    searcher = EvolutionaryAlgorithmSearchCV(estimator=SVC(),
                                             params=search_grid,
                                             scoring="accuracy",
                                             cv=StratifiedKFold(n_splits=4),
                                             verbose=1,
                                             population_size=5,
                                             gene_mutation_prob=0.10,
                                             gene_crossover_prob=0.5,
                                             tournament_size=3,
                                             generations_number=5,
                                             n_jobs=4)
    searcher.fit(X, y)
def tune(model, X, y, cv):
    """Tune C over the integers 1..10 for ``model`` with an evolutionary
    search; return the fitted search object.
    """
    c_candidates = np.round(np.linspace(1, 10, 10)).astype(int)
    search_space = dict(C=c_candidates)
    searcher = EvolutionaryAlgorithmSearchCV(estimator=model,
                                             params=search_space,
                                             scoring="f1_weighted",
                                             cv=cv,
                                             verbose=1,
                                             population_size=50,
                                             gene_mutation_prob=0.10,
                                             gene_crossover_prob=0.5,
                                             tournament_size=3,
                                             generations_number=6,
                                             n_jobs=4)
    searcher.fit(X, y)
    return searcher
gene_crossover_prob=0.25, tournament_size=2, generations_number=3, n_jobs=2) #print(model.wv.most_similar('sensitive')) cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist()) #cv.fit(MeanEmbeddingVectorizer(w2v).transform(X), y['section'].tolist()) ''' #--------------------------------- GA-SVC --------------------------------- paramgrid = {"C": np.logspace(-9, 9, num=25, base=10)} cv = EvolutionaryAlgorithmSearchCV(estimator=LinearSVC(), params=paramgrid, scoring="accuracy", cv=StratifiedKFold(n_splits=4), verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=5, n_jobs=4) cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
def NN(X_train_little, y_train_little, X_train_pca, X_test_pca, y_train, y_test, tune_only=False):
    """Tune an MLP classifier with an evolutionary search, then optionally
    train and evaluate on the PCA-transformed data.

    Writes the best score and hyper-parameters to ``MLP_best_parameters.txt``.
    When ``tune_only`` is False, refits an MLPClassifier with the best
    settings and emits a report via ``output_report``.
    """
    from sklearn.neural_network import MLPClassifier
    from sklearn.model_selection import cross_validate
    num_features = len(X_train_little[0])
    # prepare parameter grid
    alpha_range = np.linspace(0.005, 0.015, 50)
    learning_rate_range = np.linspace(0.01, 0.07, 50)
    epsilon_range = np.logspace(-9, -6, 50)
    beta_1_range = np.linspace(0.3, 0.7, 50)
    beta_2_range = np.linspace(0.3, 0.7, 50)
    a = int((num_features + 1) / 2)
    b = int((num_features + 1) / 2 + 10)
    med_layer_range = np.arange(a, b)
    # NOTE(review): hidden_layer_sizes here is a 3-tuple of candidate
    # *values* (num_features, the med_layer_range array, 1), so the search
    # picks one of those three objects as the whole layer spec -- this looks
    # like it was meant to be a list of (num_features, m, 1) tuples; confirm.
    param_dist = dict(alpha=alpha_range,
                      hidden_layer_sizes=(num_features, med_layer_range, 1),
                      learning_rate_init=learning_rate_range,
                      epsilon=epsilon_range,
                      beta_1=beta_1_range,
                      beta_2=beta_2_range)
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=SEED)
    rnds = EvolutionaryAlgorithmSearchCV(
        estimator=MLPClassifier(early_stopping=True),
        params=param_dist,
        scoring="f1",
        cv=cv,
        verbose=1,
        population_size=50,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=6,
        n_jobs=4)
    rnds.fit(X_train_little, y_train_little)
    # summarize the results of the random parameter search
    file = open("MLP_best_parameters.txt", "w")
    file.write("{}\n".format(rnds.best_score_))
    file.write('alpha: {}\n'.format(rnds.best_estimator_.alpha))
    file.write('hidden_layer_sizes: {}\n'.format(
        rnds.best_estimator_.hidden_layer_sizes))
    file.write('learning_rate_init: {}\n'.format(
        rnds.best_estimator_.learning_rate_init))
    file.write('epsilon: {}\n'.format(rnds.best_estimator_.epsilon))
    file.write('beta_1: {}\n'.format(rnds.best_estimator_.beta_1))
    file.write('beta_2: {}\n'.format(rnds.best_estimator_.beta_2))
    file.close()
    if not tune_only:
        # apply best parameters
        mlp = MLPClassifier(
            hidden_layer_sizes=rnds.best_estimator_.hidden_layer_sizes,
            early_stopping=True,
            alpha=rnds.best_estimator_.alpha,
            learning_rate_init=rnds.best_estimator_.learning_rate_init,
            epsilon=rnds.best_estimator_.epsilon,
            beta_1=rnds.best_estimator_.beta_1,
            beta_2=rnds.best_estimator_.beta_2,
            random_state=SEED)
        mlp.fit(X_train_pca, y_train)
        sc_tr = cross_validate(mlp, X_train_pca, y_train, scoring=SCORING, cv=5, return_train_score=False)
        sc_ts = cross_validate(mlp, X_test_pca, y_test, scoring=SCORING, cv=5, return_train_score=False)
        pred = mlp.predict(X_test_pca)
        pred_train = mlp.predict(X_train_pca)
        output_report("MLP", y_train, pred_train, y_test, pred, sc_tr, sc_ts)
from evolutionary_search import EvolutionaryAlgorithmSearchCV
from sklearn.linear_model import RandomizedLogisticRegression
from sklearn.linear_model import LogisticRegression

# Coarse log-spaced grid for the regularization strength C.
c_range = np.logspace(-1, 2, 30)
print('\nRegularization Parameter C initial grid:')
# BUG FIX: `print c_range` is Python-2-only syntax and a SyntaxError on
# Python 3; the call form behaves identically on both.
print(c_range)
param_dist = dict(C=c_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=2018)
rnds = EvolutionaryAlgorithmSearchCV(
    estimator=LogisticRegression(random_state=0),
    params=param_dist,
    scoring="f1",
    cv=cv,
    verbose=1,
    population_size=50,
    gene_mutation_prob=0.10,
    gene_crossover_prob=0.5,
    tournament_size=3,
    generations_number=6,
    n_jobs=4)
rnds.fit(X, y)
best_C = rnds.best_estimator_.C
# apply best parameters
# NOTE(review): RandomizedLogisticRegression was removed from modern
# scikit-learn, so this chunk pins an old sklearn version -- confirm.
lr = RandomizedLogisticRegression(C=best_C,
                                  random_state=0,
                                  sample_fraction=0.75,
                                  n_resampling=200,
                                  selection_threshold=0.25)
lr.fit(X, y)
#learner = input("Option? ") learner = True if learner: # SVM from sklearn.svm import SVC C_range = np.linspace(1, 10, 100) gamma_range = np.linspace(3000, 4000, 100) param_dist = dict(gamma=gamma_range, C=C_range) cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42) rnds = EvolutionaryAlgorithmSearchCV(estimator=SVC(max_iter=200), params=param_dist, scoring="f1", cv=cv, verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=6, n_jobs=4) rnds.fit(X_train_little, y_train_little) # summarize the results of the random parameter search print(rnds.best_score_) print('\nC: ') print(rnds.best_estimator_.C) print('\ngamma: ') print(rnds.best_estimator_.gamma) # apply best parameters svc = SVC(max_iter=400, C=rnds.best_estimator_.C,
# use a full grid over all parameters # run grid search grid_search = GridSearchCV(clf, param_grid=param_grid) start = time() grid_search.fit(X, y) print("GridSearchCV took %.2f seconds for %d candidate parameter settings." % (time() - start, len(grid_search.cv_results_['params']))) #report(grid_search.cv_results_) print(grid_search.best_score_) # run evolutionary_ evolution_search = EvolutionaryAlgorithmSearchCV( estimator=clf, params=param_grid, #scoring="accuracy", verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=4, ) start = time() evolution_search.fit(X, y) print("evolution_searchCV took %.2f seconds for %d candidates" " parameter settings." % ((time() - start), n_iter_search)) print(evolution_search.cv_results_)
'tournament_size': 4, 'generations_number': 100, 'n_jobs': 6 } # parallel processes # we create an instance of SVM and fit out data. We do not scale our # data since we want to plot the support vectors models = ( EvolutionaryAlgorithmSearchCV(estimator=SVC(), params={ "kernel": ["rbf"], "C": np.logspace(-9, 9, num=10000, base=10), "gamma": np.logspace(-9, 9, num=10000, base=10) }, **generic_args), EvolutionaryAlgorithmSearchCV(estimator=SVC(), params={ "kernel": ["linear"], "C": np.logspace(-9, 9, num=10000, base=10)
def main():
    """For each 2-letter section pair (A..H), load its training CSV and tune
    a LinearSVC over C with a genetic search on TF-IDF features.

    Also creates a preprocessing directory per pair under PREPROCESS_PATH.
    A GA-RF variant is kept below as a commented-out (triple-quoted) block.
    """
    rand_st = 42
    classes = ["A", "B", "C", "D", "E", "F", "G", "H"]
    from itertools import combinations
    subsets = []
    for subset in combinations(classes, 2):
        subsets.append(subset)
        try:
            os.makedirs(PREPROCESS_PATH + str(subset[0] + subset[1]))
        except:
            # directory probably exists already; errors are ignored
            pass
    for sub in subsets:
        PATH = "/home/bruno/base-wipo/preprocess-artigo/" + str(sub[0]) + str(
            sub[1]) + "/"
        print(" --------------------------" + str(sub[0]) + str(sub[1]) + "--------------------------------- ")
        treinamento = "treinamento.csv"
        # labels: the "section" column of the semicolon-separated CSV
        y = pd.read_csv(os.path.join(os.path.dirname(__file__),
                                     PATH + treinamento),
                        header=0,
                        delimiter=";",
                        usecols=["section"],
                        quoting=3)
        '''
        X = pd.read_csv(os.path.join(os.path.dirname(__file__),PATH+treinamento), header=0,delimiter=";",usecols=["data"], quoting=3)
        X = X["data"].tolist()
        '''
        # features: streamed from the CSV by the project iterator
        X = TideneIterCSVGA(PATH + treinamento)
        tfidf_transformer = TfidfVectorizer()
        n = len(y)
        random.seed(1)
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        '''
        #--------------------------------- GA-RF ---------------------------------
        from evolutionary_search import EvolutionaryAlgorithmSearchCV
        clf_RF_gs = RandomForestClassifier(random_state=rand_st, n_jobs=-1)
        clf_RF_pg = [{
            'max_depth': np.logspace(0.3,4,num = 10 ,base=10,dtype='int'), #[1, 5, 13, 34, 87, 226, 584, 1505, 3880, 10000]
            'n_estimators' : np.logspace(0.1,3,num = 10 ,base=10,dtype='int'), #[1, 2, 5, 11, 24, 51, 107, 226, 476, 1000]
            'min_samples_split' : np.logspace(0.4, 1, num=5, base=10, dtype='int'), #[2, 3, 5, 7, 10]
            'min_samples_leaf' : np.logspace(0.1,1,num = 4 ,base=9,dtype='int'), #[1, 2, 4, 9]
            'max_features' : ['auto', None]
        }]
        model_name = "100features_40minwords_10context"
        model = gensim.models.Word2Vec.load(model_name)
        w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}
        cv = EvolutionaryAlgorithmSearchCV(estimator=clf_RF_gs, params=clf_RF_pg, scoring="accuracy", cv=StratifiedKFold(n_splits=4), verbose=1, population_size=10, gene_mutation_prob=0.05, gene_crossover_prob=0.25, tournament_size=2, generations_number=3, n_jobs=2)
        #print(model.wv.most_similar('sensitive'))
        cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
        #cv.fit(MeanEmbeddingVectorizer(w2v).transform(X), y['section'].tolist())
        '''
        #--------------------------------- GA-SVC ---------------------------------
        paramgrid = {"C": np.logspace(-9, 9, num=25, base=10)}
        cv = EvolutionaryAlgorithmSearchCV(estimator=LinearSVC(),
                                           params=paramgrid,
                                           scoring="accuracy",
                                           cv=StratifiedKFold(n_splits=4),
                                           verbose=1,
                                           population_size=50,
                                           gene_mutation_prob=0.10,
                                           gene_crossover_prob=0.5,
                                           tournament_size=3,
                                           generations_number=5,
                                           n_jobs=4)
        out = cv.fit(tfidf_transformer.fit_transform(X), y['section'].tolist())
pass def score(self, Xt, yt): return roc_auc_score(yt, outdet.IsolationForest(Xt, contamination=0.1, behaviour='new', return_scores = True, **self.params)) params = { 'max_samples': list(range(100, 1000)), 'n_estimators': list(range(50,200)), 'max_features': list(range(1, data.shape[1])) } cv = EvolutionaryAlgorithmSearchCV ( estimator = odIfEstimator(), params = params, gene_type = [2, 2, 2], verbose = 1, population_size = 80, gene_mutation_prob = .1, gene_crossover_prob = .5, tournament_size = 3, generations_number = 8, # this is already validation set, no need for cross validation cv = ShuffleSplit(test_size=0.99, n_splits=1), n_jobs = 40) cv.fit(data, labels) params = { 'k': list(range(100,1000)), 'x': list(range(3,30)), 'qv': [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5] }
for element in columnsSJ: del train_SJ[element] del test_SJ[element] for element in columnsIQ: del train_IQ[element] del test_IQ[element] paramgrid = {"max_depth": range(1, 10), "criterion": ["mae"]} paramgrid2 = {"max_depth": range(1, 10), "criterion": ["mae"]} rtreeForSJ = EvolutionaryAlgorithmSearchCV( estimator=RandomForestRegressor(), params=paramgrid, scoring="neg_mean_absolute_error", cv=StratifiedKFold(n_splits=4), verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3, generations_number=5, n_jobs=1) rtreeForIQ = EvolutionaryAlgorithmSearchCV( estimator=RandomForestRegressor(), params=paramgrid2, scoring="neg_mean_absolute_error", cv=StratifiedKFold(n_splits=4), verbose=1, population_size=50, gene_mutation_prob=0.10, gene_crossover_prob=0.5, tournament_size=3,
# pip install sklearn-deap from evolutionary_search import EvolutionaryAlgorithmSearchCV parameters = {'xg__learning_rate': [0.03, 0.05], 'xg__n_estimators': [200, 300], 'xg__max_depth': [4,6], 'pca__n_components' : [25,30]} clf2 = EvolutionaryAlgorithmSearchCV( estimator=pipe_xg, # How will objective be evaluated params=parameters, # Parameters range scoring="accuracy", # Criteria cv=2, # No of folds verbose=True, population_size=50, gene_mutation_prob=0.10, tournament_size=3, generations_number=10 ) start = time.time() clf2.fit(X_train, y_train) # 1hr 2 minute end = time.time() (end-start)/60 clf2.best_params_