import os
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics, model_selection
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm


def train(features_npy: str, targets_csv: str, n_splits: int = 5,
          n_repeats: int = 10, logdir: str = '.', random_seed=82):
    # l1_ratio=1.0 makes the elastic-net penalty pure L1.
    model = LogisticRegression(
        penalty='elasticnet', C=1.0, class_weight='balanced',
        random_state=random_seed, solver='saga', max_iter=200,
        n_jobs=-1, l1_ratio=1.0)

    X = np.load(features_npy)
    df = pd.read_csv(targets_csv)
    y = df['label'].values

    logs = []
    splitter = model_selection.RepeatedStratifiedKFold(
        n_splits=n_splits, n_repeats=n_repeats, random_state=random_seed)
    # Note: RepeatedStratifiedKFold ignores a `groups` argument, so passing
    # groups=df['id'] (as the original did) had no effect and is dropped.
    pbar = tqdm(splitter.split(X, y), desc='folds',
                total=splitter.get_n_splits())
    for i, (train_index, valid_index) in enumerate(pbar):
        model_ = clone(model)
        X_train, X_test = X[train_index], X[valid_index]
        y_train, y_test = y[train_index], y[valid_index]
        model_.fit(X_train, y_train)
        preds = model_.predict_proba(X_test)[:, 1]
        logs.append({'auc': metrics.roc_auc_score(y_test, preds)})
        pbar.set_postfix(**logs[-1])

    auc_ = np.array([it['auc'] for it in logs])
    print(f'AUC (mean): {auc_.mean()}\tAUC (std): {auc_.std()}')
    with open(os.path.join(logdir, 'logs.pkl'), 'wb') as f:
        pickle.dump(logs, f)

    # train final model on all data
    model.fit(X, y)
    with open(os.path.join(logdir, 'model.pkl'), 'wb') as f:
        pickle.dump(model, f)
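# Hedged usage sketch; the file names below are hypothetical:
#
#   train('features.npy', 'targets.csv', n_splits=5, n_repeats=10, logdir='runs')
#
# The per-fold AUCs can then be recovered from the pickled log that train()
# writes:
def summarize_logs(logdir='runs'):
    # Load the per-fold records written by train() and report mean/std AUC.
    with open(os.path.join(logdir, 'logs.pkl'), 'rb') as f:
        logs = pickle.load(f)
    aucs = np.array([entry['auc'] for entry in logs])
    return aucs.mean(), aucs.std()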
def get_models_CV_scores(X_train, Y_train, models, cv_loops):
    # Spot-check algorithms with repeated stratified cross-validation and
    # evaluate each model in turn. SCORING, TRAIN_FILE, OUTPUT and config
    # are module-level globals.
    scores = []
    names = []
    results = []
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(
            n_repeats=cv_loops, n_splits=10)
        try:
            cv_results = model_selection.cross_val_score(
                model, X_train, Y_train, cv=kfold, n_jobs=-1, scoring=SCORING)
        except ValueError as e:
            print('ValueError: {}'.format(e))
        else:
            scores.append(cv_results)
            names.append(name)
            results.append({
                "model": SkFModel(name, model),
                "score": cv_results.mean()
            })

    # Compare algorithms with a boxplot of the fold scores.
    fig = plt.figure()
    plt.title('Source-{}, {} Attributes'.format(
        os.path.basename(TRAIN_FILE), X_train.shape[-1]), fontsize=10)
    plt.suptitle('Algorithm Comparison', fontsize=16)
    ax = fig.add_subplot(111)
    plt.boxplot(scores)
    ax.set_xticklabels(names)
    plt.ylabel('Score ({})'.format(SCORING), fontsize=14)
    if config['outputs']['save_charts']:
        fig.savefig(os.path.join(
            OUTPUT, '{}_Algorithm_Comparison.png'.format(X_train.shape[-1])),
            dpi=1200)
    if config['outputs']['show_charts']:
        plt.show()
    plt.close(fig)
    return results
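# Hedged usage sketch for get_models_CV_scores(); the estimators here are
# illustrative, and SCORING/TRAIN_FILE/OUTPUT/config must be defined as
# globals before calling:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

spot_check_models = [
    ('LR', LogisticRegression(max_iter=1000)),
    ('CART', DecisionTreeClassifier()),
]
# results = get_models_CV_scores(X_train, Y_train, spot_check_models, cv_loops=3)
# best = max(results, key=lambda r: r['score'])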
import numpy as np
import pandas as pd
from sklearn import model_selection, metrics
from tqdm import tqdm

repetitions = 10
datasets = h.datasets()      # h is an external helper module
clfs = h.classifiers()

for dataset in datasets:
    print(dataset)

    # Gather dataset (DataFrame.as_matrix() was removed in pandas 1.0;
    # to_numpy() is the replacement).
    ds = pd.read_csv(dataset[0], header=None).to_numpy()
    X, y = ds[:, :-1], ds[:, -1].astype("int")

    # CV
    for repetition in range(repetitions):
        cv = model_selection.RepeatedStratifiedKFold(
            n_splits=2, n_repeats=5, random_state=np.random.randint(9999))
        fold = 0
        k_accuracies = []
        for train, test in cv.split(X, y):
            fold_X_train, fold_y_train = X[train], y[train]
            fold_X_test, fold_y_test = X[test], y[test]
            clf_accuracies = []
            for clf_n in clfs:
                clf = clfs[clf_n]
                clf.fit(fold_X_train, fold_y_train)
                probas = clf.predict_proba(fold_X_test)
                prediction = np.argmax(probas, axis=1)
                accuracy = metrics.accuracy_score(fold_y_test, prediction)
                clf_accuracies.append(accuracy)
            k_accuracies.append(clf_accuracies)
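        # Hedged sketch (not in the original): reduce k_accuracies, now a
        # (n_folds x n_classifiers) list, to a mean accuracy per classifier
        # for this repetition.
        acc = np.asarray(k_accuracies)
        for clf_n, mean_acc in zip(clfs, acc.mean(axis=0)):
            print("{}: {:.3f}".format(clf_n, mean_acc))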
models.append(('HDDT', HDDT))
models.append(('RF', RF))
models.append(('HDRF', HDRF))
models.append(('ADADT', ADADT))
models.append(('ADASVM', ADASVM))
models.append(('BAGDT', BAGDT))
models.append(('BAGSVM', BAGSVM))
models.append(('XGB', XGB))

# COMPUTE RESULTS
results = []
names = []
scoring = make_scorer(metrics.fbeta_score, beta=2)
for name, model in models:
    repkfold = model_selection.RepeatedStratifiedKFold(
        n_splits=5, n_repeats=3, random_state=seed)
    cv_results_new = model_selection.cross_val_score(
        model, X_train, Y_train, cv=repkfold, scoring=scoring)
    results.append(cv_results_new)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results_new.mean(), cv_results_new.std())
    print(msg)

# BOXPLOT
fig = plt.figure()
ax = fig.add_subplot(111)
plt.boxplot(results, showmeans=True)
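# The plot is cut off above; a hedged completion that labels the boxplot the
# way the other comparison scripts in this collection do (an assumption):
ax.set_xticklabels(names)
plt.ylabel('F2 score')  # scoring above is fbeta_score with beta=2
plt.show()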
    'bank_r_b_124002971',
    'pay_by_ach',
    'bank_a_left_blank',
    'addres_z_84010'
]
target_col = 'good_loan'
thresh = 0.0

# INITIALIZE ------------------------------------------------------------------
# Load data
data = pd.read_csv(data_file, index_col='customer_id')

# Create evaluation folds (despite the name, fold_list is a splitter object,
# not a list; it yields 5x2 = 10 train/test index pairs)
fold_list = model_selection.RepeatedStratifiedKFold(
    n_splits=5, n_repeats=2, random_state=1111)

# LOOK AT DATA ----------------------------------------------------------------
print('Data Shape: ', data.shape)
# Print min/max/mean/std
print(data.agg(['min', 'mean', 'median', 'max', 'std']).transpose())
# Look at correlation
rfpimp.plot_corr_heatmap(data[train_cols], figsize=(10, 8))
plt.show(block=False)

# BUILD A CLASSIFIER -----------------------------------------------------------
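# fold_list is only created above; a minimal sketch of how it could be
# consumed under this header (the GradientBoostingClassifier and roc_auc
# scoring are assumptions, not from the original):
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier()
scores = model_selection.cross_val_score(
    clf, data[train_cols], data[target_col], cv=fold_list, scoring='roc_auc')
print('Mean AUC over the 5x2 folds:', scores.mean())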
    return history, test_acc


def get_generators(train_index, test_index, graph_labels, batch_size):
    train_gen = generator.flow(
        train_index,
        targets=graph_labels.iloc[train_index].values,
        batch_size=batch_size)
    test_gen = generator.flow(
        test_index,
        targets=graph_labels.iloc[test_index].values,
        batch_size=batch_size)
    return train_gen, test_gen


stratified_folds = model_selection.RepeatedStratifiedKFold(
    n_splits=folds, n_repeats=n_repeats).split(graph_labels, graph_labels)

for i, (train_index, test_index) in enumerate(stratified_folds):
    print(i)
    train_gen, test_gen = get_generators(
        train_index, test_index, graph_labels, batch_size=20)
    model = create_graph_classification_model(generator)
    history, acc = train_fold(model, train_gen, test_gen, es, epochs)
    test_accs.append(acc)

print(np.mean(test_accs))
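# A hedged follow-up (not in the original): report the spread across the
# repeated folds alongside the mean printed above.
print(f"Accuracy over all folds: {np.mean(test_accs) * 100:.1f}% "
      f"+/- {np.std(test_accs) * 100:.1f}%")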
Wrapper selection with RFE
"""
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

# Create estimator
estimator3 = RandomForestRegressor()

# Create selector with repeated 3-fold cross-validation (10 repetitions).
# SalePrice is continuous, so stratified splitting would raise a ValueError;
# RepeatedKFold replaces the original RepeatedStratifiedKFold here.
selector3 = RFECV(estimator3, min_features_to_select=1, step=1,
                  cv=model_selection.RepeatedKFold(n_splits=3, n_repeats=10),
                  scoring='r2', n_jobs=-1)
selector3 = selector3.fit(dataFrame3[colsNotSalePrice2],
                          dataFrame3["SalePrice"])
for col in colsNotSalePrice2[selector3.ranking_ == 1]:
    print("%s" % col, end=", ")

# Forward selection with SequentialFeatureSelector
from mlxtend import feature_selection

# SFS for random forests (the call was cut off in the original; it is
# completed here to match the full version that appears later in this
# collection)
sfs3 = feature_selection.SequentialFeatureSelector(
    estimator3, k_features=79, forward=True,
    scoring="r2",
    cv=model_selection.RepeatedKFold(n_splits=3, n_repeats=10),
    n_jobs=-1)
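# Hedged follow-up sketch (not in the original): RFECV also exposes the
# optimal feature count and a boolean support mask after fitting.
print("\nOptimal number of features:", selector3.n_features_)
print("Selected:", list(colsNotSalePrice2[selector3.support_]))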
    def fit_predict(self, X, y, test_X, categorical_feature,
                    num_iterations=150, early_stopping_rounds=30):
        random_seed = 2019
        model_idx = 0
        predicted = []
        random_seed += 1
        kfold = model_selection.RepeatedStratifiedKFold(
            n_splits=self.n_split, n_repeats=10, random_state=random_seed)
        data = lgb.Dataset(X, label=y,
                           categorical_feature=categorical_feature,
                           free_raw_data=True)
        # data initialization for time calculation
        lgb.train(self.params_list[0], data, 1,
                  categorical_feature=categorical_feature)
        for fold_idx, (train_index, valid_index) in enumerate(kfold.split(X, y)):
            self.timer.print("{} model learning".format(model_idx))
            learn_start_time = time.time()
            # Dataset.subset() avoids materializing per-fold copies of X/y,
            # which the commented-out slicing variant in the original did.
            train_data = data.subset(train_index)
            valid_data = data.subset(valid_index)
            random_seed += 1
            params = self.params_list[model_idx % len(self.params_list)]
            params["seed"] = random_seed
            model = lgb.train(
                params, train_data, num_iterations, valid_data,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=50, categorical_feature=categorical_feature,
                callbacks=[TimeoutCallback(self.timer, params['metric'])])
            self.models.append(model)
            print(model.current_iteration())
            predicted.append(model.predict(test_X))
            gc.collect()
            del train_data, valid_data
            gc.collect()
            # Extrapolate the cost of a full run from the iterations actually
            # trained, so the time-budget check below stays conservative.
            self.learning_time.append(
                (time.time() - learn_start_time) * num_iterations / min([
                    model.current_iteration() + early_stopping_rounds,
                    num_iterations
                ]))
            self.timer.print_memory_usage()
            if self.timer.time_remain < (1.5 * np.max(self.learning_time) +
                                         0.05 * self.timer.time_budget + 10):
                break
            model_idx += 1
            if model_idx >= self.max_model:
                break
        if len(predicted) > 0:
            return np.stack(predicted).mean(axis=0)
        else:
            return np.zeros(len(test_X))
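# Hedged usage sketch. The enclosing class is not shown in this excerpt (it
# carries self.n_split, self.params_list, self.timer, self.models,
# self.max_model and self.learning_time), so `ensemble` below is a
# hypothetical instance of it, and the column names are hypothetical too:
#
#   test_pred = ensemble.fit_predict(
#       X, y, test_X,
#       categorical_feature=['cat_0', 'cat_1'],
#       num_iterations=150, early_stopping_rounds=30)
#
# fit_predict() returns the mean of the per-fold test predictions, or zeros
# if the time budget ran out before any model finished.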
START_TIME = datetime.now().isoformat(timespec='minutes')
print(f'Start time is: {START_TIME}')
tpot_name = f'exported_pipeline.time.{START_TIME}.{args.feat}.tpot.py'
run_log = 'run_' + tpot_name

print("TPOT runscript written to:", runscripts_dir / run_log)
copyfile(os.path.realpath(__file__), runscripts_dir / run_log)

# Read in data (spatial, temporal, features)
import cnmfereview as cr

X_train, X_test, y_train, y_test = cr.set_up_remote_job(
    DATA_PATH, feature=args.feat)

# stratified K fold chooses the same proportion of labels per fold
kf = model_selection.RepeatedStratifiedKFold(
    n_splits=cv_folds, n_repeats=5, random_state=random_state)

config_dict = None  # 'TPOT light'
tpot = TPOTClassifier(population_size=population_size,
                      verbosity=2,
                      scoring='f1',
                      random_state=random_state,
                      cv=kf,
                      n_jobs=n_jobs,
                      max_time_mins=max_time_mins,
                      max_eval_time_mins=max_eval_time_mins,
                      config_dict=config_dict,
                      memory=None,
                      periodic_checkpoint_folder=checkpoint_path)

print(f'Starting TPOT training at: {START_TIME}')
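# The fit/export calls that presumably follow (a sketch: fit(), score() and
# export() are standard TPOTClassifier methods, but this exact usage is an
# assumption):
tpot.fit(X_train, y_train)
print('Hold-out score (scoring="f1"):', tpot.score(X_test, y_test))
tpot.export(tpot_name)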
print(dict(zip(unique, counts)))
# 0s are outweighed by 1s (ratio ~0.217), so models may predict 0s less
# reliably than 1s.

# general train/test split
Xtrain, Xtest, ytrain, ytest = ms.train_test_split(x_data, y_data,
                                                   random_state=1)

# feature selection to reduce unnecessary variables (note: the selector is
# fitted here but Xtrain is never transformed with it below)
selector = fs.SelectKBest(fs.f_classif, k=5)
selector.fit(Xtrain, ytrain)

# Support Vector Machine training w/o weight
from sklearn import svm

model = svm.SVC()
model.fit(Xtrain, ytrain)
y_model = model.predict(Xtest)

cv = ms.RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
print("SVM normal Accuracy Score", met.accuracy_score(ytest, y_model))  # accuracy
cross_val = ms.cross_val_score(model, Xtrain, ytrain, scoring='roc_auc',
                               cv=cv, n_jobs=-1)  # cross val
print("SVM normal Cross Validation Score", np.mean(cross_val))

# confusion matrix
mat = met.confusion_matrix(ytest, y_model)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true SVM(normal)')
plt.ylabel('predicted SVM(normal)')
plt.show()

# SVM with "weights"
from sklearn import svm
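# A plausible continuation for the truncated "SVM with weights" block (a
# hedged sketch, assuming class_weight='balanced' is the weighting meant
# above):
model_w = svm.SVC(class_weight='balanced')
model_w.fit(Xtrain, ytrain)
y_model_w = model_w.predict(Xtest)
print("SVM weighted Accuracy Score", met.accuracy_score(ytest, y_model_w))
cross_val_w = ms.cross_val_score(model_w, Xtrain, ytrain, scoring='roc_auc',
                                 cv=cv, n_jobs=-1)
print("SVM weighted Cross Validation Score", np.mean(cross_val_w))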
def k_fold_cross_validation(x, y, splits, repeats):
    seed = 7

    # classifiers for the ensemble
    # (penalty='l1' requires a solver that supports it; liblinear is added
    # because the current sklearn default, lbfgs, would raise an error)
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1',
                              solver='liblinear')
    clf2 = MultinomialNB(alpha=1130)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=450)
    clf5 = ExtraTreesClassifier(random_state=seed, criterion='gini',
                                n_estimators=1000, max_features=5)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2),
                                        ('NBG', clf3), ('KNN', clf4),
                                        ('ET', clf5), ('ADQ', clf6)],
                            voting='hard')

    # algorithms compared
    models = []
    models.append(('RL', LogisticRegression(random_state=seed, C=625,
                                            penalty='l1',
                                            solver='liblinear')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=450)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=1130)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=0.1)))
    models.append(('SVMR', SVC(random_state=seed, kernel='rbf', C=1,
                               gamma=0.0001)))
    models.append(('RF', RandomForestClassifier(random_state=seed,
                                                criterion='entropy',
                                                n_estimators=1000,
                                                max_features=5)))
    models.append(('ET', ExtraTreesClassifier(random_state=seed,
                                              criterion='gini',
                                              n_estimators=1000,
                                              max_features=5)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm
    score = 'accuracy'
    results1 = []
    names1 = []
    mean1 = []
    std1 = []
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(
            n_splits=splits, n_repeats=repeats, random_state=seed)
        cv_results = model_selection.cross_val_score(model, x, y, cv=kfold,
                                                     scoring=score)
        results1.append(cv_results)
        names1.append(name)
        mean1.append(cv_results.mean() * 100)
        std1.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100,
                               cv_results.std() * 100)
        print(msg)

    list_results_acc = list(zip(names1, results1))
    print(list_results_acc)
    df_results_acc = pd.DataFrame(list_results_acc)
    if part_ign == 3:
        df_results_acc.to_csv('df_results_acc_3.csv', sep=';')
    if part_ign == 10:
        df_results_acc.to_csv('df_results_acc_10.csv', sep=';')
    if part_ign == 19:
        df_results_acc.to_csv('df_results_acc_19.csv', sep=';')
    if score == 'accuracy':
        list_acc = list(zip(names1, mean1, std1))
        df_acc = pd.DataFrame(list_acc)
        if part_ign == 3:
            df_acc.to_csv('df_acc_3.csv', sep=';')
        if part_ign == 10:
            df_acc.to_csv('df_acc_10.csv', sep=';')
        if part_ign == 19:
            df_acc.to_csv('df_acc_19.csv', sep=';')

    # classifiers for the ensemble (second configuration)
    clf1 = LogisticRegression(random_state=seed, C=625, penalty='l1',
                              solver='liblinear')
    clf2 = MultinomialNB(alpha=15)
    clf3 = GaussianNB()
    clf4 = KNeighborsClassifier(n_neighbors=10)
    clf5 = ExtraTreesClassifier(random_state=seed, criterion='entropy',
                                n_estimators=1000, max_features=17)
    clf6 = QuadraticDiscriminantAnalysis()
    eclf = VotingClassifier(estimators=[('LR', clf1), ('NBM', clf2),
                                        ('NBG', clf3), ('KNN', clf4),
                                        ('ET', clf5), ('ADQ', clf6)],
                            voting='hard')

    models = []
    models.append(('RL', LogisticRegression(random_state=seed, C=625,
                                            penalty='l1',
                                            solver='liblinear')))
    models.append(('ADL', LinearDiscriminantAnalysis()))
    models.append(('ADQ', QuadraticDiscriminantAnalysis()))
    models.append(('KNN', KNeighborsClassifier(n_neighbors=10)))
    models.append(('NBG', GaussianNB()))
    models.append(('NBM', MultinomialNB(alpha=15)))
    models.append(('SVML', SVC(random_state=seed, kernel='linear', C=10)))
    models.append(('SVMR', SVC(random_state=seed, kernel='rbf', C=10,
                               gamma=0.001)))
    models.append(('RF', RandomForestClassifier(random_state=seed,
                                                criterion='gini',
                                                n_estimators=1000,
                                                max_features=17)))
    models.append(('ET', ExtraTreesClassifier(random_state=seed,
                                              criterion='entropy',
                                              n_estimators=1000,
                                              max_features=17)))
    models.append(('ENS', eclf))

    # loop that evaluates each algorithm
    score = 'f1_macro'
    results2 = []
    names2 = []
    mean2 = []
    std2 = []
    for name, model in models:
        kfold = model_selection.RepeatedStratifiedKFold(
            n_splits=splits, n_repeats=repeats, random_state=seed)
        cv_results = model_selection.cross_val_score(model, x, y, cv=kfold,
                                                     scoring=score)
        results2.append(cv_results)
        names2.append(name)
        mean2.append(cv_results.mean() * 100)
        std2.append(cv_results.std() * 100)
        msg = "%s: %f (%f)" % (name, cv_results.mean() * 100,
                               cv_results.std() * 100)
        print(msg)

    list_results_f1 = list(zip(names2, results2))
    print(list_results_f1)
    df_results_f1 = pd.DataFrame(list_results_f1)
    if part_ign == 3:
        df_results_f1.to_csv('df_results_f1_3.csv', sep=';')
    if part_ign == 10:
        df_results_f1.to_csv('df_results_f1_10.csv', sep=';')
    if part_ign == 19:
        # fixed: the original wrote df_results_f1_10.csv here as well
        df_results_f1.to_csv('df_results_f1_19.csv', sep=';')
    if score == 'f1_macro':
        list_f1 = list(zip(names2, mean2, std2))
        df_f1 = pd.DataFrame(list_f1)
        if part_ign == 3:
            df_f1.to_csv('df_f1_3.csv', sep=';')
        if part_ign == 10:
            df_f1.to_csv('df_f1_10.csv', sep=';')
        if part_ign == 19:
            df_f1.to_csv('df_f1_19.csv', sep=';')

    # plotting
    fig = plt.figure(figsize=(15, 5))
    ax1 = fig.add_subplot(211)
    ax2 = fig.add_subplot(212)
    plt.subplot(211)
    plt.boxplot(results1)
    ax1.set_xticklabels(names1, fontsize=14)
    plt.ylabel('Accuracy', fontsize=18)
    plt.xlabel('(a)', fontsize=18)
    plt.yticks(rotation='horizontal', fontsize=14)
    # the deprecated hold= keyword from the original axhline calls is dropped
    plt.axhline(y=0.4656, xmin=0, xmax=1, color='g')
    plt.axhline(y=0.5024, xmin=0, xmax=1, color='b')
    plt.subplot(212)
    plt.xlabel('(b)\nClassifiers', fontsize=18)
    plt.boxplot(results2)
    plt.ylabel('F1-score', fontsize=18)
    ax2.set_xticklabels(names2, fontsize=14)
    plt.yticks(rotation='horizontal', fontsize=14)
    ax2.annotate(
        'RL = Logistic Regression\nADL = Linear Discriminant Analysis\n'
        'ADQ = Quadratic Discriminant Analysis\nKNN = K-Nearest Neighbors\n'
        'NBG = Gaussian Naive Bayes\nNBM = Multinomial Naive Bayes\n'
        'SVML = Linear SVM\nSVMR = RBF-kernel SVM\nRF = Random Forest\n'
        'ET = Extra Trees',
        # The point that we'll place the text in relation to
        xy=(1.01, 0.5),
        # Interpret the x as axes coords, and the y as figure coords
        xycoords=('axes fraction', 'figure fraction'),
        # The distance from the point that the text will be at
        xytext=(0, 0),
        # Interpret `xytext` as an offset in points...
        textcoords='offset points',
        # Any other text parameters we'd like
        size=12, ha='left', va='center')
    plt.show()
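# Hedged usage sketch (x, y and the part_ign global are defined outside this
# function in the original script):
#
#   k_fold_cross_validation(x, y, splits=10, repeats=3)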
import pickle
import sys
import warnings

import numpy as np
import pylab
from sklearn import model_selection, neighbors
from sklearn.preprocessing import scale

import descritores as desc

warnings.simplefilter("ignore")

ss = pylab.loadtxt(sys.argv[1])
path = sys.argv[2]
dim = ss.shape[1] - 8

with open(path + "classes.txt", "rb") as f:
    with open(path + "names.pkl", "rb") as g:
        cl = pickle.load(f)
        nomes = pickle.load(g)

clf = neighbors.KNeighborsClassifier(n_neighbors=3)
it = model_selection.RepeatedStratifiedKFold(n_splits=5, n_repeats=50)

for s in ss:
    sigma = s[4:4 + dim]
    SI, DB, CH = s[dim + 4], s[dim + 5], s[dim + 6]
    db = {}
    for im_file in nomes:
        nmbe = desc.bendenergy(path + im_file, sigma)
        db[im_file] = np.hstack((cl[im_file], np.log(nmbe())))
    # figure names
    Y = np.array([db[i][0] for i in db.keys()]).astype(int)
    X = scale(np.array([db[i][1:] for i in db.keys()]))
    res = model_selection.cross_val_score(clf, X, Y, cv=it,
                                          scoring="accuracy")
    st = "{0} {1} {2} {3} {4} {5}".format(s[1], s[2], s[3], SI, DB, CH)
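    # Hedged sketch of the step that presumably follows inside the loop:
    # report the CV summary alongside the parameter string (the exact output
    # format is an assumption, not from the original).
    print(st, res.mean(), res.std())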
# from sklearn.ensemble import RandomForestRegressor
# from sklearn import model_selection
# # Create estimator
# estimator3 = RandomForestRegressor()
# # Create selector with repeated 4-fold cross-validation (10 repetitions)
# selector3 = RFECV(estimator3, step=1,
#                   cv=model_selection.RepeatedKFold(n_splits=4, n_repeats=10),
#                   scoring='r2', n_jobs=-1)
# selector3 = selector3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"])
# pd.to_pickle(selector3, "./PickledObjects/selector3.pkl")

from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from mlxtend import feature_selection

estimator = RandomForestRegressor()
# SalePrice is continuous, so RepeatedKFold replaces the original
# RepeatedStratifiedKFold (stratification needs class labels).
sfs3 = feature_selection.SequentialFeatureSelector(
    estimator, k_features=79, forward=True, scoring="r2",
    cv=model_selection.RepeatedKFold(n_splits=3, n_repeats=10), n_jobs=-1)
sfs4 = sfs3.fit(dataFrame3[colsNotSalePrice2], dataFrame3["SalePrice"])
pd.to_pickle(sfs4, "./PickledObjects/sfs4.pkl")
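# Hedged follow-up sketch (not in the original): mlxtend's fitted selector
# exposes the chosen subset and its cross-validated score.
print("Best CV r2:", sfs4.k_score_)
print("Selected features:", sfs4.k_feature_names_)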