import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import BaggingClassifier


def decision_tree_bagging(Xtrain, Xtest, ytrain, ytest, ensemble_size=60):
    """Bagging of decision trees: plot test accuracy against ensemble size."""
    accuracies = []
    ensemble_sizes = []
    # note: this tries ensemble sizes 1 .. ensemble_size - 1
    for i in range(1, ensemble_size):
        bagging = BaggingClassifier(
            base_estimator=tree.DecisionTreeClassifier(),
            n_estimators=i,
            bootstrap=True,
            max_samples=1.0,
            max_features=1.0)
        bagging.fit(Xtrain, ytrain)
        ypred = bagging.predict(Xtest)
        accuracy = np.mean(ypred == ytest)
        ensemble_sizes.append(i)
        accuracies.append(accuracy)
    plt.plot(ensemble_sizes, accuracies)
    plt.xlabel('number of estimators')
    plt.ylabel('accuracy')
    plt.grid(True)
    plt.title('Decision tree (bagging)')
    plt.show()
    print('Highest accuracy of bagging = %f' % np.max(accuracies))
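
# A minimal sketch of how the function above might be invoked; the Iris data
# and split below are assumptions for illustration, not part of the original.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X_iris, y_iris = load_iris(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X_iris, y_iris, test_size=0.3,
                                      random_state=0)
decision_tree_bagging(Xtr, Xte, ytr, yte, ensemble_size=30)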
def bagging_cv(X_train, y_train, seed, verbose=3):
    # Results:
    #                  DEFAULT   Z-SCORE   OUTLIERS
    # n_estimators         250       150        150
    # warm_start          True      True       True
    # max_samples          0.6       0.6        0.6
    # --------------------------------------------
    # f1-micro          0.9220    0.9268     0.9403
    clf = BaggingClassifier(n_estimators=140, random_state=seed)
    params = {
        'n_estimators': list(range(100, 1500, 50)),
        'warm_start': [True, False],
        'max_samples': [0.6, 0.8, 1.0]
    }
    gCV = GridSearchCV(estimator=clf, param_grid=params, scoring='f1_micro',
                       n_jobs=-1, refit=True, cv=3, verbose=verbose,
                       return_train_score=False)  # 'warn' was deprecated; use a boolean
    return gCV.fit(X_train.values, y_train)
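
# A hedged usage sketch for bagging_cv, assuming BaggingClassifier and
# GridSearchCV are in scope as above. The synthetic DataFrame is an
# assumption for illustration (the function calls X_train.values, so it
# expects a pandas DataFrame); the grid is large, so shrink it for a quick
# smoke test.
import pandas as pd
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=300, random_state=0)
search = bagging_cv(pd.DataFrame(X_demo), y_demo, seed=42, verbose=0)
print(search.best_params_)  # best (n_estimators, warm_start, max_samples)
print(search.best_score_)   # mean 3-fold f1_micro of the best candidate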
class BaggingClassifierImpl():

    def __init__(self, base_estimator=None, n_estimators=10, max_samples=1.0,
                 max_features=1.0, bootstrap=True, bootstrap_features=False,
                 oob_score=False, warm_start=False, n_jobs=None,
                 random_state=None, verbose=0):
        self._hyperparams = {
            'base_estimator': make_sklearn_compat(base_estimator),
            'n_estimators': n_estimators,
            'max_samples': max_samples,
            'max_features': max_features,
            'bootstrap': bootstrap,
            'bootstrap_features': bootstrap_features,
            'oob_score': oob_score,
            'warm_start': warm_start,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'verbose': verbose}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
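
# A hedged usage sketch for the wrapper above. SKLModel and
# make_sklearn_compat belong to the surrounding codebase (a lale-style
# sklearn wrapper); here we assume SKLModel is sklearn's BaggingClassifier
# and use an identity stand-in for make_sklearn_compat so the sketch can run
# standalone.
from sklearn.datasets import make_classification
from sklearn.ensemble import BaggingClassifier as SKLModel

make_sklearn_compat = lambda est: est  # identity stand-in, an assumption

X_w, y_w = make_classification(n_samples=200, random_state=0)
impl = BaggingClassifierImpl(n_estimators=25, random_state=0)
impl.fit(X_w, y_w)
print(impl.predict(X_w[:5]), impl.predict_proba(X_w[:2]).shape)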
def ensemble():
    pipeline = Pipeline([
        ('count_vectorizer',
         CountVectorizer(binary=True,
                         ngram_range=(1, 2),
                         max_features=15000,
                         stop_words=stopwords)),
        ('clf',
         VotingClassifier(
             estimators=[
                 ('nb', BaggingClassifier(MultinomialNB(alpha=0.2))),
                 ('lr', BaggingClassifier(
                     LogisticRegression(class_weight='balanced', C=10,
                                        n_jobs=2))),
                 # ('rf', RandomForestClassifier(n_estimators=200,
                 #                               max_features='log2',
                 #                               class_weight='balanced',
                 #                               n_jobs=2)),
             ],
             n_jobs=2,
             voting='soft',
             weights=[1, 1])),
    ])
    train_report(pipeline)
def performClassification(dataset, split, symbol, output_dir, forecast_out):
    """Perform classification with various algorithms."""
    predicted_values = []
    features = dataset.columns[:-1]
    index = int(np.floor(dataset.shape[0] * split))
    train, test, test_forecast = (dataset[:index],
                                  dataset[index:-forecast_out],
                                  dataset[-forecast_out:])
    # dataset_all, test_forecast = dataset[:-forecast_out], dataset[-forecast_out:]
    # test = dataset_all.sample(frac=0.025)
    # train = dataset_all.loc[~dataset_all.index.isin(test.index)]
    log.info('-' * 80)
    log.info('%s train set: %s, test set: %s', symbol, train.shape, test.shape)
    predicted_values.append(str(symbol))
    predicted_values.append(str(train.shape))
    predicted_values.append(str(test.shape))
    # train, test = getFeatures(train[features],
    #                           train[output], test[features], 16)
    out_params = (symbol, output_dir)
    output = dataset.columns[-1]
    classifiers = [
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        SVC(degree=100, C=10000),
        BaggingClassifier(),
        AdaBoostClassifier(),
        neighbors.KNeighborsClassifier(),
        GradientBoostingClassifier(n_estimators=100),
        # QDA(),
    ]
    for classifier in classifiers:
        model_name, forecast_set, accuracy = benchmark_classifier(
            classifier, train, test, test_forecast, features, symbol,
            output, out_params)
        log.info('%s, %s, %s, %s', symbol, model_name, forecast_set, accuracy)
        predicted_values.append(str(round(forecast_set.ravel()[0], 3)))
        predicted_values.append(str(round(accuracy, 3)))
    return predicted_values
def defaultModels(df_xmat, df_ymat_cat):
    #### representative common classifiers in sklearn ####
    classifiers = [
        GaussianNB(),
        LogisticRegression(max_iter=500),
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(kernel='rbf'),
        AdaBoostClassifier(),
        BaggingClassifier(),
        ExtraTreesClassifier(),
        GradientBoostingClassifier(),
        RandomForestClassifier(),
    ]
    cv = StratifiedKFold(n_splits=10)
    res = []
    for clf in classifiers:
        print('processing...' + str(clf)[:10])
        metrics_cv = []
        for train_index, test_index in cv.split(df_xmat.values, df_ymat_cat):
            X_train = df_xmat.iloc[train_index, :].values
            X_test = df_xmat.iloc[test_index, :].values
            y_train = [df_ymat_cat[i] for i in train_index]
            y_test = [df_ymat_cat[i] for i in test_index]
            clf.fit(X_train, y_train)
            metrics_cv.append(clf.score(X_test, y_test))
        res.append([
            str(clf)[:10],
            np.array(metrics_cv).mean(axis=0),
            np.array(metrics_cv).std(axis=0)
        ])
    return res
def __init__(self):
    self.random_rate = 33
    clf1 = SVC(C=1.0, random_state=33)
    clf2 = XGBClassifier(n_estimators=220, learning_rate=0.2,
                         min_child_weight=2.3)
    clf3 = RandomForestClassifier(n_estimators=80, random_state=330, n_jobs=-1)
    clf4 = BaggingClassifier(n_estimators=40, random_state=101)
    clf5 = AdaBoostClassifier(n_estimators=70, learning_rate=1.5,
                              random_state=33)
    clf6 = GradientBoostingClassifier(n_estimators=250, learning_rate=0.23,
                                      random_state=33)
    clf7 = XGBClassifier(n_estimators=100, learning_rate=0.12,
                         min_child_weight=1)
    base_model = [
        ['svc', clf1],
        ['xgbc', clf2],
        ['rfc', clf3],
        ['bgc', clf4],
        ['adbc', clf5],
        ['gdbc', clf6],
    ]
    self.base_models = base_model
    self.XGB = clf7  # kept out of base_models, presumably the meta-learner
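
# A hedged sketch of how base models plus a final XGB could be combined with
# sklearn's StackingClassifier. The original class likely implements its own
# stacking loop, so this is an illustrative assumption, not its actual code.
from sklearn.datasets import make_classification
from sklearn.ensemble import (BaggingClassifier, RandomForestClassifier,
                              StackingClassifier)
from sklearn.svm import SVC
from xgboost import XGBClassifier

X_s, y_s = make_classification(n_samples=300, random_state=0)
stack = StackingClassifier(
    estimators=[('svc', SVC(C=1.0, random_state=33)),
                ('rfc', RandomForestClassifier(n_estimators=80,
                                               random_state=330)),
                ('bgc', BaggingClassifier(n_estimators=40, random_state=101))],
    final_estimator=XGBClassifier(n_estimators=100, learning_rate=0.12,
                                  min_child_weight=1),
    cv=3)
stack.fit(X_s, y_s)
print(stack.score(X_s, y_s))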
    tuned_parameters = [{'n_neighbors': [3, 5, 7],
                         'weights': ['uniform', 'distance'],
                         'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                         'p': [1, 2, 3]}]
    algo = KNeighborsClassifier()
elif choice == 'g' or choice == 'G':
    print("\n**********************************\n")
    print(" \t Bagging")
    tuned_parameters = [{'n_estimators': [5, 10, 100, 200],
                         'max_features': [1, 3, 9],
                         'max_samples': [1, 5, 9, 21],
                         'random_state': [1, 2, 3, 5]}]
    algo = BaggingClassifier()
elif choice == 'h' or choice == 'H':
    print("\n**********************************\n")
    print(" \t Random Forest")
    tuned_parameters = [{'n_estimators': [5, 10, 100, 200],
                         'criterion': ['gini', 'entropy'],
                         'max_features': ['log2', 'sqrt'],
                         'max_depth': [10, 100]}]
    algo = RandomForestClassifier()
elif choice == 'i' or choice == 'I':
    print("\n**********************************\n")
    print(" \t AdaBoost Classifier")
    tuned_parameters = [{'n_estimators': [5, 10, 50, 100, 200],
def all_classifier_models():
    models = []
    metrix = []
    c_report = []
    train_accuracy = []
    test_accuracy = []
    models.append(('LogisticRegression',
                   LogisticRegression(solver='liblinear', multi_class='ovr')))
    models.append(('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()))
    models.append(('KNeighborsClassifier', KNeighborsClassifier()))
    models.append(('DecisionTreeClassifier', DecisionTreeClassifier()))
    models.append(('GaussianNB', GaussianNB()))
    models.append(('RandomForestClassifier',
                   RandomForestClassifier(n_estimators=100)))
    models.append(('SVM', SVC(gamma='auto')))
    models.append(('Linear_SVM', LinearSVC()))
    models.append(('XGB', XGBClassifier()))
    models.append(('SGD', SGDClassifier()))
    models.append(('Perceptron', Perceptron()))
    models.append(('ExtraTreeClassifier', ExtraTreeClassifier()))
    models.append(('OneClassSVM', OneClassSVM(gamma='auto')))
    models.append(('NuSVC', NuSVC()))
    models.append(('MLPClassifier',
                   MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)))
    models.append(('RadiusNeighborsClassifier',
                   RadiusNeighborsClassifier(radius=2.0)))
    models.append(('OutputCodeClassifier',
                   OutputCodeClassifier(
                       estimator=RandomForestClassifier(random_state=0),
                       random_state=0)))
    models.append(('OneVsOneClassifier',
                   OneVsOneClassifier(
                       estimator=RandomForestClassifier(random_state=1))))
    models.append(('OneVsRestClassifier',
                   OneVsRestClassifier(
                       estimator=RandomForestClassifier(random_state=1))))
    models.append(('LogisticRegressionCV', LogisticRegressionCV()))
    models.append(('RidgeClassifierCV', RidgeClassifierCV()))
    models.append(('RidgeClassifier', RidgeClassifier()))
    models.append(('PassiveAggressiveClassifier',
                   PassiveAggressiveClassifier()))
    models.append(('GaussianProcessClassifier', GaussianProcessClassifier()))
    models.append(('HistGradientBoostingClassifier',
                   HistGradientBoostingClassifier()))
    estimators = [('rf', RandomForestClassifier(n_estimators=10,
                                                random_state=42)),
                  ('svr', make_pipeline(StandardScaler(),
                                        LinearSVC(random_state=42)))]
    models.append(('StackingClassifier',
                   StackingClassifier(estimators=estimators,
                                      final_estimator=LogisticRegression())))
    clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    clf3 = GaussianNB()
    models.append(('VotingClassifier',
                   VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3)],
                                    voting='hard')))
    models.append(('AdaBoostClassifier', AdaBoostClassifier()))
    models.append(('GradientBoostingClassifier', GradientBoostingClassifier()))
    models.append(('BaggingClassifier', BaggingClassifier()))
    models.append(('ExtraTreesClassifier', ExtraTreesClassifier()))
    models.append(('CategoricalNB', CategoricalNB()))
    models.append(('ComplementNB', ComplementNB()))
    models.append(('BernoulliNB', BernoulliNB()))
    models.append(('MultinomialNB', MultinomialNB()))
    models.append(('CalibratedClassifierCV', CalibratedClassifierCV()))
    models.append(('LabelPropagation', LabelPropagation()))
    models.append(('LabelSpreading', LabelSpreading()))
    models.append(('NearestCentroid', NearestCentroid()))
    models.append(('QuadraticDiscriminantAnalysis',
                   QuadraticDiscriminantAnalysis()))
    models.append(('GaussianMixture', GaussianMixture()))
    models.append(('BayesianGaussianMixture', BayesianGaussianMixture()))
    test_accuracy = []
    names = []
    for name, model in models:
        try:
            m = model
            m.fit(X_train, y_train)
            y_pred = m.predict(X_test)
            train_acc = round(m.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test, y_pred) * 100
            c_report.append(classification_report(y_test, y_pred))
            test_accuracy.append(test_acc)
            names.append(name)
            metrix.append([name, train_acc, test_acc])
        except Exception as exc:  # a bare except would also swallow KeyboardInterrupt
            print("Exception Occurred :", name, exc)
    return metrix, test_accuracy, names
elif validacao[k] == validacao[1]:
    x_val, y_val = validacaoInstanciasFaceis(x_train, y_train, n_vizinhos)
elif validacao[k] == validacao[2]:
    x_val, y_val = validacaoInstanciasDificeis(x_train, y_train, n_vizinhos)
# 3.3. End #####################################################################

# 3.4. Instantiating the classifiers ###########################################
########## instantiating the Bagging+REP model #################################
# model number in the results table
num_model = 0
# instantiating the classifier
ensemble = BaggingClassifier(base_estimator=Perceptron(),
                             max_samples=qtd_amostras,
                             max_features=1.0,
                             n_estimators=qtd_modelos)
# training the model
ensemble.fit(x_train, y_train)
# pruning the ensemble
ensemble = REP(x_val, y_val, ensemble)
# computing the prediction
pred = ensemble.predict(x_test)
# computing the ensemble diversity (note: the variable is named double_fault,
# but the measure requested is 'disagreement')
q_statistic = MedidasDiversidade('q', x_val, y_val, ensemble)
double_fault = MedidasDiversidade('disagreement', x_val, y_val, ensemble)
# Normalization (L1 & L2): # NOTE: Change 'normtype' value to 'l1' / 'l2' to change normalization type: normtype = 'l2'#'l1' # model_selection is used for manually enabling the individual models. # NOTE: Setting boolean value, eanbles/disables model. model_selection = { 'ExtraTrees': ( True, ExtraTreesClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=False, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None) ), 'RandomForest': ( True, RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1) ), 'AdaBoost': ( True, AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None) ), 'DecisionTree': ( True, DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=5, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort=False) ), 'GradientBoosting': (True, GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='auto', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001) ), 'BernoulliNB': (True, BernoulliNB(alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None) ), 'BaggingClassifier': (True, BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0) ), 'NearestNeighbors': (True, KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None) ), # (n_neighbors=4) ), 'LogisticRegressionCV': (True, LogisticRegressionCV(Cs=10, fit_intercept=True, cv='warn', dual=False, penalty='l2', scoring=None, solver='lbfgs', tol=0.0001, max_iter=100, class_weight=None, n_jobs=None, verbose=0, refit=True, intercept_scaling=1.0, multi_class='warn', random_state=None, l1_ratios=None) ), 'LDA': (True, LinearDiscriminantAnalysis(solver='svd', shrinkage=None, priors=None, n_components=None, store_covariance=False, tol=0.0001) ), 'LogisticRegression': (True, LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='warn', max_iter=100, multi_class='warn', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None) ), 'CalibratedClassifierCV': (True, CalibratedClassifierCV(base_estimator=None, method='sigmoid', cv='warn') ), 'LinearSVC': (True, LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=0.0001, C=1.0, multi_class='ovr', fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000) ), 'LinearSVM': ( True, SVC(kernel='linear', C=0.025) ), # (C=0.01, penalty='l1', dual=False) ), 'RBF_SVM': (True, SVC(gamma='auto') ),#gamma=2, C=1) ), # 'Nu_SVM': (True, NuSVC(gamma='auto') ), 'GaussianProcess': (False, GaussianProcessClassifier() ), #(1.0 * RBF(1.0)) ), 'NeuralNet': (True, 
MLPClassifier(alpha=1, max_iter=1000) ), 'QDA': (True, QuadraticDiscriminantAnalysis() ), 'NaiveBayes': (True, GaussianNB() ), 'RadiusNeighborsClassifier': (True, RadiusNeighborsClassifier() ), 'SGDClassifier': (True, SGDClassifier() ),
def build_audit(classifier, name, with_proba=True):  # signature inferred from the calls below
    classifier.fit(audit_X, audit_y)
    store_pkl(classifier, name + ".pkl")
    adjusted = DataFrame(classifier.predict(audit_X), columns=["Adjusted"])
    if with_proba == True:
        adjusted_proba = DataFrame(classifier.predict_proba(audit_X),
                                   columns=["probability_0", "probability_1"])
        adjusted = pandas.concat((adjusted, adjusted_proba), axis=1)
    store_csv(adjusted, name + ".csv")

build_audit(DecisionTreeClassifier(random_state=13, min_samples_leaf=5),
            "DecisionTreeAudit")
build_audit(BaggingClassifier(DecisionTreeClassifier(random_state=13,
                                                     min_samples_leaf=5),
                              random_state=13, n_estimators=3,
                              max_features=0.5),
            "DecisionTreeEnsembleAudit")
build_audit(ExtraTreesClassifier(random_state=13, min_samples_leaf=5),
            "ExtraTreesAudit")
build_audit(GradientBoostingClassifier(random_state=13, loss="exponential",
                                       init=None),
            "GradientBoostingAudit")
build_audit(LinearDiscriminantAnalysis(solver="lsqr"),
            "LinearDiscriminantAnalysisAudit")
build_audit(LogisticRegressionCV(), "LogisticRegressionAudit")
build_audit(BaggingClassifier(LogisticRegression(), random_state=13,
                              n_estimators=3, max_features=0.5),
            "LogisticRegressionEnsembleAudit")
# NOTE: these private-module import paths (sklearn.manifold.t_sne etc.) only
# exist in older scikit-learn releases; the DeprecationWarnings they raise
# are silenced deliberately below.
from sklearn.manifold.t_sne import TSNE
from sklearn.linear_model.theil_sen import TheilSenRegressor
from sklearn.mixture.dpgmm import VBGMM
from sklearn.feature_selection.variance_threshold import VarianceThreshold

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

clf_dict = {'ARDRegression': ARDRegression(),
            'AdaBoostClassifier': AdaBoostClassifier(),
            'AdaBoostRegressor': AdaBoostRegressor(),
            'AdditiveChi2Sampler': AdditiveChi2Sampler(),
            'AffinityPropagation': AffinityPropagation(),
            'AgglomerativeClustering': AgglomerativeClustering(),
            'BaggingClassifier': BaggingClassifier(),
            'BaggingRegressor': BaggingRegressor(),
            'BayesianGaussianMixture': BayesianGaussianMixture(),
            'BayesianRidge': BayesianRidge(),
            'BernoulliNB': BernoulliNB(),
            'BernoulliRBM': BernoulliRBM(),
            'Binarizer': Binarizer(),
            'Birch': Birch(),
            'CCA': CCA(),
            'CalibratedClassifierCV': CalibratedClassifierCV(),
            'DBSCAN': DBSCAN(),
            'DPGMM': DPGMM(),
            'DecisionTreeClassifier': DecisionTreeClassifier(),
            'DecisionTreeRegressor': DecisionTreeRegressor(),
            'DictionaryLearning': DictionaryLearning(),
            'ElasticNet': ElasticNet(),
                                      verbose=0, warm_start=False),
    'DecisionTreeClassifier': DecisionTreeClassifier(max_depth=9,
                                                     random_state=123,
                                                     splitter="best",
                                                     criterion="gini"),
    'KNeighborsClassifier': KNeighborsClassifier(algorithm='auto',
                                                 leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=1, n_neighbors=5,
                                                 p=2, weights='uniform'),
    'RandomForestClassifier': RandomForestClassifier(n_estimators=100,
                                                     random_state=123,
                                                     max_depth=9,
                                                     criterion="gini"),
    'GaussianNB': GaussianNB(priors=None),
    'SVC': SVC(C=1.0, kernel='linear', probability=True, random_state=124),
    'MLPClassifier': MLPClassifier(alpha=1, max_iter=1000, random_state=124),
    'BaggingClassifier': BaggingClassifier(random_state=124)
}
    PowerTransformer(method='yeo-johnson'),
    # PowerTransformer(method='box-cox'),
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

# ================= Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(features, target),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Imports OK')

# %%
# ================= Looping here
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
def main():
    # 1. Defining the experiment variables ####################################
    qtd_modelos = 100
    qtd_execucoes = 30
    qtd_amostras = 0.9
    qtd_folds = 10
    n_vizinhos = 7
    nome_datasets = ['kc1', 'kc2']
    # 1. End ##################################################################

    # loop over the datasets
    for h in range(len(nome_datasets)):
        # 2. Reading the datasets #############################################
        # reading the dataset
        data = pd.read_csv('dataset/' + nome_datasets[h] + '.csv')
        # extracting the patterns and their respective labels
        df_x = np.asarray(data.iloc[:, 0:-1])
        df_y = np.asarray(data.iloc[:, -1])

        # 2.1. Creating the table to store the results ########################
        tabela = Tabela_excel()
        tabela.Criar_tabela(nome_tabela='arquivos_lista03/' + nome_datasets[h],
                            folhas=['OLA', 'LCA', 'KNORA-E', 'KNORA-U',
                                    'Arquitetura'],
                            cabecalho=['acuracy', 'auc', 'fmeasure', 'gmean'],
                            largura_col=5000)
        # 2.1. End ############################################################
        # 2. End ##############################################################

        # running the algorithms several times
        for j in range(qtd_execucoes):
            # 3. Splitting the data into training and test sets ###############
            # splitting the dataset without overlap: 90% training, 10% test
            # (the original used the pre-0.18 StratifiedKFold(y, n_folds=...)
            # API; the split() call below is the modern equivalent)
            skf = StratifiedKFold(n_splits=qtd_folds)
            # taking the first fold's training and test indices
            train_index, test_index = next(iter(skf.split(df_x, df_y)))
            # building the training and test sets
            x_train = df_x[train_index]
            y_train = df_y[train_index]
            x_test = df_x[test_index]
            y_test = df_y[test_index]
            # 3. End ##########################################################

            # 4. Generating the pool of classifiers ###########################
            # instantiating the classifier
            ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                         max_samples=qtd_amostras,
                                         max_features=1.0,
                                         n_estimators=qtd_modelos)
            # training the model
            ensemble.fit(x_train, y_train)
            # 4. End ##########################################################

            # 5. Instantiating the classifiers ################################
            ############################### OLA ###############################
            executar_modelo('OLA', x_train, y_train, x_test, y_test,
                            ensemble.estimators_, n_vizinhos, nome_datasets,
                            h, j, tabela)
            ############################### LCA ###############################
            executar_modelo('LCA', x_train, y_train, x_test, y_test,
                            ensemble.estimators_, n_vizinhos, nome_datasets,
                            h, j, tabela)
            ############################### KNORAE ############################
            executar_modelo('KNORAE', x_train, y_train, x_test, y_test,
                            ensemble.estimators_, n_vizinhos, nome_datasets,
                            h, j, tabela)
            ############################### KNORAU ############################
            executar_modelo('KNORAU', x_train, y_train, x_test, y_test,
                            ensemble.estimators_, n_vizinhos, nome_datasets,
                            h, j, tabela)
            ############################ Arquitetura ##########################
            # instantiating the method
            arq = Arquitetura(n_vizinhos)
            # training the method
            arq.fit(x_train, y_train)
            # making the prediction
            pred = arq.predict(x_test)
            # printing the results
            nome = 'Arquitetura'
            acuracia, auc, f1measure, gmean = printar_resultados(
                y_test, pred,
                nome_datasets[h] + '-' + nome + '-[' + str(j) + ']')
            # writing out the obtained results
            tabela.Adicionar_Sheet_Linha(4, j,
                                         [acuracia, auc, f1measure, gmean])
import pandas as pd
from sklearn import tree
from sklearn.ensemble import BaggingClassifier  # public path; sklearn.ensemble.bagging is private/removed

train = pd.read_csv("train.csv")
train.drop(['Cabin'], axis=1, inplace=True)
train = train.dropna()
y = train['Survived']
train.drop(['Survived', 'PassengerId', 'Name', 'Ticket'], axis=1,
           inplace=True)
train = train.fillna({'Age': 30})  # fillna is not in-place; assign the result
X = pd.get_dummies(train)

bag_clf = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=200,
    bootstrap=True,  # True => bagging, False => pasting
    n_jobs=-1        # use all cores
)
bag_clf.fit(X, y)

test = pd.read_csv('test.csv')
ids = test[['PassengerId']]
test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
test.fillna(2, inplace=True)
test = pd.get_dummies(test)

predictions = bag_clf.predict(test)
results = ids.assign(Survived=predictions)
results.to_csv('titanic_result_bagging.csv', index=False)
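
# Optional sanity check (a sketch, not part of the original script): with
# bootstrap=True, the out-of-bag score estimates generalization accuracy
# without needing a separate validation split.
oob_clf = BaggingClassifier(
    tree.DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=200,
    bootstrap=True,
    oob_score=True,  # evaluate each tree on the samples it never saw
    n_jobs=-1
)
oob_clf.fit(X, y)
print('OOB accuracy estimate:', oob_clf.oob_score_)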
    (SGDRegressor(), ['predict'], create_regression_problem_1()),
    (Lasso(), ['predict'], create_regression_problem_1()),
    (Pipeline([('earth', Earth()), ('logistic', LogisticRegression())]),
     ['predict', 'predict_proba'], create_weird_classification_problem_1()),
    (FeatureUnion([('earth', Earth()), ('earth2', Earth(max_degree=2))],
                  transformer_weights={'earth': 1, 'earth2': 2}),
     ['transform'], create_weird_classification_problem_1()),
    (RandomForestRegressor(), ['predict'], create_regression_problem_1()),
    (CalibratedClassifierCV(LogisticRegression(), 'isotonic'),
     ['predict_proba'], create_weird_classification_problem_1()),
    (AdaBoostRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingRegressor(), ['predict'], create_regression_problem_1()),
    (BaggingClassifier(), ['predict_proba'],
     create_weird_classification_problem_1()),
    (GradientBoostingRegressor(verbose=True), ['predict'],
     create_regression_problem_1(m=100000, n=200)),
    (XGBRegressor(), ['predict'], create_regression_problem_for_xgb_1())
]


# Create tests for the numpy_flat language
def create_case_numpy_flat(estimator, methods, fit_data, predict_data,
                           export_predict_data):
    def test_case(self):
        model = clone(estimator)
        model.fit(**fit_data)
        for method in methods:
            pred = getattr(model, method)(**predict_data)
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

# %%
# ================= Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10,
                      random_state=0).fit(X, y),
    ExtraTreesClassifier(n_estimators=100, random_state=0),
    HistGradientBoostingClassifier(),
    MLPClassifier(random_state=1, max_iter=300),
    OneVsOneClassifier(LinearSVC(random_state=0)),
    OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                         random_state=0)
]
print('Imports OK')

# %%
count = 0
dict_test = {}
dict_all = {}
for i in range(len(scaler)):
    scaler_i = scaler[i]
    for j in range(len(classifier_test)):
pred_nb = gc_clf_nb.predict(X_test)
accuracy_nb = accuracy_score(y_test, pred_nb)
precision_nb = precision_score(y_test, pred_nb, average='weighted')
f1_score_nb = f1_score(y_test, pred_nb, average='weighted')
recall_score_nb = recall_score(y_test, pred_nb, average='weighted')

print("####FOR NB######")
print("Accuracy: ", accuracy_nb)
print("Precision:", precision_nb)
print("F1 Score:", f1_score_nb)
print("Recall Score:", recall_score_nb)

# In[14]:

pipe_bag = Pipeline([
    ('vect', CountVectorizer()),
    ('tfdf', TfidfTransformer()),
    ('boost', BaggingClassifier(base_estimator=naive_bayes.MultinomialNB()))
])

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfdf__use_idf': (True, False),
}

gc_clf_bc = GridSearchCV(pipe_bag, parameters, n_jobs=1)
gc_clf_bc = gc_clf_bc.fit(X_train, y_train)
print(gc_clf_bc.best_score_)
print(gc_clf_bc.best_params_)

# In[15]:

pred_bc = gc_clf_bc.predict(X_test)
accuracy_bc = accuracy_score(y_test, pred_bc)
                                       n_estimators=10, max_features=1)))
if ",MLPC," in Functions:
    models.append(('MLPC', MLPClassifier(alpha=0.1)))
if ",ABC," in Functions:
    models.append(('ABC', AdaBoostClassifier()))
if ",GNB," in Functions:
    models.append(('GNB', GaussianNB()))
if ",QDA," in Functions:
    models.append(('QDA', QuadraticDiscriminantAnalysis()))
if ",GBC," in Functions:
    models.append(('GBC', GradientBoostingClassifier()))
if ",ETC," in Functions:
    models.append(('ETC', ExtraTreeClassifier()))
if ",BC," in Functions:
    models.append(('BC', BaggingClassifier()))
if ",SGDC," in Functions:
    models.append(('SGDC', SGDClassifier()))
if ",RC," in Functions:
    models.append(('RC', RidgeClassifier()))
if ",PAC," in Functions:
    models.append(('PAC', PassiveAggressiveClassifier()))
if ",ETSC," in Functions:
    models.append(('ETSC', ExtraTreesClassifier()))
if ",BNB," in Functions:
    models.append(('BNB', BernoulliNB()))
if ",GM," in Functions:
    models.append(('GM', GaussianMixture()))

from sklearn.model_selection import KFold
from collections import Counter
class Arquitetura:

    def __init__(self, n_vizinhos):
        '''
        :n_vizinhos: number of nearest neighbors used to define the region
        of competence
        '''
        self.n_vizinhos = n_vizinhos

    def kDN(self, x, y):
        '''
        Computes the degree of difficulty (kDN) of each observation in a
        dataset.
        :param: x: data patterns
        :param: y: respective labels
        :return: dificuldades: vector with each instance's difficulty
        '''
        # instantiating the nearest-neighbors searcher
        nbrs = NearestNeighbors(n_neighbors=self.n_vizinhos + 1,
                                algorithm='ball_tree').fit(x)
        # list for the difficulty values
        dificuldades = []
        # for each instance in the dataset
        for i in range(len(x)):
            # computing the instance's nearest neighbors
            _, indices = nbrs.kneighbors([x[i]])
            # counting the neighbors with a different label
            cont = 0
            for j in indices[0]:
                if j != i and y[j] != y[i]:
                    cont += 1
            # computing the percentage
            dificuldades.append(cont / (self.n_vizinhos + 1))
        return dificuldades

    def neighbors(self, dsel, x_query):
        '''
        Returns only the indices of the neighbors.
        '''
        # instantiating the nearest-neighbors searcher
        nbrs = NearestNeighbors(n_neighbors=self.n_vizinhos + 1,
                                algorithm='ball_tree').fit(dsel)
        # computing the query's nearest neighbors
        _, indices = nbrs.kneighbors([x_query])
        return indices

    def hardInstances(self, x, y, limiar):
        '''
        Returns the subset containing only the hard instances (difficulty
        above the threshold).
        :param: x: data patterns
        :param: y: respective labels
        :return: x_new, y_new
        '''
        # computing the difficulty of each instance
        dificuldades = self.kDN(x, y)
        # lists for the selected instances
        x_new = []
        y_new = []
        # keeping only the hard instances
        for i in range(len(dificuldades)):
            if dificuldades[i] > limiar:
                x_new.append(x[i])
                y_new.append(y[i])
        return np.asarray(x_new), np.asarray(y_new)

    def neighborhoodDifficulty(self, dsel, x_query, H):
        '''
        Computes the difficulty of the neighborhood.
        :dsel: dataset in which the neighbors are searched
        :x_query: instance to be queried
        :H: difficulty of the dsel dataset
        '''
        # obtaining the example's neighborhood
        indices = self.neighbors(dsel, x_query)[0]
        # difficulty of the region
        dificuldades = [H[i] for i in indices]
        # minimum difficulty of the region
        return np.min(dificuldades)

    def defineThreshold(self, indices):
        '''
        Defines the difficulty threshold.
        :indices: indices of the misclassified instances
        '''
        # neighborhood difficulty of each misclassified example
        lista = []
        for i in indices:
            lista.append(self.neighborhoodDifficulty(self.x_train,
                                                     self.x_train[i],
                                                     self.H))
        return np.mean(lista)

    def fit(self, x, y):
        '''
        Trains the two-level architecture.
        :x: training data
        :y: data labels
        '''
        # storing the training data
        self.x_train = x
        self.y_train = y
        # storing the instances' difficulties
        self.H = self.kDN(x, y)

        # training level 1 ####################################################
        self.levelone = KNeighborsClassifier(self.n_vizinhos)
        self.levelone.fit(x, y)
        # predicting the training set
        y_pred = self.levelone.predict(x)
        # storing the indices of the misclassified instances
        indices = [i for i in range(len(y)) if y_pred[i] != y[i]]
        # obtaining the problem's difficulty threshold
        self.limiar = self.defineThreshold(indices)
        #######################################################################

        # training level 2 ####################################################
        # obtaining the hard instances
        x_dificeis, y_dificeis = self.hardInstances(x, y, self.limiar)
        # creating the ensemble
        self.ensemble = BaggingClassifier(base_estimator=Perceptron(),
                                          max_samples=0.9,
                                          max_features=1.0,
                                          n_estimators=100)
        self.ensemble.fit(x_dificeis, y_dificeis)
        # training the level-2 model
        self.leveltwo = KNORAU(self.ensemble.estimators_, self.n_vizinhos)
        self.leveltwo.fit(x_dificeis, y_dificeis)
        # checking whether OLA gets right the examples the SVM got wrong
        #######################################################################

    def predict_svm(self, x):
        # to predict multiple examples
        if len(x.shape) > 1:
            # returning all labels
            return [self.levelone.predict(np.array([pattern]))[0]
                    for pattern in x]
        # to predict only one example
        else:
            return self.levelone.predict(np.array([x]))[0]

    def predict_ola(self, x):
        # to predict multiple examples
        if len(x.shape) > 1:
            # returning all labels
            return [self.leveltwo.predict(np.array([pattern]))[0]
                    for pattern in x]
        # to predict only one example
        else:
            return self.leveltwo.predict(np.array([x]))[0]

    def predict_one(self, x):
        '''
        Predicts a single example.
        :x: pattern to be predicted
        '''
        # difficulty of the region around the example
        media = self.neighborhoodDifficulty(self.x_train, x, self.H)
        # checking the instance's difficulty
        if media >= self.limiar:
            return self.leveltwo.predict(np.array([x]))[0]
        else:
            return self.levelone.predict(np.array([x]))[0]

    def predict(self, x):
        '''
        Predicts one or more examples.
        :x: pattern(s) to be predicted
        '''
        # to predict multiple examples
        if len(x.shape) > 1:
            # returning all labels
            return [self.predict_one(pattern) for pattern in x]
        # to predict only one example
        else:
            return self.predict_one(x)
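
# A hedged usage sketch for the two-level architecture above. KNORAU is
# assumed to be the surrounding project's implementation (its constructor
# takes a list of estimators plus a neighborhood size, unlike deslib's API),
# and the class's sklearn dependencies are assumed to be in scope, so only
# the calling pattern is illustrated.
import numpy as np
from sklearn.datasets import make_classification

X_a, y_a = make_classification(n_samples=400, random_state=7)
arq = Arquitetura(n_vizinhos=7)
arq.fit(X_a, y_a)              # level 1: kNN; level 2: bagged Perceptrons + KNORA-U
preds = arq.predict(X_a[:10])  # each query is routed by neighborhood difficulty
print(np.mean(preds == y_a[:10]))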
kernel_pca = KernelPCA(n_components=150)  # costs huge amounts of RAM
randomized_pca = RandomizedPCA(n_components=500)

# REGRESSORS
random_forest_regressor = RandomForestRegressor(n_estimators=256)
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=60)
support_vector_regressor = svm.SVR()

# CLASSIFIERS
support_vector_classifier = svm.SVC(probability=True, verbose=True)
linear_support_vector_classifier = svm.LinearSVC(dual=False)
nearest_neighbor_classifier = KNeighborsClassifier()
extra_trees_classifier = ExtraTreesClassifier(n_estimators=256)
bagging_classifier = BaggingClassifier(
    base_estimator=GradientBoostingClassifier(n_estimators=200,
                                              max_features=4),
    max_features=0.5,
    n_jobs=2,
    verbose=1)
gradient_boosting_classifier = GradientBoostingClassifier(n_estimators=200,
                                                          max_features=4,
                                                          learning_rate=0.3,
                                                          verbose=0)
random_forest_classifier = RandomForestClassifier(n_estimators=2)
logistic_regression = LogisticRegression(C=0.5)
ridge_classifier = RidgeClassifier(alpha=0.1, solver='svd')
bayes = MultinomialNB()
sgd = SGDClassifier()
boundary_forest = BoundaryForestClassifier(num_trees=4)

# FEATURE UNION
feature_union = FeatureUnion(transformer_list=[('PCA', pca)])
x = dfbalanceado.iloc[:, 1:6]
y = dfbalanceado.iloc[:, 6:7]

# Data split with 80% dedicated to training and 20% to test.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=1)

# Load a new DataFrame with the data we need to predict.
datosFinal = pd.read_csv('data/nuevos_individuos_credito.csv',
                         delimiter=',', decimal='.')

# Slice out the feature columns, replacing X_test.
X_test = datosFinal.iloc[:, 1:6]

# Configure the different level-1 classifiers for the stacking methodology.
models = [
    BaggingClassifier(),
    SVC(),
    ExtraTreeClassifier(),
    KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    RandomForestClassifier(random_state=0, n_jobs=-1, n_estimators=100),
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                  n_estimators=100, max_depth=3)
]

S_train, S_test = stacking(models, X_train, y_train.values.ravel(), X_test,
                           regression=False,
# from sklearn.ensemble import AdaBoostClassifier as Boost
from sklearn.ensemble import BaggingClassifier as Boost  # public path; sklearn.ensemble.bagging is private/removed
from sklearn.naive_bayes import GaussianNB

from csxdata import CData
from SciProjects.grapes import path, indepsn

if __name__ == '__main__':
    data = CData(path, indepsn, feature="evjarat", headers=1, cross_val=0.2,
                 lower=True)
    data.transformation = "std"
    model = Boost(GaussianNB(), n_estimators=100)
    model.fit(data.learning, data.lindeps)
    preds = model.predict(data.testing)
    eq = [left == right for left, right in zip(preds, data.tindeps)]
    print("Acc:", sum(eq) / len(eq))
            y_train = df_y[train_index]
            x_test = df_x[test_index]
            y_test = df_y[test_index]
            # 3.1. End ########################################################

            # 3.2. Instantiating the classifiers ##############################
            # 3.2.1. Bagging with DecisionTree ################################
            # model number in the results table
            num_model = 0
            # the model
            bg = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                   max_samples=pct_trainamento[i],
                                   max_features=1.0,
                                   n_estimators=qtd_modelos)
            # training the model
            bg.fit(x_train, y_train)
            # computing the prediction
            pred = bg.predict(x_test)
            # printing the results
            acuracia, auc, f1measure, gmean = printar_resultados(
                y_test, pred,
                nome_datasets[h] + '-pct-' + str(pct_trainamento[i]) +
                '- Bagging com DecisionTree [' + str(j) + ']')
            # writing out the obtained results
            tabela.Adicionar_Sheet_Linha(num_model, j,
        plt.figure()
        plt.barh(pos, feature_importance[sorted_idx], align='center')
        plt.yticks(pos, BigFeaturenames[sorted_idx])
        plt.xlabel('Relative Importance')
        plt.title('Variable Importance based on bagging method')
        plt.show()
    except Exception:
        print('Feature importances are not shown')

if 0:
    # note: the original banner said 'RandomForestClassifier', but this block
    # actually bags an SVM pipeline with BaggingClassifier
    print(' '.join(['*' * 25, 'BaggingClassifier (SVM base)', '*' * 25, '\n']))
    from sklearn.ensemble import BaggingClassifier  # public path; sklearn.ensemble.bagging is private/removed
    clf_svm0 = SVC(C=10, kernel='rbf', gamma=0.1, probability=True,
                   decision_function_shape='ovr', random_state=seed,
                   class_weight='balanced')
    pipe_svm0 = Pipeline([('scaler', Scaler()), ('clf', clf_svm0)])
    clf_bg = BaggingClassifier(base_estimator=pipe_svm0, n_estimators=10,
                               max_samples=1.0, max_features=1.0,
                               random_state=seed)
    start = time.time()
    clf_bg = clf_bg.fit(X_train, Y_train)
    print('Total running time is {}s'.format(time.time() - start))
    judge = cross_val_score(clf_bg, X, Y, groups=None,
                            scoring=Evaluate(score_func), cv=5)
    # print('Cross-validation score is {}'.format(judge))
    print('Mean cross-validation score is {}'.format(judge.mean()))

# Boosting method: GradBoost
if 1:
    print(' '.join(