import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier


def train_UsingExtraTreesClassifier(df, header, x_train, y_train, x_test, y_test):
    # Training. 'entropy' (information gain) was also tried, but 'gini' gave a
    # marginally better fit, both in sample and out of sample. bootstrap=True is
    # required for the out-of-bag estimate below; compute_importances was removed
    # from scikit-learn (feature_importances_ is always available after fit).
    clf = ExtraTreesClassifier(n_estimators=200, random_state=0, criterion='gini',
                               bootstrap=True, oob_score=True)
    clf.fit(x_train, y_train)

    # Estimation of goodness of fit
    print("Estimation of goodness of fit using the ExtraTreesClassifier is : %f \n" % clf.score(x_test, y_test))
    print("Estimation of out of bag score using the ExtraTreesClassifier is : %f \n\n" % clf.oob_score_)

    # Getting parameters back, if needed
    clf.get_params()

    # Get the vector of predicted labels back (predict returns labels, not probabilities)
    y_test_predicted = clf.predict(x_test)

    # Feature frame = all columns except the target (last entry of header)
    X = df.drop(columns=[header[-1]])
    feature_importance = clf.feature_importances_

    # On a scale of 10 - make importances relative to max importance and plot them
    feature_importance = 10.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance)  # indices that would sort the array
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(12, 6))
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X.columns[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
    return y_test_predicted
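# --- Usage sketch (hypothetical): a toy call of train_UsingExtraTreesClassifier.
# --- The frame, column names, and 80/20 split below are illustrative assumptions,
# --- not part of the original script.
if __name__ == '__main__':
    import pandas as pd
    from sklearn.model_selection import train_test_split

    rng = np.random.RandomState(0)
    demo = pd.DataFrame({'f1': rng.rand(100), 'f2': rng.rand(100),
                         'label': rng.randint(0, 2, 100)})
    hdr = list(demo.columns)  # the last entry is treated as the target
    x_tr, x_te, y_tr, y_te = train_test_split(demo[hdr[:-1]], demo[hdr[-1]],
                                              test_size=0.2, random_state=0)
    preds = train_UsingExtraTreesClassifier(demo, hdr, x_tr, y_tr, x_te, y_te)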
import time

import numpy
import joblib
from DataSetLoaderLib import DataSetLoader
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
G = d.LoadDataSet("A")

# Keep only the previously selected feature columns
indices = joblib.load('selected_indicesv2.joblib.pkl')
result = numpy.array(G)[:, indices]

clf = ExtraTreesClassifier()

start_time = time.time()
scores = cross_val_score(clf, result, targets, cv=10)
elapsed_time = time.time() - start_time
print(elapsed_time)

for score in scores:
    print(score)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Note: cross_val_score fits clones internally, so clf itself is dumped unfitted.
filename = 'ExtraTreesClassifier_k-fold.joblib.pkl'
joblib.dump(clf, filename, compress=9)

params = clf.get_params()
print(params)
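# --- Round-trip sketch (assumption: same session and paths as above): reload the
# --- dumped estimator and refit before use, since it was saved unfitted.
restored = joblib.load(filename)
print(restored.get_params() == params)  # parameters survive the joblib round trip
restored.fit(result, targets)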
print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0], accr[1]))

#******************************************************************************
#******************************************************************************
# *** Applying Machine Learning Technique #5 ***
from sklearn.ensemble import ExtraTreesClassifier
from pprint import pprint

Extr = ExtraTreesClassifier(n_estimators=5, n_jobs=4)

# Look at the parameters used by our current forest
print('Parameters currently in use:\n')
pprint(Extr.get_params())

Extr.fit(X_train, y_train)
score_ETC = Extr.score(X_test, y_test)
print('Accuracy of Extratrees classifier on test set: %0.04f' % (score_ETC))
# Accuracy of Extratrees classifier on test set: 0.8295

#******************************************************************************
#******************************************************************************
# *** Applying Machine Learning Technique #6 ***
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
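# --- Sketch of Technique #6 (assumption: it mirrors the pattern of Technique #5;
# --- the original excerpt stops at the imports, so the hyperparameters below are
# --- illustrative, not the notebook's actual values).
Ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=50)
Ada.fit(X_train, y_train)
score_ABC = Ada.score(X_test, y_test)
print('Accuracy of AdaBoost classifier on test set: %0.04f' % (score_ABC))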
# max_depths = [3,4,5]

# Randomly draw a max_depth and max_features for this boosting run
roll = np.random.randint(len(max_depths), size=1)[0]
this_max_depth = max_depths[roll]
roll = np.random.randint(len(max_features_list), size=1)[0]
max_features = max_features_list[roll]

ic = GradientBoostingClassifier(n_estimators=this_n_boosts,
                                learning_rate=learning_rate,
                                max_depth=this_max_depth,
                                max_features=max_features,
                                verbose=1)
print(ic)

gbc_params = ic.get_params(deep=True)

# Make a dataframe of the parameters: the first sample creates it, later samples
# append one column each (list() is needed around the dict views in Python 3)
if sample_paste_id == 0:
    keys = list(gbc_params.keys())
    values = list(gbc_params.values())
    header = ('values bite %i' % sample_paste_id)
    params_df = pd.DataFrame(data=values, index=keys, columns=[header])
else:
    header = ('values bite %i' % sample_paste_id)
    values = list(gbc_params.values())
    params_df[header] = values
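# --- Context sketch (assumption): the block above sits inside a loop over random
# --- hyperparameter draws, roughly like the commented outline below; the loop
# --- bound and the output path are hypothetical.
# for sample_paste_id in range(n_param_samples):
#     ... draw this_max_depth / max_features and build ic as above ...
#     ... fit and score ic, then record gbc_params into params_df ...
# params_df.to_csv('gbc_param_samples.csv')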
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier


def feature_importance_Einstein(base):
    df1 = base['inputs']
    df_out = base['outputs']
    try:
        df_out = df_out.drop(columns=['Timestamp'])
        df1 = df1.drop(columns=['Timestamp'])
    except KeyError:
        pass

    # Find the variables most relevant to COVID-19 incidence
    model = ExtraTreesClassifier()
    model.fit(df1, df_out)

    lista_importances = pd.DataFrame([model.feature_importances_])
    lista_importances.columns = list(df1.columns)
    lista_importances = lista_importances * 100
    lista_importances = lista_importances.sort_values(by=0, axis=1, ascending=False)
    top15 = list(lista_importances.columns[0:15])
    top15_values = []
    print("Most impactful variables:")
    for l in lista_importances.columns[0:15]:
        print("Name: " + str(l) + " - " + str(lista_importances[l][0]) + " %")
        top15_values.append(lista_importances[l][0])
    print(top15)

    # Build the dataset for prediction (copy to avoid mutating a slice of df1)
    df_in = df1[top15].copy()

    # Take the most relevant variables and build another spreadsheet for the neural network
    lista_neural_in = df_in
    lista_neural_out = df_out

    ### Since the timestamp does not matter here, any sequential values will do ###
    # number of rows
    qtde_linhas = len(lista_neural_in.index)
    # insert a column of sequential Timestamps in the first position
    lista_neural_in.insert(0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))
    lista_neural_out.insert(0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))

    df2_in = lista_neural_in.copy()
    df2_out = lista_neural_out.copy()

    writer = pd.ExcelWriter('base_simulate.xlsx', engine='openpyxl')
    lista_neural_in.to_excel(writer, sheet_name="INPUTS")
    lista_neural_out.to_excel(writer, sheet_name="OUTPUTS")
    writer.close()

    response = {
        'top15': list(zip(top15, top15_values)),
        'top15_names': top15,
        'df_in': df2_in,
        'df_out': df2_out,
        'model': model.get_params(),
    }
    return response
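# --- Usage sketch (hypothetical): 'base' is assumed to be a dict of aligned
# --- input/output frames; the toy data below is illustrative only.
if __name__ == '__main__':
    import numpy as np
    rng = np.random.RandomState(42)
    toy_base = {
        'inputs': pd.DataFrame(rng.rand(50, 20), columns=['v%d' % i for i in range(20)]),
        'outputs': pd.DataFrame({'covid': rng.randint(0, 2, 50)}),
    }
    resp = feature_importance_Einstein(toy_base)
    print(resp['top15_names'])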
clf_etree.fit(X_train, y_train)
print("Validation set score: ERF", clf_etree.score(X_val, y_val))

clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME",
                               n_estimators=500, random_state=74494, learning_rate=0.8)
clf_boost.fit(X_train, y_train)
print("Validation set score: ABOOST", clf_boost.score(X_val, y_val))

#clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2)
#clf_gboost.fit(X_train, y_train)
#print("Validation set score: GBOOST", clf_gboost.score(X_val, y_val))

print("Classifier:")
print(clf, clf.get_params())
print(clf_etree, clf_etree.get_params())
print(clf_boost, clf_boost.get_params())

if fe == 1:
    # L1-norm based feature elimination: keep only the features whose logistic
    # regression coefficients are nonzero (liblinear is a solver that supports l1)
    clf_fe = LogisticRegression(C=1000, penalty='l1', solver='liblinear', random_state=0)
    clf_fe.fit(X_train, y_train)
    X_train = X_train[:, clf_fe.coef_.ravel() != 0]
    print("X_train.shape:", X_train.shape)
    X_val = X_val[:, clf_fe.coef_.ravel() != 0]

    clf2_l = svm.SVC(kernel='linear', C=reg)
    clf2_l.fit(X_train, y_train)
    print("Lasso validation set score, filtered coefficients, linear kernel:", clf2_l.score(X_val, y_val))

    clf2 = svm.SVC(kernel='rbf', C=reg, gamma=g)
    clf2.fit(X_train, y_train)
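    # --- Alternative sketch (assumption): sklearn's SelectFromModel expresses the
    # --- same nonzero-coefficient masking; left commented because X_train/X_val
    # --- were already filtered in place above.
    # from sklearn.feature_selection import SelectFromModel
    # selector = SelectFromModel(clf_fe, prefit=True, threshold=1e-12)
    # X_train = selector.transform(X_train)
    # X_val = selector.transform(X_val)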
# In[14]:

from sklearn import metrics
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier

parameters = {'n_estimators': [1, 32]}
model = ExtraTreesClassifier()
f1_scorer = make_scorer(f1_score, pos_label='yes')

# Fit the grid search itself (the original fit the bare model, so the
# GridSearchCV object was constructed but never used)
clf = GridSearchCV(model, param_grid=parameters, scoring=f1_scorer)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

print(clf.best_estimator_.get_params())
print("F1 score for test set: {}".format(metrics.f1_score(Y_test, Y_pred, pos_label='yes')))


# In[12]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
forest.fit(X, Y)
importances = forest.feature_importances_
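# --- Plot sketch (assumption): the customary continuation of this importance
# --- example, ranking the features and charting them; the original cell stops
# --- at importances, so everything below is illustrative.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]
plt.figure()
plt.title("Feature importances")
plt.bar(range(len(importances)), importances[indices], yerr=std[indices], align="center")
plt.xticks(range(len(importances)), indices)
plt.show()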
# # scores.mean()
# # clf = ExtraTreesClassifier(n_estimators=150)
# # scores = cross_val_score(clf, feature_matrix, labels, cv=10)
# # scores.mean()
# # clf = clf.fit(feature_train, label_train)
# clf = svm.SVC(C=1.0, kernel='rbf', cache_size=1000, decision_function_shape='ovr', shrinking=True, probability=True)
# scores = cross_val_score(clf, feature_matrix, labels, cv=StratifiedKFold(n_splits=4, shuffle=True))
# print(scores, scores.mean())
# clf.fit(feature_train, label_train)

'''Extra-Trees'''
clf = ExtraTreesClassifier(n_estimators=200, n_jobs=-1, max_features=30, criterion='gini')
scores = cross_val_score(clf, feature_matrix, labels, cv=StratifiedKFold(n_splits=4, shuffle=True))
print(scores, scores.mean())

clf = clf.fit(feature_train, label_train)
result = clf.predict(feature_test)
print(accuracy_score(label_test, result))
print(classification_report(label_test, result, digits=4))
print(clf.max_depth)
clf.get_params()
# print(classification_report_imbalanced(label_test, result))
clf.score(feature_test, label_test)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# print('hlo', clf.oob_score_)  # oob_score_ exists only with bootstrap=True, oob_score=True

cm = sklearn.metrics.confusion_matrix(label_test, result)
print(cm)
pl.matshow(cm)
pl.colorbar()
pl.show()
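# --- Optional sketch: row-normalize the confusion matrix above so each row shows
# --- per-class recall (a direct transform of the cm just printed, nothing assumed).
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
print(cm_norm.round(3))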
for homid in targetHOMS:
    allseqs.append(printHOM(homid))

df = DataFrame()
df.addColumns(['sample'] + [homid for homid in targetHOMS])

for org in mc + nmc:
    rowdict = {'sample': org}
    for homid in targetHOMS:
        # 1 if the organism appears in this homology cluster, else 0
        val = homDB.get_cluster(homid)
        rowdict[homid] = 1 if org in val else 0
    dfrow = DataRow.fromDict(rowdict)
    df.addRow(dfrow)

df.export(outFile=None)

print(forest.get_params())

# Print each collected sequence in FASTA-like form: header with length, then sequence
for elem in allseqs:
    print(">" + elem[0] + " " + str(len(elem[1])))
    print(elem[1])