def train_UsingExtraTreesClassifier(df, header, x_train, y_train, x_test, y_test):

    # training
    clf = ExtraTreesClassifier(n_estimators=200, random_state=0, criterion='gini',
                               bootstrap=True, oob_score=True)
    # Also tried entropy for the information gain, but 'gini' seemed to give a
    # marginally better fit, both in sample and out of sample.
    clf.fit(x_train, y_train)
    # estimation of goodness of fit
    print("Estimation of goodness of fit using the ExtraTreesClassifier is : %f\n" % clf.score(x_test, y_test))
    print("Estimation of out-of-bag score using the ExtraTreesClassifier is : %f\n\n" % clf.oob_score_)
    # getting parameters back, if needed
    clf.get_params()
    # get the vector of predicted labels back (predict returns labels, not probabilities)
    y_test_predicted = clf.predict(x_test)
    # all feature columns, i.e. everything except the label column (header[-1])
    X = df[df.columns.drop(header[-1])]

    feature_importance = clf.feature_importances_
    # On a scale of 10 - make importances relative to max importance and plot them
    feature_importance = 10.0 * (feature_importance / feature_importance.max())
    sorted_idx = np.argsort(feature_importance) #Returns the indices that would sort an array.
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 1, 1)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, X.columns[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    plt.show()
    return y_test_predicted
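# A minimal call sketch for the function above; the split and column layout
# are assumptions (df is taken to hold the feature columns plus a final
# label column named by header[-1]):
from sklearn.model_selection import train_test_split

x = df[header[:-1]].values
y = df[header[-1]].values
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.3, random_state=0)
y_pred = train_UsingExtraTreesClassifier(df, header, x_tr, y_tr, x_te, y_te)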
import scipy.io
import numpy
import joblib
from DataSetLoaderLib import DataSetLoader
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics

print("")
print("")
print("")
print("")

targets = numpy.array(joblib.load('DatasetA_ValidationClasses.joblib.pkl'))
d = DataSetLoader()
G = d.LoadDataSet("A")
indices = joblib.load('selected_indicesv2.joblib.pkl')
result = numpy.array(G)[:, indices]
clf = ExtraTreesClassifier()
import time
start_time = time.time()
scores = cross_val_score(clf, result, targets, cv=10)
elapsed = time.time() - start_time
print(elapsed)
for i in scores:
    print(i)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# cross_val_score fits clones of clf, so fit the estimator itself before persisting it
clf.fit(result, targets)
filename = 'ExtraTreesClassifier_k-fold.joblib.pkl'
joblib.dump(clf, filename, compress=9)

params = clf.get_params()

print(params)
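# The persisted estimator can be reloaded later for prediction; a minimal
# sketch (the slice below is only for illustration):
clf_loaded = joblib.load('ExtraTreesClassifier_k-fold.joblib.pkl')
print(clf_loaded.predict(result[:5]))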
Example #3
# accr is assumed to come from a preceding Keras-style model.evaluate(...) call (not shown)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(
    accr[0], accr[1]))

#******************************************************************************
#******************************************************************************

# *** Applying Machine Learning Technique #5 ***

from sklearn.ensemble import ExtraTreesClassifier

Extr = ExtraTreesClassifier(n_estimators=5, n_jobs=4)

from pprint import pprint
# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(Extr.get_params())

Extr.fit(X_train, y_train)

score_ETC = Extr.score(X_test, y_test)
print('Accuracy of Extratrees classifier on test set: %0.04f' % (score_ETC))

# Accuracy of Extratrees classifier on test set: 0.8295
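# With only 5 trees the ensemble is very small. A quick sweep over ensemble
# sizes (the values here are assumptions, not from the original run) shows
# how test accuracy responds to n_estimators:
for n in (5, 50, 200):
    m = ExtraTreesClassifier(n_estimators=n, n_jobs=4).fit(X_train, y_train)
    print(n, m.score(X_test, y_test))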

#******************************************************************************
#******************************************************************************

# *** Applying Machine Learning Technique #6 ***

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier  # the classifier actually built below
        # max_depths = [3,4,5]

        # draw a random max_depth and max_features for this round
        roll = np.random.randint(len(max_depths))
        this_max_depth = max_depths[roll]

        roll = np.random.randint(len(max_features_list))
        max_features = max_features_list[roll]

        ic = GradientBoostingClassifier(n_estimators=this_n_boosts,
                                        learning_rate=learning_rate,
                                        max_depth=this_max_depth,
                                        max_features=max_features,
                                        verbose=1)
        print(ic)

        gbc_params = ic.get_params(deep=True)

        # Make a dataframe of the parameters

        if sample_paste_id == 0:

            keys = list(gbc_params.keys())
            values = list(gbc_params.values())

            header = ('values bite %i' % sample_paste_id)
            params_df = pd.DataFrame(data=values, index=keys, columns=[header])

        else:
            header = ('values bite %i' % sample_paste_id)
            values = list(gbc_params.values())
            params_df[header] = values
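# Hypothetical definitions that would have to precede the fragment above;
# all values are assumptions, not recovered from the original:
import numpy as np
import pandas as pd

max_depths = [3, 4, 5]
max_features_list = ['sqrt', 'log2', None]
learning_rate = 0.1
this_n_boosts = 100
# sample_paste_id iterates over the data "bites", e.g. for sample_paste_id in range(3):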
Example #5
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier

def feature_importance_Einstein(base):

    df1 = base['inputs']
    df_out = base['outputs']

    try:
        df_out = df_out.drop(columns=['Timestamp'])
        df1 = df1.drop(columns=['Timestamp'])
    except KeyError:
        pass

    # Find the variables most relevant to COVID-19 incidence
    model = ExtraTreesClassifier()
    model.fit(df1, df_out)

    lista_importances = pd.DataFrame([model.feature_importances_])
    lista_importances.columns = list(df1.columns)
    lista_importances = lista_importances * 100

    lista_importances = lista_importances.sort_values(by=0, axis=1, ascending=False)

    top15 = list(lista_importances.columns[0:15])
    top15_values = []
    print("Variáveis mais impactantes:")
    for l in lista_importances.columns[0:15]:
        print("Nome: " + str(l) + " - " + str(lista_importances[l][0]) + " %")
        top15_values.append(lista_importances[l][0])
    print(top15)

    # build the dataset for prediction
    df_in = df1[top15]

    # take the list of the most relevant variables and write another spreadsheet for the neural network
    lista_neural_in = df_in
    lista_neural_out = df_out

    ### since the timestamp does not matter here, it can be filled with any sequential value ###
    # get the number of rows
    qtde_linhas = len(lista_neural_in.index)
    # insert a column of sequential Timestamps in the first position
    lista_neural_in.insert(0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))
    lista_neural_out.insert(0, "Timestamp", pd.date_range(start='1/1/2020', periods=qtde_linhas, freq='H'))

    df2_in = lista_neural_in.copy()
    df2_out = lista_neural_out.copy()
    writer = pd.ExcelWriter('base_simulate.xlsx', engine='openpyxl')
    lista_neural_in.to_excel(writer, sheet_name="INPUTS")
    lista_neural_out.to_excel(writer, sheet_name="OUTPUTS")
    writer.close()  # close() also saves the workbook

    top15_aws = list(zip(top15, top15_values))  # materialise the iterator so it can be reused

    response = {
        'top15' : top15_aws,
        'top15_names' : top15,
        'df_in' : df2_in,
        'df_out': df2_out,
        'model' : model.get_params(),
    }

    return response
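# A toy invocation sketch; the expected 'base' layout (a dict with 'inputs'
# and 'outputs' DataFrames) is inferred from the function body, and the data
# below is made up. Writing base_simulate.xlsx requires openpyxl.
base = {
    'inputs': pd.DataFrame({'f1': [0, 1, 0, 1], 'f2': [1, 1, 0, 0]}),
    'outputs': pd.DataFrame({'covid': [0, 1, 0, 1]}),
}
resp = feature_importance_Einstein(base)
print(resp['top15_names'])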
Example #6
    clf_etree.fit(X_train, y_train)
    print "Validation set score: ERF " , clf_etree.score(X_val, y_val)

    clf_boost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME",
                                   n_estimators=500, random_state=74494, learning_rate=0.8)
    clf_boost.fit(X_train, y_train)
    print("Validation set score: ABOOST", clf_boost.score(X_val, y_val))


    #clf_gboost = GradientBoostingClassifier(n_estimators=int(reg), random_state=74494, learning_rate=0.2) 
    #clf_gboost.fit(X_train, y_train)
    #print "Validation set score:LR " , clf_gboost.score(X_val, y_val)


    print "Classifier:"
    print clf, clf.get_params()
    print clf_etree, clf_etree.get_params()
    print clf_boost, clf_boost.get_params()
    

    if fe == 1:  # L1-norm based feature elimination
        clf_fe = LogisticRegression(C=1000, penalty='l1', solver='liblinear', random_state=0)
        clf_fe.fit(X_train, y_train)
        X_train = X_train[:, clf_fe.coef_.ravel() != 0]
        print("Xtrain.shape: ", X_train.shape)
        X_val = X_val[:, clf_fe.coef_.ravel() != 0]

        clf2_l = svm.SVC(kernel='linear', C=reg)
        clf2_l.fit(X_train, y_train)
        print("Lasso Validation set score filtered coeff linear: ", clf2_l.score(X_val, y_val))
        clf2 = svm.SVC(kernel='rbf', C=reg, gamma=g)
        clf2.fit(X_train, y_train)
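        # The fragment ends before reporting the RBF score; the natural
        # continuation, mirroring the linear case above (an assumption), would be:
        print("Lasso Validation set score filtered coeff rbf: ", clf2.score(X_val, y_val))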
Example #7

# In[14]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import ExtraTreesClassifier

parameters = {'n_estimators': [1, 32]}
model = ExtraTreesClassifier()
f1_scorer = make_scorer(f1_score, pos_label='yes')
clf = GridSearchCV(model, param_grid=parameters, scoring=f1_scorer)
clf.fit(X_train, Y_train)  # fit the grid search itself, not the bare model
Y_pred = clf.predict(X_test)

print(clf.best_estimator_.get_params())
print("F1 score for test set: {}".format(metrics.f1_score(Y_test, Y_pred, pos_label='yes')))


# In[12]:

import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import ExtraTreesClassifier

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250, random_state=0)

forest.fit(X, Y)
importances = forest.feature_importances_
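# Sketch of the usual continuation for this fragment (modelled on the
# scikit-learn feature-importances example; X, Y as fitted above): rank the
# importances and plot them with the inter-tree standard deviation as error bars.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

plt.figure()
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices], yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()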
# # scores.mean()  
# # clf = ExtraTreesClassifier(n_estimators=150)
# # scores = cross_val_score(clf, feature_matrix, labels, cv=10)
# # scores.mean()
# # clf = clf.fit(feature_train,label_train)
# clf = svm.SVC(C=1.0,kernel='rbf',cache_size=1000,decision_function_shape='ovr',shrinking=True,probability=True)
# scores = cross_val_score(clf,feature_matrix,labels,cv=StratifiedKFold(n_splits=4,shuffle=True))
# print (scores, scores.mean())
# clf.fit(feature_train, label_train)

'''Extra-Trees'''
import pylab as pl
import sklearn.metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score

clf = ExtraTreesClassifier(n_estimators=200, n_jobs=-1, max_features=30, criterion='gini')
scores = cross_val_score(clf, feature_matrix, labels, cv=StratifiedKFold(n_splits=4, shuffle=True))
print(scores, scores.mean())
clf = clf.fit(feature_train, label_train)
result = clf.predict(feature_test)
print(accuracy_score(label_test, result))
print(classification_report(label_test, result, digits=4))
print(clf.max_depth)  # None unless a depth limit was set
clf.get_params()
# print(classification_report_imbalanced(label_test, result))
print(clf.score(feature_test, label_test))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# print('hlo', clf.oob_score_)

cm = sklearn.metrics.confusion_matrix(label_test, result)
print(cm)
pl.matshow(cm)
pl.colorbar()
pl.show()

Example #9
        for homid in targetHOMS:
            allseqs.append(printHOM(homid))

    df = DataFrame()
    df.addColumns(['sample'] + list(targetHOMS))

    for org in mc + nmc:

        rowdict = {'sample': org}

        for homid in targetHOMS:

            # mark presence (1) or absence (0) of this organism in the cluster
            val = homDB.get_cluster(homid)

            if org in val:
                rowdict[homid] = 1
            else:
                rowdict[homid] = 0

        dfrow = DataRow.fromDict(rowdict)
        df.addRow(dfrow)

    df.export(outFile=None)

    print(forest.get_params())

    for elem in allseqs:
        print(">" + elem[0] + " " + str(len(elem[1])))
        print(elem[1])