# Example #1 (snippet separator; score: 0)
def preprocessingCV(foldK=2,dataSize=2000,fileName="logstash.csv"):
    """Load the log CSV and build stratified k-fold cross-validation data.

    foldK   : number of stratified folds (old sklearn skf(y, n_folds=...) API).
    dataSize: keep rows up to this label via .loc (label-based, inclusive).
    fileName: CSV file name passed to dh.loadLogData.

    Returns a list of (train_values, test_values) tuples, one per fold.

    NOTE(review): the train list takes column 0 of each row while the test
    list takes column 1 -- asymmetric. Compare with preprocessingCVFromDf,
    which pairs (col0, col1) for both sides. Confirm this is intended.
    """
        
    print("loading data ...")
    # .loc[:dataSize] slices by label and includes the endpoint
    dataDf=dh.loadLogData(fileName=fileName).loc[:dataSize]
    
    print(foldK,"-fold cross validation ...")
    dataArr=np.array(dataDf)
    # stratify on column 1 (presumably the class label -- confirm)
    dataSKF=skf(dataArr[:,1],n_folds=foldK,shuffle=True)
    dataIndexList=list(dataSKF)
    
    print("transforming dataIndexList into dataList ...")
    # row[0] = train indices, row[1] = test indices for one fold
    dataList=[([dataArr[row0Item][0] for row0Item in row[0]],[dataArr[row0Item][1] for row0Item in row[1]]) for row in dataIndexList]
    
    return dataList
# Example #2 (snippet separator; score: 0)
def preprocessingCVFromDf(dataDf,foldK=2,dataSize=2000):
    """Split a DataFrame into stratified k folds of (x, y) sample pairs.

    dataDf  : DataFrame whose column 0 holds the sample value and column 1
              the stratification label.
    foldK   : number of stratified folds (old sklearn skf(y, n_folds=...) API).
    dataSize: unused here; kept for interface compatibility with callers.

    Returns a list with one (train_pairs, test_pairs) tuple per fold, where
    each element of either list is a (col0, col1) tuple for one sample row.
    """
    print("-transforming df into arr to fit sklearn...")
    samples = np.array(dataDf)

    print("-folding ...")
    # stratify on column 1, shuffling before the split
    fold_indices = list(skf(samples[:, 1], n_folds=foldK, shuffle=True))

    print("transforming dataIndexList into dataList ...")

    def _as_pairs(index_block):
        # materialize (value, label) tuples for one index block
        return [(samples[i][0], samples[i][1]) for i in index_block]

    return [(_as_pairs(train_idx), _as_pairs(test_idx))
            for train_idx, test_idx in fold_indices]
# Print white-wine correlation results computed earlier in the full script
# (white_corr_rho / white_corr_pval are defined above this chunk).
print white_corr_rho
print white_corr_pval

#RANDOM FOREST MODELING: RED---------------------------------------------------

#set iterations
iterations=20

#create empty data frames for prediction results and feature importances
# one row per sample in dfr_exp, one column per repetition
red_results=pd.DataFrame(index=dfr_exp.index, columns=range(0,iterations))
# one row per feature (11 assumed), one column per repetition
red_features=pd.DataFrame(index=range(0,11), columns=range(0,iterations))

#fit model using StratifiedKFold
rf=rfc(n_estimators=360, max_features=5, criterion='gini')
for j in range(0,iterations):
    # re-shuffle 5 stratified folds each repetition
    # (old sklearn API skf(y, n_folds, ...) -- presumably sklearn<0.18)
    folds = skf(dfr_res, 5, shuffle=True)
    for train, test in folds:
        model=rf.fit(dfr_exp.ix[train,], dfr_res[train])
        # out-of-fold predictions collected into column j (.ix is deprecated
        # pandas API)
        red_results.ix[test,j] = pd.Series(model.predict(dfr_exp.ix[test,]), index=test, name=[j])
        # NOTE(review): overwritten on every fold -- only the LAST fold's
        # feature importances survive for repetition j; confirm intended
        red_features[j]=pd.Series(model.feature_importances_)
    print j

#write results to file
red_results.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=True)
red_features.to_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=True)

#retrieve results as needed
#red_results=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt', sep='\t', header=False, names=range(0,iterations))
#red_features=pd.read_csv('C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt', sep='\t', header=False, names=range(0,iterations))

#transform results to calculate accuracy, sensitivity (TPR) and precision (PPV)
# Example #4 (snippet separator; score: 0)
# Compare models: accuracy (kn_*/dt_* appear to be grid-search objects with
# best_score_; nb_* a cross-validation score array, hence .mean() -- confirm
# against the earlier part of the full script)
print round(kn_accuracy.best_score_ * 100, 2)
print round(nb_accuracy.mean() * 100, 2)
print round(dt_accuracy.best_score_ * 100, 2)

#Compare models: F1
print round(kn_f1.best_score_ * 100, 2)
print round(nb_f1.mean() * 100, 2)
print round(dt_f1.best_score_ * 100, 2)

#Compare models: ROC AUC
print round(kn_auc.best_score_ * 100, 2)
print round(nb_auc.mean() * 100, 2)
print round(dt_auc.best_score_ * 100, 2)

#Confusion Matrix, ROC Curve by K-fold Slice
# old sklearn StratifiedKFold signature: skf(y, n_folds, indices=...)
folds = skf(df_res, 10, indices=False)

#produce confusion matrices for each fold
# NOTE(review): dtree is used here but only (re)assigned below in the ROC
# section -- presumably defined earlier in the full script; confirm.
for i, (train, test) in enumerate(folds):
    preds = dtree.fit(df_exp.ix[train, ],
                      df_res[train]).predict(df_exp.ix[test, ])
    print '----FOLD #%d----' % i
    print pd.crosstab(df_res[test],
                      preds,
                      rownames=['Actual'],
                      colnames=['Predicted'],
                      margins=True)

#produce ROC curves for each fold
dtree = tr.DecisionTreeClassifier()
mean_tpr = 0.0
# Example #5 (snippet separator; score: 0)
print white_corr_rho
print white_corr_pval

#RANDOM FOREST MODELING: RED---------------------------------------------------

#set iterations
iterations = 20

#create empty data frames for prediction results and feature importances
red_results = pd.DataFrame(index=dfr_exp.index, columns=range(0, iterations))
red_features = pd.DataFrame(index=range(0, 11), columns=range(0, iterations))

#fit model using StratifiedKFold
rf = rfc(n_estimators=360, max_features=5, criterion='gini')
for j in range(0, iterations):
    folds = skf(dfr_res, 5, shuffle=True)
    for train, test in folds:
        model = rf.fit(dfr_exp.ix[train, ], dfr_res[train])
        red_results.ix[test, j] = pd.Series(model.predict(dfr_exp.ix[test, ]),
                                            index=test,
                                            name=[j])
        red_features[j] = pd.Series(model.feature_importances_)
    print j

#write results to file
red_results.to_csv(
    'C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_results.txt',
    sep='\t',
    header=True)
red_features.to_csv(
    'C:/Users/mmcgoldr/Dropbox/GA/DataScience/Project/red_features.txt',
# Compare models: accuracy (kn_*/dt_* appear to be grid-search objects with
# best_score_; nb_* a cross-validation score array, hence .mean() -- this
# chunk starts mid-section, the kn_accuracy line precedes it)
print round(nb_accuracy.mean()*100,2) 
print round(dt_accuracy.best_score_*100,2) 

#Compare models: F1
print round(kn_f1.best_score_*100,2) 
print round(nb_f1.mean()*100,2) 
print round(dt_f1.best_score_*100,2) 

#Compare models: ROC AUC
print round(kn_auc.best_score_*100,2) 
print round(nb_auc.mean()*100,2) 
print round(dt_auc.best_score_*100,2) 


#Confusion Matrix, ROC Curve by K-fold Slice
# old sklearn StratifiedKFold signature: skf(y, n_folds, indices=...)
folds = skf(df_res, 10, indices=False)

#produce confusion matrices for each fold
# NOTE(review): dtree is used here but only (re)assigned below in the ROC
# section -- presumably defined earlier in the full script; confirm.
for i, (train, test) in enumerate(folds):    
    preds = dtree.fit(df_exp.ix[train,], df_res[train]).predict(df_exp.ix[test,])
    print '----FOLD #%d----' % i 
    print pd.crosstab(df_res[test], preds, rownames=['Actual'], colnames=['Predicted'], margins=True)

#produce ROC curves for each fold
dtree = tr.DecisionTreeClassifier()
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
# reuse the same fold object to plot a per-fold ROC curve (the old
# StratifiedKFold object is re-iterable)
for i, (train, test) in enumerate(folds):    
    preds = dtree.fit(df_exp.ix[train,], df_res[train]).predict(df_exp.ix[test,])
    fpr, tpr, thresholds = mt.roc_curve(df_res[test], preds)
    plt.plot(fpr, tpr, lw=1, label='ROC fold %d (area = %0.2f)' % (i, mt.auc(fpr, tpr)))
def train_model_bagging(features, labels):
    """Fit a bagging ensemble of random forests via stratified grid search.

    features : training feature matrix accepted by sklearn estimators.
    labels   : target vector, also used for the stratified 5-fold CV split.

    Prints the best estimator and its ROC-AUC score, then returns the fitted
    GridSearchCV object.
    """
    # fixed random forest used as the bagging base estimator
    base_model = rfc(n_estimators=80, max_features=20,
                     max_depth=6, random_state=30,
                     criterion='entropy')

    # grid over the bagging layer only; the base forest stays fixed
    search_space = {
        'max_features': [0.5, 0.8],
        'max_samples': [0.5, 0.8, 1],
        'n_estimators': [25, 50, 75],
    }

    bagger = BaggingClassifier(random_state=30, n_jobs=-1,
                               base_estimator=base_model)
    clf = GridSearchCV(bagger, search_space, scoring='roc_auc',
                       cv=skf(labels, n_folds=5, random_state=30))
    clf.fit(features, labels)

    print ("Best estimator: ", clf.best_estimator_)
    print ("Best best scores: %.4f" %(clf.best_score_))
    return clf
# Example #8 (snippet separator; score: 0)
# Shuffle rows in place, then split features (first 4 columns) from the
# class label (column 4).
shuffle(data)
ys = data[:,4]    # class label
xs = data[:,:4]   # features

# standardize features to zero mean / unit variance
means = np.mean(xs, axis = 0)
stdevs = np.std(xs, axis = 0)
xs = (xs - means)/stdevs
# stratified train/test split with a 33% hold-out
xr,xt,yr,yt = tts(xs, ys,test_size = 0.33, stratify=ys)


    
errs = []
# number of folds (also the number of iterations averaged over later)
folds = 10
# old sklearn API: skf(y, n_folds=...) stratified on the training labels
kf = skf(yr,n_folds = folds)




''' Logistic Regression '''
# best (lowest) validation error so far; initialized to a sentinel high value
menorC_va_err=200000

# specific parameter (regularization strength C)
C=1;
# track which C index produced the smallest validation error recorded above
bestNumberofC=0
#Plot the errors against the logarithm of the C value
arrayC = []
# NOTE: this loop continues beyond the end of this chunk
for idx in range(1,21):
    tr_err = va_err = 0