Esempio n. 1
0
def oneHot():
    dr = DataReader()
    dr.readInCSV(_pathMain, _mode)
    tmpColumnPrefix = _typeName + "_"
    df = pd.read_csv(_eventTypePath, header=0, sep=',')
    if _mode == "train":
        processDf = dr._trainDataFrame
    else:
        processDf = dr._testDataFrame
        
    for i in range (1, 55):
        tmpColName = tmpColumnPrefix + "one_hot_" + str(i)
        processDf[tmpColName] = 0
    
    tmpLastI2 = 0
    for i1 in range(0, len(processDf[processDf.columns[0]])):

        tmpFlag = False
        for i2 in range(tmpLastI2, len(df[df.columns[0]])):
            tmpMainId = processDf[processDf.columns[0]][i1]
            tmpId = df[df.columns[0]][i2]
            tmpVal = df[df.columns[1]][i2]
            # tmpVal2= df[df.columns[2]][i2]
            if  tmpMainId == tmpId:
                tmpFlag = True
                print tmpVal
                processDf[processDf.columns[tmpVal + 394]][i1] = 1
            if tmpFlag == True and tmpMainId != tmpId:
                tmpLastI2 = i2
                break
            print i1, i2
    # outDf = pd.concat([dr._ansDataFrame, processDf], axis=1)
    outDf = processDf
    outDf.to_csv(_outputPathName, sep=',', encoding='utf-8')  
Esempio n. 2
0
    def sumExist(self):
        """One-hot encode mapping-file values onto the train/test data frame.

        NOTE(review): despite the name, this mirrors the one-hot logic of
        the other experiments, and it reads self._pathMain twice (once via
        DataReader, once directly via pd.read_csv) -- confirm intended.
        """
        dr = DataReader()
        dr.readInCSV(self._pathMain, self._mode)
        tmpColumnPrefix = self._typeName + "_"
        df = pd.read_csv(self._pathMain, header=0, sep=',')
        if self._mode == "train":
            processDf = dr._trainDataFrame
        else:
            processDf = dr._testDataFrame

        # 1126 one-hot columns, all initialised to 0.
        for i in range(1, 1127):
            processDf[tmpColumnPrefix + "one_hot_" + str(i)] = 0

        # Hoist loop-invariant Series lookups out of the scan loops.
        mainIdCol = processDf[processDf.columns[0]]
        mapIdCol = df[df.columns[0]]
        mapValCol = df[df.columns[1]]

        # Frames appear to be sorted by id (TODO confirm): tmpLastI2 lets
        # the inner scan resume where the previous id's matches ended.
        tmpLastI2 = 0
        for i1 in range(0, len(mainIdCol)):
            tmpMainId = mainIdCol[i1]  # invariant w.r.t. the inner loop
            tmpFlag = False
            for i2 in range(tmpLastI2, len(mapIdCol)):
                tmpId = mapIdCol[i2]
                tmpVal = mapValCol[i2]
                if tmpMainId == tmpId:
                    tmpFlag = True
                    # +394 magic offset into the one-hot columns -- TODO
                    # confirm against the input layout.
                    processDf[processDf.columns[tmpVal + 394]][i1] = 1
                if tmpFlag and tmpMainId != tmpId:
                    tmpLastI2 = i2
                    break
        outDf = processDf
        outDf.to_csv(self._outputPathName, sep=',', encoding='utf-8')
Esempio n. 3
0
if __name__ == '__main__':

    # Experiment 003: one-hot encoded event_type features.
    expNo = "003"
    expInfo = expNo + "_one_hot_event_type"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    doTestFlag = False
    trainCsv = _basePath + expNo + "_train_tobe.csv"
    testCsv = _basePath + expNo + "_test_tobe.csv"

    # Load the training frame (and optionally swap in the test frame).
    reader = DataReader()
    reader.readInCSV(trainCsv, "train")
    newX, newY = reader._trainDataFrame, reader._ansDataFrame
    if doTestFlag:
        reader.readInCSV(testCsv, "test")
        newX = reader._testDataFrame

    # Grid-search every model type on the loaded features.
    factory = ModelFactory()
    factory._gridSearchFlag = True
    factory._n_iter_search = 30
    factory._expInfo = expInfo
    factory.getAllModels(newX, newY)
    
from Telstra.Bartender.Blender import Blender


if __name__ == '__main__':
    
    
   # 1. read in data
    expNo = "014"
    expInfo = expNo + "_one_hot_each_features" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    featureList = ["location", "event_type", "resource_type" , "severity_type", "log_feature"]
    
    ansPath = _basePath + "014_ans_array.csv"
    drAns = DataReader()
    drAns.readInCSV(ansPath, "train")
    newY = drAns._ansDataFrame
    
    
       
    for i in range(1,32):
        log( "start " + str(i) + "/32 ...")
        tmpCurFeatureList = []
        
        flagList =[]
        for i2 in range (0, 7- len(bin(i))):
            flagList.append(0)
        for i2 in range(2,len(bin(i))):
            flagList.append(int(bin(i)[i2]))
        
        for j in range(0,5):
Esempio n. 5
0
   # 1. read in data
    expNo = "020"
    expInfo = expNo + "_groupby_sum" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    featureList = ["location", "event_type", "resource_type" , "severity_type", "log_feature"]
    ans1List = []
    ans2List = []
#     ansPath = _basePath + "014_ans_array.csv"
#     drAns = DataReader()
#     drAns.readInCSV(ansPath, "train")
#     newY = drAns._ansDataFrame

    tmpPath = _basePath + "train_merge_one_hot.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    newX = dr._trainDataFrame
    newY = dr._ansDataFrame


    fab = ModelFactory()
    #fab._setXgboostTheradToOne = True
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "groupby_sum"  
    fab._n_iter_search = 1
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(newX, newY)
#     
    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
    dr = DataReader()
Esempio n. 6
0
if __name__ == '__main__':

    # Experiment 013: data exploration; loads train/test frames plus two
    # sort-id index files, then points at the saved binary models folder.
    # 1. read in data
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    testSortIdPath = Config.FolderBasePath + "test_sort_id.csv"
    trainSortIdPath = _basePath + "train_sort_id.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # NOTE(review): dr2's test frame is only used via the commented line
    # below -- the read itself is currently side-effect-only.
    dr2 = DataReader()
    dr2.readInCSV(testPath, "test")
    #newX = dr2._testDataFrame

    dr3 = DataReader()
    dr3.readInCSV(testSortIdPath, "test")
    sortIdDf = dr3._testDataFrame

    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    # NOTE(review): this overwrites the sortIdDf assigned from dr3 above,
    # making the dr3 result dead -- confirm which sort-id frame is intended.
    sortIdDf = dr4._testDataFrame

    modelFolder = _basePath + "models" + Config.osSep + "binary" + Config.osSep
Esempio n. 7
0
    tmpPath = _basePath + "008_submission_1_train_Xgboost.csv"
    newX, newY =  dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)
    
    clfNameList = []
    clfNameList.append("Extra_Trees")
    clfNameList.append("K_NN")
    clfNameList.append("RandomForest")
    clfNameList.append("Xgboost")
    
    
    b1 = Blender(clfNameList, tmpDfList, newY)
    b1.autoFlow(1000, outputPath)
    
    if doTestFlag == True:
        dr.readInCSV(testPath , "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX
 
    
    # 3. get all best model from newX
#     fab = ModelFactory()
#     fab._gridSearchFlag = True
#     fab._n_iter_search = 100
#     fab._expInfo = expInfo
#     fab.getXgboostClf(newX, newY)
    
    # 4. test all data, output 3 ans as features
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
Esempio n. 8
0
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    featureList = [
        "location", "event_type", "resource_type", "severity_type",
        "log_feature"
    ]
    ans1List = []
    ans2List = []
    #     ansPath = _basePath + "014_ans_array.csv"
    #     drAns = DataReader()
    #     drAns.readInCSV(ansPath, "train")
    #     newY = drAns._ansDataFrame

    tmpPath = _basePath + "train_v1.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    newX = dr._trainDataFrame
    newY = dr._ansDataFrame

    #     newX = pd.concat([newY, newX], axis =1)
    #
    #     logFeaturePath = _basePath + "log_feature_train.csv"
    #     dr = DataReader()
    #     dr.readInCSV(logFeaturePath, "test")
    #     newX = dr._testDataFrame

    #     for i, tmpVal in enumerate (newY):
    #         if tmpVal == 1:
    #             ans1List.append(i)
    #         elif tmpVal ==2:
    #             ans2List.append(i)
Esempio n. 9
0
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
    #Logistic_Regression
    # For each feature subset, load its test CSV and matching saved model,
    # then write the predicted class probabilities to a stacking CSV.
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
        "Logistic_Regression"
    ]
    featureList = [
        "event_type", "log_feature", "resource_type", "severity_type"
    ]

    for tmpFeature in featureList:
        for tmpModel in modelList:
            subFolder = tmpFeature
            curModel = tmpModel

            tmpCsvPath = _basePath + expNo + "_" + tmpFeature + "_test_tobe.csv"
            dr = DataReader()
            dr.readInCSV(tmpCsvPath, "train")
            newX = dr._trainDataFrame
            modelFolder = _basePath + "models" + Config.osSep + subFolder + Config.osSep
            modelPath = modelFolder + str(
                getMatchNameModelPath(modelFolder, curModel))
            tmpOutPath = _basePath + "010_" + curModel + "_stack_" + subFolder + "_test.csv"
            tmpClf = loadModel(modelPath)
            log(tmpClf.predict_proba(newX))
            # NOTE(review): predict_proba is computed three times per model
            # (log, concat, final frame), and the concat result below is
            # immediately overwritten -- the concat call is dead work.
            outDf = pd.concat(
                [newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
            outDf = pd.DataFrame(tmpClf.predict_proba(newX))
            outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #musicAlarm()
#     log("004 Done")
Esempio n. 10
0
def exp():
    """Run the location-only experiment: grid-search models on train.csv.

    Returns finalClf.predict_proba on the test frame only when doTestFlag
    is True; otherwise falls off the end and returns None implicitly.

    NOTE(review): finalClf is assigned only in commented-out code, so both
    doTestFlag == True branches would raise NameError if the flag were
    enabled -- confirm before flipping it.
    """
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo

    doTestFlag = False  # hard-coded off; test-frame paths below are inactive
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"

    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)

    #     log( "xgb start")
    #     param = {'max_depth':10,  'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    #     num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame,  dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    #     gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')

    #     scores = cross_val_score(rfClf, dr._trainDataFrame,  dr._ansDataFrame, n_jobs = -1)
    #     log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame),  num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame,  dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"

    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm

    # NOTE(review): finalClf is undefined here (see docstring).
    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)


#     featureImportance =[]
#     for i in range(0,len(finalClf.feature_importances_)):
#         if i !=  len(dr._trainDataFrame.columns):
#             if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
#                 featureImportance.append(  [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
#
#     print featureImportance
#     featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
#     print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)
Esempio n. 11
0
if __name__ == '__main__':
    
    
    # 1. read in data
    expNo = "013"
    expInfo = expNo + "_data_exploration" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"
    
    testSortIdPath = Config.FolderBasePath + "test_sort_id.csv"
    trainSortIdPath = _basePath + "train_sort_id.csv"
    
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    
    dr2 = DataReader()
    dr2.readInCSV( testPath, "test")
    #newX = dr2._testDataFrame
    
    dr3 = DataReader()
    dr3.readInCSV( testSortIdPath, "test")
    sortIdDf =dr3._testDataFrame
    
    dr4 = DataReader()
    dr4.readInCSV(trainSortIdPath, "test")
    sortIdDf =dr4._testDataFrame
    
    modelFolder = _basePath + "models" + Config.osSep  + "binary" + Config.osSep
Esempio n. 12
0
#     dr = DataReader()
#     dr.readInCSV(tmpPath, "test")
#     newX = dr._testDataFrame
#     newY = dr._ansDataFrame
#     newX  = xgb.DMatrix(newX)
#     #print clf.predict(newX)
#     tmpOutPath = _basePath + expNo +"_" + "Xgboost" + "_testXgboost7_ans.csv"
#     log(clf.predict(newX))
#     outDf = pd.DataFrame(clf.predict(newX))
#     outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
#     musicAlarm()
    
    clf = joblib.load( "F:\\xgboost_tmp_best_020.model" )
    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "test")
    newX = dr._testDataFrame
    newX  = xgb.DMatrix(newX)
    tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans_" + "2" + ".csv"
    log(clf.predict(newX))
    outDf = pd.DataFrame(clf.predict(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
    
    
        
#     sampleRows = np.random.choice(X.index, len(X)*evalDataPercentage) 
#     
#     print  X.ix[sampleRows]
#     exit()
#     dtest  = xgb.DMatrix( X.ix[sampleRows], label=Y.ix[sampleRows])
Esempio n. 13
0
#     fab._expInfo = expInfo
#     fab.getAllModels(newX, newY)
    
    # 4. test all data, output 3 ans as features
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
    #D:\Kaggle\Telstra\004_one_hot_resource_type\(K_NN)_(2016-02-06_11_40_10).model
    #Logistic_Regression
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
    featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
    
    for tmpFeature in featureList: 
        for tmpModel in modelList:
            subFolder = tmpFeature
            curModel = tmpModel
            
            tmpCsvPath = _basePath + expNo + "_" + tmpFeature +"_test_tobe.csv"
            dr = DataReader()
            dr.readInCSV(tmpCsvPath , "train")
            newX = dr._trainDataFrame
            modelFolder = _basePath + "models" + Config.osSep + subFolder + Config.osSep
            modelPath =  modelFolder + str(getMatchNameModelPath(modelFolder, curModel))
            tmpOutPath = _basePath + "010_" + curModel + "_stack_" + subFolder + "_test.csv"
            tmpClf = loadModel( modelPath)
            log(tmpClf.predict_proba(newX))
            outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
            outDf = pd.DataFrame(tmpClf.predict_proba(newX))
            outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #musicAlarm()
#     log("004 Done")
Esempio n. 14
0
from Telstra.Bartender.Blender import Blender
from test._mock_backport import inplace
import random
import xgboost as xgb
import numpy as np

if __name__ == '__main__':

    # Experiment 021: stacking -- split the training data into two folds.
    # 1. read in data
    expNo = "021"
    expInfo = expNo + "_stacking"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    tmpPath = _basePath + "train.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    ori_X = X
    ori_Y = Y

    evalDataPercentage = 0.5
    # Fix: np.random.choice requires an integer size; len(X) * 0.5 is a
    # float and raises TypeError on current numpy.  Sampling keeps the
    # numpy default replace=True, so sampleRows may contain duplicates --
    # NOTE(review): confirm whether replace=False was intended for a
    # clean two-fold split.
    sampleRows = np.random.choice(X.index, int(len(X) * evalDataPercentage))

    # NOTE(review): DataFrame.ix is removed in modern pandas; .loc is the
    # replacement if this is ever run on a current stack.
    train_fold_1 = X.ix[sampleRows]
    train_fold_label_1 = Y.ix[sampleRows]
    train_fold_2 = X.drop(sampleRows)
    train_fold_label_2 = Y.drop(sampleRows)

    #     tmpOutPath = _basePath + expNo +"_" + "fold_1.csv"
    #     train_fold_1.to_csv(tmpOutPath, sep=',', encoding='utf-8')
Esempio n. 15
0
    tmpPath = _basePath + "008_submission_1_train_Xgboost.csv"
    newX, newY = dr.cvtPathListToDfList(tmpPath, "train")
    tmpDfList.append(newX)

    clfNameList = []
    clfNameList.append("Extra_Trees")
    clfNameList.append("K_NN")
    clfNameList.append("RandomForest")
    clfNameList.append("Xgboost")

    b1 = Blender(clfNameList, tmpDfList, newY)
    b1.autoFlow(1000, outputPath)

    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX

    # 3. get all best model from newX
#     fab = ModelFactory()
#     fab._gridSearchFlag = True
#     fab._n_iter_search = 100
#     fab._expInfo = expInfo
#     fab.getXgboostClf(newX, newY)

# 4. test all data, output 3 ans as features
#D:\Kaggle\Telstra\004_one_hot_resource_type\(Xgboost)_(2016-02-06_11_14_31).model
#D:\Kaggle\Telstra\004_one_hot_resource_type\(Random_Forest)_(2016-02-06_11_24_09).model
#D:\Kaggle\Telstra\004_one_hot_resource_type\(Extra_Trees)_(2016-02-06_11_30_52).model
from Telstra.Bartender.Blender import Blender


if __name__ == '__main__':
    
    
   # 1. read in data
    expNo = "011"
    expInfo = expNo + "_remove_one_hot" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    
    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
#     fab._subFolderName = "stacked"
    fab._n_iter_search = 100
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    
    # Test all data
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
#     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
    
#     for tmpFeature in featureList:
Esempio n. 17
0
# Platform-dependent base folder for the Kaggle Telstra data files.
_basePath =""
if os.name == 'nt':
    _basePath = "D:\\Kaggle\\Telstra\\"
else:
    _basePath = "/Users/whmou/Kaggle/Telstra/"
testPath = _basePath + "test6.csv"    # take id list only
testPath2 = _basePath + "test11.csv"

samplePath = _basePath + "sample_submission.csv"
outputPath = _basePath+"temp_submission4.csv"

if __name__ == '__main__':
    # Assemble a submission: id column from test6.csv plus the
    # probabilities returned by exp() (defined elsewhere in this file).
    print "start to make submission version:", outputPath
    dr = DataReader()
    dr.readInCSV(testPath, "test")
    idList =  dr._testDataFrame[dr._testDataFrame.columns[0]]


    # NOTE(review): dr2 / test11.csv is read but never used below -- confirm.
    dr2= DataReader()
    dr2.readInCSV(testPath2, "test")


    # NOTE(review): sampleIdList is also unused in the visible code.
    dr3= DataReader()
    dr3.readInCSV(samplePath, "test")
    sampleIdList =  dr3._testDataFrame[dr3._testDataFrame.columns[0]]

    tmp = pd.DataFrame(exp())
    ansArr = pd.concat([idList, tmp], axis=1)
    print ansArr
    
Esempio n. 18
0
from Telstra.util.ModelUtils import loadModel
import pandas as pd

if __name__ == '__main__':

    # Experiment 001: location-only features; loads the train frame and,
    # when doTestFlag is on, swaps in the test frame.
    # 1. read in data
    expInfo = "001_location_only" + Config.osSep
    _basePath = Config.FolderBasePath + expInfo

    doTestFlag = True
    path = _basePath + "001_train_tobe.csv"
    testPath = _basePath + "001_test_tobe.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")
        # NOTE(review): newY still holds the *train* answers after newX is
        # replaced with the test frame -- confirm downstream use.
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        print newX
    # 2. stratify 60 % data and train location only
#     newX, newY = stratifyData(dr._trainDataFrame, dr._ansDataFrame, 0.4)

# 3. get all best model from newX
#     fab = ModelFactory()
#     fab._gridSearchFlag = True
#     fab._n_iter_search = 500
#     fab._expInfo = "001_location_only"
#     fab.getAllModels(newX, newY)
Esempio n. 19
0
def exp():
    """Run the location-only experiment: grid-search models on train.csv.

    Returns finalClf.predict_proba on the test frame only when doTestFlag
    is True; otherwise falls off the end and returns None implicitly.

    NOTE(review): finalClf is assigned only in commented-out code, so both
    doTestFlag == True branches would raise NameError if the flag were
    enabled.  This snippet also duplicates the other exp() definition in
    this collection -- confirm which copy is current.
    """
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo


    doTestFlag = False  # hard-coded off; test-frame paths below are inactive
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV( path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath , "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"

    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)

#     log( "xgb start")
#     param = {'max_depth':10,  'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
#     num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame,  dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
#     gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')

#     scores = cross_val_score(rfClf, dr._trainDataFrame,  dr._ansDataFrame, n_jobs = -1)
#     log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame),  num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame,  dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"

    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm

    # NOTE(review): finalClf is undefined here (see docstring).
    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)


#     featureImportance =[]
#     for i in range(0,len(finalClf.feature_importances_)):
#         if i !=  len(dr._trainDataFrame.columns):  
#             if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
#                 featureImportance.append(  [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
#     
#     print featureImportance
#     featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
#     print featureImportance 

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)