newX = pd.DataFrame()
        
        # NOTE(review): snippet starts mid-function -- the jump from column 0
        # above to 8 spaces here means the enclosing def/loop header is not
        # visible in this chunk.
        # Build the feature matrix by concatenating one CSV per feature name,
        # column-wise (axis=1).
        for tmpFeature in tmpCurFeatureList:
            path = _basePath + tmpFeature + "_train.csv"
            dr = DataReader()
            # NOTE(review): the path points at a "_train.csv" file but the mode
            # argument is "test" -- confirm this mismatch is intentional.
            tmpX = dr.cvtPathListToDfList(path, "test")
            newX = pd.concat([newX, tmpX], axis=1)
        #log("feature len: " , len(newX))
            
        # Get all best model from newX
        # Configure a grid-searching ModelFactory run restricted to tree-based
        # models; only the random-forest call is active below.
        fab = ModelFactory()
        fab._setXgboostTheradToOne = True  # (sic) "Therad" typo is in the project API
        fab._gridSearchFlag = True
        fab._onlyTreeBasedModels = True
        fab._subFolderName = "one_hot_each_" + str(i)
        fab._n_iter_search = 30
        fab._expInfo = expInfo
#         fab.getAllModels(newX, newY)
        fab.getRandomForestClf(newX, newY)
#         fab.getXgboostClf(newX, newY)
        log ( i , "/32 done..." )
    
    
    
   
    
    
    # presumably an audible job-done notification; defined outside this chunk
    musicAlarm()
    # Test all data
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
Esempio n. 2
0
#         newX = newX.append(newX.iloc()[tmpIdx])
#         newX[newX.columns[0]][len(newX)-1] = 2
#         print i
#      
#     #print newX.iloc()[0]
#     tmpOutPath = _basePath + "location_log_feature_over_sampling.csv"
#     print len(newX)
#     newX.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #
    #print len(newX)        
    # Get all best model from newX
    # Grid-search an xgboost model on newX/newY (both defined before this
    # visible chunk); the random-forest alternative is commented out.
    fab = ModelFactory()
    fab._setXgboostTheradToOne = True  # (sic) "Therad" typo is in the project API
    fab._gridSearchFlag = True
    # presumably mails the single-model result -- confirm in ModelFactory
    fab._singleModelMail = True
    fab._subFolderName = "ismail4"  
    fab._n_iter_search = 10
    fab._expInfo = expInfo
#         fab.getAllModels(newX, newY)
    #fab.getRandomForestClf(newX, newY)
    fab.getXgboostClf(newX, newY)
#         fab.getXgboostClf(newX, newY)
#    log ( i , "/32 done..." )
    
    
    
   
    
    
   # musicAlarm()
    # Test all data
Esempio n. 3
0
#     drAns = DataReader()
#     drAns.readInCSV(ansPath, "train")
#     newY = drAns._ansDataFrame

    # Load the one-hot-encoded training set; DataReader exposes features and
    # labels via _trainDataFrame / _ansDataFrame after readInCSV.
    tmpPath = _basePath + "train_merge_one_hot.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    newX = dr._trainDataFrame
    newY = dr._ansDataFrame


    # Grid-search an xgboost classifier (single iteration) on the training data.
    fab = ModelFactory()
    #fab._setXgboostTheradToOne = True
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "groupby_sum"  
    fab._n_iter_search = 1
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(newX, newY)
#     
    # Score the test set. The xgboost model is fed a DMatrix; NOTE(review):
    # clf.predict(newX) is evaluated twice (once for logging, once for the
    # output frame) -- could be computed once and reused.
    tmpPath = _basePath + "test_merge_one_hot" + ".csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "test")
    newX = dr._testDataFrame
    newX  = xgb.DMatrix(newX)
    tmpOutPath = _basePath + expNo +"_" + "Xgboost_" + "groupby_sum"+ "_ans.csv"
    log(clf.predict(newX))
    outDf = pd.DataFrame(clf.predict(newX))
    outDf.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    musicAlarm()
    
Esempio n. 4
0
    # 1. read in data
    # Experiment 013: read the prepared training CSV and fit every model type
    # the factory knows via randomized grid search (n_iter=50).
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_tobe.csv"
    # NOTE(review): testPath is assigned but unused in the visible lines
    testPath = _basePath + expNo + "_test_asis.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best model from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._subFolderName = "binary"
    fab._n_iter_search = 50
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)

    musicAlarm()
    # Test all data
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
        "Logistic_Regression"
    ]
#     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]

#     for tmpFeature in featureList:
#     modelFolder = _basePath + "models" + Config.osSep
#     for tmpModel in modelList:
Esempio n. 5
0
    

    # Candidate base learners for what appears to be a stacking/ensembling
    # loop: each classifier type is fit eachClfLoopTimes times on fold 1 and
    # used to predict fold 2. train_fold_* / test_fold_2 come from outside
    # this visible chunk.
    clfList = ["xgboost", "rf","extra_tree",]

    
    fab = ModelFactory()
    fab._gridSearchFlag = True
    
    dfUpper = pd.DataFrame()
    dfTestUpper = pd.DataFrame()
    eachClfLoopTimes = 3
    iter_search_times = 1
    
    for tmpClfName in clfList:
        for i in range(0,eachClfLoopTimes):
            fab._subFolderName = tmpClfName
            fab._n_iter_search = iter_search_times
            fab._expInfo = expInfo
            # Dispatch on classifier name; "knn" is handled here even though
            # it is not in clfList above.
            if  tmpClfName == "rf":
                clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "knn":
                clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "extra_tree":
                clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "xgboost":
                clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)
             
            # xgboost Boosters require a DMatrix wrapper before predict().
            if tmpClfName == "xgboost":
                predictResult = clf.predict(xgb.DMatrix(train_fold_2))
                predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
            else:
                # NOTE(review): snippet is truncated here -- the non-xgboost
                # prediction branch is not visible in this chunk.
Esempio n. 6
0
    # NOTE(review): near-duplicate of the "013_data_exploration" flow in an
    # earlier snippet; expNo is defined before this visible chunk.
    expInfo = expNo + "_data_exploration" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    path = _basePath + expNo + "_train_tobe.csv"
    # NOTE(review): testPath is assigned but unused in the visible lines
    testPath = _basePath + expNo + "_test_asis.csv"
    
    dr = DataReader()
    dr.readInCSV( path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    
    
   
    # Get all best model from newX
    # Randomized grid search (n_iter=50) over every model type the factory knows.
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._subFolderName = "binary"
    fab._n_iter_search = 50
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    
    musicAlarm()
    # Test all data
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
#     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
    
#     for tmpFeature in featureList:
#     modelFolder = _basePath + "models" + Config.osSep 
#     for tmpModel in modelList:  
#         curModel = tmpModel
#         
#         dr = DataReader()
Esempio n. 7
0
        "xgboost",
        "rf",
        "extra_tree",
    ]
    # NOTE(review): the list header (clfList = [ ...) sits above this visible
    # chunk; these first lines are the tail of that literal.

    # Same fold-1-fit / fold-2-predict ensembling loop as the earlier snippet,
    # reformatted. train_fold_* / test_fold_2 come from outside this chunk.
    fab = ModelFactory()
    fab._gridSearchFlag = True

    dfUpper = pd.DataFrame()
    dfTestUpper = pd.DataFrame()
    eachClfLoopTimes = 3
    iter_search_times = 1

    for tmpClfName in clfList:
        for i in range(0, eachClfLoopTimes):
            fab._subFolderName = tmpClfName
            fab._n_iter_search = iter_search_times
            fab._expInfo = expInfo
            # Dispatch on classifier name; "knn" is handled here even though
            # it is not in the visible portion of clfList.
            if tmpClfName == "rf":
                clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "knn":
                clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "extra_tree":
                clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "xgboost":
                clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)

            # xgboost Boosters require a DMatrix wrapper before predict().
            if tmpClfName == "xgboost":
                predictResult = clf.predict(xgb.DMatrix(train_fold_2))
                predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
            else:
                # NOTE(review): snippet is truncated here -- the non-xgboost
                # prediction branch is not visible in this chunk.
Esempio n. 8
0
    #         newX = newX.append(newX.iloc()[tmpIdx])
    #         newX[newX.columns[0]][len(newX)-1] = 2
    #         print i
    #
    #     #print newX.iloc()[0]
    #     tmpOutPath = _basePath + "location_log_feature_over_sampling.csv"
    #     print len(newX)
    #     newX.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #
    #print len(newX)
    # Get all best model from newX
    # Reformatted duplicate of the "ismail4" snippet above: grid-search an
    # xgboost model on newX/newY (both defined before this visible chunk).
    fab = ModelFactory()
    fab._setXgboostTheradToOne = True  # (sic) "Therad" typo is in the project API
    fab._gridSearchFlag = True
    # presumably mails the single-model result -- confirm in ModelFactory
    fab._singleModelMail = True
    fab._subFolderName = "ismail4"
    fab._n_iter_search = 10
    fab._expInfo = expInfo
    #         fab.getAllModels(newX, newY)
    #fab.getRandomForestClf(newX, newY)
    fab.getXgboostClf(newX, newY)
    #         fab.getXgboostClf(newX, newY)
    #    log ( i , "/32 done..." )

    # musicAlarm()
    # Test all data
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
        "Logistic_Regression"
    ]
    #     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]