Example #1
    doTestFlag = False
    path = _basePath + expInfo + "_train_tobe.csv"
    testPath = _basePath + expInfo + "_test_tobe.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX

    # 3. get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 30
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)

    # 4. test on all data; output the 3 answer columns (class probabilities) as features
    #D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
    #D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
    #D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
    #D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model

    #     modelPath = _basePath+"(K_NN)_(2016-02-03_20_32_22).model"
    #     tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
    #     tmpClf = loadModel( modelPath)
    #     log(tmpClf.predict_proba(newX))
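    # How the commented step-4 block above would run end to end (a sketch, kept
    # commented like the original; loadModel() is this repo's helper, and writing
    # raw predict_proba output straight to the submission CSV is an assumption):
    #     tmpClf = loadModel(_basePath + "(K_NN)_(2016-02-03_20_32_22).model")
    #     proba = tmpClf.predict_proba(newX)
    #     pd.DataFrame(proba).to_csv(_basePath + "002_submission_1_K_NN.csv", index=False)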
    # (the snippet resumes mid-loop here; the enclosing loop below is an assumed
    #  reconstruction: five features and the "/32 done..." log suggest a bitmask
    #  over all 2^5 feature subsets, but the original loop header is not shown)
    featureList = ["location", "event_type", "resource_type", "severity_type", "log_feature"]
    for i in range(1, 32):
        tmpCurFeatureList = []
        for j in range(0, len(featureList)):
            if (i >> j) & 1:  # feature j belongs to subset i
                tmpCurFeatureList.append(featureList[j])

        log(tmpCurFeatureList)

        newX = pd.DataFrame()
        for tmpFeature in tmpCurFeatureList:
            path = _basePath + tmpFeature + "_train.csv"
            dr = DataReader()
            tmpX = dr.cvtPathListToDfList(path, "test")
            newX = pd.concat([newX, tmpX], axis=1)
        #log("feature len: " , len(newX))
            
        # Get all best models from newX
        fab = ModelFactory()
        fab._setXgboostTheradToOne = True
        fab._gridSearchFlag = True
        fab._onlyTreeBasedModels = True
        fab._subFolderName = "one_hot_each_" + str(i)
        fab._n_iter_search = 30
        fab._expInfo = expInfo
#         fab.getAllModels(newX, newY)
        fab.getRandomForestClf(newX, newY)
#         fab.getXgboostClf(newX, newY)
        log(i, "/32 done...")

    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        #newX = pd.DataFrame(newX[newX.columns[0]])
        #print newX

    # 3. get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 30
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    
    # 4. test on all data; output the 3 answer columns (class probabilities) as features
    #D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
    #D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
    #D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
    #D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model
    
#     modelPath = _basePath+"(K_NN)_(2016-02-03_20_32_22).model"
#     tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
#     tmpClf = loadModel( modelPath)
#     log(tmpClf.predict_proba(newX))
def exp():
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo

    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag:
        dr.readInCSV(testPath, "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"

    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)

    #     log( "xgb start")
    #     param = {'max_depth':10,  'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    #     num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame,  dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    #     gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')

    #     scores = cross_val_score(rfClf, dr._trainDataFrame,  dr._ansDataFrame, n_jobs = -1)
    #     log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame),  num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame,  dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"

    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm
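    # A runnable condensation of the commented xgboost experiments above (a sketch,
    # kept commented like the original; uncommenting it defines the finalClf that
    # the doTestFlag branches below expect):
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05,
    #                         objective='multi:softprob')
    # finalClf = gbm.fit(X, Y)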

    if doTestFlag:
        print finalClf.predict_proba(dr._testDataFrame)


#     featureImportance =[]
#     for i in range(0,len(finalClf.feature_importances_)):
#         if i !=  len(dr._trainDataFrame.columns):
#             if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
#                 featureImportance.append(  [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
#
#     print featureImportance
#     featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
#     print featureImportance

    if doTestFlag:
        return finalClf.predict_proba(dr._testDataFrame)
Example #5
    featureList = ["location", "event_type", "resource_type" , "severity_type", "log_feature"]
    ans1List = []
    ans2List = []
#     ansPath = _basePath + "014_ans_array.csv"
#     drAns = DataReader()
#     drAns.readInCSV(ansPath, "train")
#     newY = drAns._ansDataFrame

    tmpPath = _basePath + "train_merge_one_hot.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "train")
    newX = dr._trainDataFrame
    newY = dr._ansDataFrame


    fab = ModelFactory()
    #fab._setXgboostTheradToOne = True
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "groupby_sum"  
    fab._n_iter_search = 1
    fab._expInfo = expInfo
    clf = fab.getXgboostClf(newX, newY)
    tmpPath = _basePath + "test_merge_one_hot.csv"
    dr = DataReader()
    dr.readInCSV(tmpPath, "test")
    newX = dr._testDataFrame
    newX = xgb.DMatrix(newX)
    tmpOutPath = _basePath + expNo + "_Xgboost_groupby_sum_ans.csv"
    log(clf.predict(newX))
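    # tmpOutPath is built above but never used in the snippet; a likely write-out,
    # following the to_csv pattern used elsewhere in these examples (the exact
    # submission column layout is an assumption):
    pred = clf.predict(newX)  # per-class probabilities, given a softprob objective
    pd.DataFrame(pred).to_csv(tmpOutPath, sep=',', encoding='utf-8')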
if __name__ == "__main__":

    # 1. read in data
    expNo = "012"
    expInfo = expNo + "_rf_chk_important"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_asis.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    #     fab._subFolderName = "stacked"
    fab._n_iter_search = 250
    fab._expInfo = expInfo
    #     fab.getAllModels(newX, newY)
    finalClf = fab.getRandomForestClf(newX, newY)

    featureImportance = []
    for i in range(0, len(finalClf.feature_importances_)):
        if i != len(dr._trainDataFrame.columns):
            # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
            featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]])

    # log( featureImportance)
    featureImportance.sort(key=lambda x: x[1], reverse=True)
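    # e.g. to eyeball the strongest predictors after the sort (log() is the logger
    # used throughout these examples; the top-10 slice is arbitrary):
    for colName, imp in featureImportance[:10]:
        log(colName, imp)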
Example #8
    dr2.readInCSV(tmpPath, "test")
    testX = dr2._testDataFrame
    ori_testX = testX
    #sampleRows = np.random.choice(testX.index, len(testX)*evalDataPercentage) 
    sampleRows = []
    for i in range(0, int(len(testX)/2)):
        sampleRows.append(i)
    
    test_fold_2 = testX.ix[sampleRows]
    test_fold_1 = testX.drop(sampleRows)
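    # The commented np.random.choice line above hints at a random variant of this
    # half/half split; a corrected sketch (the size argument must be an int, and
    # replace=False avoids duplicate rows; evalDataPercentage is assumed defined
    # in the elided code upstream):
    # sampleRows = np.random.choice(testX.index,
    #                               int(len(testX) * evalDataPercentage),
    #                               replace=False)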
    

    clfList = ["xgboost", "rf", "extra_tree"]

    
    fab = ModelFactory()
    fab._gridSearchFlag = True
    
    dfUpper = pd.DataFrame()
    dfTestUpper = pd.DataFrame()
    eachClfLoopTimes = 3
    iter_search_times = 1
    
    for tmpClfName in clfList:
        for i in range(0, eachClfLoopTimes):
            fab._subFolderName = tmpClfName
            fab._n_iter_search = iter_search_times
            fab._expInfo = expInfo
            if tmpClfName == "rf":
                clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "knn":
if __name__ == '__main__':

    # 1. read in data
    expNo = "013"
    expInfo = expNo + "_data_exploration"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._subFolderName = "binary"
    fab._n_iter_search = 50
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)

    musicAlarm()
    # Test all data
    modelList = [
        "Xgboost", "Random_Forest", "Extra_Trees", "K_NN",
        "Logistic_Regression"
    ]
#     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]

#     for tmpFeature in featureList:
#     modelFolder = _basePath + "models" + Config.osSep 
#     for tmpModel in modelList:  
#         curModel = tmpModel
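#     a sketch of where this commented loop was headed: load each saved model and
#     predict on the test frame (loadModel() is the repo helper seen in Example #1;
#     the "<name>.model" naming inside the models folder is an assumption, since the
#     saved files above carry timestamped names):
#         clf = loadModel(modelFolder + curModel + ".model")
#         log(clf.predict_proba(dr._testDataFrame))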
Example #12
    # #     for i in range(0,3):
    #         tmpIdx = ans2List[random.randint(0, len(ans2List)-1)]
    #         tmpAnsCol = pd.DataFrame()
    #         tmpAnsCol[0] = 2
    #         newX = newX.append(newX.iloc()[tmpIdx])
    #         newX[newX.columns[0]][len(newX)-1] = 2
    #         print i
    #
    #     #print newX.iloc()[0]
    #     tmpOutPath = _basePath + "location_log_feature_over_sampling.csv"
    #     print len(newX)
    #     newX.to_csv(tmpOutPath, sep=',', encoding='utf-8')
    #
    #print len(newX)
    # Get all best models from newX
    fab = ModelFactory()
    fab._setXgboostTheradToOne = True
    fab._gridSearchFlag = True
    fab._singleModelMail = True
    fab._subFolderName = "ismail4"
    fab._n_iter_search = 10
    fab._expInfo = expInfo
    #         fab.getAllModels(newX, newY)
    #fab.getRandomForestClf(newX, newY)
    fab.getXgboostClf(newX, newY)
    #         fab.getXgboostClf(newX, newY)
    #    log ( i , "/32 done..." )

    # musicAlarm()
    # Test all data
    modelList = [
Example #13
    newY = dr._ansDataFrame
    
#     t = pd.get_dummies(newY)
#     finalList = []
#     for tmp in range(0,len(t)):
#         tmpList =[]
#         for i in range(0, len(t.ix[tmp])):
#             tmpList.append( int( t.ix[tmp][i]))
#         finalList.append(tmpList)
#     print finalList
    #exit()
     
 
    #print len(newX)        
    # Get all best models from newX
    fab = ModelFactory()
    fab._setXgboostTheradToOne = False
    fab._gridSearchFlag = True
    fab._subFolderName = "ismail3"  
    fab._n_iter_search = 1
    fab._expInfo = expInfo
  
    #clf = fab.getXgboostClf(newX, newY)
    clf = fab.getRandomForestClf(newX, newY)
    #print fab.getLogloss(clf,newX,newY)


    
    def llfun(act, pred):
        # clipped binary log loss (the snippet cut off mid-function; the remaining
        # lines are the usual completion, with sp = scipy at module level)
        epsilon = 1e-15
        pred = sp.maximum(epsilon, pred)
        pred = sp.minimum(1 - epsilon, pred)
        ll = sum(act * sp.log(pred) + sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
        return ll * -1.0 / len(act)