Example 1
    testPath = _basePath + expInfo + "_test_tobe.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        # newX = pd.DataFrame(newX[newX.columns[0]])
        # print newX

    # 3. get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 30
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)

    # 4. test all data, output the 3 answer columns as features
    #D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
    #D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
    #D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
    #D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model

    # modelPath = _basePath + "(K_NN)_(2016-02-03_20_32_22).model"
    # tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
    # tmpClf = loadModel(modelPath)
    # log(tmpClf.predict_proba(newX))
    # outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
    # outDf = pd.DataFrame(tmpClf.predict_proba(newX))
    expNo = "011"
    expInfo = expNo + "_remove_one_hot" 
    _basePath = Config.FolderBasePath + expInfo + Config.osSep
    
    path = _basePath + expNo + "_train_tobe.csv"
    testPath = _basePath + expNo + "_test_tobe.csv"
    
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    
    # Get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    # fab._subFolderName = "stacked"
    fab._n_iter_search = 100
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    
    # Test all data
    modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
#     featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
    
#     for tmpFeature in featureList:
    modelFolder = _basePath + "models" + Config.osSep 
    for tmpModel in modelList:  
        curModel = tmpModel
        
        dr = DataReader()
        newX = dr.cvtPathListToDfList(testPath, "test")
        
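A minimal sketch of the blending step that the commented block above drafts: load each saved sklearn-style model, take its class probabilities on the test frame, and append them as new feature columns. loadModel, newX and _basePath are the example's own names; the file names and column labels below are illustrative assumptions, not the repo's actual artifacts.

    # Hedged sketch: append each saved model's class probabilities as features.
    import pandas as pd

    probaFrames = []
    for modelName in ["Random_Forest", "Extra_Trees", "K_NN"]:
        tmpClf = loadModel(_basePath + "(" + modelName + ").model")  # placeholder file name
        proba = pd.DataFrame(tmpClf.predict_proba(newX))
        proba.columns = [modelName + "_class_" + str(c) for c in proba.columns]
        probaFrames.append(proba)
    outDf = pd.concat([newX] + probaFrames, axis=1)
    outDf.to_csv(_basePath + "002_submission_blend.csv", index=False)
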
Example 3
   
    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame
    if doTestFlag:
        dr.readInCSV(testPath, "test")
        newX = dr._testDataFrame
        # newX = pd.DataFrame(newX[newX.columns[0]])
        # print newX
 
    
    # 3. get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 30
    fab._expInfo = expInfo
    fab.getAllModels(newX, newY)
    
    # 4. test all data, output the 3 answer columns as features
    #D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
    #D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
    #D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
    #D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model
    
    # modelPath = _basePath + "(K_NN)_(2016-02-03_20_32_22).model"
    # tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
    # tmpClf = loadModel(modelPath)
    # log(tmpClf.predict_proba(newX))
    # outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
    # outDf = pd.DataFrame(tmpClf.predict_proba(newX))
Example 4
def exp():
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo

    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag:
        dr.readInCSV(testPath, "test")

    # 2. run models
    # print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"

    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    # fab.getRandomForestClf(X, Y)
    # fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)

    # log("xgb start")
    # param = {'max_depth': 10, 'n_estimators': 300, 'num_class': 3, 'learning_rate': 0.05, 'objective': 'multi:softprob'}
    # num_round = 5
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame)
    # testResult = gbm.predict_proba(dr._testDataFrame)
    # print testResult
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')

    # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs=-1)
    # log("xgboost Validation Precision: ", scores.mean())
    # xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5, metrics={'error'}, seed=0)
    # gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame)
    # joblib.dump(gbTrain, xgbModelPath)
    # clf = joblib.load(xgbModelPath)
    # clf.predict_proba(dr._testDataFrame)
    # xgb.save(gbm, xgbModelPath)
    # print xgbCv
    # print "xgb end"

    # gbm = joblib.load(xgbModelPath)
    # finalClf = gbm

    if doTestFlag:
        print finalClf.predict_proba(dr._testDataFrame)


    # featureImportance = []
    # for i in range(0, len(finalClf.feature_importances_)):
    #     if i != len(dr._trainDataFrame.columns):
    #         if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
    #             featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]])
    #
    # print featureImportance
    # featureImportance.sort(key=lambda pair: pair[1], reverse=True)
    # print featureImportance

    if doTestFlag:
        return finalClf.predict_proba(dr._testDataFrame)
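The commented fragments above assemble into one cross-validation run. A hedged sketch, with two assumptions of mine: 'n_estimators' is dropped from the param dict (it belongs to the sklearn wrapper, not xgb.cv), and 'merror' replaces 'error' since the objective is multi-class.

    # Hedged sketch of the xgb.cv flow drafted in the comments above.
    import xgboost as xgb

    param = {'max_depth': 10, 'num_class': 3, 'learning_rate': 0.05,
             'objective': 'multi:softprob'}
    num_round = 5
    dtrain = xgb.DMatrix(dr._trainDataFrame, label=dr._ansDataFrame)
    xgbCv = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'merror'}, seed=0)
    print xgbCv  # per-round mean/std of train and test merror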
Example 5
    expNo = "012"
    expInfo = expNo + "_rf_chk_important"
    _basePath = Config.FolderBasePath + expInfo + Config.osSep

    path = _basePath + expNo + "_train_asis.csv"
    testPath = _basePath + expNo + "_test_asis.csv"

    dr = DataReader()
    dr.readInCSV(path, "train")
    newX, newY = dr._trainDataFrame, dr._ansDataFrame

    # Get all best models from newX
    fab = ModelFactory()
    fab._gridSearchFlag = True
    # fab._subFolderName = "stacked"
    fab._n_iter_search = 250
    fab._expInfo = expInfo
    # fab.getAllModels(newX, newY)
    finalClf = fab.getRandomForestClf(newX, newY)

    featureImportance = []
    for i in range(0, len(finalClf.feature_importances_)):
        if i != len(dr._trainDataFrame.columns):
            # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
            featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]])

    # log(featureImportance)
    featureImportance.sort(key=lambda pair: pair[1], reverse=True)
    log(featureImportance)

    trainNewX = dr._ansDataFrame
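The index loop above can be written with zip, which pairs each column name with its importance directly and drops the off-by-one guard; a sketch with the same names and, as far as I can tell, the same output:

    # Equivalent pairing of column names with importances, via zip.
    featureImportance = sorted(
        zip(dr._trainDataFrame.columns, finalClf.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True)
    log(featureImportance)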
Example 6
    clfList = ["xgboost", "rf", "extra_tree"]

    fab = ModelFactory()
    fab._gridSearchFlag = True
    
    dfUpper = pd.DataFrame()
    dfTestUpper = pd.DataFrame()
    eachClfLoopTimes = 3
    iter_search_times = 1
    
    for tmpClfName in clfList:
        for i in range(0,eachClfLoopTimes):
            fab._subFolderName = tmpClfName
            fab._n_iter_search = iter_search_times
            fab._expInfo = expInfo
            if tmpClfName == "rf":
                clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "knn":
                clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "extra_tree":
                clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
            elif tmpClfName == "xgboost":
                clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)
             
            if tmpClfName == "xgboost":
                predictResult = clf.predict(xgb.DMatrix(train_fold_2))
                predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
            else:
                predictResult = clf.predict_proba(train_fold_2)
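The snippet cuts off inside the loop. In a two-fold stacking setup like this one, the fold-2 probabilities would typically be collected into dfUpper and dfTestUpper as meta-features for an upper-level learner. A hedged guess at the pattern, not the repo's actual tail:

    # Assumed continuation: stack each learner's fold-2 class probabilities
    # as new columns of the upper-level train/test frames.
    predictTestResult = clf.predict_proba(test_fold_2)
    dfUpper = pd.concat([dfUpper, pd.DataFrame(predictResult)], axis=1)
    dfTestUpper = pd.concat([dfTestUpper, pd.DataFrame(predictTestResult)], axis=1)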