testPath = _basePath + expInfo + "_test_tobe.csv" # 1. read data dr = DataReader() dr.readInCSV(path, "train") newX, newY = dr._trainDataFrame, dr._ansDataFrame if doTestFlag == True: dr.readInCSV(testPath, "test") newX = dr._testDataFrame #newX = pd.DataFrame(newX[newX.columns[0]]) #print newX # 3. get all best model from newX fab = ModelFactory() fab._gridSearchFlag = True fab._n_iter_search = 30 fab._expInfo = expInfo fab.getAllModels(newX, newY) # 4. test all data, output 3 ans as features #D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model #D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model #D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model #D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model # modelPath = _basePath+"(K_NN)_(2016-02-03_20_32_22).model" # tmpOutPath = _basePath + "002_submission_1_K_NN.csv" # tmpClf = loadModel( modelPath) # log(tmpClf.predict_proba(newX)) # outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) # outDf = pd.DataFrame(tmpClf.predict_proba(newX))
# --- Experiment 011: "remove one hot" ---
# Reads the experiment-011 train CSV, grid-searches all models, then starts a
# per-model prediction loop over the saved models.
# NOTE(review): the `for tmpModel` loop body is cut off at the end of this chunk;
# its continuation (loading the model from `modelFolder`, predicting) is not visible.
expNo = "011"
expInfo = expNo + "_remove_one_hot"
_basePath = Config.FolderBasePath + expInfo + Config.osSep
path = _basePath + expNo + "_train_tobe.csv"
testPath = _basePath + expNo + "_test_tobe.csv"

dr = DataReader()
dr.readInCSV( path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame

# Get all best model from newX
fab = ModelFactory()
fab._gridSearchFlag = True
# fab._subFolderName = "stacked"
fab._n_iter_search = 100   # heavier search than exp 002 (which used 30)
fab._expInfo = expInfo
fab.getAllModels(newX, newY)

# Test all data
modelList = ["Xgboost","Random_Forest","Extra_Trees", "K_NN", "Logistic_Regression"]
# featureList = ["event_type", "log_feature", "resource_type", "severity_type"]
# for tmpFeature in featureList:
modelFolder = _basePath + "models" + Config.osSep
for tmpModel in modelList:
    # curModel is not used in the visible part of the loop — presumably used
    # by the continuation below this chunk; verify against the full file.
    curModel = tmpModel
    dr = DataReader()
    newX = dr.cvtPathListToDfList(testPath, "test")
# 1. read data dr = DataReader() dr.readInCSV( path, "train") newX, newY = dr._trainDataFrame, dr._ansDataFrame if doTestFlag == True: dr.readInCSV(testPath , "test") newX = dr._testDataFrame #newX = pd.DataFrame(newX[newX.columns[0]]) #print newX # 3. get all best model from newX fab = ModelFactory() fab._gridSearchFlag = True fab._n_iter_search = 30 fab._expInfo = expInfo fab.getAllModels(newX, newY) # 4. test all data, output 3 ans as features #D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model #D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model #D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model #D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model # modelPath = _basePath+"(K_NN)_(2016-02-03_20_32_22).model" # tmpOutPath = _basePath + "002_submission_1_K_NN.csv" # tmpClf = loadModel( modelPath) # log(tmpClf.predict_proba(newX)) # outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1) # outDf = pd.DataFrame(tmpClf.predict_proba(newX))
def exp():
    # Run the "location_only" experiment: read the training CSV and (optionally)
    # a test CSV, then set up a ModelFactory for grid search.  Most of the
    # modelling code below is commented out; with doTestFlag = False this
    # function currently returns None.
    #
    # NOTE(review): `finalClf` is never assigned (its only assignments are in
    # commented-out lines), so flipping doTestFlag to True would raise
    # NameError at both `finalClf.predict_proba(...)` uses below — confirm
    # before enabling the test path.
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo
    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True   # enable randomized/grid hyper-parameter search
    fab._n_iter_search = 10
    fab._expInfo = "location_only"
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    # X/Y are read but unused while the model calls below stay commented out.
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)
    # log( "xgb start")
    # param = {'max_depth':10, 'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    # num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')
    # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs = -1)
    # log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"
    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm
    if doTestFlag == True:
        # Python 2 print statement; see NameError note above.
        print finalClf.predict_proba(dr._testDataFrame)

    # featureImportance =[]
    # for i in range(0,len(finalClf.feature_importances_)):
    #     if i != len(dr._trainDataFrame.columns):
    #         if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
    #             featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
    #
    # print featureImportance
    # featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
    # print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)
# --- Experiment 012: random-forest feature-importance check ---
# Reads the experiment-012 train CSV, fits a grid-searched random forest, then
# logs [column_name, importance] pairs sorted by importance, descending.
expNo = "012"
expInfo = expNo + "_rf_chk_important"
_basePath = Config.FolderBasePath + expInfo + Config.osSep
path = _basePath + expNo + "_train_asis.csv"
testPath = _basePath + expNo + "_test_asis.csv"

dr = DataReader()
dr.readInCSV(path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame

# Get all best model from newX
fab = ModelFactory()
fab._gridSearchFlag = True   # enable randomized/grid hyper-parameter search
# fab._subFolderName = "stacked"
fab._n_iter_search = 250
fab._expInfo = expInfo
# fab.getAllModels(newX, newY)
finalClf = fab.getRandomForestClf(newX, newY)

# Pair each importance with its column name.  The `i != len(columns)` guard is
# kept from the original — presumably it skips a trailing importance with no
# matching column; TODO confirm importances/columns lengths actually differ.
featureImportance = []
for i, importance in enumerate(finalClf.feature_importances_):
    if i != len(dr._trainDataFrame.columns):
        # if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
        featureImportance.append([dr._trainDataFrame.columns[i], importance])
# log( featureImportance)

# Sort by importance, descending.  key= replaces the original Python-2-only
# cmp-comparator sort (same ordering, works on Python 2 and 3, and avoids a
# Python-level comparator call per comparison).
featureImportance.sort(key=lambda pair: pair[1], reverse=True)
log(featureImportance)

# NOTE(review): this assigns the ANSWER frame (labels), not features, to
# trainNewX — looks intentional for a downstream stacking step, but verify.
trainNewX = dr._ansDataFrame
# --- Stacking: train each base classifier on fold 1, predict on fold 2 ---
# NOTE(review): this chunk depends on `expInfo`, `train_fold_1`,
# `train_fold_label_1`, `train_fold_2` and `test_fold_2`, all defined upstream,
# and the inner loop is cut off at the end (the handling of predictResult /
# predictTestResult is not visible).
clfList = ["xgboost", "rf","extra_tree",]

fab = ModelFactory()
fab._gridSearchFlag = True
dfUpper = pd.DataFrame()       # accumulators — filled below this chunk, presumably
dfTestUpper = pd.DataFrame()
eachClfLoopTimes = 3           # each classifier is trained 3 times
iter_search_times = 1          # single search iteration per fit

for tmpClfName in clfList:
    for i in range(0,eachClfLoopTimes):
        fab._subFolderName = tmpClfName
        fab._n_iter_search = iter_search_times
        fab._expInfo = expInfo
        if tmpClfName == "rf":
            clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "knn":
            # Unreachable with the clfList above ("knn" is not listed).
            clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "extra_tree":
            # "Tress" is the project method's actual (misspelled) name.
            clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "xgboost":
            clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)
        if tmpClfName == "xgboost":
            # xgboost Booster takes a DMatrix; predict() output shape depends
            # on the training objective — TODO confirm it matches predict_proba.
            predictResult = clf.predict(xgb.DMatrix(train_fold_2))
            predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
        else:
            predictResult = clf.predict_proba(train_fold_2)
# NOTE(review): this chunk begins mid list-literal — the `clfList = [` opener
# (and any earlier entries) is above the visible region.
"rf",
"extra_tree",
]

# --- Stacking: train each base classifier on fold 1, predict on fold 2 ---
# (near-duplicate of the chunk above; same upstream dependencies: `expInfo`,
# `train_fold_1`, `train_fold_label_1`, `train_fold_2`, `test_fold_2`; the
# inner loop is again cut off at the end of the chunk.)
fab = ModelFactory()
fab._gridSearchFlag = True
dfUpper = pd.DataFrame()
dfTestUpper = pd.DataFrame()
eachClfLoopTimes = 3           # each classifier is trained 3 times
iter_search_times = 1          # single search iteration per fit

for tmpClfName in clfList:
    for i in range(0, eachClfLoopTimes):
        fab._subFolderName = tmpClfName
        fab._n_iter_search = iter_search_times
        fab._expInfo = expInfo
        if tmpClfName == "rf":
            clf = fab.getRandomForestClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "knn":
            clf = fab.getKnnClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "extra_tree":
            # "Tress" is the project method's actual (misspelled) name.
            clf = fab.getExtraTressClf(train_fold_1, train_fold_label_1)
        elif tmpClfName == "xgboost":
            clf = fab.getXgboostClf(train_fold_1, train_fold_label_1)
        if tmpClfName == "xgboost":
            # xgboost Booster takes a DMatrix; predict() output shape depends
            # on the training objective — TODO confirm it matches predict_proba.
            predictResult = clf.predict(xgb.DMatrix(train_fold_2))
            predictTestResult = clf.predict(xgb.DMatrix(test_fold_2))
        else:
            predictResult = clf.predict_proba(train_fold_2)
def exp():
    # Duplicate of the earlier exp(): run the "location_only" experiment —
    # read the training CSV and (optionally) a test CSV, then set up a
    # ModelFactory for grid search.  Most modelling code is commented out;
    # with doTestFlag = False this function currently returns None.
    #
    # NOTE(review): `finalClf` is never assigned (its only assignments are in
    # commented-out lines), so flipping doTestFlag to True would raise
    # NameError at both `finalClf.predict_proba(...)` uses below — confirm
    # before enabling the test path.
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo
    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV( path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath , "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True   # enable randomized/grid hyper-parameter search
    fab._n_iter_search = 10
    fab._expInfo = "location_only"
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    # X/Y are read but unused while the model calls below stay commented out.
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)
    # log( "xgb start")
    # param = {'max_depth':10, 'n_estimators':300 , 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    # num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')
    # scores = cross_val_score(rfClf, dr._trainDataFrame, dr._ansDataFrame, n_jobs = -1)
    # log( "xgboost Validation Precision: ", scores.mean() )
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5,metrics={'error'}, seed = 0)
    #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load( xgbModelPath )
    #clf.predict_proba(dr._testDataFrame)
    #xgb.save(gbm, xgbModelPath)
    #print xgbCv
    #print "xgb end"
    #gbm = joblib.load( xgbModelPath )
    #finalClf = gbm
    if doTestFlag == True:
        # Python 2 print statement; see NameError note above.
        print finalClf.predict_proba(dr._testDataFrame)

    # featureImportance =[]
    # for i in range(0,len(finalClf.feature_importances_)):
    #     if i != len(dr._trainDataFrame.columns):
    #         if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
    #             featureImportance.append( [dr._trainDataFrame.columns[i] , finalClf.feature_importances_[i]] )
    #
    # print featureImportance
    # featureImportance.sort(lambda x, y: cmp(x[1], y[1]), reverse=True)
    # print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)