path = _basePath + expInfo + "_train_tobe.csv"
testPath = _basePath + expInfo + "_test_tobe.csv"

# 1. read data
dr = DataReader()
dr.readInCSV(path, "train")
newX, newY = dr._trainDataFrame, dr._ansDataFrame

if doTestFlag == True:
    dr.readInCSV(testPath, "test")
    newX = dr._testDataFrame
    #newX = pd.DataFrame(newX[newX.columns[0]])
    #print newX

# 3. get all best models from newX
fab = ModelFactory()
fab._gridSearchFlag = True
fab._n_iter_search = 30
fab._expInfo = expInfo
fab.getAllModels(newX, newY)

# 4. test all data, output 3 ans as features
#D:\Kaggle\Telstra\002_blending\(Xgboost)_(2016-02-03_20_09_03).model
#D:\Kaggle\Telstra\002_blending\(Random_Forest)_(2016-02-03_20_16_16).model
#D:\Kaggle\Telstra\002_blending\(Extra_Trees)_(2016-02-03_20_21_58).model
#D:\Kaggle\Telstra\002_blending\(K_NN)_(2016-02-03_20_32_22).model
# modelPath = _basePath + "(K_NN)_(2016-02-03_20_32_22).model"
# tmpOutPath = _basePath + "002_submission_1_K_NN.csv"
# tmpClf = loadModel(modelPath)
# log(tmpClf.predict_proba(newX))
# outDf = pd.concat([newX, pd.DataFrame(tmpClf.predict_proba(newX))], axis=1)
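# --- Hedged sketch (not from the original script): one way step 4 could be
# completed for all four saved models at once. Assumes loadModel() is this
# repo's model loader (as used in the commented-out lines above) and that each
# .model file exposes predict_proba(); "002_blend_features.csv" is an
# illustrative output name, not one the original defines.
blendModelPaths = [
    _basePath + "(Xgboost)_(2016-02-03_20_09_03).model",
    _basePath + "(Random_Forest)_(2016-02-03_20_16_16).model",
    _basePath + "(Extra_Trees)_(2016-02-03_20_21_58).model",
    _basePath + "(K_NN)_(2016-02-03_20_32_22).model",
]
outDf = newX
for tmpModelPath in blendModelPaths:
    tmpClf = loadModel(tmpModelPath)
    # append each model's 3 class probabilities as new feature columns
    probaDf = pd.DataFrame(tmpClf.predict_proba(newX))
    outDf = pd.concat([outDf, probaDf], axis=1)
outDf.to_csv(_basePath + "002_blend_features.csv", index=False)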
def exp():
    expInfo = "location_only\\"
    _basePath = Config.FolderBasePath + expInfo
    doTestFlag = False
    path = _basePath + "train.csv"
    testPath = _basePath + "test10.csv"

    # 1. read data
    dr = DataReader()
    dr.readInCSV(path, "train")
    if doTestFlag == True:
        dr.readInCSV(testPath, "test")

    # 2. run models
    #print dr._trainDataFrame.as_matrix
    fab = ModelFactory()
    fab._gridSearchFlag = True
    fab._n_iter_search = 10
    fab._expInfo = "location_only"
    X = dr._trainDataFrame
    Y = dr._ansDataFrame
    #fab.getRandomForestClf(X, Y)
    #fab.getAllModels(dr._trainDataFrame, dr._ansDataFrame)

    # log("xgb start")
    # param = {'max_depth':10, 'n_estimators':300, 'num_class':3, 'learning_rate':0.05, 'objective':'multi:softprob'}
    # num_round = 5
    #gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob').fit(dr._trainDataFrame, dr._ansDataFrame)
    #testResult = gbm.predict_proba(dr._testDataFrame)
    #print testResult
    # gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.05, objective='multi:softprob')
    # scores = cross_val_score(gbm, dr._trainDataFrame, dr._ansDataFrame, n_jobs=-1)
    # log("xgboost validation accuracy: ", scores.mean())
    #xgbCv = xgb.cv(param, xgb.DMatrix(dr._trainDataFrame, dr._ansDataFrame), num_round, nfold=5, metrics={'error'}, seed=0)
    #gbTrain = gbm.fit(dr._trainDataFrame, dr._ansDataFrame)
    #joblib.dump(gbTrain, xgbModelPath)
    #clf = joblib.load(xgbModelPath)
    #clf.predict_proba(dr._testDataFrame)
    #print xgbCv
    #print "xgb end"
    #gbm = joblib.load(xgbModelPath)
    #finalClf = gbm

    # NOTE: finalClf is only bound when one of the commented-out blocks above
    # is re-enabled; with doTestFlag = False these branches never run.
    if doTestFlag == True:
        print finalClf.predict_proba(dr._testDataFrame)

    # featureImportance = []
    # for i in range(0, len(finalClf.feature_importances_)):
    #     if i != len(dr._trainDataFrame.columns):
    #         if (dr._trainDataFrame.columns[i]).find("_one_hot") == -1:
    #             featureImportance.append([dr._trainDataFrame.columns[i], finalClf.feature_importances_[i]])
    #
    # print featureImportance
    # featureImportance.sort(key=lambda x: x[1], reverse=True)
    # print featureImportance

    if doTestFlag == True:
        return finalClf.predict_proba(dr._testDataFrame)
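# --- Hedged sketch (illustrative, not part of the original script): the
# cross-validation run that the commented-out xgboost block in exp() gestures
# at, using only calls that exist in xgboost's sklearn wrapper and the
# pre-0.18 scikit-learn module layout this script's era implies. Note that
# cross_val_score's default scoring for a classifier is accuracy, not the
# "precision" the original log message claimed. xgbCvSketch is a hypothetical
# helper name.
def xgbCvSketch(X, Y):
    import xgboost as xgb
    from sklearn.cross_validation import cross_val_score  # sklearn < 0.18 layout
    gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300,
                            learning_rate=0.05, objective='multi:softprob')
    # 5-fold CV on the training frame, all cores
    scores = cross_val_score(gbm, X, Y, cv=5, n_jobs=-1)
    print "xgboost 5-fold CV accuracy:", scores.mean()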