# Build the design matrix for this feature subset: load each feature's CSV
# and glue the resulting frames together column-wise.
# NOTE(review): the path ends in "_train.csv" but cvtPathListToDfList is
# called with mode "test" — confirm this combination is intended.
log(tmpCurFeatureList)
newX = pd.DataFrame()
for feature_name in tmpCurFeatureList:
    csv_path = _basePath + feature_name + "_train.csv"
    reader = DataReader()
    feature_df = reader.cvtPathListToDfList(csv_path, "test")
    newX = pd.concat([newX, feature_df], axis=1)
# log("feature len: ", len(newX))

# Grid-search a random forest over the assembled features for iteration i.
factory = ModelFactory()
factory._setXgboostTheradToOne = True
factory._gridSearchFlag = True
factory._onlyTreeBasedModels = True
factory._subFolderName = "one_hot_each_" + str(i)
factory._n_iter_search = 30
factory._expInfo = expInfo
# factory.getAllModels(newX, newY)
factory.getRandomForestClf(newX, newY)
# factory.getXgboostClf(newX, newY)
log(i, "/32 done...")
# t = pd.get_dummies(newY) # finalList = [] # for tmp in range(0,len(t)): # tmpList =[] # for i in range(0, len(t.ix[tmp])): # tmpList.append( int( t.ix[tmp][i])) # finalList.append(tmpList) # print finalList #exit() #print len(newX) # Get all best model from newX fab = ModelFactory() fab._setXgboostTheradToOne = False fab._gridSearchFlag = True fab._subFolderName = "ismail3" fab._n_iter_search = 1 fab._expInfo = expInfo #clf = fab.getXgboostClf(newX, newY) clf = fab.getRandomForestClf(newX, newY) #print fab.getLogloss(clf,newX,newY) def llfun(act, pred): epsilon = 1e-15 pred = sp.maximum(epsilon, pred) pred = sp.minimum(1-epsilon, pred)