def train():
    """Train an RF + GBDT ensemble and map blended predictions to position names.

    Returns a dict mapping each test id to its predicted position name
    (as resolved by dl.get_num_position).
    """
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)
    # Keep only samples whose target class is < 32.
    # NOTE(review): 32 is a magic threshold — presumably the number of valid
    # position classes; confirm against the label encoding.
    keep_mask = np.array([m < 32 for m in train_tar_list])
    target_list = np.array(train_tar_list)[keep_mask]
    # BUG FIX: this row filter was commented out while target_list WAS
    # filtered, so X and y had mismatched lengths in fit(). The original
    # also referenced an undefined name `trainFeature` below; unified to
    # the filtered train_feature.
    train_feature = train_feature[keep_mask]
    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    # Blend the two models by summing their class-probability matrices
    # (equal weighting); predict the argmax class per row.
    all_prob = rf_prob + gbdt_prob
    all_preds = []
    print(all_prob.shape)
    for k in range(all_prob.shape[0]):
        # BUG FIX: original indexed `allProb` and searched `prob` /
        # appended to `allPreds` — all undefined in this scope.
        prob1 = list(all_prob[k, :])
        all_preds.append(prob1.index(max(prob1)))
    for j in range(len(all_preds)):
        posi_result[test_id_list[j]] = dl.get_num_position(all_preds[j])
    return posi_result
def train():
    """Train an RF + GBDT ensemble and map blended predictions to position names.

    Returns a dict mapping each test id to its predicted position name
    (as resolved by dl.get_num_position).
    """
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(
        feature_str)
    # Keep only samples whose target class is < 32.
    # NOTE(review): 32 is a magic threshold — presumably the number of valid
    # position classes; confirm against the label encoding.
    keep_mask = np.array([m < 32 for m in train_tar_list])
    target_list = np.array(train_tar_list)[keep_mask]
    # BUG FIX: this row filter was commented out while target_list WAS
    # filtered, so X and y had mismatched lengths in fit(). The original
    # also referenced an undefined name `trainFeature` below; unified to
    # the filtered train_feature.
    train_feature = train_feature[keep_mask]
    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    # Blend the two models by summing their class-probability matrices
    # (equal weighting); predict the argmax class per row.
    all_prob = rf_prob + gbdt_prob
    all_preds = []
    print(all_prob.shape)
    for k in range(all_prob.shape[0]):
        # BUG FIX: original indexed `allProb` and searched `prob` /
        # appended to `allPreds` — all undefined in this scope.
        prob1 = list(all_prob[k, :])
        all_preds.append(prob1.index(max(prob1)))
    for j in range(len(all_preds)):
        posi_result[test_id_list[j]] = dl.get_num_position(all_preds[j])
    return posi_result
# --- Script fragment: train RF and GBDT, evaluate their probability-sum blend.
# Depends on names defined earlier/elsewhere in the file: trainIdList, tmp,
# trainFeature, target, testFeature, dl, outFile1.
trainIdList = trainIdList[tmp]
cFeature = trainFeature.columns[:]
clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
clf.fit(trainFeature[cFeature], target)
rfPreds = clf.predict(testFeature)
rfProb = clf.predict_proba(testFeature)
gbdt = GradientBoostingClassifier(n_estimators=100, min_samples_split=17)
gbdt.fit(trainFeature[cFeature], target)
gbdtPreds = gbdt.predict(testFeature)
gbdtProb = gbdt.predict_proba(testFeature)
# Blend the two models by summing their class probabilities, then take
# the argmax class per row.
allProb = rfProb + gbdtProb
allPreds = []
print(allProb.shape)
for tt in range(allProb.shape[0]):
    prob = list(allProb[tt, :])
    allPreds.append(prob.index(max(prob)))
rfRight, gbRight, allRight = 0, 0, 0
for i in range(len(rfPreds)):
    rfPreName = dl.get_num_position(rfPreds[i])
    # BUG FIX: originally dl.get_num_position(rfPreds[i]) — the RF
    # predictions were used for the GBDT name as well.
    gbdtPreName = dl.get_num_position(gbdtPreds[i])
    allPreName = dl.get_num_position(allPreds[i])
    # NOTE(review): the accuracy counters are never incremented — the
    # comparison lines are commented out (and `real` is undefined here),
    # so all three printed rates are 0. The gb line also compared
    # rfPreName in the original; corrected in the comment below.
    # if rfPreName == real: rfRight += 1.0
    # if gbdtPreName == real: gbRight += 1.0
    # if allPreName == real: allRight += 1.0
    # outFile1.write(testIdList[i]+'\t'+rfPreName+'\t'+real+'\n')
    # outFile2.write(testIdList[i]+'\t'+gbdtPreName+'\t'+real+'\n')
    # outFile3.write(testIdList[i]+'\t'+allPreName+'\t'+real+'\n')
print('rf:' + str(rfRight / 20000) + '\n gbdt:' + str(gbRight / 20000) + '\n all:' + str(allRight / 20000))
outFile1.write(str(rfRight / 20000))
# --- Script fragment: 15-fold cross-validation of a RandomForest.
# Depends on names defined earlier/elsewhere in the file: idList, tFeature,
# target, outFiles, dl.
idList = np.array(idList)
# Assign each of the 20000 samples a fold id 0..14 (round-robin).
# NOTE(review): hard-codes the sample count at 20000 — confirm it matches
# len(tFeature).
tt = []
for i in range(20000):
    tt.append(i % 15)
tFeature['is_train'] = tt
rightAll = 0
for i in range(15):
    print i
    # Fold i is the hold-out set; all other folds train.
    train, test = tFeature[tFeature['is_train'] != i], tFeature[
        tFeature['is_train'] == i]
    tmp1 = np.array([t != i for t in tFeature['is_train']])
    tmp2 = np.array([t == i for t in tFeature['is_train']])
    trainTar, testTar = target[tmp1], target[tmp2]
    testId = idList[tmp2]
    clf = RandomForestClassifier(
        n_estimators=200,
        min_samples_split=13)  # ,max_depth=35,max_features=0.4)
    # Exclude the 'is_train' fold column (last column) from the features.
    features = tFeature.columns[:-1]
    clf.fit(train[features], trainTar)
    preds = clf.predict(test[features])
    right = 0
    for n in range(len(preds)):
        # NOTE(review): reconstructed from a collapsed line — the original
        # formatting is ambiguous about whether the write below sits inside
        # the `if`; logging every prediction (not only hits) is assumed.
        if preds[n] == testTar[n]:
            right += 1.0
            rightAll += 1.0
        outFiles.write(testId[n] + '\t' + dl.get_num_position(preds[n]) +
                       '\t' + dl.get_num_position(testTar[n]) + '\n')
    # Per-fold accuracy.
    print right / len(preds)
outFiles.close()
# Overall accuracy across all folds (again assumes exactly 20000 samples).
print rightAll / 20000
# --- Script fragment: build features, then 15-fold cross-validate a
# RandomForest. Depends on names defined earlier/elsewhere in the file:
# get_feature, Tlines1, Tlines2, TrfProb, TgbdtProb, tarList, idList,
# outFiles, dl.
tFeature = get_feature(Tlines1, Tlines2, TrfProb, TgbdtProb)
# NOTE(review): eFeature is built from the SAME T* inputs as tFeature
# (likely meant E* evaluation inputs) and is never used in this fragment —
# confirm intent.
eFeature = get_feature(Tlines1, Tlines2, TrfProb, TgbdtProb)
target = np.array(tarList)
idList = np.array(idList)
# Assign each of the 20000 samples a fold id 0..14 (round-robin).
# NOTE(review): hard-codes the sample count at 20000 — confirm it matches
# len(tFeature).
tt = []
for i in range(20000):
    tt.append(i % 15)
tFeature['is_train'] = tt
rightAll = 0
for i in range(15):
    print i
    # Fold i is the hold-out set; all other folds train.
    train, test = tFeature[tFeature['is_train'] != i], tFeature[tFeature['is_train'] == i]
    tmp1 = np.array([t != i for t in tFeature['is_train']])
    tmp2 = np.array([t == i for t in tFeature['is_train']])
    trainTar, testTar = target[tmp1], target[tmp2]
    testId = idList[tmp2]
    clf = RandomForestClassifier(n_estimators=200,
                                 min_samples_split=13)  # ,max_depth=35,max_features=0.4)
    # Exclude the 'is_train' fold column (last column) from the features.
    features = tFeature.columns[:-1]
    clf.fit(train[features], trainTar)
    preds = clf.predict(test[features])
    right = 0
    for n in range(len(preds)):
        # NOTE(review): reconstructed from a collapsed line — the original
        # formatting is ambiguous about whether the write below sits inside
        # the `if`; logging every prediction (not only hits) is assumed.
        if preds[n] == testTar[n]:
            right += 1.0
            rightAll += 1.0
        outFiles.write(
            testId[n] + '\t' + dl.get_num_position(preds[n]) + '\t' +
            dl.get_num_position(testTar[n]) + '\n')
    # Per-fold accuracy.
    print right / len(preds)
outFiles.close()
# Overall accuracy across all folds (again assumes exactly 20000 samples).
print rightAll / 20000
# --- Script fragment: train RF and GBDT, evaluate their probability-sum blend.
# Depends on names defined earlier/elsewhere in the file: trainFeature,
# target, testFeature, dl, outFile1.
cFeature = trainFeature.columns[:]
clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
clf.fit(trainFeature[cFeature], target)
rfPreds = clf.predict(testFeature)
rfProb = clf.predict_proba(testFeature)
gbdt = GradientBoostingClassifier(n_estimators=100, min_samples_split=17)
gbdt.fit(trainFeature[cFeature], target)
gbdtPreds = gbdt.predict(testFeature)
gbdtProb = gbdt.predict_proba(testFeature)
# Blend the two models by summing their class probabilities, then take
# the argmax class per row.
allProb = rfProb + gbdtProb
allPreds = []
print(allProb.shape)
for tt in range(allProb.shape[0]):
    prob = list(allProb[tt, :])
    allPreds.append(prob.index(max(prob)))
rfRight, gbRight, allRight = 0, 0, 0
for i in range(len(rfPreds)):
    rfPreName = dl.get_num_position(rfPreds[i])
    # BUG FIX: originally dl.get_num_position(rfPreds[i]) — the RF
    # predictions were used for the GBDT name as well.
    gbdtPreName = dl.get_num_position(gbdtPreds[i])
    allPreName = dl.get_num_position(allPreds[i])
    # NOTE(review): the accuracy counters are never incremented — the
    # comparison lines are commented out (and `real` is undefined here),
    # so all three printed rates are 0. The gb line also compared
    # rfPreName in the original; corrected in the comment below.
    # if rfPreName == real: rfRight += 1.0
    # if gbdtPreName == real: gbRight += 1.0
    # if allPreName == real: allRight += 1.0
    # outFile1.write(testIdList[i]+'\t'+rfPreName+'\t'+real+'\n')
    # outFile2.write(testIdList[i]+'\t'+gbdtPreName+'\t'+real+'\n')
    # outFile3.write(testIdList[i]+'\t'+allPreName+'\t'+real+'\n')
print('rf:' + str(rfRight / 20000) + '\n gbdt:' + str(
    gbRight / 20000) + '\n all:' + str(allRight / 20000))
outFile1.write(str(rfRight / 20000))
# --- Script fragment: hold out fold 4 of a 5-way round-robin split, train a
# RandomForest on the rest, log per-sample results, dump its probabilities,
# then start the same procedure for a GBDT. Depends on names defined
# earlier/elsewhere in the file: trainFeatureR, targetR, trainIdListR,
# Cfeature, tt, dl, outFile1, outPkl1, pickle. The GBDT branch continues
# past this chunk (the classifier is constructed but not yet fitted here).
for i in range(len(trainFeatureR)):
    tt.append(i % 5)
# NOTE(review): i is hard-coded to 4, so only the last fold is ever the
# hold-out set — this is a single 80/20 split, not a full 5-fold CV.
i = 4
tmp1 = np.array([t != i for t in tt])
tmp2 = np.array([t == i for t in tt])
trainFeature, testFeature = trainFeatureR[tmp1], trainFeatureR[tmp2]
trainTar, testTar = targetR[tmp1], targetR[tmp2]
trainId, testId = trainIdListR[tmp1], trainIdListR[tmp2]
clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
clf.fit(trainFeature[Cfeature], trainTar)
# NOTE(review): fit uses trainFeature[Cfeature] but predict is given the
# full testFeature — if Cfeature is not all columns, the feature sets
# disagree; confirm.
preds = clf.predict(testFeature)
predPro = clf.predict_proba(testFeature)
rfPro = predPro
right = 0
for n in range(len(preds)):
    preName = dl.get_num_position(preds[n])
    real = dl.get_num_position(testTar[n])
    if preName == real:
        right += 1.0
    # Log every prediction as: id <TAB> predicted <TAB> actual.
    outFile1.write(str(testId[n]) + '\t' + preName + '\t' + real + '\n')
# Hold-out accuracy: one fold is 1/5 of the data.
print right / (len(trainFeatureR) / 5.0)
# Persist the RF probabilities (e.g. for later stacking/blending).
pickle.dump(rfPro, outPkl1)
# NOTE(review): duplicate of the hard-coded fold selection above.
i = 4
print i
tmp1 = np.array([t != i for t in tt])
tmp2 = np.array([t == i for t in tt])
trainFeature, testFeature = trainFeatureR[tmp1], trainFeatureR[tmp2]
trainTar, testTar = targetR[tmp1], targetR[tmp2]
trainId, testId = trainIdListR[tmp1], trainIdListR[tmp2]
clf = GradientBoostingClassifier(n_estimators=6, min_samples_split=17)