コード例 #1
0
def train():
    """Train a random forest and a GBDT, then label the test set by soft voting.

    Both classifiers are fitted on the same training data.  Their per-class
    probability matrices are added together and every test sample receives
    the class whose combined score is largest (first one wins on ties).

    Relies on names defined outside this function: ``merge_feature``,
    ``feature_str``, ``dl``, ``np`` and the two scikit-learn classifiers.

    Returns:
        dict: each entry of the test id list mapped to
        ``dl.get_num_position(<predicted class label>)``.
    """
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(
        feature_str)

    # Keep only the targets below 32.
    # NOTE(review): the matching row filter on train_feature (and on
    # train_id_list) was commented out in the original and is still not
    # applied here; if any target is >= 32 the features and targets will
    # presumably differ in length when fit() is called -- confirm upstream.
    target_list = np.asarray(train_tar_list)
    target_list = target_list[target_list < 32]

    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    all_prob = rf_prob + gbdt_prob
    print(all_prob.shape)

    # predict_proba columns follow ``classes_`` (identical for both models,
    # since they were fitted on the same targets), so translate the winning
    # column index back into the actual class label instead of assuming the
    # labels are exactly 0..K-1.
    best_columns = np.argmax(all_prob, axis=1)
    all_preds = clf1.classes_[best_columns].tolist()

    return {
        test_id: dl.get_num_position(pred)
        for test_id, pred in zip(test_id_list, all_preds)
    }
コード例 #2
0
def train():
    """Train a random forest and a GBDT, then label the test set by soft voting.

    Both classifiers are fitted on the same training data.  Their per-class
    probability matrices are added together and every test sample receives
    the class whose combined score is largest (first one wins on ties).

    Relies on names defined outside this function: ``merge_feature``,
    ``feature_str``, ``dl``, ``np`` and the two scikit-learn classifiers.

    Returns:
        dict: each entry of the test id list mapped to
        ``dl.get_num_position(<predicted class label>)``.
    """
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(
        feature_str)

    # Keep only the targets below 32.
    # NOTE(review): the matching row filter on train_feature (and on
    # train_id_list) was commented out in the original and is still not
    # applied here; if any target is >= 32 the features and targets will
    # presumably differ in length when fit() is called -- confirm upstream.
    target_list = np.asarray(train_tar_list)
    target_list = target_list[target_list < 32]

    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    all_prob = rf_prob + gbdt_prob
    print(all_prob.shape)

    # predict_proba columns follow ``classes_`` (identical for both models,
    # since they were fitted on the same targets), so translate the winning
    # column index back into the actual class label instead of assuming the
    # labels are exactly 0..K-1.
    best_columns = np.argmax(all_prob, axis=1)
    all_preds = clf1.classes_[best_columns].tolist()

    return {
        test_id: dl.get_num_position(pred)
        for test_id, pred in zip(test_id_list, all_preds)
    }
コード例 #3
0
    # NOTE: fragment of a larger function. `tmp`, `trainFeature`, `testFeature`,
    # `target`, `dl` and `outFile1` come from code that is not visible here.
    # Restrict the training ids to the entries selected by `tmp`.
    trainIdList = trainIdList[tmp]
    # Train on every column of trainFeature.
    cFeature = trainFeature.columns[:]
    # Random forest: hard predictions and per-class probabilities on the test set.
    clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf.fit(trainFeature[cFeature], target)
    rfPreds = clf.predict(testFeature)
    rfProb = clf.predict_proba(testFeature)
    # Gradient boosting fitted on the same training data.
    gbdt = GradientBoostingClassifier(n_estimators=100, min_samples_split=17)
    gbdt.fit(trainFeature[cFeature], target)
    gbdtPreds = gbdt.predict(testFeature)
    gbdtProb = gbdt.predict_proba(testFeature)
    # Soft voting: add the two probability matrices and, for each row, take
    # the position of the first largest entry.
    # NOTE(review): that position is a column index, while rfPreds/gbdtPreds
    # hold class labels; they agree only if the labels are 0..K-1 -- confirm.
    allProb = rfProb + gbdtProb
    allPreds = []
    print allProb.shape
    for tt in range(allProb.shape[0]):
        prob = list(allProb[tt, :])
        ind = prob.index(max(prob))
        allPreds.append(ind)
    # Hit counters. Nothing below increments them because the comparisons are
    # commented out, so the ratios printed and written at the end are always 0.
    rfRight, gbRight, allRight = 0, 0, 0
    for i in range(len(rfPreds)):
        rfPreName = dl.get_num_position(rfPreds[i])
        # NOTE(review): built from rfPreds, not gbdtPreds -- looks like a
        # copy-paste slip; confirm before re-enabling the GBDT output below.
        gbdtPreName = dl.get_num_position(rfPreds[i])
        allPreName = dl.get_num_position(allPreds[i])
        # Disabled scoring/output code (`real` is not defined in this fragment):
        # if rfPreName == real:rfRight+=1.0
        # if rfPreName == real:gbRight+=1.0
        # if allPreName == real:allRight+=1.0
        # outFile1.write(testIdList[i]+'\t'+rfPreName+'\t'+real+'\n')
        # outFile2.write(testIdList[i]+'\t'+gbdtPreName+'\t'+real+'\n')
        # outFile3.write(testIdList[i]+'\t'+allPreName+'\t'+real+'\n')
    # The divisor 20000 is hard-coded; presumably the test-set size -- verify.
    print 'rf:' + str(rfRight / 20000) + '\n gbdt:' + str(gbRight / 20000) + '\n all:' + str(allRight / 20000)
    outFile1.write(str(rfRight / 20000))
コード例 #4
0
    # NOTE: fragment of a larger function. `tFeature`, `target`, `dl` and
    # `outFiles` come from code that is not visible here.
    idList = np.array(idList)
    # Assign each of 20000 rows to one of 15 folds, round-robin (0, 1, ..., 14, 0, ...).
    tt = []
    for i in range(20000):
        tt.append(i % 15)
    # Stores the fold id per row (despite the column name it is not a boolean).
    tFeature['is_train'] = tt
    rightAll = 0
    # 15-fold cross-validation: fold i is held out, the rest is used for fitting.
    for i in range(15):
        print i
        train, test = tFeature[tFeature['is_train'] != i], tFeature[
            tFeature['is_train'] == i]
        # Equivalent boolean masks, used to split the targets and ids the same way.
        tmp1 = np.array([t != i for t in tFeature['is_train']])
        tmp2 = np.array([t == i for t in tFeature['is_train']])
        trainTar, testTar = target[tmp1], target[tmp2]
        testId = idList[tmp2]
        clf = RandomForestClassifier(
            n_estimators=200,
            min_samples_split=13)  # ,max_depth=35,max_features=0.4)
        # Every column except the last one -- presumably the 'is_train' column
        # added above, assuming it did not already exist in tFeature.
        features = tFeature.columns[:-1]
        clf.fit(train[features], trainTar)
        preds = clf.predict(test[features])
        # Count correct predictions for this fold and across all folds, and
        # write one "id<TAB>predicted<TAB>actual" line per held-out sample.
        right = 0
        for n in range(len(preds)):
            if preds[n] == testTar[n]:
                right += 1.0
                rightAll += 1.0
            outFiles.write(testId[n] + '\t' + dl.get_num_position(preds[n]) +
                           '\t' + dl.get_num_position(testTar[n]) + '\n')
        # Accuracy on this fold.
        print right / len(preds)
    outFiles.close()
    # Overall accuracy over the 20000 rows assigned to folds above.
    print rightAll / 20000
コード例 #5
0
    # NOTE: fragment of a larger function. `get_feature`, `Tlines1`, `Tlines2`,
    # `TrfProb`, `TgbdtProb`, `tarList`, `dl` and `outFiles` come from code
    # that is not visible here.
    tFeature = get_feature(Tlines1, Tlines2, TrfProb, TgbdtProb)
    # NOTE(review): built from the same arguments as tFeature and not used in
    # the visible lines -- looks like a copy-paste slip; confirm the inputs.
    eFeature = get_feature(Tlines1, Tlines2, TrfProb, TgbdtProb)
    target = np.array(tarList)
    idList = np.array(idList)
    # Assign each of 20000 rows to one of 15 folds, round-robin (0, 1, ..., 14, 0, ...).
    tt = []
    for i in range(20000):
        tt.append(i % 15)
    # Stores the fold id per row (despite the column name it is not a boolean).
    tFeature['is_train'] = tt
    rightAll = 0
    # 15-fold cross-validation: fold i is held out, the rest is used for fitting.
    for i in range(15):
        print i
        train, test = tFeature[tFeature['is_train'] != i], tFeature[tFeature['is_train'] == i]
        # Equivalent boolean masks, used to split the targets and ids the same way.
        tmp1 = np.array([t != i for t in tFeature['is_train']])
        tmp2 = np.array([t == i for t in tFeature['is_train']])
        trainTar, testTar = target[tmp1], target[tmp2]
        testId = idList[tmp2]
        clf = RandomForestClassifier(n_estimators=200, min_samples_split=13)  # ,max_depth=35,max_features=0.4)
        # Every column except the last one -- presumably the 'is_train' column
        # added above, assuming get_feature() did not already return one.
        features = tFeature.columns[:-1]
        clf.fit(train[features], trainTar)
        preds = clf.predict(test[features])
        # Count correct predictions for this fold and across all folds, and
        # write one "id<TAB>predicted<TAB>actual" line per held-out sample.
        right = 0
        for n in range(len(preds)):
            if preds[n] == testTar[n]:
                right += 1.0
                rightAll += 1.0
            outFiles.write(
                testId[n] + '\t' + dl.get_num_position(preds[n]) + '\t' + dl.get_num_position(testTar[n]) + '\n')
        # Accuracy on this fold.
        print right / len(preds)
    outFiles.close()
    # Overall accuracy over the 20000 rows assigned to folds above.
    print rightAll / 20000
コード例 #6
0
    # NOTE: fragment of a larger function. `trainFeature`, `testFeature`,
    # `target`, `dl` and `outFile1` come from code that is not visible here.
    # Train on every column of trainFeature.
    cFeature = trainFeature.columns[:]
    # Random forest: hard predictions and per-class probabilities on the test set.
    clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf.fit(trainFeature[cFeature], target)
    rfPreds = clf.predict(testFeature)
    rfProb = clf.predict_proba(testFeature)
    # Gradient boosting fitted on the same training data.
    gbdt = GradientBoostingClassifier(n_estimators=100, min_samples_split=17)
    gbdt.fit(trainFeature[cFeature], target)
    gbdtPreds = gbdt.predict(testFeature)
    gbdtProb = gbdt.predict_proba(testFeature)
    # Soft voting: add the two probability matrices and, for each row, take
    # the position of the first largest entry.
    # NOTE(review): that position is a column index, while rfPreds/gbdtPreds
    # hold class labels; they agree only if the labels are 0..K-1 -- confirm.
    allProb = rfProb + gbdtProb
    allPreds = []
    print allProb.shape
    for tt in range(allProb.shape[0]):
        prob = list(allProb[tt, :])
        ind = prob.index(max(prob))
        allPreds.append(ind)
    # Hit counters. Nothing below increments them because the comparisons are
    # commented out, so the ratios printed and written at the end are always 0.
    rfRight, gbRight, allRight = 0, 0, 0
    for i in range(len(rfPreds)):
        rfPreName = dl.get_num_position(rfPreds[i])
        # NOTE(review): built from rfPreds, not gbdtPreds -- looks like a
        # copy-paste slip; confirm before re-enabling the GBDT output below.
        gbdtPreName = dl.get_num_position(rfPreds[i])
        allPreName = dl.get_num_position(allPreds[i])
        # Disabled scoring/output code (`real` is not defined in this fragment):
        # if rfPreName == real:rfRight+=1.0
        # if rfPreName == real:gbRight+=1.0
        # if allPreName == real:allRight+=1.0
        # outFile1.write(testIdList[i]+'\t'+rfPreName+'\t'+real+'\n')
        # outFile2.write(testIdList[i]+'\t'+gbdtPreName+'\t'+real+'\n')
        # outFile3.write(testIdList[i]+'\t'+allPreName+'\t'+real+'\n')
    # The divisor 20000 is hard-coded; presumably the test-set size -- verify.
    print 'rf:' + str(rfRight / 20000) + '\n gbdt:' + str(
        gbRight / 20000) + '\n all:' + str(allRight / 20000)
    outFile1.write(str(rfRight / 20000))
コード例 #7
0
    # NOTE: fragment of a larger function. `tt`, `trainFeatureR`, `targetR`,
    # `trainIdListR`, `Cfeature`, `dl`, `outFile1` and `outPkl1` come from
    # code that is not visible here.
    # Give every row a fold id 0..4, round-robin.
    for i in range(len(trainFeatureR)):
        tt.append(i % 5)
    # Hold out fold 4 for evaluation; fit on the remaining folds.
    i = 4
    tmp1 = np.array([t != i for t in tt])
    tmp2 = np.array([t == i for t in tt])
    trainFeature, testFeature = trainFeatureR[tmp1], trainFeatureR[tmp2]
    trainTar, testTar = targetR[tmp1], targetR[tmp2]
    trainId, testId = trainIdListR[tmp1], trainIdListR[tmp2]
    clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    # NOTE(review): fit() sees only the Cfeature columns while predict() and
    # predict_proba() receive testFeature unrestricted -- confirm they match.
    clf.fit(trainFeature[Cfeature], trainTar)
    preds = clf.predict(testFeature)
    predPro = clf.predict_proba(testFeature)
    rfPro = predPro
    # Count hits by comparing the mapped names, and write one
    # "id<TAB>predicted<TAB>actual" line per held-out sample.
    right = 0
    for n in range(len(preds)):
        preName = dl.get_num_position(preds[n])
        real = dl.get_num_position(testTar[n])
        if preName == real:
            right += 1.0
        outFile1.write(str(testId[n]) + '\t' + preName + '\t' + real + '\n')
    # Accuracy, using len/5 as the fold size (exact only when the row count
    # is a multiple of 5).
    print right / (len(trainFeatureR) / 5.0)
    # Persist the random-forest probabilities for the held-out fold.
    pickle.dump(rfPro, outPkl1)

    # Same split again (fold 4 held out), this time for gradient boosting.
    i = 4
    print i
    tmp1 = np.array([t != i for t in tt])
    tmp2 = np.array([t == i for t in tt])
    trainFeature, testFeature = trainFeatureR[tmp1], trainFeatureR[tmp2]
    trainTar, testTar = targetR[tmp1], targetR[tmp2]
    trainId, testId = trainIdListR[tmp1], trainIdListR[tmp2]
    clf = GradientBoostingClassifier(n_estimators=6, min_samples_split=17)
コード例 #8
0
    # NOTE: fragment of a larger function. `tt`, `trainFeatureR`, `targetR`,
    # `trainIdListR`, `Cfeature`, `dl`, `outFile1` and `outPkl1` come from
    # code that is not visible here.
    # Give every row a fold id 0..4, round-robin.
    for i in range(len(trainFeatureR)):
        tt.append(i % 5)
    # Hold out fold 4 for evaluation; fit on the remaining folds.
    i = 4
    tmp1 = np.array([t != i for t in tt])
    tmp2 = np.array([t == i for t in tt])
    trainFeature, testFeature = trainFeatureR[tmp1], trainFeatureR[tmp2]
    trainTar, testTar = targetR[tmp1], targetR[tmp2]
    trainId, testId = trainIdListR[tmp1], trainIdListR[tmp2]
    clf = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    # NOTE(review): fit() sees only the Cfeature columns while predict() and
    # predict_proba() receive testFeature unrestricted -- confirm they match.
    clf.fit(trainFeature[Cfeature], trainTar)
    preds = clf.predict(testFeature)
    predPro = clf.predict_proba(testFeature)
    rfPro = predPro
    # Count hits by comparing the mapped names, and write one
    # "id<TAB>predicted<TAB>actual" line per held-out sample.
    right = 0
    for n in range(len(preds)):
        preName = dl.get_num_position(preds[n])
        real = dl.get_num_position(testTar[n])
        if preName == real:
            right += 1.0
        outFile1.write(str(testId[n]) + '\t' + preName + '\t' + real + '\n')
    # Accuracy, using len/5 as the fold size (exact only when the row count
    # is a multiple of 5).
    print right / (len(trainFeatureR) / 5.0)
    # Persist the random-forest probabilities for the held-out fold.
    pickle.dump(rfPro, outPkl1)

    # Same split again (fold 4 held out), this time for gradient boosting.
    i = 4
    print i
    tmp1 = np.array([t != i for t in tt])
    tmp2 = np.array([t == i for t in tt])
    trainFeature, testFeature = trainFeatureR[tmp1], trainFeatureR[tmp2]
    trainTar, testTar = targetR[tmp1], targetR[tmp2]
    trainId, testId = trainIdListR[tmp1], trainIdListR[tmp2]
    clf = GradientBoostingClassifier(n_estimators=6, min_samples_split=17)