def get_feature(lines1, lines2, rf_prob, gbdt_prob):
    """Assemble the combined RF/GBDT feature table as a DataFrame.

    For each processed row, the predicted position of each model is read
    from its tab-separated prediction line, converted with
    ``dl.get_position_num`` and used as a column index into that model's
    probability matrix.

    Columns written into the shared ``feature`` dict:
        'high'   True where the RF probability of RF's own prediction is
                 greater than the GBDT probability of GBDT's prediction.
        'dip'    RF probability minus GBDT probability (same lookups).
        'gbPo'   Converted GBDT predicted position.
        'rfPo'   Converted RF predicted position.
        '0'...   One column per ``rf_prob`` column, then one per
                 ``gbdt_prob`` column, then one per element-wise sum
                 ``gbdt_prob[:, m] + rf_prob[:, m]``; keys are consecutive
                 integers rendered as strings.

    Side effects: appends to ``tarList`` and ``idList`` and mutates
    ``feature``. These names, like ``dl`` and ``pd``, are not defined in
    this block and are expected to exist at module level.

    Note: this function name is defined a second time later in this file;
    the later definition replaces this one at import time.

    Args:
        lines1: Sequence of ``id<TAB>position<TAB>answer`` strings whose
            position field indexes ``gbdt_prob``.
        lines2: Sequence of ``id<TAB>position<TAB>answer`` strings whose
            position field indexes ``rf_prob``; its id and answer are the
            ones recorded in ``idList`` / ``tarList``.
        rf_prob: 2-D array indexed as ``rf_prob[row, column]``.
        gbdt_prob: 2-D array indexed as ``gbdt_prob[row, column]``.

    Returns:
        ``pandas.DataFrame`` built from the ``feature`` dict.
    """
    gbdt_po_list, rf_po_list, high_list, dip_list = [], [], [], []

    # The last three entries of lines1 are not processed (kept from the
    # original; the reason is not visible here -- TODO confirm).
    for p in range(len(lines1) - 3):
        # Each line must split into exactly three tab-separated fields.
        _, gbdt_posi, _ = lines1[p].strip().split('\t')
        id_str, rf_posi, answer2 = lines2[p].strip().split('\t')
        gbdt_posi = dl.get_position_num(gbdt_posi)
        rf_posi = dl.get_position_num(rf_posi)
        answer2 = dl.get_position_num(answer2)
        gbdt_po_list.append(gbdt_posi)
        rf_po_list.append(rf_posi)
        rf_conf = rf_prob[p, rf_posi]
        gbdt_conf = gbdt_prob[p, gbdt_posi]
        high_list.append(rf_conf > gbdt_conf)
        dip_list.append(rf_conf - gbdt_conf)
        tarList.append(answer2)
        idList.append(id_str)

    feature['high'] = high_list
    feature['dip'] = dip_list
    feature['gbPo'] = gbdt_po_list
    feature['rfPo'] = rf_po_list

    n_rf = rf_prob.shape[1]
    n_gbdt = gbdt_prob.shape[1]

    for j in range(n_rf):
        feature[str(j)] = list(rf_prob[:, j])
    # Offset by the RF column count so the first GBDT column no longer
    # overwrites the last RF column, and index with the loop variable
    # (the original referenced an undefined name here).
    for k in range(n_gbdt):
        feature[str(n_rf + k)] = list(gbdt_prob[:, k])
    # Give every summed column its own key; previously all of them were
    # written to one key, so only the last one survived.
    for m in range(n_gbdt):
        feature[str(n_rf + n_gbdt + m)] = list(gbdt_prob[:, m] + rf_prob[:, m])

    return pd.DataFrame(feature)
def get_feature(lines1, lines2, rf_prob, gbdt_prob):
    """Assemble the combined RF/GBDT feature table as a DataFrame.

    For each processed row, the predicted position of each model is read
    from its tab-separated prediction line, converted with
    ``dl.get_position_num`` and used as a column index into that model's
    probability matrix.

    Columns written into the shared ``feature`` dict:
        'high'   True where the RF probability of RF's own prediction is
                 greater than the GBDT probability of GBDT's prediction.
        'dip'    RF probability minus GBDT probability (same lookups).
        'gbPo'   Converted GBDT predicted position.
        'rfPo'   Converted RF predicted position.
        '0'...   One column per ``rf_prob`` column, then one per
                 ``gbdt_prob`` column, then one per element-wise sum
                 ``gbdt_prob[:, m] + rf_prob[:, m]``; keys are consecutive
                 integers rendered as strings.

    Side effects: appends to ``tarList`` and ``idList`` and mutates
    ``feature``. These names, like ``dl`` and ``pd``, are not defined in
    this block and are expected to exist at module level.

    Note: a function with this name is also defined earlier in this file;
    being the later definition, this is the one in effect after import.

    Args:
        lines1: Sequence of ``id<TAB>position<TAB>answer`` strings whose
            position field indexes ``gbdt_prob``.
        lines2: Sequence of ``id<TAB>position<TAB>answer`` strings whose
            position field indexes ``rf_prob``; its id and answer are the
            ones recorded in ``idList`` / ``tarList``.
        rf_prob: 2-D array indexed as ``rf_prob[row, column]``.
        gbdt_prob: 2-D array indexed as ``gbdt_prob[row, column]``.

    Returns:
        ``pandas.DataFrame`` built from the ``feature`` dict.
    """
    gbdt_po_list, rf_po_list, high_list, dip_list = [], [], [], []

    # The last three entries of lines1 are not processed (kept from the
    # original; the reason is not visible here -- TODO confirm).
    for p in range(len(lines1) - 3):
        # Each line must split into exactly three tab-separated fields.
        _, gbdt_posi, _ = lines1[p].strip().split('\t')
        id_str, rf_posi, answer2 = lines2[p].strip().split('\t')
        gbdt_posi = dl.get_position_num(gbdt_posi)
        rf_posi = dl.get_position_num(rf_posi)
        answer2 = dl.get_position_num(answer2)
        gbdt_po_list.append(gbdt_posi)
        rf_po_list.append(rf_posi)
        rf_conf = rf_prob[p, rf_posi]
        gbdt_conf = gbdt_prob[p, gbdt_posi]
        high_list.append(rf_conf > gbdt_conf)
        dip_list.append(rf_conf - gbdt_conf)
        tarList.append(answer2)
        idList.append(id_str)

    feature['high'] = high_list
    feature['dip'] = dip_list
    feature['gbPo'] = gbdt_po_list
    feature['rfPo'] = rf_po_list

    n_rf = rf_prob.shape[1]
    n_gbdt = gbdt_prob.shape[1]

    for j in range(n_rf):
        feature[str(j)] = list(rf_prob[:, j])
    # Offset by the RF column count so the first GBDT column no longer
    # overwrites the last RF column, and index with the loop variable
    # (the original referenced an undefined name here).
    for k in range(n_gbdt):
        feature[str(n_rf + k)] = list(gbdt_prob[:, k])
    # Give every summed column its own key; previously all of them were
    # written to one key, so only the last one survived.
    for m in range(n_gbdt):
        feature[str(n_rf + n_gbdt + m)] = list(gbdt_prob[:, m] + rf_prob[:, m])

    return pd.DataFrame(feature)
def merge_feature(data_file):
    """Extract per-record features from JSON lines into the shared lists.

    Each item of ``data_file`` is decoded with ``json.loads``. Its profile
    fields (``_id``, ``age``, ``gender``, ``major``, ``degree``) and the
    ``workExperienceList`` entries at indexes 0, 1, 2 and -1 are converted
    through the ``dl`` / ``salary_feature`` / ``fe`` helpers and appended
    to lists such as ``idList`` and ``ageList``. Those lists, ``featureMat``
    and ``feature`` are not created here; they are expected to exist at
    module level.

    Iteration stops as soon as the 100000th item is reached, so at most
    99999 records are processed.

    Note: this function name is defined a second time later in this file;
    the later definition replaces this one at import time.

    Args:
        data_file: Iterable yielding one JSON document string per item.

    Returns:
        None. All results are side effects on the shared containers.
    """
    # Feature lists are filled one sample at a time.
    for seen, raw_doc in enumerate(data_file, 1):
        if seen == 100000:
            break
        record = json.loads(raw_doc)

        # --- profile fields ---
        idList.append(str(record['_id']['$oid']))
        ageList.append(dl.age_deal(record['age']))
        genderList.append(dl.get_gender(record['gender']))
        major = record['major']
        if not major or major.strip() == u'None':
            major = u'0'
        majorClassList.append(dl.get_major_calss(major))
        majorNumList.append(dl.get_major_num(major))

        # --- position names of entries 0, 2 and -1 ---
        jobs = record['workExperienceList']
        first = jobs[0]
        name1 = format_name(first['position_name'])
        name1List.append(dl.get_position_num(name1))
        third = jobs[2]
        name3 = format_name(third['position_name'])
        name3List.append(dl.get_position_num(name3))
        last = jobs[-1]
        nameNList.append(dl.get_position_num(format_name(last['position_name'])))

        # --- company size and salary ---
        Size1List.append(int(first['size']))
        Size3List.append(int(third['size']))
        SizeNList.append(int(last['size']))
        salary1List.append(int(first['salary']))
        salary3List.append(int(third['salary']))
        salaryNList.append(int(last['salary']))

        # --- industry codes; a falsy industry is replaced by 'Null' ---
        industry1Num.append(
            dl.get_industry_num((first['industry'] or 'Null').strip()))
        industry3Num.append(
            dl.get_industry_num((third['industry'] or 'Null').strip()))

        # --- entry 1: recorded only when present ---
        second = jobs[1]
        if not second:
            # Placeholder so the name list built below can still read a
            # position_name from this slot.
            jobs[1] = {'position_name': 'no name'}
        else:
            degreeList.append(int(record['degree']))
            name2List.append(
                dl.get_position_num(format_name(second['position_name'])))
            Size2List.append(int(second['size']))
            salary2List.append(int(second['salary']))

        # --- durations ---
        time1.append(dl.time_deal(first['start_date'], first['end_date']))
        time2.append(dl.time_deal(third['end_date'], first['start_date']))
        time3.append(dl.time_deal(third['start_date'], third['end_date']))
        time4.append(dl.time_deal(last['start_date'], first['end_date']))

        # --- years ---
        end_year_first = dl.get_year(first['end_date'])
        end_year_third = dl.get_year(third['end_date'])
        start_year_last = dl.get_year(last['start_date'])
        year1.append(end_year_first)
        year2.append(dl.get_year(first['start_date']))
        year3.append(end_year_third)
        yearN.append(start_year_last)
        firstAgeList.append(
            dl.get_frist_age(dl.age_deal(record['age']), start_year_last))

        # --- industry codes again, stored in the ind0/ind2 lists ---
        ind0Num.append(
            dl.get_industry_num((first['industry'] or 'Null').strip()))
        ind2Num.append(
            dl.get_industry_num((third['industry'] or 'Null').strip()))

        # --- salary-derived features ---
        first_salary_feats = salary_feature.get_sala_feature(
            name1, end_year_first, int(first['salary']))
        third_salary_feats = salary_feature.get_sala_feature(
            name3, end_year_third, int(third['salary']))
        level_first, yearly_first = first_salary_feats
        level_third, yearly_third = third_salary_feats
        salaryLv1List.append(level_first)
        salaryLv3List.append(level_third)
        yearSalary1List.append(yearly_first)
        yearSalary3List.append(yearly_third)

        aver1, lv1 = get_level_aver(end_year_first, int(first['salary']))
        aver3, lv3 = get_level_aver(end_year_third, int(third['salary']))
        lv1List.append(lv1)
        lv3List.append(lv3)
        aver1List.append(aver1)
        aver3List.append(aver3)

        # --- position-name matrix over every entry except index 1 ---
        raw_titles = [job['position_name'] for job in jobs]
        titles = [format_name(title) for title in raw_titles]
        del titles[1]
        featureMat.append(np.array(fe.get_matrix(titles)))
        lengthList.append(len(jobs))
        get_feature_dict(jobs, feature)
def merge_feature(data_file):
    """Extract per-record features from JSON lines into the shared lists.

    Each item of ``data_file`` is decoded with ``json.loads``. Its profile
    fields (``_id``, ``age``, ``gender``, ``major``, ``degree``) and the
    ``workExperienceList`` entries at indexes 0, 1, 2 and -1 are converted
    through the ``dl`` / ``salary_feature`` / ``fe`` helpers and appended
    to lists such as ``idList`` and ``ageList``. Those lists, ``featureMat``
    and ``feature`` are not created here; they are expected to exist at
    module level.

    Iteration stops as soon as the 100000th item is reached, so at most
    99999 records are processed.

    Note: a function with this name is also defined earlier in this file;
    being the later definition, this is the one in effect after import.

    Args:
        data_file: Iterable yielding one JSON document string per item.

    Returns:
        None. All results are side effects on the shared containers.
    """
    # Feature lists are filled one sample at a time.
    for seen, raw_doc in enumerate(data_file, 1):
        if seen == 100000:
            break
        record = json.loads(raw_doc)

        # --- profile fields ---
        idList.append(str(record['_id']['$oid']))
        ageList.append(dl.age_deal(record['age']))
        genderList.append(dl.get_gender(record['gender']))
        major = record['major']
        if not major or major.strip() == u'None':
            major = u'0'
        majorClassList.append(dl.get_major_calss(major))
        majorNumList.append(dl.get_major_num(major))

        # --- position names of entries 0, 2 and -1 ---
        jobs = record['workExperienceList']
        first = jobs[0]
        name1 = format_name(first['position_name'])
        name1List.append(dl.get_position_num(name1))
        third = jobs[2]
        name3 = format_name(third['position_name'])
        name3List.append(dl.get_position_num(name3))
        last = jobs[-1]
        nameNList.append(dl.get_position_num(format_name(last['position_name'])))

        # --- company size and salary ---
        Size1List.append(int(first['size']))
        Size3List.append(int(third['size']))
        SizeNList.append(int(last['size']))
        salary1List.append(int(first['salary']))
        salary3List.append(int(third['salary']))
        salaryNList.append(int(last['salary']))

        # --- industry codes; a falsy industry is replaced by 'Null' ---
        industry1Num.append(
            dl.get_industry_num((first['industry'] or 'Null').strip()))
        industry3Num.append(
            dl.get_industry_num((third['industry'] or 'Null').strip()))

        # --- entry 1: recorded only when present ---
        second = jobs[1]
        if not second:
            # Placeholder so the name list built below can still read a
            # position_name from this slot.
            jobs[1] = {'position_name': 'no name'}
        else:
            degreeList.append(int(record['degree']))
            name2List.append(
                dl.get_position_num(format_name(second['position_name'])))
            Size2List.append(int(second['size']))
            salary2List.append(int(second['salary']))

        # --- durations ---
        time1.append(dl.time_deal(first['start_date'], first['end_date']))
        time2.append(dl.time_deal(third['end_date'], first['start_date']))
        time3.append(dl.time_deal(third['start_date'], third['end_date']))
        time4.append(dl.time_deal(last['start_date'], first['end_date']))

        # --- years ---
        end_year_first = dl.get_year(first['end_date'])
        end_year_third = dl.get_year(third['end_date'])
        start_year_last = dl.get_year(last['start_date'])
        year1.append(end_year_first)
        year2.append(dl.get_year(first['start_date']))
        year3.append(end_year_third)
        yearN.append(start_year_last)
        firstAgeList.append(
            dl.get_frist_age(dl.age_deal(record['age']), start_year_last))

        # --- industry codes again, stored in the ind0/ind2 lists ---
        ind0Num.append(
            dl.get_industry_num((first['industry'] or 'Null').strip()))
        ind2Num.append(
            dl.get_industry_num((third['industry'] or 'Null').strip()))

        # --- salary-derived features ---
        first_salary_feats = salary_feature.get_sala_feature(
            name1, end_year_first, int(first['salary']))
        third_salary_feats = salary_feature.get_sala_feature(
            name3, end_year_third, int(third['salary']))
        level_first, yearly_first = first_salary_feats
        level_third, yearly_third = third_salary_feats
        salaryLv1List.append(level_first)
        salaryLv3List.append(level_third)
        yearSalary1List.append(yearly_first)
        yearSalary3List.append(yearly_third)

        aver1, lv1 = get_level_aver(end_year_first, int(first['salary']))
        aver3, lv3 = get_level_aver(end_year_third, int(third['salary']))
        lv1List.append(lv1)
        lv3List.append(lv3)
        aver1List.append(aver1)
        aver3List.append(aver3)

        # --- position-name matrix over every entry except index 1 ---
        raw_titles = [job['position_name'] for job in jobs]
        titles = [format_name(title) for title in raw_titles]
        del titles[1]
        featureMat.append(np.array(fe.get_matrix(titles)))
        lengthList.append(len(jobs))
        get_feature_dict(jobs, feature)