def get_feature(lines1, lines2, rf_prob, gbdt_prob):
    """Build the merged feature table from the GBDT and RF prediction outputs.

    Besides returning the table, this function mutates three module-level
    objects: it appends to ``tarList`` and ``idList`` and writes columns into
    the ``feature`` dict.

    Args:
        lines1: Sequence of tab-separated ``id, position, answer`` lines
            holding the GBDT prediction (only the position field is used).
        lines2: Sequence of tab-separated ``id, position, answer`` lines
            holding the RF prediction; the id and answer fields are taken
            from here.
        rf_prob: 2-D array indexed as ``rf_prob[row, column]``.
        gbdt_prob: 2-D array indexed as ``gbdt_prob[row, column]``.

    Returns:
        ``pd.DataFrame(feature)`` built from the module-level ``feature``
        dict after the columns below have been written into it:

        * ``'high'``, ``'dip'``, ``'gbPo'``, ``'rfPo'``
        * ``'0' .. str(R - 1)``: the columns of ``rf_prob``
        * ``str(R) .. str(R + G - 1)``: the columns of ``gbdt_prob``
        * ``str(R + G) .. str(R + 2 * G - 1)``: ``gbdt_prob + rf_prob``
          column by column

        where ``R = rf_prob.shape[1]`` and ``G = gbdt_prob.shape[1]``.
    """
    gbdt_po_list, rf_po_list, high_list, dip_list = [], [], [], []

    # The last three entries of ``lines1`` are deliberately left out, as in
    # the original loop bound.
    # NOTE(review): presumably trailing non-data lines -- confirm.
    for p in range(len(lines1) - 3):
        _, gbdt_posi, _ = lines1[p].strip().split('\t')
        id_str, rf_posi, answer2 = lines2[p].strip().split('\t')

        # The converted positions are used as column indices below.
        gbdt_posi = dl.get_position_num(gbdt_posi)
        rf_posi = dl.get_position_num(rf_posi)
        answer2 = dl.get_position_num(answer2)
        gbdt_po_list.append(gbdt_posi)
        rf_po_list.append(rf_posi)

        rf_score = rf_prob[p, rf_posi]
        gbdt_score = gbdt_prob[p, gbdt_posi]
        high_list.append(rf_score > gbdt_score)
        dip_list.append(rf_score - gbdt_score)

        tarList.append(answer2)
        idList.append(id_str)

    feature['high'] = high_list
    feature['dip'] = dip_list
    feature['gbPo'] = gbdt_po_list
    feature['rfPo'] = rf_po_list

    rf_cols = rf_prob.shape[1]
    gbdt_cols = gbdt_prob.shape[1]

    for j in range(rf_cols):
        feature[str(j)] = list(rf_prob[:, j])

    # Offset by the number of RF columns so that no RF column is overwritten.
    for k in range(gbdt_cols):
        feature[str(rf_cols + k)] = list(gbdt_prob[:, k])

    # Every summed column gets its own key after the RF and GBDT columns.
    for m in range(gbdt_cols):
        feature[str(rf_cols + gbdt_cols + m)] = list(
            gbdt_prob[:, m] + rf_prob[:, m])

    return pd.DataFrame(feature)
def merge_feature(data_file):
    """Parse JSON records and append one row of features per record.

    Every record is decoded with ``json.loads`` and its fields are appended
    to the module-level feature lists (``idList``, ``ageList``, ``name1List``
    and so on). Nothing is returned.

    The loop stops when it reaches the 100000th record, so at most 99999
    records are processed. Each record must carry at least three entries in
    ``workExperienceList``, because index ``2`` is read unconditionally.

    Args:
        data_file: Iterable yielding one JSON document string per item.
    """
    # The feature lists are defined at module level.
    # Features are extracted one sample at a time.
    for sample_no, raw_record in enumerate(data_file, start=1):
        if sample_no == 100000:
            break
        doc = json.loads(raw_record)

        idList.append(str(doc['_id']['$oid']))
        ageList.append(dl.age_deal(doc['age']))
        genderList.append(dl.get_gender(doc['gender']))

        major = doc['major']
        if not major or major.strip() == u'None':
            major = u'0'
        majorClassList.append(dl.get_major_calss(major))
        majorNumList.append(dl.get_major_num(major))

        jobs = doc['workExperienceList']

        # Position names of the entries at index 0, 2 and -1.
        head = jobs[0]
        title_head = format_name(head['position_name'])
        name1List.append(dl.get_position_num(title_head))
        third = jobs[2]
        title_third = format_name(third['position_name'])
        name3List.append(dl.get_position_num(title_third))
        tail = jobs[-1]
        nameNList.append(dl.get_position_num(format_name(tail['position_name'])))

        # Company size and salary of the same three entries.
        Size1List.append(int(head['size']))
        Size3List.append(int(third['size']))
        SizeNList.append(int(tail['size']))
        salary_head = int(head['salary'])
        salary1List.append(salary_head)
        salary_third = int(third['salary'])
        salary3List.append(salary_third)
        salaryNList.append(int(tail['salary']))

        # A missing industry is looked up under the literal 'Null'.
        industry1Num.append(
            dl.get_industry_num((head['industry'] or 'Null').strip()))
        industry3Num.append(
            dl.get_industry_num((third['industry'] or 'Null').strip()))

        if jobs[1]:
            middle = jobs[1]
            degreeList.append(int(doc['degree']))
            name2List.append(
                dl.get_position_num(format_name(middle['position_name'])))
            Size2List.append(int(middle['size']))
            salary2List.append(int(middle['salary']))
        else:
            # Placeholder so that the position-name scan below can still
            # read entry 1; the four lists above are not extended here.
            jobs[1] = {'position_name': 'no name'}

        time1.append(dl.time_deal(head['start_date'], head['end_date']))
        time2.append(dl.time_deal(third['end_date'], head['start_date']))
        time3.append(dl.time_deal(third['start_date'], third['end_date']))
        time4.append(dl.time_deal(tail['start_date'], head['end_date']))

        year_head_end = dl.get_year(head['end_date'])
        year_third_end = dl.get_year(third['end_date'])
        year_tail_start = dl.get_year(tail['start_date'])
        year1.append(year_head_end)
        year2.append(dl.get_year(head['start_date']))
        year3.append(year_third_end)
        yearN.append(year_tail_start)
        firstAgeList.append(
            dl.get_frist_age(dl.age_deal(doc['age']), year_tail_start))

        ind0Num.append(
            dl.get_industry_num((head['industry'] or 'Null').strip()))
        ind2Num.append(
            dl.get_industry_num((third['industry'] or 'Null').strip()))

        salary_lv1, year_salary1 = salary_feature.get_sala_feature(
            title_head, year_head_end, salary_head)
        salary_lv3, year_salary3 = salary_feature.get_sala_feature(
            title_third, year_third_end, salary_third)
        salaryLv1List.append(salary_lv1)
        salaryLv3List.append(salary_lv3)
        yearSalary1List.append(year_salary1)
        yearSalary3List.append(year_salary3)

        aver1, lv1 = get_level_aver(year_head_end, salary_head)
        aver3, lv3 = get_level_aver(year_third_end, salary_third)
        lv1List.append(lv1)
        lv3List.append(lv3)
        aver1List.append(aver1)
        aver3List.append(aver3)

        # Collect every raw position name first, then normalise them, and
        # finally drop the entry at index 1 before building the matrix row.
        raw_titles = [job['position_name'] for job in jobs]
        name_list = [format_name(raw_title) for raw_title in raw_titles]
        del name_list[1]
        featureMat.append(np.array(fe.get_matrix(name_list)))

        lengthList.append(len(jobs))
        # NOTE(review): the original indentation was lost; this call is
        # assumed to run once per sample -- confirm.
        get_feature_dict(jobs, feature)
def merge_feature(data_file):
    """Decode JSON samples and extend the module-level feature lists.

    For each item of ``data_file`` the JSON text is decoded and the derived
    values are appended to the feature lists defined at module level
    (``idList``, ``ageList``, ``Size1List``, ``time1`` and so on). The
    function has no return value.

    Reading stops as soon as the 100000th item is reached, which means that
    no more than 99999 items are processed. ``workExperienceList`` is indexed
    at ``0``, ``1``, ``2`` and ``-1`` without any length check.

    Args:
        data_file: Iterable of JSON document strings, one sample per item.
    """

    def position_code(job):
        # Normalised position name of one entry, converted by ``dl``.
        return dl.get_position_num(format_name(job['position_name']))

    def industry_code(job):
        # A falsy industry value is looked up under the literal 'Null'.
        label = job['industry']
        if not label:
            label = 'Null'
        return dl.get_industry_num(label.strip())

    def salary_of(job):
        return int(job['salary'])

    seen = 0
    # The feature lists are defined at module level.
    # Features are extracted sample by sample.
    for line in data_file:
        seen += 1
        if seen == 100000:
            break
        record = json.loads(line)

        sample_id = str(record['_id']['$oid'])
        idList.append(sample_id)
        ageList.append(dl.age_deal(record['age']))
        genderList.append(dl.get_gender(record['gender']))

        major_text = record['major']
        if not major_text or major_text.strip() == u'None':
            major_text = u'0'
        majorClassList.append(dl.get_major_calss(major_text))
        majorNumList.append(dl.get_major_num(major_text))

        jobs = record['workExperienceList']

        title_a = format_name(jobs[0]['position_name'])
        name1List.append(dl.get_position_num(title_a))
        title_c = format_name(jobs[2]['position_name'])
        name3List.append(dl.get_position_num(title_c))
        nameNList.append(position_code(jobs[-1]))

        for bucket, job in ((Size1List, jobs[0]),
                            (Size3List, jobs[2]),
                            (SizeNList, jobs[-1])):
            bucket.append(int(job['size']))
        for bucket, job in ((salary1List, jobs[0]),
                            (salary3List, jobs[2]),
                            (salaryNList, jobs[-1])):
            bucket.append(salary_of(job))

        industry1Num.append(industry_code(jobs[0]))
        industry3Num.append(industry_code(jobs[2]))

        if not jobs[1]:
            # Entry 1 is absent: install a stub so that its position name
            # can still be read further down. The degree / name2 / Size2 /
            # salary2 lists receive nothing for this sample.
            jobs[1] = {}
            jobs[1]['position_name'] = 'no name'
        else:
            degreeList.append(int(record['degree']))
            name2List.append(position_code(jobs[1]))
            Size2List.append(int(jobs[1]['size']))
            salary2List.append(salary_of(jobs[1]))

        time1.append(
            dl.time_deal(jobs[0]['start_date'], jobs[0]['end_date']))
        time2.append(
            dl.time_deal(jobs[2]['end_date'], jobs[0]['start_date']))
        time3.append(
            dl.time_deal(jobs[2]['start_date'], jobs[2]['end_date']))
        time4.append(
            dl.time_deal(jobs[-1]['start_date'], jobs[0]['end_date']))

        end_year_a = dl.get_year(jobs[0]['end_date'])
        end_year_c = dl.get_year(jobs[2]['end_date'])
        start_year_n = dl.get_year(jobs[-1]['start_date'])
        year1.append(end_year_a)
        year2.append(dl.get_year(jobs[0]['start_date']))
        year3.append(end_year_c)
        yearN.append(start_year_n)

        current_age = dl.age_deal(record['age'])
        firstAgeList.append(dl.get_frist_age(current_age, start_year_n))

        ind0Num.append(industry_code(jobs[0]))
        ind2Num.append(industry_code(jobs[2]))

        sala_a = salary_feature.get_sala_feature(
            title_a, end_year_a, salary_of(jobs[0]))
        salary_lv1, year_salary1 = sala_a
        sala_c = salary_feature.get_sala_feature(
            title_c, end_year_c, salary_of(jobs[2]))
        salary_lv3, year_salary3 = sala_c
        salaryLv1List.append(salary_lv1)
        salaryLv3List.append(salary_lv3)
        yearSalary1List.append(year_salary1)
        yearSalary3List.append(year_salary3)

        aver1, lv1 = get_level_aver(end_year_a, salary_of(jobs[0]))
        aver3, lv3 = get_level_aver(end_year_c, salary_of(jobs[2]))
        lv1List.append(lv1)
        lv3List.append(lv3)
        aver1List.append(aver1)
        aver3List.append(aver3)

        # Read all raw position names before normalising any of them, then
        # discard the one at index 1.
        raw_titles = []
        for job in jobs:
            raw_titles.append(job['position_name'])
        name_list = list(map(format_name, raw_titles))
        name_list.pop(1)
        feature_line = fe.get_matrix(name_list)
        featureMat.append(np.array(feature_line))

        lengthList.append(len(jobs))
        # NOTE(review): source indentation was lost; this call is assumed
        # to belong to the per-sample loop -- confirm.
        get_feature_dict(jobs, feature)