def main(): tag = nn.Dataframefactory("hcp_tag", iotype=iotype) simi = nn.Dataframefactory("similar", iotype=iotype) mapping = mappingCbind(simi, tag) createDictStop() wechat = nn.Dataframefactory("wechat", iotype=iotype) web = nn.Dataframefactory("web", iotype=iotype) # 整合微信和网站的数据到同一个df cbindBehavData = dataPrepare(wechat, web) if cbindBehavData.shape[0] == 0: print("ERROR!!!") print("NO VALID DATA IS PREPARED! PLEASE CHECK THE RAW DATA.") print() else: doctorList = list(set(cbindBehavData["doctorid"])) print("Finished Data preparation") contentTitle = cbindBehavData['content_title'].dropna( ).drop_duplicates().to_frame() contentLabeled = titleLabeling(contentTitle, mapping) allBehavDataLabelled = cbindBehavData.merge(contentLabeled, left_on='content_title', right_on='content_title') allBehavDataLabelled["month_id"] = allBehavDataLabelled[ "start_date"].apply(getMonthId) validBehavDataLabelled = allBehavDataLabelled[ allBehavDataLabelled.lv2_tag.str.len() != 0] # calculate the heatmap data and chord diagram data heatMapPart = [] chordMapPart = [] print("Begin calculating") for docid in doctorList: segBehavData = validBehavDataLabelled[ validBehavDataLabelled["doctorid"] == docid] if segBehavData.shape[0] != 0: segHeatData = statsBySegment(segBehavData, docid) heatMapPart.append(segHeatData) segChordData = chordStatsBySeg(segBehavData, docid) if segChordData.shape[0] != 0: chordMapPart.append(segChordData) heatMapOutput = pd.concat(heatMapPart, ignore_index=True) chordMapOutput = pd.concat(chordMapPart, ignore_index=True) print("Finished calculating") nn.write_table(heatMapOutput, 'hcp_heatmap', iotype=iotype) # hcp_heatmap structure: four columns - doctorid, month_id, tag_name, tag_count nn.write_table(chordMapOutput, 'hcp_chordmap', iotype=iotype) # hcp_chordmap structure: four columns - doctorid, point_one, point_two, count return (1)
def main(): tag = nn.Dataframefactory("pat_tag", iotype=iotype) simi = nn.Dataframefactory("similar", iotype=iotype) mapping = mappingCbind(simi, tag) createDictStop() wechat = nn.Dataframefactory("wechat", iotype=iotype) # 整合微信和网站的数据到同一个df cbindBehavData = dataPrepare(wechat) patList = list(set(cbindBehavData["hcp_openid_u_2"])) print("Finished Data preparation") contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates( ).to_frame() contentLabeled = titleLabeling(contentTitle, mapping) print(contentLabeled) allBehavDataLabelled = cbindBehavData.merge(contentLabeled, left_on='content_title', right_on='content_title') allBehavDataLabelled["month_id"] = allBehavDataLabelled[ "start_date"].apply(getMonthId) validBehavDataLabelled = allBehavDataLabelled[ allBehavDataLabelled.lv2_tag.str.len() != 0] # calculate the heatmap data and chord diagram data heatMapPart = [] chordMapPart = [] print("Begin calculating") for openID in patList: segBehavData = validBehavDataLabelled[ validBehavDataLabelled["hcp_openid_u_2"] == openID] if segBehavData.shape[0] != 0: segHeatData = statsBySegment(segBehavData, openID) heatMapPart.append(segHeatData) segChordData = chordStatsBySeg(segBehavData, openID) if segChordData.shape[0] != 0: chordMapPart.append(segChordData) heatMapOutput = pd.concat(heatMapPart, ignore_index=True) chordMapOutput = pd.concat(chordMapPart, ignore_index=True) print("Finished calculating") nn.write_table(heatMapOutput, 'pat_heatmap', iotype=iotype) # pat_heatmap structure: four columns - openID, month_id, tag_name, tag_count nn.write_table(chordMapOutput, 'pat_chordmap', iotype=iotype) # pat_chordmap structure: four columns - openID, point_one, point_two, count return (1)
def main(): raw = nn.Dataframefactory("pat_call_center", iotype=iotype) mapping = nn.Dataframefactory("pat_call_mapping", iotype=iotype) print("Begin aggregating patient questions") patQuesDf = sepQuestions(raw) print("Patient questions prepared") print("Begin calculating") quesMerge = pd.merge(patQuesDf, mapping, how="left", left_on="customer_question", right_on="question") output = quesMerge[[ "patient_id", "customer_question", "question_category", "question_sub_category", "product_type" ]] print("Finished calculating") nn.write_table(output, 'pat_call_center_stats', iotype=iotype) # pat_call_center_stats structure: five columns - patient_id, customer_question, question_category, question_sub_category, product_type return (1)
def main(): print("Designed for WeCall Mini Program RecSys") print("------------------------------------------------------") print("Step 1: Loading necessary data") dm = DataManager() # 数据读取 wecall_behavior = dm.get_wecall_behavior # 行为数据读取 wecall_content = dm.get_wecall_content_tag # 文章库读取 hcp_market_title = dm.get_hcp_market_title # 读取市场地区数据 doc_list = dm.get_wecall_doctor # 读取医生列表 all_behavior_data = dm.get_all_behavior behavior_content_tag = dm.get_behavior_content_tag wecall_content_tag = dm.get_wecall_content_tag hcp_brand = dm.get_hcp_market_mapping content_brand = dm.get_wecall_article_brand content_brand = content_brand.rename(columns={'document_id': 'content_id'}) wecall_url = dm.get_wecall_url print("Step 1: Done") print("------------------------------------------------------") print("Step 2: Processing necessary data") # 数据处理 all_content = wecall_content[['content_id', 'content_title']] # 文章库有效列 all_content = all_content.drop_duplicates(['content_id']) # 文章去重 wecall_behavior[ 'content_id'] = dm.get_wecall_behavior.content_id.str.split( pat=".", n=1, expand=True)[0] # 行为数据doctorid统一格式 ### may not useful # Computes the most popular items 得到文章库的受欢迎程度排序 behavior_popularity_df = wecall_behavior.groupby(['doctorid', 'content_id'])['strength'].sum(). \ sort_values(ascending=False).reset_index() item_popularity_df = wecall_behavior.groupby([ 'content_id' ])['strength'].sum().sort_values(ascending=False).reset_index() print(item_popularity_df.dtypes) # 和文章内容合并 all_content_merge = all_content.merge(item_popularity_df, how="left", on="content_id") all_content_merge = all_content_merge.fillna(0) all_behavior_merge = all_content.merge(behavior_popularity_df, how="left", on="content_id") all_behavior_merge = all_behavior_merge.fillna(0) all_behavior_merge = all_behavior_merge.merge(content_brand, how='left', on="content_id") popularity_df = all_content_merge.groupby('content_id')['strength'].sum( ).sort_values(ascending=False).reset_index() popularity_df = popularity_df.merge(content_brand, how='left', on="content_id") print("Step 2: Done") print("------------------------------------------------------") print("Step 3: Generating Recommendation by popularity") ### Method 3 ---- Popularity Ranking start1 = time.time() popularity_model = PopularityRecommender(all_behavior_merge, popularity_df) # 输入 doctor_list = DataFrame(doc_list) # 推荐医生的列表 doctor_list = doctor_list.rename(columns={0: 'doctor_id'}) doctor_list = doctor_list.merge(hcp_brand, how='left', on='doctor_id') method3_final = popularity_model.deliver_final(doctor_list) # =doctor_list['doctor_id'], brand_id= doctor_list['brand_id'] end1 = time.time() running_time1 = end1 - start1 print('time cost : %.5f sec' % running_time1) print("Step 3: Done") print("------------------------------------------------------") print("Step 4: Generating Recommendation by Colleague") ### Method 2 ---- Colleague Recommendation start2 = time.time() print(content_brand) cl_rc = ColleagueRcsys(wecall_behavior, hcp_market_title, hcp_brand, content_brand, doc_list) method2_final = cl_rc.delivery_final() method2_final = method2_final[[ 'doctor_id', 'content_id', 'strength', 'method' ]] end2 = time.time() running_time2 = end2 - start2 print('time cost : %.5f sec' % running_time2) print("Step 4: Done") print("------------------------------------------------------") print("Step 5: Generating Recommendation by Guess what you Like") ### Method 1 ---- guess what you like start3 = time.time() print(content_brand) method1_final = recommand(all_behavior_data, 
behavior_content_tag, wecall_content_tag, hcp_brand, content_brand, doc_list) method1_final = method1_final[[ 'doctor_id', 'content_id', 'strength', 'method' ]] end3 = time.time() running_time3 = end3 - start3 print('time cost : %.5f sec' % running_time3) print("Step 5: Done") print("------------------------------------------------------") print("Step 6: Generating Final Recommendation Result") final_recommend = method1_final.append(method2_final) final_recommend_2 = final_recommend.append(method3_final) final_recommend_2.to_csv('stage01.csv') popularity_df_update = popularity_df[['content_id', 'strength']].drop_duplicates() final_output = final_recommend_2.groupby(['doctor_id', 'content_id' ])['method'].min().reset_index() final_output.to_csv('stage02.csv') final_output = final_output.merge(popularity_df_update, how='left', on='content_id') # final_with_url = final_output.merge(wecall_url, how='left', on='content_id') # print("add url") df1 = final_output[final_output['method'] == 1] df2 = final_output[final_output['method'] == 2] df3 = final_output[final_output['method'] == 3] t = Transformer(df1) x1 = t.getDataframe() if df2.empty: x2 = DataFrame( [], columns=['doctor_id', 'xn1', 'content_id', 'method', 'strength']) else: t.setDataframe(df2) x2 = t.getDataframe() t.setDataframe(df3) x3 = t.getDataframe() xf = x3.merge(x2, on=['doctor_id', 'xn1'], how='left').merge(x1, on=['doctor_id', 'xn1'], how='left') xf['createtime'] = time.strftime("%m/%d/%Y %H:%M", time.localtime()) xf = xf.rename( columns={ 'doctor_id': 'doctorid', 'xn1': 'rec_cnt', 'content_id_x': 'm1_id', 'content_id': 'm2_id', 'content_id_y': 'm3_id' }) xf1 = xf.merge(wecall_url, how='left', left_on='m1_id', right_on='content_id').rename(columns={ 'content_title': 'method1', 'url': 'm1_url' }) xf2 = xf1.merge(wecall_url, how='left', left_on='m2_id', right_on='content_id').rename(columns={ 'content_title': 'method2', 'url': 'm2_url' }) xf3 = xf2.merge(wecall_url, how='left', left_on='m3_id', right_on='content_id').rename(columns={ 'content_title': 'method3', 'url': 'm3_url' }) xf_final = xf3[[ "doctorid", "rec_cnt", "method1", "m1_id", "method2", "m2_id", "method3", "m3_id", "m1_url", "m2_url", "m3_url", "createtime" ]] xf_final["method1"] = xf_final["method1"].str.replace(',', ',', regex=False) xf_final["method2"] = xf_final["method2"].str.replace(',', ',', regex=False) xf_final["method3"] = xf_final["method3"].str.replace(',', ',', regex=False) nn.write_table(xf_final, 'rec_out', iotype=iotype) print("All Done") return (1)
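# Transformer is defined elsewhere; the merges on ['doctor_id', 'xn1'] above
# suggest it pivots each method's (doctor_id, content_id, strength) rows into
# per-doctor ranked rows, with xn1 as the 1-based rank. A hypothetical pandas
# equivalent, for orientation only:
def _rank_recommendations(df, top_n=5):
    out = df.sort_values(["doctor_id", "strength"],
                         ascending=[True, False]).copy()
    out["xn1"] = out.groupby("doctor_id").cumcount() + 1  # 1-based rank
    return out[out["xn1"] <= top_n]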
def main(): tag = nn.Dataframefactory("tag", iotype=iotype) simi = nn.Dataframefactory("similar", iotype=iotype) mapping = mappingCbind(simi, tag) createDictStop() novoHcpAgg = nn.Dataframefactory("hcp_ability_detailing", iotype=iotype) doctorList = list(set(novoHcpAgg["customer_code"])) wechat = nn.Dataframefactory("wechat", iotype=iotype) web = nn.Dataframefactory("web", iotype=iotype) # 整合微信和网站的数据到同一个df cbindBehavData = dataPrepare(wechat, web, doctorList) print("Finished Data preparation") contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates( ).to_frame() contentLabeled = titleLabeling(contentTitle, mapping) allBehavDataLabelled = cbindBehavData.merge(contentLabeled, left_on='content_title', right_on='content_title') allBehavDataLabelled["month_id"] = allBehavDataLabelled[ "start_date"].apply(getMonthId) validBehavDataLabelled = allBehavDataLabelled[ allBehavDataLabelled.lv2_tag.str.len() != 0] # segment mapping file, write this table to Hive allCbindDf = cbindAllConditions(novoHcpAgg) print("Created segment mapping file") # do lv2 tag stats and get the top 15 labels allLv2Stats = statsByLevel(validBehavDataLabelled, "lv2_tag") topLv2LabelsDf = getTopNLabels(allLv2Stats, 15) print("Found top 15 tags of all doctors") # for seg in all segments, for each month in all months in the segment # calculate the heatmap data and chord diagram data heatMapPart = [] chordMapPart = [] print("Begin calculating") for segId in allCbindDf["segment_id"]: segDocList = getSegDoctorList(allCbindDf, novoHcpAgg, segId) if len(segDocList) != 0: segBehavData = validBehavDataLabelled[ validBehavDataLabelled["doctorid"].isin(segDocList)] if segBehavData.shape[0] != 0: segHeatData = statsBySegment(segBehavData, segId, topLv2LabelsDf) heatMapPart.append(segHeatData) segChordData = chordStatsBySeg(segBehavData, segId) if segChordData.shape[0] != 0: chordMapPart.append(segChordData) heatMapOutput = pd.concat(heatMapPart, ignore_index=True) chordMapOutput = pd.concat(chordMapPart, ignore_index=True) print("Finished calculating") objNovoHcpAgg = novoHcpAgg.astype("object") mergeSegID = pd.merge( objNovoHcpAgg, allCbindDf, how="left", left_on=["detailing_path_id", "level", "academic_title", "department"], right_on=["detailing_path", "hcp_segment", "title", "department"]) customerCodeSegId = mergeSegID[["customer_code", "segment_id"]] nn.write_table(heatMapOutput, 'heatmap', iotype=iotype) nn.write_table(chordMapOutput, 'chordmap', iotype=iotype) nn.write_table(allCbindDf, 'segmentmapping', iotype=iotype) nn.write_table(customerCodeSegId, 'customerCodeSegId', iotype=iotype) return (1)
def main(): print("Designed for Novo4PE-Pilot") print("------------------------------------------------------") print("Step 1: loading necessary data") tag = nn.Dataframefactory('tag',iotype = iotype) #similar = pd.read_csv('./essential/tag_similar_words.csv') similar = nn.Dataframefactory('similar',iotype = iotype) mapping = mappingCbind(similar, tag) #wechat = pd.read_excel("./essential/wechat_mengbo.xlsx") wechat = nn.Dataframefactory('wechat',iotype = iotype) wecall = wechat[wechat.module_2.isin(["WeCall 2.0","WeCall 1.0"])] wecall_content = wecall.content_title.unique() #web = pd.read_excel("./essential/web_mengbo.xlsx") web = nn.Dataframefactory('web',iotype = iotype) #novo_hcp = pd.read_csv("./essential/novo_hcp") novo_hcp = nn.Dataframefactory('novo_hcp',iotype = iotype) #novo_market = pd.read_csv("./essential/novo_hcp_market") novo_market = nn.Dataframefactory('novo_hcp_market',iotype = iotype) article_url = nn.Dataframefactory('article_url',iotype = iotype) print("Step 1: Done") print("------------------------------------------------------") print("Step 2: Creating dictionary") createDictStop() print("Step 2: Done") print("------------------------------------------------------") print("Step 3: Processing Raw Data") wechatFilterd, webFilterd, validWechatLog, validWebLog, contentPrefData, LogData = dataPrepare( wechat, web) print("Step 3: Done") print("------------------------------------------------------") print("Step 4: Caculating Channel Preference") output1 = channelPref(wechatFilterd, webFilterd) print("Step 4: Done") cotentTitle = contentPrefData['content_title'].dropna( ).drop_duplicates().to_frame() contentLabeled = titleLabeling(cotentTitle, mapping) contentNew = contentPrefData.merge( contentLabeled, left_on='content_title', right_on='content_title') print("------------------------------------------------------") print("Step 5: Caculating HCP Content Preference and Interest Point") output2 = pd.DataFrame() output3 = pd.DataFrame() for dc_id in contentNew.doctorid.unique(): contentInsteret, otherTags, lb, labelMap = calContInst( contentNew, dc_id) keywordCnt = calContKeyWord( contentNew, dc_id, lb, otherTags, labelMap, mapping) output2 = output2.append(contentInsteret) output3 = output3.append(keywordCnt) output2.reset_index(drop=True, inplace=True) output3.reset_index(drop=True, inplace=True) print("Step 5: Done") print("------------------------------------------------------") print("Step 6: Caculating HCP reading History") webHistWithoutToken = webHistWithoutTokens(validWebLog) wechathistWithoutToken = wechatHistWithoutTokens(validWechatLog) output4 = readingHist(webHistWithoutToken, wechathistWithoutToken, contentLabeled) print("Step 6: Done") content_uq = get_content_uniq(LogData) hcp_reading_history = get_hcp_reading_history(LogData) doctorid_uq = get_uniq_doctorid(LogData) hcp_lb_uq = get_hcp_label_uniq(mapping) content_lb = contentLabeled[["content_title", "HCP标签"]] content_lb_pop = content_lb.merge( content_uq[["content_id", "content_title", "popularity"]], on="content_title") # 更新表结构 !!!HCP标签的 好计算 content_lb_pop[hcp_lb_uq] = content_lb_pop["HCP标签"].apply( create_var, args=(hcp_lb_uq,)) hcp_tech_class, hcp_info_pro = get_hcp_tech_class( novo_hcp, novo_market, doctorid_uq, hcp_reading_history) content_pop = content_lb_pop[["content_title", "popularity"]] hcp_class_mapping = get_hcp_class_mapping( hcp_info_pro, hcp_tech_class, doctorid_uq) content_lb_pop = content_lb_pop[content_lb_pop.content_title.isin(wecall_content)] 
print("------------------------------------------------------") print("Step 7: Generating HCP Personal Recommendation List") o2 = output2.copy() o2["Ratio"] = o2.Ratio.apply(p2f) output5 = pd.DataFrame() for doc_id in doctorid_uq: test = pd.DataFrame(np.nan, index=range(0, 5), columns=[ "doctorid", "rec_cnt", "method1", "method2", "method3"]) test["method1"] = content_lb_pop[~content_lb_pop["content_title"].isin(hcp_reading_history.get(doc_id))] \ .sort_values('popularity', ascending=False) \ .head(5) \ .content_title \ .reset_index(drop=True) ################################################################################################### try: inst_list = get_most_interest_keyword(o2, doc_id) personal_rec = content_lb_pop[content_lb_pop[inst_list].any(1)] test["method2"] = personal_rec[~personal_rec["content_title"].isin(hcp_reading_history.get(doc_id))] \ .sort_values('popularity', ascending=False) \ .head(5) \ .content_title \ .reset_index(drop=True) except IndexError: test["method2"] = np.nan ################################################################################################### try: hcp_class_content = get_hcp_class( hcp_tech_class, hcp_class_mapping, doc_id, content_lb_pop) except IndexError: hcp_class_content = pd.DataFrame( columns=["content_title", "popularity"]) test["method3"] = hcp_class_content[~hcp_class_content["content_title"].isin(hcp_reading_history.get(doc_id))] \ .sort_values('popularity', ascending=False) \ .head(5) \ .content_title \ .reset_index(drop=True) test["doctorid"] = doc_id test["rec_cnt"] = test.index + 1 output5 = output5.append(test) output5 = output5.reset_index(drop=True) url = article_url[["title","url"]] output5_1 = output5.merge(url,left_on=["method1"],right_on="title",how="left") output5_1 = output5_1.merge(url,left_on=["method2"],right_on="title",how="left",suffixes=("_1","_2")) output5_1 = output5_1.merge(url,left_on=["method3"],right_on="title",how="left",suffixes=("_1","_2")) col_drop = ["title_1","title_1","title"] output5_1.drop(columns=col_drop,axis=1,inplace=True) col_left =[ "doctorid","rec_cnt","method1","method2","method3","url_1","url_2","url"] output5_1 = output5_1[col_left] output5_1.rename(columns={"url":"url_3"},inplace=True) print("Step 7: Done") print("------------------------------------------------------") print("ALL COMPLETE") nn.write_table(output1,'hcp_channel_preference',iotype = iotype) nn.write_table(output2,'hcp_content_interest',iotype = iotype) nn.write_table(output3,'hcp_content_interest_keyword',iotype = iotype) nn.write_table(output4,'hcp_reading_history',iotype = iotype) nn.write_table(output5_1,'hcp_recommendation',iotype = iotype) return(1)