Example #1
def main():
    tag = nn.Dataframefactory("hcp_tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)

    mapping = mappingCbind(simi, tag)
    createDictStop()

    wechat = nn.Dataframefactory("wechat", iotype=iotype)

    web = nn.Dataframefactory("web", iotype=iotype)

    # Combine the WeChat and web data into a single DataFrame
    cbindBehavData = dataPrepare(wechat, web)
    if cbindBehavData.shape[0] == 0:
        print("ERROR!!!")
        print("NO VALID DATA IS PREPARED! PLEASE CHECK THE RAW DATA.")
        print()
    else:
        doctorList = list(set(cbindBehavData["doctorid"]))
        print("Finished Data preparation")

        contentTitle = cbindBehavData['content_title'].dropna(
        ).drop_duplicates().to_frame()
        contentLabeled = titleLabeling(contentTitle, mapping)
        allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                    left_on='content_title',
                                                    right_on='content_title')
        allBehavDataLabelled["month_id"] = allBehavDataLabelled[
            "start_date"].apply(getMonthId)
        validBehavDataLabelled = allBehavDataLabelled[
            allBehavDataLabelled.lv2_tag.str.len() != 0]

        # calculate the heatmap data and chord diagram data
        heatMapPart = []
        chordMapPart = []
        print("Begin calculating")

        for docid in doctorList:
            segBehavData = validBehavDataLabelled[
                validBehavDataLabelled["doctorid"] == docid]
            if segBehavData.shape[0] != 0:
                segHeatData = statsBySegment(segBehavData, docid)
                heatMapPart.append(segHeatData)
                segChordData = chordStatsBySeg(segBehavData, docid)
                if segChordData.shape[0] != 0:
                    chordMapPart.append(segChordData)

        heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
        chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
        print("Finished calculating")

        nn.write_table(heatMapOutput, 'hcp_heatmap', iotype=iotype)
        # hcp_heatmap structure: four columns - doctorid, month_id, tag_name, tag_count
        nn.write_table(chordMapOutput, 'hcp_chordmap', iotype=iotype)
        # hcp_chordmap structure: four columns - doctorid, point_one, point_two, count

        return (1)
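dataPrepare, titleLabeling, statsBySegment, chordStatsBySeg, and getMonthId are helpers defined elsewhere in the module. For orientation, getMonthId only has to collapse start_date into a month bucket; a minimal sketch, assuming start_date arrives as a 'YYYY-MM-DD' string (an assumption, not confirmed by the source):

import pandas as pd

def getMonthId(start_date):
    # Hypothetical sketch: reduce a date string to a 'YYYYMM' month id.
    # The real helper lives elsewhere and may differ.
    return pd.to_datetime(start_date).strftime("%Y%m")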
Example #2
def main():
    tag = nn.Dataframefactory("pat_tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)

    mapping = mappingCbind(simi, tag)
    createDictStop()

    wechat = nn.Dataframefactory("wechat", iotype=iotype)

    # Combine the WeChat and web data into a single DataFrame
    cbindBehavData = dataPrepare(wechat)
    patList = list(set(cbindBehavData["hcp_openid_u_2"]))
    print("Finished Data preparation")

    contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates(
    ).to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    print(contentLabeled)
    allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                left_on='content_title',
                                                right_on='content_title')
    allBehavDataLabelled["month_id"] = allBehavDataLabelled[
        "start_date"].apply(getMonthId)
    validBehavDataLabelled = allBehavDataLabelled[
        allBehavDataLabelled.lv2_tag.str.len() != 0]

    # calculate the heatmap data and chord diagram data
    heatMapPart = []
    chordMapPart = []
    print("Begin calculating")

    for openID in patList:
        segBehavData = validBehavDataLabelled[
            validBehavDataLabelled["hcp_openid_u_2"] == openID]
        if segBehavData.shape[0] != 0:
            segHeatData = statsBySegment(segBehavData, openID)
            heatMapPart.append(segHeatData)
            segChordData = chordStatsBySeg(segBehavData, openID)
            if segChordData.shape[0] != 0:
                chordMapPart.append(segChordData)

    heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
    chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
    print("Finished calculating")

    nn.write_table(heatMapOutput, 'pat_heatmap', iotype=iotype)
    # pat_heatmap structure: four columns - openID, month_id, tag_name, tag_count
    nn.write_table(chordMapOutput, 'pat_chordmap', iotype=iotype)
    # pat_chordmap structure: four columns - openID, point_one, point_two, count

    return (1)
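A caveat shared by both examples above: pd.concat raises "ValueError: No objects to concatenate" when its input list is empty, so a run in which no doctor or patient yields valid rows dies at the concat step. A defensive variant of the final assembly, shown as a sketch rather than as part of the original code:

# Guard against empty parts lists before concatenating.
heatMapOutput = (pd.concat(heatMapPart, ignore_index=True)
                 if heatMapPart else pd.DataFrame())
chordMapOutput = (pd.concat(chordMapPart, ignore_index=True)
                  if chordMapPart else pd.DataFrame())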
Example #3
def main():
    raw = nn.Dataframefactory("pat_call_center", iotype=iotype)
    mapping = nn.Dataframefactory("pat_call_mapping", iotype=iotype)

    print("Begin aggregating patient questions")
    patQuesDf = sepQuestions(raw)
    print("Patient questions prepared")

    print("Begin calculating")
    quesMerge = pd.merge(patQuesDf,
                         mapping,
                         how="left",
                         left_on="customer_question",
                         right_on="question")
    output = quesMerge[[
        "patient_id", "customer_question", "question_category",
        "question_sub_category", "product_type"
    ]]
    print("Finished calculating")

    nn.write_table(output, 'pat_call_center_stats', iotype=iotype)
    # pat_call_center_stats structure: five columns - patient_id, customer_question, question_category, question_sub_category, product_type

    return (1)
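Because the merge is how="left", questions absent from the mapping table survive with NaN in the category columns. If unmapped questions should be surfaced rather than passed through silently, a small check after the merge would do it (a sketch, not part of the original pipeline):

# Rows whose customer_question found no counterpart in the mapping table.
unmapped = quesMerge[quesMerge["question"].isna()]
if not unmapped.empty:
    print("%d questions have no mapping entry" % unmapped.shape[0])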
Example #4
def main():
    print("Designed for WeCall Mini Program RecSys")
    print("------------------------------------------------------")
    print("Step 1: Loading necessary data")

    dm = DataManager()

    # Load input data
    wecall_behavior = dm.get_wecall_behavior  # behavior data
    wecall_content = dm.get_wecall_content_tag  # content library
    hcp_market_title = dm.get_hcp_market_title  # market/region data
    doc_list = dm.get_wecall_doctor  # doctor list
    all_behavior_data = dm.get_all_behavior
    behavior_content_tag = dm.get_behavior_content_tag
    wecall_content_tag = dm.get_wecall_content_tag
    hcp_brand = dm.get_hcp_market_mapping
    content_brand = dm.get_wecall_article_brand
    content_brand = content_brand.rename(columns={'document_id': 'content_id'})
    wecall_url = dm.get_wecall_url

    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Processing necessary data")
    # Process the data
    all_content = wecall_content[['content_id', 'content_title']]  # useful columns of the content library
    all_content = all_content.drop_duplicates(['content_id'])  # deduplicate articles
    wecall_behavior[
        'content_id'] = wecall_behavior.content_id.str.split(
            pat=".", n=1,
            expand=True)[0]  # normalize content_id in the behavior data (keep the part before the first '.') ### may not be useful

    # Compute the most popular items: rank the content library by total strength
    behavior_popularity_df = wecall_behavior.groupby(['doctorid', 'content_id'])['strength'].sum(). \
        sort_values(ascending=False).reset_index()
    item_popularity_df = wecall_behavior.groupby([
        'content_id'
    ])['strength'].sum().sort_values(ascending=False).reset_index()
    print(item_popularity_df.dtypes)

    # Merge popularity back onto the article content
    all_content_merge = all_content.merge(item_popularity_df,
                                          how="left",
                                          on="content_id")
    all_content_merge = all_content_merge.fillna(0)

    all_behavior_merge = all_content.merge(behavior_popularity_df,
                                           how="left",
                                           on="content_id")
    all_behavior_merge = all_behavior_merge.fillna(0)

    all_behavior_merge = all_behavior_merge.merge(content_brand,
                                                  how='left',
                                                  on="content_id")
    popularity_df = all_content_merge.groupby('content_id')['strength'].sum(
    ).sort_values(ascending=False).reset_index()
    popularity_df = popularity_df.merge(content_brand,
                                        how='left',
                                        on="content_id")
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Generating Recommendation by popularity")
    ### Method 3 ---- Popularity Ranking

    start1 = time.time()
    popularity_model = PopularityRecommender(all_behavior_merge,
                                             popularity_df)  # model inputs
    doctor_list = DataFrame(doc_list)  # list of doctors to receive recommendations
    doctor_list = doctor_list.rename(columns={0: 'doctor_id'})
    doctor_list = doctor_list.merge(hcp_brand, how='left', on='doctor_id')
    method3_final = popularity_model.deliver_final(doctor_list)
    end1 = time.time()
    running_time1 = end1 - start1
    print('time cost : %.5f sec' % running_time1)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Generating Recommendation by Colleague")
    ### Method 2 ---- Colleague Recommendation
    start2 = time.time()
    print(content_brand)
    cl_rc = ColleagueRcsys(wecall_behavior, hcp_market_title, hcp_brand,
                           content_brand, doc_list)
    method2_final = cl_rc.delivery_final()
    method2_final = method2_final[[
        'doctor_id', 'content_id', 'strength', 'method'
    ]]
    end2 = time.time()
    running_time2 = end2 - start2
    print('time cost : %.5f sec' % running_time2)
    print("Step 4: Done")
    print("------------------------------------------------------")
    print("Step 5: Generating Recommendation by Guess what you Like")
    ### Method 1 ---- guess what you like
    start3 = time.time()
    print(content_brand)
    method1_final = recommand(all_behavior_data, behavior_content_tag,
                              wecall_content_tag, hcp_brand, content_brand,
                              doc_list)
    method1_final = method1_final[[
        'doctor_id', 'content_id', 'strength', 'method'
    ]]
    end3 = time.time()
    running_time3 = end3 - start3
    print('time cost : %.5f sec' % running_time3)
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("Step 6: Generating Final Recommendation Result")

    # Stack the three methods' recommendations into one frame.
    final_recommend_2 = pd.concat([method1_final, method2_final, method3_final],
                                  ignore_index=True)

    final_recommend_2.to_csv('stage01.csv')
    popularity_df_update = popularity_df[['content_id',
                                          'strength']].drop_duplicates()
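    # Deduplicate (doctor_id, content_id) pairs produced by several methods,
    # keeping the smallest method number for each pair.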
    final_output = final_recommend_2.groupby(['doctor_id', 'content_id'
                                              ])['method'].min().reset_index()
    final_output.to_csv('stage02.csv')
    final_output = final_output.merge(popularity_df_update,
                                      how='left',
                                      on='content_id')
    # final_with_url = final_output.merge(wecall_url, how='left', on='content_id')
    # print("add url")
    df1 = final_output[final_output['method'] == 1]
    df2 = final_output[final_output['method'] == 2]
    df3 = final_output[final_output['method'] == 3]
    t = Transformer(df1)
    x1 = t.getDataframe()
    if df2.empty:
        x2 = DataFrame(
            [],
            columns=['doctor_id', 'xn1', 'content_id', 'method', 'strength'])
    else:
        t.setDataframe(df2)
        x2 = t.getDataframe()
    t.setDataframe(df3)
    x3 = t.getDataframe()
    xf = x3.merge(x2, on=['doctor_id', 'xn1'],
                  how='left').merge(x1, on=['doctor_id', 'xn1'], how='left')
    xf['createtime'] = time.strftime("%m/%d/%Y %H:%M", time.localtime())
    xf = xf.rename(
        columns={
            'doctor_id': 'doctorid',
            'xn1': 'rec_cnt',
            'content_id_x': 'm1_id',
            'content_id': 'm2_id',
            'content_id_y': 'm3_id'
        })
    xf1 = xf.merge(wecall_url,
                   how='left',
                   left_on='m1_id',
                   right_on='content_id').rename(columns={
                       'content_title': 'method1',
                       'url': 'm1_url'
                   })
    xf2 = xf1.merge(wecall_url,
                    how='left',
                    left_on='m2_id',
                    right_on='content_id').rename(columns={
                        'content_title': 'method2',
                        'url': 'm2_url'
                    })
    xf3 = xf2.merge(wecall_url,
                    how='left',
                    left_on='m3_id',
                    right_on='content_id').rename(columns={
                        'content_title': 'method3',
                        'url': 'm3_url'
                    })
    xf_final = xf3[[
        "doctorid", "rec_cnt", "method1", "m1_id", "method2", "m2_id",
        "method3", "m3_id", "m1_url", "m2_url", "m3_url", "createtime"
    ]].copy()

    # Normalize commas in the recommended titles: swap ASCII ',' for the
    # full-width '，' so commas inside titles cannot collide with delimiters.
    for col in ["method1", "method2", "method3"]:
        xf_final[col] = xf_final[col].str.replace(",", "，", regex=False)
    nn.write_table(xf_final, 'rec_out', iotype=iotype)
    print("All Done")

    return (1)
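The renames near the end of this example lean on pandas' merge-suffix behavior: when both frames carry the same non-key column, merge appends the suffixes (default '_x' for the left frame, '_y' for the right) to the colliding columns only, while a later merge whose columns no longer collide keeps the plain name. A minimal reproduction of that mechanic:

import pandas as pd

left = pd.DataFrame({"k": [1], "content_id": ["a"]})
right = pd.DataFrame({"k": [1], "content_id": ["b"]})
merged = left.merge(right, on="k")    # columns become content_id_x, content_id_y
third = pd.DataFrame({"k": [1], "content_id": ["c"]})
final = merged.merge(third, on="k")   # content_id from `third` keeps its plain name
print(final.columns.tolist())  # ['k', 'content_id_x', 'content_id_y', 'content_id']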
Example #5
def main():
    tag = nn.Dataframefactory("tag", iotype=iotype)
    simi = nn.Dataframefactory("similar", iotype=iotype)

    mapping = mappingCbind(simi, tag)
    createDictStop()

    novoHcpAgg = nn.Dataframefactory("hcp_ability_detailing", iotype=iotype)

    doctorList = list(set(novoHcpAgg["customer_code"]))

    wechat = nn.Dataframefactory("wechat", iotype=iotype)
    web = nn.Dataframefactory("web", iotype=iotype)

    # Combine the WeChat and web data into a single DataFrame
    cbindBehavData = dataPrepare(wechat, web, doctorList)
    print("Finished Data preparation")

    contentTitle = cbindBehavData['content_title'].dropna().drop_duplicates(
    ).to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    allBehavDataLabelled = cbindBehavData.merge(contentLabeled,
                                                left_on='content_title',
                                                right_on='content_title')
    allBehavDataLabelled["month_id"] = allBehavDataLabelled[
        "start_date"].apply(getMonthId)
    validBehavDataLabelled = allBehavDataLabelled[
        allBehavDataLabelled.lv2_tag.str.len() != 0]

    # segment mapping file, write this table to Hive
    allCbindDf = cbindAllConditions(novoHcpAgg)
    print("Created segment mapping file")

    # do lv2 tag stats and get the top 15 labels
    allLv2Stats = statsByLevel(validBehavDataLabelled, "lv2_tag")
    topLv2LabelsDf = getTopNLabels(allLv2Stats, 15)
    print("Found top 15 tags of all doctors")

    # for seg in all segments, for each month in all months in the segment
    # calculate the heatmap data and chord diagram data
    heatMapPart = []
    chordMapPart = []
    print("Begin calculating")

    for segId in allCbindDf["segment_id"]:
        segDocList = getSegDoctorList(allCbindDf, novoHcpAgg, segId)
        if len(segDocList) != 0:
            segBehavData = validBehavDataLabelled[
                validBehavDataLabelled["doctorid"].isin(segDocList)]
            if segBehavData.shape[0] != 0:
                segHeatData = statsBySegment(segBehavData, segId,
                                             topLv2LabelsDf)
                heatMapPart.append(segHeatData)
                segChordData = chordStatsBySeg(segBehavData, segId)
                if segChordData.shape[0] != 0:
                    chordMapPart.append(segChordData)

    heatMapOutput = pd.concat(heatMapPart, ignore_index=True)
    chordMapOutput = pd.concat(chordMapPart, ignore_index=True)
    print("Finished calculating")

    objNovoHcpAgg = novoHcpAgg.astype("object")
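    # The object cast aligns the merge-key dtypes with allCbindDf below;
    # pandas will not join an int64 key against an object key.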
    mergeSegID = pd.merge(
        objNovoHcpAgg,
        allCbindDf,
        how="left",
        left_on=["detailing_path_id", "level", "academic_title", "department"],
        right_on=["detailing_path", "hcp_segment", "title", "department"])
    customerCodeSegId = mergeSegID[["customer_code", "segment_id"]]

    nn.write_table(heatMapOutput, 'heatmap', iotype=iotype)
    nn.write_table(chordMapOutput, 'chordmap', iotype=iotype)
    nn.write_table(allCbindDf, 'segmentmapping', iotype=iotype)
    nn.write_table(customerCodeSegId, 'customerCodeSegId', iotype=iotype)

    return (1)
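On the astype("object") cast above: pandas refuses to merge keys of mismatched dtypes, so joining an int64 column against an object column raises a ValueError in recent versions. A sketch of the failure mode the cast avoids, using hypothetical frames:

import pandas as pd

a = pd.DataFrame({"key": [1, 2], "v": ["x", "y"]})  # int64 key
b = pd.DataFrame({"key": pd.Series([1, 2], dtype="object"), "w": ["p", "q"]})
# a.merge(b, on="key")  # ValueError: trying to merge on int64 and object columns
print(a.astype("object").merge(b, on="key"))  # keys now share a dtype and match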
Example #6
def main():
    print("Designed for Novo4PE-Pilot")
    print("------------------------------------------------------")
    print("Step 1: loading necessary data")
    
    

    tag = nn.Dataframefactory('tag',iotype = iotype)
    #similar = pd.read_csv('./essential/tag_similar_words.csv')
    similar = nn.Dataframefactory('similar',iotype = iotype)
    mapping = mappingCbind(similar, tag)

    #wechat = pd.read_excel("./essential/wechat_mengbo.xlsx")
    wechat = nn.Dataframefactory('wechat',iotype = iotype)
    wecall = wechat[wechat.module_2.isin(["WeCall 2.0","WeCall 1.0"])]
    wecall_content = wecall.content_title.unique()
    #web = pd.read_excel("./essential/web_mengbo.xlsx")
    web = nn.Dataframefactory('web',iotype = iotype)

    #novo_hcp = pd.read_csv("./essential/novo_hcp")
    novo_hcp = nn.Dataframefactory('novo_hcp',iotype = iotype)

    #novo_market = pd.read_csv("./essential/novo_hcp_market")
    novo_market = nn.Dataframefactory('novo_hcp_market',iotype = iotype)
    article_url = nn.Dataframefactory('article_url',iotype = iotype)
    print("Step 1: Done")
    print("------------------------------------------------------")
    print("Step 2: Creating dictionary")
    createDictStop()
    print("Step 2: Done")
    print("------------------------------------------------------")
    print("Step 3: Processing Raw Data")
    wechatFiltered, webFiltered, validWechatLog, validWebLog, contentPrefData, LogData = dataPrepare(
        wechat, web)
    print("Step 3: Done")
    print("------------------------------------------------------")
    print("Step 4: Calculating Channel Preference")
    output1 = channelPref(wechatFiltered, webFiltered)
    print("Step 4: Done")
    contentTitle = contentPrefData['content_title'].dropna(
    ).drop_duplicates().to_frame()
    contentLabeled = titleLabeling(contentTitle, mapping)
    contentNew = contentPrefData.merge(
        contentLabeled, left_on='content_title', right_on='content_title')
    print("------------------------------------------------------")
    print("Step 5: Caculating HCP Content Preference and Interest Point")
    output2Parts = []
    output3Parts = []
    for dc_id in contentNew.doctorid.unique():
        contentInterest, otherTags, lb, labelMap = calContInst(
            contentNew, dc_id)
        keywordCnt = calContKeyWord(
            contentNew, dc_id, lb, otherTags, labelMap, mapping)
        output2Parts.append(contentInterest)
        output3Parts.append(keywordCnt)
    output2 = pd.concat(output2Parts, ignore_index=True)
    output3 = pd.concat(output3Parts, ignore_index=True)
    print("Step 5: Done")
    print("------------------------------------------------------")
    print("Step 6: Caculating HCP reading History")
    webHistWithoutToken = webHistWithoutTokens(validWebLog)
    wechathistWithoutToken = wechatHistWithoutTokens(validWechatLog)
    output4 = readingHist(webHistWithoutToken,
                          wechathistWithoutToken, contentLabeled)
    print("Step 6: Done")
    content_uq = get_content_uniq(LogData)
    hcp_reading_history = get_hcp_reading_history(LogData)
    doctorid_uq = get_uniq_doctorid(LogData)
    hcp_lb_uq = get_hcp_label_uniq(mapping)
    content_lb = contentLabeled[["content_title", "HCP标签"]]
    content_lb_pop = content_lb.merge(
        content_uq[["content_id", "content_title", "popularity"]], on="content_title")
    # Reshape the table: expand the HCP label column (HCP标签) into one
    # indicator column per label, so per-label counts are easy to compute
    content_lb_pop[hcp_lb_uq] = content_lb_pop["HCP标签"].apply(
        create_var, args=(hcp_lb_uq,))
    hcp_tech_class, hcp_info_pro = get_hcp_tech_class(
        novo_hcp, novo_market, doctorid_uq, hcp_reading_history)
    content_pop = content_lb_pop[["content_title", "popularity"]]
    hcp_class_mapping = get_hcp_class_mapping(
        hcp_info_pro, hcp_tech_class, doctorid_uq)

    content_lb_pop = content_lb_pop[content_lb_pop.content_title.isin(wecall_content)]
    print("------------------------------------------------------")
    print("Step 7: Generating HCP Personal Recommendation List")
    o2 = output2.copy()
    o2["Ratio"] = o2.Ratio.apply(p2f)
    output5Parts = []
    for doc_id in doctorid_uq:
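        # Build a five-row frame per doctor: one row per recommendation slot,
        # each of the three methods below filling its own column independently.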
        test = pd.DataFrame(np.nan, index=range(0, 5), columns=[
                            "doctorid", "rec_cnt", "method1", "method2", "method3"])

        test["method1"] = content_lb_pop[~content_lb_pop["content_title"].isin(hcp_reading_history.get(doc_id))] \
            .sort_values('popularity', ascending=False) \
            .head(5) \
            .content_title \
            .reset_index(drop=True)
    ###################################################################################################
        try:
            inst_list = get_most_interest_keyword(o2, doc_id)
            personal_rec = content_lb_pop[content_lb_pop[inst_list].any(axis=1)]
            test["method2"] = personal_rec[~personal_rec["content_title"].isin(hcp_reading_history.get(doc_id))] \
                .sort_values('popularity', ascending=False) \
                .head(5) \
                .content_title \
                .reset_index(drop=True)
        except IndexError:
            test["method2"] = np.nan
    ###################################################################################################
        try:
            hcp_class_content = get_hcp_class(
                hcp_tech_class, hcp_class_mapping, doc_id, content_lb_pop)
        except IndexError:
            hcp_class_content = pd.DataFrame(
                columns=["content_title", "popularity"])

        test["method3"] = hcp_class_content[~hcp_class_content["content_title"].isin(hcp_reading_history.get(doc_id))] \
            .sort_values('popularity', ascending=False) \
            .head(5) \
            .content_title \
            .reset_index(drop=True)
        test["doctorid"] = doc_id
        test["rec_cnt"] = test.index + 1

        output5Parts.append(test)

    output5 = pd.concat(output5Parts, ignore_index=True)
    url = article_url[["title", "url"]]
    output5_1 = output5.merge(url, left_on="method1", right_on="title", how="left")
    output5_1 = output5_1.merge(url, left_on="method2", right_on="title",
                                how="left", suffixes=("_1", "_2"))
    output5_1 = output5_1.merge(url, left_on="method3", right_on="title",
                                how="left", suffixes=("_1", "_2"))
    col_drop = ["title_1", "title_2", "title"]
    output5_1.drop(columns=col_drop, inplace=True)
    col_left = ["doctorid", "rec_cnt", "method1", "method2", "method3",
                "url_1", "url_2", "url"]
    output5_1 = output5_1[col_left]
    output5_1.rename(columns={"url": "url_3"}, inplace=True)
    print("Step 7: Done")
    print("------------------------------------------------------")
    print("ALL COMPLETE")

    nn.write_table(output1, 'hcp_channel_preference', iotype=iotype)
    nn.write_table(output2, 'hcp_content_interest', iotype=iotype)
    nn.write_table(output3, 'hcp_content_interest_keyword', iotype=iotype)
    nn.write_table(output4, 'hcp_reading_history', iotype=iotype)
    nn.write_table(output5_1, 'hcp_recommendation', iotype=iotype)

    return (1)
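create_var, applied row-wise above to fan the HCP标签 column out into one indicator column per label, is defined elsewhere; given that its output is assigned into content_lb_pop[hcp_lb_uq], it plausibly returns a 0/1 Series indexed by the label list. A hypothetical reconstruction, not the source's definition:

import pandas as pd

def create_var(tags, labels):
    # Hypothetical sketch: one 0/1 flag per label, set when the label
    # occurs in this row's tag field. The real helper may differ.
    return pd.Series([int(lb in str(tags)) for lb in labels], index=labels)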