Example #1
def get_one_day_wechat_full_sentence(date):
    aggregated_wechat_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                              "aggregated_wechat_data")
    wechat_full_sentence_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                                 "wechat_full_sentence_data")

    aggregated_wechat_data = os.path.join(aggregated_wechat_data_dir,
                                          "aggregated_wechat_data_%s" % date)
    wechat_full_sentence_data = os.path.join(
        wechat_full_sentence_data_dir, "wechat_full_sentence_data_%s" % date)

    with codecs.open(aggregated_wechat_data, "r", "utf-8") as fin, \
            codecs.open(wechat_full_sentence_data, "w", "utf-8") as fout:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue

            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                chat_ls = json.loads(chat_ls)
            except Exception as e:
                log.info(e)
                continue

            cleared_chat_sentence = clear_sentence(chat_ls)
            chat_stat_ls = stat_sentence(chat_ls)

            stat_str = "\t".join(map(str, chat_stat_ls))
            fout.write(opp_id + "\t" + stat_str + "\t" +
                       cleared_chat_sentence + "\n")
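clear_sentence and stat_sentence are not shown in these examples. Judging from the tab-separated line written here (opp_id, three counts, cleaned text) and read back in Example #5, stat_sentence presumably returns a three-element count list and clear_sentence a cleaned chat string. A minimal hypothetical sketch, assuming each chat dict carries a "send_type" flag ("1" = student, "0" = advisor) and a "chat_record" text field:

# Hypothetical sketches only -- the real helpers are not part of this example.
# Assumed fields: "send_type" ("1" = student, "0" = advisor) and "chat_record".
def stat_sentence(chat_ls):
    student_chat_num = sum(1 for c in chat_ls if c.get("send_type") == "1")
    teacher_chat_num = sum(1 for c in chat_ls if c.get("send_type") == "0")
    return [student_chat_num, teacher_chat_num, len(chat_ls)]

def clear_sentence(chat_ls):
    # Real cleaning (emoji, URLs, whitespace) is omitted; just join the text.
    return " ".join(c.get("chat_record", "").strip() for c in chat_ls)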
Example #2
def load_multi_day_order(start_date, end_date):
    multi_day_dict = {}
    date_ls = DateUtil.get_every_date(start_date, end_date)
    date_ls = sorted(date_ls, reverse=True)  # descending order: if an opp appears on several days, the smallest (earliest) order time wins
    for date in date_ls:
        log.info(date)
        one_day_dict = load_one_day_order(date)
        multi_day_dict.update(one_day_dict)
    return multi_day_dict
def load_wechat_2_dict(start_date, end_date):
    date_ls = DateUtil.get_every_date(start_date, end_date)
    dict_wechat = OrderedDict()  # date -> {opp_id -> list of chat dicts}

    wechat_segment_data_dir = os.path.join(home_dir, "project_data/xgb",
                                           "raw_data/aggregated_wechat_segment_data")
    wechat_segment_data = os.path.join(wechat_segment_data_dir, "aggregated_wechat_segment_data_%s")

    log.info(date_ls)
    for tmp_date in date_ls:
        log.info(tmp_date)
        dict_wechat[tmp_date] = OrderedDict()
        with codecs.open(wechat_segment_data % tmp_date, "r", "utf-8") as fin:
            for line in fin:
                arr = line.strip().split("\t")
                if len(arr) != 2:
                    continue
                opp_id = arr[0].strip()
                chat_ls = arr[1].strip()
                try:
                    chat_ls = json.loads(chat_ls)
                except Exception as e:
                    log.info(e)
                    continue
                dict_wechat[tmp_date][opp_id] = chat_ls
    return dict_wechat
def update_wechat_dict(dict_wechat, del_date, add_date):
    del dict_wechat[del_date]

    wechat_segment_data_dir = os.path.join(home_dir, "project_data/xgb",
                                           "raw_data/aggregated_wechat_segment_data")
    wechat_segment_data = os.path.join(wechat_segment_data_dir, "aggregated_wechat_segment_data_%s" % add_date)

    dict_wechat[add_date] = OrderedDict()
    with codecs.open(wechat_segment_data, "r", "utf-8") as fin:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue

            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                chat_ls = json.loads(chat_ls)
            except Exception as e:
                log.info(e)
                continue
            dict_wechat[add_date][opp_id] = chat_ls
    return dict_wechat
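All three loaders above expect each line of the aggregated segment files to be an opp_id and a JSON-encoded chat list separated by a single tab; the list elements carry at least the send_type, create_time, receive_time, account, and chat_record fields that Example #7 reads. A hedged illustration with made-up values:

import json

# Hypothetical sample line for aggregated_wechat_segment_data_<date>;
# the field names mirror how Example #7 consumes them, the values are invented.
sample_line = "opp_123\t" + json.dumps([{
    "send_type": "1",
    "create_time": "2019-01-01 10:00:00",
    "receive_time": "2019-01-01 10:00:01",
    "account": "acct_9",
    "chat_record": "hello price course",  # pre-segmented text, space separated
}])
opp_id, chat_json = sample_line.strip().split("\t")
assert json.loads(chat_json)[0]["send_type"] == "1"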
Example #5
def load_hist_wechat_record_dict(date):
    wechat_dict = {}  # opp_id -> {"stat_info": [student, teacher, all], "chat_content": str}
    start_date = DateUtil.get_relative_delta_time_str(
        date, day=-HISTORY_WECHAT_RECORD_DELTA_DAY)
    end_date = DateUtil.get_relative_delta_time_str(date, day=-1)
    date_ls = sorted(DateUtil.get_every_date(start_date,
                                             end_date))  # chronological order: later chats are appended after earlier ones
    wechat_full_sentence_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                                 "wechat_full_sentence_data")

    wechat_full_sentence_data_file = os.path.join(
        wechat_full_sentence_data_dir, "wechat_full_sentence_data_%s")
    for tmp_date in date_ls:
        log.info(tmp_date)
        wechat_full_sentence_data = wechat_full_sentence_data_file % tmp_date
        with codecs.open(wechat_full_sentence_data, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                if len(arr) != 5:
                    continue
                opp_id, student_chat_num, teacher_chat_num, all_chat_num, chat_content = arr
                student_chat_num, teacher_chat_num, all_chat_num = int(
                    student_chat_num), int(teacher_chat_num), int(all_chat_num)

                if opp_id not in wechat_dict:
                    wechat_dict[opp_id] = {
                        "stat_info": [0, 0, 0],
                        "chat_content": ""
                    }
                wechat_dict[opp_id]["chat_content"] = wechat_dict[opp_id][
                    "chat_content"] + chat_content
                wechat_dict[opp_id]["stat_info"] = [
                    x + y for x, y in
                    zip([student_chat_num, teacher_chat_num, all_chat_num],
                        wechat_dict[opp_id]["stat_info"])
                ]
    return wechat_dict
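The returned wechat_dict maps each opp_id to its accumulated history, which Example #7 later reads through the "stat_info" and "chat_content" keys. One entry might look like this (values are hypothetical):

# Hypothetical shape of a single entry in the returned dict:
# stat_info sums [student_chat_num, teacher_chat_num, all_chat_num] over the window,
# chat_content concatenates the cleaned text in date order.
{
    "opp_123": {
        "stat_info": [12, 30, 42],
        "chat_content": "hello price course ...",
    }
}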
Example #6
def get_hist_wechat_full_sentence(start_date, end_date):
    date_ls = DateUtil.get_every_date(start_date, end_date)
    for tmp_date in date_ls:
        log.info("extract %s basic feature..." % tmp_date)
        get_one_day_wechat_full_sentence(tmp_date)
Example #7
def get_one_day_wechat_basic_feature(date):
    log.info("get %s wechat basic feature..." % date)
    wechat_basic_feature_dir = os.path.join(PROJECT_DATA_DIR, "feature_file",
                                            "wechat_basic_feature")
    aggregated_wechat_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                              "aggregated_wechat_data")

    wechat_basic_feature_data = os.path.join(wechat_basic_feature_dir,
                                             "wechat_basic_feature_%s" % date)
    aggregated_wechat_data = os.path.join(aggregated_wechat_data_dir,
                                          "aggregated_wechat_data_%s" % date)
    log.info("prepare hist wechat chat dict...")
    hist_wechat_chat_dict = load_hist_wechat_record_dict(date)

    log.info("prepare hist and future order dict...")
    hist_order_dict = load_multi_day_order(
        DateUtil.get_relative_delta_time_str(date,
                                             day=-HISTORY_ORDER_DELTA_DAY),
        DateUtil.get_relative_delta_time_str(date, day=-1))
    future_order_dict = load_multi_day_order(
        date,
        DateUtil.get_relative_delta_time_str(date, day=FUTURE_ORDER_DELTA_DAY))

    log.info("start 2 gen wechat basic feature...")
    with codecs.open(aggregated_wechat_data, "r", "utf-8") as fin, \
            codecs.open(wechat_basic_feature_data, "w", "utf-8") as fout:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue

            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                chat_ls = json.loads(chat_ls)
            except Exception as e:
                log.info(e)
                continue
            if opp_id in hist_order_dict:
                continue

            student_chat_idx = []
            for idx, chat_dict in enumerate(chat_ls):
                send_type = chat_dict["send_type"]
                if send_type == "1":
                    student_chat_idx.append(idx)
            if not student_chat_idx:
                continue

            # randomly pick one student message as the sample generation point
            sample_idx = np.random.choice(student_chat_idx, 1)[0]
            sample_chat = chat_ls[sample_idx]
            create_time = sample_chat["create_time"]
            account = sample_chat["account"]
            order_time = future_order_dict.get(opp_id, None)  # did this opportunity convert in the near future?
            label = judge_label(order_time, create_time)
            if label == "-1":
                continue
            sample_chat_ls = chat_ls[:sample_idx + 1]
            cleared_chat_sentence = clear_sentence(sample_chat_ls)
            chat_stat_ls = stat_sentence(sample_chat_ls)
            hist_chat_stat_ls = [0, 0, 0]

            hist_wechat_chat = hist_wechat_chat_dict.get(opp_id,
                                                         None)  # does this opportunity have prior chat history?

            if hist_wechat_chat:  # prepend the historical chat info
                hist_chat_stat_ls = hist_wechat_chat["stat_info"]
                cleared_chat_sentence = hist_wechat_chat[
                    "chat_content"] + cleared_chat_sentence

            today_stat_str = "\t".join(map(str, chat_stat_ls))
            hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            result = "\t".join([
                label, opp_id, account, create_time, today_stat_str,
                hist_stat_str, cleared_chat_sentence
            ])
            fout.write(result + "\n")

            # No sampling: walk every message; each student message triggers one sample
            # for idx, chat_dict in enumerate(chat_ls):
            #     send_type = chat_dict["send_type"]
            #     create_time = chat_dict["create_time"]
            #     account = chat_dict["account"]
            #
            #     if send_type == "0":  # advisor messages are not used as sample points
            #         continue
            #
            #     label = judge_label(order_time, create_time)
            #     if label == "-1":
            #         continue
            #
            #     cleared_chat_sentence = clear_sentence(chat_ls[:idx + 1])
            #     chat_stat_ls = stat_sentence(chat_ls[:idx + 1])
            #     hist_chat_stat_ls = [0, 0, 0]
            #
            #     if hist_wechat_chat:  # prepend the historical chat info
            #         hist_chat_stat_ls = hist_wechat_chat["stat_info"]
            #         cleared_chat_sentence = hist_wechat_chat["chat_content"] + cleared_chat_sentence
            #
            #     today_stat_str = "\t".join(map(str, chat_stat_ls))
            #     hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            #     result = "\t".join(
            #         [label, opp_id, account, create_time, today_stat_str, hist_stat_str, cleared_chat_sentence])
            #     fout.write(result + "\n")
    log.info("finished, write feature to file : %s" %
             wechat_basic_feature_data)
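judge_label is not defined in these examples. In the two-argument form used above it receives the opportunity's future order time (or None) and the chat's create_time, and returns a string label, with "-1" meaning the sample is dropped. A minimal sketch under those assumptions; the real labeling rule may differ:

# Hypothetical judge_label for the two-argument call above. Assumes timestamps
# are strings in a sortable format so plain comparison works.
def judge_label(order_time, create_time):
    if order_time is None:          # no order in the future window -> negative sample
        return "0"
    if order_time >= create_time:   # converted at or after this message -> positive sample
        return "1"
    return "-1"                     # order precedes the message -> drop the sample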
def get_hist_wechat_tf_feature(start_date, end_date):
    date_ls = DateUtil.get_every_date(start_date, end_date)

    log.info("initial past n day wechat segment data...")
    s_date = (datetime.strptime(start_date, "%Y%m%d") - timedelta(days=PAST_DAYS_LIMIT)).strftime("%Y%m%d")
    t_date = start_date
    log.info("load past days wechat segment data...")
    dict_wechat = load_wechat_2_dict(s_date, t_date)

    log.info("extract tf feature...")
    for tmp_date in date_ls:
        log.info("extract %s tf feature..." % tmp_date)
        start_time = time.time()
        get_one_day_wechat_tf_feature(tmp_date, dict_wechat)
        log.info("extract {0} wechat tf feature cost time:{1}".format(tmp_date, time.time()-start_time))

        del_date = (datetime.strptime(tmp_date, "%Y%m%d") - timedelta(days=PAST_DAYS_LIMIT)).strftime("%Y%m%d")
        add_date = (datetime.strptime(tmp_date, "%Y%m%d") + timedelta(days=1)).strftime("%Y%m%d")
        if add_date <= end_date:
            log.info("update past days wechat segment data [del %s, add %s]..." % (del_date, add_date))
            dict_wechat = update_wechat_dict(dict_wechat, del_date, add_date)
            log.info("=======" * 3)
def get_one_day_wechat_tf_feature(date, dict_wechat):
    applied_order_data_dir = os.path.join(home_dir, "project_data/xgb", "raw_data/applied_order")
    middle_dir = os.path.join(home_dir, "project_data/xgb", "middle_file")
    label_date = (datetime.strptime(date, "%Y%m%d") + timedelta(days=LABEL_DAYS_LIMIT)).strftime("%Y%m%d")

    hist_applied_order_data = aggregation_data(applied_order_data_dir, middle_dir,
                                               "applied_order_",
                                               """{0} <= '%s'""" % date,
                                               file_pattern_filter="applied_order_")

    future_applied_order_data = aggregation_data(applied_order_data_dir, middle_dir,
                                                 "applied_order_",
                                                 """'%s'<= {0} <='%s'""" % (date, label_date),
                                                 file_pattern_filter="applied_order_")

    log.info("load history applied order to filter applied samples...")
    dict_hist_applied_order = load_applied_order_2_dict(hist_applied_order_data)

    log.info("load future applied order to judge sample label...")
    dict_future_applied_order = load_applied_order_2_dict(future_applied_order_data)

    benchmark_label_data = os.path.join(home_dir, "project_data/xgb",
                                        "benchmark_label_data", "benchmark_label_data_%s" % date)
    wechat_tf_feature_file = os.path.join(home_dir, "project_data/xgb",
                                          "feature_file/wechat_tf_feature", "wechat_tf_feature_%s" % date)

    past_tf_dict = OrderedDict()  # term frequencies over the past N days of chats (students and advisors not distinguished)
    for tmp_date, tmp_dc in dict_wechat.items():
        if tmp_date == date:
            continue
        for tmp_opp, tmp_dc_ls in tmp_dc.items():
            if tmp_opp not in past_tf_dict:
                past_tf_dict[tmp_opp] = OrderedDict()
            for tmp in tmp_dc_ls:
                chat_record_seg = tmp["chat_record"]
                if chat_record_seg.strip() == "":
                    continue
                seg_words = chat_record_seg.strip().split(" ")
                for word in seg_words:
                    if word not in past_tf_dict[tmp_opp]:
                        past_tf_dict[tmp_opp][word] = 0
                    past_tf_dict[tmp_opp][word] += 1

    past_student_dialogue_dict = dict()  # number of student messages over the past N days
    past_account_dialogue_dict = dict()  # number of advisor messages over the past N days
    for tmp_date, tmp_dc in dict_wechat.items():
        if tmp_date == date:
            continue
        for tmp_opp, tmp_dc_ls in tmp_dc.items():
            if tmp_opp not in past_student_dialogue_dict:
                past_student_dialogue_dict[tmp_opp] = 0
            if tmp_opp not in past_account_dialogue_dict:
                past_account_dialogue_dict[tmp_opp] = 0
            for tmp in tmp_dc_ls:
                send_type = tmp["send_type"]
                if send_type == "1":
                    past_student_dialogue_dict[tmp_opp] += 1
                elif send_type == "0":
                    past_account_dialogue_dict[tmp_opp] += 1

    with codecs.open(benchmark_label_data, "w", "utf-8") as fout1, codecs.open(wechat_tf_feature_file, "w", "utf-8") as fout2:
        for tmp_opp, tmp_dc_ls in dict_wechat[date].items():

            dict_word_tf = OrderedDict()
            student_dialogue = 0
            account_dialogue = 0
            for tmp_dc in tmp_dc_ls:
                create_time = tmp_dc["create_time"]
                receive_time = tmp_dc["receive_time"]
                account = tmp_dc["account"]
                chat_record_seg = tmp_dc["chat_record"]
                send_type = tmp_dc["send_type"]

                if tmp_opp in dict_hist_applied_order and min(dict_hist_applied_order[tmp_opp]) < create_time:  # already converted in the past (the multi-order case is ignored here)
                    continue

                if chat_record_seg.strip() == "":
                    continue

                seg_words = chat_record_seg.strip().split(" ")
                for word in seg_words:  # term frequencies of this opportunity's chats for the current day
                    if word not in dict_word_tf:
                        dict_word_tf[word] = 0
                    dict_word_tf[word] += 1

                if send_type == "0":  # 咨询师会话不作为样本
                    account_dialogue += 1
                    continue
                else:
                    student_dialogue += 1

                # log.info("judge label...")
                label = judge_label(dict_future_applied_order, tmp_opp, create_time)

                # merge in this opportunity's term frequencies from past days
                if tmp_opp in past_tf_dict:
                    statistic_tf_dict = copy.deepcopy(past_tf_dict[tmp_opp])
                    for tmp_word, tmp_tf in dict_word_tf.items():
                        if tmp_word not in statistic_tf_dict:
                            statistic_tf_dict[tmp_word] = 0
                        statistic_tf_dict[tmp_word] += tmp_tf
                else:
                    statistic_tf_dict = copy.deepcopy(dict_word_tf)

                result_str = label + " " + "account_%s:1" % account + " "
                for tmp_word, tmp_tf in statistic_tf_dict.items():
                    result_str += "TF_" + tmp_word + ":" + str(tmp_tf) + " "
                fout1.write(str(student_dialogue + past_student_dialogue_dict.get(tmp_opp, 0)) + "\t" +
                            str(account_dialogue + past_account_dialogue_dict.get(tmp_opp, 0)) + "\t" +
                            label + "\t" + tmp_opp + "\t" +
                            account + "\t" + create_time + "\t" + receive_time + "\t" + chat_record_seg + "\n")
                fout2.write(result_str.strip() + "\n")

    delete_flag = True
    if delete_flag:
        cmd = "rm %s" % (" ".join([hist_applied_order_data, future_applied_order_data]))
        log.info(cmd)
        os.system(cmd)
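Each line written to the wechat_tf_feature file is space separated: the label, an account indicator feature, then one TF_<word>:<count> pair per term accumulated over the past window plus the current day. A hedged illustration of the format and a simple round-trip parse (values are made up):

# Hypothetical wechat_tf_feature line and a parse back into (label, feature dict).
line = "1 account_acct_9:1 TF_hello:3 TF_price:1"
parts = line.strip().split(" ")
label, feats = parts[0], dict(p.rsplit(":", 1) for p in parts[1:])
# feats == {"account_acct_9": "1", "TF_hello": "3", "TF_price": "1"}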