def get_one_day_wechat_full_sentence(date):
    """Build one day's "full sentence" file from the aggregated wechat dump.

    Reads raw_data/aggregated_wechat_data/aggregated_wechat_data_<date>
    (tab-separated lines of "opp_id\t<JSON chat list>") and writes
    raw_data/wechat_full_sentence_data/wechat_full_sentence_data_<date>
    as "opp_id\t<stat fields>\t<cleaned sentence>".

    Malformed lines (not exactly two fields) and unparsable JSON payloads
    are skipped; parse errors are logged.
    """
    aggregated_wechat_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data", "aggregated_wechat_data")
    wechat_full_sentence_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data", "wechat_full_sentence_data")
    aggregated_wechat_data = os.path.join(aggregated_wechat_data_dir, "aggregated_wechat_data_%s" % date)
    wechat_full_sentence_data = os.path.join(
        wechat_full_sentence_data_dir, "wechat_full_sentence_data_%s" % date)
    with codecs.open(aggregated_wechat_data, "r", "utf-8") as fin, \
            codecs.open(wechat_full_sentence_data, "w", "utf-8") as fout:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue
            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                # BUGFIX: dropped the `encoding="utf-8"` kwarg — json.loads()
                # ignored it since Python 3.1 and removed it in 3.9, where
                # passing it raises TypeError. The input is already str here.
                chat_ls = json.loads(chat_ls)
            except Exception as e:
                log.info(e)
                continue
            cleared_chat_sentence = clear_sentence(chat_ls)
            chat_stat_ls = stat_sentence(chat_ls)
            stat_str = "\t".join(map(str, chat_stat_ls))
            fout.write(opp_id + "\t" + stat_str + "\t" + cleared_chat_sentence + "\n")
def load_multi_day_order(start_date, end_date):
    """Merge per-day order dicts over [start_date, end_date] into one dict.

    Days are processed in descending order, so when an opportunity shows up
    on several days the entry merged last — the earliest day, i.e. the
    smallest order time — overrides the others.
    """
    merged = {}
    for day in sorted(DateUtil.get_every_date(start_date, end_date), reverse=True):
        log.info(day)
        merged.update(load_one_day_order(day))
    return merged
def load_wechat_2_dict(start_date, end_date):
    """Load word-segmented wechat chat data for every day in a range.

    Returns:
        OrderedDict: date -> OrderedDict(opp_id -> chat list parsed from JSON).

    Lines that are not exactly "opp_id\t<json>" or whose JSON fails to parse
    are skipped (the parse error is logged).
    """
    date_ls = DateUtil.get_every_date(start_date, end_date)
    dict_wechat = OrderedDict()
    # BUGFIX: this assignment was commented out, leaving wechat_segment_data_dir
    # undefined (NameError) unless a same-named module-level global existed.
    # Restored to the exact path update_wechat_dict() uses for the same files.
    # NOTE(review): if a global with a different value was intended, confirm.
    wechat_segment_data_dir = os.path.join(home_dir, "project_data/xgb", "raw_data/aggregated_wechat_segment_data")
    wechat_segment_data = os.path.join(wechat_segment_data_dir, "aggregated_wechat_segment_data_%s")
    log.info(date_ls)
    for tmp_date in date_ls:
        log.info(tmp_date)
        dict_wechat[tmp_date] = OrderedDict()
        with codecs.open(wechat_segment_data % tmp_date, "r", "utf-8") as fin:
            for line in fin:
                arr = line.strip().split("\t")
                if len(arr) != 2:
                    continue
                opp_id = arr[0].strip()
                chat_ls = arr[1].strip()
                try:
                    chat_ls = json.loads(chat_ls)
                except Exception as e:
                    log.info(e)
                    continue
                dict_wechat[tmp_date][opp_id] = chat_ls
    return dict_wechat
def update_wechat_dict(dict_wechat, del_date, add_date):
    """Slide the chat-history window: drop del_date's data, load add_date's.

    Args:
        dict_wechat: date -> OrderedDict(opp_id -> chat list), mutated in place.
        del_date: date key to evict (must currently exist in dict_wechat).
        add_date: date whose segment file is parsed and appended.

    Returns:
        The same (mutated) dict_wechat, for caller convenience.
    """
    del dict_wechat[del_date]
    wechat_segment_data_dir = os.path.join(home_dir, "project_data/xgb", "raw_data/aggregated_wechat_segment_data")
    wechat_segment_data = os.path.join(wechat_segment_data_dir, "aggregated_wechat_segment_data_%s" % add_date)
    dict_wechat[add_date] = OrderedDict()
    with codecs.open(wechat_segment_data, "r", "utf-8") as fin:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue
            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                chat_ls = json.loads(chat_ls)
            except Exception as e:
                # BUGFIX: was log.info(2) — a literal that threw away the
                # parse error; every sibling handler in this file logs `e`.
                log.info(e)
                continue
            dict_wechat[add_date][opp_id] = chat_ls
    return dict_wechat
def load_hist_wechat_record_dict(date):
    """Aggregate the previous HISTORY_WECHAT_RECORD_DELTA_DAY days of chats.

    Returns a mapping opp_id -> {"stat_info": [student, teacher, all] message
    counts summed over the window, "chat_content": all cleaned sentences
    concatenated in chronological order}.
    """
    # Kept as defaultdict(list) for interface compatibility; in practice the
    # factory never fires because every access is guarded by a membership check.
    wechat_dict = defaultdict(list)
    window_start = DateUtil.get_relative_delta_time_str(
        date, day=-HISTORY_WECHAT_RECORD_DELTA_DAY)
    window_end = DateUtil.get_relative_delta_time_str(date, -1)
    data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data", "wechat_full_sentence_data")
    file_template = os.path.join(data_dir, "wechat_full_sentence_data_%s")
    # Ascending sort so later days' chats are appended after earlier ones.
    for day in sorted(DateUtil.get_every_date(window_start, window_end)):
        log.info(day)
        with codecs.open(file_template % day, 'r', 'utf-8') as fin:
            for line in fin:
                fields = line.strip().split("\t")
                if len(fields) != 5:
                    continue
                opp_id, s_num, t_num, a_num, content = fields
                counts = [int(s_num), int(t_num), int(a_num)]
                if opp_id not in wechat_dict:
                    wechat_dict[opp_id] = {
                        "stat_info": [0, 0, 0],
                        "chat_content": ""
                    }
                entry = wechat_dict[opp_id]
                entry["chat_content"] = entry["chat_content"] + content
                entry["stat_info"] = [
                    new + old for new, old in zip(counts, entry["stat_info"])
                ]
    return wechat_dict
def get_hist_wechat_full_sentence(start_date, end_date):
    """Run the daily full-sentence extraction for each day in the range."""
    for day in DateUtil.get_every_date(start_date, end_date):
        log.info("extract %s basic feature..." % day)
        get_one_day_wechat_full_sentence(day)
def get_one_day_wechat_basic_feature(date):
    """Generate one day's wechat basic-feature training samples.

    For each opportunity's chat list on `date`, randomly picks one student
    message as the sample point, labels it by whether an order follows
    within the future window, and writes one tab-separated line:
    label, opp_id, account, create_time, today's chat stats, historical
    chat stats, and the cleaned chat text (history prepended if present).

    Opportunities that already ordered in the history window are skipped,
    as are chats containing no student message.
    """
    log.info("get %s wechat basic feature..." % date)
    wechat_basic_feature_dir = os.path.join(PROJECT_DATA_DIR, "feature_file", "wechat_basic_feature")
    aggregated_wechat_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data", "aggregated_wechat_data")
    wechat_basic_feature_data = os.path.join(wechat_basic_feature_dir, "wechat_basic_feature_%s" % date)
    aggregated_wechat_data = os.path.join(aggregated_wechat_data_dir, "aggregated_wechat_data_%s" % date)
    log.info("prepare hist wechat chat dict...")
    hist_wechat_chat_dict = load_hist_wechat_record_dict(date)
    log.info("prepare hist and future order dict...")
    hist_order_dict = load_multi_day_order(
        DateUtil.get_relative_delta_time_str(date, day=-HISTORY_ORDER_DELTA_DAY),
        DateUtil.get_relative_delta_time_str(date, day=-1))
    future_order_dict = load_multi_day_order(
        date, DateUtil.get_relative_delta_time_str(date, day=FUTURE_ORDER_DELTA_DAY))
    log.info("start 2 gen wechat basic feature...")
    with codecs.open(aggregated_wechat_data, "r", "utf-8") as fin, \
            codecs.open(wechat_basic_feature_data, "w", "utf-8") as fout:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue
            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                # BUGFIX: dropped `encoding="utf-8"` — json.loads() removed the
                # kwarg in Python 3.9 (ignored since 3.1); passing it raises
                # TypeError there.
                chat_ls = json.loads(chat_ls)
            except Exception as e:
                log.info(e)
                continue
            # Skip opportunities that already ordered in the history window.
            if opp_id in hist_order_dict:
                continue
            student_chat_idx = []
            for idx, chat_dict in enumerate(chat_ls):
                send_type = chat_dict["send_type"]
                if send_type == "1":  # "1" marks a student message
                    student_chat_idx.append(idx)
            if not student_chat_idx:
                continue
            # Randomly pick one student message as the sample generation point.
            sample_idx = np.random.choice(student_chat_idx, 1)[0]
            sample_chat = chat_ls[sample_idx]
            create_time = sample_chat["create_time"]
            account = sample_chat["account"]
            order_time = future_order_dict.get(opp_id, None)
            # Label: did an order happen recently after this message?
            label = judge_label(order_time, create_time)
            if label == "-1":
                continue
            # Features come from the chat up to and including the sample point.
            sample_chat_ls = chat_ls[:sample_idx + 1]
            cleared_chat_sentence = clear_sentence(sample_chat_ls)
            chat_stat_ls = stat_sentence(sample_chat_ls)
            hist_chat_stat_ls = [0, 0, 0]
            hist_wechat_chat = hist_wechat_chat_dict.get(opp_id, None)
            if hist_wechat_chat:
                # Prepend historical chat content and carry its stats.
                hist_chat_stat_ls = hist_wechat_chat["stat_info"]
                cleared_chat_sentence = hist_wechat_chat["chat_content"] + cleared_chat_sentence
            today_stat_str = "\t".join(map(str, chat_stat_ls))
            hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            result = "\t".join([
                label, opp_id, account, create_time, today_stat_str,
                hist_stat_str, cleared_chat_sentence
            ])
            fout.write(result + "\n")
    # NOTE: an alternative non-sampled variant (one sample per student
    # message) previously lived here as commented-out code; removed as dead.
    log.info("finished, write feature to file : %s" % wechat_basic_feature_data)
def get_hist_wechat_tf_feature(start_date, end_date):
    """Extract TF features for every day in [start_date, end_date].

    Maintains a sliding window of the past PAST_DAYS_LIMIT days of segmented
    chat data: after each day is processed, the oldest day is evicted and the
    next day is loaded via update_wechat_dict().
    """
    date_ls = DateUtil.get_every_date(start_date, end_date)
    log.info("initial past n day wechat segment data...")
    window_start = (datetime.strptime(start_date, "%Y%m%d") -
                    timedelta(days=PAST_DAYS_LIMIT)).strftime("%Y%m%d")
    log.info("load past days wechat segment data...")
    dict_wechat = load_wechat_2_dict(window_start, start_date)
    log.info("extract tf feature...")
    for tmp_date in date_ls:
        log.info("extract %s tf feature..." % tmp_date)
        started = time.time()
        get_one_day_wechat_tf_feature(tmp_date, dict_wechat)
        log.info("extract {0} wechat tf feature cost time:{1}".format(tmp_date, time.time() - started))
        cursor = datetime.strptime(tmp_date, "%Y%m%d")
        del_date = (cursor - timedelta(days=PAST_DAYS_LIMIT)).strftime("%Y%m%d")
        add_date = (cursor + timedelta(days=1)).strftime("%Y%m%d")
        # Only slide the window while the next day is still inside the range.
        if add_date <= end_date:
            log.info("update past days wechat segment data [del %s, add %s]..." % (del_date, add_date))
            dict_wechat = update_wechat_dict(dict_wechat, del_date, add_date)
        log.info("=======" * 3)
def get_one_day_wechat_tf_feature(date, dict_wechat):
    """Write one day's TF feature file and benchmark label file.

    For each opportunity active on `date`, every student message becomes a
    sample: its label is judged against future applied orders, and its
    feature vector is the merged term frequency of today's chat plus the
    past-days window in `dict_wechat` (plus an account indicator feature).

    Args:
        date: day being processed, "%Y%m%d" string.
        dict_wechat: date -> OrderedDict(opp_id -> list of chat dicts with
            keys create_time/receive_time/account/chat_record/send_type),
            covering `date` plus the past-days window.

    Side effects: writes benchmark_label_data_<date> and
    wechat_tf_feature_<date>, and removes the temporary aggregated order
    files at the end via `rm`.
    """
    applied_order_data_dir = os.path.join(home_dir, "project_data/xgb", "raw_data/applied_order")
    middle_dir = os.path.join(home_dir, "project_data/xgb", "middle_file")
    # Label horizon: orders up to LABEL_DAYS_LIMIT days after `date` count.
    label_date = (datetime.strptime(date, "%Y%m%d") + timedelta(days=LABEL_DAYS_LIMIT)).strftime("%Y%m%d")
    # Aggregate order files into temp files filtered by date predicates
    # (the "{0}" placeholder is presumably filled by aggregation_data with a
    # column name — TODO confirm against its definition).
    hist_applied_order_data = aggregation_data(applied_order_data_dir, middle_dir, "applied_order_",
                                               """{0} <= '%s'""" % date,
                                               file_pattern_filter="applied_order_")
    future_applied_order_data = aggregation_data(applied_order_data_dir, middle_dir, "applied_order_",
                                                 """'%s'<= {0} <='%s'""" % (date, label_date),
                                                 file_pattern_filter="applied_order_")
    log.info("load history applied order to filter applied samples...")
    dict_hist_applied_order = load_applied_order_2_dict(hist_applied_order_data)
    log.info("load future applied order to judge sample label...")
    dict_future_applied_order = load_applied_order_2_dict(future_applied_order_data)
    benchmark_label_data = os.path.join(home_dir, "project_data/xgb", "benchmark_label_data",
                                        "benchmark_label_data_%s" % date)
    wechat_tf_feature_file = os.path.join(home_dir, "project_data/xgb", "feature_file/wechat_tf_feature",
                                          "wechat_tf_feature_%s" % date)
    # Term frequency over the past N days' chats, per opportunity
    # (student and counselor messages are not distinguished).
    past_tf_dict = OrderedDict()
    for tmp_date, tmp_dc in dict_wechat.items():
        if tmp_date == date:  # only PAST days feed the historical TF
            continue
        for tmp_opp, tmp_dc_ls in tmp_dc.items():
            if tmp_opp not in past_tf_dict:
                past_tf_dict[tmp_opp] = OrderedDict()
            for tmp in tmp_dc_ls:
                chat_record_seg = tmp["chat_record"]
                if chat_record_seg.strip() == "":
                    continue
                # chat_record is pre-segmented: words are space-separated.
                seg_words = chat_record_seg.strip().split(" ")
                for word in seg_words:
                    if word not in past_tf_dict[tmp_opp]:
                        past_tf_dict[tmp_opp][word] = 0
                    past_tf_dict[tmp_opp][word] += 1
    past_student_dialogue_dict = dict()  # past N days: student message count
    past_account_dialogue_dict = dict()  # past N days: counselor message count
    for tmp_date, tmp_dc in dict_wechat.items():
        if tmp_date == date:
            continue
        for tmp_opp, tmp_dc_ls in tmp_dc.items():
            if tmp_opp not in past_student_dialogue_dict:
                past_student_dialogue_dict[tmp_opp] = 0
            if tmp_opp not in past_account_dialogue_dict:
                past_account_dialogue_dict[tmp_opp] = 0
            for tmp in tmp_dc_ls:
                send_type = tmp["send_type"]
                if send_type == "1":  # "1" = student, "0" = counselor
                    past_student_dialogue_dict[tmp_opp] += 1
                elif send_type == "0":
                    past_account_dialogue_dict[tmp_opp] += 1
    with codecs.open(benchmark_label_data, "w", "utf-8") as fout1, codecs.open(wechat_tf_feature_file, "w", "utf-8") as fout2:
        for tmp_opp, tmp_dc_ls in dict_wechat[date].items():
            dict_word_tf = OrderedDict()  # today's TF for this opportunity
            student_dialogue = 0
            account_dialogue = 0
            for tmp_dc in tmp_dc_ls:
                create_time = tmp_dc["create_time"]
                receive_time = tmp_dc["receive_time"]
                account = tmp_dc["account"]
                chat_record_seg = tmp_dc["chat_record"]
                send_type = tmp_dc["send_type"]
                if tmp_opp in dict_hist_applied_order and min(dict_hist_applied_order[tmp_opp]) < create_time:
                    # Already ordered historically — drop the message as a
                    # sample (multiple-order cases are not considered here).
                    continue
                if chat_record_seg.strip() == "":
                    continue
                seg_words = chat_record_seg.strip().split(" ")
                for word in seg_words:
                    # Accumulate today's per-opportunity term frequency.
                    if word not in dict_word_tf:
                        dict_word_tf[word] = 0
                    dict_word_tf[word] += 1
                if send_type == "0":
                    # Counselor messages update counts/TF but are never
                    # sample points.
                    account_dialogue += 1
                    continue
                else:
                    student_dialogue += 1
                # One sample is emitted per student message from here on.
                # log.info("judge label...")
                label = judge_label(dict_future_applied_order, tmp_opp, create_time)
                # Merge the past-days TF for this opportunity with today's.
                # deepcopy keeps past_tf_dict untouched across messages.
                if tmp_opp in past_tf_dict:
                    statistic_tf_dict = copy.deepcopy(past_tf_dict[tmp_opp])
                    for tmp_word, tmp_tf in dict_word_tf.items():
                        if tmp_word not in statistic_tf_dict:
                            statistic_tf_dict[tmp_word] = 0
                        statistic_tf_dict[tmp_word] += tmp_tf
                else:
                    statistic_tf_dict = copy.deepcopy(dict_word_tf)
                # Feature line: label, account indicator, then TF_<word>:count.
                result_str = label + " " + "account_%s:1" % account + " "
                for tmp_word, tmp_tf in statistic_tf_dict.items():
                    result_str += "TF_" + tmp_word + ":" + str(tmp_tf) + " "
                fout1.write(str(student_dialogue + past_student_dialogue_dict.get(tmp_opp, 0)) + "\t" +
                            str(account_dialogue + past_account_dialogue_dict.get(tmp_opp, 0)) + "\t" +
                            label + "\t" + tmp_opp + "\t" + account + "\t" + create_time + "\t" +
                            receive_time + "\t" + chat_record_seg + "\n")
                fout2.write(result_str.strip() + "\n")
    # Clean up the temporary aggregated order files.
    # NOTE(review): os.system("rm ...") is shell-dependent and unquoted;
    # os.remove() per file would be safer — left unchanged here.
    delete_flag = True
    if delete_flag:
        cmd = "rm %s" % (" ".join([hist_applied_order_data, future_applied_order_data]))
        log.info(cmd)
        os.system(cmd)