def transfer_wechat_feature_2_tfRecord(start_date, end_date, raw_feature_file_path, tf_record_file_path):
    """Convert the daily raw feature files of a date range into tfRecord files.

    For each date produced by ``DateUtil.get_every_date(start_date, end_date)``
    the file ``<raw_feature_file_path>/<raw dir name>_<date>`` is passed to
    ``tfRecorder.transfer_single_feature_file_2_tfRecord`` together with the
    target ``<tf_record_file_path>/<tf dir name>_<date>.tfrecord``, where the
    "dir name" is ``os.path.basename`` of the corresponding directory path.

    Args:
        start_date: First argument forwarded to ``DateUtil.get_every_date``.
        end_date: Second argument forwarded to ``DateUtil.get_every_date``.
        raw_feature_file_path: Directory that holds the raw feature files.
        tf_record_file_path: Output directory; created when it does not exist.
            ``data_info.csv`` inside this directory is also handed to the
            converter.

    Note:
        ``column_names``, ``label_name``, ``need_features_cols``,
        ``var_length_cols`` and ``col_preprocess_func`` are not parameters;
        they are resolved from the surrounding scope at call time.
    """
    # Each directory's own name doubles as the prefix of the files inside it.
    source_template = os.path.join(
        raw_feature_file_path,
        os.path.basename(raw_feature_file_path) + "_%s")
    target_template = os.path.join(
        tf_record_file_path,
        os.path.basename(tf_record_file_path) + "_%s.tfrecord")
    info_csv = os.path.join(tf_record_file_path, "data_info.csv")

    if not os.path.isdir(tf_record_file_path):
        os.makedirs(tf_record_file_path)

    for day in DateUtil.get_every_date(start_date, end_date):
        print(day)
        tfRecorder.transfer_single_feature_file_2_tfRecord(
            source_template % day,
            target_template % day,
            info_csv,
            column_names,
            label_name,
            need_features_cols,
            var_length_cols=var_length_cols,
            col_preprocess_func=col_preprocess_func)
def merge_wechat_records_by_opp_id(start_date, end_date):
    """Collect the chat records of every opp_id across a date range.

    One file per date is read, its path being
    ``tmp_merged_wechat_record_data_file % date``. Each line is tab separated:
    the first field is the opp_id and every following field is a JSON encoded
    chat record. Fields that are not valid JSON are skipped. After all files
    are read, the records of each opp_id are sorted in place by their
    ``"create_time"`` entry.

    Args:
        start_date: First argument forwarded to ``DateUtil.get_every_date``.
        end_date: Second argument forwarded to ``DateUtil.get_every_date``.

    Returns:
        dict: opp_id -> list of parsed chat records ordered by
        ``"create_time"``. An opp_id whose records all failed to parse maps
        to an empty list.

    Raises:
        KeyError: If a parsed record has no ``"create_time"`` key (raised
            while sorting).
    """
    print("merge_wechat_records_by_opp_id..")
    rs_dict = {}
    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        wechat_file = tmp_merged_wechat_record_data_file % date
        with codecs.open(wechat_file, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                # One shared list per opp_id, extended across lines and days.
                chat_ls = rs_dict.setdefault(arr[0], [])
                for chat_str in arr[1:]:
                    try:
                        # No ``encoding`` keyword: the text is already decoded
                        # by codecs.open, and Python 3.9 removed the keyword.
                        # Passing it raised TypeError, which the former bare
                        # ``except`` swallowed, silently dropping every record.
                        chat_dict = json.loads(chat_str)
                    except ValueError:
                        # Malformed JSON (JSONDecodeError is a ValueError):
                        # best effort, skip this record only.
                        continue
                    chat_ls.append(chat_dict)
    print("sort chat by create_time..")
    for chat_ls in rs_dict.values():
        chat_ls.sort(key=lambda json_dict: json_dict["create_time"])
    return rs_dict
def get_vocabulary_list(vocab_file_dir, feature_name, start_date, end_date):
    """Build the de-duplicated vocabulary of a feature over a date range.

    Reads ``<vocab_file_dir>/<feature_name>_vocabulary_<date>`` for each date
    produced by ``DateUtil.get_every_date(start_date, end_date)``. Every line
    is stripped and stored as a UTF-8 encoded byte string. The entry ``"0"``
    (encoded the same way) is always part of the result.

    Args:
        vocab_file_dir: Directory containing the per-day vocabulary files.
        feature_name: Feature whose vocabulary files are read.
        start_date: First argument forwarded to ``DateUtil.get_every_date``.
        end_date: Second argument forwarded to ``DateUtil.get_every_date``.

    Returns:
        list: Unique UTF-8 encoded entries, in no particular order.
    """
    path_stem = os.path.join(vocab_file_dir, feature_name + "_vocabulary_")
    entries = set()
    for day in DateUtil.get_every_date(start_date, end_date):
        with codecs.open(path_stem + day, "r", "utf-8") as reader:
            entries.update(row.strip().encode("utf-8") for row in reader)
    # "0" is added unconditionally, whatever the files contained.
    entries.add("0".encode("utf-8"))
    return list(entries)
def gen_multi_day_bench_mark_data(start_date, end_date):
    """Split each day's feature file by student chat count and convert the parts.

    For every date produced by ``DateUtil.get_every_date(start_date, end_date)``
    the file ``<raw_feature_path>/wechat_record_feature_<date>`` is read. Lines
    whose ``hist_student_chat_num`` column (the fourth tab separated field)
    equals one of the values in ``chat_num_ls`` are collected per value. Each
    non-empty group is written to
    ``<bench_mark_text_file_path>/total_chat_num_<n>/total_chat_num_<n>_<date>``
    and then handed to
    ``tfRecorder.transfer_wechat_feature_2_tfRecord_default``. The tfRecord
    directory is derived from the text directory by replacing
    ``"feature_file"`` with ``"tf_record"`` in its path.

    Args:
        start_date: First argument forwarded to ``DateUtil.get_every_date``.
        end_date: Second argument forwarded to ``DateUtil.get_every_date``.

    Raises:
        KeyError: If a line has fewer than four fields.
        ValueError: If the ``hist_student_chat_num`` field is not an integer.
    """
    text_file_template = os.path.join(
        bench_mark_text_file_path, "total_chat_num_%s", "total_chat_num_%s_%s")
    # Column layout of one feature line; identical for every line and day.
    field_names = [
        "label", "opp_id", "create_time", "hist_student_chat_num",
        "hist_teacher_chat_num", "hist_total_chat_num", "wechat_record"
    ]

    for day in DateUtil.get_every_date(start_date, end_date):
        print(day)

        # Group the day's lines by chat_num.
        feature_file = os.path.join(
            raw_feature_path, "wechat_record_feature_%s" % day)
        groups = {num: [] for num in chat_num_ls}
        with codecs.open(feature_file, 'r', 'utf-8') as reader:
            for raw_line in reader:
                stripped = raw_line.strip()
                record = dict(zip(field_names, stripped.split("\t")))
                student_chat_num = int(record["hist_student_chat_num"])
                if student_chat_num in groups:
                    groups[student_chat_num].append(stripped)

        for chat_num, grouped_lines in groups.items():
            if not grouped_lines:
                continue

            text_file = text_file_template % (chat_num, chat_num, day)
            text_dir = os.path.dirname(text_file)
            tf_record_dir = text_dir.replace("feature_file", "tf_record")
            if not os.path.isdir(text_dir):
                os.makedirs(text_dir)

            with codecs.open(text_file, "w", "utf-8") as writer:
                for grouped_line in grouped_lines:
                    writer.write(grouped_line + "\n")

            # Convert the benchmark text file just written into a tfRecord file.
            tfRecorder.transfer_wechat_feature_2_tfRecord_default(
                day, day, text_dir, tf_record_dir)
def get_MinMaxValue_dict(MinMaxValue_file_dir, start_date, end_date):
    """Aggregate per-feature minimum and maximum values over a date range.

    Reads ``<MinMaxValue_file_dir>/MinMaxValue_file_<date>`` for each date
    produced by ``DateUtil.get_every_date(start_date, end_date)``. Every line
    holds three tab separated fields: feature name, minimum and maximum. For
    each feature the smallest minimum and the largest maximum seen across all
    files are kept.

    Args:
        MinMaxValue_file_dir: Directory containing the per-day files.
        start_date: First argument forwarded to ``DateUtil.get_every_date``.
        end_date: Second argument forwarded to ``DateUtil.get_every_date``.

    Returns:
        dict: feature name -> ``[min_value, max_value]`` (a two-element list
        of floats).

    Raises:
        IndexError: If a line has fewer than three fields.
        ValueError: If the second or third field is not a valid float.
    """
    path_stem = os.path.join(MinMaxValue_file_dir, "MinMaxValue_file_")
    bounds_by_feature = {}
    for day in DateUtil.get_every_date(start_date, end_date):
        with codecs.open(path_stem + day, "r", "utf-8") as reader:
            for row in reader:
                fields = row.strip().split("\t")
                name = fields[0].strip()
                low = float(fields[1].strip())
                high = float(fields[2].strip())

                bounds = bounds_by_feature.get(name)
                if bounds is None:
                    bounds_by_feature[name] = [low, high]
                else:
                    # Stored value goes first so it wins unless the new one is
                    # strictly smaller / larger.
                    bounds[0] = min(bounds[0], low)
                    bounds[1] = max(bounds[1], high)
    return bounds_by_feature
def get_multi_day_wechat_record_feature(start_date, end_date):
    """Run ``get_one_day_wechat_record_feature`` once per date in the range.

    Args:
        start_date: First argument forwarded to ``DateUtil.get_every_date``.
        end_date: Second argument forwarded to ``DateUtil.get_every_date``.
    """
    every_day = DateUtil.get_every_date(start_date, end_date)
    for day in every_day:
        print(day)  # progress output, one line per processed date
        get_one_day_wechat_record_feature(day)