Exemple #1
0
def _trailing_folder_name(path):
    """Return the last directory component of *path*, ignoring a trailing separator."""
    # os.path.basename("a/b/") is "", so fall back to the parent's basename.
    return os.path.basename(path) or os.path.basename(os.path.dirname(path))


def transfer_wechat_feature_2_tfRecord(start_date, end_date,
                                       raw_feature_file_path,
                                       tf_record_file_path):
    """Convert one raw feature file per date into a tfRecord file.

    For every date returned by ``DateUtil.get_every_date(start_date,
    end_date)`` the input file ``<raw_dir>/<raw_dir_name>_<date>`` is handed to
    ``tfRecorder.transfer_single_feature_file_2_tfRecord`` together with the
    output path ``<tf_dir>/<tf_dir_name>_<date>.tfrecord`` and the shared
    ``<tf_dir>/data_info.csv`` path. The output directory is created when it
    does not exist yet.

    The column configuration (``column_names``, ``label_name``,
    ``need_features_cols``, ``var_length_cols``, ``col_preprocess_func``) is
    read from names defined outside this function.

    Args:
        start_date: First date passed to ``DateUtil.get_every_date``.
        end_date: Last date passed to ``DateUtil.get_every_date``.
        raw_feature_file_path: Directory holding the raw feature files; its
            own folder name is the file-name prefix.
        tf_record_file_path: Directory that receives the tfRecord files; its
            own folder name is the file-name prefix.
    """
    raw_feature_folder_name = _trailing_folder_name(raw_feature_file_path)
    tf_record_folder_name = _trailing_folder_name(tf_record_file_path)
    data_info_csv_path = os.path.join(tf_record_file_path, "data_info.csv")

    if not os.path.isdir(tf_record_file_path):
        os.makedirs(tf_record_file_path)

    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        # Format only the suffix: applying "%" to the whole path would fail
        # when a directory name itself contains a literal "%".
        date_suffix = "_%s" % date
        raw_feature_file = os.path.join(
            raw_feature_file_path, raw_feature_folder_name + date_suffix)
        tf_record_file = os.path.join(
            tf_record_file_path,
            tf_record_folder_name + date_suffix + ".tfrecord")
        tfRecorder.transfer_single_feature_file_2_tfRecord(
            raw_feature_file,
            tf_record_file,
            data_info_csv_path,
            column_names,
            label_name,
            need_features_cols,
            var_length_cols=var_length_cols,
            col_preprocess_func=col_preprocess_func)
def merge_wechat_records_by_opp_id(start_date, end_date):
    """Collect the chat records of every date and group them by opp_id.

    Each line of a daily file (``tmp_merged_wechat_record_data_file % date``)
    is tab separated: the first field is the opp_id, every following field is
    one JSON-encoded chat record. Fields that are not valid JSON are skipped.

    Args:
        start_date: First date passed to ``DateUtil.get_every_date``.
        end_date: Last date passed to ``DateUtil.get_every_date``.

    Returns:
        dict mapping opp_id to the list of its decoded chat records, each list
        sorted in ascending order of the record's ``"create_time"`` value.

    Raises:
        KeyError: If a decoded record has no ``"create_time"`` key.
    """
    print("merge_wechat_records_by_opp_id..")
    rs_dict = {}
    for date in DateUtil.get_every_date(start_date, end_date):
        print(date)
        wechat_file = tmp_merged_wechat_record_data_file % date
        with codecs.open(wechat_file, 'r', 'utf-8') as fin:
            for line in fin:
                arr = line.strip().split("\t")
                chat_ls = rs_dict.setdefault(arr[0], [])
                for chat_str in arr[1:]:
                    try:
                        # No ``encoding`` keyword: the input is already text,
                        # and Python 3.9+ rejects that keyword with TypeError,
                        # which the former bare ``except`` silently swallowed.
                        chat_dict = json.loads(chat_str)
                    except ValueError:
                        # Malformed JSON (JSONDecodeError is a ValueError):
                        # best-effort, skip this record.
                        continue
                    chat_ls.append(chat_dict)

    print("sort chat by create_time..")
    for chat_ls in rs_dict.values():
        chat_ls.sort(key=lambda json_dict: json_dict["create_time"])
    return rs_dict
Exemple #3
0
def get_vocabulary_list(vocab_file_dir, feature_name, start_date, end_date):
    """Build the de-duplicated vocabulary of a feature over a date range.

    Reads ``<vocab_file_dir>/<feature_name>_vocabulary_<date>`` for every date
    returned by ``DateUtil.get_every_date(start_date, end_date)``. Each
    stripped line becomes one UTF-8 encoded entry, and the entry ``"0"`` is
    always included (presumably a default/padding token -- confirm with the
    consumer).

    Args:
        vocab_file_dir: Directory holding the per-date vocabulary files.
        feature_name: Feature whose vocabulary files are read.
        start_date: First date passed to ``DateUtil.get_every_date``.
        end_date: Last date passed to ``DateUtil.get_every_date``.

    Returns:
        Sorted list of the unique UTF-8 encoded vocabulary entries.
    """
    file_prefix = os.path.join(vocab_file_dir, feature_name + "_vocabulary_")
    vocab_set = set()
    for tmp_date in DateUtil.get_every_date(start_date, end_date):
        with codecs.open(file_prefix + tmp_date, "r", "utf-8") as fin:
            vocab_set.update(line.strip().encode("utf-8") for line in fin)
    vocab_set.add("0".encode("utf-8"))

    # Set iteration order depends on hash randomisation and differs between
    # processes; sorting makes the returned order reproducible.
    return sorted(vocab_set)
def gen_multi_day_bench_mark_data(start_date, end_date):
    """Split each day's feature file by student chat count and convert the parts.

    For every date in the range, lines of
    ``<raw_feature_path>/wechat_record_feature_<date>`` are bucketed by their
    ``hist_student_chat_num`` column. Only counts listed in ``chat_num_ls``
    are kept. Each non-empty bucket is written to
    ``<bench_mark_text_file_path>/total_chat_num_<n>/total_chat_num_<n>_<date>``
    and then passed to ``tfRecorder.transfer_wechat_feature_2_tfRecord_default``.
    The tfRecord folder is the text folder path with ``"feature_file"``
    replaced by ``"tf_record"``.

    Args:
        start_date: First date passed to ``DateUtil.get_every_date``.
        end_date: Last date passed to ``DateUtil.get_every_date``.
    """
    text_file_template = os.path.join(bench_mark_text_file_path,
                                      "total_chat_num_%s",
                                      "total_chat_num_%s_%s")
    # Column layout of one tab-separated feature line.
    field_names = [
        "label", "opp_id", "create_time", "hist_student_chat_num",
        "hist_teacher_chat_num", "hist_total_chat_num",
        "wechat_record"
    ]

    for day in DateUtil.get_every_date(start_date, end_date):
        print(day)
        # Group the lines by chat_num.
        feature_file = os.path.join(raw_feature_path,
                                    "wechat_record_feature_%s" % day)
        rows_by_chat_num = {}
        for wanted_num in chat_num_ls:
            rows_by_chat_num[wanted_num] = []

        with codecs.open(feature_file, 'r', 'utf-8') as reader:
            for raw_line in reader:
                stripped = raw_line.strip()
                record = dict(zip(field_names, stripped.split("\t")))
                student_chat_num = int(record["hist_student_chat_num"])
                bucket = rows_by_chat_num.get(student_chat_num)
                if bucket is not None:
                    bucket.append(stripped)

        for chat_num, rows in rows_by_chat_num.items():
            if rows:
                text_file = text_file_template % (chat_num, chat_num, day)
                text_folder = os.path.dirname(text_file)
                tf_record_folder = text_folder.replace(
                    "feature_file", "tf_record")

                if not os.path.isdir(text_folder):
                    os.makedirs(text_folder)

                with codecs.open(text_file, "w", "utf-8") as writer:
                    for row in rows:
                        writer.write(row + "\n")
                # Convert the freshly written benchmark text file into a
                # tfRecord file.
                tfRecorder.transfer_wechat_feature_2_tfRecord_default(
                    day, day, text_folder, tf_record_folder)
Exemple #5
0
def get_MinMaxValue_dict(MinMaxValue_file_dir, start_date, end_date):
    """Merge per-date min/max files into overall bounds for each feature.

    Reads ``<MinMaxValue_file_dir>/MinMaxValue_file_<date>`` for every date
    returned by ``DateUtil.get_every_date(start_date, end_date)``. Each line
    holds three tab-separated fields: feature name, minimum, maximum.

    Args:
        MinMaxValue_file_dir: Directory holding the per-date files.
        start_date: First date passed to ``DateUtil.get_every_date``.
        end_date: Last date passed to ``DateUtil.get_every_date``.

    Returns:
        dict mapping feature name to ``[min_value, max_value]`` (floats), the
        smallest minimum and largest maximum seen over all dates.
    """
    path_stem = os.path.join(MinMaxValue_file_dir, "MinMaxValue_file_")
    bounds_by_feature = {}

    for day in DateUtil.get_every_date(start_date, end_date):
        with codecs.open(path_stem + day, "r", "utf-8") as reader:
            for raw_line in reader:
                fields = raw_line.strip().split("\t")
                name = fields[0].strip()
                low = float(fields[1].strip())
                high = float(fields[2].strip())

                bounds = bounds_by_feature.get(name)
                if bounds is None:
                    # First sighting of this feature: take its bounds as-is.
                    bounds_by_feature[name] = [low, high]
                else:
                    bounds[0] = min(bounds[0], low)
                    bounds[1] = max(bounds[1], high)

    return bounds_by_feature
def get_multi_day_wechat_record_feature(start_date, end_date):
    """Run ``get_one_day_wechat_record_feature`` for every date in the range.

    Args:
        start_date: First date passed to ``DateUtil.get_every_date``.
        end_date: Last date passed to ``DateUtil.get_every_date``.
    """
    every_day = DateUtil.get_every_date(start_date, end_date)
    for day in every_day:
        print(day)
        get_one_day_wechat_record_feature(day)