Example 1
def train_model_parallel(args, data_df, data_df_log, sc):
    """ 对需要分组的数据进行并行处理
        note: 并行有很多问题,先暂时串行
    """
    distinct_group_name = {i for i in data_df[args.group_field_name].array}
    # parallel_num = len(distinct_group_name) if len(distinct_group_name) < 10 else 10
    # p = Pool(parallel_num)
    # for group_name in distinct_group_name:
    #     group_data_df = data_df[data_df[args.group_field_name] == group_name].drop(args.group_field_name, axis=1)
    #     group_data_df_log = data_df_log[data_df_log[args.group_field_name] == group_name].drop(args.group_field_name, axis=1)
    #     p.apply_async(train_model, args=(args, group_data_df, group_data_df_log, sc, "-" + group_name))
    #
    # p.close()
    # p.join()
    l = []
    for group_name in distinct_group_name:
        group_data_df = data_df[data_df[args.group_field_name] ==
                                group_name].drop(args.group_field_name, axis=1)
        group_data_df_log = data_df_log[data_df_log[args.group_field_name] ==
                                        group_name].drop(args.group_field_name,
                                                         axis=1)
        # Validate the data
        check_data_num(sc, group_data_df, args)
        l.append((group_name, group_data_df, group_data_df_log))

    for group_name, group_data_df, group_data_df_log in l:
        train_model(args, group_data_df, group_data_df_log, sc,
                    str(group_name) + "-")

    save_data.write_data_to_cluster(sc,
                                    args.evaluate + os.sep +
                                    constants.EVALUATION_MODEL_NAME,
                                    str(res_trend),
                                    is_text_file=True)
Example 2
def check_data_num(sc, data, args):
    """ 对数据进行异常校验 """
    # Compute the number of seconds covered by the forecast horizon
    freq = args.freq.lower()
    if freq == "d":
        predict_seconds = args.periods * 24 * 60 * 60
    elif freq == "y":
        predict_seconds = args.periods * 24 * 60 * 60 * 365
    elif freq == "m":
        predict_seconds = args.periods * 24 * 60 * 60 * 30
    elif freq == "min":
        predict_seconds = args.periods * 60
    elif freq == "h":
        predict_seconds = args.periods * 60 * 60
    else:
        predict_seconds = args.periods * 24 * 60 * 60

    if (data["ds"].max().timestamp() - data["ds"].min().timestamp()
        ) < args.base_multiple * predict_seconds:
        print("======> 数据异常,预测数据过少 ")
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep +
                                        constants.ERROR_LOG,
                                        "数据异常-预测数据过少",
                                        is_text_file=True)
        exit(0)
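
The chain of freq comparisons above is just a lookup table in disguise; a minimal, self-contained sketch (hypothetical helper name, same day/month/year approximations as the original):

# Map a Prophet-style freq string to seconds per period (30-day months, 365-day years).
FREQ_SECONDS = {
    "min": 60,
    "h": 60 * 60,
    "d": 24 * 60 * 60,
    "m": 30 * 24 * 60 * 60,
    "y": 365 * 24 * 60 * 60,
}

def predict_seconds_for(freq, periods):
    # Fall back to days for unknown frequencies, as the original else-branch does.
    return periods * FREQ_SECONDS.get(freq.lower(), 24 * 60 * 60)

# A 7-period daily horizon covers 604800 seconds.
assert predict_seconds_for("D", 7) == 7 * 86400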
Example 3
def gen_pmml_to_hdfs(sc, model, args):
    """ 生成 pmml,pkl 到 hdfs """

    # First write the pmml file to the driver's tmp directory, then upload it to HDFS
    # Directory layout: /tmp/<timestamp>/<filename>
    print("===========> Saving model files to HDFS")
    pmml_model_name = constants.PMML_NAME
    pkl_model_name = constants.PKL_NAME
    dir_name = "/tmp/" + str(time.time())
    args.tmp_dir = dir_name
    os.mkdir(dir_name)

    # Save as a pmml file
    pipeline = PMMLPipeline([("classifier", model)])
    sklearn2pmml(pipeline, dir_name + os.sep + pmml_model_name, with_repr=True)
    joblib.dump(model, dir_name + os.sep + pkl_model_name)

    # Upload the files to HDFS
    with open(dir_name + os.sep + pmml_model_name, "r") as f1, \
            open(dir_name + os.sep + pkl_model_name, "rb") as f2:
        data1 = f1.read()
        data2 = f2.read()
        save_data.write_data_to_cluster(sc, args.export_dir + os.sep + pmml_model_name, data1)
        save_data.write_data_to_cluster(sc, args.model_dir + os.sep + pkl_model_name, data2, is_text_file=False)

    # Delete the temporary files
    os.remove(dir_name + os.sep + pmml_model_name)
    os.remove(dir_name + os.sep + pkl_model_name)
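
For reference, a minimal local sketch of the same PMML/PKL export on a toy scikit-learn model, using only the public sklearn2pmml and joblib APIs and skipping the project-specific HDFS upload (sklearn2pmml needs a Java runtime on the machine):

import joblib
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline

X, y = load_iris(return_X_y=True)

# Wrap the estimator in a PMMLPipeline, fit it, then export both formats.
pipeline = PMMLPipeline([("classifier", LogisticRegression(max_iter=200))])
pipeline.fit(X, y)
sklearn2pmml(pipeline, "model.pmml", with_repr=True)  # human-readable PMML
joblib.dump(pipeline, "model.pkl")                    # pickled estimator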
Example 4
def train_model_by_all_data(args, data_df, sc, suffix=""):
    """ 使用全量数据训练模型并保存结果到hdfs """
    m = get_prophet_model(args)
    m.fit(data_df)
    future = m.make_future_dataframe(periods=args.periods, freq=args.freq)
    future_tmp = m.make_future_dataframe(periods=args.periods,
                                         freq=args.freq,
                                         include_history=False)
    forecast = m.predict(future)
    forecast_tmp = m.predict(future_tmp)
    # res_fig = m.plot(forecast)
    # component_fig = m.plot_components(forecast)
    splitFlag = forecast["ds"].count() - data_df["ds"].count()
    res_fig = drawing_pic.plot(m, forecast, splitFlag)
    component_fig = drawing_pic.plot_components(m, forecast)

    dir_name = args.tmp_dir
    # Forecast results
    res_path = dir_name + os.sep + "000000"
    res_df = pd.DataFrame(forecast, columns=["ds", "yhat"])
    res_df_tmp = pd.DataFrame(forecast_tmp, columns=["ds", "yhat"])

    if suffix != "":
        res_df_tmp[args.group_field_name] = suffix[:-1]
        res_df[args.group_field_name] = suffix[:-1]
        tail_sum = res_df[-args.periods:]["yhat"].sum()
        pre_sum = res_df[-args.periods * 2:-args.periods]["yhat"].sum()
        global res_trend
        res_trend[suffix[:-1]] = {
            "difference": abs(tail_sum - pre_sum),
            "amplitude": abs(tail_sum - pre_sum) / pre_sum
        }

    # res_df.to_csv(res_path, index=False, mode="a", header=False)
    res_df_tmp.to_csv(res_path, index=False, mode="a", header=False)
    # Forecast result plot
    res_pic_path = dir_name + os.sep + suffix + constants.PREDICT_PIC
    # Component analysis plot
    component_pic_path = dir_name + os.sep + suffix + constants.COMPONENT_PIC
    standard_fig(res_fig)
    standard_fig_component(component_fig)
    res_fig.savefig(res_pic_path)
    component_fig.savefig(component_pic_path)
    with open(res_pic_path, "rb") as f1, open(component_pic_path, "rb") as f2:
        data1 = f1.read()
        data2 = f2.read()
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep + suffix +
                                        constants.PREDICT_PIC,
                                        data1,
                                        is_text_file=False)
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep + suffix +
                                        constants.COMPONENT_PIC,
                                        data2,
                                        is_text_file=False)

    os.remove(res_pic_path)
    os.remove(component_pic_path)
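
The function above leans on Prophet's fit / make_future_dataframe / predict cycle; a minimal standalone sketch on synthetic data (the project-specific drawing_pic and save_data helpers are omitted):

import pandas as pd
from prophet import Prophet  # older installs expose this as fbprophet

df = pd.DataFrame({
    "ds": pd.date_range("2023-01-01", periods=60, freq="D"),
    "y": range(60),
})

m = Prophet()
m.fit(df)
# include_history=False returns only the new future rows, as in forecast_tmp above.
future = m.make_future_dataframe(periods=7, freq="D", include_history=False)
forecast = m.predict(future)
print(forecast[["ds", "yhat"]])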
Example 5
def run():
    args = parse_args()
    sc, spark, num_executors = init_spark_session(args)

    # Get the training data
    dataDF, label_type, features_name = get_train_data(spark, args,
                                                       num_executors)

    trainDF, testDF = dataDF.randomSplit(
        [1 - float(args.sample_ratio),
         float(args.sample_ratio)],
        seed=args.seed)

    trainDF.cache()
    testDF.cache()  # used later for prediction

    import math
    trainDFCount = trainDF.count()
    args.epochs = math.ceil(args.steps * args.batch_size / trainDFCount)
    print("迭代步数:" + str(args.steps) + "=======" + " 迭代轮次:" + str(args.epochs))

    # Convert the paths to HDFS paths
    args.export_dir = common_utils.get_abs_path(sc, args.export_dir)
    args.model_dir = common_utils.get_abs_path(sc, args.model_dir)

    # Training
    print("{0} ===== Estimator.fit()".format(datetime.now().isoformat()))
    estimator = get_tf_estimator(args)
    model = estimator.fit(trainDF)
    trainDF.unpersist()

    # Inference: [y_, y_pre]
    set_model_param(model, args)
    print("{0} ===== Model.transform()".format(datetime.now().isoformat()))
    test_preds_tmp = model.transform(testDF)
    testDF.unpersist()

    test_preds_tmp = save_res_to_hdfs(test_preds_tmp, args, label_type,
                                      features_name, spark)

    # Extract the predictions and labels to evaluate the model
    test_preds = test_preds_tmp.map(lambda row: [row[1], row[2]]).cache()
    test_preds_tmp.unpersist()

    # Model evaluation
    final_result = evaluate_model(test_preds, label_type)
    save_data.write_data_to_cluster(
        sc,
        args.evaluate + constants.PATH_SEP + constants.EVALUATION_MODEL_NAME,
        str(final_result))

    print("{0} ===== Stop".format(datetime.now().isoformat()))
Example 6
def save_res_to_hdfs(sc, args):
    # Write the prediction results to HDFS
    res_path = args.tmp_dir + os.sep + "000000"

    if args.group_field_name != "null":
        format_res(res_path, args.tmp_dir + os.sep + "000001")
        res_path = args.tmp_dir + os.sep + "000001"

    with open(res_path, "r") as f:
        data = f.read()
        save_data.write_data_to_cluster(sc,
                                        args.output + os.sep + "000000",
                                        data,
                                        is_text_file=True)
    os.remove(res_path)
Example 7
def cross_validation(args, train_data_df, test_data_df, sc):
    """ 交叉验证求 mape """
    m = get_prophet_model(args)
    m.fit(train_data_df)
    res = pd.merge(pd.DataFrame(m.predict(test_data_df),
                                columns=["ds", "yhat"]),
                   test_data_df,
                   how="inner",
                   on="ds")
    mape = ((res["y"] - res["yhat"]) /
            res["y"]).apply(abs).sum() / test_data_df.count().array[0]
    save_data.write_data_to_cluster(sc,
                                    args.evaluate + os.sep +
                                    constants.EVALUATION_MODEL_NAME,
                                    str({"mape": str(mape)}),
                                    is_text_file=True)
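
The MAPE expression above (sum of absolute percentage errors divided by the row count) written as a standalone helper on toy data:

import pandas as pd

def mape(y_true, y_pred):
    # Mean absolute percentage error; assumes y_true contains no zeros.
    return ((y_true - y_pred) / y_true).abs().mean()

actual = pd.Series([100.0, 120.0, 90.0])
predicted = pd.Series([110.0, 115.0, 95.0])
print(mape(actual, predicted))  # ~0.066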
Example 8
def get_standard_data(sc, data_df, args):
    """ 对数据进行处理 """
    schema_list = data_df.columns.to_list()
    # assert len(schema_list) == 2, "Only two input columns are currently supported"
    # Rename the columns to ds, y
    try:
        schema_list.remove(args.label_name)
        if args.no_validation:
            schema_list.remove(args.group_field_name)
            if len(schema_list) > 1:
                data_df = data_df.drop(schema_list[0], axis=1)
                schema_list.remove(schema_list[0])
        print("=======> 删除序号以及分组列之后的表头信息 " + str(schema_list))
        data_df = data_df.rename(columns={
            args.label_name: "y",
            schema_list[0]: "ds"
        })
        data_df["ds"] = pd.to_datetime(data_df["ds"])
    except Exception:
        print("======> Data format error")
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep +
                                        constants.ERROR_LOG,
                                        "Data format error",
                                        is_text_file=True)
        exit(0)

    # Log transform of y
    data_df_log = copy.deepcopy(data_df)
    data_df_log["y"] = data_df_log["y"].apply(math.log)

    if not args.no_validation:
        # Cross-validation data
        # data_df["ds"] = pd.to_datetime(data_df["ds"])
        count = data_df.count().array[0]
        data_df = data_df.sort_values(by="ds")
        split_flag = math.floor(count * (1 - args.sample_ratio))
        train_data_df = data_df.iloc[:split_flag]
        test_data_df = data_df.iloc[split_flag:]
        return data_df, data_df_log, train_data_df, test_data_df
    else:
        return data_df, data_df_log, None, None
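
A minimal pandas sketch of the same preprocessing on a toy frame with hypothetical column names: rename to ds/y, parse dates, log-transform a copy, then do a chronological hold-out split:

import copy
import math
import pandas as pd

raw = pd.DataFrame({
    "event_time": ["2023-01-02", "2023-01-01", "2023-01-03", "2023-01-04"],
    "traffic": [10.0, 12.0, 9.0, 11.0],
})

df = raw.rename(columns={"event_time": "ds", "traffic": "y"})
df["ds"] = pd.to_datetime(df["ds"])

df_log = copy.deepcopy(df)
df_log["y"] = df_log["y"].apply(math.log)

# Time-ordered hold-out split: the last 25% of rows form the test set.
df = df.sort_values(by="ds")
split = math.floor(len(df) * (1 - 0.25))
train_df, test_df = df.iloc[:split], df.iloc[split:]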
Example 9
def train_model_by_log_data(args, data_df_log, sc, suffix=""):
    """ 使用log处理的y进行重新训练 """
    m = get_prophet_model(args)
    m.fit(data_df_log)
    future = m.make_future_dataframe(periods=args.periods, freq=args.freq)
    forecast = m.predict(future)
    png_path = args.tmp_dir + os.sep + suffix + constants.PREDICT_LOG_PIC
    # res_fig = m.plot(forecast)
    splitFlag = forecast["ds"].count() - data_df_log["ds"].count()
    res_fig = drawing_pic.plot(m, forecast, splitFlag)

    standard_fig(res_fig)
    res_fig.savefig(png_path)
    with open(png_path, "rb") as f:
        data = f.read()
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep + suffix +
                                        constants.PREDICT_LOG_PIC,
                                        data,
                                        is_text_file=False)
    os.remove(png_path)
Example 10
def check_data_null(sc, data, args):
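    """ Check the null-value ratio of each input column and abort if any exceeds 5% """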
    flag = False
    try:
        count = data[args.label_name].count()
        null_ratio_group_field = sum(pd.isna(
            data[args.group_field_name])) / count
        null_ratio_label = sum(pd.isna(data[args.label_name])) / count

        schema_list = data.columns.to_list()
        schema_list.remove(args.label_name)
        schema_list.remove(args.group_field_name)
        null_ratio_date = 0
        if len(schema_list) > 1:
            null_ratio_date = sum(pd.isna(data[schema_list[1]])) / count
        elif len(schema_list) == 1:
            null_ratio_date = sum(pd.isna(data[schema_list[0]])) / count

        print("========> cellid空值占比 " + str(null_ratio_group_field))
        print("========> x空值占比 " + str(null_ratio_date))
        print("========> y空值占比 " + str(null_ratio_label))
        if null_ratio_group_field > 0.05 or null_ratio_label > 0.05 or null_ratio_date > 0.05:
            save_data.write_data_to_cluster(sc,
                                            args.evaluate + os.sep +
                                            constants.ERROR_LOG,
                                            "Data anomaly - too many null values in the prediction data",
                                            is_text_file=True)
            flag = True
            exit(0)
    except Exception:
        if not flag:
            print("======> Data format error")
            save_data.write_data_to_cluster(sc,
                                            args.evaluate + os.sep +
                                            constants.ERROR_LOG,
                                            "Data format error",
                                            is_text_file=True)
        exit(0)
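
The null-ratio check above, reduced to a standalone pandas sketch on a toy frame (as in the original, each ratio is divided by the non-null count of the label column):

import pandas as pd

data = pd.DataFrame({
    "cellid": ["a", "a", None, "b"],
    "ds": ["2023-01-01", None, "2023-01-03", "2023-01-04"],
    "y": [1.0, 2.0, 3.0, None],
})

count = data["y"].count()  # non-null label count
null_ratios = {col: data[col].isna().sum() / count for col in data.columns}
print(null_ratios)  # any ratio above 0.05 would trigger the error path above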
Example 11
def evaluate_model(y, y_, args, class_s, sc, model, x):
    """ 评估模型 """
    print("=======> 开始进行模型评估")

    # Save the report file to HDFS
    report = classification_report(y, y_, labels=class_s)
    report_dict = classification_report(y, y_, labels=class_s, output_dict=True)
    res_dict = {}
    res_dict["total_accuracy"] = accuracy_score(y, y_)

    # Per-class metric dictionaries
    class_list = []
    res_dict["other_indicators"] = class_list
    for i in [str(c) for c in class_s]:
        d = {"recall": report_dict[i]["recall"],
             "precise": report_dict[i]["precision"],
             "f1-score": report_dict[i]["f1-score"],
             "type": i
             }
        class_list.append(d)
    res_dict["n_class"] = len(class_s)

    save_data.write_data_to_cluster(sc, args.evaluate + os.sep + constants.CLASSIFICATION_REPORT, report)

    # Save the report as an image
    report_path = args.tmp_dir + os.sep + constants.CLASSIFICATION_REPORT + ".png"
    drawing_pic.save_text_to_pic(report, report_path)
    with open(report_path, "rb") as f:
        data = f.read()
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep + constants.CLASSIFICATION_REPORT + ".png",
                                        data,
                                        is_text_file=False)
    os.remove(report_path)

    # Save the confusion matrix to HDFS
    matrix = confusion_matrix(y, y_, labels=class_s).tolist()
    res = ""
    for index, line in enumerate(matrix):
        res += str(class_s[index]) + "," + ",".join([str(l) for l in line]) + "\n"
    save_data.write_data_to_cluster(sc, args.evaluate + os.sep + constants.CONFUSION_MATRIX, res)

    # Save the confusion matrix image to HDFS
    matrix_path = args.tmp_dir + os.sep + constants.CONFUSION_MATRIX + ".png"
    drawing_pic.save_confusion_matrix(matrix, class_s, matrix_path)
    with open(matrix_path, "rb") as f:
        data = f.read()
        save_data.write_data_to_cluster(sc,
                                        args.evaluate + os.sep + constants.CONFUSION_MATRIX + ".png",
                                        data,
                                        is_text_file=False)
    os.remove(matrix_path)

    if len(class_s) == 2:
        # For binary classification, generate an ROC curve

        # Compute the predicted probabilities
        y_proba = model.predict_proba(x)

        # Treat the first class as the positive example
        fpr, tpr, thresholds = roc_curve(y, y_proba[:, 0], pos_label=class_s[0])
        roc_auc = auc(fpr, tpr)
        roc_pic_path = args.tmp_dir + os.sep + constants.ROC_PIC
        drawing_pic.save_roc_to_pic(fpr, tpr, roc_auc, roc_pic_path)
        with open(roc_pic_path, "rb") as f:
            data = f.read()
            save_data.write_data_to_cluster(sc,
                                            args.evaluate + os.sep + constants.ROC_PIC,
                                            data,
                                            is_text_file=False)
        os.remove(roc_pic_path)
        res_dict["auc"] = roc_auc

    # Remove the temporary directory
    os.rmdir(args.tmp_dir)
    # Write the evaluation results to HDFS
    save_data.write_data_to_cluster(sc, args.evaluate + os.sep + constants.EVALUATION_MODEL_NAME, str(res_dict))
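
The evaluation builds on standard scikit-learn metrics; a minimal sketch of those calls on toy binary data (the project-specific save_data and drawing_pic helpers are omitted):

from sklearn.metrics import (accuracy_score, auc, classification_report,
                             confusion_matrix, roc_curve)

classes = [0, 1]
y_true = [0, 0, 1, 1, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0]
y_score = [0.2, 0.7, 0.9, 0.8, 0.4, 0.1]  # predicted probability of class 1

print(accuracy_score(y_true, y_pred))
print(classification_report(y_true, y_pred, labels=classes, output_dict=True))
print(confusion_matrix(y_true, y_pred, labels=classes).tolist())

# ROC/AUC for the binary case, treating class 1 as the positive label.
fpr, tpr, thresholds = roc_curve(y_true, y_score, pos_label=1)
print(auc(fpr, tpr))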