Code example #1
def second_evaluation_core(spark_session, condition, operator_id):
    """
    二分类评估核心函数
    :param spark_session:
    :param condition:
    :param operator_id:
    :return:
    """
    # Load the model.
    # Current node (the evaluation node): one parent.
    operator = OperatorDao.get_operator_by_id(operator_id)
    # Parent node (the prediction node): has two parents of its own.
    father_id = operator.father_operator_ids
    father_operator = OperatorDao.get_operator_by_id(father_id)
    # Grandparent nodes (the model node and the prediction-data source node).
    grand_father_ids = father_operator.father_operator_ids.split(',')
    print("********** grandparent nodes (model node and prediction-data source node):", grand_father_ids)

    # Read the data.
    def get_predict_data(operator_config_):
        for grand_father_file_ in operator_config_:
            grand_father_id_ = list(grand_father_file_.keys())[0]
            grand_father_ = OperatorDao.get_operator_by_id(grand_father_id_)
            if grand_father_.operator_type_id == 5001 or grand_father_.operator_type_id < 3000:
                print("***************评估函数,预测数据:",
                      grand_father_.operator_type_id)
                pre_data_file_url = grand_father_.operator_output_url.split(
                    '*,')[grand_father_file_[grand_father_id_]]
                print("***************评估函数,预测数据url:", pre_data_file_url)
                return read_data(spark_session, pre_data_file_url)

    print("**********预测节点:", father_operator.operator_config)
    df = get_predict_data(
        json.loads(father_operator.operator_config)['fileUrl'])

    # Evaluate.
    for grand_father_id in grand_father_ids:
        grand_father = OperatorDao.get_operator_by_id(grand_father_id)
        grand_father_operator_type = grand_father.operator_type_id
        # Model-loading node: unwrap to the underlying model's type id.
        if grand_father_operator_type == 8000:
            grand_father_operator_type = json.loads(
                grand_father.operator_config)['parameter']['modelTypeId']
        if grand_father_operator_type == 6001:  # SVM binary-classification node
            print("*************** evaluation: trained model type", grand_father.operator_type_id)
            evaluation_df = svm_second_evaluation(
                spark_session, grand_father.operator_output_url, df,
                json.loads(father_operator.operator_config)['parameter'],
                condition)
            return evaluation_df
        elif grand_father_operator_type == 6003:  # LR binary-classification node
            print("*************** evaluation: trained model type", grand_father.operator_type_id)
            evaluation_df = lr_second_evaluation(
                spark_session, grand_father.operator_output_url, df,
                json.loads(father_operator.operator_config)['parameter'],
                condition)
            return evaluation_df
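
The node-chain walk at the top of this function is easy to miss among the prints. Below is a minimal, self-contained sketch of the same traversal; the in-memory stand-in for OperatorDao (_OPERATORS, get_operator_by_id, and the node ids) is hypothetical, for illustration only.

from types import SimpleNamespace

# Hypothetical in-memory stand-in for OperatorDao.
_OPERATORS = {
    "op-eval": SimpleNamespace(father_operator_ids="op-pred"),
    "op-pred": SimpleNamespace(father_operator_ids="op-model,op-data"),
}

def get_operator_by_id(operator_id):
    return _OPERATORS[operator_id]

# Evaluation node -> its single parent (prediction node) -> that node's two parents.
evaluation = get_operator_by_id("op-eval")
prediction = get_operator_by_id(evaluation.father_operator_ids)
grand_father_ids = prediction.father_operator_ids.split(',')
print(grand_father_ids)  # ['op-model', 'op-data']
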
Code example #2
File: MLModelService.py  Project: BestJex/Easy_Data
def save_ml_model(operator_id, user_id, name):
    """
    保存训练模型
    :param operator_id:
    :param user_id:
    :param name:
    :return:
    """
    # Look up the operator.
    operator = OperatorDao.get_operator_by_id(operator_id)
    if operator.operator_type_id > 7000 or operator.operator_type_id < 6001:
        return "The selected node is not a model operator node"
    if operator.status != "success":
        return "Please run this node first"
    if operator.operator_output_url is not None:
        operator_output_url = operator.operator_output_url.split('*,')
    else:
        return "No run results available"

    model_url = operator_output_url[0]
    operator_type_id = operator.operator_type_id
    model_id = operator.model_id

    # Look up the execution-flow model.
    model = ModelDao.get_model_by_id(model_id)
    project_id = model.project_id

    ml_model = MLModel(user_id=user_id,
                       project_id=project_id,
                       model_id=model_id,
                       status='save',
                       name=name,
                       operator_type_id=operator_type_id,
                       model_url=model_url)
    return MLModelDao.create_ml_model(ml_model)
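
The type-id guard above implies that model operator nodes occupy the 6001..7000 range. A small helper expressing that same check, with type ids confirmed elsewhere in these examples (6001 SVM, 6005 MPC, 7001 evaluation):

def is_model_operator(operator_type_id):
    # Model operators fall in 6001..7000, per the range check in save_ml_model.
    return 6001 <= operator_type_id <= 7000

assert is_model_operator(6001)      # SVM binary classification
assert is_model_operator(6005)      # MPC multiclass classification
assert not is_model_operator(7001)  # evaluation node, not a model
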
Code example #3
def ml_predict_core(spark_session, operator_id, df, model_url, condition):
    """
    路由控制加载哪种模型进行预测
    :param spark_session:
    :param operator_id:
    :param df:
    :param model_url:
    :param condition:
    :return:  预测结果 sparkframe
    """

    # Determine which component each parent node is.
    operator = OperatorDao.get_operator_by_id(operator_id)
    father_ids = operator.father_operator_ids.split(',')
    print("**********", operator.father_operator_ids)
    prediction_df = None  # guard: stays defined even if no branch below matches
    for father_id in father_ids:
        father = OperatorDao.get_operator_by_id(father_id)
        print("***************", father.operator_type_id)
        print("---------------", father.operator_type_id == 6001)
        operator_type_flag = father.operator_type_id

        # Model-loading node: unwrap to the loaded model's type id.
        if operator_type_flag == 8000:
            operator_type_flag = json.loads(
                father.operator_config)['parameter']['modelTypeId']

        if operator_type_flag == 6001:  # SVM binary classification
            prediction_df = svm_second_predict(spark_session, model_url, df,
                                               condition)
        elif operator_type_flag == 6002:  # GBDT binary classification
            prediction_df = gbdt_second_predict(model_url, df, condition)
        elif operator_type_flag == 6003:  # LR binary classification
            prediction_df = lr_second_predict(model_url, df, condition)
        elif operator_type_flag == 6004:  # LR multiclass classification
            prediction_df = lr_multiple_predict(model_url, df, condition)
        elif operator_type_flag == 6005:  # MPC multiclass classification
            prediction_df = mpc_multiple_predict(model_url, df, condition)

    # Which model was loaded above is decided by the parent component's type.
    return prediction_df
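
Both this function and the evaluation core unwrap a model-loading node (type 8000) to the modelTypeId stored in its config. A self-contained sketch of that indirection follows; the SimpleNamespace loader object is hypothetical.

import json
from types import SimpleNamespace

def resolve_operator_type(operator):
    # A model-loading node (8000) stands in for the model it loads;
    # resolve it to the underlying model's type id.
    operator_type = operator.operator_type_id
    if operator_type == 8000:
        operator_type = json.loads(operator.operator_config)['parameter']['modelTypeId']
    return operator_type

# Hypothetical loader node wrapping a saved LR binary classifier (6003).
loader = SimpleNamespace(
    operator_type_id=8000,
    operator_config='{"parameter": {"modelTypeId": 6003}}',
)
print(resolve_operator_type(loader))  # 6003
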
Code example #4
File: Operator.py  Project: BestJex/Easy_Data
def get_operate_result_data():
    """
    查看算子运行结果数据
    :return:
    """
    operator_id = request.form.get('operatorId')
    start = int(request.form.get('start'))
    end = int(request.form.get('end'))
    print(operator_id, start, end)
    operator = OperatorDao.get_operator_by_id(operator_id)
    if operator.status != "success":
        return "请执行该节点"
    if operator.operator_output_url is not None:
        operator_output_url = operator.operator_output_url.split('*,')
    else:
        return "没有运行结果"
    result_arr = []
    try:
        for i in range(len(operator_output_url)):
            data = pd.read_csv(operator_output_url[i], encoding='utf-8')
            if len(data) < end:
                end = len(data)
            if start > end:
                result_arr.append({
                    'length': len(data),
                    'data': "请输入合法参数",
                    'position': i
                })
            else:
                data2 = data[int(start):int(end)].to_json(orient='records',
                                                          force_ascii=False)
                result_arr.append({
                    'length': len(data),
                    'data': json.loads(data2),
                    'position': i
                })
        return jsonify(result_arr)
    except Exception:
        traceback.print_exc()
        return "Error, please contact the administrator"
Code example #5
File: ModelService.py  Project: BestJex/Easy_Data
def initial_execute_status(execute_user_id, start_nodes):
    """
    每次执行model时,初始化执行状态
    :param execute_user_id:
    :param start_nodes: []
    :return:
    """
    # Find the operators involved in this run.
    operator_list = []
    operator_id_queue = []
    for x in start_nodes:
        operator_id_queue.append(x)
    while len(operator_id_queue) > 0:
        operator_id = operator_id_queue.pop(0)
        if operator_id is None or operator_id == "":
            continue
        operator = OperatorDao.get_operator_by_id(operator_id)
        operator_list.append(operator)
        for x in operator.child_operator_ids.split(','):
            operator_id_queue.append(x)

    # Initialize each operator's status to "initial".
    for operator in operator_list:
        OperatorDao.update_operator_by_id(operator.id, "initial")

    # Append an execution record.
    model_execute = ModelExecute(start_nodes=','.join(start_nodes),
                                 status='initial',
                                 execute_user_id=execute_user_id,
                                 create_time=time.strftime(
                                     "%Y-%m-%d %H:%M:%S", time.localtime()))
    model_execute = ModelExecuteDao.create_model_execute(model_execute)
    if model_execute is False:
        return False
    else:
        return model_execute.id
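
Note that the queue-based walk above enqueues every child id it sees, so a node reachable along two branches (or via a cycle) would be processed more than once. A minimal sketch of the same breadth-first walk with a visited set, using a plain dict as a hypothetical DAG:

from collections import deque

def collect_operators(start_nodes, children_of):
    # Breadth-first walk; children_of maps an operator id to its child ids.
    queue = deque(start_nodes)
    seen = set()
    order = []
    while queue:
        operator_id = queue.popleft()
        if not operator_id or operator_id in seen:
            continue
        seen.add(operator_id)
        order.append(operator_id)
        queue.extend(children_of.get(operator_id, []))
    return order

# Hypothetical DAG in which two branches share the child "d".
dag = {"a": ["b", "c"], "b": ["d"], "c": ["d"], "d": []}
print(collect_operators(["a"], dag))  # ['a', 'b', 'c', 'd'] -- "d" visited once
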
Code example #6
def operator_execute(spark_session, operator_id):
    """
    执行算子
    :param spark_session:
    :param operator_id:
    :return:
    """
    try:
        # Look up the operator.
        operator = OperatorDao.get_operator_by_id(operator_id)
        print("------执行算子------", "operator_id:", operator_id,
              operator.operator_type_id)
        # Get the input_url list from the operator config.
        config = json.loads(operator.operator_config)
        file_url_list = config['fileUrl']
        # Collect the input addresses.
        url_arr = []
        for file_url_dict in file_url_list:
            # Each dict holds a single {operator_id: output_index} entry;
            # extract its key.
            key = ''
            for ikey in file_url_dict.keys():
                key = ikey
            if operator_id == key:
                url_arr.append(file_url_dict[key])
            else:
                father = OperatorDao.get_operator_by_id(key)
                # Check whether the parent node has finished successfully.
                if father.status != 'success':
                    return []
                # TODO: tentatively indexed from 0
                father_output_url_index = file_url_dict[key]
                father_url_arr = father.operator_output_url.split('*,')
                url_arr.append(father_url_arr[father_output_url_index])
        # Dispatch to the operator's service function.
        if operator.operator_type_id == 1001:
            preprocessService.filter_multi_conditions(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1002:
            preprocessService.sort(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1003:
            preprocessService.column_split(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1005:
            preprocessService.columns_merge(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1006:
            preprocessService.replace(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1007:
            preprocessService.fill_null_value(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1008:
            preprocessService.column_map(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 1009:
            preprocessService.random_split(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2001:
            FEService.quantile_discretization(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2002:
            FEService.vector_indexer(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2003:
            FEService.standard_scaler(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2004:
            FEService.pca(spark_session, operator_id, url_arr[0],
                          json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2005:
            FEService.string_indexer(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2006:
            FEService.one_hot_encoder(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2007:
            FEService.polynomial_expansion(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 2008:
            FEService.chiSqSelector(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 3001:
            ExplorationService.full_table_statistics(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 3002:
            ExplorationService.frequency_statistics(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 3003:
            ExplorationService.correlation_coefficient(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 5001:
            preprocessService.read_data_with_update_record(
                spark_session, operator_id, url_arr[0])
        elif operator.operator_type_id == 6000:
            PredictService.ml_predict(
                spark_session, operator_id, url_arr,
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 6001:
            SecondClassification.svm(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 6002:
            SecondClassification.gbdt(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 6003:
            SecondClassification.lr(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 6004:
            MultipleClassifition.lr(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 6005:
            MultipleClassifition.mpc(
                spark_session, operator_id, url_arr[0],
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 7001:
            Evaluation.second_evaluation(
                spark_session, operator_id,
                json.loads(operator.operator_config)['parameter'])
        elif operator.operator_type_id == 8000:
            ModelService.model_operator(
                operator_id,
                json.loads(operator.operator_config)['parameter'])

        return operator.child_operator_ids.split(',')

    except Exception:
        traceback.print_exc()
        return False
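
The type-id chain above repeats the same call shape (spark_session, operator_id, url_arr[0], parameter) for almost every branch. Below is a sketch of a dispatch-table alternative; the _stub handlers are hypothetical stand-ins for the project's service functions, and the few operators with a different signature (6000, 7001, 8000) would need thin wrappers.

import json

def _stub(name):
    # Hypothetical stand-ins for the service functions named above.
    def handler(spark_session, operator_id, url, parameter):
        print(f"{name}({operator_id!r}, {url!r}, {parameter!r})")
    return handler

OPERATOR_DISPATCH = {
    1001: _stub("preprocessService.filter_multi_conditions"),
    2004: _stub("FEService.pca"),
    6001: _stub("SecondClassification.svm"),
    # ... remaining type ids registered the same way
}

def dispatch_operator(spark_session, operator, url_arr):
    # Table lookup replaces the if/elif chain; behavior is otherwise unchanged.
    parameter = json.loads(operator.operator_config)['parameter']
    handler = OPERATOR_DISPATCH[operator.operator_type_id]
    return handler(spark_session, operator.id, url_arr[0], parameter)
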