Example 1
def frequency_statistics(spark_session, operator_id, file_url, condition):
    """
    频次统计
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data_pandas(file_url)
        # 频次统计函数
        result_df = frequency_statistics_core(df, condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_file_url = save_data_pandas(result_df)
            run_info = '频次统计算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
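
frequency_statistics_core is defined elsewhere in the project and is not shown here. Below is a minimal pandas sketch of what such a frequency count might look like; the "columnNames" key and the helper name are assumptions, not the project's actual API.

import pandas as pd

def frequency_statistics_core_sketch(df, condition):
    # Count how often each value occurs in the requested columns and stack
    # the per-column counts into one result frame.
    frames = []
    for col in condition.get("columnNames", df.columns):  # assumed key
        counts = df[col].value_counts(dropna=False).reset_index()
        counts.columns = ["value", "frequency"]
        counts.insert(0, "column", col)
        frames.append(counts)
    return pd.concat(frames, ignore_index=True)
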
Example 2
def second_evaluation(spark_session, operator_id, condition):
    """
    二分类评估
    :param spark_session:
    :param operator_id:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 评估函数
        result_df = second_evaluation_core(spark_session, condition,
                                           operator_id)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_df.show()
            result_file_url = save_data(result_df)
            run_info = '评估算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
Example 3
def chiSqSelector(spark_session, operator_id, file_url, condition):
    """
    卡方选择
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 卡方选择函数
        result_df = chiSqSelector_core(df, condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_file_url = save_data(result_df)
            run_info = '卡方选择算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
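
chiSqSelector_core is not shown above. A minimal sketch of chi-squared selection with Spark ML follows; the feature columns, label column, and number of features to keep would presumably be pulled out of condition, and the exact keys and helper name are assumptions.

from pyspark.ml.feature import ChiSqSelector, VectorAssembler

def chiSqSelector_core_sketch(df, feature_cols, label_col, num_top_features):
    # Assemble the candidate columns into a single vector, then keep the
    # top-k features ranked by a chi-squared test against the label.
    assembled = VectorAssembler(inputCols=feature_cols,
                                outputCol="features").transform(df)
    selector = ChiSqSelector(numTopFeatures=num_top_features,
                             featuresCol="features",
                             labelCol=label_col,
                             outputCol="selectedFeatures")
    return selector.fit(assembled).transform(assembled)
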
Example 4
def quantile_discretization(spark_session, operator_id, file_url, condition):
    """
    分位数离散化页面路由
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 分位数离散化函数
        result_df = quantile_discretization_core(df, condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_df.show()
            result_file_url = save_data(result_df)
            run_info = '分位数离散化算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
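
quantile_discretization_core is not shown; a plausible sketch with Spark ML's QuantileDiscretizer, assuming condition names one input column, one output column, and a bucket count (all assumed parameter names):

from pyspark.ml.feature import QuantileDiscretizer

def quantile_discretization_core_sketch(df, input_col, output_col, num_buckets=4):
    # Bin a numeric column into roughly equal-frequency buckets.
    discretizer = QuantileDiscretizer(numBuckets=num_buckets,
                                      inputCol=input_col,
                                      outputCol=output_col)
    return discretizer.fit(df).transform(df)
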
Example 5
def one_hot_encoder(spark_session, operator_id, file_url, condition):
    """
    独热编码页面路由
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:{"userId":1,"projectId":32,"columnNames":["数量","数量"],"newColumnNames":["独热编码1","独热编码2"]}
    :return:
    """

    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 独热编码函数
        result_df = one_hot_encoder_core(df, condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_file_url = save_data(result_df)
            run_info = '独热编码算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
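
Given the condition layout above (columnNames paired with newColumnNames), one_hot_encoder_core could be sketched with Spark ML's StringIndexer and OneHotEncoder roughly as follows; the Spark 3.x API is assumed and the real helper may differ.

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

def one_hot_encoder_core_sketch(df, condition):
    # Index each requested column, then one-hot encode the index into the
    # requested new column name.
    stages = []
    for src, dst in zip(condition["columnNames"], condition["newColumnNames"]):
        idx_col = dst + "_index"
        stages.append(StringIndexer(inputCol=src, outputCol=idx_col,
                                    handleInvalid="keep"))
        stages.append(OneHotEncoder(inputCol=idx_col, outputCol=dst))
    return Pipeline(stages=stages).fit(df).transform(df)
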
Example 6
def correlation_coefficient(spark_session, operator_id, file_url, condition):
    """
    相关系数
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data_pandas(file_url)
        # 相关系数函数
        result_df = correlation_coefficient_core(df, condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_file_url = save_data_pandas(result_df, '', '', 1)
            run_info = '相关系数算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
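
correlation_coefficient_core works on the pandas DataFrame read above. A minimal sketch, assuming condition lists the numeric columns to correlate (the "columnNames" key and helper name are assumptions):

def correlation_coefficient_core_sketch(df, condition):
    # Pearson correlation matrix over the selected numeric columns,
    # flattened back into a regular DataFrame for saving.
    columns = condition.get("columnNames", list(df.columns))  # assumed key
    return df[columns].corr(method="pearson").reset_index()
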
Example 7
def gbdt(spark_session, operator_id, file_url, condition):
    """
    # GBDT(Gradient Boosting Decision Tree) 又叫 MART(Multiple Additive Regression Tree),是一种迭代的决策树算法,
    # 该算法由多棵决策树组成,所有树的结论累加起来做最终答案。
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # svm_core函数
        result_model_url = gbdt_core(df, condition)
        # 修改计算状态
        run_info = 'GBDT二分类算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_model_url, run_info)
        return [result_model_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
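
gbdt_core trains the model described in the docstring. A minimal sketch with Spark ML's GBTClassifier, assuming the feature and label columns come from condition; the key names, helper name, and the model-saving step are assumptions.

from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import VectorAssembler

def gbdt_core_sketch(df, feature_cols, label_col, max_iter=20):
    # Assemble the feature columns and fit a gradient-boosted-trees binary
    # classifier; the real operator would also persist the model and return
    # its storage URL.
    assembled = VectorAssembler(inputCols=feature_cols,
                                outputCol="features").transform(df)
    gbt = GBTClassifier(labelCol=label_col, featuresCol="features",
                        maxIter=max_iter)
    return gbt.fit(assembled)
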
Example 8
def column_map(spark_session, operator_id, file_url, condition):
    """
    列映射
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:{"userId":1,"projectId":32,"parameter":[{"colName_1":"利润", "operate_1":"+","value_1":"100","operate":"+","colName_2":"数量", "operate_2":"*","value_2":"0.0001","newName":"newCol1"},{"colName_1":"利润", "operate_1":"+","value_1":"10","operate":"*","colName_2":"数量", "operate_2":"*","value_2":"0.1","newName":"newCol2"}]}
    :return:
    """

    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 列映射函数
        result_df = column_map_core(df, condition["parameter"])
        # 存储结果
        result_file_url = save_data(result_df)
        # 修改计算状态
        run_info = '列映射算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
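
Reading the parameter layout above as newName = (colName_1 operate_1 value_1) operate (colName_2 operate_2 value_2), column_map_core could be sketched with a Spark SQL expression. That reading, and the helper name, are assumptions about the project's actual behavior.

from pyspark.sql import functions as F

def column_map_core_sketch(df, parameters):
    # Build each new column from the two-operand arithmetic expression
    # described by one parameter entry (interpretation assumed above).
    for p in parameters:
        expression = "(`{0}` {1} {2}) {3} (`{4}` {5} {6})".format(
            p["colName_1"], p["operate_1"], p["value_1"],
            p["operate"],
            p["colName_2"], p["operate_2"], p["value_2"])
        df = df.withColumn(p["newName"], F.expr(expression))
    return df
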
Example 9
def fill_null_value(spark_session, operator_id, file_url, condition):
    """
    填充空值
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition: {'userId':1,'projectId':32,'parameter':[{'operate':'均值填充','colName':''},{'operate':'均值填充','colName':'最大值填充'}]}
    :return:
    """

    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 空值填充函数
        result_df = fill_null_value_core(df, condition["parameter"])
        # 存储结果
        result_file_url = save_data(result_df)
        # 修改计算状态
        run_info = '数据替换算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
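
fill_null_value_core applies a per-column strategy such as 均值填充 (mean filling). A minimal sketch covering only the mean strategy; the other strategies in the real operator, and the helper name, are not shown and are assumptions here.

from pyspark.sql import functions as F

def fill_null_value_core_sketch(df, parameters):
    # Fill nulls column by column; only mean filling ("均值填充") is sketched.
    for p in parameters:
        if p["operate"] == "均值填充" and p["colName"]:
            mean_value = df.select(F.mean(F.col(p["colName"]))).first()[0]
            df = df.fillna({p["colName"]: mean_value})
    return df
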
Example 10
def replace(spark_session, operator_id, file_url, condition):
    """
    数据替换
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition: {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "客户名称"],"replaceCharacters":[{"source":"技术","target":"技术copy"},{"source":"电话","target":"电话copy"}]}
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 替换函数
        result_df = replace_core(df, condition)
        # 存储结果
        result_file_url = save_data(result_df)
        # 修改计算状态
        run_info = '数据替换算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
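
replace_core maps cell values in the selected columns. A short sketch using DataFrame.replace, assuming whole-value replacement rather than substring replacement (the real operator may behave differently):

def replace_core_sketch(df, condition):
    # Map each "source" value to its "target" in the chosen columns.
    mapping = {r["source"]: r["target"] for r in condition["replaceCharacters"]}
    return df.replace(mapping, subset=condition["columnNames"])
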
Example 11
def columns_merge(spark_session, operator_id, file_url, condition):
    """
    多列合并
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition: {"userId": 1, "projectId": 32, "columnNames": ["类别", "子类别", "产品名称"], "connector": "-", "newColumnName": "品类名称"}
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 合并函数
        result_df = columns_merge_core(df, condition)
        # 存储结果
        result_file_url = save_data(result_df)
        # 修改计算状态
        run_info = '多列合并算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
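
columns_merge_core concatenates the listed columns with the connector. A minimal sketch with concat_ws (the helper name is an assumption; the real implementation is not shown):

from pyspark.sql import functions as F

def columns_merge_core_sketch(df, condition):
    # Join the selected columns into one string column using the connector.
    cols = [F.col(c).cast("string") for c in condition["columnNames"]]
    return df.withColumn(condition["newColumnName"],
                         F.concat_ws(condition["connector"], *cols))
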
Example 12
def column_split(spark_session, operator_id, file_url, condition):
    """
    按列拆分
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:  {"userId": 1, "projectId": 32, "columnName": "订购日期", "delimiter": "/", "newColumnNames": ["year", "月"]}
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 拆分函数
        result_df = column_split_core(spark_session, df, condition)
        # 存储结果
        result_file_url = save_data(result_df)
        # 修改计算状态
        run_info = '拆分算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
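
column_split_core splits one column on the delimiter. A minimal sketch follows; note that Spark's split treats the delimiter as a regular expression, so special characters would need escaping, and the real helper (which also receives spark_session) may differ.

from pyspark.sql import functions as F

def column_split_core_sketch(df, condition):
    # Split the column on the delimiter and expose each piece as a new column.
    parts = F.split(F.col(condition["columnName"]), condition["delimiter"])
    for i, new_name in enumerate(condition["newColumnNames"]):
        df = df.withColumn(new_name, parts.getItem(i))
    return df
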
Example 13
def read_data_with_update_record(spark_session, operator_id, file_url):
    """
    读数据算子,拷贝数据并更新算子记录表

    :param spark_session:
    :param operator_id:
    :param file_url:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 存储结果
        result_file_url = save_data(df)

        run_info = 'read_data算子执行成功'
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]
    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
Example 14
def sort(spark_session, operator_id, file_url, condition):
    """
    排序

    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition: {"userId":1,"projectId":32,"columnName":"利润","sortType":"降序"}
    :return:
    """

    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 过滤函数
        result_df = sort_core(df, condition['columnName'],
                              condition['sortType'])
        # 存储结果
        result_file_url = save_data(result_df)
        # TODO :判断返回结果是否是String(异常信息)
        run_info = '排序算子执行成功'
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
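
sort_core orders the DataFrame by one column. A minimal sketch where sortType "降序" (descending) flips the order and anything else sorts ascending (the exact set of accepted sortType values is an assumption):

from pyspark.sql import functions as F

def sort_core_sketch(df, column_name, sort_type):
    # "降序" means descending; any other value is treated as ascending here.
    column = F.col(column_name)
    return df.orderBy(column.desc() if sort_type == "降序" else column.asc())
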
Example 15
def random_split(spark_session, operator_id, file_url, condition):
    """
    按照比例随机划分数据
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 划分函数
        (result_df1, result_df2) = random_split_core(df, condition)
        # 存储结果
        result_file_url1 = save_data(result_df1)
        result_file_url2 = save_data(result_df2)
        # 修改计算状态
        run_info = '列映射算子执行成功'
        OperatorDao.update_operator_by_id(
            operator_id, 'success', result_file_url1 + "*," + result_file_url2,
            run_info)
        return [result_file_url1, result_file_url2]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
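
random_split_core most likely wraps DataFrame.randomSplit. A minimal sketch, assuming condition supplies a single left-hand proportion; the "ratio" key, the default, and the seed are assumptions.

def random_split_core_sketch(df, condition):
    # Split rows into two DataFrames by the requested proportion.
    ratio = float(condition.get("ratio", 0.7))  # assumed key and default
    return df.randomSplit([ratio, 1.0 - ratio], seed=2020)
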
Example 16
def filter_multi_conditions(spark_session, operator_id, file_url, condition):
    """
    按照多个条件进行过滤

    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition: {"userId":1,"projectId":32,"parameter":[{"colName":"利润", "operate":">","value":"100", "relation":"AND"},{"colName":"装运方式", "operate":"==", "value":"一级", "relation":""}]}
    :return:
    """

    try:

        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 过滤函数
        result_df = filter_core(spark_session, df, condition['parameter'])
        # 存储结果
        result_file_url = save_data(result_df)

        run_info = '过滤算子执行成功'
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_file_url, run_info)
        return [result_file_url]
    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
        return []
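
filter_core chains the per-column conditions into one predicate, joined by each entry's relation (AND/OR, empty on the last entry). A sketch that builds a single Spark SQL expression, assuming operate and value can be spliced into SQL directly; the real helper also takes spark_session and may work differently.

def filter_core_sketch(df, parameters):
    # Build e.g. "`利润` > '100' AND `装运方式` == '一级'" and filter on it;
    # the last entry is expected to carry an empty relation.
    pieces = []
    for p in parameters:
        pieces.append("`{0}` {1} '{2}'".format(p["colName"], p["operate"],
                                               p["value"]))
        if p.get("relation"):
            pieces.append(p["relation"])
    return df.filter(" ".join(pieces))
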
Example 17
def lr(spark_session, operator_id, file_url, condition):
    """
    逻辑回归多分类
    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # svm_core函数
        result_model_url = lr_core(df, condition)
        # 修改计算状态
        run_info = '逻辑回归多分类算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_model_url, run_info)
        return [result_model_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
Example 18
def mpc(spark_session, operator_id, file_url, condition):
    """
    mpc多分类
    Classifier trainer based on the Multilayer Perceptron.
    Each layer has sigmoid activation function, output layer has softmax.
    Number of inputs has to be equal to the size of feature vectors.
    Number of outputs has to be equal to the total number of labels.

    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # svm_core函数
        result_model_url = mpc_core(df, condition)
        # 修改计算状态
        run_info = 'mpc多分类算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          result_model_url, run_info)
        return [result_model_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
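
mpc_core fits the multilayer perceptron described in the docstring. A minimal sketch with MultilayerPerceptronClassifier, assuming feature columns, label column, hidden-layer sizes, and class count are taken from condition (all assumed parameters):

from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import VectorAssembler

def mpc_core_sketch(df, feature_cols, label_col, hidden_layers, num_classes):
    # Layer sizes: inputs = number of features, outputs = number of labels,
    # with the hidden layers in between (see the docstring above).
    assembled = VectorAssembler(inputCols=feature_cols,
                                outputCol="features").transform(df)
    layers = [len(feature_cols)] + list(hidden_layers) + [num_classes]
    mpc = MultilayerPerceptronClassifier(labelCol=label_col,
                                         featuresCol="features",
                                         layers=layers, maxIter=100, seed=2020)
    return mpc.fit(assembled)
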
Example 19
def initial_execute_status(execute_user_id, start_nodes):
    """
    每次执行model时,初始化执行状态
    :param execute_user_id:
    :param start_nodes: []
    :return:
    """
    # 查找参与运行的 operator
    operator_list = []
    operator_id_queue = []
    for x in start_nodes:
        operator_id_queue.append(x)
    while len(operator_id_queue) > 0:
        operator_id = operator_id_queue.pop(0)
        if operator_id is None or operator_id == "":
            continue
        operator = OperatorDao.get_operator_by_id(operator_id)
        operator_list.append(operator)
        for x in operator.child_operator_ids.split(','):
            operator_id_queue.append(x)

    # 每个operator 状态初始化为initial
    for operator in operator_list:
        OperatorDao.update_operator_by_id(operator.id, "initial")

    # 追加执行记录
    model_execute = ModelExecute(start_nodes=','.join(start_nodes),
                                 status='initial',
                                 execute_user_id=execute_user_id,
                                 create_time=time.strftime(
                                     "%Y-%m-%d %H:%M:%S", time.localtime()))
    model_execute = ModelExecuteDao.create_model_execute(model_execute)
    if model_execute is False:
        return False
    else:
        return model_execute.id
Example 20
def ml_predict(spark_session, operator_id, file_urls, condition):
    """
    机器学习模型预测函数
    :param spark_session:
    :param operator_id:
    :param file_urls: ["modelUrl","predictDataUrl"]
    # 两个输入源 一个是模型 一个是预测数据
    :param condition:
    :return:
    """
    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        for url in file_urls:
            print("------fileUrl:", file_urls)
            if url[-4:] == ".csv":
                url1 = url
            else:
                url0 = url
        df = read_data(spark_session, url1)
        # 预测函数
        result_df = ml_predict_core(spark_session, operator_id, df, url0,
                                    condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_df.show()
            result_file_url = save_data(result_df)
            run_info = '预测算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
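
ml_predict_core loads the saved model from url0 and scores df. A minimal sketch, assuming the model was persisted as a Spark ML PipelineModel; the real project may store other model types and pass extra arguments.

from pyspark.ml import PipelineModel

def ml_predict_core_sketch(spark_session, df, model_url):
    # Load a previously saved Spark ML pipeline model and score the data.
    model = PipelineModel.load(model_url)
    return model.transform(df)
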
Example 21
def vector_indexer(spark_session, operator_id, file_url, condition):
    """
    向量索引转换
    # 向量索引转换旨在转换Vector, 例如:[aa, bb, cc],而非本例中的单独值,由于没有合适的数据可用,暂时把单独值转换成vector实现功能: aa -> [aa]

    :param spark_session:
    :param operator_id:
    :param file_url:
    :param condition:
    :return:
    """

    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 读取数据
        df = read_data(spark_session, file_url)
        # 向量索引转换函数
        result_df = vector_indexer_core(df, condition)
        if isinstance(result_df, str):
            OperatorDao.update_operator_by_id(operator_id, 'error', '',
                                              result_df)
        else:
            # 存储结果
            result_df.show()
            result_file_url = save_data(result_df)
            run_info = '向量索引转换化算子执行成功'
            # 修改计算状态
            OperatorDao.update_operator_by_id(operator_id, 'success',
                                              result_file_url, run_info)
            return [result_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []
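
Following the docstring's note that single values are first wrapped into a one-element vector, vector_indexer_core could be sketched with VectorAssembler plus VectorIndexer; the column names, maxCategories default, and helper name here are assumptions.

from pyspark.ml.feature import VectorAssembler, VectorIndexer

def vector_indexer_core_sketch(df, input_col, output_col, max_categories=10):
    # Wrap the single numeric value into a one-element vector, then let
    # VectorIndexer decide which dimensions are categorical and index them.
    assembled = VectorAssembler(inputCols=[input_col],
                                outputCol="assembled_vector").transform(df)
    indexer = VectorIndexer(inputCol="assembled_vector", outputCol=output_col,
                            maxCategories=max_categories)
    return indexer.fit(assembled).transform(assembled)
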
Example 22
def model_operator(operator_id, condition):
    """
    加载模型算子
    :param operator_id:
    :param condition:{"MLModelId": 2, "modelTypeId": 6001}
    :return:
    """

    try:
        # 修改计算状态
        OperatorDao.update_operator_by_id(operator_id, 'running', '', '')
        # 评估函数
        model_file_url = model_operator_core(condition)
        # 修改计算状态
        run_info = '模型算子执行成功'
        OperatorDao.update_operator_by_id(operator_id, 'success',
                                          model_file_url, run_info)
        return [model_file_url]

    except Exception as e:
        run_info = str(e)
        OperatorDao.update_operator_by_id(operator_id, 'error', '', run_info)
        traceback.print_exc()
    return []