Example #1
def convert_fields_type(sds_id, f_t_arrays):
    """
    convert field types of staging data set
    :param sds_id: ObjectId
    :param f_t_arrays: array, e.g. [['name', 'str'], ['age', 'int'],
        ['salary', 'float']]
    :return: new staging_data_set object
    """
    # get project object
    # project = project_business.get_by_id(project_id)

    # create new staging data set
    sds = staging_data_set_business.get_by_id(sds_id)
    # copy data from data(raw) to staging data
    # get all data objects by data_set id

    data_objects = staging_data_business.get_by_staging_data_set_id(
        sds['id'])
    # convert mongoengine objects to dicts
    data_objects = json_utility.me_obj_list_to_dict_list(data_objects)
    # convert types of values in dicts
    result = data_utility.convert_data_array_by_fields(data_objects,
                                                       f_t_arrays)
    data_objects = result['result']

    # update all rows
    for data_obj in data_objects:
        staging_data_business.update_by_id(data_obj['_id'], data_obj)

    if 'failure_count' in result:
        failure_count = result['failure_count']
        return {'result': sds, 'failure_count': failure_count}
    return {'result': sds}
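
For context, a minimal sketch of how this conversion helper might be invoked; the ObjectId value is reused from the other snippets on this page, and the call shape is an assumption based on the docstring, not a verified API.

# Hypothetical usage sketch for convert_fields_type (illustrative only).
from bson import ObjectId

f_t_arrays = [['name', 'str'], ['age', 'int'], ['salary', 'float']]
outcome = convert_fields_type(ObjectId('595cb76ed123ab59779604c3'), f_t_arrays)
# failure_count is only present when some rows could not be converted
if 'failure_count' in outcome:
    print('rows that failed conversion:', outcome['failure_count'])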
Example #2
def objective(args):
    steps = args
    params['fit']['args']['steps'] = steps
    sds = staging_data_set_business.get_by_id('595cb76ed123ab59779604c3')
    result = custom_model(params,
                          linear_regressor_model_fn,
                          input,
                          result_sds=sds)
    return result['eval_metrics']['loss']
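
This `objective` has the shape of a hyperopt objective (take a sampled value, return a loss). A minimal sketch of how it might be driven, assuming the hyperopt library and an illustrative search space; `params`, `input`, and `custom_model` come from the surrounding context, which is not shown in the original.

# Hypothetical sketch: minimizing the objective above with hyperopt.
# The search space bounds and max_evals are illustrative assumptions.
from hyperopt import fmin, tpe, hp

best = fmin(fn=objective,
            space=hp.quniform('steps', 100, 1000, 50),
            algo=tpe.suggest,
            max_evals=20)
print(best)  # e.g. {'steps': 300.0}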
Example #3
def usr2_visualization():
    # parse the staging data set id from the request body
    data = request.get_json()
    staging_data_set_id = data.get('staging_data_set_id')

    sds_data = staging_data_set_business.get_by_id(
        ObjectId(staging_data_set_id))

    # pull the cached visualization and the job that produced it
    data = sds_data.visualization
    job_obj = sds_data.job
    visual_type = job_obj.toolkit.category

    # overwrite the id with the one recorded in the job's first step argument
    staging_data_set_id = job_obj.steps[0]["args"][0]["value"]

    result = visualization_service.usr_story2_exploration(data, visual_type,
                                                          staging_data_set_id)
    return jsonify({'response': json_utility.convert_to_json(result)}), 200
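
The function reads like a Flask view (it calls request.get_json and returns jsonify). A minimal sketch of wiring it up; the app instance, URL, and endpoint name are illustrative assumptions not present in the original.

# Hypothetical sketch: registering the view above on a Flask app.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/visualization/usr2', 'usr2_visualization',
                 usr2_visualization, methods=['POST'])

# Illustrative client-side call:
# curl -X POST -H "Content-Type: application/json" \
#      -d '{"staging_data_set_id": "595cb76ed123ab59779604c3"}' \
#      http://localhost:5000/visualization/usr2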
Example #4
def model_input_manager_unstructured(conf, data_set_id, **kwargs):
    # resolve the dataset's file record to locate the image directories
    file = staging_data_set_business.get_by_id(data_set_id).file
    print(file['uri'])
    input = {
        'train_data_dir': file['uri'] + 'train/',
        'validation_data_dir': file['uri'] + 'validation/'
    }
    # count every file under each directory, recursively
    input['nb_train_samples'] = sum(
        len(files) for _, _, files in os.walk(input['train_data_dir']))
    input['nb_validation_samples'] = sum(
        len(files) for _, _, files in os.walk(input['validation_data_dir']))
    return input
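
The train/validation sample counts suggest a Keras-style image directory layout under file['uri']. A minimal sketch of the structure this helper appears to expect; the class names and root path are illustrative assumptions.

# Hypothetical layout implied by the os.walk counts above:
#
#   <uri>/train/cats/...jpg
#   <uri>/train/dogs/...jpg
#   <uri>/validation/cats/...jpg
#   <uri>/validation/dogs/...jpg
import os

def count_files(root):
    # mirrors the counting logic above: every file under root, recursively
    return sum(len(files) for _, _, files in os.walk(root))

print(count_files('/tmp/data/train/'))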
Example #5
            "split_after_samples": 20
        }
    },
    'fit': {
        "args": {
            "steps": 300
        }
    },
    'evaluate': {
        'args': {
            'steps': 1
        }
    }
}

from server3.lib.models.randomforest import random_forest_model_fn

sds = staging_data_set_business.get_by_id('595cb76ed123ab59779604c3')
result = custom_model(params, random_forest_model_fn, input, result_sds=sds)
print(result)
# #

# # test logistic_regressor
#
# input = {
#     'model_name': 'logistic_regressor',
#     'df_features': iris_feature,
#     'df_labels': iris_label,
# }
# params = {
#     'estimator': {
#         'args': {
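
Piecing together the visible fragments, `params` appears to hold per-phase argument blocks. A minimal sketch of that shape; the 'estimator' block is truncated in the original, so its contents here are placeholders, not recovered values.

# Hypothetical sketch of the params shape suggested by the fragment above.
params = {
    'estimator': {
        'args': {
            # ...truncated in the original; the visible tail shows
            # 'split_after_samples': 20
        }
    },
    'fit': {'args': {'steps': 300}},
    'evaluate': {'args': {'steps': 1}},
}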
Example #6
        def wrapper(*args, **kw):

            # create a job
            staging_data_set_obj = staging_data_set_business.get_by_id(
                staging_data_set_id)
            project_obj = project_business.get_by_id(project_id)
            job_spec = {
                "fields": {
                    "source": fields[0],
                    "target": fields[1]
                },
                "params": kw
            }
            job_obj = job_business.add_toolkit_job(toolkit_obj,
                                                   staging_data_set_obj,
                                                   project_obj, **job_spec)
            # update a project
            project_business.insert_job_by_id(project_id, job_obj.id)

            # calculate
            func_rst = func(*args, **kw)
            result = list(func_rst) if isinstance(func_rst,
                                                  tuple) else [func_rst]

            # newly designed way of storing and retrieving results
            results = {"fields": {"source": fields[0], "target": fields[1]}}
            gen_info = []
            result_spec = toolkit_obj.result_spec

            for arg in result_spec["args"]:
                value = result.pop(0)
                results.update({arg["name"]: value})
                if arg["if_add_column"]:
                    # Chinese column names cannot be used here
                    str_name = "%s_col" % toolkit_obj.entry_function
                    value = data_utility.retrieve_nan_index(value, nan_index)
                    try:
                        staging_data_service.update_many_with_new_fields(
                            value, nan_index, fields[0], str_name,
                            staging_data_set_id)
                    except (TypeError, ValueError) as e:
                        print("ERROR saving data to database: %s" % e)

                if arg.get("attribute", False) and arg["attribute"] == "label":
                    labels = value
                elif arg.get("attribute",
                             False) and arg["attribute"] == "general_info":
                    gen_info.append({
                        arg["name"]: {
                            "value": value,
                            "description": arg["des"]
                        }
                    })

            # visualization computation
            # clustering analysis
            if toolkit_obj.category == 0:
                json = {
                    "scatter":
                    data_utility.retrieve_nan_index(args[0], nan_index),
                    "labels":
                    labels,
                    "pie": [{
                        'name': el,
                        'value': labels.count(el)
                    } for el in set(labels)],
                    "centers":
                    results["Centroids of Clusters"],
                    "general_info":
                    gen_info,
                    "fields":
                    fields[0],
                    "category":
                    toolkit_obj.category
                }

            # feature selection
            elif toolkit_obj.category == 1:
                from scipy.stats import pearsonr
                # from minepy import MINE
                data = list(zip(*args[0]))
                target_flag = 1 if len(args) == 2 else 0
                target = args[1] if target_flag else None

                json = {
                    "Y_target": fields[1],
                    "X_fields": fields[0],
                    "labels": labels,
                    "bar": results["scores"],
                    "general_info": {
                        "Selected Features":
                        "%s out of %s" %
                        (len(list(filter(lambda x: x is True,
                                         labels))), len(fields[0])),
                        "Selected Fields":
                        " ".join(
                            str(el)
                            for el in list(compress(fields[0], labels))),
                        "Number of NaN":
                        len(nan_index)
                    },
                    "scatter": {
                        "y_domain":
                        target,
                        "x_domain":
                        data,
                        "pearsonr": [
                            pearsonr(el, target)[0] if target_flag else None
                            for el in data
                        ],
                        # "mic": [MINE(alpha=0.6, c=15, est="mic_approx").compute_score(el,
                        # list(data[0]).mic()) for el in list(data[1:])]}
                        "mic": [None for el in data]
                    },
                    "category": toolkit_obj.category
                }

            # numeric transformation
            elif toolkit_obj.category == 2:
                inn = 0
                while inn in nan_index:
                    inn = inn + 1
                # output formats vary, so check whether the data is 2-D
                # (yes => 1, no => 0)
                flag_shape = 1 if isinstance(labels[inn], list) else 0

                result_be = labels if flag_shape else np.array(labels).reshape(
                    [-1, 1]).tolist()

                data = list(zip(*args[0]))
                result = list(zip(*result_be))

                # the two tables used to be merged here; no longer needed
                # merge_data = list(zip(*(data + result)))
                if len(result) == len(fields[0]):
                    lab_fields = [
                        str(fields[0][i]) + "_New_Col"
                        for i in range(len(result))
                    ]
                else:
                    lab_fields = [
                        str(fields[0][0]) + "_New_Col_" + str(i)
                        for i in range(len(result))
                    ]

                # merge_fields = fields[0] + lab_fields

                flag_str1 = isinstance(args[0][inn][0], str)
                flag_str2 = isinstance(result_be[inn][0], str)
                bar1 = []
                bar2 = []
                for el in fields[0]:
                    indx = fields[0].index(el)
                    raw_d = data[indx]

                    if not flag_str1 and len(set(raw_d)) > 5:
                        bar1_tmp = visualization_service.freq_hist(raw_d)
                    else:
                        seta = set(raw_d)
                        x_domain = [el for el in seta]
                        y_domain = [raw_d.count(el) for el in seta]
                        bar1_tmp = {'x_domain': x_domain, 'y_domain': y_domain}
                    bar1_tmp.update({"field": el, "title": "Data distribution histogram (before column transform)"})
                    bar1.append(bar1_tmp)

                for el in lab_fields:
                    indx = lab_fields.index(el)
                    raw_re = result[indx]

                    if not flag_str2 and len(set(raw_re)) > 5:
                        bar2_tmp = visualization_service.freq_hist(raw_re)
                    else:
                        seta = set(raw_re)
                        x_domain = [el for el in seta]
                        y_domain = [raw_re.count(el) for el in seta]
                        bar2_tmp = {'x_domain': x_domain, 'y_domain': y_domain}
                    bar2_tmp.update({"field": el, "title": "Data distribution histogram (after column transform)"})
                    bar2.append(bar2_tmp)

                json = {
                    "category": toolkit_obj.category,
                    "table1": {
                        "title": "原始数据",
                        "field": fields[0],
                        "data": [dict(zip(fields[0], arr)) for arr in args[0]]
                    },
                    "table2": {
                        "title": "转换后数据",
                        "field": lab_fields,
                        "data":
                        [dict(zip(lab_fields, arr)) for arr in result_be]
                    },
                    "bar1": bar1,
                    "bar2": bar2
                }

            # dimensionality reduction
            elif toolkit_obj.category == 3:
                flag = toolkit_obj.parameter_spec["data"]["type"][
                    "key"] == "transfer_box"
                data = list(zip(*args[0]))

                if flag:
                    data.append(args[1])
                lab = list(zip(*labels))
                lab_fields = ["New Col" + str(i) for i in range(len(lab))]
                var1 = [np.var(da) for da in data]
                var2 = [np.var(da) for da in lab]
                merge_fields = fields[0] + fields[1] if fields[1] else \
                    fields[0]
                x_domain = merge_fields + ["_empty"] + lab_fields
                y_domain = var1 + [0] + var2

                temp = var1[:-1] if flag else var1
                json = {
                    "table1": {
                        "X_fields":
                        fields[0],
                        "Y_fields":
                        fields[1],
                        "data": [
                            dict(zip(merge_fields, arr))
                            for arr in list(zip(*data))
                        ]
                    },
                    "table2": {
                        "data": [dict(zip(lab_fields, arr)) for arr in labels],
                        "fields": lab_fields
                    },
                    "bar": {
                        "x_domain": x_domain,
                        "y_domain": y_domain
                    },
                    "pie1": [{
                        "name": fields[0][i],
                        "value": temp[i]
                    } for i in range(len(temp))],
                    "pie2": [{
                        "name": lab_fields[i],
                        "value": var2[i]
                    } for i in range(len(var2))],
                    "general_info":
                    gen_info,
                    "category":
                    toolkit_obj.category
                }

            else:
                json = {}

            # update a job
            job_business.end_job(job_obj)

            if result_spec["if_reserved"]:
                # create result sds for toolkit
                sds_name = '%s_%s_result' % (toolkit_obj['name'],
                                             job_obj['id'])
                result_sds_obj = staging_data_set_business.add(sds_name,
                                                               'des',
                                                               project_obj,
                                                               job=job_obj,
                                                               type='result')
                logger_service.save_result(
                    result_sds_obj,
                    **{"result": json_utility.convert_to_json(results)})
                logger_service.save_result(result_sds_obj,
                                           **{"visualization": json})
                return {
                    "visual_sds_id": str(result_sds_obj.id) if json else None,
                    "result": results
                }

            return {"result": results}
Example #7
def model_to_code(conf, project_id, data_source_id, model_id, job_obj,
                  **kwargs):
    """
    run model by model_id and the parameter config

    :param conf:
    :param project_id:
    :param data_source_id:
    :param model_id:
    :param kwargs:
    :return:
    """
    file_id = kwargs.get('file_id')
    staging_data_set_obj = None
    if data_source_id:
        staging_data_set_obj = \
            staging_data_set_business.get_by_id(data_source_id)
    project_obj = project_business.get_by_id(project_id)
    file_dict = {'file': ObjectId(file_id)} if file_id else {}
    model_obj = model_business.get_by_model_id(model_id)

    run_args = {
        "conf": conf,
        "project_id": project_id,
        "data_source_id": data_source_id,
        "model_id": model_id,
        "kwargs": kwargs
    }

    # # create model job
    # job_obj = job_business.add_model_job(model_obj, staging_data_set_obj,
    #                                      project_obj, params=conf,
    #                                      run_args=run_args,
    #                                      **file_dict)

    job_obj = job_business.update_job_by_id(job_obj.id, model=model_obj,
                                            staging_data_set=staging_data_set_obj,
                                            project=project_obj, params=conf,
                                            run_args=run_args, status=100)
    job_id = str(job_obj.id)

    # model_obj = model_business.get_by_model_id(model_id)
    f = getattr(models, model_obj.to_code_function)

    if model_obj['category'] == 0:
        # keras nn
        head_str = manage_supervised_input_to_str(conf, data_source_id,
                                                  **kwargs)
        return job_service.run_code(conf, project_id, data_source_id,
                                    model_obj, f, job_id, head_str)
    elif model_obj['category'] == ModelType['unstructured']:
        # input from folder
        head_str = manage_unstructured_to_str(conf, data_source_id,
                                              **kwargs)
        return job_service.run_code(conf, project_id, None,
                                    model_obj, f, job_id, head_str,
                                    file_id=data_source_id)

    elif model_obj['category'] == ModelType['advanced']:
        # no input
        return job_service.run_code(conf, project_id, None,
                                    model_obj, f, job_id, '',
                                    file_id=None)
    else:
        # custom models
        head_str = ''
        head_str += 'import logging\n'
        head_str += 'import numpy as np\n'
        head_str += 'import pandas as pd\n'
        head_str += 'import tensorflow as tf\n'
        head_str += 'from tensorflow.python.framework import constant_op\n'
        head_str += 'from tensorflow.python.framework import dtypes\n'
        head_str += 'from tensorflow.contrib.learn.python.learn import metric_spec\n'
        head_str += 'from server3.lib import models\n'
        head_str += 'from server3.lib.models.modified_tf_file.monitors import ValidationMonitor\n'
        head_str += 'from server3.business import staging_data_set_business\n'
        head_str += 'from server3.business import staging_data_business\n'
        head_str += 'from server3.service import staging_data_service\n'
        head_str += "from server3.service import job_service\n"
        head_str += 'from server3.service.model_service import ' \
                    'split_categorical_and_continuous\n'
        head_str += 'from server3.service.custom_log_handler ' \
                    'import MetricsHandler\n'
        head_str += 'model_fn = models.%s\n' % model_obj.entry_function
        head_str += "data_source_id = '%s'\n" % data_source_id
        head_str += "model_name = '%s'\n" % model_obj.name
        head_str += "kwargs = %s\n" % kwargs
        fit = conf.get('fit', None)
        if model_obj['category'] == 1:
            data_fields = fit.get('data_fields', [[], []])
            head_str += 'data_fields = %s\n' % data_fields
            head_str += inspect.getsource(
                model_input_manager_custom_supervised)
            head_str += "input_dict = model_input_manager_custom_supervised(" \
                        "data_fields, data_source_id, model_name, **kwargs)\n"
        elif model_obj['category'] == 2:
            x_cols = fit.get('data_fields', [])
            head_str += "x_cols = %s\n" % x_cols
            head_str += inspect.getsource(model_input_manager_unsupervised)
            head_str += "input_dict = model_input_manager_unsupervised(x_cols, " \
                        "data_source_id, model_name)\n"
        return job_service.run_code(conf, project_id, data_source_id,
                                    model_obj, f, job_id, head_str)
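
The custom-model branch above builds a runnable script by concatenating import lines with inspect.getsource of a helper, then appending a call to that helper. A minimal self-contained illustration of the technique; the helper name and columns are illustrative, and exec stands in for handing the string to job_service.run_code.

# Minimal illustration of the code-generation technique used above.
import inspect

def model_input_manager_demo(x_cols):  # illustrative stand-in helper
    return {'x_cols': x_cols}

head_str = 'x_cols = %s\n' % ['sepal_length', 'sepal_width']
head_str += inspect.getsource(model_input_manager_demo)
head_str += 'input_dict = model_input_manager_demo(x_cols)\n'
exec(head_str)  # the real code hands head_str to job_service.run_code instead
print(input_dict)  # {'x_cols': ['sepal_length', 'sepal_width']}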
Example #8
def kube_run_model(conf, project_id, data_source_id, model_id, job_obj,
                   **kwargs):
    # file_id = kwargs.get('file_id')
    staging_data_set_obj = None
    if data_source_id:
        staging_data_set_obj = \
            staging_data_set_business.get_by_id(data_source_id)
    project_obj = project_business.get_by_id(project_id)
    # file_dict = {'file': ObjectId(file_id)} if file_id else {}
    model_obj = model_business.get_by_model_id(model_id)

    run_args = {
        "conf": conf,
        "project_id": project_id,
        "data_source_id": data_source_id,
        "model_id": model_id,
        "kwargs": kwargs
    }

    job_obj = job_business.update_job_by_id(job_obj.id, model=model_obj,
                                            staging_data_set=staging_data_set_obj,
                                            project=project_obj, params=conf,
                                            run_args=run_args, status=100)

    job_id = str(job_obj.id)
    print(job_id)
    # NOTE: this early return executes the model in-process; everything
    # below (the Kubernetes job submission) is currently unreachable.
    return run_model(conf, project_id, data_source_id, model_id, job_id,
                     **kwargs)
    cwd = os.getcwd()
    job_name = job_id + '-training-job'
    client = kube_service.client
    try:
        # TODO need to terminate running pod
        kube_service.delete_job(job_name)
        while True:
            kube_service.get_job(job_name)
            time.sleep(1)
    except client.rest.ApiException:
        print('job not exists or deleted, ok to create')

    kube_json = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": job_name
        },
        "spec": {
            "template": {
                "metadata": {
                    "labels": {
                        "app": job_id
                    }
                },
                "spec": {
                    "containers": [
                        {
                            "name": job_id,
                            "image": "10.52.14.192/gzyw/model_app_pre",
                            "imagePullPolicy": "IfNotPresent",
                            "securityContext": {
                                "privileged": True,
                            },
                            "stdin": True,
                            "command": ["/usr/local/bin/python"],
                            "args": [
                                "run_model.py",
                                "--job_id", job_id
                            ],
                            "volumeMounts": [
                                {
                                    "mountPath": "/pyserver/user_directory",
                                    "name": "nfsvol"
                                },
                            ]
                        }
                    ],
                    "restartPolicy": "Never",
                    # "activeDeadlineSeconds": 1,
                    "volumes": [
                        {
                            "name": "nfsvol",
                            "persistentVolumeClaim": {
                                "claimName": "nfs-pvc"
                            }
                        },
                    ]
                },
            },
        }
    }
    # file_utils.write_to_filepath(json.dumps(kube_json), './model_app.json')
    # return
    api = kube_service.job_api
    resp = api.create_namespaced_job(body=kube_json, namespace=NAMESPACE)
    print("Job created. status='%s'" % str(resp.status))
    return {'job_id': job_id}
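
For reference, a minimal sketch of submitting a dict-based Job manifest with the official Kubernetes Python client; the snippet's kube_service is assumed to wrap something similar, but its internals are not shown here.

# Hypothetical sketch: creating a Job from a dict manifest with the
# official kubernetes Python client.
from kubernetes import client, config
from kubernetes.client.rest import ApiException

config.load_kube_config()  # or config.load_incluster_config() inside a pod
batch = client.BatchV1Api()
try:
    resp = batch.create_namespaced_job(namespace='default', body=kube_json)
    print("Job created. status='%s'" % str(resp.status))
except ApiException as e:
    print('job creation failed: %s' % e)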