Ejemplo n.º 1
0
def save_as_result(job_id, new_sds_name):
    """Save a finished job's result into a newly created staging data set.

    Creates a new staging data set named *new_sds_name* under the job's
    project, copies every row of the job's source staging data set into
    it, then persists the job result via ``save_result_sub``.

    :param job_id: id of the finished job whose result is saved
    :param new_sds_name: name for the new staging data set
    """
    job_obj = job_business.get_by_job_id(job_id)
    result = job_obj.result
    toolkit = job_obj.toolkit
    project_obj = job_obj.project

    sds_id = staging_data_set_business.add(
        name=new_sds_name,
        description='des',
        project=project_obj,
        # job=job_obj
    )

    # fetch the original table (the job's source staging data set)
    old_sds = StepBusiness.get_datasource(job_obj.steps)
    table = staging_data_business.get_by_staging_data_set_id(old_sds)

    # strip per-row identifiers so rows can be re-inserted into the new sds
    # (iterate the cursor directly instead of indexing with range(len(...)))
    table_dict = []
    for doc in table:
        row = doc.to_mongo().to_dict()
        row.pop("_id")
        row.pop("staging_data_set")
        table_dict.append(row)

    # copy the original table into the new staging data set
    staging_data_business.add_many(staging_data_set=sds_id,
                                   data_array=table_dict)

    # save the result
    save_result_sub(result, sds_id, toolkit)
Ejemplo n.º 2
0
        def wrapper(*args, **kw):
            """Run a model job: register the job on its project, (re)create
            the job's result staging data set, invoke ``func`` and mark the
            job as ended.

            NOTE(review): this closure reads names not visible in this
            chunk — ``kwargs``, ``project_id``, ``job_id``, ``model_obj``,
            ``TYPE`` and ``func`` presumably come from the enclosing
            decorator; confirm against the outer scope. In particular
            ``kwargs.get('result_dir')`` reads the outer ``kwargs``, not
            the local ``kw`` — verify this is intentional.
            """
            # create a job
            # model_obj = model_business.get_by_model_id(model_id)
            result_dir = kwargs.get('result_dir')

            project_obj = project_business.get_by_id(project_id)

            job_obj = job_business.get_by_job_id(job_id)

            # update a project
            project_business.insert_job_by_id(project_id, job_obj.id)
            project_business.update_items_to_list_field(project_id,
                                                        related_tasks=TYPE.get(
                                                            model_obj.category,
                                                            []))
            # create result sds for model
            sds_name = '%s_%s_result' % (model_obj['name'], job_obj['id'])
            # drop any sds already bound to this job, then always create a
            # fresh one — the 'finally' clause runs on both paths
            try:
                sds = staging_data_set_business.get_by_job_id(job_obj.id)
            except DoesNotExist:
                print('free to create sds')
            else:
                staging_data_set_business.remove_by_id(sds.id)
            finally:
                result_sds_obj = staging_data_set_business.add(sds_name,
                                                               'des',
                                                               project_obj,
                                                               job=job_obj,
                                                               type='result')

            # run
            if result_dir:
                # result_dir += str(job_obj['id']) + '/'
                # ensure the result directory exists, then forward it to func
                try:
                    os.makedirs(result_dir)
                except FileExistsError:
                    print('dir exists, no need to create')
                kw['result_dir'] = result_dir

            # generate_job_py(func, *args, **kw, result_sds=result_sds_obj,
            #                 project_id=project_id)

            func_result = func(*args,
                               **kw,
                               result_sds=result_sds_obj,
                               project_id=project_id,
                               job_id=job_id)
            # update a job
            job_business.end_job(job_obj)
            # tag the result payload with the job id when it is a dict
            if isinstance(func_result, dict):
                func_result['job_id'] = str(job_obj['id'])

            return func_result
Ejemplo n.º 3
0
        def wrapper(*args, **kw):
            """Run a toolkit function as a tracked job and build its
            visualization payload.

            Creates a job, calls ``func``, distributes the returned values
            according to the toolkit's ``result_spec``, builds a
            category-specific visualization dict, ends the job and
            (optionally) stores the results in a new result sds.

            NOTE(review): closes over names not visible in this chunk —
            ``staging_data_set_id``, ``project_id``, ``toolkit_obj``,
            ``fields``, ``nan_index`` and ``func`` presumably come from
            the enclosing decorator; confirm against the outer scope.
            """

            # create a job
            staging_data_set_obj = staging_data_set_business.get_by_id(
                staging_data_set_id)
            project_obj = project_business.get_by_id(project_id)
            # record the chosen source/target fields and the call params
            job_spec = {
                "fields": {
                    "source": fields[0],
                    "target": fields[1]
                },
                "params": kw
            }
            job_obj = job_business.add_toolkit_job(toolkit_obj,
                                                   staging_data_set_obj,
                                                   project_obj, **job_spec)
            # update a project
            project_business.insert_job_by_id(project_id, job_obj.id)

            # calculate
            func_rst = func(*args, **kw)
            # normalise to a list so entries can be popped positionally below
            result = list(func_rst) if isinstance(func_rst,
                                                  tuple) else [func_rst]

            # newly designed storage format for the results
            results = {"fields": {"source": fields[0], "target": fields[1]}}
            gen_info = []
            result_spec = toolkit_obj.result_spec

            # consume the returned values in the order declared by the
            # toolkit's result_spec
            for arg in result_spec["args"]:
                value = result.pop(0)
                results.update({arg["name"]: value})
                if arg["if_add_column"]:
                    # a Chinese column name cannot be used here
                    str_name = "%s_col" % toolkit_obj.entry_function
                    value = data_utility.retrieve_nan_index(value, nan_index)
                    try:
                        staging_data_service.update_many_with_new_fields(
                            value, nan_index, fields[0], str_name,
                            staging_data_set_id)
                    except (TypeError, ValueError) as e:
                        # best-effort: failure to persist the new column is
                        # reported but does not abort the job
                        print("ERRORS in data saved to database")

                if arg.get("attribute", False) and arg["attribute"] == "label":
                    labels = value
                elif arg.get("attribute",
                             False) and arg["attribute"] == "general_info":
                    gen_info.append({
                        arg["name"]: {
                            "value": value,
                            "description": arg["des"]
                        }
                    })

            # visualization computation
            # clustering analysis
            if toolkit_obj.category == 0:
                json = {
                    "scatter":
                    data_utility.retrieve_nan_index(args[0], nan_index),
                    "labels":
                    labels,
                    "pie": [{
                        'name': el,
                        'value': labels.count(el)
                    } for el in set(labels)],
                    "centers":
                    results["Centroids of Clusters"],
                    "general_info":
                    gen_info,
                    "fields":
                    fields[0],
                    "category":
                    toolkit_obj.category
                }

            # feature selection
            elif toolkit_obj.category == 1:
                from scipy.stats import pearsonr
                # from minepy import MINE
                data = list(zip(*args[0]))
                # a second positional arg is the regression/classification
                # target; without it the pearsonr column is left as None
                target_flag = 1 if len(args) == 2 else 0
                target = args[1] if target_flag else None

                json = {
                    "Y_target": fields[1],
                    "X_fields": fields[0],
                    "labels": labels,
                    "bar": results["scores"],
                    "general_info": {
                        "Selected Features":
                        "%s out of %s" %
                        (len(list(filter(lambda x: x is True,
                                         labels))), len(fields[0])),
                        "Selected Fields":
                        " ".join(
                            str(el)
                            for el in list(compress(fields[0], labels))),
                        "Number of NaN":
                        len(nan_index)
                    },
                    "scatter": {
                        "y_domain":
                        target,
                        "x_domain":
                        data,
                        "pearsonr": [
                            pearsonr(el, target)[0] if target_flag else None
                            for el in data
                        ],
                        # "mic": [MINE(alpha=0.6, c=15, est="mic_approx").compute_score(el,
                        # list(data[0]).mic()) for el in list(data[1:])]}
                        "mic": [None for el in data]
                    },
                    "category": toolkit_obj.category
                }

            # numeric transformation
            elif toolkit_obj.category == 2:
                # find the first row index that is not NaN
                inn = 0
                while inn in nan_index:
                    inn = inn + 1
                # output formats are inconsistent, so check whether the
                # data is 2-D (yes => 1, no => 0)
                flag_shape = 1 if isinstance(labels[inn], list) else 0

                result_be = labels if flag_shape else np.array(labels).reshape(
                    [-1, 1]).tolist()

                data = list(zip(*args[0]))
                result = list(zip(*result_be))

                # the two tables used to be merged; no longer needed
                # merge_data = list(zip(*(data + result)))
                if len(result) == len(fields[0]):
                    lab_fields = [
                        str(fields[0][i]) + "_New_Col"
                        for i in range(len(result))
                    ]
                else:
                    lab_fields = [
                        str(fields[0][0]) + "_New_Col_" + str(i)
                        for i in range(len(result))
                    ]

                # merge_fields = fields[0] + lab_fields

                flag_str1 = isinstance(args[0][inn][0], str)
                flag_str2 = isinstance(result_be[inn][0], str)
                bar1 = []
                bar2 = []
                # histogram per original field: frequency histogram for
                # numeric data with many distinct values, else a value count
                for el in fields[0]:
                    indx = fields[0].index(el)
                    raw_d = data[indx]

                    if not flag_str1 and len(set(raw_d)) > 5:
                        bar1_tmp = visualization_service.freq_hist(raw_d)
                    else:
                        seta = set(raw_d)
                        x_domain = [el for el in seta]
                        y_domain = [raw_d.count(el) for el in seta]
                        bar1_tmp = {'x_domain': x_domain, 'y_domain': y_domain}
                    bar1_tmp.update({"field": el, "title": "数据分布直方图(栏位转换前)"})
                    bar1.append(bar1_tmp)

                # same histogram logic for the transformed columns
                for el in lab_fields:
                    indx = lab_fields.index(el)
                    raw_re = result[indx]

                    if not flag_str2 and len(set(raw_re)) > 5:
                        bar2_tmp = visualization_service.freq_hist(raw_re)
                    else:
                        seta = set(raw_re)
                        x_domain = [el for el in seta]
                        y_domain = [raw_re.count(el) for el in seta]
                        bar2_tmp = {'x_domain': x_domain, 'y_domain': y_domain}
                    bar2_tmp.update({"field": el, "title": "数据分布直方图(栏位转换后)"})
                    bar2.append(bar2_tmp)

                json = {
                    "category": toolkit_obj.category,
                    "table1": {
                        "title": "原始数据",
                        "field": fields[0],
                        "data": [dict(zip(fields[0], arr)) for arr in args[0]]
                    },
                    "table2": {
                        "title": "转换后数据",
                        "field": lab_fields,
                        "data":
                        [dict(zip(lab_fields, arr)) for arr in result_be]
                    },
                    "bar1": bar1,
                    "bar2": bar2
                }

            # dimensionality reduction
            elif toolkit_obj.category == 3:
                flag = toolkit_obj.parameter_spec["data"]["type"][
                    "key"] == "transfer_box"
                data = list(zip(*args[0]))

                if flag:
                    data.append(args[1])
                lab = list(zip(*labels))
                lab_fields = ["New Col" + str(i) for i in range(len(lab))]
                # per-column variances before and after reduction, shown
                # side by side in the bar chart separated by "_empty"
                var1 = [np.var(da) for da in data]
                var2 = [np.var(da) for da in lab]
                merge_fields = fields[0] + fields[1] if fields[1] else \
                    fields[0]
                x_domain = merge_fields + ["_empty"] + lab_fields
                y_domain = var1 + [0] + var2

                temp = var1[:-1] if flag else var1
                json = {
                    "table1": {
                        "X_fields":
                        fields[0],
                        "Y_fields":
                        fields[1],
                        "data": [
                            dict(zip(merge_fields, arr))
                            for arr in list(zip(*data))
                        ]
                    },
                    "table2": {
                        "data": [dict(zip(lab_fields, arr)) for arr in labels],
                        "fields": lab_fields
                    },
                    "bar": {
                        "x_domain": x_domain,
                        "y_domain": y_domain
                    },
                    "pie1": [{
                        "name": fields[0][i],
                        "value": temp[i]
                    } for i in range(len(temp))],
                    "pie2": [{
                        "name": lab_fields[i],
                        "value": var2[i]
                    } for i in range(len(var2))],
                    "general_info":
                    gen_info,
                    "category":
                    toolkit_obj.category
                }

            else:
                json = {}

            # update a job
            job_business.end_job(job_obj)

            if result_spec["if_reserved"]:
                # create result sds for toolkit
                sds_name = '%s_%s_result' % (toolkit_obj['name'],
                                             job_obj['id'])
                result_sds_obj = staging_data_set_business.add(sds_name,
                                                               'des',
                                                               project_obj,
                                                               job=job_obj,
                                                               type='result')
                logger_service.save_result(
                    result_sds_obj,
                    **{"result": json_utility.convert_to_json(results)})
                logger_service.save_result(result_sds_obj,
                                           **{"visualization": json})
                return {
                    "visual_sds_id": str(result_sds_obj.id) if json else None,
                    "result": results
                }

            return {"result": results}
Ejemplo n.º 4
0
def add_staging_data_set_by_data_set_id(sds_name, sds_description, project_id,
                                        data_set_id):
    """
    Create a staging_data_set and copy staging_data into it from the
    original data_set identified by *data_set_id*.

    :param sds_name: str
    :param sds_description: str
    :param project_id: ObjectId
    :param data_set_id: ObjectId
    :return: new staging_data_set object
    """
    # create the new staging data set, carrying over the data set's extra
    # attributes; its own name/description are replaced by the new ones
    data_set = data_set_business.get_by_id(data_set_id)
    extra = data_set.to_mongo()
    extra.pop('name')
    extra.pop('description')
    sds = staging_data_set_business.add(sds_name, sds_description, project_id,
                                        **extra)

    # update project info
    # note: related_field in the data set becomes related_fields here
    project_business.update_items_to_list_field(
        project_id,
        tags=extra.get('tags', []),
        related_tasks=extra.get('related_tasks', []),
        related_fields=extra.get('related_field', []))

    # make sure the owning user's project volume directory exists
    project = project_business.get_by_id(project_id)
    owner_id = ownership_business.get_owner(project, 'project').user_ID
    volume_dir = os.path.join(USER_DIR, owner_id, project.name, 'volume/')
    if not os.path.exists(volume_dir):
        os.makedirs(volume_dir)

    try:
        # copy the backing file (or directory) into the project volume
        if hasattr(data_set, 'file') and data_set.file:
            file = data_set.file
            if not os.path.isdir(file.uri):
                shutil.copy(file.uri, volume_dir)
            else:
                dst = os.path.join(volume_dir, os.path.dirname(file.uri))
                # copytree creates dst itself, so drop any existing one first
                if os.path.exists(dst):
                    shutil.rmtree(dst)
                shutil.copytree(file.uri, dst)

        # fetch all raw rows of the data set and convert the mongoengine
        # objects to plain dicts
        rows = json_utility.me_obj_list_to_dict_list(
            data_business.get_by_data_set(data_set_id))

        # drop the old data_set reference before importing into the sds
        for row in rows:
            row.pop('data_set')

        if rows:
            staging_data_business.add_many(sds, rows)
        return sds
    except Exception as e:
        # roll back: remove the new staging_data_set and its staging_data
        staging_data_business.remove_by_staging_data_set_id(sds.id)
        staging_data_set_business.remove_by_id(sds.id)
        raise e