def convert_fields_type(sds_id, f_t_arrays):
    """
    convert field types of a staging data set

    :param sds_id: ObjectId
    :param f_t_arrays: array: [['name', 'str'], ['age', 'int'],
                               ['salary', 'float']]
    :return: new staging_data_set object
    """
    # get project object
    # project = project_business.get_by_id(project_id)

    # get the staging data set to convert
    sds = staging_data_set_business.get_by_id(sds_id)

    # get all data objects belonging to this staging data set
    data_objects = staging_data_business.\
        get_by_staging_data_set_id(sds['id'])

    # convert mongoengine objects to dicts
    data_objects = json_utility.me_obj_list_to_dict_list(data_objects)

    # convert types of values in dicts
    result = data_utility.convert_data_array_by_fields(data_objects,
                                                       f_t_arrays)
    data_objects = result['result']

    # update all rows
    for data_obj in data_objects:
        staging_data_business.update_by_id(data_obj['_id'], data_obj)

    if 'failure_count' in result:
        failure_count = result['failure_count']
        return {'result': sds, 'failure_count': failure_count}
    return {'result': sds}
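A minimal usage sketch, assuming this module is importable; the ObjectId below is a placeholder, not a real record:

from bson import ObjectId

# Hypothetical call: cast two columns of an existing staging data set.
ret = convert_fields_type(ObjectId('0123456789ab0123456789ab'),
                          [['age', 'int'], ['salary', 'float']])
print(ret['result'], ret.get('failure_count', 0))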
def objective(args):
    # `params`, `input` and `custom_model` come from the enclosing scope;
    # the staging data set id is hard-coded for this experiment
    steps = args
    params['fit']['args']['steps'] = steps
    sds = staging_data_set_business.get_by_id('595cb76ed123ab59779604c3')
    result = custom_model(params, linear_regressor_model_fn, input,
                          result_sds=sds)
    return result['eval_metrics']['loss']
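This objective has the single-argument shape expected by a hyperparameter tuner such as hyperopt's fmin; a sketch of how it could be driven (the search space and the fmin call are assumptions, not part of the original code):

from hyperopt import fmin, tpe, hp

# Minimize eval loss over the number of training steps; quniform samples
# multiples of 50 between 100 and 1000.
best = fmin(fn=objective,
            space=hp.quniform('steps', 100, 1000, 50),
            algo=tpe.suggest,
            max_evals=20)
print(best)  # e.g. {'steps': 450.0}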
def usr2_visualization():
    data = request.get_json()
    staging_data_set_id = data.get('staging_data_set_id')
    sds_data = staging_data_set_business.get_by_id(
        ObjectId(staging_data_set_id))
    data = sds_data.visualization
    job_obj = sds_data.job
    visual_type = job_obj.toolkit.category
    staging_data_set_id = job_obj.steps[0]["args"][0]["value"]
    result = visualization_service.usr_story2_exploration(
        data, visual_type, staging_data_set_id)
    return jsonify({'response': json_utility.convert_to_json(result)}), 200
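This is a Flask view that reads JSON from the request body; a client-side sketch (the route path and host are guesses, the id is a placeholder):

import requests

# Hypothetical client call against a locally running server.
resp = requests.post('http://localhost:5000/visualization/usr2',
                     json={'staging_data_set_id': '595cb76ed123ab59779604c3'})
print(resp.json()['response'])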
def model_input_manager_unstructured(conf, data_set_id, **kwargs):
    file = staging_data_set_business.get_by_id(data_set_id).file
    print(file['uri'])
    input = {
        'train_data_dir': file['uri'] + 'train/',
        'validation_data_dir': file['uri'] + 'validation/'
    }
    # count samples by walking the train/validation directory trees
    input['nb_train_samples'] = sum(
        [len(files) for r, d, files in os.walk(input['train_data_dir'])])
    input['nb_validation_samples'] = sum(
        [len(files) for r, d, files in os.walk(input['validation_data_dir'])])
    return input
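The function assumes file['uri'] points at a folder with train/ and validation/ subfolders; a usage sketch (the directory layout and the id are assumptions):

# Assumed layout under file['uri']:
#   <uri>/train/<class_name>/...
#   <uri>/validation/<class_name>/...
input_dict = model_input_manager_unstructured({}, 'placeholder_data_set_id')
print(input_dict['nb_train_samples'], input_dict['nb_validation_samples'])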
"split_after_samples": 20 } }, 'fit': { "args": { "steps": 300 } }, 'evaluate': { 'args': { 'steps': 1 } } } sds = staging_data_set_business.get_by_id('595cb76ed123ab59779604c3') from server3.lib.models.randomforest import random_forest_model_fn result = custom_model(params, random_forest_model_fn, input, result_sds=sds) print(result) # # # # 测试 logistic_regressor # # input = { # 'model_name': 'logistic_regressor', # 'df_features': iris_feature, # 'df_labels': iris_label, # } # params = { # 'estimator': { # 'args': {
def wrapper(*args, **kw):
    # create a job
    staging_data_set_obj = staging_data_set_business.get_by_id(
        staging_data_set_id)
    project_obj = project_business.get_by_id(project_id)
    job_spec = {
        "fields": {
            "source": fields[0],
            "target": fields[1]
        },
        "params": kw
    }
    job_obj = job_business.add_toolkit_job(toolkit_obj,
                                           staging_data_set_obj,
                                           project_obj, **job_spec)
    # update the project
    project_business.insert_job_by_id(project_id, job_obj.id)

    # calculate
    func_rst = func(*args, **kw)
    result = list(func_rst) if isinstance(func_rst, tuple) else [func_rst]

    # newly designed result-storage scheme
    results = {"fields": {"source": fields[0], "target": fields[1]}}
    gen_info = []
    result_spec = toolkit_obj.result_spec
    for arg in result_spec["args"]:
        value = result.pop(0)
        results.update({arg["name"]: value})
        if arg["if_add_column"]:
            # Chinese column names cannot be used here
            str_name = "%s_col" % toolkit_obj.entry_function
            value = data_utility.retrieve_nan_index(value, nan_index)
            try:
                staging_data_service.update_many_with_new_fields(
                    value, nan_index, fields[0], str_name,
                    staging_data_set_id)
            except (TypeError, ValueError):
                print("ERROR: failed to save data to database")
        if arg.get("attribute", False) and arg["attribute"] == "label":
            labels = value
        elif arg.get("attribute", False) and \
                arg["attribute"] == "general_info":
            gen_info.append({
                arg["name"]: {
                    "value": value,
                    "description": arg["des"]
                }
            })

    # build the visualization payload
    # clustering
    if toolkit_obj.category == 0:
        json = {
            "scatter": data_utility.retrieve_nan_index(args[0],
                                                       nan_index),
            "labels": labels,
            "pie": [{'name': el, 'value': labels.count(el)}
                    for el in set(labels)],
            "centers": results["Centroids of Clusters"],
            "general_info": gen_info,
            "fields": fields[0],
            "category": toolkit_obj.category
        }

    # feature selection
    elif toolkit_obj.category == 1:
        from scipy.stats import pearsonr
        # from minepy import MINE
        data = list(zip(*args[0]))
        target_flag = 1 if len(args) == 2 else 0
        target = args[1] if target_flag else None
        json = {
            "Y_target": fields[1],
            "X_fields": fields[0],
            "labels": labels,
            "bar": results["scores"],
            "general_info": {
                "Selected Features": "%s out of %s" % (
                    len(list(filter(lambda x: x is True, labels))),
                    len(fields[0])),
                "Selected Fields": " ".join(
                    str(el) for el in list(compress(fields[0], labels))),
                "Number of NaN": len(nan_index)
            },
            "scatter": {
                "y_domain": target,
                "x_domain": data,
                "pearsonr": [pearsonr(el, target)[0] if target_flag
                             else None for el in data],
                # "mic": [MINE(alpha=0.6, c=15, est="mic_approx")
                #         .compute_score(el, list(data[0]).mic())
                #         for el in list(data[1:])]
                "mic": [None for el in data]
            },
            "category": toolkit_obj.category
        }

    # numeric transformation
    elif toolkit_obj.category == 2:
        inn = 0
        while inn in nan_index:
            inn = inn + 1
        # output formats differ between toolkits, so check whether the
        # result is two-dimensional (yes => 1, no => 0)
        flag_shape = 1 if isinstance(labels[inn], list) else 0
        result_be = labels if flag_shape else np.array(labels).reshape(
            [-1, 1]).tolist()
        data = list(zip(*args[0]))
        result = list(zip(*result_be))
        # the two tables used to be merged; that is no longer needed
        # merge_data = list(zip(*(data + result)))
        if len(result) == len(fields[0]):
            lab_fields = [str(fields[0][i]) + "_New_Col"
                          for i in range(len(result))]
        else:
            lab_fields = [str(fields[0][0]) + "_New_Col_" + str(i)
                          for i in range(len(result))]
        # merge_fields = fields[0] + lab_fields
        flag_str1 = isinstance(args[0][inn][0], str)
        flag_str2 = isinstance(result_be[inn][0], str)
        bar1 = []
        bar2 = []
        for el in fields[0]:
            indx = fields[0].index(el)
            raw_d = data[indx]
            if not flag_str1 and len(set(raw_d)) > 5:
                bar1_tmp = visualization_service.freq_hist(raw_d)
            else:
                seta = set(raw_d)
                x_domain = list(seta)
                y_domain = [raw_d.count(e) for e in seta]
                bar1_tmp = {'x_domain': x_domain, 'y_domain': y_domain}
            bar1_tmp.update({"field": el,
                             "title": "数据分布直方图(栏位转换前)"})
            bar1.append(bar1_tmp)
        for el in lab_fields:
            indx = lab_fields.index(el)
            raw_re = result[indx]
            if not flag_str2 and len(set(raw_re)) > 5:
                bar2_tmp = visualization_service.freq_hist(raw_re)
            else:
                seta = set(raw_re)
                x_domain = list(seta)
                y_domain = [raw_re.count(e) for e in seta]
                bar2_tmp = {'x_domain': x_domain, 'y_domain': y_domain}
            bar2_tmp.update({"field": el,
                             "title": "数据分布直方图(栏位转换后)"})
            bar2.append(bar2_tmp)
        json = {
            "category": toolkit_obj.category,
            "table1": {
                "title": "原始数据",
                "field": fields[0],
                "data": [dict(zip(fields[0], arr)) for arr in args[0]]
            },
            "table2": {
                "title": "转换后数据",
                "field": lab_fields,
                "data": [dict(zip(lab_fields, arr)) for arr in result_be]
            },
            "bar1": bar1,
            "bar2": bar2
        }

    # dimensionality reduction
    elif toolkit_obj.category == 3:
        flag = toolkit_obj.parameter_spec["data"]["type"][
                   "key"] == "transfer_box"
        data = list(zip(*args[0]))
        if flag:
            data.append(args[1])
        lab = list(zip(*labels))
        lab_fields = ["New Col" + str(i) for i in range(len(lab))]
        var1 = [np.var(da) for da in data]
        var2 = [np.var(da) for da in lab]
        merge_fields = fields[0] + fields[1] if fields[1] else fields[0]
        x_domain = merge_fields + ["_empty"] + lab_fields
        y_domain = var1 + [0] + var2
        temp = var1[:-1] if flag else var1
        json = {
            "table1": {
                "X_fields": fields[0],
                "Y_fields": fields[1],
                "data": [dict(zip(merge_fields, arr))
                         for arr in list(zip(*data))]
            },
            "table2": {
                "data": [dict(zip(lab_fields, arr)) for arr in labels],
                "fields": lab_fields
            },
            "bar": {
                "x_domain": x_domain,
                "y_domain": y_domain
            },
            "pie1": [{"name": fields[0][i], "value": temp[i]}
                     for i in range(len(temp))],
            "pie2": [{"name": lab_fields[i], "value": var2[i]}
                     for i in range(len(var2))],
            "general_info": gen_info,
            "category": toolkit_obj.category
        }
    else:
        json = {}

    # finish the job
    job_business.end_job(job_obj)

    if result_spec["if_reserved"]:
        # create a result sds for the toolkit
        sds_name = '%s_%s_result' % (toolkit_obj['name'], job_obj['id'])
        result_sds_obj = staging_data_set_business.add(sds_name, 'des',
                                                       project_obj,
                                                       job=job_obj,
                                                       type='result')
        logger_service.save_result(
            result_sds_obj,
            **{"result": json_utility.convert_to_json(results)})
        logger_service.save_result(result_sds_obj,
                                   **{"visualization": json})
        return {
            "visual_sds_id": str(result_sds_obj.id) if json else None,
            "result": results
        }
    return {"result": results}
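wrapper is the inner function of a toolkit decorator; it closes over func, fields, nan_index, toolkit_obj, project_id and staging_data_set_id from an enclosing factory. A minimal sketch of that enclosing shape (the factory name and signature are assumptions, shown only to make the closure explicit):

# Hypothetical factory illustrating where wrapper's free variables come from;
# the real one lives elsewhere in the service layer.
def run_toolkit(toolkit_obj, project_id, staging_data_set_id,
                fields, nan_index):
    def decorator(func):
        def wrapper(*args, **kw):
            ...  # body as above: create job, run func, build the visual json
        return wrapper
    return decorator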
def model_to_code(conf, project_id, data_source_id, model_id, job_obj,
                  **kwargs):
    """
    generate runnable code for the model given by model_id and the
    parameter config, then execute it as a job

    :param conf:
    :param project_id:
    :param data_source_id:
    :param model_id:
    :param kwargs:
    :return:
    """
    file_id = kwargs.get('file_id')
    staging_data_set_obj = None
    if data_source_id:
        staging_data_set_obj = \
            staging_data_set_business.get_by_id(data_source_id)
    project_obj = project_business.get_by_id(project_id)
    file_dict = {'file': ObjectId(file_id)} if file_id else {}
    model_obj = model_business.get_by_model_id(model_id)
    run_args = {
        "conf": conf,
        "project_id": project_id,
        "data_source_id": data_source_id,
        "model_id": model_id,
        "kwargs": kwargs
    }
    # create model job
    # job_obj = job_business.add_model_job(model_obj, staging_data_set_obj,
    #                                      project_obj, params=conf,
    #                                      run_args=run_args,
    #                                      **file_dict)
    job_obj = job_business.update_job_by_id(
        job_obj.id, model=model_obj,
        staging_data_set=staging_data_set_obj, project=project_obj,
        params=conf, run_args=run_args, status=100)
    job_id = str(job_obj.id)
    # model_obj = model_business.get_by_model_id(model_id)
    f = getattr(models, model_obj.to_code_function)

    if model_obj['category'] == 0:
        # keras nn
        head_str = manage_supervised_input_to_str(conf, data_source_id,
                                                  **kwargs)
        return job_service.run_code(conf, project_id, data_source_id,
                                    model_obj, f, job_id, head_str)
    elif model_obj['category'] == ModelType['unstructured']:
        # input from folder
        head_str = manage_unstructured_to_str(conf, data_source_id,
                                              **kwargs)
        return job_service.run_code(conf, project_id, None, model_obj, f,
                                    job_id, head_str,
                                    file_id=data_source_id)
    elif model_obj['category'] == ModelType['advanced']:
        # no input
        return job_service.run_code(conf, project_id, None, model_obj, f,
                                    job_id, '', file_id=None)
    else:
        # custom models
        head_str = ''
        head_str += 'import logging\n'
        head_str += 'import numpy as np\n'
        head_str += 'import pandas as pd\n'
        head_str += 'import tensorflow as tf\n'
        head_str += 'from tensorflow.python.framework import constant_op\n'
        head_str += 'from tensorflow.python.framework import dtypes\n'
        head_str += 'from tensorflow.contrib.learn.python.learn ' \
                    'import metric_spec\n'
        head_str += 'from server3.lib import models\n'
        head_str += 'from server3.lib.models.modified_tf_file.monitors ' \
                    'import ValidationMonitor\n'
        head_str += 'from server3.business import staging_data_set_business\n'
        head_str += 'from server3.business import staging_data_business\n'
        head_str += 'from server3.service import staging_data_service\n'
        head_str += "from server3.service import job_service\n"
        head_str += 'from server3.service.model_service import ' \
                    'split_categorical_and_continuous\n'
        head_str += 'from server3.service.custom_log_handler ' \
                    'import MetricsHandler\n'
        head_str += 'model_fn = models.%s\n' % model_obj.entry_function
        head_str += "data_source_id = '%s'\n" % data_source_id
        head_str += "model_name = '%s'\n" % model_obj.name
        head_str += "kwargs = %s\n" % kwargs
        fit = conf.get('fit', None)
        if model_obj['category'] == 1:
            data_fields = fit.get('data_fields', [[], []])
            head_str += 'data_fields = %s\n' % data_fields
            head_str += inspect.getsource(
                model_input_manager_custom_supervised)
            head_str += "input_dict = model_input_manager_custom_supervised(" \
                        "data_fields, data_source_id, model_name, **kwargs)\n"
        elif model_obj['category'] == 2:
            x_cols = fit.get('data_fields', [])
            head_str += "x_cols = %s\n" % x_cols
            head_str += inspect.getsource(model_input_manager_unsupervised)
            head_str += "input_dict = model_input_manager_unsupervised(" \
                        "x_cols, data_source_id, model_name)\n"
        return job_service.run_code(conf, project_id, data_source_id,
                                    model_obj, f, job_id, head_str)
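The custom-model branch assembles head_str as literal source text, inlining helper functions with inspect.getsource so the generated script is self-contained; a toy sketch of the same code-generation pattern (helper and the exec driver are illustrative, not part of the module):

import inspect

def helper(x):
    # stand-in for model_input_manager_*; its source gets inlined verbatim
    return x * 2

head_str = inspect.getsource(helper)
head_str += "result = helper(21)\n"
ns = {}
exec(head_str, ns)  # run the generated script in a fresh namespace
print(ns['result'])  # 42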
def kube_run_model(conf, project_id, data_source_id, model_id, job_obj,
                   **kwargs):
    # file_id = kwargs.get('file_id')
    staging_data_set_obj = None
    if data_source_id:
        staging_data_set_obj = \
            staging_data_set_business.get_by_id(data_source_id)
    project_obj = project_business.get_by_id(project_id)
    # file_dict = {'file': ObjectId(file_id)} if file_id else {}
    model_obj = model_business.get_by_model_id(model_id)
    run_args = {
        "conf": conf,
        "project_id": project_id,
        "data_source_id": data_source_id,
        "model_id": model_id,
        "kwargs": kwargs
    }
    job_obj = job_business.update_job_by_id(
        job_obj.id, model=model_obj,
        staging_data_set=staging_data_set_obj, project=project_obj,
        params=conf, run_args=run_args, status=100)
    job_id = str(job_obj.id)
    print(job_id)
    # NOTE: this early return runs the model in-process and makes the
    # Kubernetes dispatch below unreachable; remove it to submit the job
    # to the cluster instead
    return run_model(conf, project_id, data_source_id, model_id, job_id,
                     **kwargs)

    cwd = os.getcwd()
    job_name = job_id + '-training-job'
    client = kube_service.client
    try:
        # TODO need to terminate running pod
        kube_service.delete_job(job_name)
        while True:
            kube_service.get_job(job_name)
            time.sleep(1)
    except client.rest.ApiException:
        print('job not exists or deleted, ok to create')
    kube_json = {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {
            "name": job_name
        },
        "spec": {
            "template": {
                "metadata": {
                    "labels": {
                        "app": job_id
                    }
                },
                "spec": {
                    "containers": [
                        {
                            "name": job_id,
                            "image": "10.52.14.192/gzyw/model_app_pre",
                            "imagePullPolicy": "IfNotPresent",
                            "securityContext": {
                                "privileged": True,
                            },
                            "stdin": True,
                            "command": ["/usr/local/bin/python"],
                            "args": [
                                "run_model.py",
                                "--job_id", job_id
                            ],
                            "volumeMounts": [
                                {
                                    "mountPath": "/pyserver/user_directory",
                                    "name": "nfsvol"
                                },
                            ]
                        }
                    ],
                    "restartPolicy": "Never",
                    # "activeDeadlineSeconds": 1,
                    "volumes": [
                        {
                            "name": "nfsvol",
                            "persistentVolumeClaim": {
                                "claimName": "nfs-pvc"
                            }
                        },
                    ]
                },
            },
        }
    }
    # file_utils.write_to_filepath(json.dumps(kube_json), './model_app.json')
    # return
    api = kube_service.job_api
    resp = api.create_namespaced_job(body=kube_json, namespace=NAMESPACE)
    print("Job created. status='%s'" % str(resp.status))
    return {'job_id': job_id}
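After create_namespaced_job, a caller would typically poll the Job until it finishes; a sketch using the official kubernetes Python client (the polling loop is an assumption; job_name and NAMESPACE are taken from the function above):

import time
from kubernetes import client, config

config.load_kube_config()
batch = client.BatchV1Api()
while True:
    # read_namespaced_job_status returns a V1Job; .status carries the counts
    status = batch.read_namespaced_job_status(job_name, NAMESPACE).status
    if status.succeeded or status.failed:
        print("job finished, succeeded=%s" % status.succeeded)
        break
    time.sleep(5)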