def init_source(experiment_id):
    """Initialize the data an algorithm run needs, caching the result in redis.

    :param experiment_id: experiment id
    :return: rest response wrapping the binning-initialization output
    """
    # Serve from cache when a previous initialization exists.
    out = redis_util.load('experiment_' + experiment_id)
    if out is not None:
        return rest.responseto(out)
    experiment = experiment_service.get_experiment(experiment_id)
    source = source_service.get_source(experiment.source_id)
    # Variables the user marked as used, including the target.
    selected_variable = variable_service.get_selected_variables(source.id)
    selected = selected_variable['used']
    selected_list = map(lambda x: x.variable_name, selected)
    target = selected_variable['target'].variable_name
    # Parse the uploaded file into a dataframe according to its type.
    data_frame = None
    if source.file_type == DBConstant.SOURCE_FILE_TYPE_CSV:
        data_frame = pandas.read_csv(source.file_path)
    elif source.file_type == DBConstant.SOURCE_FILE_TYPE_EXCEL:
        # BUG FIX: the original compared the `source_service` module against the
        # file-type constant, so excel sources never reached read_excel.
        data_frame = pandas.read_excel(source.file_path)
    # Variable binning initialization.
    out = lr_service.get_init(data_frame, valid=selected_list, target=target, fineMinLeafRate=0)
    redis_util.dump('experiment_' + experiment_id, out)
    return rest.responseto(out)
def if_applyed():
    """Report whether woe has already been applied to the train file;
    checked before variables are picked for the ks computation."""
    # `safely_apply` is the module-level flag toggled by the apply step.
    if not safely_apply:
        return rest.responseto(success=False)
    return rest.responseto(success=True)
def create_model_name():
    """Create a new model entry on the master branch unless the name is taken."""
    model_name = request.args.get("model_name")
    existing = tool_model_service.load_model(model_name=model_name, model_branch="master")
    if existing is not None:
        return rest.responseto("name exist", success=False)
    if not tool_model_service.create_branch():
        return rest.responseto("create fail", success=False)
    return rest.responseto({
        "model_name": model_name,
        "model_branch": "master"
    })
def parse():
    """Convert and analyse the training file for a model/branch.

    Reads modelName/branch/filePath/target from the form, loads (or reuses a
    cached) dataframe split into train/test by `dev_ind`, and returns summary
    data plus model metadata for the front end.
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    # File path supplied by the user, relative to the configured root.
    file_path = request.form.get("filePath")
    target = request.form.get("target")
    root_path = app.config["ROOT_PATH"]
    path = root_path + "/" + file_path
    # Model name + branch name form the unique cache key.
    key = model_name + "_" + branch
    # df_train = None
    # The rest of the flow only makes sense if the path actually exists.
    if os.path.exists(path):
        # Check whether this resource was already loaded into memory.
        if global_value.has_key(key) is False:
            # Reload the resource from disk.
            df_all = pd.read_excel(path)
            df_train = df_all[df_all['dev_ind'] == 1]
            df_test = df_all[df_all['dev_ind'] == 0]
            df_map = {
                model_name + "_" + branch: {
                    "df_all": df_all,
                    "df_train": df_train,
                    "df_test": df_test
                }
            }
            global_value.set_value(**df_map)
        else:
            df_map = global_value.get_value(key)
            df_train = df_map['df_train']
        df = ba.get_df_summary(df_train)
        # Turn the df_train summary into data the front end can render.
        data_map = cmm.df_for_html(df)
        result = tool_model_service.load_model(model_name=model_name, model_branch=branch)
        branches = []
        # NOTE(review): assumes load_model returns at least one row here — confirm.
        v = result[0]
        for n in result:
            branches.append(n.model_branch)
        # data_map["current_model"] = model_name
        data_map["branches"] = branches
        data_map["selected_list"] = v.selected_list
        data_map["target"] = v.model_target
        return rest.responseto(data=data_map)
    else:
        return rest.responseto(message="file not exist", success=False)
def get_projects(user_id):
    """Return every project owned by the given user.

    :param user_id: the owner's id
    :return: rest response serialized with the SQLAlchemy-aware encoder
    """
    user_projects = project_service.get_projects(user_id)
    return rest.responseto(user_projects, cls=tool_model.AlchemyEncoder)
def divide_manually():
    """Re-bin a variable using boundaries typed in by the user.

    Form params: boundary (comma separated; categorical groups joined by '|'
    inside each chunk), variable_name, branch, model_name,
    type ("true" => categorical).
    :return: rest response with the adjusted binning table and its IV
    """
    boundary = request.form.get("boundary")
    variable_name = request.form.get("variable_name")
    branch = request.form.get("branch")
    model_name = request.form.get("model_name")
    type = request.form.get("type")
    df_map = global_value.get_value(model_name + "_" + branch)
    df_train = df_map['df_train']
    boundary_list = []
    if type == "true":
        # Categorical: each comma-separated chunk is a group of values joined by '|'.
        for s in boundary.split(","):
            temp = []
            temp.extend(map(cmm.transfer, s.split("|")))
            boundary_list.append(temp)
        columns = ['bin_num', variable_name, 'bads', 'goods', 'total', 'total_perc',
                   'bad_rate', 'woe', 'type']
    else:
        # Numerical: plain float cut points.
        for s in boundary.split(","):
            boundary_list.append(float(s))
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary', 'bads', 'goods',
                   'total', 'total_perc', 'bad_rate', 'woe', 'type']
    # BUG FIX: load_model rows are model objects, accessed with `.model_target`
    # everywhere else in this file — subscripting with ["model_target"] would
    # raise TypeError on a model instance.
    target = tool_model_service.load_model(model_name=model_name,
                                           model_branch=branch)[0].model_target
    result = bf.adjust_bin(df_train, type == "true", variable_name, boundary_list,
                           target=target, expected_column={variable_name})
    iv = result['IV'].sum()
    df = pd.DataFrame(result, columns=columns)
    data = generate_response(variable_name, df, iv)
    return rest.responseto(data=data)
def user_register():
    """User registration endpoint.

    :param email: user email (form field)
    :param password: user password (form field)
    :param nick_name: user nickname (form field)
    :return: whether registration succeeded
    """
    email = request.form.get("email")
    password = request.form.get("password")
    nick_name = request.form.get("nick_name")
    # Email and password are mandatory.
    if email is None or password is None:
        raise Error.USER_LACK_NECESSARY_INFO
    user = tool_model.User()
    user.user_email = email
    # Store the MD5 digest of the password, never the plain text.
    user.user_password = cmm.to_md5(password)
    user.user_nick = nick_name
    created = user_service.add_user(user)
    return rest.responseto(created is not None)
def get_res(key):
    """Search the elasticsearch index named by `key` and relay the raw hits."""
    search_url = es_host + key + "/_search?pretty"
    payload = json.dumps({"size": 5000})
    resp = requests.post(search_url, data=payload)
    return rest.responseto(data=json.loads(resp.text))
def init():
    """Initialize the binning record.

    :return: dict shaped as:
        {variable_name1
            -var_table
            -var_params(province,goods,bads...)
            -is_selected
            -iv
         variable_name2...}
    """
    name = request.form.get("modelName")
    branch = request.form.get("branch")
    # Parameters of the previously uploaded file, cached in memory by key.
    df_map = global_value.get_value(name + "_" + branch)
    result = tool_model_service.load_model(model_name=name, model_branch=branch)
    # selected_list is stored as json in the db; as a python dict it maps
    # select_variable -> index (position of the variable).
    selected_list = json.loads(result[0].selected_list).keys()
    min_val = 0
    init_result = get_init(df_map['df_train'],
                           target=result[0].model_target,
                           valid=selected_list)
    # Derive each variable's intervals from init_result, then order by iv.
    boundaries = get_boundary(init_result, min_val)
    return rest.responseto(data=sort_iv(boundaries))
def rank():
    """Sort the posted binning data by iv (triggered by a UI button).

    :return: same shape as init()
    """
    payload = request.form.get("data")
    return rest.responseto(data=sort_iv(json.loads(payload)))
def login():
    """User login: check that email and password match.

    :return: authentication result
    """
    form = request.form
    auth_result = user_service.auth_user(form.get('email'), form.get('password'))
    return rest.responseto(auth_result)
def handle_error(e):
    """Global exception handler.

    :param e: the intercepted exception
    :return: the exception rendered as a failed rest response
    """
    # Business exceptions carry their own code; anything else maps to 500.
    code = e.code if isinstance(e, HoneybeeException) else 500
    log.error(e, exc_info=1)
    return rest.responseto(None, message=e.message, code=code, success=False)
def save():
    """Persist the posted binning records for a model/branch, replacing any
    previous records for that pair.

    :return: success flag
    """
    model_name = request.values.get("model_name")
    branch = request.values.get("branch")
    data = request.values.get("data")
    # Renamed from `dict`/`list`: never shadow builtins.
    record_map = json.loads(data)
    tool_model_service.del_binning_record(model_name, branch)
    records = []
    for key, val in record_map.items():
        records.append(ModelContent(model_name=model_name, model_branch=branch,
                                    variable_name=key, variable_iv=val["iv"],
                                    binning_record=json.dumps(val["var_table"],
                                                              ensure_ascii=False),
                                    is_selected=val["is_selected"]))
    if tool_model_service.save_binning_record(records) is not True:
        return rest.responseto(success=False)
    return rest.responseto()
def init_model_name():
    """List distinct model names for the training-file variable init page,
    shown to the user for selection.

    :return: [model_name1,model_name2,model_name3]
    """
    rows = tool_model_service.load_model(is_deleted=0)
    # Keep only distinct model names from the result.
    names = set()
    for row in rows:
        names.add(row.model_name)
    return rest.responseto(list(names))
def get_branch_name():
    """Return the branch names recorded for a model.

    :param: model_name (query arg "modelName")
    :return: list of branch names
    """
    model_name = request.args.get("modelName")
    rows = tool_model_service.load_model(model_name=model_name)
    branch_names = [row.model_branch for row in rows]
    return rest.responseto(branch_names)
def upload(): """工具依赖的源文件修改""" # 在跨域的情况下,前端会发送OPTIONS请求进行试探,然后再发送POST请求 if request.method == 'POST': # 获取training文件上传的路径 storage = app.config['ROOT_PATH'] files = request.files.getlist("file[]") for file in files: from unicodedata import normalize filename = normalize('NFKD', file.filename).encode('utf-8', 'ignore') file_path = storage + "/" + filename # filename = secure_filename(file.filename.decode('utf-8')) if (os.path.exists(file_path)): return rest.responseto(data="file exist", success=False) else: file.save(file_path) model_name = filename.split(".") tool_model_service.create_branch(model_name=model_name[0], model_branch="master", create_date=datetime.now(), modify_date=datetime.now(), file_path=file_path, model_target="") return rest.responseto(data="success")
def load_all():
    """Load every stored binning record for a model/branch.

    :return: {variable_name: {iv, var_table, is_selected}}
    """
    model_name = request.values.get("model_name")
    branch = request.values.get("branch")
    rows = tool_model_service.load_binning_record(model_name, branch)
    data = {}
    if rows is not None:
        data = {
            row["variable_name"]: {
                "iv": row["variable_iv"],
                "var_table": json.loads(row["binning_record"]),
                "is_selected": row["is_selected"] == const.SELECTED,
            }
            for row in rows
        }
    return rest.responseto(data=data)
def new_branch():
    """Fork a model branch: copy the branch row plus all its binning records.

    :return: True on success, False otherwise
    """
    model_name = request.form.get("model_name")
    branch = request.form.get("branch")
    original_branch = request.form.get("original_branch")
    result = tool_model_service.load_binning_record(model_name, original_branch)
    # Renamed from `list` (shadowed the builtin); comprehension instead of append loop.
    records = [ModelContent(model_name=model_name, model_branch=branch,
                            variable_name=record["variable_name"],
                            variable_iv=record["variable_iv"],
                            binning_record=record["binning_record"],
                            is_selected=record["is_selected"])
               for record in result]
    if tool_model_service.copy_branch(model_name, branch, original_branch):
        tool_model_service.save_binning_record(records)
        return rest.responseto(data=True)
    return rest.responseto(data=False)
def update_source(source_id):
    """Update a source's display name and/or soft-delete flag.

    :param source_id: id of the source to update
    :return: empty rest response on success
    :raises Error.SOURCE_UPDATE_FAIL: when no row was updated
    """
    set_name = request.form.get("setName")
    if_delete = request.form.get("ifDelete")
    source = tool_model.Source()
    source.id = source_id
    source.set_name = set_name
    # BUG FIX: `bool(if_delete)` was True for ANY non-empty string, including
    # "false"/"0" sent by a front end. Only explicit truthy values (or any
    # other non-empty token) now set the flag; false-ish values leave it None.
    source.is_deleted = 1 if if_delete not in (None, "", "0", "false", "False") else None
    num = source_service.update_source(source)
    if num > 0:
        return rest.responseto("")
    else:
        raise Error.SOURCE_UPDATE_FAIL
def load_applyed():
    """Read the woe-applied file and cache its train/test splits."""
    # Cross-origin clients send an OPTIONS probe first, then the real POST.
    if request.method == 'POST':
        model_name = request.form.get("model_name")
        branch = request.form.get("branch")
        for uploaded in request.files.getlist("file[]"):
            df_map = global_value.get_value(model_name + "_" + branch)
            frame = pd.read_excel(uploaded, encoding="utf-8")
            # dev_ind == 1 marks training rows, 0 marks test rows.
            df_map["df_train_woe"] = frame[frame['dev_ind'] == 1]
            df_map["df_test_woe"] = frame[frame['dev_ind'] == 0]
        return rest.responseto(data="success")
def use_or_not(source_id):
    """After previewing a dataset, record which variables are worth using.

    :return: empty response; failures raise and are caught by the outer handler
    """
    form = request.form
    selected_str = form.get('selected')
    target = form.get('target')
    # A missing field means "no selection"; otherwise it is comma separated.
    selected_names = selected_str.split(",") if selected_str is not None else None
    variable_service.set_variable_selected(source_id, selected_names, target)
    return rest.responseto("")
def add_project():
    """Create a new project.

    :param: projectName  project name
    :param: projectTask  model type the project handles
    :param: projectDesc  project description
    :return: True or False — whether the insert succeeded
    """
    form = request.form
    project = tool_model.Project()
    # Map camelCase form fields onto the model's snake_case attributes.
    for attr, field in (("owner_id", "ownerId"),
                        ("project_name", "projectName"),
                        ("project_task", "projectTask"),
                        ("project_desc", "projectDesc")):
        setattr(project, attr, form.get(field))
    result = project_service.create_project(project)[0]
    return rest.responseto(result)
def add_experiment():
    """Register an algorithm experiment.

    :param title: experiment name
    :param project_id: project id
    :param source_id: source id
    :param algorithm: algorithm code
    :return: primary_key of the new experiment
    """
    form = request.form
    experiment = tool_model.Experiment()
    # Map form fields onto the model's attributes.
    for field in ('title', 'project_id', 'source_id', 'algorithm'):
        setattr(experiment, field, form.get(field))
    return rest.responseto(experiment_service.create_experiment(experiment))
def list_source(project_id):
    """List every source belonging to a project.

    :param project_id: owning project's id
    :return: rest response with each source's display attributes
    """
    sources = source_service.get_sources(project_id=project_id)

    def map_value(source):
        # Flatten the ORM row into the camelCase keys the front end expects.
        return {
            'setName': source.set_name,
            'fileName': source.file_name,
            'fileSize': source.file_size,
            'fileScope': source.file_scope,
            'addAt': source.create_date.isoformat(sep=" "),
            'origin': source.file_origin,
            'readable': source.file_readable
        }

    # Renamed from `list` (shadowed the builtin); comprehension instead of map.
    source_rows = [map_value(source) for source in sources]
    return rest.responseto(source_rows)
def add_source():
    """Add a data source to a project.

    :param: projectId   owning project id
    :param: fileScope   visibility scope of the file
    :param: setName     alias for the uploaded file
    :param: dataSet     the uploaded file itself
    :param: fileOrigin  where the file came from
    :return: True or False — whether the insert succeeded
    :raises Error.SOURCE_NO_FILE_RECEIVED: when no file was uploaded
    """
    form = request.form
    source = tool_model.Source()
    source.project_id = form.get('projectId')
    source.file_scope = form.get('fileScope')
    source.set_name = form.get('setName')  # alias the user gave the dataset
    source.file_origin = form.get('fileOrigin')  # origin of the file
    # Fetch the uploaded file and validate its presence.
    files = request.files.getlist("dataSet")
    if len(files) <= 0:
        # No file attached: abort with a business exception.
        raise Error.SOURCE_NO_FILE_RECEIVED
    f = files[0]
    from unicodedata import normalize
    file_name = normalize('NFKD', f.filename).encode('utf-8', 'ignore')
    file_path = app.config['ROOT_PATH'] + "/" + file_name
    # BUG FIX: with no '.' in the name, find() returned -1 and the whole file
    # name became the "type"; extension-less files now get an empty type.
    dot = file_name.find('.')
    file_type = file_name[dot + 1:] if dot != -1 else ""
    # Save the file locally, then record its size.
    f.save(file_path)
    size = simple_util.get_file_size(file_path)
    source.file_type = file_type
    source.file_name = file_name  # original name of the uploaded file
    source.file_path = file_path
    source.file_size = size
    result = source_service.add_source(source)[0]
    return rest.responseto(result)
def commit_branch():
    """Commit branch metadata (target, selected variables, file path), loading
    the backing excel into the in-memory cache if it is not there yet."""
    form = request.form
    model_name = form.get("model_name")
    branch = form.get("branch")
    selected_list = form.get("selected_list")
    target = form.get("target")
    file_path = form.get("file_path")
    path = app.config["ROOT_PATH"] + "/" + file_path
    key = model_name + "_" + branch
    if global_value.has_key(key) is False:
        # Cache miss: reload the resource and split on dev_ind.
        df_all = pd.read_excel(path)
        cache_entry = {
            "df_all": df_all,
            "df_train": df_all[df_all['dev_ind'] == 1],
            "df_test": df_all[df_all['dev_ind'] == 0],
        }
        global_value.set_value(**{key: cache_entry})
    updated = tool_model_service.update_branch(model_name, branch, target,
                                               selected_list=selected_list,
                                               file_path=file_path)
    return rest.responseto(data=updated)
def checkout():
    """Load the first model row matching the given name and branch."""
    model_name = request.values.get("model_name")
    branch = request.values.get("branch")
    rows = tool_model_service.load_model(model_name=model_name, model_branch=branch)
    return rest.responseto(data=rows[0])
def es_req(key):
    """Proxy a GET search against the configured elasticsearch host."""
    search_url = "%s%s/_search?pretty" % (app.config["es_host"], key)
    resp = requests.get(search_url)
    return rest.responseto(data=json.loads(resp.text))
def divide():
    """Split a bin.

    Using the range taken from `data`, filter the matching rows from the
    cached excel frame; run init on that subset to get a fresh set of
    intervals; merge those with the remaining original intervals; the
    result of calling adjust on the merged set is the post-split binning.

    :return: {variable_name1
                  -var_table
                  -var_params{province,goods,bads...}
                  -iv
              }
    """
    model_name = request.form.get("modelName")
    branch = request.form.get('branch')
    df_map = global_value.get_value(model_name + "_" + branch)
    df_train = df_map['df_train']
    min_val = 0
    data = request.form.get('data')
    # Parse the posted json, preserving key order.
    data_map = json.loads(data, object_pairs_hook=OrderedDict)
    name = data_map["name"]
    target = request.form.get("target")
    # Build a dataframe holding only the target and name columns.
    df = pd.DataFrame(df_train, columns={target, name})
    bound_list = None
    if data_map["selected"]["type"] == 'Numerical':
        # Filter rows inside the selected [min, max) boundary.
        min = data_map["selected"]["min_boundary"]
        max = data_map["selected"]["max_boundary"]
        df = df[(df[name].astype(float) >= float(min)) & (df[name].astype(float) < float(max))]
        out = get_init(df, target=target, invalid=[], fineMinLeafRate=0)
        bound_list = get_divide_min_bound(out)
        list = data_map["table"]
        # Remove the entry being split; its sub-bins replace it.
        del list[data_map["selectedIndex"]]
        for v in list:
            bound_list.append(float(v["min_boundary"]))
        # bound_list.append(np.nan)
        result = bf.adjust_bin(df_train, data_map["selected"]["type"] == 'Categorical', name, bound_list
                               , target=target, expected_column={name})
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary', 'bads', 'goods', 'total',
                   'total_perc', 'bad_rate', 'woe', 'type']
        iv = result['IV'].sum()
        df = pd.DataFrame(result, columns=columns)
        data = generate_response(name, df, iv)
        # data = get_merged(name, df, min_val)
        return rest.responseto(data=data)
    else:
        # Categorical: keep only rows whose value is in the selected group.
        val = data_map["selected"][name].split("|")
        df[name] = df[name].apply(lambda x: simple_util.float_nan_to_str_nan(x))
        df = df[df[name].isin(val)]
        list = data_map["table"]
        # Remove the entry being split; its sub-bins replace it.
        del list[data_map["selectedIndex"]]
        out = get_init(df, target=target, invalid=[], fineMinLeafRate=0)
        bound_list = get_divide_caterotical_bound(out, name)
        # Index of the entry being split.
        index = data_map["selectedIndex"]
        # Append the split results to the original group list.
        for v in list:
            bound_list.append(map(cmm.transfer, v[name].split("|")))
        result = bf.adjust_bin(df_train, data_map["selected"]["type"] == 'Categorical', name, bound_list
                               , target=target, expected_column={name})
        iv = result['IV'].sum()
        columns = ['bin_num', name, 'bads', 'goods', 'total', 'total_perc', 'bad_rate', 'woe', 'type']
        df = pd.DataFrame(result, columns=columns)
        data = generate_response(name, df, iv)
        # data = get_merged(name, df, min_val)
        return rest.responseto(data=data)
def merge():
    """Merge bins of a variable.

    Form params: modelName, branch, varName, type ('Numerical' or
    categorical), boundary (the bins picked for merging, '&'-separated; each
    chunk's max value), allBoundary (all current bin boundaries,
    '&'-separated), target (defaults to 'bad_4w' when absent).
    :return: rest response with the adjusted binning table and its IV
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    # Variable whose bins are being merged.
    var_name = request.form.get('varName')
    # Variable type.
    type = request.form.get('type').encode('utf-8')
    # Selected range: each bin_num's max, comma/'&'-separated.
    boundary = request.form.get('boundary').encode('utf-8')
    # Total range: every bin_num's max, '&'-separated.
    all_boundary = request.form.get('allBoundary').encode('utf-8')
    target = request.form.get('target')
    if target is None:
        target = 'bad_4w'
    excepted_column = {var_name}
    df_map = global_value.get_value(model_name + "_" + branch)
    type_bool = False
    if type == 'Numerical':
        # SECURITY NOTE(review): eval on user-posted form data — replace with a
        # safe numeric parser once the accepted value formats are confirmed.
        boundary_list = map(eval, boundary.split("&"))
        all_boundary_list = []
        # Convert to floats, mapping the literal 'nan' to np.nan.
        for a in all_boundary.split("&"):
            if a != 'nan':
                a = float(a)
            else:
                a = np.nan
            all_boundary_list.append(a)
        # Keep only the boundaries NOT selected for merging.
        boundary_list = list(set(all_boundary_list).difference(set(boundary_list)))
        selected_list = boundary_list
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary', 'bads', 'goods',
                   'total', 'total_perc', 'bad_rate', 'woe', 'type']
    else:
        type_bool = True
        # Categorical: the selected groups collapse into one merged group.
        temp = []
        for s in boundary.split("&"):
            temp.extend(map(cmm.transfer, s.split("|")))
        selected_list = [temp]
        if all_boundary != '':
            for s in all_boundary.split("&"):
                selected_list.append(map(cmm.transfer, s.split("|")))
        columns = ['bin_num', var_name, 'bads', 'goods', 'total', 'total_perc', 'bad_rate',
                   'woe', 'type']
    # Perform the merge and compute the new IV.
    # (Removed dead locals from the original: min_val, and the `result = None`
    # / `df = None` placeholders that were unconditionally reassigned below.)
    result = bf.adjust_bin(df_map["df_train"], type_bool, var_name, selected_list,
                           target=target, expected_column=excepted_column)
    iv = result['IV'].sum()
    df = pd.DataFrame(result, columns=columns)
    data = generate_response(var_name, df, iv)
    return rest.responseto(data=data)