def init():
    """Build the initial binning record for the current model branch.

    :return: response whose data maps each variable name to its binning
        table, its params (province, goods, bads, ...), a selection flag
        and the IV, ordered by descending IV:
        {variable_name1: {var_table, var_params, is_selected, iv}, ...}
    """
    model_name = request.form.get("modelName")
    model_branch = request.form.get("branch")
    # Dataframes for this model/branch were cached in memory at upload time.
    df_map = global_value.get_value(model_name + "_" + model_branch)
    records = tool_model_service.load_model(model_name=model_name,
                                            model_branch=model_branch)
    # selected_list is stored as JSON in the DB; in Python it is a dict of
    # selected_variable -> index (the variable's position).  Only the keys
    # (the selected variable names) are needed here.
    selected = json.loads(records[0].selected_list).keys()
    train_frame = df_map['df_train']
    init_result = get_init(train_frame, target=records[0].model_target,
                           valid=selected)
    # Derive each variable's bin boundaries, then sort the output by IV.
    boundaries = get_boundary(init_result, 0)
    return rest.responseto(data=sort_iv(boundaries))
def divide_manually():
    """Re-bin one variable using boundaries typed in by the user.

    The form supplies the boundaries, variable name, model identity and a
    "type" flag ("true" means categorical).  Returns the adjusted binning
    table plus the resulting IV.
    """
    boundary = request.form.get("boundary")
    variable_name = request.form.get("variable_name")
    branch = request.form.get("branch")
    model_name = request.form.get("model_name")
    var_type = request.form.get("type")
    df_train = global_value.get_value(model_name + "_" + branch)['df_train']
    is_categorical = var_type == "true"
    if is_categorical:
        # Each comma-separated group holds "|"-joined category values.
        boundary_list = [list(map(cmm.transfer, group.split("|")))
                         for group in boundary.split(",")]
        columns = ['bin_num', variable_name, 'bads', 'goods', 'total',
                   'total_perc', 'bad_rate', 'woe', 'type']
    else:
        # Numerical boundaries arrive as a comma-separated float list.
        boundary_list = [float(item) for item in boundary.split(",")]
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary',
                   'bads', 'goods', 'total', 'total_perc', 'bad_rate',
                   'woe', 'type']
    target = tool_model_service.load_model(
        model_name=model_name, model_branch=branch)[0]["model_target"]
    result = bf.adjust_bin(df_train, is_categorical, variable_name,
                           boundary_list, target=target,
                           expected_column={variable_name})
    iv = result['IV'].sum()
    binned = pd.DataFrame(result, columns=columns)
    return rest.responseto(data=generate_response(variable_name, binned, iv))
def variable_verify():
    """Validate pairwise correlation among the chosen variables.

    Reads modelName/branch/corrCap/variables from the form.  The check
    runs on the WOE-transformed training columns.  When the correlation
    cap is violated, the full correlation matrix is returned so the user
    can drop offending variables; otherwise the response carries no data.
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    corr_cap = request.form.get("corrCap")
    # Missing OR blank cap means "no restriction" (cap of 1).
    # BUG FIX: the original only handled None, so an empty-string form
    # value crashed float("") with ValueError.
    corr_cap = float(corr_cap) if corr_cap else 1
    variables = request.form.get("variables")
    # Correlation is evaluated on the WOE-encoded columns.
    variable_list = [name + "_woe" for name in variables.split(",")]
    df_map = global_value.get_value(model_name + "_" + branch)
    df = pd.DataFrame(df_map["df_train_woe"], columns=variable_list)
    if common.is_valid_correlation(df, corr_cap) is False:
        # Cap exceeded: send the matrix back for inspection.
        result = common.get_correlation(df)
        return rest.responseto(result.to_dict())
    return rest.responseto(None)
def parse():
    """Load, cache and summarise the training file for a model branch.

    Splits the workbook into train/test on dev_ind, caches the frames
    under "<model>_<branch>", and returns a front-end-ready summary plus
    model metadata (branches, selected_list, target).
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    # User-supplied path, relative to the configured root.
    file_path = request.form.get("filePath")
    target = request.form.get("target")
    path = app.config["ROOT_PATH"] + "/" + file_path
    # Model name plus branch uniquely keys the in-memory cache.
    key = model_name + "_" + branch
    # Nothing can proceed unless the file really exists.
    if not os.path.exists(path):
        return rest.responseto(message="file not exist", success=False)
    if global_value.has_key(key) is False:
        # First access: read the workbook and split on the dev_ind flag
        # (1 = training rows, 0 = test rows).
        df_all = pd.read_excel(path)
        df_train = df_all[df_all['dev_ind'] == 1]
        df_test = df_all[df_all['dev_ind'] == 0]
        global_value.set_value(**{key: {"df_all": df_all,
                                        "df_train": df_train,
                                        "df_test": df_test}})
    else:
        # Already loaded: reuse the cached training frame.
        df_train = global_value.get_value(key)['df_train']
    summary = ba.get_df_summary(df_train)
    # Convert the summary dataframe into a structure the front end renders.
    data_map = cmm.df_for_html(summary)
    result = tool_model_service.load_model(model_name=model_name,
                                           model_branch=branch)
    first = result[0]
    data_map["branches"] = [row.model_branch for row in result]
    data_map["selected_list"] = first.selected_list
    data_map["target"] = first.model_target
    return rest.responseto(data=data_map)
def load_applyed():
    """Ingest the post-apply workbook and cache its WOE train/test splits."""
    # Cross-origin clients probe with an OPTIONS request first and only
    # then send the real POST — act on the POST alone.
    if request.method == 'POST':
        model_name = request.form.get("model_name")
        branch = request.form.get("branch")
        for uploaded in request.files.getlist("file[]"):
            cache = global_value.get_value(model_name + "_" + branch)
            frame = pd.read_excel(uploaded, encoding="utf-8")
            # dev_ind marks training rows (1) versus test rows (0).
            cache["df_train_woe"] = frame[frame['dev_ind'] == 1]
            cache["df_test_woe"] = frame[frame['dev_ind'] == 0]
        return rest.responseto(data="success")
def apply():
    """Join the WOE values learned on train data onto the full dataset.

    Adds a "<var>_woe" column for every variable in the request payload,
    refreshes the cached train/test WOE frames, records module-level state
    (withIntercept / apply_result / safely_apply), and streams the combined
    frame back as an Excel attachment.
    """
    payload = json.loads(request.form.get('data'))
    model_name = payload["modelName"]
    branch = payload["branch"]
    df_map = global_value.get_value(model_name + "_" + branch)
    woe_data = payload["data"]
    # Work on a copy of the complete dataset (train + test).
    df = df_map['df_all'].copy()
    for var_name in woe_data.keys():
        df[var_name + '_woe'] = df[var_name].apply(
            lambda var_value: apply_get_woe_value(var_name, var_value, woe_data))
    global withIntercept
    withIntercept = True
    # if withIntercept:
    #     df['intercept_woe'] = 1.0
    # Split the WOE-augmented frame back into train/test on dev_ind.
    df_map["df_train_woe"] = df[df['dev_ind'] == 1]
    df_map["df_test_woe"] = df[df['dev_ind'] == 0]
    global apply_result, safely_apply
    apply_result = df
    safely_apply = True
    # Render the frame to an in-memory workbook for download.
    output = BytesIO()
    writer = pd.ExcelWriter(output, engine='xlsxwriter')
    df.to_excel(writer, startrow=0, merge_cells=False, sheet_name="Sheet_1")
    cell_format = writer.book.add_format()
    cell_format.set_bg_color('#eeeeee')
    writer.sheets["Sheet_1"].set_column(0, 9, 28)
    writer.close()
    output.seek(0)
    attachment = send_file(output, as_attachment=True,
                           attachment_filename='df_iv.xlsx')
    return rest.responsePandas(rest.make_response(attachment))
def variable_select_manual():
    """Fit a logit model on a manually chosen variable subset.

    Reads the full and selected variable lists, target, intercept flag and
    KS group count from the form, then delegates to
    lmf.get_logit_backward_manually on the cached WOE train/test frames.
    """
    all_list = request.form.get("all_list")
    selected_list = request.form.get("selected_list")
    target = request.form.get("target")
    with_intercept = request.form.get("with_intercept") == 'true'
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    ks_group_num = request.form.get("ks_group_num")
    df_map = global_value.get_value(model_name + "_" + branch)
    df_train_woe = df_map["df_train_woe"]
    df_test_woe = df_map["df_test_woe"]
    # Default to 20 KS groups when the field is blank OR missing.
    # BUG FIX: request.form.get returns None for an absent field; the old
    # check (ks_group_num != '') let None flow downstream.
    if not ks_group_num:
        ks_group_num = 20
    data = lmf.get_logit_backward_manually(df_train_woe, df_test_woe,
                                           all_list.split(","),
                                           selected_list.split(","),
                                           target, ks_group_num,
                                           with_intercept)
    return rest.responseto(data=data)
def variable_select():
    """First-pass variable selection after apply completes.

    When the request carries no var_list the previously saved selection is
    loaded from the database; otherwise the stored selection is replaced
    with the new one before fitting the backward logit model.
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    var_list = request.form.get("var_list")
    df_map = global_value.get_value(model_name + "_" + branch)
    if var_list is None or var_list == '':
        # Nothing supplied: fall back to the selection stored in the DB.
        result = tool_model_service.get_selected_variable(model_name, branch)[0]
        var_list = result["selected_variable"].decode('utf-8')
    else:
        # Replace the stored selection: delete the old rows, save the new.
        if tool_model_service.del_selected_variable(model_name, branch):
            tool_model_service.save_selected_variable(model_name, branch,
                                                      var_list)
        else:
            # BUG FIX: keyword was misspelled "messege"; the rest of the
            # file (see parse) uses "message".
            return rest.responseto(message="fail to save selected variable",
                                   success=False)
    target = request.form.get("target")
    with_intercept = request.form.get("with_intercept") == 'true'
    ks_group_num = request.form.get("ks_group_num")
    # Default to 20 KS groups when the field is blank OR missing.
    # BUG FIX: the old check (ks_group_num != '') let None flow downstream.
    if not ks_group_num:
        ks_group_num = 20
    df_train_woe = df_map["df_train_woe"]
    df_test_woe = df_map["df_test_woe"]
    data = lmf.get_logit_backward(df_train_woe, df_test_woe, target,
                                  ks_group_num, var_list.split(","),
                                  with_intercept)
    if data is None:
        return rest.responseto(success=False)
    return rest.responseto(data=data)
def ppp():
    """Debug endpoint: return whatever the global store holds for the empty key."""
    return global_value.get_value("")
def divide():
    """Split (divide) one bin of a variable into finer bins.

    Workflow:
      1. Take the selected bin's range from the request payload and filter
         the training data down to just the rows inside that range.
      2. Run get_init on the filtered rows to produce candidate sub-bins.
      3. Merge those new boundaries with the remaining (unsplit) bins.
      4. Call bf.adjust_bin on the merged boundary list; its output is the
         post-split binning table.

    :return: {variable_name: {var_table, var_params{province, goods,
        bads, ...}, iv}}
    """
    model_name = request.form.get("modelName")
    branch = request.form.get('branch')
    df_map = global_value.get_value(model_name + "_" + branch)
    df_train = df_map['df_train']
    min_val = 0  # NOTE(review): unused here; kept for parity with siblings
    data = request.form.get('data')
    # Parse the JSON payload; OrderedDict preserves the bin table order.
    data_map = json.loads(data, object_pairs_hook=OrderedDict)
    name = data_map["name"]
    target = request.form.get("target")
    # Build a frame holding only the target and the variable being split.
    df = pd.DataFrame(df_train, columns={target, name})
    bound_list = None
    if data_map["selected"]["type"] == 'Numerical':
        # Filter rows to the selected bin's [min_boundary, max_boundary).
        # NOTE(review): "min"/"max" shadow builtins — rename if this block
        # is ever reworked.
        min = data_map["selected"]["min_boundary"]
        max = data_map["selected"]["max_boundary"]
        df = df[(df[name].astype(float) >= float(min))
                & (df[name].astype(float) < float(max))]
        out = get_init(df, target=target, invalid=[], fineMinLeafRate=0)
        bound_list = get_divide_min_bound(out)
        # NOTE(review): "list" shadows the builtin.
        list = data_map["table"]
        # Drop the bin being split; its replacement boundaries came from
        # get_divide_min_bound above.
        del list[data_map["selectedIndex"]]
        for v in list:
            bound_list.append(float(v["min_boundary"]))
        # bound_list.append(np.nan)
        result = bf.adjust_bin(df_train,
                               data_map["selected"]["type"] == 'Categorical',
                               name, bound_list, target=target,
                               expected_column={name})
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary',
                   'bads', 'goods', 'total', 'total_perc', 'bad_rate',
                   'woe', 'type']
        iv = result['IV'].sum()
        df = pd.DataFrame(result, columns=columns)
        data = generate_response(name, df, iv)
        # data = get_merged(name, df, min_val)
        return rest.responseto(data=data)
    else:
        # Categorical: the selected bin's categories are "|"-joined.
        val = data_map["selected"][name].split("|")
        # Normalise float NaN to its string form so isin() can match it.
        df[name] = df[name].apply(lambda x: simple_util.float_nan_to_str_nan(x))
        df = df[df[name].isin(val)]
        list = data_map["table"]
        # Drop the bin being split.
        del list[data_map["selectedIndex"]]
        out = get_init(df, target=target, invalid=[], fineMinLeafRate=0)
        bound_list = get_divide_caterotical_bound(out, name)
        # Index of the bin that was split.
        index = data_map["selectedIndex"]
        # Append the surviving bins' category groups to the split result.
        for v in list:
            bound_list.append(map(cmm.transfer, v[name].split("|")))
        result = bf.adjust_bin(df_train,
                               data_map["selected"]["type"] == 'Categorical',
                               name, bound_list, target=target,
                               expected_column={name})
        iv = result['IV'].sum()
        columns = ['bin_num', name, 'bads', 'goods', 'total', 'total_perc',
                   'bad_rate', 'woe', 'type']
        df = pd.DataFrame(result, columns=columns)
        data = generate_response(name, df, iv)
        # data = get_merged(name, df, min_val)
        return rest.responseto(data=data)
def merge():
    """Merge the user-selected bins of a variable into one bin.

    The "boundary" form field carries the bins chosen for merging and
    "allBoundary" the full current bin list (both "&"-separated).  The
    merged boundary set is handed to bf.adjust_bin and the adjusted table
    plus IV are returned.
    """
    model_name = request.form.get("modelName")
    branch = request.form.get("branch")
    # Variable whose bins are being merged.
    var_name = request.form.get('varName')
    # Variable type ("Numerical" or categorical).
    type = request.form.get('type').encode('utf-8')
    # The selected bins: each bin's max value, "&"-separated.
    boundary = request.form.get('boundary').encode('utf-8')
    # The complete bin list: each bin's max value, "&"-separated.
    all_boundary = request.form.get('allBoundary').encode('utf-8')
    # target = request.form.get('allBoundary').encode('utf-8');
    target = request.form.get('target')
    if target is None:
        target = 'bad_4w'  # fallback default target column
    excepted_column = {var_name}
    min_val = 0  # NOTE(review): unused; kept for parity with siblings
    df_map = global_value.get_value(model_name + "_" + branch)
    result = None
    type_bool = False
    df = None
    if type == 'Numerical':
        # Parse the selected boundaries.
        # SECURITY NOTE(review): eval on request data can execute arbitrary
        # expressions — replace with float()/ast.literal_eval.
        boundary_list = map(eval, boundary.split("&"))
        all_boundary_list = []
        # Convert each token to float, mapping the literal 'nan' to np.nan.
        for a in all_boundary.split("&"):
            if a != 'nan':
                a = float(a)
            else:
                a = np.nan
            all_boundary_list.append(a)
        # Keep only the boundaries NOT selected: removing a bin's boundary
        # merges it with its neighbour.
        boundary_list = list(set(all_boundary_list).difference(set(boundary_list)))
        # boundary_list.append(np.nan)
        selected_list = boundary_list
        columns = ['bin_num', 'min', 'max', 'min_boundary', 'max_boundary',
                   'bads', 'goods', 'total', 'total_perc', 'bad_rate',
                   'woe', 'type']
    else:
        type_bool = True
        # Categorical: flatten the selected "|"-joined groups into one
        # merged category group...
        temp = []
        for s in boundary.split("&"):
            temp.extend(map(cmm.transfer, s.split("|")))
        selected_list = [temp]
        # ...then append the untouched groups unchanged.
        if all_boundary != '':
            for s in all_boundary.split("&"):
                selected_list.append(map(cmm.transfer, s.split("|")))
        columns = ['bin_num', var_name, 'bads', 'goods', 'total',
                   'total_perc', 'bad_rate', 'woe', 'type']
    # Re-bin the training data with the merged boundary set.
    result = bf.adjust_bin(df_map["df_train"], type_bool, var_name,
                           selected_list, target=target,
                           expected_column=excepted_column)
    iv = result['IV'].sum()
    df = pd.DataFrame(result, columns=columns)
    data = generate_response(var_name, df, iv)
    # data = get_merged(var_name, df, min_val)
    return rest.responseto(data=data)