def proc_validattion(dataset_path, config_path, model_path):
    """Validate a pickled scorecard model on a WOE-transformed dataset.

    Loads the model config, zero-fills nulls in the binned/discrete model
    variables, scores the dataset with the pickled classifier and prints
    the resulting KS statistic.

    :param dataset_path: path of the (WOE-transformed) dataset csv
    :param config_path: path of the model config csv
    :param model_path: path of the pickled model dict ({'clf', 'features_list', ...})
    :return: the KS value on this dataset
    """
    print('####PROC VALIDATION#####')
    print('dataset_path:\n', dataset_path)
    print('config_path:\n', config_path)
    print('model_path:\n', model_path)
    # BUG FIX: the original overwrote config_path with a hard-coded local
    # path right here, silently ignoring the caller's argument -- removed.
    cfg = config.config()
    cfg.load_file(config_path, dataset_path)

    # zero-fill nulls in every binned / discrete variable present in the data
    # (0 is the neutral WOE value after transformation -- TODO confirm)
    for var_group in (cfg.bin_var_list, cfg.discrete_var_list):
        for var in [tmp for tmp in var_group if tmp in list(cfg.dataset_train.columns)]:
            cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0

    # with-statement guarantees the model file is closed even on error
    with open(model_path, 'rb') as model_file:
        clf_model = pickle.load(model_file)

    clf = clf_model['clf']
    X_test = cfg.dataset_train[clf_model['features_list']]
    y_test = cfg.dataset_train['target']
    y_hat = clf.predict_proba(X_test)[:, 1]
    ks = compute_ks(y_hat, y_test)
    print('global_bt:', cfg.global_bt)
    print('global_gt:', cfg.global_gt)
    print('ks:', ks)
    return ks
def fit_single_lr(dataset_path, config_path, var_list_specfied, out_model_path, c=0.01):
    """Fit a single L1-regularized logistic regression and pickle the result.

    Candidate features are those flagged ``is_modelfeature == 1`` in the
    config that have no nulls in the training data, optionally intersected
    with a user-supplied list.

    :param dataset_path: training dataset csv path
    :param config_path: config csv path (needs 'is_modelfeature', 'var_name')
    :param var_list_specfied: optional list restricting the candidate features
    :param out_model_path: where to pickle the fitted model dict
    :param c: inverse regularization strength passed to LogisticRegression
    :return: dict with keys 'clf', 'features_list', 'coefs', 'ks'
    """
    dataset_train = pd.read_csv(dataset_path)
    cfg = pd.read_csv(config_path)
    candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']

    # keep only fully-populated columns
    complete_vars = [var for var in dataset_train.columns
                     if dataset_train[var].isnull().sum() == 0]
    candidate_var_list = list(set(candidate_var_list).intersection(complete_vars))
    if var_list_specfied:  # truthiness instead of __len__() > 0
        candidate_var_list = list(
            set(candidate_var_list).intersection(set(var_list_specfied)))

    print('candidate_var_list length:\n', len(candidate_var_list))
    print('candidate_var_list:\n', candidate_var_list)

    print('change dtypes:float64 to float32')
    for var in candidate_var_list:
        dataset_train[var] = dataset_train[var].astype(np.float32)

    # target >= 0 filters out unlabeled rows; compute the mask once,
    # not once per selection as in the original
    labeled = dataset_train[dataset_train.target >= 0]
    X_train = labeled[candidate_var_list]
    y_train = labeled['target']

    print('c:', c)
    clf_lr_a = LogisticRegression(C=c, penalty='l1', tol=0.01,
                                  class_weight='balanced')
    clf_lr_a.fit(X_train, y_train)
    coefs = clf_lr_a.coef_.ravel().copy()
    proba = clf_lr_a.predict_proba(X_train)[:, 1]
    ks = compute_ks(proba, y_train)

    model = {
        'clf': clf_lr_a,
        'features_list': candidate_var_list,
        'coefs': coefs,
        'ks': ks,
    }
    # with-statement guarantees the file is flushed and closed
    with open(out_model_path, 'wb') as output:
        pickle.dump(model, output)
    return model
def warm_up(params_list):
    """Train an FTRL model incrementally on 50 chunks of 10k samples each,
    tracking the KS on the full training set after every chunk.

    :param params_list: tuple (X_train, y_train, d, l1, alpha) where d is the
        feature dimension and l1/alpha are FTRL hyper-parameters
    :return: average KS over the 50 evaluation points
    """
    X_train, y_train, d, l1, alpha = params_list
    # BUG FIX: zip() is a one-shot iterator on Python 3 and cannot be
    # sliced; materialize it as a list since it is sliced repeatedly below
    trainset = list(zip(X_train, y_train))
    # initialization
    ks_list = []
    ftrl = FTRL(dim=d, l1=l1, l2=1.0, alpha=alpha, beta=1.0)
    print('%s\tFTRL MODEL TRAINING ROUND:[l1]\t%s\t[alpha]\t%s\t' %
          (time.asctime(time.localtime(time.time())), str(l1), str(alpha)))
    for j in range(50):  # 50 chunks of 10000 rows
        datasubset = trainset[j * 10000:(j + 1) * 10000]
        ftrl.train(datasubset, verbos=0, max_itr=10000, eta=0.01, epochs=100)
        # NOTE(review): this is sigmoid(-X.w); the usual LR probability is
        # 1/(1+exp(-X.w)) -- confirm FTRL's weight sign convention
        y_hat = 1.0 / (1.0 + np.exp(X_train.dot(ftrl.w)))
        ks = compute_ks(y_hat, y_train)
        ks_list.append(ks)
        print('%s\t[l1]\t%s\t[alpha]\t%s\titer=%s\tks:%s' %
              (time.asctime(time.localtime(time.time())), str(l1), str(alpha),
               str((j + 1) * 10000), str(ks)))
    # compute the average once instead of twice
    ks_avg = sum(ks_list) / len(ks_list)
    print('%s\tFTRL MODEL TRAINING ROUND:[l1]\t%s\t[alpha]\t%s\t[ks_avg]\t%s' %
          (time.asctime(time.localtime(time.time())), str(l1), str(alpha),
           str(ks_avg)))
    return ks_avg
def grid_search_lr_c_validation(X_train,
                                y_train,
                                validation_dataset_list,
                                cs=None,
                                df_coef_path=False,
                                pic_coefpath_title='Logistic Regression Path',
                                pic_coefpath=False,
                                pic_performance_title='Logistic Regression Performance',
                                pic_performance=False):
    """
    grid search optimal hyper parameter c with the best ks performance,
    evaluating each fit on both the training set and the first validation set
    :param X_train: features dataframe
    :param y_train: target
    :param validation_dataset_list: list of validation dataframes; only
        element 0 is used (must contain X_train.columns and 'target')
    :param cs: list of c values (defaults to [0.01])
    :param df_coef_path: the file path for logistic regression coefficient dataframe
    :param pic_coefpath_title: the pic title for coefficient path picture
    :param pic_coefpath: the file path for coefficient path picture
    :param pic_performance_title: the pic title for ks performance picture
    :param pic_performance: the file path for ks performance picture
    :return: a tuple of c and ks value with the best ks performance
    """
    # avoid the shared-mutable-default-argument pitfall; [0.01] keeps the
    # original default behavior
    if cs is None:
        cs = [0.01]

    # init a LogisticRegression model (C is overwritten per iteration)
    clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,
                                   class_weight='balanced')
    print("Computing regularization path ...")
    start = datetime.now()
    print(start)
    coefs_ = []
    ks = []
    ks_validation1 = []
    counter = 0
    for c in cs:
        print('time: ', time.asctime(time.localtime(time.time())),
              'counter: ', counter, ' c: ', c)
        clf_l1_LR.set_params(C=c)
        clf_l1_LR.fit(X_train, y_train)
        coefs_.append(clf_l1_LR.coef_.ravel().copy())
        proba = clf_l1_LR.predict_proba(X_train)[:, 1]
        validation_proba1 = clf_l1_LR.predict_proba(
            validation_dataset_list[0][X_train.columns])[:, 1]
        ks.append(compute_ks(proba, y_train))
        ks_validation1.append(
            compute_ks(validation_proba1, validation_dataset_list[0]['target']))
        print('ks:\t', ks[-1], 'ks_validation1:\t', ks_validation1[-1])
        counter += 1
    end = datetime.now()
    print(end)
    print("This took ", end - start)

    # persist the full coefficient path with per-c performance
    coef_cv_df = pd.DataFrame(coefs_, columns=X_train.columns)
    coef_cv_df['ks'] = ks
    coef_cv_df['ks_validation1'] = ks_validation1
    coef_cv_df['c'] = cs
    if df_coef_path:
        file_name = df_coef_path if isinstance(df_coef_path, str) else None
        coef_cv_df.to_csv(file_name)

    coefs_ = np.array(coefs_)

    # coefficient-path figure (saved only when a path is given)
    plt.figure('fig1')
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title(pic_coefpath_title)
    plt.axis('tight')
    if pic_coefpath:
        file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
        plt.savefig(file_name)
        plt.close()

    # training-ks figure (saved only when a path is given)
    plt.figure('fig2')
    plt.plot(np.log10(cs), ks)
    plt.xlabel('log(C)')
    plt.ylabel('ks score')
    plt.title(pic_performance_title)
    plt.axis('tight')
    if pic_performance:
        file_name = pic_performance if isinstance(pic_performance, str) else None
        plt.savefig(file_name)
        plt.close()

    # Prefer the best ks among fits whose coefficients are all non-negative
    # (presumably a scorecard monotonicity constraint -- TODO confirm).
    # BUG FIX: the original indexed cs/ks with an argmax taken over the
    # *filtered* ks array, returning the wrong c whenever any earlier fit
    # had a negative coefficient; map the filtered argmax back to the
    # original index instead.
    ks_arr = np.array(ks)
    nonneg_rows = np.where((coefs_ < 0).sum(axis=1) == 0)[0]
    if nonneg_rows.size > 0:
        idx = nonneg_rows[ks_arr[nonneg_rows].argmax()]
    else:
        idx = ks_arr.argmax()
    return (cs[idx], ks[idx])
def grid_search_lr_c(X_train,
                     y_train,
                     df_coef_path=False,
                     pic_coefpath_title='Logistic Regression Path',
                     pic_coefpath=False,
                     pic_performance_title='Logistic Regression Performance',
                     pic_performance=False):
    """
    grid search optimal hyper parameter c with the best ks performance;
    the c grid is derived from l1_min_c (smallest c with non-null model)
    spanning three decades on a log scale
    :param X_train: features dataframe
    :param y_train: target
    :param df_coef_path: the file path for logistic regression coefficient dataframe
    :param pic_coefpath_title: the pic title for coefficient path picture
    :param pic_coefpath: the file path for coefficient path picture
    :param pic_performance_title: the pic title for ks performance picture
    :param pic_performance: the file path for ks performance picture
    :return: a tuple of c and ks value with the best ks performance
    """
    # init a LogisticRegression model (C is overwritten per iteration)
    clf_l1_LR = LogisticRegression(C=0.1, penalty='l1', tol=0.01,
                                   class_weight='balanced')
    cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3)
    print("Computing regularization path ...")
    start = datetime.now()
    print(start)
    coefs_ = []
    ks = []
    for c in cs:
        clf_l1_LR.set_params(C=c)
        clf_l1_LR.fit(X_train, y_train)
        coefs_.append(clf_l1_LR.coef_.ravel().copy())
        proba = clf_l1_LR.predict_proba(X_train)[:, 1]
        ks.append(compute_ks(proba, y_train))
    end = datetime.now()
    print(end)
    print("This took ", end - start)

    # persist the full coefficient path with per-c performance
    coef_cv_df = pd.DataFrame(coefs_, columns=X_train.columns)
    coef_cv_df['ks'] = ks
    coef_cv_df['c'] = cs
    if df_coef_path:
        file_name = df_coef_path if isinstance(df_coef_path, str) else None
        coef_cv_df.to_csv(file_name)

    coefs_ = np.array(coefs_)

    # coefficient-path figure: save when a path is given, else show
    plt.figure('fig1')
    plt.plot(np.log10(cs), coefs_)
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title(pic_coefpath_title)
    plt.axis('tight')
    if pic_coefpath:
        file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    # training-ks figure: save when a path is given, else show
    plt.figure('fig2')
    plt.plot(np.log10(cs), ks)
    plt.xlabel('log(C)')
    plt.ylabel('ks score')
    plt.title(pic_performance_title)
    plt.axis('tight')
    if pic_performance:
        file_name = pic_performance if isinstance(pic_performance, str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    # Prefer the best ks among fits whose coefficients are all non-negative
    # (presumably a scorecard monotonicity constraint -- TODO confirm).
    # BUG FIX: the original took argmax over the *filtered* ks array and used
    # it to index the unfiltered cs/ks (wrong c whenever an earlier fit had a
    # negative coefficient), and crashed on an empty selection; map the
    # filtered argmax back to the original index and fall back to the overall
    # best when no fit qualifies.
    ks_arr = np.array(ks)
    nonneg_rows = np.where((coefs_ < 0).sum(axis=1) == 0)[0]
    if nonneg_rows.size > 0:
        idx = nonneg_rows[ks_arr[nonneg_rows].argmax()]
    else:
        idx = ks_arr.argmax()
    return (cs[idx], ks[idx])
# NOTE(review): script fragment -- relies on names defined outside this view:
# i (an enclosing loop index), clf (loaded model dict), clf_ks, ks_matrix,
# clf_path_list, log_file, stdout_backup. Indentation reconstructed from the
# collapsed source; confirm against the full script.
# For each monthly feature file, score it with the fixed classifier and
# collect the KS values into clf_ks.
for j in range(clf_path_list.__len__()):
    # dataset name encodes rule month (i+1) and feature month (j+1)
    dataset_path = 'E:\\ScoreCard\\cs_model\\gendata\\' + 'cs_m1_pos_woe_transed_rule_20170'+str(i+1) \
        +'_features_20170'+str(j+1)+'.csv'
    print dataset_path
    dataset_train = pd.read_csv(dataset_path)
    X_train = dataset_train
    X_train = X_train[clf['features_name']]
    y_train = dataset_train['target']
    print X_train.describe()
    print 'Checking features dtypes:'
    for var in clf['features_name']:
        # fill null
        X_train.loc[X_train[var].isnull(), (var)] = 0
    proba = clf['classifier'].predict_proba(X_train)[:, 1]
    ks = compute_ks(proba, y_train)
    print ks
    clf_ks.append(ks)
# summarize this model's KS across all feature months
print 'ks summary: ', clf_path_list[i], '\n', clf_ks
ks_matrix.append(clf_ks)
print 'ks_matrix:\n', ks_matrix
######################################################
# stop logging to file and restore stdout
log_file.close()
# restore the output to initial pattern
sys.stdout = stdout_backup
print "Now this will be presented on screen"
# cpd for j in range(30): print 'cpd:',j subset = dataset_woe_transed[dataset_woe_transed['cpd']==(j+1)] X_train = subset[clf['features_name'][:18]].values y_train = subset['target'].values model_ks_dict[j+1]['y_train'].extend(y_train) proba = clf['classifier'].predict_proba(X_train)[:,1] # ks = compute_ks(proba,y_train) model_ks_dict[j+1]['y_hat'].extend(proba) for j in range(30): ks = compute_ks(np.array(model_ks_dict[j+1]['y_hat']),np.array(model_ks_dict[j+1]['y_train'])) sample_cnt = model_ks_dict[j+1]['y_train'].__len__() bad_sample_cnt = sum(model_ks_dict[j+1]['y_train']) bad_sample_rate = bad_sample_cnt*1.0/sample_cnt print('cs_cpd:%s\tsample_cnt:%s\tbad_sample_cnt:%s\tbad_sample_rate:%s\tks:%s' % (str(j+1),str(sample_cnt),str(bad_sample_cnt),str(bad_sample_rate),str(ks))) v_cpd = [] v_y_train = [] v_y_proba = [] for j in range(30): v_cpd.extend([j]*model_ks_dict[j+1]['y_train'].__len__()) v_y_train.extend(model_ks_dict[j+1]['y_train']) v_y_proba.extend(model_ks_dict[j+1]['y_hat']) ks_test = pd.DataFrame()
# NOTE(review): script fragment -- relies on names defined outside this view:
# enc (a fitted encoder with .transform, presumably OneHotEncoder -- confirm),
# X_train, y_train, compute_ks. Indentation reconstructed from the collapsed
# source. Compares training KS of L1 logistic regression on raw vs encoded
# features.
X_train_enc = enc.transform(X_train).toarray()
trainset = zip(X_train_enc, y_train)
d = X_train_enc.shape[1]
ks_avg_list = []
# baseline: fit on the raw (un-encoded) features
clf_l1_LR = LogisticRegression(C=0.01, penalty='l1', tol=0.01, class_weight='balanced')
print '[START]', time.asctime(time.localtime(time.time()))
clf_l1_LR.fit(X_train, y_train)
print '[END]', time.asctime(time.localtime(time.time()))
y_hat = clf_l1_LR.predict_proba(X_train)[:, 1]
ks = compute_ks(y_hat, y_train)
print ks
# 0.258477461319
# comparison: fit on the one-hot-encoded features
clf_l1_LR = LogisticRegression(C=0.01, penalty='l1', tol=0.01, class_weight='balanced')
print '[START]', time.asctime(time.localtime(time.time()))
clf_l1_LR.fit(X_train_enc, y_train)
print '[END]', time.asctime(time.localtime(time.time()))
y_hat = clf_l1_LR.predict_proba(X_train_enc)[:, 1]
ks = compute_ks(y_hat, y_train)
print ks
# 0.287423006797
# NOTE(review): script fragment -- relies on names defined outside this view:
# woe_train_path3, woe_test_path3, var_list_specfied, out_model_path, config,
# compute_ks. Indentation reconstructed from the collapsed source. Trains a
# model on the train split, then reloads it and scores the test split.
fit_single_lr(woe_train_path3, config_path, var_list_specfied, out_model_path)
# hard-coded local config path overrides any earlier value -- verify intended
config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv'
cfg = config.config()
cfg.load_file(config_path, woe_test_path3)
# zero-fill nulls in the binned variables present in the test data
for var in [
        tmp for tmp in cfg.bin_var_list
        if tmp in list(cfg.dataset_train.columns)
]:
    # fill null
    cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0
# zero-fill nulls in the discrete variables present in the test data
for var in [
        tmp for tmp in cfg.discrete_var_list
        if tmp in list(cfg.dataset_train.columns)
]:
    # fill null
    cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0
# reload the freshly pickled model and compute KS on the test split
output = open(out_model_path, 'rb')
clf_model = pickle.load(output)
output.close()
clf = clf_model['clf']
X_test = cfg.dataset_train[clf_model['features_list']]
y_test = cfg.dataset_train['target']
y_hat = clf.predict_proba(X_test)[:, 1]
ks = compute_ks(y_hat, y_test)