Example #1
0
def proc_validattion(dataset_path,config_path,model_path):
    print '####PROC VALIDATION#####'
    print 'dataset_path:\n',dataset_path
    print 'config_path:\n',config_path
    print 'model_path:\n',model_path
    #fillna
    config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv'
    cfg = config.config()
    cfg.load_file(config_path, dataset_path)

    for var in [tmp for tmp in cfg.bin_var_list if tmp in list(cfg.dataset_train.columns)]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0

    for var in [tmp for tmp in cfg.discrete_var_list if tmp in list(cfg.dataset_train.columns)]:
        # fill null
        cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0

    output = open(model_path, 'rb')
    clf_model = pickle.load(output)
    output.close()

    clf = clf_model['clf']
    X_test = cfg.dataset_train[clf_model['features_list']]
    y_test = cfg.dataset_train['target']

    y_hat = clf.predict_proba(X_test)[:,1]
    ks = compute_ks(y_hat,y_test)
    print 'global_bt:',cfg.global_bt
    print 'global_gt:', cfg.global_gt
    print 'ks:',ks
    return ks
Example #2
0
def fit_single_lr(dataset_path,
                  config_path,
                  var_list_specfied,
                  out_model_path,
                  c=0.01):
    """Fit one L1-regularized logistic regression and pickle the result.

    :param dataset_path: csv with the training data (must contain 'target')
    :param config_path: csv config with 'is_modelfeature' and 'var_name' columns
    :param var_list_specfied: optional sequence restricting the candidate
        features; an empty sequence means "use every candidate"
    :param out_model_path: file path the model dict is pickled to
    :param c: inverse regularization strength for LogisticRegression
    :return: dict with keys 'clf', 'features_list', 'coefs', 'ks'
    """
    dataset_train = pd.read_csv(dataset_path)
    cfg = pd.read_csv(config_path)
    candidate_var_list = cfg[cfg['is_modelfeature'] == 1]['var_name']

    # keep only features that have no missing values in the training data
    complete_cols = [
        var for var in dataset_train.columns
        if dataset_train[var].isnull().sum() == 0
    ]
    candidate_var_list = list(set(candidate_var_list).intersection(complete_cols))

    if len(var_list_specfied) > 0:
        candidate_var_list = list(
            set(candidate_var_list).intersection(set(var_list_specfied)))

    print('candidate_var_list length:\n', len(candidate_var_list))
    print('candidate_var_list:\n', candidate_var_list)

    # float32 halves memory versus the pandas float64 default
    print('change dtypes:float64 to float32')
    for var in candidate_var_list:
        dataset_train[var] = dataset_train[var].astype(np.float32)

    # rows with target < 0 are excluded — presumably unlabeled samples;
    # TODO confirm against the dataset convention
    labeled = dataset_train[dataset_train.target >= 0]
    X_train = labeled[candidate_var_list]
    y_train = labeled['target']

    print('c:', c)
    clf_lr_a = LogisticRegression(C=c,
                                  penalty='l1',
                                  tol=0.01,
                                  class_weight='balanced')

    clf_lr_a.fit(X_train, y_train)
    coefs = clf_lr_a.coef_.ravel().copy()

    # in-sample KS of the fitted model
    proba = clf_lr_a.predict_proba(X_train)[:, 1]
    ks = compute_ks(proba, y_train)

    model = {
        'clf': clf_lr_a,
        'features_list': candidate_var_list,
        'coefs': coefs,
        'ks': ks,
    }

    # 'with' guarantees the output file is closed even if pickling fails
    with open(out_model_path, 'wb') as output:
        pickle.dump(model, output)

    return model
def warm_up(params_list):
    """Train an FTRL model incrementally and return its average KS.

    :param params_list: tuple (X_train, y_train, d, l1, alpha) — d is the
        feature dimension, l1/alpha are FTRL hyper parameters
    :return: mean KS over the 50 incremental training rounds
    """
    X_train, y_train, d, l1, alpha = params_list
    # BUG FIX: materialize the pairs — zip() is a one-shot iterator on
    # Python 3, which the slicing below would break on
    trainset = list(zip(X_train, y_train))
    # initialization
    ks_list = []
    ftrl = FTRL(dim=d, l1=l1, l2=1.0, alpha=alpha, beta=1.0)
    print('%s\tFTRL MODEL TRAINING ROUND:[l1]\t%s\t[alpha]\t%s\t' %
          (time.asctime(time.localtime(time.time())), str(l1), str(alpha)))
    for j in range(50):  # 50 rounds of 10000 samples each
        datasubset = trainset[j * 10000:(j + 1) * 10000]
        ftrl.train(datasubset, verbos=0, max_itr=10000, eta=0.01, epochs=100)
        # NOTE(review): 1/(1+exp(w.x)) is the sigmoid of -w.x, i.e. the
        # negative-class probability under the usual convention — confirm
        # that compute_ks expects this orientation
        y_hat = 1.0 / (1.0 + np.exp(X_train.dot(ftrl.w)))
        ks = compute_ks(y_hat, y_train)
        ks_list.append(ks)
        print('%s\t[l1]\t%s\t[alpha]\t%s\titer=%s\tks:%s' %
              (time.asctime(time.localtime(
                  time.time())), str(l1), str(alpha), str(
                      (j + 1) * 10000), str(ks)))

    print('%s\tFTRL MODEL TRAINING ROUND:[l1]\t%s\t[alpha]\t%s\t[ks_avg]\t%s' %
          (time.asctime(time.localtime(time.time())), str(l1), str(alpha),
           str(sum(ks_list) / len(ks_list))))
    return sum(ks_list) / len(ks_list)
def grid_search_lr_c_validation(
        X_train,
        y_train,
        validation_dataset_list,
        cs=[0.01],
        df_coef_path=False,
        pic_coefpath_title='Logistic Regression Path',
        pic_coefpath=False,
        pic_performance_title='Logistic Regression Performance',
        pic_performance=False):
    """
    grid search optimal hyper parameters c with the best ks performance
    :param X_train: features dataframe
    :param y_train: target
    :param cs: list of c value
    :param df_coef_path: the file path for logistic regression coefficient dataframe
    :param pic_coefpath_title: the pic title for coefficient path picture
    :param pic_coefpath: the file path for coefficient path picture
    :param pic_performance_title: the pic title for ks performance picture
    :param pic_performance: the file path for ks performance picture
    :return: a tuple of c and ks value with the best ks performance
    """
    # init a LogisticRegression model
    clf_l1_LR = LogisticRegression(C=0.1,
                                   penalty='l1',
                                   tol=0.01,
                                   class_weight='balanced')

    print("Computing regularization path ...")
    start = datetime.now()
    print start
    coefs_ = []
    ks = []
    ks_validation1 = []
    ks_validation2 = []
    counter = 0
    for c in cs:
        print 'time: ', time.asctime(time.localtime(
            time.time())), 'counter: ', counter, ' c: ', c
        clf_l1_LR.set_params(C=c)
        clf_l1_LR.fit(X_train, y_train)
        coefs_.append(clf_l1_LR.coef_.ravel().copy())

        proba = clf_l1_LR.predict_proba(X_train)[:, 1]
        validation_proba1 = clf_l1_LR.predict_proba(
            validation_dataset_list[0][X_train.columns])[:, 1]

        ks.append(compute_ks(proba, y_train))
        ks_validation1.append(
            compute_ks(validation_proba1,
                       validation_dataset_list[0]['target']))

        print 'ks:\t', ks[-1], 'ks_validation1:\t', ks_validation1[-1]
        counter += 1

    end = datetime.now()
    print end
    print("This took ", end - start)
    coef_cv_df = pd.DataFrame(coefs_, columns=X_train.columns)
    coef_cv_df['ks'] = ks
    coef_cv_df['ks_validation1'] = ks_validation1
    coef_cv_df['c'] = cs

    if df_coef_path:
        file_name = df_coef_path if isinstance(df_coef_path, str) else None
        coef_cv_df.to_csv(file_name)

    coefs_ = np.array(coefs_)

    fig1 = plt.figure('fig1')
    plt.plot(np.log10(cs), coefs_)
    ymin, ymax = plt.ylim()
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title(pic_coefpath_title)
    plt.axis('tight')
    if pic_coefpath:
        file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
        plt.savefig(file_name)
        plt.close()
    else:
        pass
        # plt.show()
        # plt.close()

    fig2 = plt.figure('fig2')
    plt.plot(np.log10(cs), ks)
    plt.xlabel('log(C)')
    plt.ylabel('ks score')
    plt.title(pic_performance_title)
    plt.axis('tight')
    if pic_performance:
        file_name = pic_performance if isinstance(pic_performance,
                                                  str) else None
        plt.savefig(file_name)
        plt.close()
    else:
        pass
        # plt.show()
        # plt.close()

    flag = coefs_ < 0
    if np.array(ks)[flag.sum(axis=1) == 0].__len__() > 0:
        idx = np.array(ks)[flag.sum(axis=1) == 0].argmax()
    else:
        idx = np.array(ks).argmax()

    return (cs[idx], ks[idx])
Example #5
0
def grid_search_lr_c(X_train,
                     y_train,
                     df_coef_path=False,
                     pic_coefpath_title='Logistic Regression Path',
                     pic_coefpath=False,
                     pic_performance_title='Logistic Regression Performance',
                     pic_performance=False):
    """
    grid search optimal hyper parameters c with the best ks performance
    :param X_train: features dataframe
    :param y_train: target
    :param df_coef_path: the file path for logistic regression coefficient dataframe
    :param pic_coefpath_title: the pic title for coefficient path picture
    :param pic_coefpath: the file path for coefficient path picture
    :param pic_performance_title: the pic title for ks performance picture
    :param pic_performance: the file path for ks performance picture
    :return: a tuple of c and ks value with the best ks performance
    """
    # init a LogisticRegression model
    clf_l1_LR = LogisticRegression(C=0.1,
                                   penalty='l1',
                                   tol=0.01,
                                   class_weight='balanced')
    cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 3)

    print("Computing regularization path ...")
    start = datetime.now()
    print start
    coefs_ = []
    ks = []
    for c in cs:
        clf_l1_LR.set_params(C=c)
        clf_l1_LR.fit(X_train, y_train)
        coefs_.append(clf_l1_LR.coef_.ravel().copy())

        proba = clf_l1_LR.predict_proba(X_train)[:, 1]
        ks.append(compute_ks(proba, y_train))

    end = datetime.now()
    print end
    print("This took ", end - start)
    coef_cv_df = pd.DataFrame(coefs_, columns=X_train.columns)
    coef_cv_df['ks'] = ks
    coef_cv_df['c'] = cs

    if df_coef_path:
        file_name = df_coef_path if isinstance(df_coef_path, str) else None
        coef_cv_df.to_csv(file_name)

    coefs_ = np.array(coefs_)

    fig1 = plt.figure('fig1')
    plt.plot(np.log10(cs), coefs_)
    ymin, ymax = plt.ylim()
    plt.xlabel('log(C)')
    plt.ylabel('Coefficients')
    plt.title(pic_coefpath_title)
    plt.axis('tight')
    if pic_coefpath:
        file_name = pic_coefpath if isinstance(pic_coefpath, str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    fig2 = plt.figure('fig2')
    plt.plot(np.log10(cs), ks)
    plt.xlabel('log(C)')
    plt.ylabel('ks score')
    plt.title(pic_performance_title)
    plt.axis('tight')
    if pic_performance:
        file_name = pic_performance if isinstance(pic_performance,
                                                  str) else None
        plt.savefig(file_name)
    else:
        plt.show()

    flag = coefs_ < 0
    idx = np.array(ks)[flag.sum(axis=1) == 0].argmax()

    return (cs[idx], ks[idx])
Example #6
0
    for j in range(clf_path_list.__len__()):
        # NOTE(review): fragment — i, clf, clf_path_list, clf_ks and
        # ks_matrix are defined before this chunk; the enclosing loop over i
        # is not visible here.
        # build the monthly feature-file path for classifier i / dataset j
        dataset_path = 'E:\\ScoreCard\\cs_model\\gendata\\' + 'cs_m1_pos_woe_transed_rule_20170'+str(i+1) \
                       +'_features_20170'+str(j+1)+'.csv'
        print dataset_path
        dataset_train = pd.read_csv(dataset_path)

        # restrict to the features the persisted classifier was trained on
        X_train = dataset_train
        X_train = X_train[clf['features_name']]
        y_train = dataset_train['target']
        print X_train.describe()

        print 'Checking features dtypes:'
        for var in clf['features_name']:
            # fill null
            X_train.loc[X_train[var].isnull(), (var)] = 0

        # score the dataset with the persisted classifier and record its KS
        proba = clf['classifier'].predict_proba(X_train)[:, 1]
        ks = compute_ks(proba, y_train)
        print ks
        clf_ks.append(ks)
    print 'ks summary: ', clf_path_list[i], '\n', clf_ks
    ks_matrix.append(clf_ks)
print 'ks_matrix:\n', ks_matrix

######################################################
# stop writing to the log file and restore the original stdout stream
log_file.close()
# restore the output to initial pattern
sys.stdout = stdout_backup

print "Now this will be presented on screen"
        # cpd
        for j in range(30):
            # NOTE(review): fragment — dataset_woe_transed, clf and
            # model_ks_dict come from code outside this chunk.
            print 'cpd:',j
            # slice the samples belonging to cpd bucket j+1
            subset = dataset_woe_transed[dataset_woe_transed['cpd']==(j+1)]

            # only the first 18 model features are used here — presumably
            # deliberate; confirm against the full features_name list
            X_train = subset[clf['features_name'][:18]].values
            y_train = subset['target'].values
            model_ks_dict[j+1]['y_train'].extend(y_train)

            # accumulate predictions per bucket; KS is computed later in bulk
            proba = clf['classifier'].predict_proba(X_train)[:,1]
            # ks = compute_ks(proba,y_train)
            model_ks_dict[j+1]['y_hat'].extend(proba)


    for j in range(30):
        # per-bucket KS plus sample counts and bad-sample rates
        ks = compute_ks(np.array(model_ks_dict[j+1]['y_hat']),np.array(model_ks_dict[j+1]['y_train']))
        sample_cnt = model_ks_dict[j+1]['y_train'].__len__()
        bad_sample_cnt = sum(model_ks_dict[j+1]['y_train'])
        bad_sample_rate = bad_sample_cnt*1.0/sample_cnt
        print('cs_cpd:%s\tsample_cnt:%s\tbad_sample_cnt:%s\tbad_sample_rate:%s\tks:%s'
              % (str(j+1),str(sample_cnt),str(bad_sample_cnt),str(bad_sample_rate),str(ks)))

    # flatten the per-bucket results into parallel vectors
    v_cpd = []
    v_y_train = []
    v_y_proba = []
    for j in range(30):
        v_cpd.extend([j]*model_ks_dict[j+1]['y_train'].__len__())
        v_y_train.extend(model_ks_dict[j+1]['y_train'])
        v_y_proba.extend(model_ks_dict[j+1]['y_hat'])

    ks_test = pd.DataFrame()
# NOTE(review): fragment — enc, X_train and y_train come from code outside
# this chunk; enc appears to be a fitted sparse encoder (transform().toarray())
# — confirm against its construction site.
X_train_enc = enc.transform(X_train).toarray()
trainset = zip(X_train_enc, y_train)
d = X_train_enc.shape[1]  # encoded feature dimension
ks_avg_list = []

# baseline: L1 logistic regression on the raw (un-encoded) features
clf_l1_LR = LogisticRegression(C=0.01,
                               penalty='l1',
                               tol=0.01,
                               class_weight='balanced')
print '[START]', time.asctime(time.localtime(time.time()))
clf_l1_LR.fit(X_train, y_train)
print '[END]', time.asctime(time.localtime(time.time()))

y_hat = clf_l1_LR.predict_proba(X_train)[:, 1]
ks = compute_ks(y_hat, y_train)
print ks
# 0.258477461319

# comparison: the same model fitted on the encoded features
clf_l1_LR = LogisticRegression(C=0.01,
                               penalty='l1',
                               tol=0.01,
                               class_weight='balanced')
print '[START]', time.asctime(time.localtime(time.time()))
clf_l1_LR.fit(X_train_enc, y_train)
print '[END]', time.asctime(time.localtime(time.time()))

y_hat = clf_l1_LR.predict_proba(X_train_enc)[:, 1]
ks = compute_ks(y_hat, y_train)
print ks
# 0.287423006797
Example #9
0
# NOTE(review): fragment — woe_train_path3, woe_test_path3, var_list_specfied
# and out_model_path are defined outside this chunk.
fit_single_lr(woe_train_path3, config_path, var_list_specfied, out_model_path)

# reload the config against the held-out test dataset
config_path = r'E:\Code\Python_ML_Code\cs_model\config\config_cs_model.csv'
cfg = config.config()
cfg.load_file(config_path, woe_test_path3)

# fill nulls with 0 for every binned feature present in the dataset
for var in [
        tmp for tmp in cfg.bin_var_list
        if tmp in list(cfg.dataset_train.columns)
]:
    # fill null
    cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0

# same null handling for the discrete features
for var in [
        tmp for tmp in cfg.discrete_var_list
        if tmp in list(cfg.dataset_train.columns)
]:
    # fill null
    cfg.dataset_train.loc[cfg.dataset_train[var].isnull(), (var)] = 0

# load the model just written by fit_single_lr
output = open(out_model_path, 'rb')
clf_model = pickle.load(output)
output.close()

# score the test dataset with the fitted model's feature list
clf = clf_model['clf']
X_test = cfg.dataset_train[clf_model['features_list']]
y_test = cfg.dataset_train['target']

y_hat = clf.predict_proba(X_test)[:, 1]
ks = compute_ks(y_hat, y_test)