Example #1
0
def select_model(train_x, train_y, test_x, test_y):
    """Grid-search ``n_estimators`` for an XGBoost classifier and print KS.

    A 5-fold ``GridSearchCV`` is run over ``n_estimators`` with every other
    hyper-parameter fixed.  The best parameters and best CV score are
    printed, then ``run_ks`` is evaluated on the train and test sets using
    the refitted best estimator.

    Args:
        train_x: Training feature matrix passed to ``GridSearchCV.fit``.
        train_y: Training labels.
        test_x: Hold-out feature matrix.
        test_y: Hold-out labels.

    Returns:
        None.  Results are printed to stdout.
    """
    cv_params = {'n_estimators': [50, 100, 200, 300, 400]}
    other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3, 'min_child_weight': 20, 'gamma': 0,
                    'subsample': 0.8, 'colsample_bytree': 1.0, 'reg_alpha': 0, 'reg_lambda': 1,
                    'silent': 1, 'scale_pos_weight': 1, 'seed': 0}

    model = xgb.XGBClassifier(**other_params)
    # NOTE(review): 'r2' is a regression metric; for a classifier something
    # like 'roc_auc' is probably intended.  Left unchanged because switching
    # it alters which n_estimators is selected -- confirm with the owner.
    optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=4)
    optimized_XGB.fit(train_x, train_y)
    print("参数的最佳取值:{0}".format(optimized_XGB.best_params_))
    print("最佳模型得分:{0}".format(optimized_XGB.best_score_))

    # Score with the positive-class probability (column 1) rather than hard
    # class labels, matching how the other evaluation helpers in this file
    # call run_ks.
    predict_train_y = optimized_XGB.predict_proba(train_x)[:, 1]
    train_ks = run_ks(train_y, predict_train_y)

    predict_test_y = optimized_XGB.predict_proba(test_x)[:, 1]
    test_ks = run_ks(test_y, predict_test_y)

    print(train_ks)
    # The test-set result used to be computed and then silently dropped.
    print(test_ks)
Example #2
0
def test_xgb_noin(data,
                  fea_list_path,
                  in_fea,
                  model_path,
                  filter_name=['name', 'idcard', 'phone', 'loan_dt', 'label']):
    """Score a saved booster with some model features forced to zero.

    Every feature listed in ``fea_list_path`` but absent from ``in_fea`` is
    overwritten with 0 before prediction.  The resulting KS value is printed.

    Args:
        data: Path to a tab-separated file that contains a ``label`` column.
        fea_list_path: Path to a text file with one model feature name per
            line (blank lines ignored).  ``""`` means no features.
        in_fea: Path to a text file listing the features to keep as-is, one
            per line.  ``""`` means none, so every model feature is zeroed.
        model_path: Path of the saved XGBoost booster.
        filter_name: Column names removed from the frame before features are
            selected.  Only read, never mutated.

    Returns:
        None.  The feature counts and the KS value are printed to stdout.
    """
    df = pd.read_csv(data, sep='\t')
    df_label = df['label']
    # Drop identifier/label columns without mutating the frame in place.
    df = df.loc[:, ~df.columns.isin(list(filter_name))]

    fea_list = []
    if fea_list_path != "":
        with open(fea_list_path) as fin:
            fea_list = [x.strip() for x in fin if len(x.strip()) > 0]
    print(len(fea_list))

    in_fea_list = []
    if in_fea != "":
        with open(in_fea) as fin:
            in_fea_list = [x.strip() for x in fin if len(x.strip()) > 0]
    print(len(in_fea_list))

    # Model features that are not in the "keep" list.
    ss = list(set(fea_list) - set(in_fea_list))
    print(len(ss))

    # Copy so the zero-fill below writes to an owned frame, not a slice.
    df = df[fea_list].copy()
    for i in ss:
        df[i] = 0

    # Discard rows whose selected features are all missing; the same mask is
    # applied to the labels so both stay aligned.
    keep = ~df.isnull().all(axis=1)
    test_x = df[keep]
    test_Y = df_label[keep]

    clf_xgb = xgb.Booster()
    clf_xgb.load_model(model_path)
    print("Read test done")
    dTest = xgb.DMatrix(test_x)
    y_proba = clf_xgb.predict(dTest)
    ks_ = run_ks(test_Y, y_proba)
    print("%f" % (ks_['ks']))
Example #3
0
def test_xgb(test_tbl, xgb_model, train_list):
    """Evaluate a fitted classifier on a test table and report AUC and KS.

    Args:
        test_tbl: Value handed to ``data_preprocess``; it is also used as the
            prefix of the KS-detail output name
            (``test_tbl + '_score_ks_detail'``).
        xgb_model: Fitted classifier exposing ``predict_proba``.
        train_list: Feature names, in the column order used for training.

    Returns:
        None.  Prints ``"<auc>\\t<ks>"`` and passes the KS detail to
        ``print_ks``.
    """
    df_test_x, df_test_y, _f_list_test, _df_median = data_preprocess(test_tbl)
    # Restrict to, and order by, the training features in one selection.
    df_test_x = df_test_x[list(train_list)]
    print('Read test done')

    test_y = np.array(df_test_y)
    test_x = np.array(df_test_x)
    # Column 1 of predict_proba is the positive-class probability; slice it
    # once and reuse it for both metrics.
    pos_proba = xgb_model.predict_proba(test_x)[:, 1]

    ks_dict = run_ks(test_y, pos_proba)
    auc = roc_auc_score(test_y, pos_proba)
    print("%f\t%f" % (auc, ks_dict['ks']))
    print_ks(ks_dict, test_tbl + '_score_ks_detail')
Example #4
0
def test_xgb(data,
             fea_list_path,
             model_path,
             filter_name=['name', 'idcard', 'phone', 'loan_dt', 'label']):
    """Score a tab-separated data file with a saved XGBoost booster.

    Prints the raw predictions and ``"<auc>\\t<ks>"``, passes the KS detail
    to ``print_ks`` under ``data + '_score_ks_detail'`` and writes the scored
    rows, with an added ``prob`` column, to ``data + '_prob_'``.

    Args:
        data: Path to a tab-separated file that contains a ``label`` column.
        fea_list_path: Path to a text file with one model feature name per
            line (blank lines ignored).  ``""`` means no features.
        model_path: Path of the saved XGBoost booster.
        filter_name: Column names removed from the frame before features are
            selected.  Only read, never mutated.

    Returns:
        None.
    """
    df = pd.read_csv(data, sep='\t')
    df_label = df['label']
    # NOTE(review): the previous code aliased the output frame to the input
    # frame and then deleted columns in place, so the written file has never
    # contained the filter_name columns.  That output is preserved here;
    # confirm whether the identifier columns were meant to be kept.
    df_out = df.loc[:, ~df.columns.isin(list(filter_name))]

    fea_list = []
    if fea_list_path != "":
        with open(fea_list_path) as fin:
            fea_list = [x.strip() for x in fin if len(x.strip()) > 0]

    df = df_out[fea_list]
    # Discard rows whose selected features are all missing; one mask keeps
    # features, labels and output rows aligned.
    keep = ~df.isnull().all(axis=1)
    test_x = df[keep]
    test_Y = df_label[keep]
    # Copy so that adding 'prob' below writes to an owned frame.
    df_out = df_out[keep].copy()

    clf_xgb = xgb.Booster()
    clf_xgb.load_model(model_path)
    print("Read test done")
    dTest = xgb.DMatrix(test_x)
    y_proba = clf_xgb.predict(dTest)
    print(y_proba)

    ks_ = run_ks(test_Y, y_proba)
    auc_ = roc_auc_score(test_Y, y_proba)
    print("%f\t%f" % (auc_, ks_['ks']))

    df_out['prob'] = y_proba
    print_ks(ks_, data + r'_score_ks_detail')
    df_out.to_csv(data + r'_prob_', index=False, sep='\t')
Example #5
0
def xgb_model(sample_tbl, model_path,
              xgb_params={'nthread': 4, 'n_estimators': 80, 'max_depth': 3,
                          'min_child_weight': 2, 'gamma': 0.1, 'subsample': 0.4, 'learning_rate': 0.06,
                          'colsample_bytree': 0.5, 'scale_pos_weight': 1, 'seed': 100}):
    """Train an XGBoost booster with 5-fold stratified CV and report KS/AUC.

    For each fold a booster is trained, saved (``clf_<k>``), dumped
    (``dump_raw_<k>``) and its feature importances are written to
    ``fip_<k>``.  After the loop the artefacts of the last fold are saved
    again as ``clf``, ``dump_raw`` and ``fip``, the per-fold metrics are
    written to ``ks_auc``, and the mean and standard deviation are printed.

    Args:
        sample_tbl: 3-tuple ``(df_x, df_y, df_name)``; ``df_x`` and ``df_y``
            are converted with ``np.array``.
        model_path: Directory that receives all output files.
        xgb_params: Booster parameters.  ``'n_estimators'`` is required and
            is used as the number of boosting rounds.  Only read, never
            mutated.

    Returns:
        None.
    """
    # Materialise the items so xgb.train receives a real list on both
    # Python 2 and Python 3 instead of a dict view.
    plst = list(xgb_params.items())
    num_round = xgb_params['n_estimators']
    f_score = os.path.join(model_path, "score")
    f_write = os.path.join(model_path, "fip")

    # NOTE(review): df_name is unpacked but never used, while f_list (used
    # below to map booster feature keys to names) is not defined in this
    # function -- presumably a module-level feature-name list; verify.
    df_x, df_y, df_name = sample_tbl
    x = np.array(df_x)
    y = np.array(df_y)
    # A tree of depth n has 2**(n+1)-1 nodes, of which 2**n are leaves.
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    auc_list = []
    auc_list_train = []
    ks_list = []
    ks_list_train = []
    for times, (train_index, dev_index) in enumerate(kf.split(x, y), 1):
        x_train, x_dev = x[train_index], x[dev_index]
        y_train, y_dev = y[train_index], y[dev_index]

        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_dev, label=y_dev)

        clf_xgb = xgb.train(plst, dtrain, num_boost_round=num_round)

        clf_xgb.save_model(os.path.join(model_path, 'clf_' + str(times)))
        clf_xgb.dump_model(os.path.join(model_path, 'dump_raw_' + str(times)))

        f_weights_dict = clf_xgb.get_score(importance_type='weight')
        f_gains_dict = clf_xgb.get_score(importance_type='gain')
        f_covers_dict = clf_xgb.get_score(importance_type='cover')
        # Keys are parsed as one leading character followed by a column
        # index (f_key[1:]), which is looked up in f_list.
        fea_analysis = [
            {'feature': f_list[int(f_key[1:])],
             'weight': f_weights_dict[f_key],
             'gain': f_gains_dict[f_key],
             'cover': f_covers_dict[f_key]}
            for f_key in f_weights_dict
        ]
        fea_analysis_df = pd.DataFrame(fea_analysis, columns=['feature', 'weight', 'gain', 'cover'])
        fea_analysis_df.sort_values(['gain'], ascending=False, inplace=True)
        fea_analysis_df.to_csv(f_write + "_" + str(times), index=None, sep='\t')

        y_proba = clf_xgb.predict(dvalid)
        y_proba_train = clf_xgb.predict(dtrain)

        ks_dict = run_ks(y_dev, y_proba)
        ks_dict_train = run_ks(y_train, y_proba_train)

        auc = roc_auc_score(y_dev, y_proba)
        auc_train = roc_auc_score(y_train, y_proba_train)

        ks_list.append(ks_dict['ks'])
        auc_list.append(auc)
        ks_list_train.append(ks_dict_train['ks'])
        auc_list_train.append(auc_train)
        print_ks(ks_dict, f_score)

    # The un-suffixed artefacts come from the last fold's booster.
    fea_analysis_df.to_csv(f_write, index=None, sep='\t')
    clf_xgb.save_model(os.path.join(model_path, "clf"))
    clf_xgb.dump_model(os.path.join(model_path, "dump_raw"))

    dd = {
        'train_ks': ks_list_train,
        'valida_ks': ks_list,
        'train_auc': auc_list_train,
        'valida_auc': auc_list,
    }
    train_ks_df = pd.DataFrame(dd)
    train_ks_df.to_csv(os.path.join(model_path, "ks_auc"), index=False, sep='\t')

    # The values printed as "var" are standard deviations (np.std); the
    # labels are kept as-is so existing log consumers keep working.
    ks_mean = np.mean(ks_list)
    ks_std = np.std(ks_list)
    auc_mean = np.mean(auc_list)
    auc_std = np.std(auc_list)
    ks_mean_train = np.mean(ks_list_train)
    ks_std_train = np.std(ks_list_train)
    auc_mean_train = np.mean(auc_list_train)
    auc_std_train = np.std(auc_list_train)

    print('train: ')
    print("ks mean: %f, ks var: %f" % (ks_mean_train, ks_std_train))
    print("auc mean: %f, auc var: %f" % (auc_mean_train, auc_std_train))
    print('validation:')
    print("ks mean: %f, ks var: %f" % (ks_mean, ks_std))
    print("auc mean: %f, auc var: %f" % (auc_mean, auc_std))
Example #6
0
def train_xgb(
    sample_data,
    model_path,
    params={
        'learning_rate': 0.1,
        'n_estimators': 50,
        'max_depth': 3,
        'min_child_weight': 20,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 1.0,
        'reg_alpha': 0,
        'reg_lambda': 1,
        'silent': 1,
        'scale_pos_weight': 1,
        'seed': 0
    }):
    """Train one XGBoost booster and log train/validation KS and AUC.

    The booster is saved to ``<model_path>/model_clf``, feature importances
    are written to ``<model_path>/fip`` and one tab-separated result line
    (train_ks, valid_ks, train_auc, valid_auc, JSON-encoded params) is
    appended to ``<model_path>/result``.

    Args:
        sample_data: 5-tuple ``(train_x, train_Y, valid_x, valid_Y, f_list)``.
            ``f_list`` is unpacked but not used here.
        model_path: Directory that receives the output files.
        params: Booster parameters.  ``'n_estimators'`` is required and is
            used as the number of boosting rounds.  Only read, never mutated.

    Returns:
        None.
    """
    train_x, train_Y, valid_x, valid_Y, f_list = sample_data

    # Materialise the items so xgb.train receives a real list on both
    # Python 2 and Python 3 instead of a dict view.
    plst = list(params.items())
    dTrain = xgb.DMatrix(train_x, label=train_Y)
    dValid = xgb.DMatrix(valid_x, label=valid_Y)
    num_round = params['n_estimators']
    model_clf = xgb.train(plst, dTrain, num_round)
    model_clf.save_model(os.path.join(model_path, "model_clf"))

    f_weights_dict = model_clf.get_score(importance_type='weight')
    f_gains_dict = model_clf.get_score(importance_type='gain')
    f_cover_dict = model_clf.get_score(importance_type='cover')
    fea_analysis = [
        {
            'feature': f_key,
            'weight': f_weights_dict[f_key],
            'gain': f_gains_dict[f_key],
            'cover': f_cover_dict[f_key],
        }
        for f_key in f_weights_dict
    ]
    fea_analysis_df = pd.DataFrame(
        fea_analysis, columns=['feature', 'weight', 'gain', 'cover'])
    fea_analysis_df.to_csv(os.path.join(model_path, 'fip'))

    Y_train_prob = model_clf.predict(dTrain)
    Y_valid_prob = model_clf.predict(dValid)

    train_ks = run_ks(train_Y, Y_train_prob)['ks']
    print("train_ks")
    print(train_ks)
    valid_ks = run_ks(valid_Y, Y_valid_prob)['ks']
    print("valid_ks")
    print(valid_ks)
    train_auc = roc_auc_score(train_Y, Y_train_prob)
    valid_auc = roc_auc_score(valid_Y, Y_valid_prob)
    print("train_auc")
    print(train_auc)
    print("valid_auc")
    print(valid_auc)

    str_ = '\t'.join([
        str(train_ks),
        str(valid_ks),
        str(train_auc),
        str(valid_auc),
        json.dumps(params),
    ]) + '\n'
    print(str_)
    # Append mode keeps one line per run; the context manager guarantees the
    # line is flushed and the handle closed even if the write fails.
    with open(os.path.join(model_path, 'result'), 'a') as fw:
        fw.write(str_)
    print("run success!")