예제 #1
0
def main():
    """Train a model on a fixed feature subset and print the generated code.

    Loads the train/test frames, converts the date columns to integers,
    builds the samples through ``getTrainTestSampleImportance`` using a
    hard-coded feature list, trains the model, passes it to ``showPicture``
    and prints the three sentences returned by ``CodeGenerateXgb.create``.
    """
    # ParseData.loadPartData() is the alternative loader (currently disabled).
    train_df, test_df = ParseData.loadData()

    # Categorical columns are collected before the date columns are converted.
    categorical_cols = baseline.getFeatureCategorical(train_df)

    # Date feature handling: the date columns are detected on the training
    # frame and the same column list is applied to both frames.
    date_cols = baseline.getFeatureDate(train_df)
    train_df = baseline.parseDateToInt(train_df, date_cols)
    test_df = baseline.parseDateToInt(test_df, date_cols)

    # Alternative feature list (currently disabled):
    #   'querydate', 'nasrdw_recd_date', 'var_jb_43', 'opendate_3',
    #   'var_jb_94', 'opendate_4', 'var_jb_23', 'var_jb_15',
    #   'creditlimitamount_4', 'var_jb_73'
    selected_features = [
        'nasrdw_recd_date',
        'var_jb_43',
        'var_jb_94',
        'creditlimitamount_4',
        'var_jb_15',
        'var_jb_23',
        'creditlimitamount_3',
        'var_jb_73',
        'var_jb_25',
        'var_jb_22',
    ]

    samples = getTrainTestSampleImportance(
        train_df, test_df, categorical_cols, selected_features)
    X_train, y_train, X_test, y_test = samples

    model = trainModel(X_train, y_train, X_test, y_test)
    picture = showPicture(model, 0)

    train_sentence, test_sentence, seg_sentence = CodeGenerateXgb.create(
        picture)
    for sentence in (train_sentence, test_sentence, seg_sentence):
        print(sentence)
예제 #2
0
def main():
    """Fit a tree regressor on a fixed feature list and print generated code.

    Loads the train/test frames, converts the date columns to integers,
    trains through ``train_tree_regressor``, renders and shows the picture,
    then prints the three sentences returned by
    ``CodeGenerateDecisionTree.create``.
    """
    # ParseData.loadPartData() is the alternative loader (currently disabled).
    train_df, test_df = ParseData.loadData()

    # Date feature handling: the date columns are detected on the training
    # frame and the same column list is applied to both frames.
    date_cols = StandardVersion.getFeatureDate(train_df)
    train_df = StandardVersion.parseDateToInt(train_df, date_cols)
    test_df = StandardVersion.parseDateToInt(test_df, date_cols)

    # Alternative feature list (currently disabled):
    #   'querydate', 'nasrdw_recd_date', 'var_jb_43', 'opendate_3',
    #   'var_jb_94', 'opendate_4', 'var_jb_23', 'var_jb_15',
    #   'creditlimitamount_4', 'var_jb_73'
    features = [
        'nasrdw_recd_date',
        'var_jb_43',
        'var_jb_94',
        'creditlimitamount_4',
        'var_jb_15',
        'var_jb_23',
        'creditlimitamount_3',
        'var_jb_73',
        'var_jb_25',
        'var_jb_22',
    ]

    x, tree_model = train_tree_regressor(train_df, features)
    show_picture(make_picture(x, tree_model))

    train_sentence, test_sentence, seg_sentence = (
        CodeGenerateDecisionTree.create(tree_model, features))
    for sentence in (train_sentence, test_sentence, seg_sentence):
        print(sentence)
예제 #3
0
def trainMultiModel(train_list, test_list, feature_categorical):
    """Train one model per (train, test) split and pool the predictions.

    For every index ``i`` the pair ``train_list[i]`` / ``test_list[i]`` is
    turned into samples by ``baseline.getTrainTestSample`` and fitted with
    ``baseline.trainModel``. Splits whose test frame has no rows are skipped.
    After each trained split the AUC of all pooled predictions so far is
    printed.

    Args:
        train_list: Sequence of training frames, one per split.
        test_list: Sequence of test frames, indexed like ``train_list``.
        feature_categorical: Passed through to
            ``baseline.getTrainTestSample``.

    Returns:
        tuple: ``(all_pred, all_test, model_list, pred_list, true_list)``.
        ``all_pred`` / ``all_test`` are the predictions and labels of all
        trained splits stacked with ``np.hstack``; the three lists hold the
        per-split model, predictions and labels. When every split is skipped
        all five values are empty lists.

    Raises:
        IndexError: If ``test_list`` is shorter than ``train_list``.
    """
    model_list = []
    pred_list = []
    true_list = []
    # Both accumulators are bound up front so the return statement is valid
    # even when every split is skipped (previously ``all_test`` was unbound).
    all_pred = []
    all_test = []

    for i, df_train in enumerate(train_list):
        df_test = test_list[i]
        print('%d.训练样本%s,测试样本%s' % (i, df_train.shape, df_test.shape))
        # No test rows for this split: nothing to predict, skip it.
        if df_test.shape[0] == 0:
            continue

        X_train, y_train, X_test, y_test = baseline.getTrainTestSample(
            df_train, df_test, feature_categorical)

        gbm, y_pred = baseline.trainModel(X_train, y_train, X_test, y_test)
        model_list.append(gbm)
        pred_list.append(y_pred)
        true_list.append(y_test)

        if len(all_pred) == 0:
            # First trained split: adopt its arrays as the pooled result.
            all_pred = y_pred
            all_test = y_test
        else:
            all_pred = np.hstack((all_pred, y_pred))
            all_test = np.hstack((all_test, y_test))

        print('The auc score is:', roc_auc_score(all_test, all_pred))

    return all_pred, all_test, model_list, pred_list, true_list
예제 #4
0
def getKmeansAllFeature(df_train, df_test, n_components=4):
    """Cluster both frames with KMeans and attach the cluster id as features.

    The clustering input is ``Tools.iv_more_than_point_one`` followed by
    ``Tools.feature_categorical``; when ``ParseData.TYPE`` is
    ``'OOT_noDate'`` the columns in ``Tools.feature_date`` are removed from
    both lists. Missing values are replaced by -99999, the categorical
    columns go through ``StandardVersion.proprocessCateory`` and the first
    list through ``Tools.apply_log1p_transformation``.

    The model is loaded from ``KmeansAllFeature<n_components>.model`` when
    ``ParseData.existModel`` reports it, otherwise it is fitted on the
    training data and saved under that name.

    Args:
        df_train: Training frame; gains a ``'KmeansAll'`` column in place.
        df_test: Test frame; gains a ``'KmeansAll'`` column in place.
        n_components: Number of clusters (also part of the model file name).

    Returns:
        The ``(df_train, df_test, column_name)`` triple produced by
        ``StandardVersion.cateToOneHot``.
    """
    import StandardVersion
    print('in %s' % sys._getframe().f_code.co_name)

    # Columns used for clustering.
    iv_cols = Tools.iv_more_than_point_one
    categorical_cols = Tools.feature_categorical
    if ParseData.TYPE == 'OOT_noDate':
        iv_cols = list(set(iv_cols) - set(Tools.feature_date))
        categorical_cols = list(set(categorical_cols) - set(Tools.feature_date))
    cluster_cols = iv_cols + categorical_cols

    # Missing values become the sentinel -99999 (a per-column mean fill for
    # the first list existed as disabled code), then categorical
    # preprocessing, then the log1p transformation.
    train_x, test_x = [
        frame[cluster_cols].fillna(-99999) for frame in (df_train, df_test)
    ]
    train_x, test_x = [
        StandardVersion.proprocessCateory(frame, categorical_cols)
        for frame in (train_x, test_x)
    ]
    train_x, test_x = [
        Tools.apply_log1p_transformation(frame, iv_cols)
        for frame in (train_x, test_x)
    ]

    # NOTE(review): the file name only encodes n_components, not the column
    # set, so a model cached under a different ParseData.TYPE would be
    # reused here -- confirm this is intended.
    model_file = 'KmeansAllFeature%d.model' % n_components
    if not ParseData.existModel(model_file):
        print('开始训练kmeans模型..')
        kmeans = KMeans(n_clusters=n_components).fit(train_x)
        print('训练完毕')
        ParseData.saveModel(kmeans, model_file)
    else:
        print('加载KmeansAllFeature文件..')
        kmeans = ParseData.loadModel(model_file)

    train_labels = kmeans.predict(train_x)
    test_labels = kmeans.predict(test_x)
    df_train['KmeansAll'] = train_labels.tolist()
    df_test['KmeansAll'] = test_labels.tolist()

    print(df_train['KmeansAll'].head())

    # Convert the cluster id via cateToOneHot (original note: one-hot, 4 columns).
    encoded_train, encoded_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['KmeansAll'], 'KmeansAllFeature')
    return encoded_train, encoded_test, column_name
예제 #5
0
def getKmediodAllFeature(df_train, df_test, n_components=4):
    """Cluster both frames with ``kmedoids.KMediod`` and attach the cluster id.

    The clustering input is ``Tools.iv_more_than_point_one`` followed by
    ``Tools.feature_categorical``. Missing values are replaced by -99999,
    the categorical columns go through ``StandardVersion.proprocessCateory``
    and the first list through ``Tools.apply_log1p_transformation``.

    The model is loaded from ``KmediodAllFeature<n_components>.model`` when
    ``ParseData.existModel`` reports it, otherwise it is fitted on the
    training data and saved under that name.

    Args:
        df_train: Training frame; gains a ``'KmediodAll'`` column in place.
        df_test: Test frame; gains a ``'KmediodAll'`` column in place.
        n_components: Passed to ``kmedoids.KMediod`` and used in the model
            file name.

    Returns:
        The ``(df_train, df_test, column_name)`` triple produced by
        ``StandardVersion.cateToOneHot``.
    """
    import StandardVersion
    print('in %s' % sys._getframe().f_code.co_name)

    # Columns used for clustering.
    categorical_cols = Tools.feature_categorical
    iv_cols = Tools.iv_more_than_point_one
    cluster_cols = iv_cols + categorical_cols

    # Sentinel fill, categorical preprocessing, then log1p -- each step is
    # applied to the training frame first and the test frame second.
    train_x, test_x = [
        frame[cluster_cols].fillna(-99999) for frame in (df_train, df_test)
    ]
    train_x, test_x = [
        StandardVersion.proprocessCateory(frame, categorical_cols)
        for frame in (train_x, test_x)
    ]
    train_x, test_x = [
        Tools.apply_log1p_transformation(frame, iv_cols)
        for frame in (train_x, test_x)
    ]

    # Reuse the persisted model when available, otherwise fit and persist.
    model_file = 'KmediodAllFeature%d.model' % n_components
    if not ParseData.existModel(model_file):
        print('开始训练kmediod模型..')
        kmediod = kmedoids.KMediod(n_components).fit(train_x)
        print('训练完毕')
        ParseData.saveModel(kmediod, model_file)
    else:
        print('加载KmediodAllFeature文件..')
        kmediod = ParseData.loadModel(model_file)

    train_labels = kmediod.predict(train_x)
    test_labels = kmediod.predict(test_x)
    df_train['KmediodAll'] = train_labels.tolist()
    df_test['KmediodAll'] = test_labels.tolist()

    print(df_train['KmediodAll'].head())

    # Convert the cluster id via cateToOneHot (original note: one-hot, 4 columns).
    encoded_train, encoded_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['KmediodAll'], 'KmediodAllFeature')
    return encoded_train, encoded_test, column_name
예제 #6
0
def getGMMCategoryFeature(df_train, df_test, n_components=4):
    """Fit a ``GMM`` on the selected columns and attach the component id.

    The model input is ``Tools.feature_categorical`` followed by
    ``Tools.iv_more_than_point_one``; missing values are replaced by -99999
    and the categorical columns go through
    ``StandardVersion.proprocessCateory``.

    The model file name contains both ``n_components`` and the number of
    input columns. An existing file is loaded, otherwise the model is fitted
    on the training data (``reg_covar=0.0001``) and saved.

    Args:
        df_train: Training frame; gains a ``'gmm'`` column in place.
        df_test: Test frame; gains a ``'gmm'`` column in place.
        n_components: Number of mixture components.

    Returns:
        The ``(df_train, df_test, column_name)`` triple produced by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    categorical_cols = Tools.feature_categorical
    print(categorical_cols)

    model_cols = categorical_cols + Tools.iv_more_than_point_one
    print('特征数量:%d' % (len(model_cols)))

    # Sentinel fill on a private copy, then categorical preprocessing.
    train_x = df_train[model_cols].copy()
    test_x = df_test[model_cols].copy()
    train_x = train_x.fillna(-99999)
    test_x = test_x.fillna(-99999)
    train_x = StandardVersion.proprocessCateory(train_x, categorical_cols)
    test_x = StandardVersion.proprocessCateory(test_x, categorical_cols)

    # The column count is part of the cache key.
    n_features = train_x.shape[1]
    model_file = 'GMMCategoryFeature%d_%d.model' % (n_components, n_features)

    if not ParseData.existModel(model_file):
        print('开始对类别特征训练GMM模型...')
        # reg_covar is tunable; 0.0001 is the value used here.
        gmm = GMM(n_components=n_components, reg_covar=0.0001).fit(train_x)
        print('训练完毕')
        ParseData.saveModel(gmm, model_file)
    else:
        print('加载GMMCategoryFeature文件..')
        gmm = ParseData.loadModel(model_file)

    train_labels = gmm.predict(train_x)
    test_labels = gmm.predict(test_x)
    df_train['gmm'] = train_labels.tolist()
    df_test['gmm'] = test_labels.tolist()

    # Convert the component id via cateToOneHot; the raw 'gmm' column is not
    # dropped here.
    encoded_train, encoded_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['gmm'], 'GMMCategoryFeature')
    return encoded_train, encoded_test, column_name
예제 #7
0
def nullCountcut(df_train, df_test):
    """Bin the per-row count of missing values and attach it as features.

    Stores the number of null cells of every row in a ``'nunNum'`` column,
    derives cut points from the training frame with ``mono_bin`` (against
    the ``bad`` column), applies them to both frames with ``fenxiang`` into
    a ``'null_count'`` column and hands that column to
    ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame with a ``bad`` column; modified in place.
        df_test: Test frame; modified in place.

    Returns:
        The ``(df_train, df_test, column_name)`` triple produced by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    # Row-wise number of missing cells.
    train_null_counts = df_train.isnull().sum(axis=1)
    df_train['nunNum'] = train_null_counts.tolist()
    test_null_counts = df_test.isnull().sum(axis=1)
    df_test['nunNum'] = test_null_counts.tolist()

    # Only the cut points of the mono_bin result are used.
    _, _, cut_points, _ = mono_bin(df_train['bad'], df_train['nunNum'])
    df_train['null_count'] = fenxiang(df_train, 'nunNum', cut_points)
    df_test['null_count'] = fenxiang(df_test, 'nunNum', cut_points)

    # Convert the bin via cateToOneHot (original note: one-hot, 4 columns).
    encoded_train, encoded_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['null_count'], 'null_seg')
    return encoded_train, encoded_test, column_name
예제 #8
0
def getGMMNullFeature(df_train, df_test, n_components=4):
    """Cluster the missing-value pattern of each row with a ``GMM``.

    Every cell is replaced by an indicator (1 where the value is missing,
    0 otherwise), the indicator matrix is reduced to 10 components with PCA
    and a ``GMM`` is fitted on the reduced training data. Both the PCA and
    the GMM are loaded from disk when ``ParseData.existModel`` reports a
    saved model, otherwise they are fitted and saved.

    Args:
        df_train: Training frame; gains a ``'gmmNull'`` column in place.
        df_test: Test frame; gains a ``'gmmNull'`` column in place.
        n_components: Number of mixture components (also part of the GMM
            model file name).

    Returns:
        The ``(df_train, df_test, column_name)`` triple produced by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    # Missing -> 1, present -> 0. isnull() already returns a fresh boolean
    # frame, so no defensive copy of the inputs is needed and no placeholder
    # value has to be written into non-numeric columns.
    print('正在将缺失值设为1 非缺失值设为0..')
    df_train_null = df_train.isnull().astype(int)
    df_test_null = df_test.isnull().astype(int)

    # Load or fit the PCA used for dimensionality reduction.
    pca_component = 10
    pca_file = 'PCANullFeature%d.model' % pca_component
    if ParseData.existModel(pca_file):
        print('加载PCANullFeature文件..')
        pca = ParseData.loadModel(pca_file)
    else:
        pca = PCA(n_components=pca_component)
        pca.fit(df_train_null.values)
        ParseData.saveModel(pca, pca_file)

    df_train_null = pca.transform(df_train_null.values)
    df_test_null = pca.transform(df_test_null.values)
    print(df_train_null.shape)
    print(df_test_null.shape)

    # Load or fit the GMM on the reduced missing-value indicators.
    gmm_file = 'GMMNullFeature%d.model' % n_components
    if ParseData.existModel(gmm_file):
        print('加载GMMNullFeature文件..')
        gmm = ParseData.loadModel(gmm_file)
    else:
        print('开始对缺失值特征训练GMM模型...')
        gmm = GMM(n_components=n_components).fit(df_train_null)
        print('训练完毕')
        ParseData.saveModel(gmm, gmm_file)

    labels_train = gmm.predict(df_train_null)
    labels_test = gmm.predict(df_test_null)
    df_train['gmmNull'] = labels_train.tolist()
    df_test['gmmNull'] = labels_test.tolist()

    print(df_train['gmmNull'].head())

    # Convert the component id via cateToOneHot (original note: one-hot, 4 columns).
    df_train, df_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['gmmNull'], 'GMMNullFeature')
    return df_train, df_test, column_name
예제 #9
0
def getKmeansNullFeature(df_train, df_test, n_components=4):
    """Cluster the missing-value pattern of each row with KMeans.

    Every cell is replaced by an indicator (1 where the value is missing,
    0 otherwise), the indicator matrix is reduced to 10 components with a
    PCA fitted on the training data, and a KMeans model is loaded from
    ``KmeansNullFeature<n_components>.model`` or fitted and saved.

    Args:
        df_train: Training frame; gains a ``'KmeansNull'`` column in place.
        df_test: Test frame; gains a ``'KmeansNull'`` column in place.
        n_components: Number of clusters (also part of the model file name).

    Returns:
        The ``(df_train, df_test, column_name)`` triple produced by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    # Missing -> 1, present -> 0. isnull() already returns a fresh boolean
    # frame, so no defensive copy of the inputs is needed and no placeholder
    # value has to be written into non-numeric columns.
    df_train_null = df_train.isnull().astype(int)
    df_test_null = df_test.isnull().astype(int)

    # NOTE(review): the PCA is refit on every call while the KMeans model
    # below may come from disk; a cached KMeans is then applied to a
    # projection it was not necessarily trained on. getGMMNullFeature
    # persists its PCA -- confirm whether this function should as well.
    pca = PCA(n_components=10)
    df_train_null = pca.fit_transform(df_train_null.values)
    df_test_null = pca.transform(df_test_null.values)
    print(df_train_null.shape)
    print(df_test_null.shape)
    print(df_test_null)

    # Load or fit the KMeans model on the reduced missing-value indicators.
    model_file = 'KmeansNullFeature%d.model' % n_components
    if ParseData.existModel(model_file):
        print('加载KmeansNullFeature文件..')
        kmeans = ParseData.loadModel(model_file)
    else:
        print('开始对缺失值特征训练Kmeans模型...')
        kmeans = KMeans(n_clusters=n_components).fit(df_train_null)
        print('训练完毕')
        ParseData.saveModel(kmeans, model_file)

    labels_train = kmeans.predict(df_train_null)
    labels_test = kmeans.predict(df_test_null)
    df_train['KmeansNull'] = labels_train.tolist()
    df_test['KmeansNull'] = labels_test.tolist()

    print(df_train['KmeansNull'].head())

    # Convert the cluster id via cateToOneHot (original note: one-hot, 4 columns).
    df_train, df_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['KmeansNull'], 'KmeansNullFeature')
    return df_train, df_test, column_name
예제 #10
0
def main():
    """Train one model per sample group, calibrate the scores and evaluate.

    Loads the frames with ``ParseData.loadData_new``, converts the date
    columns to integers, runs ``baseline.CategoryPCA`` on the categorical
    columns, splits the samples into lists with ``transSampleToList``,
    trains through ``trainMultiModel``, replaces the pooled predictions by
    the output of ``fractional_calibration`` and passes the results to the
    ``Evaluation`` / ``Evas`` reporting calls.
    """
    # Other loaders on ParseData (all disabled here): loadPartData, loadData,
    # loadOOTData, loadOOT15Data.
    train_df, test_df = ParseData.loadData_new()

    # Optional switch, currently disabled: ParseData.TYPE = 'OOT_noDate'

    # Date feature handling.
    date_cols = baseline.getFeatureDate(train_df)
    train_df = baseline.parseDateToInt(train_df, date_cols)
    test_df = baseline.parseDateToInt(test_df, date_cols)

    # Categorical feature handling.
    raw_categorical = baseline.getFeatureCategorical(train_df)
    train_df, test_df = baseline.CategoryPCA(train_df, test_df,
                                             raw_categorical)

    # The categorical column list is recomputed on the transformed frame.
    categorical_cols = baseline.getFeatureCategorical(train_df)

    train_parts, test_parts = transSampleToList(train_df, test_df,
                                                categorical_cols)

    trained = trainMultiModel(train_parts, test_parts, categorical_cols)
    _raw_pred, all_true, model_list, pred_list, true_list = trained

    # Score calibration: the calibrated scores replace the raw pooled ones.
    calibrated_pred, _calibrated_parts = fractional_calibration(
        pred_list, true_list)

    Evaluation.getKsValue(all_true, calibrated_pred)
    Evaluation.getAucValue(all_true, calibrated_pred)

    Evas.main(model_list)
    Evaluation.get_pos_neg_picture(all_true, calibrated_pred)