def main():
    """Run the importance-list pipeline and print the generated sentences.

    Loads the data via ``ParseData.loadData``, converts the date features
    with the ``baseline`` helpers, builds train/test samples for a fixed
    list of ten feature names, trains a model, passes it to ``showPicture``
    and prints the three values returned by ``CodeGenerateXgb.create``.
    """
    # Alternative loader kept for reference: ParseData.loadPartData()
    train_df, test_df = ParseData.loadData()
    # The categorical feature list is taken before the date conversion below.
    categorical_cols = baseline.getFeatureCategorical(train_df)

    # Date feature handling: the date columns are detected on the training
    # frame and the same list is applied to both frames.
    date_cols = baseline.getFeatureDate(train_df)
    train_df = baseline.parseDateToInt(train_df, date_cols)
    test_df = baseline.parseDateToInt(test_df, date_cols)

    selected_features = [
        'nasrdw_recd_date',
        'var_jb_43',
        'var_jb_94',
        'creditlimitamount_4',
        'var_jb_15',
        'var_jb_23',
        'creditlimitamount_3',
        'var_jb_73',
        'var_jb_25',
        'var_jb_22',
    ]
    # Alternative list used previously:
    # ['querydate', 'nasrdw_recd_date', 'var_jb_43', 'opendate_3',
    #  'var_jb_94', 'opendate_4', 'var_jb_23', 'var_jb_15',
    #  'creditlimitamount_4', 'var_jb_73']

    samples = getTrainTestSampleImportance(
        train_df, test_df, categorical_cols, selected_features)
    X_train, y_train, X_test, y_test = samples

    model = trainModel(X_train, y_train, X_test, y_test)
    picture = showPicture(model, 0)

    generated = CodeGenerateXgb.create(picture)
    train_sentence, test_sentence, seg_sentence = generated
    for sentence in (train_sentence, test_sentence, seg_sentence):
        print(sentence)
def main():
    """Run the decision-tree pipeline and print the generated sentences.

    Loads the data via ``ParseData.loadData``, converts the date features
    with the ``StandardVersion`` helpers, fits a tree regressor on a fixed
    list of ten feature names, renders and shows its picture, and prints
    the three values returned by ``CodeGenerateDecisionTree.create``.
    """
    # Alternative loader kept for reference: ParseData.loadPartData()
    train_df, test_df = ParseData.loadData()

    selected_features = [
        'nasrdw_recd_date',
        'var_jb_43',
        'var_jb_94',
        'creditlimitamount_4',
        'var_jb_15',
        'var_jb_23',
        'creditlimitamount_3',
        'var_jb_73',
        'var_jb_25',
        'var_jb_22',
    ]
    # Alternative list used previously:
    # ['querydate', 'nasrdw_recd_date', 'var_jb_43', 'opendate_3',
    #  'var_jb_94', 'opendate_4', 'var_jb_23', 'var_jb_15',
    #  'creditlimitamount_4', 'var_jb_73']

    # Date feature handling: the date columns are detected on the training
    # frame and the same list is applied to both frames.
    date_cols = StandardVersion.getFeatureDate(train_df)
    train_df = StandardVersion.parseDateToInt(train_df, date_cols)
    test_df = StandardVersion.parseDateToInt(test_df, date_cols)

    fitted = train_tree_regressor(train_df, selected_features)
    x, dtree = fitted
    show_picture(make_picture(x, dtree))

    generated = CodeGenerateDecisionTree.create(dtree, selected_features)
    train_sentence, test_sentence, seg_sentence = generated
    for sentence in (train_sentence, test_sentence, seg_sentence):
        print(sentence)
def trainMultiModel(train_list, test_list, feature_categorical):
    """Train one model per train/test split and pool the predictions.

    Splits whose test frame has no rows are skipped. For every remaining
    split a model is trained through ``baseline.trainModel``; its
    predictions and labels are kept per split and also stacked into one
    pooled sequence, on which the AUC is printed.

    Args:
        train_list: Training frames, one per split.
        test_list: Test frames, indexed in parallel with ``train_list``.
        feature_categorical: Categorical feature names forwarded to
            ``baseline.getTrainTestSample``.

    Returns:
        Tuple ``(all_pred, all_test, model_list, pred_list, true_list)``.
        When no split could be trained, all five entries are empty lists
        and no AUC is computed.
    """
    model_list = []
    pred_list = []
    true_list = []
    # Both pooled sequences start empty so the function can still return
    # cleanly when every split is skipped (all_test used to be unbound).
    all_pred = []
    all_test = []

    for i, df_train in enumerate(train_list):
        df_test = test_list[i]
        print('%d.训练样本%s,测试样本%s' % (i, df_train.shape, df_test.shape))
        # No test samples for this split: skip it.
        if df_test.shape[0] == 0:
            continue

        X_train, y_train, X_test, y_test = baseline.getTrainTestSample(
            df_train, df_test, feature_categorical)
        gbm, y_pred = baseline.trainModel(X_train, y_train, X_test, y_test)

        if model_list:
            all_pred = np.hstack((all_pred, y_pred))
            all_test = np.hstack((all_test, y_test))
        else:
            # First trained split: adopt its outputs as the pooled values.
            all_pred, all_test = y_pred, y_test

        model_list.append(gbm)
        pred_list.append(y_pred)
        true_list.append(y_test)

    if not model_list:
        # Nothing was trained, so there is nothing to score.
        print('No split with test samples; AUC not computed.')
        return all_pred, all_test, model_list, pred_list, true_list

    print('The auc score is:', roc_auc_score(all_test, all_pred))
    return all_pred, all_test, model_list, pred_list, true_list
def getKmeansAllFeature(df_train, df_test, n_components=4):
    """Add one-hot KMeans cluster features built from the selected columns.

    The columns ``Tools.iv_more_than_point_one + Tools.feature_categorical``
    are taken from both frames (minus ``Tools.feature_date`` when
    ``ParseData.TYPE == 'OOT_noDate'``), missing values are filled with
    -99999, and the frames are passed through
    ``StandardVersion.proprocessCateory`` and
    ``Tools.apply_log1p_transformation``. A KMeans model is loaded from the
    ``ParseData`` model cache when present, otherwise fitted on the training
    frame and saved. Cluster labels are written to a ``KmeansAll`` column on
    both input frames (the inputs are modified in place) and then expanded
    by ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame.
        df_test: Test frame.
        n_components: Number of KMeans clusters; also part of the cache
            file name.

    Returns:
        ``(df_train, df_test, column_name)`` as returned by
        ``StandardVersion.cateToOneHot``.
    """
    import StandardVersion
    print('in %s' % sys._getframe().f_code.co_name)

    # Columns to extract.
    iv_more_than_point_one = Tools.iv_more_than_point_one
    feature_categorical = Tools.feature_categorical
    if ParseData.TYPE == 'OOT_noDate':
        # Drop the date features while keeping the original column order.
        # A set difference would produce an arbitrary order that can change
        # between runs, which would silently misalign the columns with a
        # KMeans model loaded from the cache. dict.fromkeys keeps the
        # de-duplication the set used to provide.
        date_columns = set(Tools.feature_date)
        iv_more_than_point_one = list(dict.fromkeys(
            col for col in iv_more_than_point_one
            if col not in date_columns))
        feature_categorical = list(dict.fromkeys(
            col for col in feature_categorical if col not in date_columns))
    kmeans_list = iv_more_than_point_one + feature_categorical

    # Missing-value handling, categorical preprocessing and smoothing.
    # Selecting with a list and fillna both return new frames, so the
    # caller's frames are not touched by this preprocessing.
    df_train_smooth = df_train[kmeans_list].fillna(-99999)
    df_test_smooth = df_test[kmeans_list].fillna(-99999)
    df_train_smooth = StandardVersion.proprocessCateory(
        df_train_smooth, feature_categorical)
    df_test_smooth = StandardVersion.proprocessCateory(
        df_test_smooth, feature_categorical)
    df_train_smooth = Tools.apply_log1p_transformation(
        df_train_smooth, iv_more_than_point_one)
    df_test_smooth = Tools.apply_log1p_transformation(
        df_test_smooth, iv_more_than_point_one)

    # Load the cached KMeans model or train a new one.
    # NOTE(review): the cache name depends only on n_components, so a model
    # cached with the date features could be loaded in 'OOT_noDate' mode
    # (and vice versa) -- confirm the cache is cleared when TYPE changes.
    model_file = 'KmeansAllFeature%d.model' % n_components
    if ParseData.existModel(model_file):
        print('加载KmeansAllFeature文件..')
        kmeans = ParseData.loadModel(model_file)
    else:
        print('开始训练kmeans模型..')
        kmeans = KMeans(n_clusters=n_components).fit(df_train_smooth)
        print('训练完毕')
        ParseData.saveModel(kmeans, model_file)

    labels_train = kmeans.predict(df_train_smooth)
    labels_test = kmeans.predict(df_test_smooth)
    df_train['KmeansAll'] = labels_train.tolist()
    df_test['KmeansAll'] = labels_test.tolist()
    print(df_train['KmeansAll'].head())

    # Convert the cluster label to one-hot encoding.
    df_train, df_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['KmeansAll'], 'KmeansAllFeature')
    return df_train, df_test, column_name
def getKmediodAllFeature(df_train, df_test, n_components=4):
    """Add one-hot k-medoid cluster features built from the selected columns.

    Takes the columns ``Tools.iv_more_than_point_one +
    Tools.feature_categorical`` from both frames, fills missing values with
    -99999 and runs the ``StandardVersion.proprocessCateory`` and
    ``Tools.apply_log1p_transformation`` helpers. The clustering model is
    loaded from the ``ParseData`` cache when present, otherwise fitted on
    the training frame and saved. Labels are written to a ``KmediodAll``
    column on both input frames and expanded by
    ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame.
        df_test: Test frame.
        n_components: Value passed to ``kmedoids.KMediod``; also part of
            the cache file name.

    Returns:
        ``(df_train, df_test, column_name)`` as returned by
        ``StandardVersion.cateToOneHot``.
    """
    import StandardVersion
    print('in %s' % sys._getframe().f_code.co_name)

    categorical_cols = Tools.feature_categorical
    iv_cols = Tools.iv_more_than_point_one
    # Columns to extract.
    selected_cols = iv_cols + categorical_cols

    # Work on separate frames holding only the selected columns, with
    # missing values replaced by the -99999 placeholder.
    train_view = df_train[selected_cols].fillna(-99999)
    test_view = df_test[selected_cols].fillna(-99999)

    train_view = StandardVersion.proprocessCateory(train_view,
                                                   categorical_cols)
    test_view = StandardVersion.proprocessCateory(test_view,
                                                  categorical_cols)
    train_view = Tools.apply_log1p_transformation(train_view, iv_cols)
    test_view = Tools.apply_log1p_transformation(test_view, iv_cols)

    # Load the cached model or train a new one.
    model_file = 'KmediodAllFeature%d.model' % n_components
    if not ParseData.existModel(model_file):
        print('开始训练kmediod模型..')
        kmediod = kmedoids.KMediod(n_components).fit(train_view)
        print('训练完毕')
        ParseData.saveModel(kmediod, model_file)
    else:
        print('加载KmediodAllFeature文件..')
        kmediod = ParseData.loadModel(model_file)

    train_labels = kmediod.predict(train_view)
    test_labels = kmediod.predict(test_view)
    df_train['KmediodAll'] = train_labels.tolist()
    df_test['KmediodAll'] = test_labels.tolist()
    print(df_train['KmediodAll'].head())

    # Convert the cluster label to one-hot encoding.
    encoded = StandardVersion.cateToOneHot(
        df_train, df_test, ['KmediodAll'], 'KmediodAllFeature')
    df_train, df_test, column_name = encoded
    return df_train, df_test, column_name
def getGMMCategoryFeature(df_train, df_test, n_components=4):
    """Add one-hot GMM component features built from the selected columns.

    Takes the columns ``Tools.feature_categorical +
    Tools.iv_more_than_point_one`` from both frames, fills missing values
    with -99999 and runs ``StandardVersion.proprocessCateory``. The mixture
    model is loaded from the ``ParseData`` cache when present (the file
    name encodes ``n_components`` and the number of feature columns),
    otherwise fitted on the training frame and saved. Labels are written to
    a ``gmm`` column on both input frames and expanded by
    ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame.
        df_test: Test frame.
        n_components: Number of mixture components.

    Returns:
        ``(df_train, df_test, column_name)`` as returned by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    categorical_cols = Tools.feature_categorical
    print(categorical_cols)
    selected_cols = categorical_cols + Tools.iv_more_than_point_one
    print('特征数量:%d' % (len(selected_cols)))

    train_matrix = df_train[selected_cols].fillna(-99999)
    test_matrix = df_test[selected_cols].fillna(-99999)
    train_matrix = StandardVersion.proprocessCateory(train_matrix,
                                                     categorical_cols)
    test_matrix = StandardVersion.proprocessCateory(test_matrix,
                                                    categorical_cols)

    # The cache name carries both the component count and the feature count.
    n_features = train_matrix.shape[1]
    model_file = 'GMMCategoryFeature%d_%d.model' % (n_components, n_features)
    if not ParseData.existModel(model_file):
        print('开始对类别特征训练GMM模型...')
        # reg_covar=0.0001 is a tunable value.
        mixture = GMM(n_components=n_components,
                      reg_covar=0.0001).fit(train_matrix)
        print('训练完毕')
        ParseData.saveModel(mixture, model_file)
    else:
        print('加载GMMCategoryFeature文件..')
        mixture = ParseData.loadModel(model_file)

    df_train['gmm'] = mixture.predict(train_matrix).tolist()
    df_test['gmm'] = mixture.predict(test_matrix).tolist()

    # Convert the component label to one-hot encoding.
    encoded = StandardVersion.cateToOneHot(
        df_train, df_test, ['gmm'], 'GMMCategoryFeature')
    df_train, df_test, column_name = encoded
    return df_train, df_test, column_name
def nullCountcut(df_train, df_test):
    """Add one-hot features derived from the per-row count of null values.

    Stores the number of nulls in each row in a ``nunNum`` column on both
    input frames, derives cut points with ``mono_bin`` from the training
    frame's ``bad`` and ``nunNum`` columns, assigns a ``null_count`` column
    on both frames through ``fenxiang`` using those cut points, and expands
    it with ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame; must have a ``bad`` column.
        df_test: Test frame.

    Returns:
        ``(df_train, df_test, column_name)`` as returned by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    frames = (df_train, df_test)
    for frame in frames:
        frame['nunNum'] = frame.isnull().sum(axis=1).tolist()

    # Only the cut points (third returned value) are used below.
    _, _, cut_points, _ = mono_bin(df_train.bad, df_train.nunNum)

    for frame in frames:
        frame['null_count'] = fenxiang(frame, 'nunNum', cut_points)

    # Convert the binned count to one-hot encoding.
    encoded = StandardVersion.cateToOneHot(
        df_train, df_test, ['null_count'], 'null_seg')
    df_train, df_test, column_name = encoded
    return df_train, df_test, column_name
def getGMMNullFeature(df_train, df_test, n_components=4):
    """Add one-hot GMM component features built from the null pattern.

    Every cell of both frames is turned into an indicator (1 where the
    value is null, 0 otherwise). The indicator matrices are reduced to 10
    dimensions with PCA and clustered with a Gaussian mixture. Both models
    are loaded from the ``ParseData`` cache when present, otherwise fitted
    on the training data and saved. Labels are written to a ``gmmNull``
    column on both input frames (the inputs are modified in place) and
    expanded by ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame.
        df_test: Test frame.
        n_components: Number of mixture components; also part of the GMM
            cache file name.

    Returns:
        ``(df_train, df_test, column_name)`` as returned by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    # Null -> 1, non-null -> 0. isnull() already yields that mask, so no
    # copy of the full frames is needed. The log message now matches what
    # the code does (it previously stated the opposite mapping).
    print('正在将缺失值设为1 非缺失值设为0..')
    df_train_null = df_train.isnull().astype(int)
    df_test_null = df_test.isnull().astype(int)

    # Load the cached PCA model or fit a new one on the training indicators.
    pca_component = 10
    pca_file = 'PCANullFeature%d.model' % pca_component
    if ParseData.existModel(pca_file):
        print('加载PCANullFeature文件..')
        pca = ParseData.loadModel(pca_file)
    else:
        pca = PCA(n_components=pca_component)
        pca.fit(df_train_null.values)
        ParseData.saveModel(pca, pca_file)
    df_train_null = pca.transform(df_train_null.values)
    df_test_null = pca.transform(df_test_null.values)
    print(df_train_null.shape)
    print(df_test_null.shape)

    # Load the cached null/non-null GMM or train a new one.
    gmm_file = 'GMMNullFeature%d.model' % n_components
    if ParseData.existModel(gmm_file):
        print('加载GMMNullFeature文件..')
        gmm = ParseData.loadModel(gmm_file)
    else:
        # Message corrected: this model is trained on the null-indicator
        # features, not on the categorical features.
        print('开始对缺失值特征训练GMM模型...')
        gmm = GMM(n_components=n_components).fit(df_train_null)
        print('训练完毕')
        ParseData.saveModel(gmm, gmm_file)

    labels_train = gmm.predict(df_train_null)
    labels_test = gmm.predict(df_test_null)
    df_train['gmmNull'] = labels_train.tolist()
    df_test['gmmNull'] = labels_test.tolist()
    print(df_train['gmmNull'].head())

    # Convert the component label to one-hot encoding.
    df_train, df_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['gmmNull'], 'GMMNullFeature')
    return df_train, df_test, column_name
def getKmeansNullFeature(df_train, df_test, n_components=4):
    """Add one-hot KMeans cluster features built from the null pattern.

    Every cell of both frames is turned into an indicator (1 where the
    value is null, 0 otherwise). The indicator matrices are reduced to 10
    dimensions with a PCA fitted on the training data and clustered with
    KMeans. The KMeans model is loaded from the ``ParseData`` cache when
    present, otherwise fitted and saved. Labels are written to a
    ``KmeansNull`` column on both input frames (the inputs are modified in
    place) and expanded by ``StandardVersion.cateToOneHot``.

    Args:
        df_train: Training frame.
        df_test: Test frame.
        n_components: Number of clusters; also part of the cache file name.

    Returns:
        ``(df_train, df_test, column_name)`` as returned by
        ``StandardVersion.cateToOneHot``.
    """
    print('in %s' % sys._getframe().f_code.co_name)
    import StandardVersion

    # Null -> 1, non-null -> 0. isnull() already yields that mask, so no
    # copy of the full frames is needed.
    df_train_null = df_train.isnull().astype(int)
    df_test_null = df_test.isnull().astype(int)

    # NOTE(review): the PCA is refit on every call while the KMeans model
    # below may come from the cache; if the training data changes between
    # runs the cached clusters no longer match the projection -- confirm
    # whether the PCA should be cached as well.
    pca = PCA(n_components=10)
    df_train_null = pca.fit_transform(df_train_null.values)
    df_test_null = pca.transform(df_test_null.values)
    print(df_train_null.shape)
    print(df_test_null.shape)
    print(df_test_null)

    model_file = 'KmeansNullFeature%d.model' % n_components
    if ParseData.existModel(model_file):
        # Message corrected: it used to name a different model file.
        print('加载KmeansNullFeature文件..')
        kmeans = ParseData.loadModel(model_file)
    else:
        # Message corrected: this model is trained on the null-indicator
        # features, not on the categorical features.
        print('开始对缺失值特征训练Kmeans模型...')
        kmeans = KMeans(n_clusters=n_components).fit(df_train_null)
        print('训练完毕')
        ParseData.saveModel(kmeans, model_file)

    labels_train = kmeans.predict(df_train_null)
    labels_test = kmeans.predict(df_test_null)
    df_train['KmeansNull'] = labels_train.tolist()
    df_test['KmeansNull'] = labels_test.tolist()
    print(df_train['KmeansNull'].head())

    # Convert the cluster label to one-hot encoding.
    df_train, df_test, column_name = StandardVersion.cateToOneHot(
        df_train, df_test, ['KmeansNull'], 'KmeansNullFeature')
    return df_train, df_test, column_name
def main():
    """Run the multi-model pipeline and report the evaluation results.

    Loads the data via ``ParseData.loadData_new``, converts date features,
    applies ``baseline.CategoryPCA`` to the categorical features, splits
    the samples into lists, trains one model per split, calibrates the
    per-split predictions with ``fractional_calibration`` and passes the
    calibrated predictions to the ``Evaluation`` and ``Evas`` helpers.
    """
    # Other loaders tried previously: ParseData.loadPartData(),
    # ParseData.loadData(), ParseData.loadOOTData(),
    # ParseData.loadOOT15Data().
    train_df, test_df = ParseData.loadData_new()
    # Optional switch kept for reference: ParseData.TYPE = 'OOT_noDate'

    # Date feature handling: the date columns are detected on the training
    # frame and the same list is applied to both frames.
    date_cols = baseline.getFeatureDate(train_df)
    train_df = baseline.parseDateToInt(train_df, date_cols)
    test_df = baseline.parseDateToInt(test_df, date_cols)

    # Categorical feature handling.
    train_df, test_df = baseline.CategoryPCA(
        train_df, test_df, baseline.getFeatureCategorical(train_df))

    # The categorical list is queried again on the transformed frame.
    categorical_cols = baseline.getFeatureCategorical(train_df)
    train_parts, test_parts = transSampleToList(train_df, test_df,
                                                categorical_cols)

    trained = trainMultiModel(train_parts, test_parts, categorical_cols)
    _, all_true, model_list, pred_list, true_list = trained

    # Score calibration: the calibrated predictions replace the pooled ones.
    calibrated_pred, _ = fractional_calibration(pred_list, true_list)

    Evaluation.getKsValue(all_true, calibrated_pred)
    Evaluation.getAucValue(all_true, calibrated_pred)
    Evas.main(model_list)
    Evaluation.get_pos_neg_picture(all_true, calibrated_pred)