import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from imblearn.under_sampling import RandomUnderSampler

# Make the parent directory importable so the project-local `util` package
# resolves; this has to run before the `util` imports below.
sys.path.append('../')
from util.feature import add_feature, fillna
from util.metric import mse
from util import variables

# Font set so that Chinese labels render correctly in plots.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Set so that the minus sign renders correctly in plots.
plt.rcParams['axes.unicode_minus'] = False

# Load the training table and run it through the project helpers
# (missing-value filling first, then feature construction).
train = add_feature(fillna(pd.read_csv('../data/d_train_20180102.csv')))

# Every column except the identifier, the exam date and the target is a
# model input; column order follows the DataFrame.
predictor = [
    col for col in train.columns
    if col not in ['id', '体检日期', '血糖']
]

# Balance the data by random under-sampling on a binary flag derived from
# the target (target > 7 versus the rest).
# NOTE(review): `return_indices=` and `fit_sample` appear to have been
# removed in newer imbalanced-learn releases (replaced by `fit_resample`
# and the `sample_indices_` attribute) -- confirm against the pinned version.
rus = RandomUnderSampler(random_state=2018, return_indices=True)
XALL, yALL, idx_resampled = rus.fit_sample(
    train[predictor],
    (train['血糖'] > 7).astype(int),
)

# The sampler returned the binary flag as labels; replace it with the
# original continuous target for the rows that were kept.
yALL = train['血糖'].iloc[idx_resampled]
XALL = pd.DataFrame(XALL, columns=predictor)
print('Feature: ', XALL.columns.tolist())

# 5-fold shuffled cross-validation with a fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=2018)
# One row per original training row, one column per fold.
preds = np.zeros((len(train), 5))
feature_importance = []
# `sys` is used by the sys.path.append call below but is not imported in this
# import group; import it here so the script does not fail with a NameError.
import sys

import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler

# Make the parent directory importable so the project-local `util` package
# resolves; this has to run before the `util` imports below.
sys.path.append('../')
from util.feature import add_feature, fillna
from util import variables

train = pd.read_csv('../data/d_train_20180102.csv')
test = pd.read_csv('../data/d_test_A_20180102.csv')

# Give the test rows a negative sentinel target so they can be separated from
# the training rows again after the joint preprocessing below.
# NOTE(review): this assumes every real training target is >= 0 -- confirm.
test['血糖'] = -1

# Preprocess train and test together so both go through the same project
# helpers (missing-value filling, then feature construction).
all_data = pd.concat([train, test], ignore_index=True)
all_data = fillna(all_data)
all_data = add_feature(all_data)

# Columns to scale: everything except the identifier, the sex column, the
# exam date and the target.
feature_col = [
    column for column in all_data.columns
    if column not in ['id', '性别', '体检日期', '血糖']
]

# Min-max scale the feature columns, fitting on the combined train+test rows.
scaler = MinMaxScaler()
all_data.loc[:, feature_col] = scaler.fit_transform(all_data[feature_col])

# Undo the concat using the sentinel: non-negative target -> train,
# negative target -> test.
train = all_data.loc[all_data['血糖'] >= 0.0, :]
test = all_data.loc[all_data['血糖'] < 0.0, :]

# Split into male and female subsets.
# NOTE(review): presumably 0 in the sex column encodes male (per the
# `train_m` name) -- confirm against the encoding used upstream.
train_m = train.loc[train['性别'] == 0, :]