import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import joblib
from utils.dataprocess import preProc, preProcTest, toJson
from sklearn.model_selection import train_test_split, GridSearchCV

# XGBoost native API
dataTrain = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_train.csv')
xTrain, yTrain, scaler = preProc(dataTrain, toNumpy=True)
# weight = number of negatives / number of positives
weight = (yTrain.shape[0] - sum(yTrain)) * 1.0 / sum(yTrain)
xTrain, xValidation, yTrain, yValidation = train_test_split(xTrain, yTrain, test_size=0.2)
dtrain = xgb.DMatrix(xTrain, label=yTrain)
dvalidation = xgb.DMatrix(xValidation, label=yValidation)
params = {
    # General XGBoost parameters
    'booster': 'gbtree',
    'nthread': 5,    # number of threads
    'silent': 0,     # set to 1 for silent mode
    # Booster parameters
    'eta': 0.1,      # learning rate; shrinking each step's weights improves robustness
    'gamma': 0.1,    # minimum loss reduction required to split a node
    'max_depth': 9,  # maximum tree depth, limits overfitting
    'lambda': 2,     # L2 regularization term on weights
    'alpha': 1,      # L1 regularization term on weights
}
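# A minimal sketch of how training could proceed from here; the objective,
# scale_pos_weight, round counts, and model file name below are assumptions,
# not the original script's settings.
params.setdefault('objective', 'binary:logistic')  # assumed binary objective
params.setdefault('scale_pos_weight', weight)      # assumed use of the class ratio computed above
watchlist = [(dtrain, 'train'), (dvalidation, 'validation')]
bst = xgb.train(params, dtrain, num_boost_round=500, evals=watchlist,
                early_stopping_rounds=20)
predProb = bst.predict(dvalidation)            # predicted positive-class probabilities
predLabel = (predProb >= 0.5).astype(int)      # threshold probabilities at 0.5
print('Validation F1:', metrics.f1_score(yValidation, predLabel))
bst.save_model('carInsurancePredXGB.model')    # hypothetical file name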
        '''
        for i in range(self.layer_num):
            x = self.relu[i](self.bns[i](self.hiddens[i](x)))
        x = self.predict(x)
        return x


# Read in the data
dataTrain = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_train.csv')
weight_negative = sum(dataTrain['Response']) * 1.0 / dataTrain.shape[0]
# The classes are imbalanced, so handle it with class weights:
# set separate weights for the negative and positive classes
weights = [weight_negative, 1 - weight_negative]  # [0.12293666666666667, 0.8770633333333333]
# PyTorch requires the weights to be passed as a tensor
weights = torch.from_numpy(np.array(weights)).type(torch.FloatTensor)
xTrain, yTrain, scaler = preProc(dataTrain, toTensor=True)
torch.save(xTrain, 'xTrain.pt')
torch.save(yTrain, 'yTrain.pt')
xTrain, xValidation, yTrain, yValidation = train_test_split(xTrain, yTrain, test_size=0.2, random_state=1)
torch_dataset = Data.TensorDataset(xTrain, yTrain)
loader = Data.DataLoader(
    dataset=torch_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
net = Net(14, 15, 2, 3)
print(net)
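# A minimal sketch of the training loop this setup implies; the optimizer,
# learning rate, and epoch count are assumptions, not the original
# hyperparameters. nn.CrossEntropyLoss(weight=weights) applies the per-class
# weights computed above, which is the usual way to consume them.
import torch.nn as nn

EPOCHS = 50  # assumed value
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)  # assumed optimizer and lr
loss_func = nn.CrossEntropyLoss(weight=weights)
for epoch in range(EPOCHS):
    for step, (batch_x, batch_y) in enumerate(loader):
        out = net(batch_x)                     # logits of shape (batch, 2)
        loss = loss_func(out, batch_y.long())  # targets must be class indices (LongTensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()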
from sklearn.svm import SVC
import numpy as np
import pandas as pd
import joblib
import time
from utils.dataprocess import preProc, preProcTest, toJson
import sklearn.metrics as metrics

t1 = time.time()
# Preprocess the training data
dataTrain = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_train.csv')
xTrain, yTrain, scaler = preProc(dataTrain)
# Build the model
max_iter = 100000000
# Passing class_weight='balanced' to the model is equivalent to weighting the
# positive and negative examples by the number of negatives and positives, respectively.
# fit() also takes a sample_weight parameter, an array with one weight per sample.
# The two achieve the same effect, so use only one of them.
model = SVC(C=1.0, kernel='rbf', gamma='auto', tol=0.2, cache_size=1024,
            class_weight='balanced', max_iter=max_iter)
model.fit(xTrain, yTrain, sample_weight=None)
score = model.score(xTrain, yTrain)
print('Score:', score)
pred_y = model.predict(xTrain)
fscore = metrics.f1_score(yTrain, pred_y)
print('Fvalue:', fscore)
equal1count = sum(pred_y == 1)
print('Number of samples predicted as 1:', equal1count)
print(pred_y)
joblib.dump(model, 'carInsurancePredSVM.model')
print('max_iter:', max_iter, 'elapsed:', time.time() - t1)
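# A minimal sketch of reusing the saved model on held-out data. The test file
# name and the preProcTest signature (test frame plus the fitted scaler,
# returning a feature matrix) are assumptions about utils.dataprocess, not its
# documented API.
model = joblib.load('carInsurancePredSVM.model')
dataTest = pd.read_csv(r'C:\Users\ZY\Desktop\ML\VI_test.csv')  # assumed test file name
xTest = preProcTest(dataTest, scaler)                          # assumed helper signature
predTest = model.predict(xTest)
print('Predicted positives on the test set:', sum(predTest == 1))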