def train_lr_model(train_file, model_coef, model_file):
    """Train a cross-validated LR model on a processed csv and print accuracy/AUC.

    Args:
        train_file: processed csv for lr training (118 feature cols + label in last col)
        model_coef: path for w1 w2 ...  # NOTE(review): unused in this version — confirm whether saving was intended
        model_file: path for the model pkl  # NOTE(review): unused in this version — confirm
    """
    feature_count = 118  # total feature dimension; the label is the final column
    feature_list = range(feature_count)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    lr_cf = LRCV(Cs=[1, 10, 100], penalty="l2", tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    # Bug fix: in Python 3, dict.values() is a view object — it has no .mean()
    # and is not subscriptable, so the original scores.mean(...) calls raised.
    # Extract the (n_folds, len(Cs)) score array first.
    scores = list(lr_cf.scores_.values())[0]
    print(scores)
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%s" % (scores.mean()))
    # repeat with AUC as the cross-validation metric
    lr_cf = LRCV(Cs=[1, 10, 100], penalty="l2", tol=0.0001, max_iter=500, cv=5,
                 scoring="roc_auc").fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("AUC:%s" % (scores.mean()))
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    """Train the GBDT part and the LR part of a GBDT+LR mix model.

    Args:
        train_file: file for training model
        feature_num_file: file to store total feature len
        mix_tree_model_file: tree part of the mix model
        mix_lr_model_file: lr part of the mix model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    (tree_depth, tree_num, learning_rate) = get_mix_model_tree_info()
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    # leaf index each sample falls into, one column per tree
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, cv=5)\
        .fit(total_feature_list, train_label)
    # Bug fix: in Python 3 dict.values() returns a non-subscriptable view, so
    # the original scores_.values()[0] raised TypeError — wrap in list() first.
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%f(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, scoring='roc_auc', cv=5).fit(
        total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%f,(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Bug fix: the original never closed the coefficient file; the context
    # manager guarantees it is flushed and closed.
    with open(mix_lr_model_file, "w+") as fw:
        fw.write(','.join([str(ele) for ele in lr_clf.coef_[0]]))
def train_lr_model(train_file, model_coef, model_file):
    """Cross-validate an LR model, report accuracy and AUC, then persist
    both the coefficient vector and the fitted model.

    Agrs:
        train_file: process file for lr train
        model_coef: w1, w2 ...
        model_file: model_pkl
    """
    total_feature = 118
    # the last column is the label; the first `total_feature` columns are features
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",",
                                  usecols=range(total_feature))
    # accuracy-scored model (default scoring)
    acc_model = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    fold_scores = list(acc_model.scores_.values())[0]
    diff_str = ",".join(str(ele) for ele in fold_scores.mean(axis=0))
    print("diff:%s" % diff_str)
    print("Accuracy:%s (+-%0.2f)" % (fold_scores.mean(), fold_scores.std() * 2))
    # AUC-scored model; its coefficients are the ones that get persisted
    auc_model = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
                     scoring="roc_auc").fit(train_feature, train_label)
    fold_scores = list(auc_model.scores_.values())[0]
    diff_str = ",".join(str(ele) for ele in fold_scores.mean(axis=0))
    print("diff:%s" % diff_str)
    print("AUC:%s (+-%0.2f)" % (fold_scores.mean(), fold_scores.std() * 2))
    with open(model_coef, "w+", encoding="utf-8") as fw:
        fw.write(",".join(str(ele) for ele in auc_model.coef_[0]))
    joblib.dump(auc_model, model_file)
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """Train a cross-validated LR model and persist its coefficients and pickle.

    Args:
        train_file: processed csv (features first, label in the last column)
        model_coef: output path for the comma-separated coefficients
        model_file: output path for the joblib-pickled model
        feature_num_file: file recording the total feature count
    """
    total_feature_num = GF.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    # mean +- 2*std covers ~95% of the fold accuracies, so the mean is a
    # trustworthy summary (translated from the original Chinese note)
    print('Accuracy:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5,
                 scoring='roc_auc').fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('AUC:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    coef = lr_cf.coef_[0]
    # context manager guarantees the file is closed even if the write fails
    with open(model_coef, 'w+') as fw:
        fw.write(','.join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
def train_lr_model(train_file, model_coef, model_file):
    """
    :param train_file: process file for lr training
    :param model_coef: w1, w2, ...
    :param model_file: model pkl
    """
    # FEATURE_NUM is the full feature dimension (discrete + continuous, 98+20=118);
    # the label adds one extra column, so the csv has FEATURE_NUM+1 columns
    dim = FEATURE_NUM
    # usecols=-1 selects the final column, i.e. the label
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",",
                                  usecols=list(range(dim)))
    # Cs: inverse regularization strengths ([1] kept after trying [1, 10, 100]);
    # tol: stopping threshold; cv=5: 5-fold cross validation (80/20 splits);
    # solver defaults to the quasi-Newton lbfgs
    acc_model = LRCV(Cs=[1], penalty="l2", tol=1e-4, max_iter=500, cv=5).fit(train_feature, train_label)
    # (n_folds, len(Cs)) accuracy matrix
    fold_scores = list(acc_model.scores_.values())[0]
    # per-C mean accuracy across the folds
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    # overall mean accuracy
    print("accuracy:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))
    # same model selection, scored by AUC instead of accuracy
    auc_model = LRCV(Cs=[1], penalty="l2", tol=1e-4, max_iter=500, cv=5,
                     scoring="roc_auc").fit(train_feature, train_label)
    fold_scores = list(auc_model.scores_.values())[0]
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    print("AUC:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))
    # persist the coefficients as csv and the entire fitted model as a pickle
    with open(model_coef, "w+") as f:
        f.write(",".join([str(ele) for ele in auc_model.coef_[0]]))
    joblib.dump(auc_model, model_file)
def train_gbdt_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    """
    :param train_file: file for training model
    :param feature_num_file: file to store total feature len
    :param mix_tree_model_file: tree part of the mix model
    :param mix_lr_model_file: lr part of the mix model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    tree_num, tree_depth, learning_rate = get_mix_model_tree_info()
    gbdt = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    gbdt.save_model(mix_tree_model_file)
    # (n_samples, tree_num) matrix: the leaf index each sample reaches in every
    # tree (index counted over all nodes of that tree, root = 0)
    leaf_index = gbdt.predict(train_mat, pred_leaf=True)
    encoded_feature = get_gbdt_and_lr_feature(leaf_index, tree_num, tree_depth)
    acc_model = LRCV(Cs=[1], penalty="l2", dual=False, tol=1e-4, max_iter=500,
                     cv=5).fit(encoded_feature, train_label)
    # (n_folds, len(Cs)) accuracy matrix
    fold_scores = list(acc_model.scores_.values())[0]
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    print("accuracy:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))
    # same selection, scored by AUC
    auc_model = LRCV(Cs=[1], penalty="l2", dual=False, tol=1e-4, max_iter=500,
                     cv=5, scoring="roc_auc").fit(encoded_feature, train_label)
    fold_scores = list(auc_model.scores_.values())[0]
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    print("AUC:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))
    with open(mix_lr_model_file, "w+") as f:
        f.write(",".join([str(ele) for ele in auc_model.coef_[0]]))
def train_lr_mode(train_file, model_coef, model_file, feature_num_file):
    """Train a cross-validated LR model and save coefficients + pickled model.

    Args:
        train_file: processed csv (features first, label in the last column)
        model_coef: output path for the comma-separated coefficients
        model_file: base output path for the joblib pickle ("jb" is appended)
        feature_num_file: file recording the total feature count
    """
    # Bug fix: the original assigned a hard-coded 118 (misspelled
    # "tolal_feature_num") that was immediately overwritten by the value read
    # from feature_num_file — the dead assignment is removed.
    total_feature_num = utils.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=',', usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    # Cs are inverse regularization strengths (Cs=[1] was chosen after trying
    # [1, 10, 100]); tol is the stopping condition; cv=5 -> 5-fold CV; the
    # default solver is the quasi-Newton lbfgs (valid for the l2 penalty)
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.00001, max_iter=1000, cv=5).fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print(scores)
    print("diff %s : " % (" ".join([str(ele) for ele in scores.mean(axis=0)])))  # column-wise mean
    print("Accuracy %s ,(+- %0.2f ): " % (scores.mean(), scores.std() * 2))
    # model AUC under the same setup
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.00001, max_iter=1000, cv=5,
                 scoring='roc_auc').fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print(scores)
    print("diff %s : " % (" ".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC %s ,(+- %0.2f ):" % (scores.mean(), scores.std() * 2))
    # persist the model
    coef = lr_cf.coef_[0]
    with open(model_coef, "w+", encoding='utf-8') as file:
        file.write(",".join(str(ele) for ele in coef))
    # NOTE(review): "jb" is appended with no separator (model_file + "jb") —
    # confirm this is the intended filename.
    joblib.dump(lr_cf, model_file + "jb")
def train_lr_model(train_file, model_coef, model_file):
    """Train LR with CV, report accuracy/AUC, save coefficients and model.

    :param train_file: processed csv for lr train (118 features + label)
    :param model_coef: output path for w1,w2,...
    :param model_file: output path for the pickled model
    """
    total_feature_num = 118  # feature dimension after discretization
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    # Cs are inverse regularization strengths; tol is the convergence threshold
    lr_cf = LRCV(
        Cs=[1, 10, 100],
        penalty="l2",
        tol=0.0001,
        max_iter=500,
        cv=5,
    ).fit(train_feature, train_label)
    # Bug fix: dict.values() is not subscriptable in Python 3 — wrap in list().
    scores = list(lr_cf.scores_.values())[0]  # shape (5, 3): folds x Cs
    print(",".join([str(ele) for ele in scores.mean(axis=0)]))  # per-C mean accuracy
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    # Bug fix: the original "Accurcy:%s%(+-%0.2f)" format string was malformed
    # (the stray '%' starts a mapping key) and raised at runtime; also fixes the
    # "Accurcy" typo.
    print("Accuracy:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_cf = LRCV(Cs=[1, 10, 100], penalty="l2", tol=0.0001, max_iter=500, cv=5, scoring="roc_auc").fit(
        train_feature, train_label)
    # Bug fix: the attribute is scores_ (trailing underscore), not scores.
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Auc:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    coef = lr_cf.coef_[0]
    # Bug fix: the original called open(model_coef, ) with no mode (default "r"),
    # so the write raised; open for writing via a context manager.
    with open(model_coef, "w") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    """
    Args:
        train_file: file for training model
        feature_num_file: file to store total feature len
        mix_tree_model_file: tree part of the mix model
        mix_lr_model_file: lr part of the mix model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    tree_num, tree_depth, learning_rate = 10, 6, 0.3
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    print(tree_leaf[0])      # leaf index of the first sample in each tree
    print(np.max(tree_leaf))  # largest leaf index observed
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    lr_cf = LRCV(Cs=[1, 10], penalty="l2", tol=0.0001, max_iter=500, cv=3,
                 scoring='roc_auc').fit(total_feature_list, train_label)
    # Bug fix: the original opened mix_lr_model_file and then neither wrote the
    # coefficients nor closed the handle, so the LR part of the mix model was
    # never persisted. Write the coefficient vector and close via context manager.
    with open(mix_lr_model_file, "w+") as fw:
        fw.write(",".join(str(ele) for ele in lr_cf.coef_[0]))
def train_lr_model(train_file, model_coef, model_file):
    """Train a cross-validated LR model and persist coefficients + pickle.

    Args:
        train_file: processed csv for lr training (32 features + trailing label)
        model_coef: output path for w1 w2 ...
        model_file: output path for the joblib pickle
    Return:
        None
    """
    total_feature_num = 32
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    lr_cf = LRCV(Cs=[1, 10], penalty="l2", tol=0.0001, max_iter=500, cv=3,
                 scoring='roc_auc').fit(train_feature, train_label)
    score = lr_cf.scores_  # dict: class label -> (n_folds, len(Cs)) AUC matrix
    print(train_feature)
    print(train_label)
    print(score)
    coef = lr_cf.coef_[0]
    # context manager guarantees the coefficient file is closed even on error
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
def train_lr_model(train_file, model_coef, model_file):
    '''Train an LR model with cross-validation and persist it.

    :param train_file: processed csv (features first, label in the last column)
    :param model_coef: output path for the comma-separated coefficients
    :param model_file: output path for the joblib pickle
    :return: None
    '''
    total_feature_num = 118
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)  # label = last column
    # Bug fix (likely off-by-one): the original used range(total_feature_num-1),
    # i.e. columns 0-116, silently dropping feature column 117. With 118 features
    # followed by the label, the features occupy columns 0-117.
    # TODO(review): confirm against the training-file layout.
    feature_list = list(range(total_feature_num))
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    # tol=0.0001 stopping condition, max_iter=500, 5-fold CV (20% held out per
    # fold); default solver is the quasi-Newton lbfgs. The reciprocals of Cs are
    # the regularization strengths (1, 0.1, 0.01) under the l2 penalty.
    lr_cf = LRCV(Cs=[1, 10, 100], penalty='l2', tol=0.0001, max_iter=500, cv=5,
                 scoring="roc_auc").fit(train_feature, train_label)
    # (n_folds, len(Cs)) score matrix: one row per fold, one column per C
    scores = list(lr_cf.scores_.values())[0]
    print(scores)
    # 1. dump the model coefficients
    coef = lr_cf.coef_[0]
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    # 2. dump the whole fitted model object
    joblib.dump(lr_cf, model_file)
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    """GBDT + LR mix model (tree and lr parts trained separately).

    :param train_file: file to training model
    :param feature_num_file: feature-count file
    :param mix_tree_model_file: output path of the tree model
    :param mix_lr_model_file: output path of the lr model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    (tree_num, tree_depth, learning_rate) = 10, 4, 0.3  # best parameters found earlier
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)  # persist the tree model
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    print(tree_leaf[0])  # leaf node each tree routes the first sample to
    # build the one-hot leaf encoding used as LR input
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    # LR part, scored by accuracy
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500,
                  cv=5).fit(total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Bug fix: the AUC model must be fit on the GBDT-encoded features
    # (total_feature_list); the original fit it on the raw train_feature, so the
    # reported AUC did not describe the mix model.
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
                 solver="liblinear", scoring="roc_auc").fit(total_feature_list, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # the persisted coefficients come from the accuracy-scored model (lr_clf),
    # matching the original behavior
    with open(mix_lr_model_file, "w+") as fw:
        fw.write(",".join([str(ele) for ele in lr_clf.coef_[0]]))
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    '''
    Args:
        train_file: file for training model
        feature_num_file: file to store total feature length
        mix_tree_model_file: tree part of mix model
        mix_lr_model_file: lr part of mix model
    '''
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    tree_num, tree_depth, learning_rate = 10, 4, 0.3
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    # leaf index of every sample in every tree
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5).fit(total_feature_list, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff: %s' % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print('accuracy: %s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    # Bug fix: the AUC model must be fit on the GBDT-encoded features
    # (total_feature_list); the original fit it on the raw train_feature, so both
    # the reported AUC and the saved coefficients did not belong to the mix model.
    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5,
                 scoring='roc_auc').fit(total_feature_list, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff: %s' % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print('auc: %s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    # context manager guarantees the coefficient file is flushed and closed
    with open(mix_lr_model_file, 'w+') as fw:
        fw.write(','.join(str(ele) for ele in lr_cf.coef_[0]))
def __init__(self, Cs=10, fit_intercept=True, cv=None, dual=False, penalty='l2',
             scoring=None, solver='lbfgs', tol=0.0001, max_iter=100,
             class_weight=None, n_jobs=None, verbose=0, refit=True,
             intercept_scaling=1, multi_class='auto', random_state=None,
             l1_ratios=None):
    """Store every hyper-parameter on self and build the wrapped LRCV
    estimator (presumably sklearn's LogisticRegressionCV — same kwargs)."""
    # keep one attribute per hyper-parameter, in parameter order
    self.Cs = Cs
    self.fit_intercept = fit_intercept
    self.cv = cv
    self.dual = dual
    self.penalty = penalty
    self.scoring = scoring
    self.solver = solver
    self.tol = tol
    self.max_iter = max_iter
    self.class_weight = class_weight
    self.n_jobs = n_jobs
    self.verbose = verbose
    self.refit = refit
    self.intercept_scaling = intercept_scaling
    self.multi_class = multi_class
    self.random_state = random_state
    self.l1_ratios = l1_ratios
    # the underlying estimator, configured with the stored values
    self.model = LRCV(Cs=self.Cs, fit_intercept=self.fit_intercept, cv=self.cv,
                      dual=self.dual, penalty=self.penalty, scoring=self.scoring,
                      solver=self.solver, tol=self.tol, max_iter=self.max_iter,
                      class_weight=self.class_weight, n_jobs=self.n_jobs,
                      verbose=self.verbose, refit=self.refit,
                      intercept_scaling=self.intercept_scaling,
                      multi_class=self.multi_class,
                      random_state=self.random_state,
                      l1_ratios=self.l1_ratios)
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    '''Tree model and LR are trained separately and saved to two files.

    Mix-model background: https://zhuanlan.zhihu.com/p/42123341

    :param train_file: training data
    :param feature_num_file: feature-count file
    :param mix_tree_model_file: XGBoost tree model file
    :param mix_lr_model_file: logistic model file
    '''
    # --- tree part ---
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    (tree_depth, tree_num, learning_rate) = get_mix_model_tree_info()
    print(tree_depth, tree_num, learning_rate)
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    # --- lr part: encode each sample by the leaf it lands in per tree ---
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    print(len(tree_leaf))
    # e.g. [15 18 15 15 23 27 13 17 28 21]: leaf index of the first sample in
    # each of the 10 trees (depth 4 -> 16 leaves, 15 internal nodes, so 15 is
    # the first leaf). Aim for a ~1:100 feature/sample ratio: depth 4 gives
    # 2**4 * 10 = 160 features against ~30k samples.
    print(tree_leaf[0])
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, cv=5)
    lr_clf = lr_clf.fit(total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%f(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, scoring='roc_auc', cv=5).fit(
        total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%f,(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Bug fix: the original never closed mix_lr_model_file; the context manager
    # guarantees the coefficients are flushed to disk.
    with open(mix_lr_model_file, "w+") as fw:
        fw.write(','.join([str(ele) for ele in lr_clf.coef_[0]]))
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """
    :param train_file: process file for lr train
    :param model_coef: w1, w2 .....
    :param model_file: model pkl
    :param feature_num_file: file to record num of feature
    """
    total_feature_num = GF.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
                 solver="liblinear").fit(train_feature, train_label)
    # Bug fix: in Python 3 dict.values() returns a non-subscriptable view, so
    # the original scores_.values()[0] raised TypeError — wrap in list() first.
    scores = list(lr_cf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
                 solver="liblinear", scoring="roc_auc").fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    coef = lr_cf.coef_[0]
    # context manager guarantees the coefficient file is closed on error
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    """GBDT + LR mix model, trained sequentially (the slower variant).

    :param train_file: training data
    :param feature_num_file: feature-dimension file
    :param mix_tree_model_file: output path of the tree model
    :param mix_lr_model_file: output path of the logistic-regression part
    :return: None
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    # tree depth was reduced from 6 to 4: 10 trees of depth 6 give a
    # 640-dimensional one-hot leaf encoding, too large for the ~30k samples
    # (a feature/sample ratio of roughly 1:100 is targeted)
    (tree_depth, tree_num, learning_rate) = get_mix_model_tree_info()
    # train and persist the tree part
    gbdt = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    gbdt.save_model(mix_tree_model_file)
    # leaf node each sample finally lands in, one column per tree
    leaf_index = gbdt.predict(train_mat, pred_leaf=True)
    encoded = get_gbdt_and_lr_featrue(leaf_index, tree_num, tree_depth)
    # logistic-regression part, model-selected by AUC
    lr_clf = LRCV(Cs=[1.0], penalty="l2", tol=0.00001, max_iter=500, cv=5,
                  scoring='roc_auc').fit(encoded, train_label)
    fold_scores = list(lr_clf.scores_.values())[0]
    print(fold_scores)
    # column-wise (per-C) mean
    print("diff %s : " % (" ".join([str(ele) for ele in fold_scores.mean(axis=0)])))
    print("AUC %s ,(+- %0.2f ):" % (fold_scores.mean(), fold_scores.std() * 2))
    with open(mix_lr_model_file, "w+", encoding="utf-8") as file:
        file.write(",".join([str(ele) for ele in lr_clf.coef_[0]]))
def train(path, param, gbdt_model, coef, lr_model):
    """GBDT & LR model train.

    Args:
        path: training file path (label in column 0, features after)
        param: xgboost parameter dict (must contain "max_depth")
        gbdt_model: gbdt model output file
        coef: lr coef output file
        lr_model: lr model output file
    """
    t = load(path, np.float64, ",")
    y_train = t[:, 0]
    X_train = t[:, 1:]
    dtrain = xgb.DMatrix(X_train, y_train)
    bst = xgb.train(param, dtrain)
    bst.save_model(gbdt_model)
    # upper bound on the leaf index for a tree of this depth
    num_leaf = 2 ** (param["max_depth"] + 1)
    trans = bst.predict(dtrain, pred_leaf=True)
    # one-hot encode each sample's leaf index per tree (tree-major order);
    # also avoids the original loop's shadowing of `t`
    X_lr_train = np.array([[1 if leaf == i else 0
                            for leaf in row
                            for i in range(num_leaf)]
                           for row in trans])
    y_lr_train = y_train
    lr = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
              scoring="roc_auc").fit(X_lr_train, y_lr_train)
    # Bug fix: the file was opened in binary mode ("wb") but str objects were
    # written, which raises TypeError in Python 3 — open in text mode.
    with open(coef, "w") as fp:
        fp.write(str(lr.intercept_[0]) + ",")
        fp.write(",".join([str(_) for _ in lr.coef_[0]]))
    joblib.dump(lr, lr_model)
def LRCVpredictor(X_train, y_train, X_test):
    '''Logistic Regression Classifier.

    Input: training data, target, and test data.
    Output: predictions for the test data plus training-set accuracy.
    Side effects: records results in the module-level accModels / predictions dicts.
    '''
    from sklearn.linear_model import LogisticRegressionCV as LRCV
    # Cross validation may not be needed for random forest classifier
    model = LRCV(random_state=1)
    model.fit(X_train, y_train)
    # resubstitution accuracy on the training set (not a CV estimate)
    y_pred = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, y_pred)
    # Bug fix: the original computed metrics.log_loss on hard label predictions
    # (log loss needs probabilities) and never used the result — dead code removed.
    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred
    return y_pred, accuracy
def train(path, coef, model):
    """LR model train.

    Args:
        path: training file path (label in column 0, features after)
        coef: model coef output file
        model: model output file
    """
    t = load(path, np.int32, ",")
    y_train = t[:, 0]
    X_train = t[:, 1:]
    lr = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5,
              scoring="roc_auc").fit(X_train, y_train)
    # Bug fix: the file was opened in binary mode ("wb") but str objects were
    # written, which raises TypeError in Python 3 — open in text mode.
    with open(coef, "w") as fp:
        fp.write(str(lr.intercept_[0]) + ",")
        fp.write(",".join([str(_) for _ in lr.coef_[0]]))
    joblib.dump(lr, model)
def run(self):
    """Train a LogisticRegressionCV model (train = every 5th row) and write
    train/test accuracy plus the fitted coefficients to the task output."""
    with self.output().open('w') as fout:
        data = pd.read_csv('../Part3/csv/learning_data.csv')
        x = data.iloc[:, 3:].values
        training_index = [v for v in range(len(x)) if v % 5 == 0]  # training set index
        testing_index = [v for v in range(len(x)) if v % 5 != 0]  # testing set index
        x1 = x[training_index, :]  # training X
        x2 = x[testing_index, :]  # testing X
        y = data.iloc[:, 2:3].values
        y1 = y[training_index, :]  # training Y
        y2 = y[testing_index, :]  # testing Y
        lr = LRCV()  # create logistic regression CV object
        # Bug fix: y1 is an (n, 1) column vector; sklearn expects a 1-D label
        # array and emits a DataConversionWarning before ravelling it itself —
        # ravel explicitly (the fitted model is unchanged).
        lr.fit(x1, y1.ravel())  # training model
        fout.write('The average accuracy of training set is:%s\n' % lr.score(x1, y1))
        fout.write('The average accuracy of testing set is:%s\n' % lr.score(x2, y2))
        fout.write(str(lr.coef_) + '\n')
        fout.write(str(lr.intercept_))
# standardize the feature matrix column-wise
scaler = preprocessing.StandardScaler().fit(x)
xscaled = scaler.transform(x)
# Bug fix: the test set must be transformed with the scaler fitted on the
# training data; fitting a second StandardScaler on xTest put train and test
# in inconsistent feature spaces (and leaked test-set statistics).
xTestScaled = scaler.transform(xTest)
times = 10
nSplit = 10
# running accuracy accumulators for the repeated evaluations below
gbdtTrainAcc = 0
gbdtTestAcc = 0
lrTrainAcc = 0
lrTestAcc = 0
# max_iter must be an int — newer sklearn rejects the float literal 1e5
lr = LR(solver='newton-cg', penalty='l2', max_iter=100000, tol=1e-5)
lrcv = LRCV(solver='newton-cg', penalty='l2', max_iter=100000, tol=1e-5)
# (the commented-out learning_curve experiment that followed was dead code
# and has been removed; recover it from version control if needed)
from sklearn.linear_model import LogisticRegressionCV as LRCV

data = pandas.read_csv('csv/learning_data.csv')
x = data.iloc[:, 3:].values
training_index = [i for i in range(len(x)) if i % 20 != 0]  # training set index
testing_index = [i for i in range(len(x)) if i % 20 == 0]  # testing set index
x1 = x[training_index, :]  # training X
x2 = x[testing_index, :]  # testing X
y = data.iloc[:, 2:3].values
y1 = y[training_index, :]  # training Y
y2 = y[testing_index, :]  # testing Y
lr = LRCV()  # create logistic regression CV object
# Bug fix: y1 is an (n, 1) column vector; ravel it to the 1-D label array
# sklearn expects (avoids DataConversionWarning; fitted model is identical).
lr.fit(x1, y1.ravel())  # training model
# judging accuracy
print('The average accuracy of training set is:%s' % lr.score(x1, y1))
print('The average accuracy of testing set is:%s' % lr.score(x2, y2))
print('Coefficients:\n ', lr.coef_)  # coefficient
print('Constant value:\n ', lr.intercept_)  # constant
# persist the same report; the context manager guarantees the file is closed
with open('result.txt', 'w') as file:
    file.write('The average accuracy of training set is:%s\n' % lr.score(x1, y1))
    file.write('The average accuracy of testing set is:%s\n' % lr.score(x2, y2))
    file.write('Coefficients:\n ' + str(lr.coef_) + '\n')
# finish and save the "DScore" floor histogram started earlier in the file
plt.legend(loc='upper left')
plt.xlabel("DScore")
plt.ylabel("Frequncy")  # NOTE(review): "Frequncy" typo kept — it is a runtime axis label
plt.title(r'Floor Histogram')
plt.savefig('fx_TQ_Hist.png', dpi=300)
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import LogisticRegressionCV as LRCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
MinorData = pd.read_csv("/home/yang/GymClean/MinorData.csv")
# one-hot encode features and labels; X and y are defined earlier in the file
X_ohkey = pd.get_dummies(X)
y_ohkey = pd.get_dummies(y)
y_ohkey  # bare expression — notebook-style inspection; no effect as a script
# fit CV and plain logistic regressions on the 'Yes' indicator column
clf = LRCV(cv=5, random_state=0).fit(X_ohkey, y_ohkey['Yes'])
clf1 = LR().fit(X_ohkey, y_ohkey['Yes'])
# resubstitution predictions (evaluated on the training data itself)
y_pred = clf.predict(X_ohkey)
y_pred1 = clf1.predict(X_ohkey)
from sklearn.metrics import confusion_matrix
# results discarded — notebook-style inspection lines
confusion_matrix(y_ohkey['Yes'], y_pred)
confusion_matrix(y_ohkey['Yes'], y_pred1)
clf.fit  # NOTE(review): attribute access without a call — a no-op, likely notebook residue
clf.coef_
clf.get_params()
X_ohkey.columns
# pair each one-hot column name with its fitted coefficient
MinorCoeff = pd.DataFrame(
    [list(X_ohkey.columns), list(clf.coef_.tolist()[0])],
    index=['indicator', 'coefficients']).transpose()
MajorData = pd.read_csv("/home/yang/GymClean/MajorData.csv")
# score the previously fitted logistic models on the test set
log_f = (clf.score(X_test_scaled, Ytest) for clf in log_models)
log_score = [sc for sc in log_f]
log_titles = ('L1 Penalty', 'L2 Penalty', 'Elastic Net Penalty')
fig, ax = plt.subplots()
ax.barh([1, 2, 3], log_score)
ax.set_xlabel('Fraction Correctly Identified')
ax.set_title('Accuracy of Three Logistic Regression Models')
ax.set_yticks([1, 2, 3])
ax.set_yticklabels(log_titles)
##see effect of regularization. See if we can improve each model with better c
##uses stratified k-folds for cross-validation
from sklearn.linear_model import LogisticRegressionCV as LRCV
# Bug fix: sklearn spells the penalty 'elasticnet' (no space) and requires
# l1_ratios alongside it; the original penalty='elastic net' raised ValueError.
log_cv = (LRCV(penalty='l1', random_state=0, solver='saga'),
          LRCV(penalty='l2', random_state=0),
          LRCV(penalty='elasticnet', random_state=0, solver='saga', l1_ratios=[0.5]))
log_cv_models = (clf.fit(X_train_scaled, np.ravel(Ytrain)) for clf in log_cv)
log_cv_f = (clf.score(X_test_scaled, Ytest) for clf in log_cv_models)
log_cv_score = [sc for sc in log_cv_f]
log_titles = ('L1 Penalty', 'L2 Penalty', 'Elastic Net Penalty')
model = LRCV().fit(X_train_scaled, np.ravel(Ytrain))
print(np.exp(model.coef_))       # exponentiated coefficients (odds ratios)
print(np.exp(model.intercept_))
fig, ax = plt.subplots()
ax.barh([1, 2, 3], log_cv_score)
ax.set_xlabel('Fraction Correctly Identified')
ax.set_title('Accuracy of Three Logistic Regression Models (with CV)')
ax.set_yticks([1, 2, 3])