Example #1
0
def train_lr_model(train_file, model_coef, model_file):
    """
    Train an LR model with 5-fold CV and report accuracy and AUC.

    Args:
        train_file: processed csv file for lr train (feature columns
            followed by the label in the last column)
        model_coef: w1 w2 (unused in this variant -- kept for interface
            compatibility with sibling implementations)
        model_file: model pkl (unused in this variant, see above)
    """
    feature_count = 118
    feature_list = range(feature_count)
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=",",
                                usecols=-1)
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,
                                  delimiter=",",
                                  usecols=feature_list)
    lr_cf = LRCV(Cs=[1, 10, 100], penalty="l2", tol=0.0001, max_iter=500,
                 cv=5).fit(train_feature, train_label)
    # scores_ maps class label -> (n_folds, n_Cs) array. dict.values() is
    # a view in Python 3, so materialize it before indexing; the original
    # called .mean() on the view itself and crashed.
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%s" % (scores.mean()))
    lr_cf = LRCV(Cs=[1, 10, 100],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=5,
                 scoring="roc_auc").fit(train_feature, train_label)
    # The original printed the raw dict_values view here; report the mean
    # AUC across folds and Cs instead.
    scores = list(lr_cf.scores_.values())[0]
    print("AUC:%s" % (scores.mean()))
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    """
    Train the GBDT part and the LR part of the mixed model.

    Args:
        train_file: file for training model
        feature_num_file: file to store total feature len
        mix_tree_model_file: tree part of the mix model
        mix_lr_model_file: lr part of the mix model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    (tree_depth, tree_num, learning_rate) = get_mix_model_tree_info()
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    # Leaf index each sample lands in, one column per tree.
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, cv=5)\
        .fit(total_feature_list, train_label)
    # scores_.values() is a dict view in Python 3; wrap in list() before
    # indexing (the original subscripted the view and crashed).
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%f(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, scoring='roc_auc', cv=5).fit(
        total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%f,(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Context manager flushes and closes the coef file (the original
    # opened it and never closed it).
    with open(mix_lr_model_file, "w+") as fw:
        coef = lr_clf.coef_[0]
        fw.write(','.join([str(ele) for ele in coef]))
Example #3
0
def train_lr_model(train_file, model_coef, model_file):
    """
    Train an LR model, report CV accuracy and AUC, then save the
    coefficients (text) and the fitted estimator (pickle).

    Args:
        train_file: process file for lr train
        model_coef: w1, w2 ...
        model_file: model_pkl
    """
    total_feature = 118
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)

    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s"%(",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%s (+-%0.2f)"%(scores.mean(), scores.std()*2))

    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5, scoring="roc_auc").fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s"%(",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%s (+-%0.2f)"%(scores.mean(), scores.std()*2))

    # Context manager guarantees the handle is closed even if write fails
    # (the original used a manual open/write/close triple).
    with open(model_coef, "w+", encoding="utf-8") as fw:
        fw.write(",".join([str(ele) for ele in lr_cf.coef_[0]]))

    joblib.dump(lr_cf, model_file)
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """
    Train an LR model and persist its coefficients and the fitted object.

    Args:
        train_file: processed csv file for lr training
        model_coef: output file for the coefficient vector (w1,w2,...)
        model_file: output path for the pickled model
        feature_num_file: file recording the total number of features
    """
    total_feature_num = GF.get_feature_num(feature_num_file)

    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)

    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5).fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('Accuracy:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))
    # The mean accuracy (~0.8426) +-0.01 covers ~90% of the fold scores,
    # so it is a trustworthy estimate.

    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500, cv=5, scoring='roc_auc').fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff:%s' % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print('AUC:%s (+-%0.2f)' % (scores.mean(), scores.std() * 2))

    coef = lr_cf.coef_[0]
    # Context manager closes the coef file even on a write error.
    with open(model_coef, 'w+') as fw:
        fw.write(','.join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
Example #5
0
File: train.py Project: atm1992/LR
def train_lr_model(train_file, model_coef, model_file):
    """
    :param train_file: process file for lr training
    :param model_coef: w1, w2, ...
    :param model_file: model pkl
    """
    # 98 discrete + 20 continuous feature dims = 118 in total; the label
    # adds one column, so train_file.txt / test_file.txt have 119 columns.
    num_features = FEATURE_NUM
    # usecols=-1 selects the last column, i.e. the label.
    labels = np.genfromtxt(train_file,
                           dtype=np.int32,
                           delimiter=",",
                           usecols=-1)
    feature_cols = list(range(num_features))
    samples = np.genfromtxt(train_file,
                            dtype=np.int32,
                            delimiter=",",
                            usecols=feature_cols)

    # Cs are inverse regularization strengths; tol is the stopping
    # tolerance; cv=5 => 5-fold CV (80% train / 20% validation per fold);
    # the default solver (a quasi-Newton method) is used.
    # Earlier runs with Cs=[1, 10, 100] showed C=1 gives the best AUC,
    # hence the list was narrowed to [1].
    classifier = LRCV(Cs=[1], penalty="l2", tol=1e-4, max_iter=500,
                      cv=5).fit(samples, labels)
    # scores_ maps class label -> (n_folds, n_Cs) array of fold scores.
    fold_scores = list(classifier.scores_.values())[0]
    # Per-C mean accuracy over the 5 folds.
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    # Overall mean accuracy across all Cs and folds.
    print("accuracy:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))

    # Same fit, but scored by AUC instead of accuracy (scoring="roc_auc").
    classifier = LRCV(Cs=[1],
                      penalty="l2",
                      tol=1e-4,
                      max_iter=500,
                      cv=5,
                      scoring="roc_auc").fit(samples, labels)
    fold_scores = list(classifier.scores_.values())[0]
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    print("AUC:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))

    # Persist the coefficient vector as plain text ...
    with open(model_coef, "w+") as f:
        f.write(",".join([str(ele) for ele in classifier.coef_[0]]))

    # ... and pickle the whole fitted estimator.
    joblib.dump(classifier, model_file)
Example #6
0
def train_gbdt_and_lr_model(train_file, feature_num_file, mix_tree_model_file,
                            mix_lr_model_file):
    """
    :param train_file: file for training model
    :param feature_num_file: file to store total feature len
    :param mix_tree_model_file: tree part of the mix model
    :param mix_lr_model_file: lr part of the mix model
    """
    samples, labels = get_train_data(train_file, feature_num_file)
    dmatrix = xgb.DMatrix(samples, labels)
    tree_num, tree_depth, learning_rate = get_mix_model_tree_info()
    booster = train_tree_model_core(dmatrix, tree_depth, tree_num, learning_rate)
    booster.save_model(mix_tree_model_file)
    # pred_leaf=True yields one row per sample (30162 in the training set)
    # and one column per tree, holding the index of the leaf node the
    # sample falls into (node indices start at 0 from the root, so the
    # tree has np.max(...) + 1 nodes).
    leaf_idx = booster.predict(dmatrix, pred_leaf=True)
    encoded = get_gbdt_and_lr_feature(leaf_idx, tree_num, tree_depth)
    clf = LRCV(Cs=[1],
               penalty="l2",
               dual=False,
               tol=1e-4,
               max_iter=500,
               cv=5).fit(encoded, labels)
    # scores_ gives a (n_folds, n_Cs) array of fold accuracies.
    fold_scores = list(clf.scores_.values())[0]
    # Per-C mean accuracy over the 5 folds.
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    # Overall mean accuracy.
    print("accuracy:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))

    # Refit with AUC as the CV metric (scoring="roc_auc").
    clf = LRCV(Cs=[1],
               penalty="l2",
               dual=False,
               tol=1e-4,
               max_iter=500,
               cv=5,
               scoring="roc_auc").fit(encoded, labels)
    fold_scores = list(clf.scores_.values())[0]
    print("diff:", ",".join([str(ele) for ele in fold_scores.mean(axis=0)]))
    print("AUC:{0} (+-{1:.3f})".format(fold_scores.mean(), fold_scores.std() * 2))

    with open(mix_lr_model_file, "w+") as f:
        f.write(",".join([str(ele) for ele in clf.coef_[0]]))
Example #7
0
def train_lr_mode(train_file, model_coef, model_file, feature_num_file):
    """
    Train an LR model with 5-fold CV, report accuracy and AUC, then save
    the coefficient vector and the fitted model.

    :param train_file: processed csv file for lr training
    :param model_coef: output file for the coefficient vector
    :param model_file: output path prefix for the dumped model
    :param feature_num_file: file recording the total number of features
    """
    # The feature count comes from the feature-num file. (The original
    # also had a dead hard-coded 118 that was immediately overwritten,
    # and the local name was misspelled "tolal_...".)
    total_feature_num = utils.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=',',
                                usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,
                                  delimiter=",",
                                  usecols=feature_list)
    # Cs are inverse regularization strengths (earlier runs with
    # Cs=[1, 10, 100] showed C=1 is best, so only 1 is kept);
    # tol is the stopping tolerance; cv=5 => 5-fold cross validation.
    # With an l2 penalty only quasi-Newton / SGD solvers apply; the
    # default (lbfgs, a quasi-Newton method) is used.
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.00001, max_iter=1000,
                 cv=5).fit(train_feature, train_label)

    # scores_.values() is a view; materialize and take the single class.
    scores = list(lr_cf.scores_.values())[0]
    print(scores)
    # Column-wise mean: per-C accuracy over the folds.
    print("diff %s : " %
          ("   ".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy %s ,(+- %0.2f ): " % (scores.mean(), scores.std() * 2))

    # Same fit scored by AUC.
    lr_cf = LRCV(Cs=[1],
                 penalty="l2",
                 tol=0.00001,
                 max_iter=1000,
                 cv=5,
                 scoring='roc_auc').fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print(scores)
    print("diff %s : " %
          ("  ".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC %s ,(+- %0.2f ):" % (scores.mean(), scores.std() * 2))

    # Save the model coefficients as csv text.
    coef = lr_cf.coef_[0]
    with open(model_coef, "w+", encoding='utf-8') as file:
        file.write(",".join(str(ele) for ele in coef))

    # NOTE(review): "jb" is appended with no dot/separator -- confirm the
    # intended dump filename.
    joblib.dump(lr_cf, model_file + "jb")
Example #8
0
def train_lr_model(train_file, model_coef, model_file):
    """
    Train an LR model with 5-fold CV, report accuracy and AUC, then save
    the coefficients and the pickled model.

    :param train_file: process file for lr train
    :param model_coef:  w1,w2,
    :param model_file:  model pkl
    """
    total_feature_num = 118  # feature dimension after discretization
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=",",
                                usecols=-1)  # last column is the label
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,
                                  delimiter=",",
                                  usecols=feature_list)  # first 118 cols are features
    # Cs are inverse regularization strengths; tol=0.0001 is the residual
    # convergence threshold.
    lr_cf = LRCV(
        Cs=[1, 10, 100],
        penalty="l2",
        tol=0.0001,
        max_iter=500,
        cv=5,
    ).fit(train_feature, train_label)
    # scores_.values() is a view in Python 3 -- materialize before
    # indexing (the original subscripted the view and crashed).
    scores = list(lr_cf.scores_.values())[0]  # shape=[5, 3]
    # Column-wise mean: one accuracy per C.
    print("diff:%s" % (",".join([str(ele)
                                 for ele in scores.mean(axis=0)])))
    # Fixed the original's malformed format string "Accurcy:%s%(+-%0.2f)",
    # which raised at runtime. +-2*std covers ~95% under normality.
    print("Accuracy:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_cf = LRCV(Cs=[1, 10, 100],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=5,
                 scoring="roc_auc").fit(
                     train_feature,
                     train_label)
    # Original read lr_cf.scores (missing underscore) -> AttributeError.
    scores = list(lr_cf.scores_.values())[0]
    print("diff:%s" % (",".join([str(ele)
                                 for ele in scores.mean(axis=0)])))
    # Fixed malformed "Auc:%s%(+-%0.2f)" as well.
    print("Auc:%s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    coef = lr_cf.coef_[0]
    # Original called open(model_coef, ) with no mode (read-only) and
    # then wrote to it; open for writing and close via context manager.
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))

    joblib.dump(lr_cf, model_file)
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file,
                            mix_lr_model_file):
    """
    Train the GBDT part and the LR part of the mixed model and save both.

    Args:
        train_file: file for training model
        feature_num_file: file to store total feature len
        mix_tree_model_file: tree part of the mix model
        mix_lr_model_file: lr part of the mix model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    tree_num, tree_depth, learning_rate = 10, 6, 0.3
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    # Leaf index each sample lands in, one column per tree.
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    print(tree_leaf[0])
    print(np.max(tree_leaf))
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num,
                                                 tree_depth)
    lr_cf = LRCV(Cs=[1, 10],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=3,
                 scoring='roc_auc').fit(total_feature_list, train_label)
    # The original opened the lr model file and never wrote to or closed
    # it (a leaked handle and an empty model file); persist the
    # coefficient vector and release the handle.
    with open(mix_lr_model_file, "w+") as fw:
        fw.write(",".join(str(ele) for ele in lr_cf.coef_[0]))
def train_lr_model(train_file, model_coef, model_file):
    """
    Train an AUC-scored LR model and save its coefficients and pickle.

    Args:
        train_file: process file for lr train
        model_coef: w1 w2 ...
        model_file: pkl
    Return:
        None
    """
    total_feature_num = 32
    train_label = np.genfromtxt(train_file,
                                dtype=np.int32,
                                delimiter=",",
                                usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file,
                                  dtype=np.int32,
                                  delimiter=",",
                                  usecols=feature_list)
    lr_cf = LRCV(Cs=[1, 10],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=3,
                 scoring='roc_auc').fit(train_feature, train_label)
    # scores_ maps class label -> (n_folds, n_Cs) array of AUC scores.
    score = lr_cf.scores_
    print(train_feature)
    print(train_label)
    print(score)
    coef = (lr_cf.coef_[0])
    # Context manager replaces the manual open/write/close triple.
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
def train_lr_model(train_file, model_coef, model_file):
    """
    Train a cross-validated LR model and persist it.

    :param train_file: training csv, label in the last column
    :param model_coef: output file for the coefficient vector
    :param model_file: output path for the pickled model
    :return: None
    """
    total_feature_num = 118
    # The label is stored in the final column.
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    # NOTE(review): only columns 0..116 are read (total_feature_num - 1);
    # sibling implementations read all 118 feature columns -- confirm
    # whether dropping the last feature column is intentional.
    feature_list = list(range(total_feature_num - 1))
    print(feature_list)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    # Cs are inverse regularization strengths (1, 0.1, 0.01) with an l2
    # penalty; tol=0.0001 stops the iteration, max_iter=500, 5-fold CV
    # scored by AUC (20% validation per fold); the default quasi-Newton
    # solver suits this small data set.
    lr_cf = LRCV(Cs=[1, 10, 100], penalty='l2', tol=0.0001, max_iter=500,
                 cv=5, scoring="roc_auc").fit(train_feature, train_label)
    # Per-class view of (n_folds, n_Cs) score arrays: one accuracy per
    # fold and per regularization value.
    scores = lr_cf.scores_.values()
    print(scores)
    # 1. dump the raw coefficient vector as csv text
    coef = lr_cf.coef_[0]
    fw = open(model_coef, "w+")
    fw.write(",".join(str(ele) for ele in coef))
    fw.close()
    # 2. pickle the whole fitted estimator
    joblib.dump(lr_cf, model_file)
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file,
                            mix_lr_model_file):
    """
    GBDT + LR mixed model; tree and lr parts trained separately.

    :param train_file: file to training model
    :param feature_num_file: file recording the feature count
    :param mix_tree_model_file: output path for the tree model
    :param mix_lr_model_file: output path for the lr model
    """
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    (tree_num, tree_depth, learning_rate) = 10, 4, 0.3  # tuned parameters
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)  # persist the tree part
    tree_leaf = bst.predict(train_mat, pred_leaf=True)
    print(tree_leaf[0])  # leaf index each sample lands in, per tree
    # Expand the leaf indices into one-hot features for the lr part.
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num,
                                                 tree_depth)
    # LR part, scored by accuracy.
    lr_clf = LRCV(Cs=[1.0],
                  penalty='l2',
                  dual=False,
                  tol=0.0001,
                  max_iter=500,
                  cv=5).fit(total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # AUC pass. The original fitted this model on the RAW train_feature,
    # which is inconsistent with the mixed model -- the lr part must be
    # evaluated on the gbdt-encoded features like the pass above.
    lr_cf = LRCV(Cs=[1],
                 penalty="l2",
                 tol=0.0001,
                 max_iter=500,
                 cv=5,
                 solver="liblinear",
                 scoring="roc_auc").fit(total_feature_list, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Save the accuracy-pass coefficients; context manager closes the file.
    coef = lr_clf.coef_[0]
    with open(mix_lr_model_file, "w+") as fw:
        fw.write(",".join([str(ele) for ele in coef]))
Example #13
0
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file,
                            mix_lr_model_file):
    '''
    Train the GBDT part and the LR part of the mixed model.

    Args:
        train_file: file for training model
        feature_num_file: file to store total feature length
        mix_tree_model_file: tree part of mix model
        mix_lr_model_file: lr part of mix model
    '''
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    tree_num, tree_depth, learning_rate = 10, 4, 0.3
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)
    # Leaf index each sample falls into, one column per tree.
    tree_leaf = bst.predict(train_mat, pred_leaf=True)

    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num,
                                                 tree_depth)

    lr_cf = LRCV(Cs=[1], penalty='l2', tol=0.0001, max_iter=500,
                 cv=5).fit(total_feature_list, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff: %s' % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print('accuracy: %s (+-%0.2f)' % (scores.mean(), scores.std() * 2))

    # AUC pass. The original fitted this model on the raw train_feature,
    # which is inconsistent with the mixed model (and made the saved coef
    # vector the wrong length for the encoded feature space): the lr part
    # must be trained on the gbdt-encoded features.
    lr_cf = LRCV(Cs=[1],
                 penalty='l2',
                 tol=0.0001,
                 max_iter=500,
                 cv=5,
                 scoring='roc_auc').fit(total_feature_list, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print('diff: %s' % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print('auc: %s (+-%0.2f)' % (scores.mean(), scores.std() * 2))

    # Context manager closes the coef file (the manual close could be
    # skipped on an exception).
    with open(mix_lr_model_file, 'w+') as fw:
        coef = lr_cf.coef_[0]
        fw.write(','.join(str(ele) for ele in coef))
Example #14
0
 def __init__(self,
              Cs=10,
              fit_intercept=True,
              cv=None,
              dual=False,
              penalty='l2',
              scoring=None,
              solver='lbfgs',
              tol=0.0001,
              max_iter=100,
              class_weight=None,
              n_jobs=None,
              verbose=0,
              refit=True,
              intercept_scaling=1,
              multi_class='auto',
              random_state=None,
              l1_ratios=None):
     """Store the hyper-parameters and eagerly build the wrapped
     sklearn LogisticRegressionCV estimator.

     The parameters (and their defaults) mirror sklearn's
     LogisticRegressionCV one-to-one; see the sklearn documentation
     for their meaning.
     """
     # Each argument is stashed on self so it can be inspected or
     # cloned later (sklearn get_params-style convention).
     self.class_weight = class_weight
     self.tol = tol
     self.dual = dual
     self.multi_class = multi_class
     self.Cs = Cs
     self.random_state = random_state
     self.penalty = penalty
     self.max_iter = max_iter
     self.cv = cv
     self.fit_intercept = fit_intercept
     self.n_jobs = n_jobs
     self.refit = refit
     self.l1_ratios = l1_ratios
     self.solver = solver
     self.intercept_scaling = intercept_scaling
     self.verbose = verbose
     self.scoring = scoring
     # The underlying estimator, constructed from the stored params.
     self.model = LRCV(dual=self.dual,
                       intercept_scaling=self.intercept_scaling,
                       max_iter=self.max_iter,
                       cv=self.cv,
                       solver=self.solver,
                       fit_intercept=self.fit_intercept,
                       l1_ratios=self.l1_ratios,
                       refit=self.refit,
                       tol=self.tol,
                       n_jobs=self.n_jobs,
                       penalty=self.penalty,
                       verbose=self.verbose,
                       Cs=self.Cs,
                       scoring=self.scoring,
                       multi_class=self.multi_class,
                       class_weight=self.class_weight,
                       random_state=self.random_state)
Example #15
0
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file, mix_lr_model_file):
    '''
    The tree model and the lr model are trained separately and saved to
    two files (mixed-model idea: https://zhuanlan.zhihu.com/p/42123341).

    :param train_file: training data file
    :param feature_num_file: file recording the feature count
    :param mix_tree_model_file: XGBoost tree model file
    :param mix_lr_model_file: logistic regression model file
    '''
    # --- tree part ---
    train_feature, train_label = get_train_data(train_file, feature_num_file)
    train_mat = xgb.DMatrix(train_feature, train_label)
    (tree_depth, tree_num, learning_rate) = get_mix_model_tree_info()
    print(tree_depth, tree_num, learning_rate)
    bst = train_tree_model_core(train_mat, tree_depth, tree_num, learning_rate)
    bst.save_model(mix_tree_model_file)

    # --- lr part: encode each sample by the leaf it falls into, then fit ---
    tree_leaf = bst.predict(train_mat, pred_leaf=True)  # leaf index per sample per tree
    print(len(tree_leaf))
    print(tree_leaf[0])
    # e.g. the first sample lands in leaves [15 18 15 15 23 27 13 17 28 21]
    # of the 10 trees; at depth 4 each tree has 16 leaves after 15 internal
    # nodes. Rule-of-thumb feature:sample ratio 1:100 -> 2**4 * 10 = 160
    # encoded features for ~30k samples.
    total_feature_list = get_gbdt_and_lr_feature(tree_leaf, tree_num, tree_depth)
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, cv=5)
    lr_clf = lr_clf.fit(total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy:%f(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_clf = LRCV(Cs=[1.0], penalty='l2', dual=False, tol=0.0001, max_iter=500, scoring='roc_auc', cv=5).fit(
        total_feature_list, train_label)
    scores = list(lr_clf.scores_.values())[0]
    print("diffC:%s" % (','.join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC:%f,(+-%0.2f)" % (scores.mean(), scores.std() * 2))
    # Context manager closes the coef file (the original leaked the handle).
    with open(mix_lr_model_file, "w+") as fw:
        coef = lr_clf.coef_[0]
        fw.write(','.join([str(ele) for ele in coef]))
def train_lr_model(train_file, model_coef, model_file, feature_num_file):
    """
    Train an LR model (liblinear solver), report CV accuracy and AUC, and
    save the coefficients and the pickled model.

    :param train_file: process file for lr train
    :param model_coef: w1, w2 .....
    :param model_file: model pkl
    :param feature_num_file: file to record num of feature
    """
    total_feature_num = GF.get_feature_num(feature_num_file)
    train_label = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=-1)
    feature_list = range(total_feature_num)
    train_feature = np.genfromtxt(train_file, dtype=np.int32, delimiter=",", usecols=feature_list)
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5, solver="liblinear").fit(train_feature, train_label)
    # dict.values() is a view in Python 3: materialize it before indexing
    # (the original subscripted the view directly and raised TypeError).
    scores = list(lr_cf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("Accuracy: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    lr_cf = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5, solver="liblinear", scoring="roc_auc").fit(train_feature, train_label)
    scores = list(lr_cf.scores_.values())[0]
    print("diff: %s" % (",".join([str(ele) for ele in scores.mean(axis=0)])))
    print("AUC: %s (+-%0.2f)" % (scores.mean(), scores.std() * 2))
    coef = lr_cf.coef_[0]
    # Context manager closes the handle even if the write raises.
    with open(model_coef, "w+") as fw:
        fw.write(",".join(str(ele) for ele in coef))
    joblib.dump(lr_cf, model_file)
Example #17
0
def train_tree_and_lr_model(train_file, feature_num_file, mix_tree_model_file,
                            mix_lr_model_file):
    """
    gbdt+lr mixed model, trained part by part in sequence (slow).

    :param train_file: training data
    :param feature_num_file: feature-dimension file
    :param mix_tree_model_file: tree model output file
    :param mix_lr_model_file: logistic regression output file
    :return: None
    """
    sample_x, sample_y = get_train_data(train_file, feature_num_file)
    dmat = xgb.DMatrix(sample_x, sample_y)
    (tree_depth, tree_num, learning_rate) = get_mix_model_tree_info()
    # The tree depth was lowered from 6 to 4: at depth 6 a tree has 127
    # nodes (64 leaves, 63 internal), so 10 trees give 640 leaf dimensions
    # for only ~30k samples (target feature:sample ratio ~1:100), and few
    # samples land in many of those leaves.
    booster = train_tree_model_core(dmat, tree_depth, tree_num, learning_rate)
    booster.save_model(mix_tree_model_file)
    # Index of the leaf each sample finally lands in, per tree,
    # e.g. [81 84 84 84 85 77 68 91 97 61] for 10 trees.
    leaves = booster.predict(dmat, pred_leaf=True)
    encoded_x = get_gbdt_and_lr_featrue(leaves, tree_num,
                                        tree_depth)

    # Logistic-regression part, 5-fold CV scored by AUC.
    model = LRCV(Cs=[1.0],
                 penalty="l2",
                 tol=0.00001,
                 max_iter=500,
                 cv=5,
                 scoring='roc_auc').fit(encoded_x, sample_y)
    fold_scores = list(model.scores_.values())[0]  # (n_folds, n_Cs) array
    print(fold_scores)
    print("diff %s : " %
          ("  ".join([str(ele) for ele in fold_scores.mean(axis=0)])))  # column-wise mean
    print("AUC %s ,(+- %0.2f ):" % (fold_scores.mean(), fold_scores.std() * 2))
    weights = model.coef_[0]
    with open(mix_lr_model_file, "w+", encoding="utf-8") as file:
        file.write(",".join([str(ele) for ele in weights]))
def train(path, param, gbdt_model, coef, lr_model):
    """
    gbdt&lr model train
    Args:
        path: file path
        param: model parameter
        gbdt_model: gbdt model file
        coef: lr coef file
        lr_model: lr model file
    """
    t = load(path, np.float64, ",")
    y_train = t[:, 0]
    X_train = t[:, 1:]

    dtrain = xgb.DMatrix(X_train, y_train)
    bst = xgb.train(param, dtrain)
    bst.save_model(gbdt_model)

    # One-hot encode the leaf index of every sample in every tree;
    # each tree contributes num_leaf indicator columns.
    num_leaf = 2**(param["max_depth"] + 1)
    trans = bst.predict(dtrain, pred_leaf=True)

    X_lr_train = np.array([
        [1 if leaf == i else 0 for leaf in row for i in range(num_leaf)]
        for row in trans
    ])
    y_lr_train = y_train

    lr = LRCV(Cs=[1],
              penalty="l2",
              tol=0.0001,
              max_iter=500,
              cv=5,
              scoring="roc_auc").fit(X_lr_train, y_lr_train)

    # Text mode: the original opened the file "wb" and then wrote str
    # objects, which raises TypeError in Python 3.
    with open(coef, "w") as fp:
        fp.write(str(lr.intercept_[0]) + ",")
        fp.write(",".join([str(_) for _ in lr.coef_[0]]))

    joblib.dump(lr, lr_model)
Example #19
0
def LRCVpredictor(X_train, y_train, X_test):
    '''Logistic Regression classifier with built-in cross validation.

    Input: training data, training targets, and test data.
    Output: predicted labels for the test data plus training accuracy.
    NOTE(review): despite the original wording ("probability of each
    label"), this returns hard labels from predict(), not
    predict_proba() -- confirm which the callers expect.'''
    from sklearn.linear_model import LogisticRegressionCV as LRCV

    # Cross validation may not be needed for random forest classifier
    model = LRCV(random_state=1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy = metrics.accuracy_score(y_train, y_pred)
    # NOTE(review): log loss computed on hard labels (not probabilities)
    # and never used or returned afterwards.
    logLoss = metrics.log_loss(y_train, y_pred)

    y_pred = model.predict(X_test)
    modelName = model.__class__.__name__
    # Side effect: records results in the module-level accModels /
    # predictions dicts keyed by the sklearn class name.
    accModels[modelName] = accuracy
    predictions[modelName] = y_pred

    return y_pred, accuracy
Example #20
0
File: lr.py Project: xiadx/rec-sys
def train(path, coef, model):
    """
    lr model train
    Args:
        path: file path
        coef: model coef file
        model: model file
    """
    t = load(path, np.int32, ",")
    y_train = t[:, 0]
    X_train = t[:, 1:]

    lr = LRCV(Cs=[1], penalty="l2", tol=0.0001, max_iter=500, cv=5, scoring="roc_auc").fit(X_train, y_train)

    # Text mode: the original opened the file "wb" and then wrote str
    # objects, which raises TypeError in Python 3.
    with open(coef, "w") as fp:
        fp.write(str(lr.intercept_[0]) + ",")
        fp.write(",".join([str(_) for _ in lr.coef_[0]]))

    joblib.dump(lr, model)
Example #21
0
    def run(self):
        """Fit a CV logistic regression on a 1-in-5 training split of
        learning_data.csv and write accuracies and parameters to the
        task's output file."""
        with self.output().open('w') as fout:
            frame = pd.read_csv('../Part3/csv/learning_data.csv')
            features = frame.iloc[:, 3:].values
            # Every 5th row trains; the rest test.
            train_rows = [i for i in range(len(features)) if i % 5 == 0]
            test_rows = [i for i in range(len(features)) if i % 5 != 0]
            x_train = features[train_rows, :]
            x_test = features[test_rows, :]

            targets = frame.iloc[:, 2:3].values
            y_train = targets[train_rows, :]
            y_test = targets[test_rows, :]

            # Logistic regression with built-in cross validation.
            model = LRCV()
            model.fit(x_train, y_train)
            fout.write('The average accuracy of training set is:%s\n' %
                       model.score(x_train, y_train))
            fout.write('The average accuracy of testing set is:%s\n' %
                       model.score(x_test, y_test))
            fout.write(str(model.coef_) + '\n')
            fout.write(str(model.intercept_))
Example #22
0
# Standardize the feature vectors column-wise (zero mean, unit variance).
scaler = preprocessing.StandardScaler().fit(x)
xscaled = scaler.transform(x)
# NOTE(review): a second scaler is fitted on the test set itself; the
# usual practice is to transform the test data with the train-fitted
# scaler -- confirm this is intentional.
scaler = preprocessing.StandardScaler().fit(xTest)
xTestScaled = scaler.transform(xTest)

# Accumulators for repeated train/test accuracy measurements.
times = 10
nSplit = 10
gbdtTrainAcc = 0
gbdtTestAcc = 0
lrTrainAcc = 0
lrTestAcc = 0

# NOTE(review): max_iter is passed as the float 1e5; newer sklearn
# versions expect an int here -- confirm against the pinned version.
lr = LR(solver='newton-cg', penalty='l2', max_iter=1e5, tol=1e-5)
lrcv = LRCV(solver='newton-cg', penalty='l2', max_iter=1e5, tol=1e-5)

# # evaluate the model with learning_curve
# numTrees = 200
# gbc = GradientBoostingClassifier(n_estimators=numTrees, learning_rate=0.01, subsample=0.5, random_state=0,\
#                                  max_depth=20, min_samples_split=2)
# numSection = 10
# trainS = np.zeros(numSection)
# testS = np.zeros(numSection)
#
# for t in range(times):
#     state = np.random.get_state()
#     np.random.shuffle(xscaled)
#     np.random.set_state(state)
#     np.random.shuffle(y)
#     train_sizes, train_score, test_score = \
from sklearn.linear_model import LogisticRegressionCV as LRCV

data = pandas.read_csv('csv/learning_data.csv')

# Feature columns start at index 3; every 20th row is held out for test.
x = data.iloc[:, 3:].values
training_index = [i for i in range(len(x))
                  if i % 20 != 0]  # training set index
testing_index = [i for i in range(len(x)) if i % 20 == 0]  # testing set index
x1 = x[training_index, :]  # training X
x2 = x[testing_index, :]  # testing X

# Target is column 2; iloc[:, 2:3] keeps it as an (n, 1) column vector.
# NOTE(review): sklearn expects a 1-D y and will warn/ravel -- confirm.
y = data.iloc[:, 2:3].values
y1 = y[training_index, :]  # training Y
y2 = y[testing_index, :]  # testing Y

lr = LRCV()  # create logistic regression CV object
lr.fit(x1, y1)  # training model

# judging accuracy
print('The average accuracy of training set is:%s' % lr.score(x1, y1))
print('The average accuracy of testing set is:%s' % lr.score(x2, y2))

print('Coefficients:\n ', lr.coef_)  # coefficient
print('Constant value:\n ', lr.intercept_)  # constant

# Persist the same metrics and parameters to a text report.
with open('result.txt', 'w') as file:
    file.write('The average accuracy of training set is:%s\n' %
               lr.score(x1, y1))
    file.write('The average accuracy of testing set is:%s\n' %
               lr.score(x2, y2))
    file.write('Coefficients:\n ' + str(lr.coef_) + '\n')
Example #24
0
# Finish and save the histogram started earlier in the script.
plt.legend(loc='upper left')
plt.xlabel("DScore")
plt.ylabel("Frequncy")
plt.title(r'Floor Histogram')
plt.savefig('fx_TQ_Hist.png', dpi=300)

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import LogisticRegressionCV as LRCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
MinorData = pd.read_csv("/home/yang/GymClean/MinorData.csv")
# One-hot encode features and target; the 'Yes' dummy column is the
# binary label fitted below.
X_ohkey = pd.get_dummies(X)
y_ohkey = pd.get_dummies(y)
y_ohkey
clf = LRCV(cv=5, random_state=0).fit(X_ohkey, y_ohkey['Yes'])
clf1 = LR().fit(X_ohkey, y_ohkey['Yes'])
# Compare the CV and non-CV fits on the training data itself.
y_pred = clf.predict(X_ohkey)
y_pred1 = clf1.predict(X_ohkey)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_ohkey['Yes'], y_pred)
confusion_matrix(y_ohkey['Yes'], y_pred1)
clf.fit  # NOTE(review): bare attribute access -- this line is a no-op, not a call.
clf.coef_
clf.get_params()
X_ohkey.columns
# Pair each one-hot column name with its fitted coefficient.
MinorCoeff = pd.DataFrame(
    [list(X_ohkey.columns), list(clf.coef_.tolist()[0])],
    index=['indicator', 'coefficients']).transpose()

MajorData = pd.read_csv("/home/yang/GymClean/MajorData.csv")
Example #25
0
# Score the previously fitted (non-CV) models on the held-out test set.
log_f = (clf.score(X_test_scaled, Ytest) for clf in log_models)
log_score = [sc for sc in log_f]

log_titles = ('L1 Penalty', 'L2 Penalty', 'Elastic Net Penalty')

fig, ax = plt.subplots()
ax.barh([1, 2, 3], log_score)
ax.set_xlabel('Fraction Correctly Identified')
ax.set_title('Accuracy of Three Logistic Regression Models')
ax.set_yticks([1, 2, 3])
ax.set_yticklabels(log_titles)

##see effect of regularization. See if we can improve each model with better c
##uses stratified k-folds for cross-validation
from sklearn.linear_model import LogisticRegressionCV as LRCV
# sklearn spells the penalty "elasticnet" (no space) and requires
# l1_ratios with it; the original "elastic net" raised ValueError.
log_cv = (LRCV(penalty='l1', random_state=0,
               solver='saga'), LRCV(penalty='l2', random_state=0),
          LRCV(penalty='elasticnet', random_state=0, solver='saga',
               l1_ratios=[0.5]))
log_cv_models = (clf.fit(X_train_scaled, np.ravel(Ytrain)) for clf in log_cv)
log_cv_f = (clf.score(X_test_scaled, Ytest) for clf in log_cv_models)
log_cv_score = [sc for sc in log_cv_f]
log_titles = ('L1 Penalty', 'L2 Penalty', 'Elastic Net Penalty')

# Default CV model; exponentiate coefficients to read them as odds ratios.
model = LRCV().fit(X_train_scaled, np.ravel(Ytrain))
print(np.exp(model.coef_))
print(np.exp(model.intercept_))

fig, ax = plt.subplots()
ax.barh([1, 2, 3], log_cv_score)
ax.set_xlabel('Fraction Correctly Identified')
ax.set_title('Accuracy of Three Logistic Regression Models (with CV)')
ax.set_yticks([1, 2, 3])