def get_support_fields(X, Y):
    """Screen features with randomized logistic regression and return them.

    Fits a RandomizedLogisticRegression (stability selection) on (X, Y),
    prints the per-feature scores and the selected column names, and returns
    the matrix of selected columns.

    Fixes: the original used a Python 2 ``print`` statement (a syntax error
    under Python 3) and called ``.decode('utf-8')`` on a str, which raises
    AttributeError on Python 3.

    NOTE(review): the column lookup uses a module-level DataFrame ``data``,
    not the X argument — confirm `data` holds the frame X was taken from.
    """
    rlr = RLR()  # randomized logistic regression model for feature screening
    rlr.fit(X, Y)  # train the selector
    rlr.get_support()  # selection mask; per-feature scores via .scores_
    print(rlr.scores_)
    print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
    # .as_matrix() is deprecated in modern pandas — consider .values if upgrading
    X = data[data.columns[rlr.get_support()]].as_matrix()  # keep selected columns
    return X
# Exemple #2
# 0
	def randomized_Logistic_regression(self):
		"""Run stability selection on self.data (col 0 = label, rest = features).

		Fits a RandomizedLogisticRegression and computes the indices of the
		selected features; results are local only (nothing is returned).
		"""
		feature_matrix = self.data[:, 1:len(self.data[0])]
		labels = self.data[:, 0]
		selector = RandomizedLogisticRegression()
		selector.fit(feature_matrix, labels)
		support_mask = selector.get_support()
		selected = np.where(support_mask)
# Exemple #3
# 0
 def randomized_Logistic_regression(self):
     """Stability-select features of self.data (label in column 0).

     Trains a RandomizedLogisticRegression and derives the index tuple of
     kept features; no value is returned and no state is stored.
     """
     X, y = self.data[:, 1:len(self.data[0])], self.data[:, 0]
     rlr_model = RandomizedLogisticRegression()
     rlr_model.fit(X, y)
     keep = rlr_model.get_support()
     selected = np.where(keep)
def get_features(X_train, y_train, names, selection_threshold=0.2):
    """Return the feature names kept by randomized logistic regression.

    Parameters
    ----------
    X_train, y_train : training data handed straight to the selector.
    names : sequence of feature names aligned with X_train's columns.
    selection_threshold : float, minimum stability score to keep a feature.

    Returns
    -------
    numpy.ndarray of selected names.
    """
    print('\ngetting features with randomized logistic regression...')
    print('using a selection threshold of {}'.format(selection_threshold))
    randomized_logistic = RandomizedLogisticRegression(
        selection_threshold=selection_threshold)
    randomized_logistic.fit(X_train, y_train)
    mask = randomized_logistic.get_support()
    features = np.array(names)[mask]
    # idiom fix: the original built `[f for f in features]` twice for no reason
    print('found {} ngrams:'.format(len(features)))
    print(list(features))
    return features
# Exemple #5
# 0
def getElgiibleFeatures(allFeatureParam, allLabelParam):
    '''
    Return the integer indices of features kept by randomized L1 selection.

    References:
    http://scikit-learn.org/stable/modules/feature_selection.html#randomized-l1
    http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RandomizedLogisticRegression.html
    '''
    selector = RandomizedLogisticRegression()
    selector.fit(allFeatureParam, allLabelParam)
    ### Output ###
    # integer index form (indices=True) rather than a boolean mask
    return selector.get_support(indices=True)
def logistic(X_train, X_test, y_train, y_test):
    """Feature-select with randomized LR, then fit and evaluate LogisticRegression.

    Selects columns of X_train/X_test via RandomizedLogisticRegression, fits a
    LogisticRegression on the reduced training data, prints metrics via the
    module-level ``metrics_result`` helper, and returns
    (test probabilities, train probabilities).

    Fix: ``DataFrame.as_matrix()`` was removed in pandas 1.0 — use ``.values``,
    which is equivalent and available in old pandas too.
    """
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.linear_model import RandomizedLogisticRegression as RLR
    # feature engineering: keep only the columns flagged by stability selection
    rlr = RLR()
    rlr.fit(X_train, y_train)
    print(rlr.get_support())
    x = X_train[X_train.columns[rlr.get_support()]].values
    x_test = X_test[X_test.columns[rlr.get_support()]].values
    '''
    x=X_train
    x_test=X_test
    '''
    # logistic regression on the selected features
    lr = LR()
    lr.fit(x, y_train)
    pred_prob_train = lr.predict_proba(x)
    pred_prob = lr.predict_proba(x_test)
    print('logistic')
    predicts = lr.predict(x_test)
    metrics_result(y_test, predicts)

    return pred_prob, pred_prob_train
# Exemple #7
# 0
def logistic_regression():
    """Load the bank-loan sheet, screen features with RLR, fit and score LR.

    Fix: the original mixed Python 2 ``print`` statements (``print data.head()``)
    with Python 3 ``print()`` calls, so it could not parse under Python 3; all
    prints are now function calls (valid on both).
    """
    # parameter initialisation (SRC_PATH is a module-level constant)
    filename = SRC_PATH + '/data/bankloan.xls'
    data = pd.read_excel(filename)
    print(data.head())
    print(data.tail())

    x = data.iloc[:, :8].as_matrix()
    y = data.iloc[:, 8].as_matrix()

    print(x, y)

    rlr = RLR()  # randomized logistic regression for feature screening
    rlr.fit(x, y)  # train the selector
    rlr.get_support()  # selection mask; per-feature scores via .scores_
    print(u'通过随机逻辑回归模型筛选特征结束。')

    # print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
    # x = data[data.columns[rlr.get_support()]].as_matrix()  # keep selected columns

    lr = LR()  # logistic regression model
    lr.fit(x, y)  # train on the (unreduced) feature data
    print(u'逻辑回归模型训练结束。')
    print(u'模型的平均正确率为:%s' % lr.score(x, y))  # mean accuracy (81.4% in the book)
def programmer_1():
    """Screen bank-loan features with RLR, then fit and score LR on the kept columns."""
    loan_data = pd.read_excel("data/bankloan.xls")

    features = loan_data.iloc[:, :8].as_matrix()
    labels = loan_data.iloc[:, 8].as_matrix()

    # stability selection: keep only columns flagged by the randomized model
    selector = RLR()
    selector.fit(features, labels)
    mask = selector.get_support()
    kept_columns = loan_data.drop('违约', axis=1).columns[mask]

    print(
        "rlr_support_columns: {columns}".format(columns=','.join(kept_columns)))

    # refit a plain logistic regression on the selected columns only
    features = loan_data[kept_columns].as_matrix()
    classifier = LR()
    classifier.fit(features, labels)

    print("lr: {score}".format(score=classifier.score(features, labels)))
# Exemple #9
# 0
def programmer_1():
    """Select predictive bank-loan columns via randomized LR, then report LR accuracy."""
    df = pd.read_excel("data/bankloan.xls")

    X = df.iloc[:, :8].as_matrix()  # first eight columns are predictors
    y = df.iloc[:, 8].as_matrix()  # ninth column is the default flag

    stability_model = RLR()
    stability_model.fit(X, y)
    keep = df.drop('违约', axis=1).columns[stability_model.get_support()]

    print(
        "rlr_support_columns: {columns}".format(columns=','.join(keep)))

    X = df[keep].as_matrix()
    model = LR()
    model.fit(X, y)

    print("lr: {score}".format(score=model.score(X, y)))
def tipdm_chapter5_test():
	"""End-to-end demo: RLR feature screening + LR fit on bankloan.xls."""
	# parameter initialisation
	data = pd.read_excel('../../../MyFile/chapter5/data/bankloan.xls')
	x = data.iloc[:,:8].as_matrix()
	y = data.iloc[:,8].as_matrix()

	# feature selection via randomized logistic regression (stability selection)
	selector = RLR()
	selector.fit(x, y)
	mask = selector.get_support()	# keep/drop mask; per-feature scores via .scores_
	print(u'通过随机逻辑回归模型筛选特征结束。')
	print(u'有效特征为: {0}'.format(','.join(data.columns[mask])))
	x = data[data.columns[mask]].as_matrix()	# keep only the selected columns

	# training and test: logistic regression on the reduced matrix
	classifier = LR()
	classifier.fit(x, y)
	print(u'逻辑回归模型训练结束。')
	print(u'模型的平均正确率为: {0}'.format(classifier.score(x, y))) # mean accuracy
# Exemple #11
# 0
def programmer_1():
    """Bank-loan demo: stability-select features, pad the mask past the label, fit LR."""
    # parameter initialisation
    df = pd.read_excel(r'bankloan.xls')
    X = df.iloc[:, :8].as_matrix()  # pandas reads the sheet, header handled for us
    y = df.iloc[:, 8].as_matrix()

    selector = RLR()  # randomized logistic regression for feature screening
    selector.fit(X, y)
    keep_mask = selector.get_support()
    # pad the mask with one False so it lines up with the label column too
    keep_mask = np.append(keep_mask, False)
    print("rlr.get_support():")
    print(keep_mask)
    print(u'随机逻辑回归模型特征选择结束!!!')
    print(u'有效特征为:%s' % ','.join(df.columns[keep_mask]))
    X = df[df.columns[keep_mask]].as_matrix()  # keep the selected feature values

    model = LR()  # logistic regression model
    model.fit(X, y)  # train on the selected features
    print(u'逻辑回归训练模型结束!!!')
    print(u'模型的平均正确率:%s' % model.score(X, y))  # mean accuracy (81.4% in the book)
# Exemple #12
# 0
 def pick_variables(self,
                    descover=True,
                    method="rlr",
                    threshold=0.25,
                    auto_pick=True):  # default selection threshold 0.25
     """Variable-picking helper (feature selection).

     Only method="rlr" is implemented: stability selection via
     RandomizedLogisticRegression. Reduces self.X_train / self.X_test to the
     selected columns and, when auto_pick is true, stores
     self.picked_data = self.data[selected columns + "y"].

     Returns a DataFrame of per-feature stability scores.
     NOTE(review): `descover` is unused in this body — confirm with callers.
     """
     # variable-picking helper (feature selection)
     if method == "rlr":
         """
         Top-level feature-selection algorithm: stability selection.
         Stability selection combines subsampling with a selection algorithm
         (here randomized logistic regression): the selector is run on many
         random subsets of the data and features, and the results are
         aggregated, e.g. as the fraction of runs in which each feature was
         chosen. Ideally important features score near 100%, weaker ones get
         small non-zero scores, and useless ones score near 0.
         RandomizedLogisticRegression API:
         fit(X, y)	Fit the model using X, y as training data.
         fit_transform(X[, y])	Fit to data, then transform it.
         get_params([deep])	Get parameters for this estimator.
         get_support([indices])	Get a mask, or integer index, of the features selected
         inverse_transform(X)	Reverse the transformation operation
         set_params(**params)	Set the parameters of this estimator.
         transform(X)	Reduce X to the selected features.
         """
         rlr = RandomizedLogisticRegression(
             selection_threshold=threshold)  # randomized logistic regression
         rlr.fit(self.X_train, self.y_train)
         scoretable = pd.DataFrame(rlr.all_scores_,
                                   index=self.X_train.columns,
                                   columns=['var_score'])  # aggregated per-feature stability scores
         columns_need = list(self.X_train.columns[rlr.get_support(
         )])  #	Get a mask, or integer index, of the features selected
         self.X_train = self.X_train[columns_need]
         self.X_test = self.X_test[columns_need]
         columns_need.append("y")  # include the label column for picked_data
         if auto_pick:
             self.picked_data = self.data[columns_need]
         return scoretable
# Exemple #13
# 0
    def data_proc(self):
        """Load data, stability-select features, fit a weighted LR, predict one sample.

        Fixes a NameError in the original: ``data.columns`` referenced an
        undefined global; the frame lives on ``self.data``.
        """
        self.load_data()
        # iloc is purely positional indexing: [row slice, column slice]
        x = self.data.iloc[:, :8].as_matrix()
        y = self.data.iloc[:, 8].as_matrix()
        # screen features with a randomized (stability-selection) logistic regression
        rlr = RLR()
        rlr.fit(x, y)  # train the selector
        support = rlr.get_support()  # keep/drop mask; per-feature scores via .scores_

        print("有效特征为%s" % ','.join(self.data.columns[support]))
        x = self.data[self.data.columns[support]].as_matrix()  # BUG FIX: was `data.columns` (undefined name)
        lr = LR(class_weight={
            0: 0.9,
            1: 0.1
        })  # class weights for asymmetric misclassification cost; class_weight='balanced' also possible
        # sample_weight assigns one importance value per training row
        # NOTE(review): the 5 hard-coded weights assume exactly 5 rows — confirm
        lr.fit(x, y, sample_weight=[1, 2, 3, 5, 4])
        result = lr.predict([[24, 2, 2, 0, 28, 17.3, 1.79, 3.06]])
        print('模型的正确率是:%s,预测结果是 %d' % (lr.score(x, y), result))
# Exemple #14
# 0
# Script: feature-select the luqu sheet with randomized LR, then fit and score LR.
import pandas as pda
fname = "C:/Users/Administrator/Desktop/data/luqu.xls"
dataf = pda.read_excel(fname)
#DataFrame.as_matrix: Convert the frame to its Numpy-array representation
#DataFrame.iloc: Purely integer-location based indexing for selection by position
x = dataf.iloc[:, 1:4].as_matrix()
y = dataf.iloc[:, 0:1].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
r1 = RLR()
r1.fit(x, y)
eff = r1.get_support()  # find the effective features, remove noneffective ones
#print(dataf.columns[eff])
t = dataf[dataf.columns[r1.get_support()]].as_matrix()
r2 = LR()
r2.fit(t, y)
print("training ends")
# BUG FIX: r2 was trained on the reduced matrix t, so it must be scored on t;
# scoring on the full matrix x fails whenever any feature was dropped
print("accuracy: " + str(r2.score(t, y)))  # score(): mean accuracy on the given data and labels
#-*- coding: utf-8 -*-
# Logistic regression with automatic feature selection.
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
# parameter initialisation
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:,:8].as_matrix()# eight predictor columns
y = data.iloc[:,8].as_matrix()# ninth column holds the outcome label

# stability selection: screen features with randomized logistic regression
rlr = RLR(selection_threshold=0.5) # stricter than the library default of 0.25
rlr.fit(x, y) # train the selector
rlr.get_support() # selection mask; per-feature scores via .scores_
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))

x = data[data.columns[rlr.get_support()]].as_matrix() # keep selected columns, retrain below
lr = LR() # plain logistic regression on the reduced matrix
lr.fit(x, y) # train on the selected feature data
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))
# Exemple #16
# 0
#-*- coding:utf-8 -*-
# Peishichao
# Bank-loan demo: screen features with randomized logistic regression, then fit LR.
import pandas as pd

filename = '../data/bankloan.xls'
data = pd.read_excel(filename)

x = data.iloc[:, :8].as_matrix()  # eight predictor columns

y = data.iloc[:, 8].as_matrix()  # ninth column is the label

from sklearn.linear_model import LogisticRegression as LR

from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()

rlr.fit(x, y)

rlr.get_support()  # selection mask; the call itself has no side effects
print(rlr.get_support())
print('end')

#print('Feature: %s ' % ','.join(data.columns[rlr.get_support()]))

x = data[data.columns[rlr.get_support()]].as_matrix()  # keep only selected columns
print(x)
lr = LR()
lr.fit(x, y)
print('end')
print('accur: %s' % lr.score(x, y))
# Exemple #17
# 0
def runTest(featmat_train, outcome_train_lbl, featmat_test, outcome_test_lbl,
            sel, paramsDict, bestmodelnum):
    """Leave-one-person-out test: per-feature-group selection, final model, CSV log.

    For each feature group (suffix in the module-level ``suffix_list``),
    scales, variance-thresholds and stability-selects columns, concatenates
    the survivors, runs a second RandomizedLogisticRegression pass on the
    concatenation, fits the classifier named by the module-level
    ``modelname``, and writes a one-row CSV for the held-out person.
    Returns (Y_pred, Y_pred_proba).

    NOTE(review): relies on module globals (TEST_PERSON_NUM,
    TEST_PERSON_DEVICE_ID, suffix_list, fgColIdxs, fgIdxs, modelname,
    folderpath, STARTTIME) — confirm they exist in the surrounding file.
    NOTE(review): ``dict.iteritems()`` is Python 2 only, and
    ``raise ("...")`` raises TypeError at runtime (a str is not an exception).
    """
    print("Running Test for #{0} ({1})".format(TEST_PERSON_NUM,
                                               TEST_PERSON_DEVICE_ID))
    X_train_allfg = featmat_train.values
    Y_train = outcome_train_lbl.values
    #     Y_train = Y_train.reshape(Y_train.size, 1)# does this help?
    featnames_allfg = featmat_train.columns
    X_test_allfg = featmat_test.values
    Y_test = outcome_test_lbl.values
    Y_true = Y_test[0]  # single held-out person: first test label is the truth
    sel_featnames_per_fg = {}  # feature-group suffix -> chosen feature names
    sel_featnames_list_ordered = []  # chosen names in concatenation order
    sel_X_train = []
    sel_X_test = []
    countNumSel = 0
    fgi = 0
    for s in suffix_list:
        fgi = fgi + 1
        #    print fgi,
        suffix_list_str = ",".join(s)
        fgidxs = fgColIdxs[suffix_list_str]  # column indices for this feature group
        X_train = X_train_allfg[:, fgidxs]
        X_test = X_test_allfg[:, fgidxs]
        featnames_fg = featnames_allfg[fgidxs]
        # continue if empty
        if X_train.shape[1] == 0:
            continue
        ## scaling (fit on train only, then apply to both)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # variance thresholding
        vartransform = VarianceThreshold()
        X_train = vartransform.fit_transform(X_train)
        X_test = vartransform.transform(X_test)
        varthres_support = vartransform.get_support()
        featnames_fg = featnames_fg[varthres_support]
        ## feature selection
        if sel == "rlog":
            #print (X_train.shape)
            randomized_rlog = RandomizedLogisticRegression(**paramsDict)
            X_train = randomized_rlog.fit_transform(X_train, Y_train)
            X_test = randomized_rlog.transform(X_test)
            chosen_col_idxs = randomized_rlog.get_support()
            #print (len(featnames_fg))
            #print (len(chosen_col_idxs))

            if len(chosen_col_idxs) > 0:
                featnames_fg_chosen = list(featnames_fg[chosen_col_idxs])
                sel_featnames_per_fg[suffix_list_str] = featnames_fg_chosen
                sel_featnames_list_ordered = sel_featnames_list_ordered + featnames_fg_chosen
                sel_X_train.append(X_train)
                sel_X_test.append(X_test)
                countNumSel = countNumSel + len(featnames_fg_chosen)
        else:
            raise ("Unrecognized sel (feature selection algorithm)")
    ## feature selection:  sel{sel{fg1}.....sel{fg45}}
    X_train_concat = np.hstack(sel_X_train)
    X_test_concat = np.hstack(sel_X_test)
    print("\nSum of number of features selected from all fgs = {0}".format(
        countNumSel))
    print("Concatenated X_train has {0} features".format(
        X_train_concat.shape[1]))
    print("Concatenated X_test has {0} features".format(
        X_test_concat.shape[1]))
    if sel == "rlog":
        # second selection pass over the concatenated survivors
        randomized_rlog = RandomizedLogisticRegression(**paramsDict)
        X_train_concat = randomized_rlog.fit_transform(X_train_concat, Y_train)
        X_test_concat = randomized_rlog.transform(X_test_concat)
        chosen_col_idxs = randomized_rlog.get_support()
        sel_featnames_list_ordered = np.array(sel_featnames_list_ordered)
        chosen_col_idxs = np.array(chosen_col_idxs)
        chosen_cols_final = sel_featnames_list_ordered[chosen_col_idxs]
    else:
        raise ("Unrecognized sel (feature selection algorithm)")
    print("Final number of features in model = {0}".format(
        X_train_concat.shape[1]))
    # GBCT
    if modelname == "GBC":
        clf = GradientBoostingClassifier(random_state=0)
    elif modelname == "LOGR":
        clf = LogisticRegression(random_state=0,
                                 C=paramsDict["C"],
                                 tol=1e-3,
                                 penalty="l1",
                                 n_jobs=paramsDict["n_jobs"],
                                 intercept_scaling=1,
                                 class_weight="balanced")
    else:
        raise ("Unrecognized model name")
    clf.fit(X_train_concat, Y_train)
    pred = clf.predict(X_test_concat)
    pred_proba = clf.predict_proba(X_test_concat)
    Y_pred = pred[0]
    Y_pred_proba = pred_proba[0][1]  # probability of the second class
    ## Logging test_person_test.csv - outputs 1 line only
    ## did, sel, selParams, Y_pred, Y_pred_proba, Y_true, chosen_cols_final, suffix_list_str : sel_featnames_per_fg[suffix_list_str] in separate columns
    chosen_cols_final_str = ",".join(chosen_cols_final)
    paramsDict_str = ','.join("%s:%r" % (key, val)
                              for (key, val) in paramsDict.iteritems())
    fgIdxs_str = ','.join("%s:%r" % (key, val)
                          for (key, val) in fgIdxs.iteritems())
    cnts_per_lbl_dict = getValueCounts(outcome_train_lbl, outcome_test_lbl)
    cnts_per_lbl_str = ','.join("%s:%r" % (key, val)
                                for (key,
                                     val) in cnts_per_lbl_dict.iteritems())
    dfout = pd.DataFrame({
        "did": [TEST_PERSON_DEVICE_ID],
        "cnts_per_lbl": [cnts_per_lbl_str],
        "sel": [sel],
        "selParams": [paramsDict_str],
        "Y_pred": [Y_pred],
        "Y_pred_proba": [Y_pred_proba],
        "Y_true": [Y_true],
        "fgIdxs": [fgIdxs_str],
        "sel_final": [chosen_cols_final_str]
    })
    dfout = dfout.set_index("did")
    cols = [
        "cnts_per_lbl", "sel", "selParams", "Y_pred", "Y_pred_proba", "Y_true",
        "fgIdxs", "sel_final"
    ]
    # one extra output column per feature group, listing the names kept from it
    for s in suffix_list:
        suffix_list_str = ",".join(s)
        if suffix_list_str in sel_featnames_per_fg:
            sel_feats_fg_str = ",".join(sel_featnames_per_fg[suffix_list_str])
        else:
            sel_feats_fg_str = ""
        dfcol = pd.DataFrame({
            "did": [TEST_PERSON_DEVICE_ID],
            "sel_{0}".format(suffix_list_str): [sel_feats_fg_str]
        })
        dfcol = dfcol.set_index("did")
        dfout = pd.concat([dfout, dfcol], axis=1)
        cols.append("sel_{0}".format(suffix_list_str))
    dfout.to_csv(
        folderpath +
        "{0}_test_model{1}.csv".format(TEST_PERSON_DEVICE_ID, bestmodelnum),
        columns=cols,
        header=True)
    print("{0} minutes elapsed since start of program ".format(
        (time.time() - STARTTIME) / 60.0))
    return (Y_pred, Y_pred_proba)
#-*- coding: utf-8 -*-
# Logistic regression, automatic modelling.
import pandas as pd

# parameter initialisation
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:, :8].as_matrix()
y = data.iloc[:, 8].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()  # randomized logistic regression model for variable screening
rlr.fit(x, y)  # train the selector
rlr.get_support()  # selection mask; per-feature scores via .scores_
print(u'通过随机逻辑回归模型筛选特征结束。')
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].as_matrix()  # keep the selected features

lr = LR()  # logistic regression model
lr.fit(x, y)  # train on the selected feature data
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))  # mean accuracy (81.4% in the book's example)
Index(['年龄', '教育', '工龄', '地址', '收入', '负债率', '信用卡负债', '其他负债', '违约'], dtype='object')

'''
features = b_data.iloc[:,:8]
#print(type(features)) #<class 'pandas.core.frame.DataFrame'>
features = features.as_matrix() #从pandas数据框转到numpy的ndarray
#print(type(features)) #<class 'numpy.ndarray'>
labels = b_data.iloc[:,8].as_matrix()

randomized_logistic = RandomizedLogisticRegression() #随机logistic回归模型,用于筛选变量
randomized_logistic.fit(features,labels) #训练随机logistic回归模型
print(randomized_logistic.scores_) #获取各个特征的分数
'''
[ 0.105  0.085  1.     0.425  0.     1.     0.545  0.03 ]
'''
print(randomized_logistic.get_support()) #随机logistic回归模型的筛选结果
'''
[False False  True  True False  True  True False]
'''
#随机logistic回归模型属于稳定性选择中的一种
print('(稳定性选择)有效特征:%s'%','.join(b_data.columns[:-1][randomized_logistic.get_support()]))
'''
(稳定性选择)有效特征:工龄,地址,负债率,信用卡负债
'''
feat_1 = b_data[b_data.columns[:-1][randomized_logistic.get_support()]].as_matrix()

estimator = SVR(kernel="linear")
RFE_selector = RFE(estimator=estimator, n_features_to_select=None, step=1)
RFE_selector.fit(features,labels)
print(RFE_selector.support_)
'''
# -*- coding:utf-8 -*-
# Bank-loan demo: randomized-LR feature screening followed by a plain LR fit.
import pandas as pd
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
x = data.iloc[:,:8].as_matrix()
y = data.iloc[:,8].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR 
rlr = RLR() # randomized logistic regression model for variable screening
rlr.fit(x, y) # train the selector
rlr.get_support() # feature-selection mask
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].as_matrix() # keep selected columns (original comment was mojibake)

lr = LR() # logistic regression model
lr.fit(x, y) # train the model
print(u'模型的平均正确率:%s' % lr.score(x, y))
# Exemple #21
# 0
    X = Fwe.transform(X)
    featureNames = featureNames[Fwe.get_support()]
    print("F-test filter ->", X.shape)

    FeatSelection_SVM = True
    FeatSelection_RandLogReg = False

    if FeatSelection_RandLogReg == True:
        LogRegFeats = RandomizedLogisticRegression(C=5,
                                                   scaling=0.5,
                                                   sample_fraction=0.8,
                                                   n_resampling=60,
                                                   selection_threshold=0.2,
                                                   n_jobs=-1)
        X = LogRegFeats.fit_transform(X, y)
        featureNames = featureNames[LogRegFeats.get_support()]
        print("RandomizedLogisticRegression Feature Selection ->:", X.shape)

    elif FeatSelection_SVM == True:
        X = LinearSVC(C=1, penalty="l1", dual=False,
                      class_weight='auto').fit_transform(X, y)
        # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
        featureNames = featureNames[LogRegFeats.get_support()]
        print("SVC Transformed X:", X.shape)
    '''
    print("Plot #Feats vs Classification performance:")
    PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
    '''

    KFilt = None
    # KFilt=200
# Exemple #22
# 0
# -*- coding: utf-8 -*-
"""
Created on Wed Jan 24 22:27:16 2018

@author: fan
第一题:
data1 是40名癌症病人的一些生存资料,其中,X1表示生活行动能力评分(1~100),X2表示病人的年龄,X3表示由诊断到直入研究时间(月);X4表示肿瘤类型,X5把ISO两种疗法(“1”是常规,“0”是试验新疗法);Y表示病人生存时间(“0”表示生存时间小于200天,“1”表示生存时间大于或等于200天)
试建立Y关于X1~X5的logistic回归模型
"""

from numpy import *
import pandas as pd
data = pd.read_table('data1.txt', encoding='gbk')
x = data.iloc[:, 1:6].as_matrix()
y = data.iloc[:, 6].as_matrix()
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()
rlr.fit(x, y)
rlr.get_support()
fit_x = data[data.columns[rlr.get_support()]].as_matrix()  #筛选好特征
lr = LR()
lr.fit(fit_x, y)
lr.score(fit_x, y)
#正确率水平为 75%
'''
第二题:
data2 是关于重伤病人的一些基本资料。自变量X是病人的住院天数,因变量Y是病人出院后长期恢复
的预后指数,指数数值越大表示预后结局越好。
'''
    for i in range(100):
        print("Working on: %s (%d of 100)" % (st, (i + 1)))
        rlr = RandomizedLogisticRegression(
            n_resampling=5000, C=lr_mean.C, selection_threshold=st, n_jobs=2)
        rlr.fit(X, y)
        X_rlr = rlr.transform(X)

        if X_rlr.size:
            cv_scores_rlr = cross_val_score(
                lr_mean, X_rlr, y, scoring="roc_auc", cv=StratifiedKFold(9))

            rlr_tmp = {
                "st": st,
                "cv_score": cv_scores_rlr.mean(),
                "cv_std": cv_scores_rlr.std(),
                "n_features": sum(rlr.get_support())
            }
            rlr_grid_search = rlr_grid_search.append(
                rlr_tmp, ignore_index=True)

rlr_grid_search_mean = rlr_grid_search.groupby(by="st").mean()
rlr_grid_search_mean["n_feat_std"] =\
    rlr_grid_search.groupby(by="st").std()["n_features"]
rlr_grid_search_mean["cv_score_std"] = rlr_grid_search.groupby(
    by="st").std()["cv_score"]

rlr_grid_search_mean.to_csv("ispc_grid_search_mean.csv", index=False)

rlr = RandomizedLogisticRegression(
    n_resampling=5000, C=lr_mean.C, selection_threshold=0.75)
rlr.fit(X, y)
# Exemple #24
# 0
#!/usr/bin/env python
# _*_ UTF-8 _*_
# Admission (luqu) demo: randomized-LR feature screening, then LR fit and score.

import pandas as pda

fname = "F:/python_workspace/file/logic/luqu.csv"
dataf = pda.read_csv(fname)

# iloc indexing: [rows, columns]
x = dataf.iloc[:, 1:4].as_matrix()
y = dataf.iloc[:, 0:1].as_matrix()

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# build a randomized logistic regression model (for feature screening)
r1 = RLR()
# train it
r1.fit(x, y)
# screen the features, keeping the effective ones
r1.get_support()
# print(dataf.columns[r1.get_support()])
# turn the usable feature columns into an array used to predict y
t = dataf[dataf.columns[r1.get_support()]].as_matrix()

r2 = LR()
# fit the x-y relationship on the selected features
r2.fit(t, y)
print("训练结束")
print("模型正确率为:" + str(r2.score(x, y)))
# Exemple #25
# 0
def GetAllPerf (filePaths=None):
    """Benchmark classifiers on each training-set CSV and write OutputData.tsv.

    For every path (default: every trainingSetFeatures.csv under ./test_seq),
    loads features/labels, filters features (SelectKBest + SelectFwe, with
    optional randomized-LR / SVM / RFE branches), scores dummy baselines,
    grid-searches a best model for accuracy and for F1 separately, and
    collects everything into a DataFrame saved as "OutputData.tsv".

    NOTE(review): depends on helpers defined elsewhere in the project
    (find_files, fileNameFromPaths, load_data, Get_yPred,
    ModelParam_GridSearch, PlotPerfPercentFeatures) — confirm availability.
    NOTE(review): `from sklearn.metrics.score import make_scorer` below looks
    like a wrong module path (should be sklearn.metrics) — confirm.
    """
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)


    # one row per input file; columns filled in as each dataset is processed
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        # 'TopRFE-Features','Best (f1) Model parameters',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])


    #redDict holds results for each file/class, for saving to output-file

    i=-1
    for filePath in filePaths:
        i +=1

        'http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/'
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1

        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"

        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        KFilt=None
        KFilt=350  #This is just temporary for the outputs - saves computation time. Barely filters compared to the model itself.

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        # family-wise-error filter on top of the K-best filter
        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)


        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM == True:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        'EG - graph best features; feature selection using RF, ensemble classifiers..'
        'http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb'

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            'RFE + - best feats'
            'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV==True:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                             # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                             #,scoring='f1',verbose=0) # " scoring='roc_auc','recall','f1',accuracy..."
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV==True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.fit_transform(X, y)
            print("X_RFE",X_RFE.shape)

            resDict['TopRFE-Features'][fileName]=str(rfe_featnames)

            'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        'Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb '
        'Confusion matrixes + Dummies - http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/'
        'http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators'

        "http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html"
        print()

        "Make custom F1 scorer. May not have fixed problem!"
        from sklearn.metrics.score import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro") #Maybe another metric? May NOT be fixed!?. #weighted, micro, macro, none

        # print("Dummy classifiers output:")

        # baseline: always predict the most frequent class
        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))

        dummy_freq_f1_weighted = '{:.3}'.format(f1_scorer(y, y_dummyPred))
        #Get from ALL classes f1..
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
##        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1 dummy_freq_f1_mean
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean

        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf==True:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        "Modified to get 2 estimators"
        # cross-validate each best estimator on stratified random splits
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)

        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')
# Exemple #26
# 0
#             vnum = len(allvector)
#             allvector = npy.array(allvector).T
#             for index in range(0,len(TestData)):
#                 vector = list(allvector[index])
#                 p = p*vector.count(TestData[index])/vnum
#             lbDict[thislb] = p*alllabel
#         thislabel = sorted(lbDict,key = lambda x:lbDict[x],reversed=True)[0]
#         return thislabel
#
# # by1 = Bayes()
# # by1.fit()

# Feature screening with randomized logistic regression, then a plain
# logistic regression fitted on the selected columns of a CSV dataset.
import pandas as pda
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

fname = ''  # path to the input CSV; must be filled in before running
dataf = pda.read_csv(fname)
# BUG FIX: .as_matrix was referenced but never *called*, so x and y were bound
# to method objects instead of arrays.  Use .values (as_matrix was removed in
# pandas 0.25).
x = dataf.iloc[:, 1:4].values  # predictor columns
y = dataf.iloc[:, 0:1].values  # target column

# Randomized logistic regression: stability selection of features.
r1 = RLR()
r1.fit(x, y)
r1.get_support()  # boolean mask over the 3 fitted feature columns
# print(dataf.columns[1:4][r1.get_support()])
# BUG FIX: the support mask has length 3 (the fitted features), so it must be
# applied to columns 1:4, not to the full column index.
t = dataf[dataf.columns[1:4][r1.get_support()]].values  # selected feature data

# Plain logistic regression on the selected features.
r2 = LR()
r2.fit(t, y)
print('训练结束')
# BUG FIX: the model was trained on t (selected features) but scored on x
# (all features), which raises a feature-count mismatch; score on t.
print('模型正确率为' + str(r2.score(t, y)))

import matplotlib
import pandas as pd  # FIX: pd is used below but was never imported in this snippet
import numpy as np   # FIX: np.round is used below but numpy was never imported

filename = r'D:\DataAnalysis\Python_practice\chapter5\demo\data\bankloan.xls'
data = pd.read_excel(filename)
# Select predictors / target.  The sklearn estimators accept DataFrames
# directly, so the .as_matrix() conversion used in the book is unnecessary.
x = data.iloc[:, :8]  # first 8 columns: predictors
y = data.iloc[:, 8]   # 9th column: target (loc selects by label, iloc by position)

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()
# Randomized (stability-selection) logistic regression to screen variables.
# The cut-off can be tuned via selection_threshold (default 0.25: features
# scoring below it are dropped).

rlr.fit(x, y)  # train the selection model

rlr.get_support()  # boolean mask of kept features (per-feature scores via .scores_)

filter_columns = data.columns[0:8][rlr.get_support()]  # names of the kept feature columns
print(u'---------start-----------')
print(u'有效特征为: %s' % ','.join(filter_columns))

x = data[filter_columns]  # keep only the selected feature columns
lr = LR()  # plain logistic regression
lr.fit(x, y)  # train on the selected features

predictions = lr.predict(x)  # in-sample predictions for every row
data['预测值'] = [int(np.round(x)) for x in predictions]

print(u'---------end-----------')
print(u'模型的平均正确率为%s' % lr.score(x, y))
Exemple #28
0
def hyperparameterSearch(training_set_path, cat, rl, bu):
    """Randomized SVM hyper-parameter search on one descriptor set.

    Loads the training descriptors matching ``*_<rl>_<bu>_train_descriptors_N20.txt``
    under ``training_set_path``, min-max scales them, reduces the feature set
    with RandomizedLogisticRegression, then runs a RandomizedSearchCV
    (100 iterations, Leave-One-Group-Out CV on ``labels``, scoring='f1') for
    each of the RBF, linear, polynomial and sigmoid SVM kernels, and finally
    writes a report (best parameters plus all CV scores per kernel) to
    ``best_parameters_test_<cat>_<rl>_<bu>.txt``.

    NOTE(review): relies on names defined elsewhere in this module
    (import_descriptors, MinMaxScaler, RandomizedLogisticRegression,
    RandomizedSearchCV, SVC, LinearSVC, LeaveOneGroupOut, expon, randint,
    uniform).  `cat`, `rl` and `bu` appear to be string tags used only for
    file naming and report keys — confirm against callers.
    """
    print("Importing descriptors from the training set.")
    X, y, labels = import_descriptors(
        training_set_path, "*_%s_%s_train_descriptors_N20.txt" % (rl, bu))
    print("Number of features: %d." % X.shape[-1])

    print("Scaling data.")
    # X is sparse (hence .todense() below); scale every feature into [0, 1].
    min_max_scaler = MinMaxScaler()
    X_scale = min_max_scaler.fit_transform(X.todense())

    print("Performing feature selection with randomized logistic regression.")
    # set n_jobs=-1 to parallelize the Randomized Logistic Regression
    # however, there is a bug in the current version of skitlearn (0.18.1) which results in the following message:
    # ValueError: assignment destination is read-only, when parallelizing with n_jobs > 1
    feature_selector = RandomizedLogisticRegression(n_jobs=1)
    X_scale = feature_selector.fit_transform(X_scale, y)
    print("Reduced number of features: %d." % X_scale.shape[-1])

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the RBF kernel."
    )
    # C ~ Exp(scale=2000), gamma ~ Exp(scale=.01): wide, positive-only priors.
    param_dist_rbf = {
        'kernel': ['rbf'],
        'C': expon(scale=2000),
        'gamma': expon(scale=.01)
    }
    random_sv_rbf = RandomizedSearchCV(SVC(),
                                       param_distributions=param_dist_rbf,
                                       n_iter=100,
                                       scoring='f1',
                                       cv=LeaveOneGroupOut(),
                                       n_jobs=-1,
                                       error_score=0,
                                       iid=False,
                                       refit=False)
    random_sv_rbf.fit(X_scale, y, groups=labels)

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the linear kernel."
    )
    param_dist_linear = {'C': expon(scale=2000)}
    random_sv_linear = RandomizedSearchCV(
        LinearSVC(),
        param_distributions=param_dist_linear,
        n_iter=100,
        scoring='f1',
        cv=LeaveOneGroupOut(),
        n_jobs=-1,
        error_score=0,
        iid=False,
        refit=False)
    random_sv_linear.fit(X_scale, y, groups=labels)

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the polynomial kernel."
    )
    # degree is drawn uniformly from 2..10; coef0 uniformly from [-2, 2].
    param_dist_poly = {
        'kernel': ['poly'],
        'C': expon(scale=2000),
        'degree': randint(2, 11),
        'coef0': uniform(loc=-2, scale=4),
        'gamma': expon(scale=.01)
    }
    random_sv_poly = RandomizedSearchCV(SVC(),
                                        param_distributions=param_dist_poly,
                                        n_iter=100,
                                        scoring='f1',
                                        cv=LeaveOneGroupOut(),
                                        n_jobs=-1,
                                        error_score=0,
                                        iid=False,
                                        refit=False)
    random_sv_poly.fit(X_scale, y, groups=labels)

    print(
        "Running randomized hyper-parameter search with Leave-One-Out validation for the sigmoid kernel."
    )
    param_dist_sigmoid = {
        'kernel': ['sigmoid'],
        'C': expon(scale=2000),
        'coef0': uniform(loc=-2, scale=4),
        'gamma': expon(scale=.01)
    }
    random_sv_sigmoid = RandomizedSearchCV(
        SVC(),
        param_distributions=param_dist_sigmoid,
        n_iter=100,
        scoring='f1',
        cv=LeaveOneGroupOut(),
        n_jobs=-1,
        error_score=0,
        iid=False,
        refit=False)
    random_sv_sigmoid.fit(X_scale, y, groups=labels)

    # Report: every result is both printed to stdout and appended to the file,
    # formatted as Python assignments so it can be pasted into config code.
    with open(
            "%sbest_parameters_test_%s_%s_%s.txt" %
        (training_set_path, cat, rl, bu), "w") as best_params:

        # One-based indices of the features kept by the selector.
        extracted_features = [
            "%d" % (x + 1) for x in feature_selector.get_support(indices=True)
        ]

        # --- RBF kernel report ---
        print(
            "Best parameters found on training set with the RBF kernel:\n%s %s"
            % (random_sv_rbf.best_params_, random_sv_rbf.best_score_))
        best_params.write(
            "Best parameters found on training set with the RBF kernel:\n%s %s\n"
            % (random_sv_rbf.best_params_, random_sv_rbf.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, random_sv_rbf.best_params_["kernel"]))
        best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
                          (cat, rl, bu, random_sv_rbf.best_params_["kernel"]))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_rbf.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_rbf.best_params_["C"]))
        print("gamma[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_rbf.best_params_["gamma"]))
        best_params.write("gamma[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_rbf.best_params_["gamma"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_rbf.cv_results_['mean_test_score']
        stds = random_sv_rbf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_rbf.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))

        # --- linear kernel report (no gamma; kernel name is hard-coded) ---
        print(
            "Best parameters found on training set with the linear kernel:\n%s %s"
            % (random_sv_linear.best_params_, random_sv_linear.best_score_))
        best_params.write(
            "Best parameters found on training set with the linear kernel:\n%s %s\n"
            % (random_sv_linear.best_params_, random_sv_linear.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, 'linear'))
        best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
                          (cat, rl, bu, 'linear'))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_linear.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_linear.best_params_["C"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_linear.cv_results_['mean_test_score']
        stds = random_sv_linear.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_linear.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))

        # --- polynomial kernel report (adds degree and coef0) ---
        print(
            "Best parameters found on training set with the polynomial kernel:\n%s %s"
            % (random_sv_poly.best_params_, random_sv_poly.best_score_))
        best_params.write(
            "Best parameters found on training set with the polynomial kernel:\n%s %s\n"
            % (random_sv_poly.best_params_, random_sv_poly.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, random_sv_poly.best_params_["kernel"]))
        best_params.write("\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["kernel"]))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_poly.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["C"]))
        print("gamma[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_poly.best_params_["gamma"]))
        best_params.write("gamma[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["gamma"]))
        print("degree[(\"%s\", \"%s\", \"%s\")] = %d" %
              (cat, rl, bu, random_sv_poly.best_params_["degree"]))
        best_params.write("degree[(\"%s\", \"%s\", \"%s\")] = %d\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["degree"]))
        print("coef0[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_poly.best_params_["coef0"]))
        best_params.write("coef0[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_poly.best_params_["coef0"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_poly.cv_results_['mean_test_score']
        stds = random_sv_poly.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_poly.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))

        # --- sigmoid kernel report (adds coef0) ---
        print(
            "Best parameters found on training set with the sigmoid kernel:\n%s %s"
            % (random_sv_sigmoid.best_params_, random_sv_sigmoid.best_score_))
        best_params.write(
            "Best parameters found on training set with the sigmoid kernel:\n%s %s\n"
            % (random_sv_sigmoid.best_params_, random_sv_sigmoid.best_score_))
        print("kernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["kernel"]))
        best_params.write(
            "\nkernel[(\"%s\", \"%s\", \"%s\")] = \"%s\"\n" %
            (cat, rl, bu, random_sv_sigmoid.best_params_["kernel"]))
        print("C[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["C"]))
        best_params.write("C[(\"%s\", \"%s\", \"%s\")] = %f\n" %
                          (cat, rl, bu, random_sv_sigmoid.best_params_["C"]))
        print("gamma[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["gamma"]))
        best_params.write(
            "gamma[(\"%s\", \"%s\", \"%s\")] = %f\n" %
            (cat, rl, bu, random_sv_sigmoid.best_params_["gamma"]))
        print("coef0[(\"%s\", \"%s\", \"%s\")] = %f" %
              (cat, rl, bu, random_sv_sigmoid.best_params_["coef0"]))
        best_params.write(
            "coef0[(\"%s\", \"%s\", \"%s\")] = %f\n" %
            (cat, rl, bu, random_sv_sigmoid.best_params_["coef0"]))
        print("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
              (cat, rl, bu, ", ".join(extracted_features)))
        best_params.write("features[(\"%s\", \"%s\", \"%s\")] = [%s]\n" %
                          (cat, rl, bu, ", ".join(extracted_features)))
        print("Random LOOCV scores on development set:")
        best_params.write("Random LOOCV scores on development set:\n")
        means = random_sv_sigmoid.cv_results_['mean_test_score']
        stds = random_sv_sigmoid.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds,
                                     random_sv_sigmoid.cv_results_['params']):
            print("%0.5f (stdev %0.5f) for %r" % (mean, std, params))
            best_params.write("%0.5f (stdev %0.5f) for %r\n" %
                              (mean, std, params))
# Listing 5-1: logistic regression (automated modeling)

import pandas as pd
# Parameter initialisation.
fileName = 'data/bankloan.xls'
data = pd.read_excel(fileName)
# FIX: DataFrame.as_matrix() was removed in pandas 0.25; use .values.
x = data.iloc[:, :8].values  # first 8 columns: predictors
y = data.iloc[:, 8].values   # 9th column: target

# Logistic regression model
from sklearn.linear_model import LogisticRegression as LR
# Randomized logistic regression (stability selection)
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Build the randomized model to screen variables.
rlr = RLR()
# Train the selection model.
rlr.fit(x, y)
# Boolean mask of the selected features (per-feature scores via .scores_).
rlr.get_support()
print(u'通过随机逻辑回归模型筛选特征结束。')
# FIX: get_support() returns a mask of length 8 (the fitted features), but
# data.columns has 9 entries; restrict to the feature columns before masking.
print(u'有效特征为 %s' % '.'.join(data.columns[:8][rlr.get_support()]))
# Keep only the selected features.
x = data[data.columns[:8][rlr.get_support()]].values

# Plain logistic regression model.
lr = LR()
# Train on the selected feature data.
lr.fit(x, y)
print(u'逻辑回归模型训练结束。')
# Mean accuracy on the training data (about 81.48% in the book's example).
print(u'模型的平均正确率为 %s' % lr.score(x, y))
Exemple #30
0
    # 'Normalize/Scale features if needed. Our data is standardized by default'
    # X = StandardScaler(copy=False).fit_transform(X)

    # Univariate F-test filter with family-wise error control (alpha=0.01).
    Fwe = SelectFwe(alpha=0.01).fit(X,y)
    X=Fwe.transform(X)
    featureNames=featureNames[Fwe.get_support()]
    print("F-test filter ->",X.shape)

    # Manual switches choosing the follow-up feature-selection strategy.
    FeatSelection_SVM=True
    FeatSelection_RandLogReg=False

    if FeatSelection_RandLogReg == True:
        # Stability selection: resample data/feature subsets and keep features
        # that repeatedly receive non-zero weights.
        LogRegFeats = RandomizedLogisticRegression(C=5, scaling=0.5,
         sample_fraction=0.8, n_resampling=60, selection_threshold=0.2,n_jobs=-1)
        X = LogRegFeats.fit_transform(X,y)
        featureNames=featureNames[LogRegFeats.get_support()]
        print("RandomizedLogisticRegression Feature Selection ->:",X.shape)

    elif FeatSelection_SVM == True:
        # L1-penalised linear SVM used as an embedded feature selector.
        X= LinearSVC(C=1, penalty="l1", dual=False,class_weight='auto').fit_transform(X, y)
        # X= LogisticRegression(C=0.01,class_weight='auto').fit_transform(X, y)
        # NOTE(review): LogRegFeats is only defined in the branch above, so this
        # line raises NameError whenever the SVM branch runs; it presumably
        # should use the fitted LinearSVC's support mask instead — confirm.
        featureNames=featureNames[LogRegFeats.get_support()]
        print ("SVC Transformed X:",X.shape)

    '''
    print("Plot #Feats vs Classification performance:")
    PlotPerfPercentFeatures(X_LR,y,est=SVC(C=100))
    '''

    # Optional top-K filter; None disables it.
    KFilt=None
    # KFilt=200
Exemple #31
0
import pandas as pd

filename = 'bankloan.xls'
data = pd.read_excel(filename)
# FIX: DataFrame.as_matrix() was removed in pandas 0.25; use .values.
x = data.iloc[:, :8].values  # first 8 columns: predictors
y = data.iloc[:, 8].values   # 9th column: target

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Randomized logistic regression for feature screening.
rlr = RLR()
# FIX: the model was fitted twice in a row; a single fit is sufficient.
rlr.fit(x, y)
rlr.get_support()  # boolean mask of the selected features
print("end search useful_data")
# FIX: the support mask has length 8, but data has 9 columns; restrict the
# index to the 8 feature columns before masking.
print(u'end search useful data: %s' % ''.join(data.columns[:8][rlr.get_support()]))

x = data[data.columns[:8][rlr.get_support()]].values  # selected feature data

lr = LR()
lr.fit(x, y)  # train on the selected features
print()
print('%s' % lr.score(x, y))  # mean training accuracy
# -*- coding:utf-8 -*-
# Logistic regression: automated modeling
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

data = pd.read_excel("c://mldata//bankloan.xls", header=0)
# x = data.iloc[:, :8].as_matrix()
# y = data.iloc[:, 8].as_matrix()   (alternative read style kept for reference)
train_data = data.values  # convert the DataFrame to a plain matrix
train_x = train_data[0::, :8]     # first 8 columns: predictors
train_label = train_data[0::, 8]  # 9th column: target

rlr = RLR()  # randomized logistic regression for feature screening
rlr.fit(train_x, train_label)  # train the selection model
rlr.get_support()  # boolean mask of the selected features
# FIX: converted Python-2 print statements to print() calls for consistency
# with the rest of the file; output text is unchanged.
print(u"特征筛选结束")
# FIX: the support mask has length 8, but data has 9 columns; restrict the
# index to the 8 feature columns before masking.
print(u"有效特征为:%s" % u'、'.join(data.columns[:8][rlr.get_support()]))

x = data[data.columns[:8][rlr.get_support()]].values  # selected feature data

lr = LR()
lr.fit(x, train_label)  # train on the selected features
print(u'逻辑回归训练结束')
print(u'模型的平均正确率为:%s' % lr.score(x, train_label))
Exemple #33
0
def pick_variables(x,y,descover=True,method="rlr",threshold=0.25,sls=0.05):# default selection threshold 0.25
    """Variable-selection helper: reduce x to the columns chosen by `method`.

    method:
      "rlr"   - randomized logistic regression (stability selection)
      "bs"    - backward stepwise elimination by p-value (statsmodels logit)
      "fs"    - forward stepwise selection maximising pseudo R-squared
      "fs_bs" - forward selection combined with backward p-value pruning
      "rfc"   - random-forest importance ranking (top 15) + Logit summary
    `sls` is the significance level for dropping variables; `threshold` is the
    stability-selection cut-off.  Returns the reduced DataFrame x.

    NOTE(review): the `descover` parameter is never used in the body, and the
    function relies on module-level imports (pd, smf, sm,
    RandomizedLogisticRegression, RandomForestClassifier) — confirm upstream.
    """
    if method == "rlr":
        # Randomized logistic regression selects variables linearly related to
        # y (stability selection 1): the selector is run on many data/feature
        # subsets and the results are aggregated.
        rlr = RandomizedLogisticRegression(selection_threshold=threshold)
        rlr.fit(x,y)
        # NOTE(review): scoretable is built but never returned or used.
        scoretable = pd.DataFrame(rlr.all_scores_,index = x.columns,columns = ['var_score'])
        columns_need = list(x.columns[rlr.get_support()])
        x = x[columns_need]
    # Backward elimination.
    if method =="bs"  and x.shape[1] > 1: 
        # Extract the X and y variable names.
        data = pd.concat([x, y], axis=1)# merge predictors and target

        var_list = x.columns
        response = y.name
        # First fit the model on all variables, then repeatedly drop the least
        # significant one until every p-value is below sls.
        while True:
            formula = "{} ~ {} + 1".format(response, ' + '.join(var_list))
            mod = smf.logit(formula, data).fit()
            print(mod.summary2())
            p_list = mod.pvalues.sort_values()
            if p_list[-1] > sls:
                # Take the last (largest-p) entry of p_list...
                var = p_list.index[-1]
                # ...and remove it from var_list.
                var_list = var_list.drop(var)           
            else:
                break
        x=x[var_list]
    # Forward selection.    
    if method =="fs":   
        data = pd.concat([x, y], axis=1)
        response=y.name
        remaining = set(x.columns)
        selected = []
        current_score, best_new_score = 0.0, 0.0
        # Greedily add the candidate that most improves pseudo R-squared;
        # stop when no candidate improves the score.
        while remaining and current_score == best_new_score:
            scores_with_candidates = []
            for candidate in remaining:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                mod = smf.logit(formula, data).fit()
                score = mod.prsquared
                scores_with_candidates.append((score, candidate))
            scores_with_candidates.sort(reverse=False)
            best_new_score, best_candidate = scores_with_candidates.pop()
            if current_score < best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score               
        print(len(selected))
        x=x[selected]

    #rsquared_adj prsquared
    if method =="fs_bs":  
        data = pd.concat([x, y], axis=1)
        response=y.name

        remaining = set(x.columns)
        selected = []
        current_score, best_new_score = 0.0, 0.0
        while remaining and current_score == best_new_score:
            scores_with_candidates = []
            for candidate in remaining:
                formula = "{} ~ {} + 1".format(response, ' + '.join(selected + [candidate]))
                mod = smf.logit(formula, data).fit()
                score = mod.prsquared
                scores_with_candidates.append((score, candidate))
            scores_with_candidates.sort(reverse=False)
            best_new_score, best_candidate = scores_with_candidates.pop()
            if current_score < best_new_score:
                print("===========================")
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
            
            # Backward step: refit on the current selection and drop the least
            # significant variable if its p-value exceeds sls.
            formula2= "{} ~ {} + 1".format(response, ' + '.join(selected))
            mod2 = smf.logit(formula2,data).fit()
            p_list = mod2.pvalues.sort_values()
            if p_list[-1] > sls:
                # Take the last (largest-p) entry of p_list...
                var = p_list.index[-1]
                # ...and remove it from the current selection.
                selected.remove(var)
                print(p_list[-1])
                formula3= "{} ~ {} + 1".format(response, ' + '.join(selected))
                
                mod3 = smf.logit(formula3, data).fit()
                best_new_score = mod3.prsquared
                current_score = best_new_score 
        print(len(selected))
        x=x[selected]

    '''
    注意这里调用的是statsmodels.api里的逻辑回归。这个回归模型可以获取每个变量的显著性p值,p值越大越不显著,当我们发现多于一个变量不显著时,
    不能一次性剔除所有的不显著变量,因为里面可能存在我们还未发现的多变量的多重共线性,我们需要迭代的每次剔除最不显著的那个变量。 
    上面迭代的终止条件: 
    ①剔除了所有的不显著变量 
    ②剔除了某一个或某几个变量后,剩余的不显著变量变得显著了。(说明之前存在多重共线性)
    '''
    if method =="rfc":   
        # Random-forest importance: keep the 15 most important features, then
        # fit a statsmodels Logit (with intercept) to show their significance.
        RFC = RandomForestClassifier(n_estimators=200,max_depth=5,class_weight="balanced")
        RFC_Model = RFC.fit(x,y)
        features_rfc = x.columns
        featureImportance = {features_rfc[i]:RFC_Model.feature_importances_[i] for i in range(len(features_rfc))}
        featureImportanceSorted = sorted(featureImportance.items(),key=lambda x: x[1], reverse=True)
        features_selection = [k[0] for k in featureImportanceSorted[:15]] 
        x = x[features_selection]
        x['intercept'] = [1]*x.shape[0]
        LR = sm.Logit(y, x).fit()
        summary = LR.summary()
        print(summary)
        x=x.drop("intercept",axis=1)
    return x
Exemple #34
0
#-*- coding:utf-8 -*-
import pandas as pd

# Parameter initialisation.
filename = 'E:\\3data-mining\\2py-testing\\data and code\\chapter5\\demo\\data\\bankloan.xls'
data = pd.read_excel(filename)
# FIX: DataFrame.as_matrix() was removed in pandas 0.25; use .values.
x = data.iloc[:, :8].values  # first 8 columns: predictors
y = data.iloc[:, 8].values   # 9th column: target

# Import the logistic regression models from sklearn.
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()  # randomized logistic regression for variable screening
rlr.fit(x, y)  # train the selection model
rlr.get_support()  # boolean mask of the selected features
print(u'通过随机逻辑回归模型筛选特征结果')
# indices=True yields integer positions, which index the 9-entry column list safely.
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support(indices=True)]))
x = data[data.columns[rlr.get_support(indices=True)]].values  # selected features
lr = LR()  # plain logistic regression model
lr.fit(x, y)  # train on the selected feature data
print(u'逻辑回归模型训练结束')
print(u'模型的平均正确率:%s' % lr.score(x, y))  # mean training accuracy
Exemple #35
0
# Earlier RFECV-based feature-selection attempt, kept commented out:
#clf = svm.SVC(kernel='linear')
#rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(5),
#              scoring='accuracy')
#rfecv.fit(train_data, emot)
##x_label = range(1, len(rfecv.grid_scores_) + 1)
##y_label = rfecv.grid_scores_
## labels of the selected features
#support=rfecv.support_
## data of the selected features
#train_data=rfecv.transform(train_data)

# Feature screening using RLR (stability selection).
# NOTE(review): train_data and probs are defined elsewhere in the original
# script and are not visible in this chunk — confirm their shapes upstream.
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()
rlr.fit(train_data, probs)
rlr.get_support()

# Prepare the pool of regressors to compare.
# NOTE(review): this dict literal is truncated at the end of this chunk (no
# closing brace visible); entries marked "raises an error" were disabled.
import sklearn
from sklearn import gaussian_process, kernel_ridge, isotonic
from sklearn.ensemble import ExtraTreesClassifier
Regressors = {
    #        'pls':cross_decomposition.PLSRegression(),  raises an error
    'gradient boosting': ensemble.GradientBoostingRegressor(),
    #        'gaussian':gaussian_process.GaussianProcessRegressor(),  raises an error
    #        'isotonic':isotonic.IsotonicRegression(),  raises an error
    'kernelridge': kernel_ridge.KernelRidge(),
    'ARD': linear_model.ARDRegression(),
    'bayesianridge': linear_model.BayesianRidge(),
    #        'elasticnet':linear_model.ElasticNet(),  # raises an error
    'HuberRegressor': linear_model.HuberRegressor(),
#-*- coding: utf-8 -*-
# Logistic regression, automated modeling.
import pandas as pd

# Parameter initialisation.
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
# FIX: DataFrame.as_matrix() was removed in pandas 0.25; use .values.
x = data.iloc[:, :8].values  # first 8 columns: predictors
y = data.iloc[:, 8].values   # 9th column: target

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR 
rlr = RLR() # randomized logistic regression for variable screening
rlr.fit(x, y) # train the selection model
rlr.get_support() # boolean mask of selected features (scores via .scores_)
print(u'通过随机逻辑回归模型筛选特征结束。')
# FIX: the support mask has length 8, but data.columns has 9 entries;
# restrict the index to the 8 feature columns before masking.
print(u'有效特征为:%s' % ','.join(data.columns[:8][rlr.get_support()]))
x = data[data.columns[:8][rlr.get_support()]].values # selected feature data

lr = LR() # plain logistic regression model
lr.fit(x, y) # train on the selected features
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y)) # mean training accuracy (~81.4%)
Exemple #37
0
import pandas as pd
from sklearn.linear_model import RandomizedLogisticRegression as RLR
from sklearn.linear_model import LogisticRegression as LR

# Load the data.
data = pd.read_csv("C:/Users/T/Desktop/python视频/luqu.csv")
# FIX: DataFrame.as_matrix() was removed in pandas 0.25; use .values.
x = data.iloc[:, 1:4].values  # predictor columns
y = data.iloc[:, :1].values   # target (2-D column vector; sklearn warns but accepts it)

# Randomized logistic model used to screen variables.
f1 = RLR()
f1.fit(x, y)
f1.get_support()  # boolean mask of the selected variables

# Plain logistic model.
f2 = LR()
f2.fit(x, y)
f2.score(x, y)  # accuracy (computed but not printed, as in the original)
0
'''第五章'''

'''Logistic回归'''
import pandas as pd
filename = 'C:/Users/Administrator/Desktop/chapter5/demo/data/bankloan.xls'
data = pd.read_excel(filename)
# FIX: DataFrame.as_matrix() was removed in pandas 0.25; use .values.
x = data.iloc[:,:8].values  # first 8 columns: predictors
y = data.iloc[:,8].values   # 9th column: target
x.shape
y.shape

from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()  # randomized logistic regression for feature screening
rlr.fit(x,y)
rlr.get_support()  # boolean mask of the selected features
#rlr.scores_   per-feature stability scores
print('通过随机逻辑回归模型筛选特征结束。')
# FIX: the support mask has length 8, but data.columns has 9 entries;
# restrict the index to the 8 feature columns before masking.
print('有效特征为:%s' % ','.join(data.columns[:8][rlr.get_support()]))
x = data[data.columns[:8][rlr.get_support()]].values  # selected feature data
lr = LR()
lr.fit(x,y)  # train on the selected features
print('逻辑回归模型训练结束')
print('模型的平均正确率为:%s' % lr.score(x,y))


'''K-Means聚类'''

import pandas as pd
# Parameter initialisation.
inputfile = 'C:/Users/Administrator/Desktop/chapter5/demo/data/consumption_data.xls' # sales and other attribute data
Exemple #39
0
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

lessonPath = 'E:\\BaiduNetdiskDownload\\sourceCode\\week8\\lesson2.csv'
luquPath = 'E:\\BaiduNetdiskDownload\\sourceCode\\week8\\luqu2.csv'

dataLuqu = pd.read_csv(luquPath)
# Features / target.  FIX: .as_matrix() was removed in pandas 0.25; use .values.
x = dataLuqu.iloc[:, 1:4].values
y = dataLuqu.iloc[:, 0:1].values

r1 = RLR()
r1.fit(x, y)
# Feature screening.
r1.get_support()
# BUG FIX: the original used dataLuqu.columns[...] as the training matrix,
# i.e. the column *names* rather than the column data.  Select the data of
# the chosen columns, restricting the mask to the three fitted feature
# columns (positions 1:4).
t = dataLuqu[dataLuqu.columns[1:4][r1.get_support()]].values

r2 = LR()
r2.fit(t, y)
print('训练结束')
# BUG FIX: r2 was trained on t, so it must be scored on t (scoring on x
# raises a feature-count mismatch).
print('模型正确率: ' + str(r2.score(t, y)))
# NOTE: RandomizedLogisticRegression is reported as changed/removed in later
# library versions; no suitable replacement is in place here yet.
from sklearn.linear_model import LogisticRegression as LR
from sklearn.linear_model import RandomizedLogisticRegression as RLR

# Logistic regression, automated modeling.
fileName = './bankloan.xls'
data = pd.read_excel(fileName)
# Columns 0-7 are the predictors, column 8 is the target.
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

print(x)
print(y)

# Stability-selection model used purely to screen the predictors.
rlr = RLR()
rlr.fit(x, y)
rlr.get_support(indices=True)
print(rlr.get_support(indices=True))
print("通过随机逻辑回归模型筛选特征结果")
# Integer positions of the columns the selector kept.
kept_columns = data.columns[rlr.get_support(indices=True)]
print('有效特征为: %s' % ','.join(kept_columns))
x = data[kept_columns].values

# Final classifier, trained only on the screened features.
lr = LR(solver='liblinear')
lr.fit(x, y)
print("逻辑回归模型训练结束")
print('平均准确率为: %s' % lr.score(x, y))