Beispiel #1
0
    MALE_FEM.append(in_data[i][4])

# Assemble the demographic features into a DataFrame and fit a log-log
# OLS model predicting the male/female ratio; report R^2 on the
# original (exponentiated) scale.
# NOTE(review): TOT_POP, PCT_U18, PC_18_65, PCT_O65 and MALE_FEM are
# lists built earlier in the file — assumed to hold 711 numeric values.
Data = pd.DataFrame(index=range(1, 712))

Data['TOT_POP'] = pd.Series(TOT_POP, index=Data.index)
Data['PCT_U18'] = pd.Series(PCT_U18, index=Data.index)
Data['PC_18_65'] = pd.Series(PC_18_65, index=Data.index)
Data['PCT_O65'] = pd.Series(PCT_O65, index=Data.index)
Data['MALE_FEM'] = pd.Series(MALE_FEM, index=Data.index)

# Log-transform both sides so the fit is a log-log (elasticity) model.
X = np.log(Data[['TOT_POP', 'PCT_U18', 'PC_18_65', 'PCT_O65']])
y = np.log(Data['MALE_FEM'])

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42)

from sklearn.linear_model import LinearRegression as LR

# `normalize=True` was deprecated in scikit-learn 1.0 and removed in 1.2.
# For un-penalized OLS it never changed the fitted predictions (only the
# internal coefficient scaling), so it is safe to drop.
lm = LR(fit_intercept=True)
lm_fit = lm.fit(X_train, y_train)

Predicts = lm.predict(np.array(X_test))

import sklearn.metrics as po

# Score on the original scale by undoing the log transform.
print(po.r2_score(np.exp(y_test), np.exp(Predicts)))
# -*- coding:utf-8 -*-
# Logistic-regression demo on the bank-loan data set: first screen
# features with randomized (stability-selection) logistic regression,
# then fit a plain logistic regression on the surviving columns.
import pandas as pd
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
# `.as_matrix()` was removed in pandas 1.0 — use `.values` instead.
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

from sklearn.linear_model import LogisticRegression as LR
# NOTE(review): RandomizedLogisticRegression was removed in
# scikit-learn 0.21; this import requires scikit-learn < 0.21 or a
# replacement such as L1-based SelectFromModel.
from sklearn.linear_model import RandomizedLogisticRegression as RLR
rlr = RLR()  # build the randomized LR model for feature screening
rlr.fit(x, y)  # train the screening model
rlr.get_support()  # boolean mask of the selected features
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
x = data[data.columns[rlr.get_support()]].values  # keep only the selected columns

lr = LR()  # build the logistic regression model
lr.fit(x, y)  # train the model
print(u'模型的平均正确率:%s' % lr.score(x, y))
def time_accuracy(subject, bands, ec, kwargs, has_data, folds=10):
    """
    Classify data independently at each point in time.

    For each CV fold, fits a per-timepoint multinomial logistic
    regression for consonant labels and one for vowel labels, scores
    each, and combines their predicted probabilities into a joint
    consonant-vowel prediction.

    `ec` is the project's ECoG dataset module; `has_data` is assumed to
    be a boolean mask over the flattened consonant x vowel label grid
    (TODO confirm against callers).  Returns (ca, va, cva, c_va)
    accuracy arrays of shape (10, n_time); cva stays zero because the
    joint classifier below is disabled.

    NOTE(review): the result arrays are hard-coded to 10 rows while the
    loop runs `folds` times; folds > 10 would raise an IndexError —
    confirm callers always pass folds <= 10.
    """
    def reshape_time(X):
        # Reshape to (n_ex, 1, 258, channels): the 258 time samples get
        # their own axis so a single timepoint can be sliced out below.
        n_ex = X.shape[0]
        Xp = X.reshape(n_ex, 1, -1, 258)
        return np.transpose(Xp, (0, 1, 3, 2))

    ds = ec.ECoG(subject, bands, 'train', **kwargs)
    X_shape = ds.get_topological_view().shape
    n_time = 258
    ca = np.zeros((10, n_time))    # consonant accuracy per fold/time
    va = np.zeros((10, n_time))    # vowel accuracy per fold/time
    cva = np.zeros((10, n_time))   # joint CV accuracy (classifier disabled)
    c_va = np.zeros((10, n_time))  # combined consonant*vowel accuracy
    for fold in range(folds):
        # Deep-copy kwargs because the prediction flags are mutated below.
        kwargs_copy = copy.deepcopy(kwargs)
        print('fold: {}'.format(fold))
        cv_ds = ec.ECoG(subject, bands, 'train', fold=fold, **kwargs_copy)
        kwargs_copy['consonant_prediction'] = True
        c_ds = ec.ECoG(subject, bands, 'train', fold=fold, **kwargs_copy)
        kwargs_copy['consonant_prediction'] = False
        kwargs_copy['vowel_prediction'] = True
        v_ds = ec.ECoG(subject, bands, 'train', fold=fold, **kwargs_copy)
        # Consonants: train on train+valid, evaluate on the test split.
        c_ts = c_ds.get_test_set()
        c_vs = c_ds.get_valid_set()
        c_train_X = reshape_time(
            np.concatenate(
                (c_ds.get_topological_view(), c_vs.get_topological_view()),
                axis=0))
        c_train_y = np.concatenate((c_ds.y, c_vs.y), axis=0)
        c_test_X = reshape_time(c_ts.get_topological_view())
        c_test_y = c_ts.y
        # Vowels
        v_ts = v_ds.get_test_set()
        v_vs = v_ds.get_valid_set()
        v_train_X = reshape_time(
            np.concatenate(
                (v_ds.get_topological_view(), v_vs.get_topological_view()),
                axis=0))
        v_train_y = np.concatenate((v_ds.y, v_vs.y), axis=0)
        v_test_X = reshape_time(v_ts.get_topological_view())
        v_test_y = v_ts.y
        # CV
        cv_ts = cv_ds.get_test_set()
        cv_vs = cv_ds.get_valid_set()
        cv_train_X = reshape_time(
            np.concatenate(
                (cv_ds.get_topological_view(), cv_vs.get_topological_view()),
                axis=0))
        cv_train_y = np.concatenate((cv_ds.y, cv_vs.y), axis=0)
        cv_test_X = reshape_time(cv_ts.get_topological_view())
        cv_test_y = cv_ts.y
        # The three dataset variants must share identical features;
        # only their labels differ.
        assert np.all(c_train_X == v_train_X)
        assert np.all(c_train_X == cv_train_X)
        assert np.all(c_test_X == v_test_X)
        assert np.all(c_test_X == cv_test_X)
        for tt in range(n_time):
            # One classifier per timepoint tt.
            X_train = c_train_X[:, 0, tt]
            c_cl = LR(solver='lbfgs',
                      multi_class='multinomial').fit(X_train,
                                                     c_train_y.ravel())
            v_cl = LR(solver='lbfgs',
                      multi_class='multinomial').fit(v_train_X[:, 0, tt],
                                                     v_train_y.ravel())
            """
            cv_cl = LR(solver='lbfgs', multi_class='multinomial').fit(cv_train_X[:, 0, tt],
                                                                      cv_train_y.ravel())
            cva[fold, tt] = cv_cl.score(cv_test_X[:, 0, tt], cv_test_y.ravel())
                                                                      """
            ca[fold, tt] = c_cl.score(c_test_X[:, 0, tt], c_test_y.ravel())
            pc = c_cl.predict_proba(c_test_X[:, 0, tt])
            va[fold, tt] = v_cl.score(v_test_X[:, 0, tt], v_test_y.ravel())
            pv = v_cl.predict_proba(v_test_X[:, 0, tt])
            # Outer product of consonant and vowel probabilities gives a
            # joint distribution; restrict to label pairs that exist
            # (has_data) and take the argmax as the joint prediction.
            pcv = (pc[:, np.newaxis, :] * pv[..., np.newaxis]).reshape(
                pc.shape[0], -1)[:, has_data].argmax(axis=1)
            c_va[fold, tt] = np.equal(pcv.ravel(), cv_test_y.ravel()).mean()
    return ca, va, cva, c_va
Beispiel #4
0
# Compare L2-regularized logistic regression on the breast-cancer data
# across increasing iteration budgets, on training and test sets.
# NOTE(review): breast_data, LR, accuracy_score, train_test_split and
# np are defined/imported earlier in the file.
X = breast_data.data

y = breast_data.target
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=420)

# %%

# Train/test accuracy as the max_iter budget grows.
l2 = []
l2test = []
l2_big = []
for i in np.arange(1, 201, 10):
    lrl2 = LR(penalty="l2", solver="liblinear", C=0.9,
              max_iter=i)  # penalty selects the regularization type
    lrl2 = lrl2.fit(X_train, y_train)
    l2.append(accuracy_score(lrl2.predict(X_train), y_train))  # train accuracy
    l2test.append(accuracy_score(lrl2.predict(X_test), y_test))  # test accuracy

# %%
# Reference curve: a fully converged model (max_iter=500). The original
# refit the identical model once per loop iteration; the fit is
# loop-invariant, so train once and repeat the score to keep l2_big the
# same length as l2/l2test.
lrl2 = LR(penalty="l2", solver="liblinear", C=0.9, max_iter=500)
lrl2 = lrl2.fit(X_train, y_train)
l2_big.extend([accuracy_score(lrl2.predict(X_test), y_test)] *
              len(np.arange(1, 201, 10)))
# %%
graph = [l2, l2test, l2_big]
color = ["black", "gray", "red"]
label = ["L2", "L2test", "l2_big"]
	def run_CV(self):
		"""Evaluate transfer labels with 10-fold cross-validation.

		Shuffles the instances into 10 folds; for each fold, trains a
		logistic regression on the *transfer* labels of the other nine
		folds and scores it against the true labels of the held-out
		fold.  Also records the fraction of test instances whose true
		label agrees with the transfer label (posRatio).  Per-fold
		accuracies are written to "<modelVersion>.txt".

		NOTE(review): relies on module-level `modelVersion`, `random`,
		`np`, `accuracy_score` and `LR`, and assumes `self.fn`,
		`self.label` and `self.transferLabel` are indexable numpy
		arrays — confirm against the class definition.
		"""
		cvIter = 0

		totalInstanceNum = len(self.label)
		print("totalInstanceNum\t", totalInstanceNum)
		indexList = [i for i in range(totalInstanceNum)]

		print("featureNum", len(self.fn[0]))
		# print("non zero feature num", sum(self.fn[0]))

		totalTransferNumList = []
		# np.random.seed(3)
		# np.random.shuffle(indexList)

		random.shuffle(indexList)

		# Split the shuffled indices into 10 folds; the last fold takes
		# the remainder.
		foldNum = 10
		foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
		foldInstanceList = []

		for foldIndex in range(foldNum-1):
			foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
			foldInstanceList.append(foldIndexInstanceList)

		foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
		foldInstanceList.append(foldIndexInstanceList)
		# kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
		cvIter = 0
		# random.seed(3)
		totalAccList = [0 for i in range(10)]

		coefList = [0 for i in range(10)]

		posRatioList = []

		for foldIndex in range(foldNum):
			self.m_clf = LR(random_state=3)

			# Train on every fold except the held-out one.
			train = []
			for preFoldIndex in range(foldIndex):
				train.extend(foldInstanceList[preFoldIndex])

			test = foldInstanceList[foldIndex]
			for postFoldIndex in range(foldIndex+1, foldNum):
				train.extend(foldInstanceList[postFoldIndex])

			trainNum = int(totalInstanceNum*0.9)

			# print(test)
			fn_test = self.fn[test]

			label_test = self.label[test]

			sampledTrainNum = len(train)
			# sampledTrainNum = 100
			train_sampled = random.sample(train, sampledTrainNum)

			# Fit on the transfer labels, evaluate against true labels.
			fn_train = self.fn[train_sampled]
			label_train = self.transferLabel[train_sampled]

			self.m_clf.fit(fn_train, label_train)

			label_preds = self.m_clf.predict(fn_test)
			acc = accuracy_score(label_test, label_preds)

			# Fraction of test instances whose transfer label matches
			# the true label.
			testOneNum = np.sum(label_test==self.transferLabel[test])
			testNum = len(fn_test)

			posRatio = testOneNum*1.0/testNum
			posRatioList.append(posRatio)

			totalAccList[cvIter] = acc

			cvIter += 1

		# Persist per-fold accuracies, one per line.
		totalACCFile = modelVersion+".txt"
		f = open(totalACCFile, "w")
		for i in range(10):
			f.write(str(totalAccList[i]))
			# for j in range(totalAlNum):
			# 	f.write(str(totalAccList[i][j])+"\t")
			f.write("\n")
		f.close()

		print("posRatioList", posRatioList, np.mean(posRatioList), np.sqrt(np.var(posRatioList)))
		print("acc", np.mean(totalAccList), np.sqrt(np.var(totalAccList)))
Beispiel #6
0
# Author: Ma Xiao (马肖)
# E-mail: [email protected]
# Github: https://github.com/Albertsr
# Demo: persist a fitted scikit-learn model with pickle, both in memory
# and on disk, and verify the reloaded model scores identically.
# NOTE: pickle.loads must only ever be used on trusted data — unpickling
# untrusted bytes can execute arbitrary code.

import pickle
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=2018)
clf = LR().fit(X_train, y_train)

# Serialize the fitted model into an in-memory byte string.
s = pickle.dumps(clf)

# Deserialize it back into a model object.
clf_load = pickle.loads(s)

# Print the prediction accuracy of the reloaded model.
print(clf_load.score(X_test, y_test))

# Use dump(object, file) to save the model to disk.
with open('clf_pickle', 'wb') as model:
    pickle.dump(clf, model)

# Reload the model with pickle and print its result.
with open('clf_pickle', 'rb') as model:
Beispiel #7
0
            else:
                new_sentence += " "
        data[sess].append(new_sentence)

    chronology = list(data.keys())

    for i in range(len(data.keys())):
        for j in range(i + 1, len(data.keys())):
            if chronology[i] > chronology[j]:
                chronology[i], chronology[j] = chronology[j], chronology[i]

    date = [chronology[len(chronology) * i // 6 - 1] for i in range(1, 7)]

    clf_option = [
        Boosting(),
        LR(n_jobs=-1),
        NB(), LinearSVC(),
        Neighbors(),
        RFC()
    ]
    mre_pred = []

    for iter in tqdm(range(5)):
        query = "Select * from berita WHERE Date <= " + str(
            date[iter]) + " AND Title LIKE '%ekono%' "
        c.execute(query)
        train_data = c.fetchall()

        query = "Select * from berita WHERE Date <= " + str(
            date[iter]) + " AND NOT Title LIKE '%ekono%' "
        c.execute(query)
# Parse the whitespace-separated data file (numDims feature values per
# line followed by an integer label), then time a logistic-regression
# fit over the parsed data and append the elapsed time to a results file.
# NOTE(review): numPoints, numDims, data and file_data are defined
# earlier in the file.
labels = np.zeros(numPoints)
test_point = np.zeros(numDims)

#read in the data and labels from the file
# (enumerate replaces the original manual `i` counter)
for i, line in enumerate(file_data):
    split_line = line.strip("\n").split(" ")
    labels[i] = int(split_line[numDims])
    for j in range(numDims):
        data[i][j] = float(split_line[j])

file_data.close()

start_time = time.time()

logistic = LR(n_jobs=8)
logistic.fit(data, labels)

end_time = time.time() - start_time

# Converted from Python 2 print statements to Python 3 print calls for
# consistency with the rest of the file.
print("Time take for logistic regression: ", end_time, " seconds")
print("Number of Iterations: ", logistic.n_iter_)
logistic.predict([test_point])
# `with` guarantees the output handle is closed even on error.
with open('out8/out100k.txt', 'a') as outfile:
    outfile.write(str(end_time))
    outfile.write('\n')
Beispiel #9
0
def train_predict_lr_forward(train_file,
                             test_file,
                             predict_valid_file,
                             predict_test_file,
                             C,
                             n_fold=5):
    """Greedy forward feature selection with logistic regression.

    Repeatedly adds the single feature that maximizes cross-validated
    AUC until no candidate improves it, then retrains on the selected
    features and writes out-of-fold validation predictions, test-set
    predictions, and the selected-feature list to files.

    train_file / test_file: paths understood by the project's
    load_data(); C: inverse regularization strength.

    NOTE(review): StratifiedKFold(y, n_folds=...) is the pre-0.18
    sklearn.cross_validation API, and class_weight='auto' was replaced
    by 'balanced' — this function requires an old scikit-learn release.
    """

    # Derive a model name from the input file for the log/output files.
    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'lr_forward_{}'.format(C)
    model_name = '{}_{}'.format(algo_name, feature_name)
    logging.basicConfig(format='%(asctime)s   %(levelname)s   %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info("Loading training and test data...")
    X_trn, y_trn = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    logging.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)

    cv = StratifiedKFold(y_trn,
                         n_folds=n_fold,
                         shuffle=True,
                         random_state=2015)

    selected_features = []
    features_to_test = [
        x for x in range(X_trn.shape[1]) if x not in selected_features
    ]

    # Greedy forward pass: keep adding the best feature while CV AUC improves.
    auc_cv_old = .5
    is_improving = True
    while is_improving:
        auc_cvs = []
        for feature in features_to_test:
            logging.info('{}'.format(selected_features + [feature]))
            X = X_trn[:, selected_features + [feature]]

            # Out-of-fold predictions for the candidate feature set.
            p_val = np.zeros_like(y_trn)
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                clf = LR(C=C, class_weight='auto', random_state=2014)
                clf.fit(X[i_trn], y_trn[i_trn])
                p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

            auc_cv = AUC(y_trn, p_val)
            logging.info('AUC CV: {:.6f}'.format(auc_cv))
            auc_cvs.append(auc_cv)

        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            logging.info('selected features: {}'.format(selected_features))
        else:
            is_improving = False
            logging.info(
                'final selected features: {}'.format(selected_features))

    logging.info('saving selected features as a file')
    with open('{}_selected.txt'.format(model_name), 'w') as f:
        f.write('{}\n'.format(selected_features))

    # Refit on the final feature set and write out-of-fold predictions.
    X = X_trn[:, selected_features]
    logging.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))

    p_val = np.zeros_like(y_trn)
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logging.info('Training CV #{}'.format(i))
        clf = LR(C=C, class_weight='auto', random_state=2015)
        clf.fit(X[i_trn], y_trn[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    auc_cv = AUC(y_trn, p_val)
    logging.info('AUC CV: {:.6f}'.format(auc_cv))
    logging.info("Writing test predictions to file")
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    # Retrain the last fold's classifier on all training data for the
    # final test-set predictions.
    logging.info('Retraining with 100% data...')
    clf.fit(X, y_trn)
    p_tst = clf.predict_proba(X_tst[:, selected_features])[:, 1]
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
Beispiel #10
0
# 7.3 Diagnostics for the linear regression
# 7.3.1 Residual analysis
# In[13]:
# Fit the OLS linear model with statsmodels and obtain predictions and
# residuals.
# NOTE(review): `exp` is a DataFrame loaded earlier in the file, and
# `ft` is a project helper module — confirm their definitions.
ana1 = ols('avg_exp ~ Age + Income + dist_home_val', data=exp).fit()
exp['Pred'] = ana1.predict(exp)
exp['resid'] = ana1.resid  # residuals fan out as x grows: heteroscedasticity
exp.plot('Pred', 'resid',
         kind='scatter')  # Pred = beta*Income; residual spread widens with Pred
ana1.summary()
# In[]:
# Repeat the fit with scikit-learn and rebuild the residual plot by hand.
Xtrain = exp[['Age', 'Income', 'dist_home_val']]
Ytrain = exp[['avg_exp']]

reg = LR().fit(Xtrain, Ytrain)
yhat = reg.predict(Xtrain)  # in-sample predictions
print(reg.score(Xtrain, Ytrain))

predict = pd.DataFrame(yhat, columns=['Pred'])
print(Ytrain.dtypes, predict.dtypes)

y = Ytrain.copy()
ft.recovery_index([y])
# resid = pd.DataFrame((y['avg_exp'] - predict["Pred"]), columns=['resid'])
resid = pd.DataFrame(y['avg_exp'].sub(predict["Pred"]), columns=['resid'])

resid_1 = pd.concat([predict, resid], axis=1)
resid_1.plot('Pred', 'resid', kind='scatter')

print(ft.r2_score_customize(Ytrain, yhat, 1))
Beispiel #11
0
    def fit(self, X, y):  # -1 for unlabeled
        """Fit the semi-supervised model pessimistically.

        Trains the wrapped supervised model on the labeled rows, then
        uses NLopt global optimization to search for soft labels of the
        unlabeled rows that minimize the discriminative likelihood
        objective, and finally refits the model on labeled plus
        pessimistically labeled data.  Rows with y == -1 are unlabeled.

        NOTE(review): self.bestlbls is read here but never assigned in
        this method — presumably initialized in __init__; confirm,
        otherwise the except/fallback path raises AttributeError.
        """
        unlabeledX = X[y == -1, :]
        labeledX = X[y != -1, :]
        labeledy = y[y != -1]

        M = unlabeledX.shape[0]

        # train on labeled data
        self.model.fit(labeledX, labeledy)

        unlabeledy = self.predict(unlabeledX)

        #re-train, labeling unlabeled instances pessimistically

        # pessimistic soft labels ('weights') q for unlabelled points, q=P(k=0|Xu)
        # NOTE(review): `grad=[]` matches the NLopt objective signature
        # (gradient output argument) — the default list is passed through,
        # not mutated here.
        f = lambda softlabels, grad=[
        ]: self.discriminative_likelihood_objective(
            self.model,
            labeledX,
            labeledy=labeledy,
            unlabeledData=unlabeledX,
            unlabeledWeights=numpy.vstack((softlabels, 1 - softlabels)).T,
            gradient=grad)  #- supLL
        lblinit = numpy.random.random(len(unlabeledy))

        try:
            # Global derivative-free search over the M soft labels in [0, 1].
            self.it = 0
            opt = nlopt.opt(nlopt.GN_DIRECT_L_RAND, M)
            opt.set_lower_bounds(numpy.zeros(M))
            opt.set_upper_bounds(numpy.ones(M))
            opt.set_min_objective(f)
            opt.set_maxeval(self.max_iter)
            self.bestsoftlbl = opt.optimize(lblinit)
            print(" max_iter exceeded.")
        except Exception as e:
            print(e)
            self.bestsoftlbl = self.bestlbls

        # Fall back to the best labels tracked during optimization when
        # they differ from the optimizer's return value.
        if numpy.any(self.bestsoftlbl != self.bestlbls):
            self.bestsoftlbl = self.bestlbls
        ll = f(self.bestsoftlbl)

        # Hard labels: soft label < 0.5 means class 1, since q = P(k=0|Xu).
        unlabeledy = (self.bestsoftlbl < 0.5) * 1
        uweights = numpy.copy(
            self.bestsoftlbl
        )  # large prob. for k=0 instances, small prob. for k=1 instances
        uweights[unlabeledy == 1] = 1 - uweights[
            unlabeledy
            == 1]  # subtract from 1 for k=1 instances to reflect confidence
        weights = numpy.hstack((numpy.ones(len(labeledy)), uweights))
        labels = numpy.hstack((labeledy, unlabeledy))
        if self.use_sample_weighting:
            self.model.fit(numpy.vstack((labeledX, unlabeledX)),
                           labels,
                           sample_weight=weights)
        else:
            self.model.fit(numpy.vstack((labeledX, unlabeledX)), labels)

        if self.verbose > 1:
            print("number of non-one soft labels: ",
                  numpy.sum(self.bestsoftlbl != 1), ", balance:",
                  numpy.sum(self.bestsoftlbl < 0.5), " / ",
                  len(self.bestsoftlbl))
            print("current likelihood: ", ll)

        # Platt scaling: if the wrapped model lacks predict_proba, fit a
        # logistic regression on its hard predictions to calibrate scores.
        if not getattr(self.model, "predict_proba", None):
            # Platt scaling
            self.plattlr = LR()
            preds = self.model.predict(labeledX)
            self.plattlr.fit(preds.reshape(-1, 1), labeledy)

        return self
Beispiel #12
0
# Linear-regression demo: fit the first five columns of df against
# Price, inspect intercept/coefficients, and correlate predictions with
# the held-out targets.
# sns.distplot(df['Price'])
# plt.show()

# sns.heatmap(df.corr(),annot=True)
# plt.show()

# REGRESSION
X = df[df.columns[range(5)]]
y = df['Price']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=101)

lm = LR()
lm.fit(X_train, y_train)

# Fixed typo in the output label ("Intecept" -> "Intercept").
print("Intercept: {}\n".format(lm.intercept_))
cdf = pd.DataFrame(lm.coef_, X.columns, columns=['Coeff'])
print("Coefficients:\n{}\n".format(cdf))

pred = pd.DataFrame({'A': lm.predict(X_test), 'B': y_test})
# print(pred)
print("Correlation:\n{}\n".format(pred.corr()))

# BOSTON
# NOTE(review): load_boston was removed in scikit-learn 1.2; this
# import requires scikit-learn < 1.2.
from sklearn.datasets import load_boston

boston = load_boston()
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
from sklearn.preprocessing import PolynomialFeatures
# Polynomial-regression demo on the positions data set: compare a plain
# linear fit against degree-4 polynomial features.
data = pd.read_csv("positions.csv")

print(data.columns)
print("************\n")
#print(data.describe())

# Feature for the X axis
Level = data.iloc[:, 1].values.reshape(-1, 1)
# Target for the Y axis
Salary = data.iloc[:, 2].values.reshape(-1, 1)

# First make predictions from a plain linear fit (a straight line).
regression = LR()
# Fit X (Level) against Y (Salary).
regression.fit(Level, Salary)

# Predict the salary for a person whose level is 8.3.
tahmin = regression.predict([[8.3]])

# A straight line underfits this data, so this prediction is unreliable.
print("Tahmini Salary fiyatı:" + str(tahmin))

# Now fit a polynomial curve instead.

regressionPoly = PolynomialFeatures(degree=4)
# Expand Level into degree-4 polynomial features.
levelPoly = regressionPoly.fit_transform(Level)
Beispiel #14
0
    '/Users/jiangjiantao/Downloads/79079699_2_py大作业_调查_5_5.csv',
    encoding='GBK')
print(sourceData)
# NOTE(review): read_csv, train_test_split and LR are imported earlier
# in the file; sourceData is the feature frame loaded just above.
y = read_csv('/Users/jiangjiantao/Downloads/yyyyyyyyyyyyyyy.csv',
             encoding='GBK')
print(y.shape)

XTrain, XTest, YTrain, YTest = train_test_split(sourceData,
                                                y,
                                                test_size=0.6,
                                                random_state=420)
# Reset the indices of the split frames to 0..n-1.
for i in [XTrain, XTest]:
    i.index = range(i.shape[0])
XTrain.shape

reg = LR().fit(XTrain, YTrain)
yHat = reg.predict(XTest)
print(yHat)

reg.coef_
# Pair each feature name with its fitted coefficient.
[*zip(XTrain.columns, reg.coef_)]
print(reg.intercept_)

from sklearn.metrics import mean_squared_error as MSE
print(MSE(yHat, YTest))
y.max()
y.min()
import sklearn
# List all scorer names scikit-learn accepts for `scoring=`.
sorted(sklearn.metrics.SCORERS.keys())
print(
    cross_val_score(reg, sourceData, y, cv=5,
Beispiel #15
0
# Split Xo/Y into a fit set (first nfit rows) and a validation set.
xfit = Xo[:nfit]
yfit = Y[:nfit]
xval = Xo[nfit:]
yval = Y[nfit:]
'''2. 在已有的数据集上进行算法的验证和测试'''
# (The string above says: "2. Validate and test the algorithms on the
# existing data set.")

# NOTE(review): the successive `model = ...` assignments below overwrite
# each other — only the last (BayesianRidge) is in effect; the earlier
# lines appear to be kept as quick toggles between candidate models.
from sklearn.svm import SVR
from sklearn.svm import NuSVR
model = SVR()
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeRegressor as dtr
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# NOTE(review): sklearn.cross_validation was removed in scikit-learn
# 0.20 (use sklearn.model_selection) and RandomizedLasso in 0.21 —
# this block requires an old scikit-learn release.
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression as LR
model = LR(C=0.004)
model = LR(C=0.01, penalty='l1')
from sklearn.linear_model import BayesianRidge as BR
model = BR(alpha_1=1e2,
           alpha_2=3e2,
           lambda_1=1e-9,
           lambda_2=1e-9,
           compute_score=False)
from sklearn.linear_model import (LinearRegression, Lasso, RandomizedLasso,
                                  Ridge)
from sklearn.feature_selection import (RFE, f_regression)
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
def whole_data_models(train_indep, train_dep, num_bootstraps, num_trees,
                      master_train, test_data):
    """Fit RF/LR/DT churn classifiers on master_train and evaluate on
    test_data.

    Trains a random forest, a logistic regression and a decision tree
    (target column 'churn_flag'; id column 'sl_uuid' excluded), prints
    train/test accuracy and precision/recall for each model, writes the
    RF test predictions and feature importances to Excel files, and
    pickles the important-feature column list.

    NOTE(review): train_indep, train_dep and num_bootstraps are unused —
    kept only to preserve the call signature.
    """
    model_rf = RF(n_estimators=num_trees,
                  criterion='gini',
                  bootstrap=True,
                  n_jobs=-1,
                  min_samples_leaf=5)
    model_lr = LR()
    model_dt = DT()

    indep_columns = master_train.columns.tolist()
    indep_columns.remove('churn_flag')
    indep_columns.remove('sl_uuid')
    #indep_columns = ['customer_life']
    '''try:
		columns = pk.load(open('columns.pk','r'))
		indep_columns = columns
	except:
		print ("not  created yet")
	'''
    master_indep = master_train[indep_columns]
    master_dep = master_train['churn_flag'].tolist()
    model_rf.fit(master_indep, master_dep)
    model_lr.fit(master_indep, master_dep)
    model_dt.fit(master_indep, master_dep)
    # Training-set accuracies (converted from Python 2 print statements
    # to Python 3 print calls for consistency with the rest of the file).
    print(model_rf.score(master_indep, master_dep))
    print(model_lr.score(master_indep, master_dep))
    print(model_dt.score(master_indep, master_dep))
    train_pred_rf = model_rf.predict(master_indep)
    train_pred_lr = model_lr.predict(master_indep)
    train_pred_dt = model_dt.predict(master_indep)
    score_rf = PRFS(y_true=master_dep, y_pred=train_pred_rf, average='binary')
    score_lr = PRFS(y_true=master_dep, y_pred=train_pred_lr, average='binary')
    score_dt = PRFS(y_true=master_dep, y_pred=train_pred_dt, average='binary')

    print('Random forest precision and recall:\t' + str(score_rf[0]) + '\t' +
          str(score_rf[1]))
    print('Logistic regression precision and recall:\t' + str(score_lr[0]) +
          '\t' + str(score_lr[1]))
    print('Decision tree precision and recall:\t' + str(score_dt[0]) + '\t' +
          str(score_dt[1]))

    #Evaluating test data
    indep_columns = master_train.columns.tolist()
    indep_columns.remove('churn_flag')
    indep_columns.remove('sl_uuid')
    #indep_columns = ['customer_life']
    master_indep = test_data[indep_columns]
    master_dep = test_data['churn_flag'].tolist()
    #model_rf.fit(master_indep, master_dep)
    #model_lr.fit(master_indep, master_dep)
    #model_dt.fit(master_indep, master_dep)
    # Test-set accuracies.
    print('TEST DATA')
    print(model_rf.score(master_indep, master_dep))
    print(model_lr.score(master_indep, master_dep))
    print(model_dt.score(master_indep, master_dep))
    train_pred_rf = model_rf.predict(master_indep)
    train_pred_lr = model_lr.predict(master_indep)
    train_pred_dt = model_dt.predict(master_indep)
    score_rf = PRFS(y_true=master_dep, y_pred=train_pred_rf)
    score_lr = PRFS(y_true=master_dep, y_pred=train_pred_lr)
    score_dt = PRFS(y_true=master_dep, y_pred=train_pred_dt)
    prediction = model_rf.predict(master_indep)
    # NOTE(review): these assignments mutate a slice of test_data and may
    # trigger pandas' SettingWithCopyWarning — confirm intended.
    master_indep['actual'] = master_dep
    master_indep['prediction'] = prediction
    master_indep.to_excel('master_indep.xlsx')

    print('Random forest precision and recall:\t' + str(score_rf[0]) + '\t' +
          str(score_rf[1]))
    print('Logistic regression precision and recall:\t' + str(score_lr[0]) +
          '\t' + str(score_lr[1]))
    print('Decision tree precision and recall:\t' + str(score_dt[0]) + '\t' +
          str(score_dt[1]))
    print('importance of variables:\t')
    col = indep_columns
    imp = model_rf.feature_importances_.tolist()
    di = {'columns': col, 'importance': imp}
    df = pd.DataFrame(di)
    df.to_excel('variable_importance.xlsx')
    df = df[df['importance'] > 0.00001]
    columns = df['columns'].tolist()
    # pickle requires a binary-mode handle in Python 3 (the original
    # opened in text mode 'w' and never closed the file).
    with open('columns.pk', 'wb') as col_file:
        pk.dump(columns, col_file)
    print(df)
#Y-axis setup.
ax.set_ylabel("Price", fontsize=22)
ax.set_ylim(0, 800000)
# SMF is presumably matplotlib's StrMethodFormatter — formats y ticks
# as dollar amounts with thousands separators. TODO confirm the import.
ax.yaxis.set_major_formatter(SMF('${x:,.0f}'))

ax.tick_params(axis='both', which='major', labelsize=14)

#Legend setup.
# Remove the seaborn-generated legend and draw a customized one instead.
exp3._legend.remove()
ax.legend(loc='upper left',
          title='Overall House Quality',
          labels=[
              'Very Poor', 'Poor', 'Fair', 'Below Average', 'Average',
              'Above Average', 'Good', 'Very Good', 'Excellent',
              'Very Excellent'
          ],
          ncol=2,
          title_fontsize=18,
          fontsize=14)

############## HYPOTHESIS TESTING ##################
### Significance Test
# Regress SalePrice on Overall Qual; report R^2 and the univariate
# p-value (fr is presumably sklearn's f_regression — confirm against
# the imports at the top of the file).
x = df['Overall Qual'].to_numpy().reshape(-1, 1)
y = df['SalePrice'].to_numpy().reshape(-1, )
reg = LR().fit(x, y)
r_sq = reg.score(x, y)
p_val = fr(x, y)[1][0]

print("The R-squared is {}.".format(round(r_sq, 3)))
print("The p_value is {}.".format(p_val))
Beispiel #18
0
def generatePredictWithLostic(x, y, x_predict):
    """Fit a logistic-regression model on (x, y) and return the class
    probabilities predicted for x_predict as a DataFrame."""
    classifier = LR()
    classifier.fit(x, y)
    return pd.DataFrame(classifier.predict_proba(x_predict))
    def run_CV(self):
        """Active-learning evaluation with 10-fold cross-validation.

        For each fold: pre-trains on a small initial example set, then
        repeatedly picks the next example to label via
        self.select_example, updates confidence bounds, refits the
        classifier, and records test accuracy after each query until
        `rounds` queries have been made.  Per-fold accuracy curves are
        written to "<modelVersion>_acc.txt" under `fileSrc`.

        NOTE(review): depends on module-level `rounds`, `modelVersion`,
        `fileSrc`, `random`, `os` and `LR`; totalAccList is sized for
        exactly 10 folds.
        """
        cvIter = 0

        totalInstanceNum = len(self.label)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]

        totalTransferNumList = []
        # np.random.seed(3)
        random.shuffle(indexList)

        # Split the shuffled indices into 10 folds; the last fold takes
        # the remainder.
        foldNum = 10
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []

        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex *
                                              foldInstanceNum:(foldIndex + 1) *
                                              foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)

        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)
        # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
        cvIter = 0
        totalAccList = [[] for i in range(10)]
        totalNewClassFlagList = [[] for i in range(10)]
        for foldIndex in range(foldNum):

            # Multinomial LR for multi-class problems, the default
            # binary LR otherwise.
            if self.m_multipleClass:
                self.m_clf = LR(multi_class="multinomial",
                                solver='lbfgs',
                                random_state=3,
                                fit_intercept=False)
            else:
                self.m_clf = LR(random_state=3)

            # Train on every fold except the held-out one.
            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            fn_train = self.fn[train]

            test = foldInstanceList[foldIndex]

            fn_test = self.fn[test]
            label_test = self.label[test]

            featureDim = len(fn_train[0])
            self.init_confidence_bound(featureDim)

            # Seed the labeled pool with a pre-trained initial selection.
            initExList = []
            initExList = self.pretrainSelectInit(train, foldIndex)

            fn_init = self.fn[initExList]
            label_init = self.label[initExList]

            print("initExList\t", initExList, label_init)
            queryIter = 3
            labeledExList = []
            unlabeledExList = []

            labeledExList.extend(initExList)
            unlabeledExList = list(set(train) - set(labeledExList))

            # Active-learning loop: query one example per iteration.
            while queryIter < rounds:
                fn_train_iter = []
                label_train_iter = []

                fn_train_iter = self.fn[labeledExList]
                label_train_iter = self.label[labeledExList]

                self.m_clf.fit(fn_train_iter, label_train_iter)

                idx = self.select_example(unlabeledExList)
                self.update_select_confidence_bound(idx)
                # print(queryIter, "idx", idx, self.label[idx])
                # self.update_select_confidence_bound(idx)

                labeledExList.append(idx)
                unlabeledExList.remove(idx)

                acc = self.get_pred_acc(fn_test, label_test, labeledExList)
                totalAccList[cvIter].append(acc)
                queryIter += 1

            cvIter += 1

        # Write one tab-separated accuracy curve per fold.
        totalACCFile = modelVersion + "_acc.txt"
        totalACCFile = os.path.join(fileSrc, totalACCFile)

        f = open(totalACCFile, "w")
        for i in range(10):
            totalAlNum = len(totalAccList[i])
            for j in range(totalAlNum):
                f.write(str(totalAccList[i][j]) + "\t")
            f.write("\n")
        f.close()
	def run_CV(self):
		"""Run 10-fold cross-validated active learning and record auditor metrics.

		The instance indices are shuffled once (fixed seed 3) and split into
		10 folds; each fold in turn is held out as the test set.  Within a
		fold the classifier is seeded with a small pre-trained set and the
		loop then queries the oracle for strong labels, recording per-query
		auditor accuracy and "extra example" accuracy.  Results are written
		to "<modelVersion>_auditor_acc.txt" and "<modelVersion>_extra_acc.txt"
		via the module-level writeFile helper.
		"""
		cvIter = 0
		
		totalInstanceNum = len(self.m_targetLabel)
		print("totalInstanceNum\t", totalInstanceNum)
		indexList = [i for i in range(totalInstanceNum)]

		totalTransferNumList = []
		# Fixed seed so the fold assignment is reproducible across runs.
		np.random.seed(3)
		np.random.shuffle(indexList)

		foldNum = 10
		foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
		foldInstanceList = []

		for foldIndex in range(foldNum-1):
			foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
			foldInstanceList.append(foldIndexInstanceList)

		# The last fold absorbs the remainder when the split is uneven.
		foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
		foldInstanceList.append(foldIndexInstanceList)
		# kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
		# random.seed(3)
		totalAccList = [[] for i in range(10)]
		humanAccList = [[] for i in range(10)]
		totalExtraAccList = []
		# self.get_base_learners()

		correctTransferRatioList = []
		totalTransferNumList = []
		correctTransferLabelNumList = []
		correctUntransferRatioList = []

		totalAuditorPrecisionList = []
		totalAuditorRecallList = []
		totalAuditorAccList = []


		for foldIndex in range(foldNum):
			
			# self.clf = LinearSVC(random_state=3)

			# Fresh classifiers per fold so no state leaks across folds.
			if self.m_multipleClass:
				self.m_clf = LR(multi_class="multinomial", solver='lbfgs',random_state=3,  fit_intercept=False)
			else:
				self.m_clf = LR(random_state=3)
			self.m_judgeClassifier = LR(random_state=3)

			# Training set = all folds except the current (held-out) one.
			train = []
			for preFoldIndex in range(foldIndex):
				train.extend(foldInstanceList[preFoldIndex])

			test = foldInstanceList[foldIndex]
			for postFoldIndex in range(foldIndex+1, foldNum):
				train.extend(foldInstanceList[postFoldIndex])

			trainNum = int(totalInstanceNum*0.9)

			targetNameFeatureTrain = self.m_targetNameFeature[train]
			targetLabelTrain = self.m_targetLabel[train]
			# targetDataFeatureTrain = self.m_targetDataFeature[train]

			targetNameFeatureTest = self.m_targetNameFeature[test]
			targetLabelTest = self.m_targetLabel[test]

			transferLabelTest = self.m_transferLabel[test]
			# targetDataFeatureTest = self.m_targetDataFeature[test]

			# sourceUniqueClass = np.unique(self.m_sourceLabel)

			# Seed examples chosen by pre-training for this fold.
			initExList = []
			initExList = self.pretrainSelectInit(train, foldIndex)

			targetNameFeatureInit = self.m_targetNameFeature[initExList]
			targetLabelInit = self.m_targetLabel[initExList]

			print("initExList\t", initExList, targetLabelInit)

			queryIter = 0
			labeledExList = []
			unlabeledExList = []
			###labeled index
			labeledExList.extend(initExList)
			unlabeledExList = list(set(train)-set(labeledExList))

			# Starts at 3.0, matching the three seed labels already spent.
			activeLabelNum = 3.0
			transferLabelNum = 0.0
			transferFeatureList = []
			transferFlagList = []

			featureDim = len(targetNameFeatureTrain[0])
			self.init_confidence_bound(featureDim, labeledExList, unlabeledExList)

			targetNameFeatureIter = targetNameFeatureInit
			targetLabelIter = targetLabelInit

			correctTransferLabelNum = 0.0
			wrongTransferLabelNum = 0.0
			correctUntransferLabelNum = 0.0
			wrongUntransferLabelNum = 0.0

			# auditorPrecisionList = []
			# auditorRecallList = []
			auditorAccList = []
			extraAccList = []

			# NOTE(review): the classifier is fit only once here on the seed
			# set; the per-iteration refit inside the loop is commented out.
			# Confirm that is intentional.
			self.m_clf.fit(targetNameFeatureInit, targetLabelInit)

			# `rounds` is a module-level budget on active (human) labels.
			while activeLabelNum < rounds:

				# targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
				# targetLabelIter = self.m_targetLabel[labeledExList]

				# self.m_clf.fit(targetNameFeatureIter, targetLabelIter) 

				exId = self.select_example(unlabeledExList) 
				
				exLabel = -1
				
				self.m_strongLabeledIDList.append(exId)
				self.update_select_confidence_bound(exId)
				self.update_judge_confidence_bound(exId)
				activeLabelNum += 1.0
				activeLabelFlag = True

				exLabel = self.m_targetLabel[exId]
				
				# Record whether the transferred (weak) label agrees with the
				# true label; these (feature, flag) pairs train the auditor.
				transferLabel = self.m_transferLabel[exId]
				if transferLabel == exLabel:
					# correctUntransferLabelNum += 1.0
					transferFlagList.append(1.0)
					transferFeatureList.append(self.m_targetNameFeature[exId])
				else:
					# wrongUntransferLabelNum += 1.0
					transferFlagList.append(0.0)
					transferFeatureList.append(self.m_targetNameFeature[exId])

					# auditorPrecision = 0.0
					# if correctTransferLabelNum+wrongTransferLabelNum > 0.0:
					# 	auditorPrecision = correctTransferLabelNum*1.0/(correctTransferLabelNum+wrongTransferLabelNum)

				auditorAcc = self.getAuditorMetric(transferFeatureList, transferFlagList, targetNameFeatureTest, transferLabelTest, targetLabelTest)
				# print("auditorAcc", auditorAcc)
				auditorAccList.append(auditorAcc)

				labeledExList.append(exId)
				unlabeledExList.remove(exId)

				# acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
				# totalAccList[cvIter].append(acc)
				extraAcc = self.addExtraExample(transferFeatureList, transferFlagList, targetNameFeatureTest, transferLabelTest, targetLabelTest)
				extraAccList.append(extraAcc)
					# humanAccList[cvIter].append(acc)
				queryIter += 1

			# totalAuditorPrecisionList.append(auditorPrecisionList)
			# totalAuditorRecallList.append(auditorRecallList)
			totalAuditorAccList.append(auditorAccList)
			totalExtraAccList.append(extraAccList)
			

			cvIter += 1      
		
		# print("transfer num\t", np.mean(totalTransferNumList), np.sqrt(np.var(totalTransferNumList)))

		# print("extraList", extraAccList, np.mean(extraAccList), np.sqrt(np.var(extraAccList)))
		# print("correct ratio\t", np.mean(correctTransferRatioList), np.sqrt(np.var(correctTransferRatioList)))
		# print("untransfer correct ratio\t", np.mean(correctUntransferRatioList), np.sqrt(np.var(correctUntransferRatioList)))

		# AuditorPrecisionFile = modelVersion+"_auditor_precision.txt"
		# writeFile(totalAuditorPrecisionList, AuditorPrecisionFile)

		# AuditorRecallFile = modelVersion+"_auditor_recall.txt"
		# writeFile(totalAuditorRecallList, AuditorRecallFile)

		# Persist the per-query curves (writeFile / modelVersion are
		# module-level).
		AuditorAccFile = modelVersion+"_auditor_acc.txt"
		writeFile(totalAuditorAccList, AuditorAccFile)

		# totalACCFile = modelVersion+"_acc.txt"
		# writeFile(totalAccList, totalACCFile)

		# humanACCFile = modelVersion+"_human_acc.txt"
		# writeFile(humanAccList, humanACCFile)

		extraACCFile = modelVersion+"_extra_acc.txt"
		writeFile(totalExtraAccList, extraACCFile)
Beispiel #21
0
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR

# Fit a logistic-regression model on the bank-loan dataset and report its
# training accuracy.
#
# Fixed: the path mixes '/' and '\' with non-ASCII directory names; without
# the r-prefix, "\逻" is an invalid escape sequence (SyntaxWarning, and an
# error in future Python versions).  The raw string has the identical
# runtime value.
inputpath = r'D:/PythonProject/python02\逻辑回归/data/bankloan.xls'
data = pd.read_excel(inputpath)

# First 8 columns are the features; column 8 is the default label.
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

lr = LR(solver='liblinear')
lr = lr.fit(x, y)
print('模型的平均准确度为:%s' % lr.score(x, y))
Beispiel #22
0
import operator
from functools import reduce
# Fixed: pandas and numpy are used below (pd.read_csv / np.array) but were
# never imported, so the script crashed with NameError.
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
import pickle

# Each row of Train_vec.csv: 9 feature columns followed by the label.
data = pd.read_csv('Train_vec.csv')
data = np.array(data)

x_train = data[:, :9]
y_train = data[:, 9]

print('x_train[1] : ', x_train[1])
print('y_train[1] : ', y_train[1])

# One-vs-rest logistic regression on the full training set.
lr = LR(random_state=1, solver='lbfgs', C=1.0, multi_class='ovr')
lr.fit(x_train, y_train)

preds = lr.predict(x_train)
preds_prob = lr.predict_proba(x_train)
lr_score = lr.score(x_train, y_train)  # training accuracy

print('LR - Predictions (' + str(len(preds)) + ') : ', preds)
#print('LR - Prediction Probability : ', preds_prob)
print('LR - Scores : ', lr_score)

#print('Are all predictions of LR are correct : ', reduce(operator.and_, y_train == preds))

# Indices of the samples predicted as class 1.
idxs = [x for x in range(0, len(preds)) if preds[x] == 1]

x_train_2 = x_train  #[idxs]
Beispiel #23
0
# =============================================================================
# linear regression
# =============================================================================
# case2)
#  - split train into X (features) and y (target)
#  - fit the model, then predict y for the held-out test set
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=0)
print(len(train_x))
print(len(train_y))

print(len(test_x))
print(len(test_y))

linear = lr()
# Fixed: this call was indented one level too deep, which raised an
# IndentationError outside of any block.
linear.fit(train_x, train_y)
# NOTE(review): this rebinds the class name LR to an instance, so LR()
# cannot be called again afterwards — confirm this line is intentional.
LR = LR()


pd.Series(y)

# OLS summary via statsmodels for comparison (add_constant supplies the
# intercept column).
x2 = sm.add_constant(X)
model = sm.OLS(y, x2)
result = model.fit()
print(result.summary())

y_pred = linear.predict(test_x)
print(y_pred)
print(list(test_y))

# Fixed: the original had a duplicated print whose first copy was missing
# the comma (SyntaxError), and both referenced the undefined name y_test;
# the split above produced test_y.
print('정확도 :', metrics.accuracy_score(test_y, y_pred))
Beispiel #24
0
# Predict 48-hour AKI (stage 2, overlap definition) with logistic regression.
# `data`, `Xtrain`, `Xtest` and `cutoff` are defined earlier in the script;
# rows before `cutoff` are the training split, the rest are the test split.
Y = data['future48_AKI2_overlap'].values
Ytrain, Ytest = Y[:cutoff], Y[cutoff:]
IDtrain = data['PAT_ENC_CSN_ID'].values[:cutoff]

# Keep only rows with a finite label (drops NaN-labelled rows).
selectTrain, selectTest = np.isfinite(Ytrain), np.isfinite(Ytest)
Xtrain, Ytrain, IDtrain = Xtrain[
    selectTrain, :], Ytrain[selectTrain], IDtrain[selectTrain]
Xtest, Ytest = Xtest[selectTest, :], Ytest[selectTest]

# Weight each row by 1/length of its patient's run so every patient
# contributes equally overall (pIndexSub rows unpack as start/stop/length).
pIndexSub = getPatientIndices(IDtrain)
sampleWeights = np.zeros(len(Xtrain))
for i in tqdm(range(len(pIndexSub))):
    start, stop, length = pIndexSub[i, :]
    sampleWeights[start:stop] = 1 / length

#X = np.concatenate((valueFeatures, timeFeatures, data2[['los']].values,
#                    data2[['creatinine']].values), axis = 1)

##############################################################################

from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import roc_auc_score as AUC
from Helper.utilities import showCoef
# C=1e-1 (small C = stronger L2 regularization); class_weight='balanced'
# compensates for class imbalance.  The sample weights computed above are
# deliberately not passed (see commented arguments below).
model = LR(class_weight='balanced', C=1e-1)

model.fit(Xtrain, Ytrain)  #,sample_weight = sampleWeights)
P = model.predict_proba(Xtest)[:, 1]
model.coef_
print(AUC(Ytest, P))  #, sample_weight = sampleWeights))

## performance is around 0.83 currently
target, features = targetFeatureSplit(data)

### training-testing split needed in regression, just like classification
# Fixed: sklearn.cross_validation was removed in scikit-learn 0.20; the
# current module is sklearn.model_selection (already used elsewhere in
# this file).
from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"

### Your regression goes here!
### Please name it reg, so that the plotting code below picks it up and
### plots it correctly. Don't forget to change the test_color above from "b" to
### "r" to differentiate training points from test points.

from sklearn.linear_model import LinearRegression as LR
reg = LR().fit(feature_train, target_train)
# Fixed: converted Python 2 print statements to Python 3 calls; the rest
# of the file targets Python 3.
print("Coeff: ", reg.coef_)
print("intercept: ", reg.intercept_)
print("Train Score: ", reg.score(feature_train, target_train))
print("Test Score: ", reg.score(feature_test, target_test))

### draw the scatterplot, with color-coded training and testing points
import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)
for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)

### labels for the legend
plt.scatter(feature_test[0], target_test[0], color=test_color, label="test")
plt.scatter(feature_test[0], target_test[0], color=train_color, label="train")
Beispiel #26
0
    def fit(self, X, y):  # -1 for unlabeled
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target vector relative to X
            Must be 0 or 1 for labeled and -1 for unlabeled instances

        Returns
        -------
        self : object
            Returns self.
        """

        # http://www.fabiangieseke.de/index.php/code/qns3vm

        # Split the data: label -1 marks unlabeled instances.
        unlabeledX = X[y == -1, :].tolist()
        labeledX = X[y != -1, :].tolist()
        labeledy = y[y != -1]

        # Fixed: Python 2 print statements converted to Python 3 calls.
        # These are stage-progress markers (1..4).
        print(1)

        # convert class 0 to -1 for tsvm
        labeledy[labeledy == 0] = -1
        labeledy = labeledy.tolist()

        print(2)

        # RBF kernel gets an explicit sigma; anything else uses the
        # QN_S3VM default (linear) kernel.
        if 'rbf' in self.kernel.lower():
            self.model = QN_S3VM(labeledX,
                                 labeledy,
                                 unlabeledX,
                                 self.random_generator,
                                 lam=self.C,
                                 lamU=self.lamU,
                                 kernel_type="RBF",
                                 sigma=self.gamma)
        else:
            self.model = QN_S3VM(labeledX,
                                 labeledy,
                                 unlabeledX,
                                 self.random_generator,
                                 lam=self.C,
                                 lamU=self.lamU)

        print(3)

        self.model.train()

        print(4)

        # probabilities by Platt scaling
        if self.probability:
            self.plattlr = LR()
            preds = self.model.mygetPreds(labeledX)
            self.plattlr.fit(preds.reshape(-1, 1), labeledy)

        # Fixed: the docstring promises "Returns self", but the method
        # previously fell off the end and returned None.
        return self
def svd_accuracy(file_name, ec, kwargs, folds=10, max_svs=10, max_init=15):
    """
    Classify data based on svd features.

    For each CV fold, computes the SVD of the class/feature
    cross-covariance (built from one-hot labels vs. centered features),
    projects the data onto sliding windows of right singular vectors, and
    scores a multinomial logistic regression on place/manner/vowel
    equivalence metrics.

    Parameters: file_name/ec/kwargs configure the project's ECoG dataset
    loader; folds is the number of CV folds; max_svs caps the projection
    width; max_init caps the starting singular-vector offset.

    Returns (pa, ma, va, u_s, s_s, v_s, init_list, nsvs_list): accuracy
    arrays of shape (folds, n_widths, n_offsets), the per-fold SVD
    factors, and the swept index lists.
    """
    # NOTE: mutates the caller's kwargs dict in place.
    kwargs['condense'] = False
    ds = ec.ECoG(file_name, which_set='train', **kwargs)
    n_classes = int(np.around(ds.y.max() + 1))
    max_svs = min(max_svs, n_classes)

    # Offsets into the singular vectors, and the projection widths swept.
    init_list = np.arange(0, n_classes - max_svs + 1)
    init_list = init_list[init_list < max_init]
    nsvs_list = np.arange(1, max_svs + 1)

    # inf-filled so never-written cells are obvious.
    pa = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    ma = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    va = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    u_s = np.zeros((folds, n_classes, n_classes))
    s_s = np.zeros((folds, n_classes))
    v_s = np.zeros((folds, n_classes, ds.X.shape[1]))
    ohf = OneHotFormatter(n_classes)

    for fold in range(folds):
        # Deep copy so the loader cannot mutate kwargs across folds.
        kwargs_copy = copy.deepcopy(kwargs)
        print('fold: {}'.format(fold))
        ds = ec.ECoG(file_name,
                     which_set='train',
                     fold=fold,
                     center=False,
                     **kwargs_copy)
        # CV
        ts = ds.get_test_set()
        vs = ds.get_valid_set()
        # Train on train+valid, centered with the training mean only.
        train_X = np.concatenate((ds.X, vs.X), axis=0)
        train_mean = train_X.mean(axis=0)
        train_X = train_X - train_mean
        train_y = np.concatenate((ds.y, vs.y), axis=0)
        test_X = ts.X - train_mean
        test_y = ts.y
        # Cross-covariance between one-hot labels and features, then SVD.
        y_oh = ohf.format(train_y, mode='concatenate')
        c_yx = (y_oh - y_oh.mean(axis=0)).T.dot(train_X) / train_X.shape[0]
        u, s, v = np.linalg.svd(c_yx, full_matrices=False)
        u_s[fold] = u
        s_s[fold] = s
        v_s[fold] = v
        for ii, n_svs in enumerate(nsvs_list):
            for jj, sv_init in enumerate(init_list):
                # Project onto n_svs right singular vectors starting at
                # offset sv_init, then fit the classifier on the projection.
                vp = v[sv_init:sv_init + n_svs]
                train_proj = train_X.dot(vp.T)
                test_proj = test_X.dot(vp.T)
                cl = LR(solver='lbfgs',
                        multi_class='multinomial').fit(train_proj,
                                                       train_y.ravel())
                y_hat = cl.predict(test_proj)
                # The *_equiv helpers return None when a pair is not
                # applicable to that metric; such pairs are skipped.
                p_results = []
                m_results = []
                v_results = []
                for y, yh in zip(test_y.ravel(), y_hat.ravel()):
                    pr = place_equiv(y, yh)
                    if pr is not None:
                        p_results.append(pr)
                    mr = manner_equiv(y, yh)
                    if mr is not None:
                        m_results.append(mr)
                    vr = vowel_equiv(y, yh)
                    if vr is not None:
                        v_results.append(vr)
                pa[fold, ii, jj] = np.array(p_results).mean()
                ma[fold, ii, jj] = np.array(m_results).mean()
                va[fold, ii, jj] = np.array(v_results).mean()
    return pa, ma, va, u_s, s_s, v_s, init_list, nsvs_list
    def run_CV(self):
        """Run 10-fold CV active learning with a weak oracle and an auditor.

        Folds are built from a seed-3 shuffle of all instances.  Per fold,
        a weak oracle is fit on the transferred labels; during the query
        loop each selected example either receives the transferred (weak)
        label or a strong (human) label, and per-query accuracies are
        accumulated.  Curves are written via the module-level writeFile to
        "<modelVersion>_auditor_acc.txt", "_acc.txt" and "_human_acc.txt".
        """
        cvIter = 0

        totalInstanceNum = len(self.m_targetLabel)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]

        totalTransferNumList = []
        # Fixed seed so the fold split is reproducible.
        np.random.seed(3)
        np.random.shuffle(indexList)

        foldNum = 10
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []

        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex *
                                              foldInstanceNum:(foldIndex + 1) *
                                              foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)

        # Last fold absorbs the remainder of an uneven split.
        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)

        totalAccList = [[] for i in range(10)]
        humanAccList = [[] for i in range(10)]

        correctTransferRatioList = []
        totalTransferNumList = []
        correctUntransferRatioList = []

        totalAuditorPrecisionList = []
        totalAuditorRecallList = []
        totalAuditorAccList = []

        for foldIndex in range(foldNum):

            # Fresh models per fold: target classifier, auditor, weak oracle.
            self.m_clf = LR(random_state=3)
            self.m_judgeClassifier = LR(random_state=3)
            self.m_weakOracle = LR(random_state=3)

            # Training set = all folds except the held-out one.
            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])

            test = foldInstanceList[foldIndex]
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            trainNum = int(totalInstanceNum * 0.9)

            targetNameFeatureTrain = self.m_targetNameFeature[train]
            targetLabelTrain = self.m_targetLabel[train]
            transferLabelTrain = self.m_transferLabel[train]
            # targetDataFeatureTrain = self.m_targetDataFeature[train]

            # The weak oracle learns to reproduce the transferred labels.
            self.m_weakOracle.fit(targetNameFeatureTrain, transferLabelTrain)

            targetNameFeatureTest = self.m_targetNameFeature[test]
            targetLabelTest = self.m_targetLabel[test]

            transferLabelTest = self.m_transferLabel[test]

            # Seed examples chosen by pre-training.
            initExList = []
            initExList = self.pretrainSelectInit(train)
            # random.seed(101)
            # initExList = random.sample(train, 3)

            targetNameFeatureInit = self.m_targetNameFeature[initExList]
            targetLabelInit = self.m_targetLabel[initExList]

            print("initExList\t", initExList, targetLabelInit)

            queryIter = 0
            labeledExList = []
            unlabeledExList = []
            ###labeled index
            labeledExList.extend(initExList)
            unlabeledExList = list(set(train) - set(labeledExList))

            # Starts at 3.0 to account for the three seed labels.
            activeLabelNum = 3.0
            transferLabelNum = 0.0
            transferFeatureList = []
            transferFlagList = []

            featureDim = len(targetNameFeatureTrain[0])
            self.init_confidence_bound(featureDim, labeledExList,
                                       unlabeledExList)

            targetNameFeatureIter = targetNameFeatureInit
            targetLabelIter = targetLabelInit

            correctTransferLabelNum = 0.0
            wrongTransferLabelNum = 0.0
            correctUntransferLabelNum = 0.0
            wrongUntransferLabelNum = 0.0

            # auditorPrecisionList = []
            # auditorRecallList = []
            auditorAccList = []

            # `rounds` is a module-level budget of active (human) labels.
            while activeLabelNum < rounds:

                # targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
                # targetLabelIter = self.m_targetLabel[labeledExList]

                # Refit on everything labeled so far each iteration.
                self.m_clf.fit(targetNameFeatureIter, targetLabelIter)

                exId = self.select_example(unlabeledExList)
                self.update_select_confidence_bound(exId)

                # print(idx)
                # Decide whether to trust the transferred label for exId.
                activeLabelFlag = False
                transferLabelFlag, transferLabel = self.get_transfer_flag(exId)

                exLabel = -1
                if transferLabelFlag:
                    # Accept the weak (transferred) label — no human cost.
                    self.m_weakLabeledIDList.append(exId)
                    transferLabelNum += 1.0
                    activeLabelFlag = False

                    exLabel = transferLabel
                    targetNameFeatureIter = np.vstack(
                        (targetNameFeatureIter,
                         self.m_targetNameFeature[exId]))
                    targetLabelIter = np.hstack((targetLabelIter, exLabel))
                    # targetNameFeatureIter.append(self.m_targetNameFeature[exId])
                    # targetLabelIter.append(exLabel)

                    if exLabel == self.m_targetLabel[exId]:
                        print("correct transfer queryIter\t", queryIter)
                        correctTransferLabelNum += 1.0
                    else:
                        wrongTransferLabelNum += 1.0
                        print("query iteration", queryIter,
                              "error transfer label\t", exLabel, "true label",
                              self.m_targetLabel[exId])
                else:
                    # Fall back to a strong (human) label; costs budget.
                    self.m_strongLabeledIDList.append(exId)
                    # self.update_judge_confidence_bound(exId)
                    activeLabelNum += 1.0
                    activeLabelFlag = True

                    exLabel = self.m_targetLabel[exId]
                    targetNameFeatureIter = np.vstack(
                        (targetNameFeatureIter,
                         self.m_targetNameFeature[exId]))
                    targetLabelIter = np.hstack((targetLabelIter, exLabel))
                    # targetNameFeatureIter.append(self.m_targetNameFeature[exId])
                    # targetLabelIter.append(exLabel)

                    # Record auditor training pairs: did the transferred
                    # label agree with the human label?
                    if transferLabel == exLabel:
                        correctUntransferLabelNum += 1.0
                        transferFlagList.append(1.0)
                        transferFeatureList.append(
                            self.m_targetNameFeature[exId])
                    else:
                        wrongUntransferLabelNum += 1.0
                        transferFlagList.append(0.0)
                        transferFeatureList.append(
                            self.m_targetNameFeature[exId])

                    auditorAcc = self.getAuditorMetric(transferFeatureList,
                                                       transferFlagList,
                                                       targetNameFeatureTest,
                                                       transferLabelTest,
                                                       targetLabelTest)
                    print("auditorAcc", auditorAcc)

                    auditorAccList.append(auditorAcc)

                labeledExList.append(exId)
                unlabeledExList.remove(exId)

                acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest,
                                        targetNameFeatureIter, targetLabelIter)
                totalAccList[cvIter].append(acc)
                if activeLabelFlag:
                    humanAccList[cvIter].append(acc)
                queryIter += 1

            totalAuditorAccList.append(auditorAccList)

            transferLabelNum = len(self.m_weakLabeledIDList)
            totalTransferNumList.append(transferLabelNum)

            cvIter += 1

        print("transfer num\t", np.mean(totalTransferNumList),
              np.sqrt(np.var(totalTransferNumList)))

        AuditorAccFile = modelVersion + "_auditor_acc.txt"
        writeFile(totalAuditorAccList, AuditorAccFile)

        totalACCFile = modelVersion + "_acc.txt"
        writeFile(totalAccList, totalACCFile)

        humanACCFile = modelVersion + "_human_acc.txt"
        writeFile(humanAccList, humanACCFile)
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature into [-1, 1]; data_min_/data_max_ hold the
# per-feature extremes seen during fit.
scalar = MinMaxScaler(feature_range=(-1, 1))
scalar_fit = scalar.fit(X)
dmin = scalar.data_min_
dmax = scalar.data_max_
Xnorm = scalar.transform(X)

# sample weights: each class weighted by the other class's frequency so
# the rarer class counts more.
yrat = np.sum(y == 1) / len(y)
xrat = 1 - yrat
s_weights = np.zeros(len(y))
s_weights[y == 0] = yrat
s_weights[y == 1] = xrat

# Logistic Regression (class_weight='balanced' re-weights during the fit;
# s_weights is applied only to the log-loss metric below).
clf = LR(penalty='l2', class_weight='balanced').fit(Xnorm, y)
preds = clf.predict_proba(Xnorm)[:, 1]  # P(class 1) on the training data
class_preds = np.round(preds)  # 0.5 decision threshold
# Fixed: s_weights was passed positionally, landing in log_loss's third
# parameter slot (not sample_weight); it must be passed by keyword.
ll = log_loss(y, preds, sample_weight=s_weights)
accuracy = np.sum(class_preds == y) / len(y)
# Fixed: precision_recall_fscore_support expects (y_true, y_pred); the
# arguments were swapped, silently exchanging precision and recall.
prfs = precision_recall_fscore_support(y, class_preds, average='weighted')

# plot hist of preds
import matplotlib.pyplot as plt
plt.hist(preds, bins=20)
plt.title("Label prediction distribution (balanced dataset)")
plt.xlabel("Label prediction")
plt.ylabel("Count")
plt.show()

# feature ranking
Beispiel #30
0
def main():
    """Run the multiclass SVM / kNN experiments on the USPS digits.

    Loads zip.train / zip.test, reduces dimensionality with PCA (95%
    explained variance), sweeps C over C_VALS for linear, cubic (OvO/OvR)
    and Gaussian (OvO/OvR) SVMs, compares against an OvO logistic
    regression baseline and a Gaussian-weighted kNN, writes error plots
    under hw7/, and prints the best error/parameter per model family.
    """
    # load data
    # training data; column 0 is the digit label, last column is dropped
    data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'usps',
                                    'zip.train'),
                       header=None,
                       delimiter=' ').iloc[:, :-1]
    y_train = data.pop(0).values
    X_train = data.values

    # test data
    data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data', 'usps',
                                    'zip.test'),
                       header=None,
                       delimiter=' ')
    y_test = data.pop(0).values
    X_test = data.values

    # Keep enough components for 95% of the variance; the test set is
    # projected with the train-fitted PCA.
    pca = PCA(n_components=.95)
    pca.fit(X_train)

    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    svm_errs = []
    with tqdm(desc="Problem 1", total=len(C_VALS)) as pbar:
        for C in C_VALS:
            svm = SVC(C=C, kernel='linear', decision_function_shape='ovo')
            svm.fit(X_train, y_train)
            pbar.update(1)

            svm_errs.append(1 - svm.score(X_test, y_test))

    # Logistic-regression baseline, one-vs-one.
    lr = OVO(LR(solver='lbfgs', max_iter=5000))
    lr.fit(X_train, y_train)
    lr_score = lr.score(X_test, y_test)
    err_plot([svm_errs], ["SVM"],
             lr=1. - lr_score,
             title="One vs. One Linear SVM",
             out='hw7/ovo_linear_svm.pdf')

    ovo_svm_errs = []
    with tqdm(desc="Problem 2", total=len(C_VALS)) as pbar:
        for C in C_VALS:
            svm = OVO(SVC(C=C, kernel='poly', degree=3, gamma='auto'))
            svm.fit(X_train, y_train)
            pbar.update(1)

            ovo_svm_errs.append(1 - svm.score(X_test, y_test))

    err_plot([ovo_svm_errs], ["OvO SVM"],
             lr=1. - lr_score,
             title="One vs. One Cubic SVM",
             out='hw7/ovo_cubic_svm.pdf')

    ovr_svm_errs = []
    with tqdm(desc="Problem 3", total=len(C_VALS)) as pbar:
        for C in C_VALS:
            svm = OVR(SVC(C=C, kernel='poly', degree=3, gamma='auto'))
            svm.fit(X_train, y_train)
            pbar.update(1)

            ovr_svm_errs.append(1 - svm.score(X_test, y_test))

    err_plot([ovo_svm_errs, ovr_svm_errs], ["OvO SVM", "OvR SVM"],
             lr=1. - lr_score,
             title="One vs. Rest Cubic SVM/OvO Cubic",
             out='hw7/ovr_cubic_svm.pdf')

    n = 5
    # ensuring that we have at least n neighbors for all classes in the
    # sample
    while True:
        index = np.random.choice(X_train.shape[0], 100, replace=False)

        X_sample = X_train[index]
        y_sample = y_train[index]

        # can use a list comprehension to check
        if all([
                len(X_sample[y_sample == y_i]) >= n
                for y_i in np.unique(y_sample)
        ]):
            break

    # Estimate a kernel width: mean distance to the n-th same-class
    # neighbor over the sample.
    dists = []
    for X_i, y_i in zip(X_sample, y_sample):
        X_cls = X_sample[y_sample == y_i]
        nbrs = NearestNeighbors(n_neighbors=n)
        nbrs.fit(X_cls)
        # need to use reshape b/c single sample
        # (the original try/except only re-raised the same ValueError, so
        # it was removed as a no-op)
        distances, _ = nbrs.kneighbors(X_i.reshape(1, -1))
        dists.append(distances[-1])

    global SIGMA
    SIGMA = np.mean(dists)

    ovo_gauss_svm_errs = []
    with tqdm(desc="Problem 4 (SVM)", total=len(C_VALS),
              file=sys.stdout) as pbar:
        for C in C_VALS:
            svm = OVO(SVC(C=C, kernel='rbf', gamma=1. / (2. * SIGMA**2)))
            #            svm = SVC(C=C, kernel='rbf', gamma=1. / (2. * SIGMA ** 2),
            #                      decision_function_shape='ovo')
            svm.fit(X_train, y_train)
            score = svm.score(X_test, y_test)
            pbar.update(1)

            ovo_gauss_svm_errs.append(1 - score)

    knn_errs = []
    with tqdm(desc="Problem 4 (kNN)",
              total=len(np.arange(3, 11)),
              file=sys.stdout) as pbar:
        for k in np.arange(3, 11):
            knn = KNeighborsClassifier(n_neighbors=k, weights=gaussian)
            knn.fit(X_train, y_train)
            pbar.update(1)

            knn_errs.append((k, 1 - knn.score(X_test, y_test)))

    err_plot([ovo_gauss_svm_errs], ["OvO SVM"],
             knn=knn_errs,
             title="One vs. One Gaussian SVM with kNN",
             out='hw7/ovo_gaussian_svm_knn.pdf')

    ovr_gauss_svm_errs = []
    with tqdm(desc="Problem 5", total=len(C_VALS), file=sys.stdout) as pbar:
        for C in C_VALS:
            svm = OVR(SVC(C=C, kernel='rbf', gamma=1. / (2. * SIGMA**2)))
            #            svm = SVC(C=C, kernel='rbf', gamma=1. / (2. * SIGMA ** 2),
            #                      decision_function_shape='ovr')
            svm.fit(X_train, y_train)
            score = svm.score(X_test, y_test)
            pbar.update(1)

            ovr_gauss_svm_errs.append(1 - score)

    err_plot([ovr_gauss_svm_errs], ["OvR SVM"],
             knn=knn_errs,
             title="One vs. Rest Gaussian SVM with kNN",
             out='hw7/ovr_gaussian_svm_knn.pdf')

    err_plot([
        svm_errs, ovo_svm_errs, ovr_svm_errs, ovo_gauss_svm_errs,
        ovr_gauss_svm_errs
    ], [
        "Linear SVM", "OvO Cubic SVM", "OvR Cubic SVM", "OvO Gaussian SVM",
        "OvR Gaussian SVM"
    ],
             lr=1. - lr_score,
             knn=knn_errs,
             title="Multiclass SVM Kernels",
             out='hw7/all_svm_knn.pdf')

    min_idx = np.argmin(svm_errs)
    min_lin_err = svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min Linear SVM Error = {0:.4f}".format(min_lin_err))
    print("Min Linear SVM log2(C) = {0}".format(min_lin_c))
    print("LR Error = {0:.4f}".format(1. - lr_score))

    min_idx = np.argmin(ovo_svm_errs)
    min_lin_err = ovo_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvO Cubic SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvO Cubic SVM log2(C) = {0}".format(min_lin_c))

    min_idx = np.argmin(ovr_svm_errs)
    min_lin_err = ovr_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvR Cubic SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvR Cubic SVM log2(C) = {0}".format(min_lin_c))

    # Fixed: knn_errs holds (k, error) pairs, so np.argmin over it indexes
    # the *flattened* 2-column array and could point past the end of the
    # list (or at a k value).  Select the pair with the smallest error.
    min_idx = min(range(len(knn_errs)), key=lambda i: knn_errs[i][1])
    min_lin_k, min_lin_err = knn_errs[min_idx]
    print("Min kNN Error = {0:.4f}".format(min_lin_err))
    print("Min kNN log2(C) = {0}".format(min_lin_k))

    min_idx = np.argmin(ovo_gauss_svm_errs)
    min_lin_err = ovo_gauss_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvO Gaussian SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvO Gaussian SVM log2(C) = {0}".format(min_lin_c))

    min_idx = np.argmin(ovr_gauss_svm_errs)
    min_lin_err = ovr_gauss_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvR Gaussian SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvR Gaussian SVM log2(C) = {0}".format(min_lin_c))

    print("sigma = {0:.4f}".format(SIGMA))