# AGE

# FILL OUT AGE VIA LINEAR REGRESSION
# CREATE THE TRAINING SETS AND SET SEX TO BE 0 OR 1
X_train_age = train.dropna(subset=['Age']).drop(
    ['Cabin', 'Age', 'Name', 'Ticket', 'Embarked'], axis=1)
X_train_age['Sex'] = X_train_age['Sex'].map({'male': 0, 'female': 1})
y_train_age = train.dropna(subset=['Age'])['Age']
# PREPARE THE PREDICTION SET AND SET SEX TO BE 0 OR 1
X_pred_age = train[np.invert(train.index.isin(X_train_age.index))].drop(
    ['Cabin', 'Age', 'Name', 'Ticket', 'Embarked'], axis=1)
X_pred_age['Sex'] = X_pred_age['Sex'].map({'male': 0, 'female': 1})

# CREATE AND FIT THE MODEL
lm = LR()
lm.fit(X_train_age, y_train_age)

# PREDICT AGES AND INSERT
train.loc[np.isnan(train['Age']), 'Age'] = lm.predict(X_pred_age)
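# Hedged sanity check (not part of the original snippet): a plain linear
# regression can predict negative ages, so clip at zero and confirm that no
# missing values remain after the fill.
train['Age'] = train['Age'].clip(lower=0)
assert train['Age'].isnull().sum() == 0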


# IMPUTATION
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]

    if pd.isnull(Age):
        if Pclass == 1:
            return 37
        elif Pclass == 2:
# Example #2
def calculate_probability_distribution(tree , instances , index , cal_method =None):

	if cal_method == None :
		return tree.distribution_for_instance(instances.get_instance(index))

	elif cal_method == 'Platt' :

		p_train = np.zeros(shape=(instances.num_instances,1))
		y_train = np.zeros(shape=(instances.num_instances,1))

		for i,instance in enumerate(instances) :
		    dist = tree.distribution_for_instance(instance)
		    p_train[i] = [ (dist[1] - 0.5)*2.0 ]
		    y_train[i] = [instance.get_value(instance.class_index)]

		# print("p_train ====>>>" , p_train)
		# print("y_train ====>>>" , y_train)

		dist = (tree.distribution_for_instance(instances.get_instance(index))[1]-0.5)*2.0
		tmp = np.zeros(shape=(1,1))
		tmp[0] = [dist]

		print(np.sum(y_train))
		if np.sum(y_train) in [len(y_train),0]:
			print("all one class")
			for ins in instances : 
				print("ins ===> " , ins)
			return tree.distribution_for_instance(instances.get_instance(index))

		else :

			warnings.filterwarnings("ignore", category=FutureWarning)
			lr = LR(solver='lbfgs')                                                      
			lr.fit( p_train , np.ravel(y_train,order='C') )

			return lr.predict_proba( tmp.reshape(1, -1))[0]


	elif cal_method == 'Isotonic' :

		p_train = np.zeros(shape=(instances.num_instances,1))
		y_train = np.zeros(shape=(instances.num_instances,1))

		for i,instance in enumerate(instances) :
		    dist = tree.distribution_for_instance(instance)
		    p_train[i] = [ dist[1] ]
		    y_train[i] = [instance.get_value(instance.class_index)]


		dist = tree.distribution_for_instance(instances.get_instance(index))[1]
		tmp = np.zeros(shape=(1,1))
		tmp[0] = [dist]

		print(np.sum(y_train))
		if np.sum(y_train) in [len(y_train),0]:
			print("all one class")
			for ins in instances : 
				print("ins ===> " , ins)
			return tree.distribution_for_instance(instances.get_instance(index))

		else :

			ir = IR( out_of_bounds = 'clip' )
			ir.fit(np.ravel(p_train,order='C')  , np.ravel(y_train,order='C'))

			p = ir.transform( np.ravel(tmp,order='C'))[0]
			return [p,1-p]
			
	# elif cal_method == 'ProbabilityCalibrationTree' :
	# 	pass


	elif cal_method == 'ICP' :


		pass
	elif cal_method == 'Venn1' :
		calibrPts = []
		
		for i,instance in enumerate(instances) :
		    dist = tree.distribution_for_instance(instance)
		    score = dist[0] if  dist[1] < dist[0] else dist[1]
		    calibrPts.append( ( (score) , instance.get_value(instance.class_index) ) ) 
		    

		dist = (tree.distribution_for_instance(instances.get_instance(index)))
		score = dist[0] if dist[1] < dist[0] else dist[1]
		tmp = [score]

		p0,p1=VennABERS.ScoresToMultiProbs(calibrPts,tmp)
		print("Vennnnnn =========>>>>>>>>>>>>  ", p0, "  , ",p1)
		return [p0,p1]
		pass
    thresholds = np.concatenate([mate_dists, nonmate_dists])
    thresholds.sort()
    thresholds = np.insert(thresholds, 0, 0)  # add 0 threshold
    thresholds = np.around(thresholds, 4)
    thresholds = np.unique(thresholds)

    fp = np.sum(nonmate_dists[:, np.newaxis] <= thresholds[np.newaxis, :],
                axis=0)
    fpr = fp.astype(float) / len(nonmate_dists)
    chosen_index = np.argmin(abs(fpr - 1e-4))
    thresh = thresholds[chosen_index]

    tp = np.sum(mate_dists[:, np.newaxis] <= thresholds[np.newaxis, :], axis=0)
    tpr = tp.astype(float) / len(mate_dists)

    lr = LR(fit_intercept=False)
    dists = np.concatenate([mate_dists, nonmate_dists]) - thresh

    # y = classification where 1 is nonmate
    y = np.ones(dists.shape, dtype=int)
    y[:len(mate_dists)] = 0
    lr.fit(dists[:, np.newaxis], y)

    # Prob = 1 / (1 + exp(- alpha * dist))
    alpha = lr.coef_[0, 0]

    print("\nNet %s threshold=%f, \tplatt's scaling=%f" % (
        net,
        thresh,
        alpha,  # lr.intercept_
    ))
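    # Hedged sketch (not in the original excerpt): with fit_intercept=False the
    # fitted model reduces to a sigmoid of alpha * (dist - thresh), where y = 1
    # means nonmate, as in the fit above.
    def platt_prob_nonmate(dist):
        return 1.0 / (1.0 + np.exp(-alpha * (dist - thresh)))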
import numpy as np
import pandas as pd
import sys
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

filename = 'telco.xls'
data = pd.read_excel(filename)
data.head()

x = data.iloc[:, :37].values
y = data.iloc[:, 37].values

from sklearn.linear_model import LogisticRegression as LR

lr = LR()  # build the logistic regression model
lr.fit(x, y)  # train the model on the selected feature data
print(u'Logistic regression model training finished.')
print(u'Average accuracy of the model: %s' % lr.score(x, y))  # average accuracy; 77.8% in this example


def cm_plot(y, yp):
    from sklearn.metrics import confusion_matrix  # import the confusion-matrix function

    cm = confusion_matrix(y, yp)  # confusion matrix

    import matplotlib.pyplot as plt  # import the plotting library
    plt.matshow(cm, cmap=plt.cm.Greens)  # plot the confusion matrix with the cm.Greens colormap (see the docs for other styles)
    plt.colorbar()  # colour bar

    for x in range(len(cm)):  # annotate each cell with its count
        for y_ in range(len(cm)):
            plt.annotate(cm[x, y_], xy=(x, y_),
                         horizontalalignment='center',
                         verticalalignment='center')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return plt
# (The next excerpt resumes mid-plot: the per-class scatter of the two LDA
#  components, whose call ends in `marker=m`, is truncated here.)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.title('Linear Discriminant Analysis - 2 discriminants')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

pause()

# let's diverge a bit from the book and run Logistic Regression on the
# transformed data set and test set.

A = X_train_lda.shape
X_train_lda = X_train_lda.reshape(A[0], A[1])
lr = LR(multi_class='ovr', solver='lbfgs', C=.05)
lr.fit(X_train_lda, y_train)

plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.xlim((-3, 3))
plt.ylim((-3, 3))
plt.title('Logistic Regression with LDA k = 2 wine data set')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()

pause()

# now use the test data to predict and compare to the class
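# Hedged sketch of the step announced above (assumes X_test_lda and y_test exist,
# produced by the same LDA transform and split as X_train_lda / y_train).
y_test_pred = lr.predict(X_test_lda)
print('Test accuracy: %.3f' % lr.score(X_test_lda, y_test))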
# Example #6
ridge = Ridge(alpha=0.0001).fit(X_train, y_train)
print("Ridge Score train set : {}".format(ridge.score(X_train,
                                                      y_train.ravel())))
print("Ridge Score test set :  {}\n ".format(ridge.score(X_test, y_test)))

lasso = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Lasso 0.01 Score train set : {}".format(lasso.score(X_train, y_train)))
print("Lasso 0.01 Score test set :  {}\n ".format(lasso.score(X_test, y_test)))

lasso_ = Lasso(alpha=0.00001, max_iter=100000).fit(X_train, y_train)
print("Lasso 0.00001 Score train set : {}".format(
    lasso_.score(X_train, y_train)))
print("Lasso 0.00001 Score test set :  {}\n ".format(
    lasso_.score(X_test, y_test)))

LinReg = LR().fit(X_train, y_train)
print("Linear Regression Train set : {}".format(LinReg.score(X_train,
                                                             y_train)))
print("Linear Regression Test set :  {}\n ".format(LinReg.score(
    X_test, y_test)))

n_esti = 100
forest = RandomForestRegressor(n_estimators=n_esti, random_state=0)
forest.fit(X_train, y_train)
print("Forest n_esti {} Score train set : {}".format(
    n_esti, forest.score(X_train, y_train)))
print("Forest n_esti {} Score test set :  {}\n ".format(
    n_esti, forest.score(X_test, y_test)))

lr, max_depth = 0.1, 5
gbrt_mdlow = GradientBoostingRegressor(random_state=0,
    df.to_csv(f'../data/{fn}_all.csv')
    gc.collect()


df = initial_df('../data/use_for_predictions.csv')
df, y = bin_df_get_y(df)
ac = ['diff', 'color', 'time', 'game_time', 'weekday', 'elo', 'opp_elo',
      'game_num']
df = df[ac].copy()
X = df.values

# Linear Discriminant Analysis
ld_cls = LDA(solver='lsqr')

# Logistic Regression
lr_cls = LR(C=0.01, max_iter=50, tol=7.5e-3, class_weight=None,
            solver='saga', random_state=5, multi_class='ovr')

# KNeighbors Classifier
kn_cls = KNNc(n_neighbors=41, weights='uniform', algorithm='brute',
              metric='chebyshev')

# Ridge Classifier
rd_cls = RdC(fit_intercept=False, class_weight=None, solver='lsqr',
             random_state=5)

# Random Forest Classifier
rf_cls = RFC(n_estimators=200, max_depth=10, min_samples_split=2,
             min_samples_leaf=3, max_features=None, class_weight=None,
             criterion='entropy', random_state=5)

# Extra Trees Classifier
# Example #8
y = np.zeros(num)
yTest = np.zeros(num)
for i in range(tmp.size):
    if tmp[i] >= 0.5:
        y[i] = 1
    if tmpTest[i] >= 0.5:
        yTest[i] = 1

# Shuffle x and y in the same order by saving and restoring the RNG state
state = np.random.get_state()
np.random.shuffle(x)
np.random.set_state(state)
np.random.shuffle(y)

scaler = preprocessing.StandardScaler().fit(x)
xscaled = scaler.transform(x)
# Transform the test set with the scaler fitted on the training data
xTestScaled = scaler.transform(xTest)

lr = LR(solver='newton-cg', penalty='l2', max_iter=100000, tol=1e-5)
lrcv = LRCV(solver='newton-cg', penalty='l2', max_iter=100000, tol=1e-5)

lrcv.fit(xscaled, y)
yPred = lrcv.predict(xscaled)
yTestPred = lrcv.predict(xTestScaled)

acc1 = (yPred == y).sum()/yPred.size
acc2 = (yTestPred == yTest).sum()/yTestPred.size

print('LR train acc is {:.4f}'.format(acc1))  # training accuracy
print('LR test acc is {:.4f}'.format(acc2))  # test accuracy
# Example #9
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression as LR

y_df = pd.read_csv("bias_score_reg.csv")

y_data = y_df["x-values"].values.tolist()

x_data = []

for i in range(len(y_data)):
    x_data.append(i)

x_data = np.array(x_data).reshape(-1, 1)
y_data = np.array(y_data).reshape(-1, 1)

bias_model = LR()

bias_model.fit(x_data, y_data)

print(bias_model.coef_)
tsne_data = tsne_visual(tsne_data)
plt.show()

#2. Dimension reduction with various models: RF; LR; XGBOOST; GradB; All w/ RFE as it is more conservative
data = data.iloc[:, :-1]

X_train, X_test, y_train, y_test = train_test_split(data,
                                                    targets,
                                                    test_size=0.15)

#Instantiating all models here with some basic values
forest = RFC(n_estimators=250, random_state=42)
gbc = GBC(n_estimators=250, random_state=42)
xgbc = xgb.XGBClassifier(objective='reg:logistic', n_estimators=250, seed=42)
logit = LR(solver='lbfgs', max_iter=300, random_state=42)


def model_reduce(estimator, n_features, X, y, verbose=1):
    rfe = RFE(estimator=estimator, n_features_to_select=n_features)
    rfe.fit(X, y)
    rf_mask = rfe.support_
    if verbose == 1:
        rfe_best_features(estimator, X, rfe)
    else:
        pass
    return rf_mask
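
# Hedged usage sketch (not in the original excerpt): run the RFE reduction with each
# estimator instantiated above; n_features=20 is an illustrative value, and verbose=0
# skips the rfe_best_features reporting helper defined below.
feature_masks = {name: model_reduce(est, 20, X_train, y_train, verbose=0)
                 for name, est in [('forest', forest), ('gbc', gbc),
                                   ('xgbc', xgbc), ('logit', logit)]}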


def rfe_best_features(model, data, rfe):
    '''Lower ranking= Better'''
        label_ = self.predict(x)
        score_ = score_sup(label_, y)
        return score_


X, y = load_data()

y = y[:, np.newaxis]
"""
Select a subset of the data as the test set

flag_choose=np.arange(np.shape(y)[0])
flag_train=(flag_choose%8!=0)
flag_test=(flag_choose%8==0)
"""
for i in range(5):
    model = LogisticRegression(learning_rate=0.01,
                               itr=200,
                               batch_size=1,
                               verbose=False)
    model.fit(X, y)
    print(i, "model's weights: ", model.weights.ravel())
    #print(i,"model's precision : ",model.score(X[flag_test],y[flag_test]))

model_sklearn = LR(C=1)
model_sklearn.fit(X, y.ravel())
print("sklearn weights: ", model_sklearn.coef_, model_sklearn.intercept_)

#print("sklearn's precision :",model_sklearn.score(X[flag_test],y[flag_test].ravel()))
        if key[0] == i:
            count += 1
    print("Variable %s has %s features" %(train.columns[i+1], count))
    featureField.append([i+2, count])

# get one-hot encoded train, dev and test sets
OneHotTrainNaive = oneHotEncoding(train.iloc[:,1:-1], featureMap)
OneHotDevNaive = oneHotEncoding(dev.iloc[:,1:-1], featureMap)
OneHotTestNaive = oneHotEncoding(test.iloc[:,1:], featureMap)

OneHotTrainNaive = pd.concat([OneHotTrainNaive, train.iloc[:,-1]], axis=1)
OneHotDevNaive = pd.concat([OneHotDevNaive, dev.iloc[:,-1]], axis=1)
OneHotTestNaive = pd.concat([OneHotTestNaive], axis=1)

# fitting regression model with sklearn
linearReg = LR()
linearReg.fit(OneHotTrainNaive.iloc[:,:-1],OneHotTrainNaive.iloc[:,-1])
devPred = linearReg.predict(OneHotDevNaive.iloc[:,:-1])
rmsleVan = rmse(devPred,OneHotDevNaive.iloc[:,-1])
print('\nRoot Mean Square Log Error for Naive implementations: %s' % (rmsleVan))

# get top 10 positive and negative features
coeff = linearReg.coef_
topFeat = np.argsort(coeff)[-10:]
bottomFeat = np.argsort(coeff)[:10]

print("\nTop 10 Positive Features:")
for x in topFeat:
    print("Variable: %s, Value: %s" %(train.columns[featureReMap[x][0]+1], featureReMap[x][1]))

print("\nTop 10 Negative Features:")
# Example #13
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer


def clean(text):
    return html.fromstring(text).text_content().lower().strip()


tr_data = pd.read_csv('/media/datasets/kaggle_imdb/labeledTrainData.tsv',
                      delimiter='\t')
te_data = pd.read_csv('/media/datasets/kaggle_imdb/testData.tsv',
                      delimiter='\t')

trX = [clean(text) for text in tr_data['review'].values]
trY = tr_data['sentiment'].values

vect = TfidfVectorizer(min_df=10, ngram_range=(1, 2))
trX = vect.fit_transform(trX)

model = LR()
model.fit(trX, trY)

ids = te_data['id'].values
teX = [clean(text) for text in te_data['review'].values]
teX = vect.transform(teX)
pr_teX = model.predict_proba(teX)[:, 1]

pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv('test.csv',
                                                 index=False,
                                                 header=["id", "sentiment"])
# Example #14
def main(args):
    print(args)
    now = str(datetime.datetime.now())

    sess = tf.Session()

    # Off-plane sticker projection
    logo = tf.placeholder(tf.float32,
                          shape=[None, 400, 900, 3],
                          name='logo_input')
    param = tf.placeholder(tf.float32, shape=[None, 1], name='param_input')
    ph = tf.placeholder(tf.float32, shape=[None, 1], name='ph_input')
    result = projector(param, ph, logo)

    # Union of the sticker and face image
    mask_input = tf.placeholder(tf.float32,
                                shape=[None, 900, 900, 3],
                                name='mask_input')
    face_input = tf.placeholder(tf.float32,
                                shape=[None, 600, 600, 3],
                                name='face_input')
    theta = tf.placeholder(tf.float32, shape=[None, 6], name='theta_input')
    prepared = stn(result, theta)

    # Transformation to ArcFace template
    theta2 = tf.placeholder(tf.float32, shape=[None, 6], name='theta2_input')
    united = prepared[:,300:,150:750]*mask_input[:,300:,150:750]+\
                                    face_input*(1-mask_input[:,300:,150:750])
    final_crop = tf.clip_by_value(stn(united, theta2, (112, 112)), 0., 1.)

    # TV loss and gradients
    w_tv = tf.placeholder(tf.float32, name='w_tv_input')
    tv_loss = TVloss(logo, w_tv)

    grads_tv = tf.gradients(tv_loss, logo)
    grads_input = tf.placeholder(tf.float32,
                                 shape=[None, 112, 112, 3],
                                 name='grads_input')
    grads1 = tf.gradients(final_crop, logo, grad_ys=grads_input)

    # Various image generators
    class Imgen(object):
        def __init__(self):
            self.fdict = {ph:[[args.ph]],\
                                      logo:np.ones((1,400,900,3)),\
                                      param:[[args.param]],\
                                      theta:1./args.scale*np.array([[1.,0.,-args.x/450.,0.,1.,-args.y/450.]]),\
                                      theta2:[[1.,0.,0.,0.,1.,0.]],\
                                      w_tv:args.w_tv}
            mask = sess.run(prepared, feed_dict=self.fdict)
            self.fdict[mask_input] = mask

        def gen_fixed(self, im, advhat):
            self.fdict[face_input] = np.expand_dims(im, 0)
            self.fdict[logo] = np.expand_dims(advhat, 0)
            return self.fdict, sess.run(final_crop, feed_dict=self.fdict)

        def gen_random(self, im, advhat, batch=args.batch_size):
            alpha1 = np.random.uniform(-1., 1., size=(batch, 1)) / 180. * np.pi
            scale1 = np.random.uniform(args.scale - 0.02,
                                       args.scale + 0.02,
                                       size=(batch, 1))
            y1 = np.random.uniform(args.y - 600. / 112.,
                                   args.y + 600. / 112.,
                                   size=(batch, 1))
            x1 = np.random.uniform(args.x - 600. / 112.,
                                   args.x + 600. / 112.,
                                   size=(batch, 1))
            alpha2 = np.random.uniform(-1., 1., size=(batch, 1)) / 180. * np.pi
            scale2 = np.random.uniform(1. / 1.04, 1.04, size=(batch, 1))
            y2 = np.random.uniform(-1., 1., size=(batch, 1)) / 66.
            angle = np.random.uniform(args.ph - 2.,
                                      args.ph + 2.,
                                      size=(batch, 1))
            parab = np.random.uniform(args.param - 0.0002,
                                      args.param + 0.0002,
                                      size=(batch, 1))
            fdict = {ph:angle,param:parab,w_tv:args.w_tv,\
                            theta:1./scale1*np.hstack([np.cos(alpha1),np.sin(alpha1),-x1/450.,\
                                                                               -np.sin(alpha1),np.cos(alpha1),-y1/450.]),\
                            theta2:scale2*np.hstack([np.cos(alpha2),np.sin(alpha2),np.zeros((batch,1)),\
                                                                            -np.sin(alpha2),np.cos(alpha2),y2]),\
                            logo:np.ones((batch,400,900,3)),\
                            face_input:np.tile(np.expand_dims(im,0),[batch,1,1,1])}
            mask = sess.run(prepared, feed_dict=fdict)
            fdict[mask_input] = mask
            fdict[logo] = np.tile(np.expand_dims(advhat, 0), [batch, 1, 1, 1])
            return fdict, sess.run(final_crop, feed_dict=fdict)

    gener = Imgen()

    # Initialization of the sticker
    init_logo = np.ones((400, 900, 3)) * 127. / 255.
    if args.init_face != None:
        init_face = io.imread(args.init_face) / 255.
        init_loss = tv_loss + tf.reduce_sum(tf.abs(init_face - united[0]))
        init_grads = tf.gradients(init_loss, logo)
        init_logo = np.ones((400, 900, 3)) * 127. / 255.
        fdict, _ = gener.gen_fixed(init_face, init_logo)
        moments = np.zeros((400, 900, 3))
        print('Initialization from face, step 1/2')
        for i in tqdm(range(500)):
            fdict[logo] = np.expand_dims(init_logo, 0)
            grads = moments * 0.9 + sess.run(init_grads, feed_dict=fdict)[0][0]
            moments = moments * 0.9 + grads * 0.1
            init_logo = np.clip(init_logo - 1. / 51. * np.sign(grads), 0., 1.)
        print('Initialization from face, step 2/2')
        for i in tqdm(range(500)):
            fdict[logo] = np.expand_dims(init_logo, 0)
            grads = moments * 0.9 + sess.run(init_grads, feed_dict=fdict)[0][0]
            moments = moments * 0.9 + grads * 0.1
            init_logo = np.clip(init_logo - 1. / 255. * np.sign(grads), 0., 1.)
        io.imsave(now + '_init_logo.png', init_logo)
    elif args.init_logo != None:
        init_logo[:] = io.imread(args.init_logo) / 255.

    # Embedding model
    with tf.gfile.GFile(args.model, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def,
                        input_map=None,
                        return_elements=None,
                        name="")
    image_input = tf.get_default_graph().get_tensor_by_name('input:0')
    embedding = tf.get_default_graph().get_tensor_by_name('embeddings:0')
    phase_train_placeholder = tf.placeholder_with_default(tf.constant(
        False, dtype=tf.bool),
                                                          shape=None,
                                                          name='phase_train')

    orig_emb = tf.placeholder(tf.float32,
                              shape=[None, 128],
                              name='orig_emb_input')
    cos_loss = tf.reduce_sum(tf.multiply(embedding, orig_emb), axis=1)
    grads2 = tf.gradients(cos_loss, image_input)

    fdict2 = {phase_train_placeholder: False}

    # Anchor embedding calculation
    if args.anchor_face != None:
        print(io.imread(args.anchor_face).shape)
        anch_im = rescale(io.imread(args.anchor_face) / 255.,
                          112. / 600.,
                          order=5,
                          multichannel=True)
        print((io.imread(args.anchor_face) / 255.).shape)
        fdict2[image_input] = prep(anch_im)
        fdict2[orig_emb] = sess.run(embedding, feed_dict=fdict2)
    elif args.anchor_emb != None:
        fdict2[orig_emb] = np.load(args.anchor_emb)[-1:]
    else:
        anch_im = rescale(io.imread(args.image) / 255., 112. / 600., order=5)
        fdict2[image_input] = prep(anch_im)
        fdict2[orig_emb] = sess.run(embedding, feed_dict=fdict2)

    # Attack constants
    im0 = io.imread(args.image) / 255.
    regr = LR(n_jobs=4)
    regr_len = 100
    regr_coef = -1.
    moments = np.zeros((400, 900, 3))
    moment_val = 0.9
    step_val = 1. / 51.
    stage = 1
    step = 0
    lr_thresh = 100
    ls = []
    t = time()
    while True:
        # Projecting sticker to the face and feeding it to the embedding model
        fdict, ims = gener.gen_random(im0, init_logo)
        fdict2[image_input] = prep(ims)
        grad_tmp = sess.run(grads2, feed_dict=fdict2)
        fdict_val, im_val = gener.gen_fixed(im0, init_logo)
        fdict2[image_input] = prep(im_val)
        ls.append(sess.run(cos_loss, feed_dict=fdict2)[0])
        # Gradients to the original sticker image
        fdict[grads_input] = grad_tmp[0]
        grads_on_logo = np.mean(sess.run(grads1, feed_dict=fdict)[0], 0)
        grads_on_logo += sess.run(grads_tv, feed_dict=fdict)[0][0]
        moments = moments * moment_val + grads_on_logo * (1. - moment_val)
        init_logo -= step_val * np.sign(moments)
        init_logo = np.clip(init_logo, 0., 1.)

        # Logging
        step += 1
        if step % 20 == 0:
            print('Stage:', stage, 'Step:', step, 'Av. time:',
                  round((time() - t) / step, 2), 'Loss:', round(ls[-1], 2),
                  'Coef:', regr_coef)

        # Switching to the second stage
        if step > lr_thresh:
            regr.fit(np.expand_dims(np.arange(100), 1), np.hstack(ls[-100:]))
            regr_coef = regr.coef_[0]
            if regr_coef >= 0:
                if stage == 1:
                    stage = 2
                    moment_val = 0.995
                    step_val = 1. / 255.
                    step = 0
                    regr_coef = -1.
                    lr_thresh = 200
                    t = time()
                else:
                    break

    plt.plot(range(len(ls)), ls)
    plt.savefig(now + '_cosine.png')
    io.imsave(now + '_advhat.png', (init_logo * 255.).astype(np.uint8))
# Example #15
# In[8]:

train_data = vectorizer.fit_transform(trn)
print(train_data.shape)

# In[9]:

dev_data = vectorizer.transform(dev)
print(dev_data.shape)
test_data = vectorizer.transform(tst)
print(test_data.shape)

# In[11]:

classifier = LR()
classifier.fit(train_data, trn_label_int)

# In[12]:

Train_accuracy = classifier.score(train_data, trn_label_int)
Dev_accuracy = classifier.score(dev_data, dev_label_int)

# In[19]:

Train_accuracy * 100

# In[20]:

Dev_accuracy * 100
# Example #16
from pre_processing import PreProcess
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression as LR
import numpy as np
import matplotlib.pyplot as plt
from operator import add

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
preprocess.getTfIdf()
#preprocess.add_pos_neg_feature()

#preprocess.polarity_POS_features()

softmax_clf = LR(multi_class='ovr', C=4)

scores = cross_val_score(softmax_clf, preprocess.traintfIdf, preprocess.train_target, cv=3)
print "the cross validated accuracy on training is " + str(scores)
print("the cross validated accuracy(standard deviation) on training is: %0.4f (+/- %0.4f)" % (
scores.mean(), scores.std() * 2))

softmax_clf.fit(preprocess.traintfIdf, preprocess.train_target)

train_pred_softmax = softmax_clf.predict(preprocess.traintfIdf)
test_pred_softmax = softmax_clf.predict(preprocess.testtfIdf)

# wrong_pred = np.where(preprocess.test_target!=test_pred_softmax)
# np.savetxt("data/softmax_wrong.dat", wrong_pred, delimiter=',', fmt="%d")
# c = test_pred_softmax!=preprocess.test_target
# print np.where(c==True)
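# Hedged addition (not in the original excerpt): report training accuracy with the
# metrics module imported above; the test target would be needed for a test-set score.
print("training accuracy:", metrics.accuracy_score(preprocess.train_target, train_pred_softmax))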
        profit : float
        """
        print('\t{:.2f} ->\t{:.5f}'.format(ratio, profit))

    X_train, X_test, y_train, y_test = get_train_test(filepath)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    confusion_mat = standard_confusion_matrix(y_test, y_predict)
    profit = np.sum(confusion_mat * cost_benefit) / len(y_test)
    original_ratio = np.mean(y_train)
    print('Profit from original ratio:')
    print_ratio_profit(original_ratio, profit)
    for sampling_technique, name in zip(
        [undersample, oversample, smote],
        ['undersampling', 'oversampling', 'smoting']):
        print('Profit when {} to ratio of:'.format(name))
        for ratio in np.arange(*range_params):
            X_sampled, y_sampled = sampling_technique(X_train, y_train, ratio)
            model.fit(X_sampled, y_sampled)
            y_predict = model.predict(X_test)
            confusion_mat = standard_confusion_matrix(y_test, y_predict)
            profit = np.sum(confusion_mat * cost_benefit) / float(len(y_test))
            print_ratio_profit(ratio, profit)


if __name__ == '__main__':
    churn_filepath = './data/churn.csv'
    cost_benefit = np.array([[79, -20], [0, 0]])
    profit_curve_main(churn_filepath, cost_benefit)
    sampling_main(LR(), churn_filepath, cost_benefit)
# Example #18
                                               5, alphas)
index, max_acc = max(enumerate(scores_20news), key=operator.itemgetter(1))
best_alpha_20news = alphas[index]
print(best_alpha_20news)

# IMDB dataset
scores_imdb = MultinomialNB.tune_hyperparams(X_train_imdb, y_train_imdb, 5,
                                             alphas)
index, max_acc = max(enumerate(scores_imdb), key=operator.itemgetter(1))
best_alpha_imdb = alphas[index]
print(best_alpha_imdb)

# hyperparameter tuning for Logistic Regression
# 20news dataset
rs = RandomizedSearchCV(
    LR(solver="lbfgs"),
    {
        "max_iter": np.arange(100, 500, 20),
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
    },
    return_train_score=False,
    n_iter=10,
    cv=5,
)
rs.fit(X_train_20news, y_train_20news)
results = pd.DataFrame(rs.cv_results_)
print(results)
print("best parameters for logistic regression:", rs.best_params_)

print(SEPARATOR)
# Example #19
# Listing 5-1: logistic regression code

import pandas as pd
# Parameter initialization
fileName = 'data/bankloan.xls'
data = pd.read_excel(fileName)
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

# Logistic regression model
from sklearn.linear_model import LogisticRegression as LR
# Randomized logistic regression model
# (note: RandomizedLogisticRegression was removed in scikit-learn 0.21)
from sklearn.linear_model import RandomizedLogisticRegression as RLR
# Build the randomized logistic regression model to screen features
rlr = RLR()
# Train the model
rlr.fit(x, y)
# Get the feature-selection result; per-feature scores are also available via the scores_ attribute
rlr.get_support()
print(u'Feature screening via the randomized logistic regression model finished.')
print(u'Selected features: %s' % '.'.join(data.columns[rlr.get_support()]))
# Keep only the selected features
x = data[data.columns[rlr.get_support()]].values

# Build the logistic regression model
lr = LR()
# Train the model on the selected feature data
lr.fit(x, y)
print(u'Logistic regression model training finished.')
# Average accuracy of the model; 81.48% in this example
print(u'Average accuracy of the model: %s' % lr.score(x, y))
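# Since RandomizedLogisticRegression is no longer available in current scikit-learn,
# here is a hedged sketch of one possible substitute (not part of the original
# listing): L1-based feature selection via SelectFromModel on the same data.
from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(LR(penalty='l1', solver='liblinear'))
selector.fit(data.iloc[:, :8].values, y)
selected_columns = data.columns[:8][selector.get_support()]
print(u'Selected features (SelectFromModel): %s' % ', '.join(selected_columns))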
# Example #20
plt.plot(Cost_i)
plt.xlim(0, 1500)
plt.ylabel('Cost J')
plt.xlabel('Iterations')

# In[10]:

xx = np.array(range(1, 25)).reshape([24, 1])
yy = np.c_[np.ones(xx.shape[0]), xx].dot(theta)
yy

plt.scatter(X[:, 1], y, c='r')
plt.plot(xx, yy, label='GD')

regr = LR()
regr.fit(X[:, 1].reshape(-1, 1), y)
plt.plot(xx, regr.intercept_ + regr.coef_ * xx, label='LR')
plt.xlim(4, 24)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.legend(loc=4)

# In[11]:

# Predict profit for a city with population of 35000 and 70000
print(theta.T.dot([1, 3.5]) * 10000)
print(theta.T.dot([1, 7]) * 10000)
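
# Hedged companion check (not in the original): the sklearn fit above should give
# predictions close to the gradient-descent theta for the same populations.
print(regr.predict(np.array([[3.5], [7.0]])) * 10000)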

# In[12]:
    def run_CV(self):

        cvIter = 0

        totalInstanceNum = len(self.label)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]

        totalTransferNumList = []
        # np.random.seed(3)
        random.shuffle(indexList)

        foldNum = 10
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []

        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex *
                                              foldInstanceNum:(foldIndex + 1) *
                                              foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)

        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)
        # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
        cvIter = 0
        totalAccList = [[] for i in range(10)]
        totalNewClassFlagList = [[] for i in range(10)]
        for foldIndex in range(foldNum):
            # self.clf = LinearSVC(random_state=3)

            # self.m_clf = LR(random_state=3)
            if self.m_multipleClass:
                self.m_clf = LR(multi_class="multinomial",
                                solver='lbfgs',
                                random_state=3,
                                fit_intercept=False)
            else:
                self.m_clf = LR(random_state=3)

            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])

            test = foldInstanceList[foldIndex]
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            trainNum = int(totalInstanceNum * 0.9)

            fn_test = self.fn[test]
            label_test = self.label[test]

            fn_train = self.fn[train]

            featureDim = len(fn_train[0])
            self.init_confidence_bound(featureDim)

            initExList = []
            # initExList = [234, 366, 183]
            initExList = self.pretrainSelectInit(train, foldIndex)
            # initExList = [325, 287, 422]
            # random.seed(101)
            # initExList = random.sample(train, 3)
            fn_init = self.fn[initExList]
            label_init = self.label[initExList]

            print("initExList\t", initExList, label_init)
            queryIter = 3
            labeledExList = []
            unlabeledExList = []
            ###labeled index
            labeledExList.extend(initExList)
            unlabeledExList = list(set(train) - set(labeledExList))

            while queryIter < rounds:
                fn_train_iter = []
                label_train_iter = []

                fn_train_iter = self.fn[labeledExList]
                label_train_iter = self.label[labeledExList]

                self.m_clf.fit(fn_train_iter, label_train_iter)

                idx = self.select_example(unlabeledExList)
                self.update_select_confidence_bound(idx)
                # print(queryIter, "idx", idx, self.label[idx])
                # self.update_select_confidence_bound(idx)

                labeledExList.append(idx)
                unlabeledExList.remove(idx)

                acc = self.get_pred_acc(fn_test, label_test, labeledExList)
                totalAccList[cvIter].append(acc)
                queryIter += 1

            cvIter += 1

        totalACCFile = modelVersion + "_acc.txt"
        totalACCFile = os.path.join(fileSrc, totalACCFile)

        f = open(totalACCFile, "w")
        for i in range(10):
            totalAlNum = len(totalAccList[i])
            for j in range(totalAlNum):
                f.write(str(totalAccList[i][j]) + "\t")
            f.write("\n")
        f.close()
def optimCurveFit(strategy, method_clsf, ratio=0.8, NV_type='NVequals'):
    constrain_time = True

    ######################
    #TODO Step 1: Data input
    ######################
    data_set = 'mitdb'  # 'ecgiddb', 'mitdb'
    channel = 0
    records, IDs, fss, annss = mf.load_data(
        data_set, channel)  #, num_persons=60, record_time=20)
    fs = fss[0]

    records = np.array(records)
    IDs = np.array(IDs)
    annss = np.array(annss)
    ######################

    ######################
    #TODO Step 2: Data selection
    ######################

    if (strategy == 'allN_data') or (strategy == 'all_data'):
        pass  # do nothing here
    elif strategy == 'NV_data':
        NV_inds = [6, 15, 18, 23, 24, 26, 29, 31, 33, 35, 39, 41, 42, 46]
        #for i in NV_inds: #range(annss.shape[0]): #
        #    print i, Counter(annss[i][1])['V']

        records = records[NV_inds, :]
        IDs = IDs[NV_inds]
        annss = annss[NV_inds, :]

        ## re-number the IDs so they are consecutive
        for i in range(len(NV_inds)):
            IDs[i] = i
    elif strategy == 'combine_IDs':
        num_to_combine = 4
        print(IDs)

        for i in range(int(len(records) / num_to_combine)):
            for j in range(num_to_combine - 1):
                IDs[i * num_to_combine + j + 1] = IDs[i * num_to_combine + j]
            #IDs[i*2+1] = IDs[i*2]
        for i in range(len(IDs)):
            IDs[i] /= num_to_combine

    if constrain_time:
        look_time = 600.  # in s
        look_ind = int(look_time * fs)
        records = records[:, :look_ind]
        annss = annss[:, :look_ind]

    recs = []
    for i in range(len(records)):
        curr_rec = Rec(records[i], fs, IDs[i], annss[i])
        recs.append(curr_rec)
    ######################

    ######################
    #TODO Step 3: Data filtering
    ######################

    ######################

    ######################
    #TODO Step 4: Data segmentation
    ######################
    USE_BIOSPPY_FILTERED = True
    sigs, labels_bySegs = mf.get_seg_data(records,
                                          IDs,
                                          fss,
                                          USE_BIOSPPY_FILTERED,
                                          annss=annss)
    sigs, labels_bySegs = np.array(sigs), np.array(labels_bySegs)
    mrks_bySegs = np.array([x[-1] for x in labels_bySegs])

    if strategy == 'allN_data':
        N_masks = (mrks_bySegs == 'N')
        sigs = sigs[N_masks, :]
        labels_bySegs = labels_bySegs[N_masks]

    IDs_bySegs = [int(x[:-1]) for x in labels_bySegs]
    mrks_bySegs = [x[-1] for x in labels_bySegs]
    IDs_bySegs, mrks_bySegs = np.array(IDs_bySegs), np.array(mrks_bySegs)

    segs = []
    for i in range(len(sigs)):
        curr_seg = Seg(sig=sigs[i],
                       fs=fs,
                       ID=IDs_bySegs[i],
                       mrk=mrks_bySegs[i])
        segs.append(curr_seg)
    segs = np.array(segs)
    ######################

    #for one_label in labels_all:
    #    if ('N' in one_label) or ('V' in one_label):
    #        print one_label
    #quit()

    #segs_all, labels_all = np.array(segs_all), np.array(labels_all)

    ######################
    #TODO Step 5: feature extraction
    ######################
    X_all = []
    y_all = []
    method_feat = 'PCA'  # 'template_matching'

    if method_feat == 'PCA':
        feat_dim = 20
        pca = PCA(n_components=feat_dim)
        X_all = np.array([x.sig for x in segs])
        X_all = pca.fit(X_all).transform(X_all)

        for i in range(len(segs)):
            segs[i].feat = X_all[i, :]
        y_all = np.array([x.ID for x in segs])

    X_all = np.array(X_all)
    ######################

    ######################
    #TODO Step 6: Data split
    ######################
    if strategy != 'NV_data':
        X_train, X_test, y_train, y_test = train_test_split(X_all,
                                                            y_all,
                                                            test_size=0.2,
                                                            random_state=42)
    else:
        X_train, X_test, y_train, y_test = [], [], [], []
        y_test_mrks = []
        for i in range(len(NV_inds)):
            curr_mrks = mrks_bySegs[IDs_bySegs == i]  #current people's mrks\
            #print curr_mrks

            curr_segs = segs[IDs_bySegs == i]
            curr_labels = labels_bySegs[IDs_bySegs == i]

            curr_inds_Vs = np.where(curr_mrks == 'V')[0]
            curr_inds_Ns = np.where(curr_mrks == 'N')[0]

            curr_num_Vs = sum(np.array(curr_mrks) == 'V')  #all his Vs
            curr_num_Ns = sum(np.array(curr_mrks) == 'N')

            if NV_type == 'fixV':
                train_num_Vs = int(curr_num_Vs * .8)
                train_num_Ns = min(
                    [int(curr_num_Ns * .8),
                     int(ratio * train_num_Vs)])
            elif NV_type == 'NVequals':
                train_num_Vs = int(curr_num_Vs * ratio)
                train_num_Ns = train_num_Vs

            train_inds_Vs = random.sample(curr_inds_Vs, train_num_Vs)
            test_inds_Vs = [
                x for x in curr_inds_Vs if not (x in train_inds_Vs)
            ]

            #test_inds_Vs = curr_inds_Vs[~ train_inds_Vs]
            train_inds_Ns = random.sample(curr_inds_Ns, train_num_Ns)
            test_inds_Ns = [
                x for x in curr_inds_Ns if not (x in train_inds_Ns)
            ]

            #print len(train_inds_Vs), len(test_inds_Vs)
            #print len(train_inds_Ns), len(test_inds_Ns)

            #test_inds_Ns = curr_inds_Vs[~ train_inds_Ns]
            #        print train_inds_Ns
            #        print test_inds_Ns

            curr_IDs = IDs_bySegs[IDs_bySegs == i]
            #print curr_IDs

            for one_seg in curr_segs[train_inds_Vs]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Vs]:
                y_train.append(one_lab)

            for one_seg in curr_segs[train_inds_Ns]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Ns]:
                y_train.append(one_lab)

            for one_seg in curr_segs[test_inds_Vs]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Vs]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Vs]:
                y_test_mrks.append(one_mrk)

            for one_seg in curr_segs[test_inds_Ns]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Ns]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Ns]:
                y_test_mrks.append(one_mrk)

            #print i
            #print len(X_train), len(y_train), len(X_test), len(y_test)

    X_train, y_train, X_test, y_test = \
    np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

    ######################

    #print X_train.shape, y_train.shape, X_test.shape, y_test.shape
    #quit()
    #print X_train
    #print X_test
    #y_train = [int(y[:-1]) for y in y_train]
    #y_test = [int(y[:-1]) for y in y_test]

    ######################
    #TODO Step 7: Model training
    ######################
    time_before_training = Time()

    if method_clsf == 'SVM':
        not_trained = True
        import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package
        if not_trained:
            clf = svm.SVC(kernel='rbf', C=10., gamma=0.1)
            clf.fit(X_train, y_train)
            joblib.dump(clf, 'test_clf.pkl')
        else:
            clf = joblib.load('test_clf.pkl')
        res_pred = clf.predict(X_test)
    elif method_clsf == 'Logit':
        clf = LR(C=10.)
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'kNN':
        clf = KNC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DTC':
        clf = DTC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'boosting':
        clf = XGBC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'GNB':
        clf = GNB()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DL':
        not_trained = True
        import joblib  # sklearn.externals.joblib was removed; use the standalone joblib package

        if not_trained:
            model = Sequential()
            model.add(
                Dense(feat_dim, activation='relu', input_shape=(feat_dim, )))
            #model.add(Dense(input_dim,activation='relu'))

            num_categs = len(set(y_train))

            print(y_train, num_categs)
            Y_train = np_utils.to_categorical(y_train, num_categs)
            Y_test = np_utils.to_categorical(y_test, num_categs)

            model.add(Dense(num_categs, activation='softmax'))

            model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            X_train = np.array(X_train)
            Y_train = np.array(Y_train)
            #print X_train.shape
            #print Y_train.shape

            model.fit(X_train,
                      Y_train,
                      validation_split=0.2,
                      batch_size=32,
                      epochs=50,  # Keras 2 renamed nb_epoch to epochs
                      verbose=0)
            #model.save('test_clf_DL.pkl')
        else:
            model = keras.models.load_model('test_clf_DL.pkl')
        #score = model.evaluate(X_test, Y_test, verbose=0)

    time_after_training = Time()

    ######################
    #TODO Step 8: Model testing
    ######################
    if method_clsf != 'DL':
        res_pred = clf.predict(X_test)
    else:
        res_pred = model.predict_classes(X_test)
    ######################

    ######################
    #TODO Step 9: Result output
    ######################
    train_time = time_after_training - time_before_training

    print_res = False
    if print_res:
        print('')
        print('Parameters:')
        print('strategy:', strategy)
        print('constrain_time:', constrain_time)
        print('ratio:', ratio)
        print('method_clsf:', method_clsf)

        # print('')

        print('Results:')
        print('Used time for training:', time_after_training - time_before_training)

    res_look = []
    for i in range(len(res_pred)):
        res_look.append((res_pred[i], y_test[i]))
    #print res_look

    if False:
        res_pred_IDs = np.array([y[:-1] for y in res_pred])
        res_pred_mrks = np.array([y[-1] for y in res_pred])

        only_test_ID = True
        if only_test_ID:
            to_be_predct = res_pred_IDs
            to_be_tested = y_test
        else:
            to_be_predct = res_pred
            to_be_tested = y_test

    ##TODO: adjust accordingly
    if strategy == 'NV_data':
        look_stat = 'V'
        y_test_mrks = np.array(y_test_mrks)
        #print y_test_mrks
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]

        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg,
                                '.3f')), float(format(res_by_categ, '.3f')))
        accuBySeg_V = one_res[0]
        #print len(to_be_predct), one_res

        look_stat = 'N'
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]

        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg,
                                '.3f')), float(format(res_by_categ, '.3f')))
        accuBySeg_N = one_res[0]
        #print len(to_be_predct), one_res
        return [accuBySeg_V, accuBySeg_N, train_time]
    else:
        to_be_predct = res_pred
        to_be_tested = y_test

        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested,
                                       type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg,
                                '.3f')), float(format(res_by_categ, '.3f')))
        return [one_res[0], train_time]
    def run_CV(self):

        cvIter = 0

        totalInstanceNum = len(self.m_targetLabel)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]

        totalTransferNumList = []
        np.random.seed(3)
        np.random.shuffle(indexList)

        foldNum = 10
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []

        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex *
                                              foldInstanceNum:(foldIndex + 1) *
                                              foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)

        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)
        # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
        # random.seed(3)
        totalAccList = [[] for i in range(10)]
        humanAccList = [[] for i in range(10)]
        totalExtraAccList = []
        # self.get_base_learners()

        correctTransferRatioList = []
        totalTransferNumList = []
        correctTransferLabelNumList = []
        correctUntransferRatioList = []

        totalAuditorPrecisionList = []
        totalAuditorRecallList = []
        totalAuditorAccList = []

        for foldIndex in range(foldNum):

            if self.m_multipleClass:
                self.m_clf = LR(multi_class="multinomial",
                                solver='lbfgs',
                                random_state=3)
            else:
                self.m_clf = LR(random_state=3)
            self.m_auditor = LR(random_state=3)

            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])

            test = foldInstanceList[foldIndex]
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            trainData, valid = train_test_split(train,
                                                random_state=3,
                                                test_size=0.2)

            train = trainData

            targetNameFeatureTrain = self.m_targetNameFeature[train]
            targetLabelTrain = self.m_targetLabel[train]

            targetNameFeatureValid = self.m_targetNameFeature[valid]
            targetLabelValid = self.m_targetLabel[valid]
            # targetDataFeatureTrain = self.m_targetDataFeature[train]

            targetNameFeatureTest = self.m_targetNameFeature[test]
            targetLabelTest = self.m_targetLabel[test]

            transferLabelTest = self.m_transferLabel[test]
            # targetDataFeatureTest = self.m_targetDataFeature[test]

            # sourceUniqueClass = np.unique(self.m_sourceLabel)

            initExList = []
            initExList = self.pretrainSelectInit(train, foldIndex)

            targetNameFeatureInit = self.m_targetNameFeature[initExList]
            targetLabelInit = self.m_targetLabel[initExList]

            transferLabelInit = self.m_transferLabel[initExList]

            print("initExList\t", initExList, targetLabelInit)

            queryIter = 0
            labeledExList = []
            unlabeledExList = []
            ###labeled index
            labeledExList.extend(initExList)
            unlabeledExList = list(set(train) - set(labeledExList))

            activeLabelNum = 3.0
            transferLabelNum = 0.0
            transferFeatureList = []
            transferFlagList = []

            featureDim = len(targetNameFeatureTrain[0])
            self.init_confidence_bound(featureDim, labeledExList,
                                       unlabeledExList)

            targetNameFeatureIter = targetNameFeatureInit
            targetLabelIter = targetLabelInit

            correctTransferLabelNum = 0.0
            wrongTransferLabelNum = 0.0
            correctUntransferLabelNum = 0.0
            wrongUntransferLabelNum = 0.0

            # auditorPrecisionList = []
            # auditorRecallList = []
            auditorAccList = []
            extraAccList = []

            self.m_clf.fit(targetNameFeatureInit, targetLabelInit)

            # targetAuditorLabelInit = (targetLabelInit==transferLabelInit)
            for exId in initExList:
                if self.m_targetLabel[exId] == self.m_transferLabel[exId]:
                    transferFlagList.append(1.0)
                else:
                    transferFlagList.append(0.0)

                transferFeatureList.append(self.m_targetNameFeature[exId])

            auditorScoreFlag = False
            if len(np.unique(transferFlagList)) > 1:
                self.m_auditor.fit(np.array(transferFeatureList),
                                   np.array(transferFlagList))
                auditorScoreFlag = True

            while activeLabelNum < rounds:

                exId = self.select_example(unlabeledExList, auditorScoreFlag)

                exLabel = -1

                self.m_strongLabeledIDList.append(exId)
                # self.update_select_confidence_bound(exId)
                # self.update_judge_confidence_bound(exId)
                activeLabelNum += 1.0
                activeLabelFlag = True

                exLabel = self.m_targetLabel[exId]

                transferLabel = self.m_transferLabel[exId]
                if transferLabel == exLabel:
                    # correctUntransferLabelNum += 1.0
                    transferFlagList.append(1.0)
                    transferFeatureList.append(self.m_targetNameFeature[exId])
                else:
                    # wrongUntransferLabelNum += 1.0
                    transferFlagList.append(0.0)
                    transferFeatureList.append(self.m_targetNameFeature[exId])

                    # auditorPrecision = 0.0
                    # if correctTransferLabelNum+wrongTransferLabelNum > 0.0:
                    # 	auditorPrecision = correctTransferLabelNum*1.0/(correctTransferLabelNum+wrongTransferLabelNum)

                auditorAcc = self.getAuditorMetric(transferFeatureList,
                                                   transferFlagList,
                                                   targetNameFeatureTest,
                                                   transferLabelTest,
                                                   targetLabelTest)
                # print("auditorAcc", auditorAcc)
                auditorAccList.append(auditorAcc)

                labeledExList.append(exId)
                unlabeledExList.remove(exId)

                # acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
                # totalAccList[cvIter].append(acc)
                extraAcc = self.addExtraWeakLabels(
                    transferFeatureList, transferFlagList,
                    targetNameFeatureValid, targetLabelValid,
                    targetNameFeatureTest, transferLabelTest, targetLabelTest,
                    queryIter)
                extraAccList.append(extraAcc)
                # humanAccList[cvIter].append(acc)
                queryIter += 1

            # totalAuditorPrecisionList.append(auditorPrecisionList)
            # totalAuditorRecallList.append(auditorRecallList)
            totalAuditorAccList.append(auditorAccList)
            totalExtraAccList.append(extraAccList)

            cvIter += 1

        # print("transfer num\t", np.mean(totalTransferNumList), np.sqrt(np.var(totalTransferNumList)))

        # print("extraList", extraAccList, np.mean(extraAccList), np.sqrt(np.var(extraAccList)))
        # print("correct ratio\t", np.mean(correctTransferRatioList), np.sqrt(np.var(correctTransferRatioList)))
        # print("untransfer correct ratio\t", np.mean(correctUntransferRatioList), np.sqrt(np.var(correctUntransferRatioList)))

        # AuditorPrecisionFile = modelVersion+"_auditor_precision.txt"
        # writeFile(totalAuditorPrecisionList, AuditorPrecisionFile)

        # AuditorRecallFile = modelVersion+"_auditor_recall.txt"
        # writeFile(totalAuditorRecallList, AuditorRecallFile)

        AuditorAccFile = modelVersion + "_auditor_acc.txt"
        writeFile(totalAuditorAccList, AuditorAccFile)

        # totalACCFile = modelVersion+"_acc.txt"
        # writeFile(totalAccList, totalACCFile)

        # humanACCFile = modelVersion+"_human_acc.txt"
        # writeFile(humanAccList, humanACCFile)

        extraACCFile = modelVersion + "_extra_acc.txt"
        writeFile(totalExtraAccList, extraACCFile)
# In[]:
# Scatter plots of each feature against the target
for i in Xtrain.columns:
    ft.con_data_scatter(Xtrain, i, Ytrain, "Y")
# In[]:
ft.con_data_scatter(Xtrain, 'AveRooms', Ytrain, "Y")

# In[]:
# Pearson correlation between the features
ft.corrFunction(Xtrain)

# In[]:
'''
Check that SKlearn and statsmodels give the same result when no hyperparameters are set.
'''
reg = LR().fit(Xtrain, Ytrain)
yhat = reg.predict(Xtrain)  # predict yhat on the training data
print(reg.score(Xtrain, Ytrain))

predict = pd.DataFrame(yhat, columns=['Pred'])
resid = pd.DataFrame((Ytrain["Y"] - predict["Pred"]), columns=['resid'])

resid_1 = pd.concat([predict, resid], axis=1)
resid_1.plot('Pred', 'resid', kind='scatter')

print(ft.r2_score_customize(Ytrain, yhat, 2))
print(ft.adj_r2_customize(Ytrain, yhat, Xtrain.shape[1], 2))

# In[]:
from statsmodels.formula.api import ols
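# Hedged sketch of the comparison set up above (assumes pandas is available as pd,
# that Ytrain has the single column "Y" as used earlier, and that the feature names
# are plain identifiers usable in a formula).
train_df = pd.concat([Xtrain, Ytrain], axis=1)
formula = 'Y ~ ' + ' + '.join(Xtrain.columns)
ols_model = ols(formula, data=train_df).fit()
print(ols_model.params)  # with no hyperparameters these should match reg.coef_ / reg.intercept_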
#1 - Without splitting the data set
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split as tts

diabetes = datasets.load_diabetes()

x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
target = pd.DataFrame(diabetes.target, columns=["QM"])
y = target["QM"]

lm = LR()
lm.fit(x, y)
pred = lm.predict(x)

MSE = np.mean((y - pred)**2)
R = lm.score(x, y)

print("【#1 不分割資料集】")
print("完整資料的 MSE:", MSE)
print("完整資料的 R^2:", R)
print()

#2 - Split the data with a 3:1 ratio
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.25, random_state=100)

lm = LR()
lm.fit(x_train, y_train)
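# Hedged sketch completing the truncated step: evaluate the refit model on the
# held-out 25% split created above.
pred_test = lm.predict(x_test)
MSE_test = np.mean((y_test - pred_test) ** 2)
R_test = lm.score(x_test, y_test)

print("[#2 3:1 split]")
print("Test-set MSE:", MSE_test)
print("Test-set R^2:", R_test)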
# Example #26
def roc_it(input_file=INPUT_FILE):

    beer = pd.read_csv(input_file, delimiter='\t').dropna()

    # add class label for top half / bottom half
    midpt = int(len(beer) / 2)
    beer['label'] = beer['Rank'].map(lambda k: 1 if k <= midpt else 0)

    # drop categorical columns
    features = beer[['ABV', 'Reviews']]
    labels = beer['label']

    # create cv iterator (note: train pct is set implicitly by number of folds)
    num_recs = len(beer)
    kf = cv.KFold(n=num_recs, n_folds=NUM_FOLDS, shuffle=True)

    # initialize results sets
    all_fprs, all_tprs, all_aucs = (np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS),
                                    np.zeros(NUM_FOLDS))

    for i, (train_index, test_index) in enumerate(kf):

        # initialize & train model
        model = LR()

        # KFold yields positional indices, so use iloc; the frame was already dropna'd above
        train_features = features.iloc[train_index]
        train_labels = labels.iloc[train_index]

        test_features = features.iloc[test_index]
        test_labels = labels.iloc[test_index]

        model.fit(train_features, train_labels)

        # predict labels for test features
        pred_labels = model.predict(test_features)

        # calculate ROC/AUC
        fpr, tpr, thresholds = roc_curve(test_labels, pred_labels, pos_label=1)
        roc_auc = auc(fpr, tpr)

        print('\nfpr = {0}'.format(fpr))
        print('tpr = {0}'.format(tpr))
        print('auc = {0}'.format(roc_auc))

        all_fprs[i] = fpr[1]
        all_tprs[i] = tpr[1]
        all_aucs[i] = roc_auc

    print('\nall_fprs = {0}'.format(all_fprs))
    print('all_tprs = {0}'.format(all_tprs))
    print('all_aucs = {0}'.format(all_aucs))

    # plot ROC curve
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title('Receiver operating characteristic example')
    pl.legend(loc="lower right")
    pl.show()
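
# Note: roc_curve above is fed hard 0/1 predictions, so each fold's "curve" collapses to a
# single operating point. A smoother curve comes from scoring with predicted probabilities.
# A self-contained sketch of that idea (synthetic data and all names here are illustrative,
# not part of the original script):
def roc_from_probabilities():
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_curve, auc
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, n_features=4, random_state=0)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

    clf = LogisticRegression().fit(X_tr, y_tr)
    scores = clf.predict_proba(X_te)[:, 1]  # class-1 probabilities instead of hard labels

    fpr, tpr, thresholds = roc_curve(y_te, scores, pos_label=1)
    print('AUC from probabilities: {0:.3f}'.format(auc(fpr, tpr)))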
Beispiel #27
0
print('[03]\n', X_train.head(5))  # show the first 5 rows of the explanatory variables X in the training data
print('[04]\n', y_train.head(5))  # show the first 5 rows of the target variable y in the training data
# The leftmost column of [03] and [04] is the row number from the CSV file.
print('[05] Summary statistics of the explanatory variables X for the full data before splitting\n', X.describe())
print('[06] Summary statistics of the explanatory variables X in the training data\n', X_train.describe())
print('[07] Summary statistics of the explanatory variables X in the test data\n', X_test.describe())
# count = number of records, mean = average, std = standard deviation
# min = minimum, 50% = median, max = maximum

# Check for missing data
#   * If there were missing values, we would drop the affected rows with dropna, or fill in
#     provisional values with fillna, so the model is not trained on bad data.
print("[08] Number of missing values in the training data (X, Y) =\n",
      X_train.isnull().sum(),
      y_train.isnull().sum())
print("[09] Number of missing values in the test data (X, Y) =\n",
      X_test.isnull().sum(),
      y_test.isnull().sum())

# Run a simple regression analysis using linear regression
model = LR()  # prepare a linear-regression estimator in the variable model using the LR class
model.fit(X_train, y_train)  # fit the linear regression on the training data (X_train, y_train)
print(
    f"[11] The fitted equation is {MOKUTEKI} = {SETSUMEI} * {model.coef_} + {model.intercept_}")

# Apply the fitted model to the test data to check its accuracy
print("[12] Coefficient of determination R^2 on the test data (best = 1):", model.score(X_test, y_test))

# For curve (non-linear) fitting instead of a linear approximation, one option is the scipy library.
# That is not covered here, but look into it if you are interested.
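
# As noted above, scipy can fit a non-linear curve instead of a straight line. A minimal sketch
# using scipy.optimize.curve_fit on synthetic data (the quadratic model and variable names here
# are illustrative assumptions, not part of the original script):
import numpy as np
from scipy.optimize import curve_fit

def quadratic(x, a, b, c):
    return a * x**2 + b * x + c

rng = np.random.default_rng(0)
x_demo = np.linspace(0, 10, 50)
y_demo = 2.0 * x_demo**2 - 3.0 * x_demo + 1.0 + rng.normal(0, 5, size=x_demo.size)

params, covariance = curve_fit(quadratic, x_demo, y_demo)
print("[13] Estimated coefficients (a, b, c):", params)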
Beispiel #28
0
    def run_CV(self):

        cvIter = 0

        totalInstanceNum = len(self.label)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]

        print("featureNum", len(self.fn[0]))
        # print("non zero feature num", sum(self.fn[0]))

        totalTransferNumList = []
        # np.random.seed(3)
        # np.random.shuffle(indexList)

        random.shuffle(indexList)

        foldNum = 10
        # split the shuffled indices into foldNum folds; the last fold absorbs any remainder
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []

        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex *
                                              foldInstanceNum:(foldIndex + 1) *
                                              foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)

        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)
        # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
        cvIter = 0
        # random.seed(3)
        totalAccList = [0 for i in range(10)]

        coefList = [0 for i in range(10)]

        for foldIndex in range(foldNum):

            # self.m_clf = LinearSVC(random_state=3)
            # self.m_clf = LR(fit_intercept=False)

            self.m_clf = LR(random_state=3)

            # use fold foldIndex as the test set and the remaining folds as training data
            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])

            test = foldInstanceList[foldIndex]
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            trainNum = int(totalInstanceNum * 0.9)

            # print(test)
            fn_test = self.fn[test]

            label_test = self.label[test]

            sampledTrainNum = len(train)
            # sampledTrainNum = 100
            # sampling len(train) items without replacement just shuffles the full training set
            train_sampled = random.sample(train, sampledTrainNum)

            fn_train = self.fn[train_sampled]
            label_train = self.label[train_sampled]

            self.m_clf.fit(fn_train, label_train)

            coefList[cvIter] = self.m_clf.coef_

            label_preds = self.m_clf.predict(fn_test)
            acc = accuracy_score(label_test, label_preds)

            totalAccList[cvIter] = acc
            # initExList = []
            # random.seed(3)
            # initExList = random.sample(train, 3)
            # fn_init = self.fn[initExList]
            # label_init = self.label[initExList]
            # print("initExList\t", initExList, label_init)

            # queryIter = 3
            # labeledExList = []
            # unlabeledExList = []
            # ###labeled index
            # labeledExList.extend(initExList)
            # unlabeledExList = list(set(train)-set(labeledExList))

            # featureDim = len(self.fn[0])
            # self.init_confidence_bound(featureDim)

            # while queryIter < rounds:
            # 	fn_train_iter = []
            # 	label_train_iter = []

            # 	fn_train_iter = self.fn[labeledExList]
            # 	label_train_iter = self.label[labeledExList]

            # 	self.m_clf.fit(fn_train_iter, label_train_iter)

            # 	idx = self.select_example(unlabeledExList)
            # 	self.update_confidence_bound(idx)
            # 	# print(queryIter, "idx", idx, self.label[idx])
            # 	labeledExList.append(idx)
            # 	unlabeledExList.remove(idx)

            # 	acc = self.get_pred_acc(fn_test, label_test, labeledExList)
            # 	totalAccList[cvIter].append(acc)
            # 	queryIter += 1

            cvIter += 1

        totalACCFile = modelVersion + ".txt"
        f = open(totalACCFile, "w")
        for i in range(10):
            f.write(str(totalAccList[i]))
            # for j in range(totalAlNum):
            # 	f.write(str(totalAccList[i][j])+"\t")
            f.write("\n")
        f.close()

        coefFile = modelVersion + "_coef.txt"
        f = open(coefFile, "w")
        for i in range(10):
            coef4Classifier = coefList[i]
            coefNum = len(coef4Classifier)

            for coefIndex in range(coefNum):
                f.write(str(coef4Classifier[coefIndex]) + "\t")
            f.write("\n")

        f.close()

        print(np.mean(totalAccList), np.sqrt(np.var(totalAccList)))
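
# For reference, the manual fold construction above can be reproduced with sklearn's KFold.
# A minimal self-contained sketch (synthetic data; the use of KFold here is an assumption,
# not part of the original class):
def sklearn_kfold_sketch():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import KFold

    X, y = make_classification(n_samples=200, n_features=10, random_state=3)
    accs = []

    kf = KFold(n_splits=10, shuffle=True, random_state=3)
    for train_index, test_index in kf.split(X):
        clf = LogisticRegression(random_state=3).fit(X[train_index], y[train_index])
        accs.append(accuracy_score(y[test_index], clf.predict(X[test_index])))

    # mirror the mean/std report printed by run_CV above
    print(np.mean(accs), np.sqrt(np.var(accs)))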
Beispiel #29
0
	def run_CV(self):

		cvIter = 0
		
		totalInstanceNum = len(self.m_targetLabel)
		print("totalInstanceNum\t", totalInstanceNum)
		indexList = [i for i in range(totalInstanceNum)]

		totalTransferNumList = []
		np.random.seed(3)
		np.random.shuffle(indexList)

		foldNum = 10
		foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
		foldInstanceList = []

		for foldIndex in range(foldNum-1):
			foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
			foldInstanceList.append(foldIndexInstanceList)

		foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
		foldInstanceList.append(foldIndexInstanceList)
		# kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
		# random.seed(3)
		totalAccList = [[] for i in range(10)]
		humanAccList = [[] for i in range(10)]
		totalExtraAccList = []
		# self.get_base_learners()

		correctTransferRatioList = []
		totalTransferNumList = []
		correctUntransferRatioList = []

		totalAuditorPrecisionList = []
		totalAuditorRecallList = []
		totalAuditorAccList = []


		for foldIndex in range(foldNum):
			
			# self.clf = LinearSVC(random_state=3)

			self.m_clf = LR(multi_class="multinomial", solver='lbfgs',random_state=3)
			# self.m_judgeClassifier = LR(random_state=3)
			self.m_auditor0 = LR(random_state=3)
			self.m_auditor1 = LR(random_state=3)

			train = []
			for preFoldIndex in range(foldIndex):
				train.extend(foldInstanceList[preFoldIndex])

			test = foldInstanceList[foldIndex]
			for postFoldIndex in range(foldIndex+1, foldNum):
				train.extend(foldInstanceList[postFoldIndex])

			trainNum = int(totalInstanceNum*0.9)

			targetNameFeatureTrain = self.m_targetNameFeature[train]
			targetLabelTrain = self.m_targetLabel[train]
			# targetDataFeatureTrain = self.m_targetDataFeature[train]

			targetNameFeatureTest = self.m_targetNameFeature[test]
			targetLabelTest = self.m_targetLabel[test]

			# transferLabelTest = self.m_transferLabel[test]
			# targetDataFeatureTest = self.m_targetDataFeature[test]

			# sourceUniqueClass = np.unique(self.m_sourceLabel)

			initExList = []
			initExList = self.pretrainSelectInit(train, foldIndex)

			targetNameFeatureInit = self.m_targetNameFeature[initExList]
			targetLabelInit = self.m_targetLabel[initExList]

			print("initExList\t", initExList, targetLabelInit)

			queryIter = 0
			labeledExList = []
			unlabeledExList = []
			###labeled index
			labeledExList.extend(initExList)
			unlabeledExList = list(set(train)-set(labeledExList))

			activeLabelNum = 3.0
			transferLabelNum = 0.0
			transferFeatureList = []
			transferFlagList0 = []
			transferFlagList1 = []

			featureDim = len(targetNameFeatureTrain[0])
			self.init_confidence_bound(featureDim, labeledExList, unlabeledExList)

			targetNameFeatureIter = targetNameFeatureInit
			targetLabelIter = targetLabelInit

			correctTransferLabelNum = 0.0
			wrongTransferLabelNum = 0.0
			correctUntransferLabelNum = 0.0
			wrongUntransferLabelNum = 0.0

			# auditorPrecisionList = []
			# auditorRecallList = []
			auditorAccList = []
			extraAccList = []

			self.m_clf.fit(targetNameFeatureInit, targetLabelInit)

			while activeLabelNum < rounds:

				# targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
				# targetLabelIter = self.m_targetLabel[labeledExList]

				# self.m_clf.fit(targetNameFeatureIter, targetLabelIter) 

				exId = self.select_example(unlabeledExList) 
				
				exLabel = -1
				
				self.m_strongLabeledIDList.append(exId)
				# self.update_select_confidence_bound(exId)
				self.update_judge_confidence_bound(exId)
				activeLabelNum += 1.0
				activeLabelFlag = True

				exLabel = self.m_targetLabel[exId]
				
				transferLabel0 = self.m_transferLabel0[exId]
				transferLabel1 = self.m_transferLabel1[exId]

				transferFeatureList.append(self.m_targetNameFeature[exId])

				if transferLabel0 == exLabel:
					# correctUntransferLabelNum += 1.0
					transferFlagList0.append(1.0)
					
				else:
					# wrongUntransferLabelNum += 1.0
					transferFlagList0.append(0.0)

				if transferLabel1 == exLabel:
					# correctUntransferLabelNum += 1.0
					transferFlagList1.append(1.0)
					
				else:
					# wrongUntransferLabelNum += 1.0
					transferFlagList1.append(0.0)

					# auditorPrecision = 0.0
					# if correctTransferLabelNum+wrongTransferLabelNum > 0.0:
					# 	auditorPrecision = correctTransferLabelNum*1.0/(correctTransferLabelNum+wrongTransferLabelNum)

				# auditorAcc = self.getAuditorMetric(transferFeatureList, transferFlagList, targetNameFeatureTest, transferLabelTest, targetLabelTest)
				auditorAcc = 0.0
				# print("auditorAcc", auditorAcc)
				auditorAccList.append(auditorAcc)

				labeledExList.append(exId)
				unlabeledExList.remove(exId)

				# acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
				# totalAccList[cvIter].append(acc)
				extraAcc = self.addExtraExample(transferFeatureList, transferFlagList0, transferFlagList1, targetNameFeatureTest, targetLabelTest)
				extraAccList.append(extraAcc)
					# humanAccList[cvIter].append(acc)
				queryIter += 1

			# totalAuditorPrecisionList.append(auditorPrecisionList)
			# totalAuditorRecallList.append(auditorRecallList)
			totalAuditorAccList.append(auditorAccList)
			totalExtraAccList.append(extraAccList)
			

			cvIter += 1      


		AuditorAccFile = modelVersion+"_auditor_acc.txt"
		writeFile(totalAuditorAccList, AuditorAccFile)

		# totalACCFile = modelVersion+"_acc.txt"
		# writeFile(totalAccList, totalACCFile)

		# humanACCFile = modelVersion+"_human_acc.txt"
		# writeFile(humanAccList, humanACCFile)

		extraACCFile = modelVersion+"_extra_acc.txt"
		writeFile(totalExtraAccList, extraACCFile)
	def run_CV(self):

		cvIter = 0
		
		totalInstanceNum = len(self.m_targetLabel)
		print("totalInstanceNum\t", totalInstanceNum)
		indexList = [i for i in range(totalInstanceNum)]

		totalTransferNumList = []
		np.random.seed(3)
		np.random.shuffle(indexList)

		foldNum = 10
		foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
		foldInstanceList = []

		for foldIndex in range(foldNum-1):
			foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
			foldInstanceList.append(foldIndexInstanceList)

		foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
		foldInstanceList.append(foldIndexInstanceList)
	
		totalAccList = [[] for i in range(10)]
		humanAccList = [[] for i in range(10)]

		correctTransferRatioList = []
		totalTransferNumList = []
		correctUntransferRatioList = []

		totalAuditorPrecisionList = []
		totalAuditorRecallList = []
		totalAuditorAccList = []

		for foldIndex in range(foldNum):
			
			self.m_clf = LR(multi_class="multinomial", solver='lbfgs',random_state=3)
			self.m_auditor0 = LR(random_state=3)
			self.m_auditor1 = LR(random_state=3)

			train = []
			for preFoldIndex in range(foldIndex):
				train.extend(foldInstanceList[preFoldIndex])

			test = foldInstanceList[foldIndex]
			for postFoldIndex in range(foldIndex+1, foldNum):
				train.extend(foldInstanceList[postFoldIndex])

			trainNum = int(totalInstanceNum*0.9)

			targetNameFeatureTrain = self.m_targetNameFeature[train]
			targetLabelTrain = self.m_targetLabel[train]
			# targetDataFeatureTrain = self.m_targetDataFeature[train]

			targetNameFeatureTest = self.m_targetNameFeature[test]
			targetLabelTest = self.m_targetLabel[test]

			# transferLabelTest = self.m_transferLabel[test]
			transferLabelTest = []

		
			initExList = []
			initExList = self.pretrainSelectInit(train, foldIndex)
			# random.seed(101)
			# initExList = random.sample(train, 3)

			targetNameFeatureInit = self.m_targetNameFeature[initExList]
			targetLabelInit = self.m_targetLabel[initExList]

			print("initExList\t", initExList, targetLabelInit)

			queryIter = 0
			labeledExList = []
			unlabeledExList = []
			###labeled index
			labeledExList.extend(initExList)
			unlabeledExList = list(set(train)-set(labeledExList))

			activeLabelNum = 3.0
			transferLabelNum = 0.0
			transferFeatureList = []
			transferFlagList0 = []
			transferFlagList1 = []

			featureDim = len(targetNameFeatureTrain[0])
			self.init_confidence_bound(featureDim, labeledExList, unlabeledExList)

			targetNameFeatureIter = targetNameFeatureInit
			targetLabelIter = targetLabelInit

			correctTransferLabelNum = 0.0
			wrongTransferLabelNum = 0.0
			correctUntransferLabelNum = 0.0
			wrongUntransferLabelNum = 0.0

			# auditorPrecisionList = []
			# auditorRecallList = []
			auditorAccList = []

			while activeLabelNum < rounds:

				# targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
				# targetLabelIter = self.m_targetLabel[labeledExList]

				self.m_clf.fit(targetNameFeatureIter, targetLabelIter) 

				exId = self.select_example(unlabeledExList) 
				# self.update_select_confidence_bound(exId)

				# print(idx)
				activeLabelFlag = False
				transferLabelFlag, weakOracleIndex, transferLabel = self.get_transfer_flag(transferFeatureList, transferFlagList0, transferFlagList1, exId, activeLabelNum)

				exLabel = -1
				if transferLabelFlag:
					self.m_weakLabeledIDList.append(exId)
					
					transferLabelNum += 1.0
					activeLabelFlag = False
					
					exLabel = transferLabel
					targetNameFeatureIter = np.vstack((targetNameFeatureIter, self.m_targetNameFeature[exId]))
					targetLabelIter = np.hstack((targetLabelIter, exLabel))
					# targetNameFeatureIter.append(self.m_targetNameFeature[exId])
					# targetLabelIter.append(exLabel)

					if exLabel == self.m_targetLabel[exId]:
						correctTransferLabelNum += 1.0
						print("queryIter\t", queryIter)
					else:
						wrongTransferLabelNum += 1.0
						print("query iteration", queryIter, "error transfer label\t", exLabel, "true label", self.m_targetLabel[exId])
				else:
					self.m_strongLabeledIDList.append(exId)
					self.update_judge_confidence_bound(exId)
					activeLabelNum += 1.0
					activeLabelFlag = True

					exLabel = self.m_targetLabel[exId]
					targetNameFeatureIter = np.vstack((targetNameFeatureIter, self.m_targetNameFeature[exId]))
					targetLabelIter = np.hstack((targetLabelIter, exLabel))
					# targetNameFeatureIter.append(self.m_targetNameFeature[exId])
					# targetLabelIter.append(exLabel)

					weakLabel0 = self.m_transferLabel0[exId]
					weakLabel1 = self.m_transferLabel1[exId]
					
					transferFeatureList.append(self.m_targetNameFeature[exId])

					if weakLabel0 == exLabel:
						correctUntransferLabelNum += 1.0
						transferFlagList0.append(1.0)
					else:
						wrongUntransferLabelNum += 1.0
						transferFlagList0.append(0.0)

					if weakLabel1 == exLabel:
						correctUntransferLabelNum += 1.0
						transferFlagList1.append(1.0)
					else:
						wrongUntransferLabelNum += 1.0
						transferFlagList1.append(0.0)

					auditorAcc = self.getAuditorMetric(transferFeatureList, transferFlagList0, transferFlagList1, targetNameFeatureTest, transferLabelTest, targetLabelTest)
					print("auditorAcc", auditorAcc)

					auditorAccList.append(auditorAcc)

				labeledExList.append(exId)
				unlabeledExList.remove(exId)

				acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
				totalAccList[cvIter].append(acc)
				if activeLabelFlag:
					humanAccList[cvIter].append(acc)
				queryIter += 1

			totalAuditorAccList.append(auditorAccList)

			transferLabelNum = len(self.m_weakLabeledIDList)
			totalTransferNumList.append(transferLabelNum)

			cvIter += 1      
		
		print("transfer num\t", np.mean(totalTransferNumList), np.sqrt(np.var(totalTransferNumList)))

		AuditorAccFile = modelVersion+"_auditor_acc.txt"
		writeFile(totalAuditorAccList, AuditorAccFile)

		totalACCFile = modelVersion+"_acc.txt"
		writeFile(totalAccList, totalACCFile)

		humanACCFile = modelVersion+"_human_acc.txt"
		writeFile(humanAccList, humanACCFile)