# AGE # FILL OUT AGE VIA LINEAR REGRESSION # CREATE THE TRAINING SETS AND SET SEX TO BE 0 OR 1 X_train_age = train.dropna(subset=['Age']).drop( ['Cabin', 'Age', 'Name', 'Ticket', 'Embarked'], axis=1) X_train_age['Sex'] = X_train_age['Sex'].map({'male': 0, 'female': 1}) y_train_age = train.dropna(subset=['Age'])['Age'] # PREPARE THE PREDICTION SET AND SET SEX TO BE 0 OR 1 X_pred_age = train[np.invert(train.index.isin(X_train_age.index))].drop( ['Cabin', 'Age', 'Name', 'Ticket', 'Embarked'], axis=1) X_pred_age['Sex'] = X_pred_age['Sex'].map({'male': 0, 'female': 1}) # CREATE AND FIT THE MODEL lm = LR() lm.fit(X_train_age, y_train_age) # PREDICT AGES AND INSERT train.loc[np.isnan(train['Age']), 'Age'] = lm.predict(X_pred_age) # IMPUTANCE def impute_age(cols): Age = cols[0] Pclass = cols[1] if pd.isnull(Age): if Pclass == 1: return 37 elif Pclass == 2:
def calculate_probability_distribution(tree, instances, index, cal_method=None):
    """Return the (optionally calibrated) class distribution for one instance.

    Parameters
    ----------
    tree : classifier exposing ``distribution_for_instance`` (weka-style).
    instances : dataset exposing ``num_instances``, ``get_instance`` and
        iteration over instances (weka-style).
    index : int
        Position of the instance to score inside ``instances``.
    cal_method : None | 'Platt' | 'Isotonic' | 'ICP' | 'Venn1'
        Calibration method; ``None`` returns the raw tree distribution.

    Returns
    -------
    A two-element probability distribution, except 'ICP' which is not
    implemented (falls through and returns None).
    """
    if cal_method is None:  # FIX: idiomatic identity test (was `== None`)
        return tree.distribution_for_instance(instances.get_instance(index))
    elif cal_method == 'Platt':
        # Build a signed-margin training set from the tree's own scores.
        p_train = np.zeros(shape=(instances.num_instances, 1))
        y_train = np.zeros(shape=(instances.num_instances, 1))
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            p_train[i] = [(dist[1] - 0.5) * 2.0]  # map P(1) from [0,1] to [-1,1]
            y_train[i] = [instance.get_value(instance.class_index)]
        dist = (tree.distribution_for_instance(instances.get_instance(index))[1] - 0.5) * 2.0
        tmp = np.zeros(shape=(1, 1))
        tmp[0] = [dist]
        print(np.sum(y_train))
        if np.sum(y_train) in [len(y_train), 0]:
            # Degenerate fold: only one class present, logistic fit would fail,
            # so fall back to the uncalibrated tree distribution.
            print("all one class")
            for ins in instances:
                print("ins ===> ", ins)
            return tree.distribution_for_instance(instances.get_instance(index))
        else:
            warnings.filterwarnings("ignore", category=FutureWarning)
            lr = LR(solver='lbfgs')
            lr.fit(p_train, np.ravel(y_train, order='C'))
            return lr.predict_proba(tmp.reshape(1, -1))[0]
    elif cal_method == 'Isotonic':
        # Same scheme as Platt but fitting isotonic regression on raw P(1).
        p_train = np.zeros(shape=(instances.num_instances, 1))
        y_train = np.zeros(shape=(instances.num_instances, 1))
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            p_train[i] = [dist[1]]
            y_train[i] = [instance.get_value(instance.class_index)]
        dist = tree.distribution_for_instance(instances.get_instance(index))[1]
        tmp = np.zeros(shape=(1, 1))
        tmp[0] = [dist]
        print(np.sum(y_train))
        if np.sum(y_train) in [len(y_train), 0]:
            print("all one class")
            for ins in instances:
                print("ins ===> ", ins)
            return tree.distribution_for_instance(instances.get_instance(index))
        else:
            ir = IR(out_of_bounds='clip')
            ir.fit(np.ravel(p_train, order='C'), np.ravel(y_train, order='C'))
            p = ir.transform(np.ravel(tmp, order='C'))[0]
            # NOTE(review): `p` is the isotonic estimate of the positive class,
            # yet it is returned in slot 0 while the Platt branch returns
            # [P(0), P(1)] — the order looks inverted; confirm with callers.
            return [p, 1 - p]
    # elif cal_method == 'ProbabilityCalibrationTree' :
    #     pass
    elif cal_method == 'ICP':
        pass  # inductive conformal prediction: not implemented
    elif cal_method == 'Venn1':
        # Venn-ABERS calibration on the max-class score.
        calibrPts = []
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            score = dist[0] if dist[1] < dist[0] else dist[1]
            calibrPts.append(((score), instance.get_value(instance.class_index)))
        dist = (tree.distribution_for_instance(instances.get_instance(index)))
        score = dist[0] if dist[1] < dist[0] else dist[1]
        tmp = [score]
        p0, p1 = VennABERS.ScoresToMultiProbs(calibrPts, tmp)
        print("Vennnnnn =========>>>>>>>>>>>> ", p0, " , ", p1)
        return [p0, p1]
    pass
# Fragment: derives an FPR@1e-4 distance threshold and a Platt-scaling
# coefficient from `mate_dists` / `nonmate_dists` (and `net`, `LR`) defined
# earlier in the file.
thresholds = np.concatenate([mate_dists, nonmate_dists])
thresholds.sort()
thresholds = np.insert(thresholds, 0, 0)  # add 0 threshold
thresholds = np.around(thresholds, 4)
thresholds = np.unique(thresholds)
# False positives: non-mate pairs accepted at each candidate threshold.
fp = np.sum(nonmate_dists[:, np.newaxis] <= thresholds[np.newaxis, :], axis=0)
# FIX: np.float / np.int were deprecated aliases of the Python builtins and
# were removed in NumPy 1.24 — use the builtins (numerically identical).
fpr = fp.astype(float) / len(nonmate_dists)
# Operating point: the threshold whose FPR is closest to 1e-4.
chosen_index = np.argmin(abs(fpr - 1e-4))
thresh = thresholds[chosen_index]
tp = np.sum(mate_dists[:, np.newaxis] <= thresholds[np.newaxis, :], axis=0)
tpr = tp.astype(float) / len(mate_dists)
# Platt scaling: logistic fit without intercept on threshold-centred distances.
lr = LR(fit_intercept=False)
dists = np.concatenate([mate_dists, nonmate_dists]) - thresh
# y = classification where 1 is nonmate
y = np.ones(dists.shape, dtype=int)
y[:len(mate_dists)] = 0
lr.fit(dists[:, np.newaxis], y)
# Prob = 1 / (1 + exp(- alpha * dist))
alpha = lr.coef_[0, 0]
print("\nNet %s threshold=%f, \tplatt's scaling=%f" % (
    net, thresh, alpha,
    # lr.intercept_
))
# Logistic-regression demo on the telco churn spreadsheet.
import numpy as np
import pandas as pd
import sys
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

filename = 'telco.xls'
data = pd.read_excel(filename)
data.head()
# First 37 columns are features, column 37 is the target.
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 — this code
# requires an old pandas (modern equivalent is .values / .to_numpy()).
x = data.iloc[:, :37].as_matrix()
y = data.iloc[:, 37].as_matrix()
from sklearn.linear_model import LogisticRegression as LR
lr = LR()  # build the logistic regression model
lr.fit(x, y)  # train the model on the feature data
print(u'逻辑回归模型训练结束。')
print(u'模型的平均正确率为:%s' % lr.score(x, y))  # mean accuracy (77.8% in this example)


def cm_plot(y, yp):
    # Plot the confusion matrix of true labels `y` vs predictions `yp`.
    # NOTE(review): definition is truncated in this chunk.
    from sklearn.metrics import confusion_matrix  # import the confusion-matrix helper
    cm = confusion_matrix(y, yp)  # confusion matrix
    import matplotlib.pyplot as plt  # import the plotting library
    plt.matshow(cm, cmap=plt.cm.Greens)  # draw it with the cm.Greens colormap
    plt.colorbar()  # colour bar legend
    for x in range(len(cm)):  # per-cell count labels
# NOTE(review): this chunk begins mid-statement — `marker=m)` closes a
# plt.scatter(...) call whose opening lives in an earlier chunk.
            marker=m)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.title('Linear Discriminant Analysis - 2 discriminants')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()
pause()
# let's diverge a bit from the book and run Logistic Regression on the
# transformed data set and test set.
# Force the LDA projection into an explicit 2-D array before fitting.
A = X_train_lda.shape
X_train_lda = X_train_lda.reshape(A[0], A[1])
lr = LR(multi_class='ovr', solver='lbfgs', C=.05)
lr.fit(X_train_lda, y_train)
plot_decision_regions(X_train_lda, y_train, classifier=lr)
plt.xlabel('LD 1')
plt.ylabel('LD 2')
plt.xlim((-3, 3))
plt.ylim((-3, 3))
plt.title('Logistic Regression with LDA k = 2 wine data set')
plt.legend(loc='lower left')
plt.tight_layout()
plt.show()
pause()
# now use the test data to predict and compare to the class
# Fragment: compares Ridge / Lasso / LinearRegression / RandomForest scores
# on `X_train`/`X_test`/`y_train`/`y_test` defined earlier in the file.
ridge = Ridge(alpha=0.0001).fit(X_train, y_train)
print("Ridge Score train set : {}".format(ridge.score(X_train, y_train.ravel())))
print("Ridge Score test set : {}\n ".format(ridge.score(X_test, y_test)))
lasso = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Lasso 0.01 Score train set : {}".format(lasso.score(X_train, y_train)))
print("Lasso 0.01 Score test set : {}\n ".format(lasso.score(X_test, y_test)))
lasso_ = Lasso(alpha=0.00001, max_iter=100000).fit(X_train, y_train)
print("Lasso 0.00001 Score train set : {}".format(
    lasso_.score(X_train, y_train)))
print("Lasso 0.00001 Score test set : {}\n ".format(
    lasso_.score(X_test, y_test)))
LinReg = LR().fit(X_train, y_train)
print("Linear Regression Train set : {}".format(LinReg.score(X_train, y_train)))
print("Linear Regression Test set : {}\n ".format(LinReg.score(
    X_test, y_test)))
n_esti = 100
forest = RandomForestRegressor(n_estimators=n_esti, random_state=0)
forest.fit(X_train, y_train)
print("Forest n_esti {} Score train set : {}".format(
    n_esti, forest.score(X_train, y_train)))
print("Forest n_esti {} Score test set : {}\n ".format(
    n_esti, forest.score(X_test, y_test)))
# NOTE(review): `lr` here is a learning rate for the gradient-boosted model,
# not a LogisticRegression instance as elsewhere in this file.
lr, max_depth = 0.1, 5
# NOTE(review): this call is truncated in the visible source; the remaining
# keyword arguments continue past this chunk.
gbrt_mdlow = GradientBoostingRegressor(random_state=0,
# Fragment: persists the working DataFrame, reloads the prediction set and
# instantiates the candidate classifiers (helpers `initial_df`,
# `bin_df_get_y` and the LDA/LR/KNNc/RdC/RFC aliases are defined earlier).
df.to_csv(f'../data/{fn}_all.csv')
gc.collect()
df = initial_df('../data/use_for_predictions.csv')
df, y = bin_df_get_y(df)
# Keep only the modelling feature columns, in a fixed order.
ac = ['diff', 'color', 'time', 'game_time', 'weekday', 'elo', 'opp_elo',
      'game_num']
df = df[ac].copy()
X = df.values
# Linear Discriminant Analysis
ld_cls = LDA(solver='lsqr')
# Logistic Regression
lr_cls = LR(C=0.01, max_iter=50, tol=7.5e-3, class_weight=None,
            solver='saga', random_state=5, multi_class='ovr')
# KNeighbors Classifier
kn_cls = KNNc(n_neighbors=41, weights='uniform', algorithm='brute',
              metric='chebyshev')
# Ridge Classifier
rd_cls = RdC(fit_intercept=False, class_weight=None, solver='lsqr',
             random_state=5)
# Random Forest Classifier
rf_cls = RFC(n_estimators=200, max_depth=10, min_samples_split=2,
             min_samples_leaf=3, max_features=None, class_weight=None,
             criterion='entropy', random_state=5)
# Extra Trees Classifier
# Fragment: binarise the targets, shuffle, standardise and fit logistic
# regression (plain + CV). Relies on `num`, `tmp`, `tmpTest`, `x`, `xTest`,
# `preprocessing`, `LR`, `LRCV` defined earlier in the file.
y = np.zeros(num)
yTest = np.zeros(num)
# Threshold the continuous targets at 0.5 to obtain binary labels.
for i in range(tmp.size):
    if tmp[i] >= 0.5:
        y[i] = 1
    if tmpTest[i] >= 0.5:
        yTest[i] = 1
# Shuffle x and y in the same order by restoring the RNG state in between.
state = np.random.get_state()
np.random.shuffle(x)
np.random.set_state(state)
np.random.shuffle(y)
# FIX: the test set was standardised with its *own* scaler (fit on xTest),
# leaking test statistics and making train/test features inconsistent.
# Reuse the scaler fitted on the training data for both sets.
scaler = preprocessing.StandardScaler().fit(x)
xscaled = scaler.transform(x)
xTestScaled = scaler.transform(xTest)
# FIX: max_iter must be an int (recent sklearn rejects the float 1e5).
lr = LR(solver='newton-cg', penalty='l2', max_iter=100000, tol=1e-5)
lrcv = LRCV(solver='newton-cg', penalty='l2', max_iter=100000, tol=1e-5)
lrcv.fit(xscaled, y)
yPred = lrcv.predict(xscaled)
yTestPred = lrcv.predict(xTestScaled)
acc1 = (yPred == y).sum() / yPred.size
acc2 = (yTestPred == yTest).sum() / yTestPred.size
print('LR train acc is {:.4f}'.format(acc1))  # training accuracy
print('LR test acc is {:.4f}'.format(acc2))  # test accuracy
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as LR

# Load the regression target series from disk.
scores = pd.read_csv("bias_score_reg.csv")["x-values"].to_numpy()
# Regress the scores against their positional index: a simple trend fit
# whose slope (coef_) measures drift over time.
x_data = np.arange(len(scores)).reshape(-1, 1)
y_data = scores.reshape(-1, 1)
bias_model = LR()
bias_model.fit(x_data, y_data)
print(bias_model.coef_)
# Fragment: t-SNE visualisation followed by RFE-based feature reduction with
# several candidate estimators (`tsne_visual`, `data`, `targets`, RFC/GBC/LR
# aliases and xgb are defined earlier in the file).
tsne_data = tsne_visual(tsne_data)
plt.show()
#2. Dimension reduction with various models: RF; LR; XGBOOST; GradB; All w/ RFE as it is more conservative
data = data.iloc[:, :-1]  # drop the label column from the feature frame
X_train, X_test, y_train, y_test = train_test_split(data, targets,
                                                    test_size=0.15)
#Instantiating all models here with some basic values
forest = RFC(n_estimators=250, random_state=42)
gbc = GBC(n_estimators=250, random_state=42)
xgbc = xgb.XGBClassifier(objective='reg:logistic', n_estimators=250, seed=42)
logit = LR(solver='lbfgs', max_iter=300, random_state=42)


def model_reduce(estimator, n_features, X, y, verbose=1):
    # Run recursive feature elimination with `estimator` and return the
    # boolean mask of the `n_features` columns RFE kept.
    rfe = RFE(estimator=estimator, n_features_to_select=n_features)
    rfe.fit(X, y)
    rf_mask = rfe.support_
    if verbose == 1:
        # Optionally report the per-feature rankings.
        rfe_best_features(estimator, X, rfe)
    else:
        pass
    return rf_mask


def rfe_best_features(model, data, rfe):
    # NOTE(review): this definition is truncated in the visible source.
    '''Lower ranking= Better'''
        # NOTE(review): these three lines are the tail of a score() method of
        # a hand-rolled LogisticRegression class whose def lives in an
        # earlier chunk; `score_sup` is an external scoring helper.
        label_ = self.predict(x)
        score_ = score_sup(label_, y)
        return score_


X, y = load_data()
y = y[:, np.newaxis]  # column vector for the custom implementation
# Disabled train/test split sketch kept by the original author (a bare
# string literal, i.e. a no-op at runtime).
"""
选取部分作为测试集
flag_choose=np.arange(np.shape(y)[0])
flag_train=(flag_choose%8!=0)
flag_test=(flag_choose%8==0)
"""
# Fit the hand-rolled model several times to observe run-to-run variation
# of the learned weights (batch_size=1 => stochastic updates).
for i in range(5):
    model = LogisticRegression(learning_rate=0.01, itr=200, batch_size=1,
                               verbose=False)
    model.fit(X, y)
    print(i, "model's weights: ", model.weights.ravel())
    #print(i,"model's precision : ",model.score(X[flag_test],y[flag_test]))
# Reference fit with scikit-learn for comparison.
model_sklearn = LR(C=1)
model_sklearn.fit(X, y.ravel())
print("sklearn weights: ", model_sklearn.coef_, model_sklearn.intercept_)
#print("sklearn's precision :",model_sklearn.score(X[flag_test],y[flag_test].ravel()))
if key[0] == i: count += 1 print("Variable %s has %s features" %(train.columns[i+1], count)) featureField.append([i+2, count]) # get one-hot encoded train, dev and test sets OneHotTrainNaive = oneHotEncoding(train.iloc[:,1:-1], featureMap) OneHotDevNaive = oneHotEncoding(dev.iloc[:,1:-1], featureMap) OneHotTestNaive = oneHotEncoding(test.iloc[:,1:], featureMap) OneHotTrainNaive = pd.concat([OneHotTrainNaive, train.iloc[:,-1]], axis=1) OneHotDevNaive = pd.concat([OneHotDevNaive, dev.iloc[:,-1]], axis=1) OneHotTestNaive = pd.concat([OneHotTestNaive], axis=1) # fitting regression model with sklearn linearReg = LR() linearReg.fit(OneHotTrainNaive.iloc[:,:-1],OneHotTrainNaive.iloc[:,-1]) devPred = linearReg.predict(OneHotDevNaive.iloc[:,:-1]) rmsleVan = rmse(devPred,OneHotDevNaive.iloc[:,-1]) print('\nRoom Mean Square Log Error for Naive implementations: %s' %(rmsleVan)) # get top 10 positive and negative features coeff = linearReg.coef_ topFeat = np.argsort(coeff)[-10:] bottomFeat = np.argsort(coeff)[:10] print("\nTop 10 Positive Features:") for x in topFeat: print("Variable: %s, Value: %s" %(train.columns[featureReMap[x][0]+1], featureReMap[x][1])) print("\nTop 10 Negative Features:")
# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as LR
from sklearn.feature_extraction.text import TfidfVectorizer


def clean(text):
    """Strip markup from a raw review and normalise it to lower-cased text."""
    # NOTE(review): `html` is assumed to be lxml.html imported earlier in the
    # file (not visible in this chunk) — confirm.
    return html.fromstring(text).text_content().lower().strip()


# Train a TF-IDF + logistic-regression sentiment model on the Kaggle IMDB
# labelled set, then write the test-set P(positive) out as test.csv.
tr_data = pd.read_csv('/media/datasets/kaggle_imdb/labeledTrainData.tsv',
                      delimiter='\t')
te_data = pd.read_csv('/media/datasets/kaggle_imdb/testData.tsv',
                      delimiter='\t')
trX = [clean(text) for text in tr_data['review'].values]
trY = tr_data['sentiment'].values
vect = TfidfVectorizer(min_df=10, ngram_range=(1, 2))
trX = vect.fit_transform(trX)
model = LR()
model.fit(trX, trY)
ids = te_data['id'].values
teX = [clean(text) for text in te_data['review'].values]
teX = vect.transform(teX)  # transform only: reuse the training vocabulary
pr_teX = model.predict_proba(teX)[:, 1]
pd.DataFrame(np.asarray([ids, pr_teX]).T).to_csv('test.csv', index=False,
                                                 header=["id", "sentiment"])
def main(args):
    """Build the TF1 adversarial-sticker graph and run the iterative attack.

    Constructs (1) a differentiable sticker projection/placement pipeline,
    (2) a frozen face-embedding graph loaded from ``args.model``, and then
    optimises the sticker image with signed-gradient descent to minimise
    cosine similarity to an anchor embedding. Writes progress plots and the
    final sticker to disk. Relies on module-level helpers ``projector``,
    ``stn``, ``TVloss``, ``prep`` and imports (tf, np, io, rescale, tqdm,
    plt, time, datetime, LR) defined elsewhere in the file.
    """
    print(args)
    now = str(datetime.datetime.now())  # timestamp used to name output files
    sess = tf.Session()
    # Off-plane sticker projection
    logo = tf.placeholder(tf.float32, shape=[None, 400, 900, 3], name='logo_input')
    param = tf.placeholder(tf.float32, shape=[None, 1], name='param_input')
    ph = tf.placeholder(tf.float32, shape=[None, 1], name='ph_input')
    result = projector(param, ph, logo)
    # Union of the sticker and face image
    mask_input = tf.placeholder(tf.float32, shape=[None, 900, 900, 3], name='mask_input')
    face_input = tf.placeholder(tf.float32, shape=[None, 600, 600, 3], name='face_input')
    theta = tf.placeholder(tf.float32, shape=[None, 6], name='theta_input')
    prepared = stn(result, theta)  # spatial transform places the sticker
    # Transformation to ArcFace template
    theta2 = tf.placeholder(tf.float32, shape=[None, 6], name='theta2_input')
    # Blend sticker into the face region selected by the mask.
    united = prepared[:, 300:, 150:750] * mask_input[:, 300:, 150:750] + \
        face_input * (1 - mask_input[:, 300:, 150:750])
    final_crop = tf.clip_by_value(stn(united, theta2, (112, 112)), 0., 1.)
    # TV loss and gradients
    w_tv = tf.placeholder(tf.float32, name='w_tv_input')
    tv_loss = TVloss(logo, w_tv)  # smoothness regulariser on the sticker
    grads_tv = tf.gradients(tv_loss, logo)
    grads_input = tf.placeholder(tf.float32, shape=[None, 112, 112, 3], name='grads_input')
    # Back-propagate externally supplied crop gradients onto the sticker.
    grads1 = tf.gradients(final_crop, logo, grad_ys=grads_input)

    # Varios images generator
    class Imgen(object):
        # Produces composited face+sticker crops, either with the fixed
        # nominal placement or with randomised placement jitter (EOT).
        def __init__(self):
            self.fdict = {ph: [[args.ph]],
                          logo: np.ones((1, 400, 900, 3)),
                          param: [[args.param]],
                          theta: 1. / args.scale * np.array([[1., 0., -args.x / 450., 0., 1., -args.y / 450.]]),
                          theta2: [[1., 0., 0., 0., 1., 0.]],
                          w_tv: args.w_tv}
            # Run once with an all-ones logo to obtain the placement mask.
            mask = sess.run(prepared, feed_dict=self.fdict)
            self.fdict[mask_input] = mask

        def gen_fixed(self, im, advhat):
            # Deterministic composite at the nominal placement.
            self.fdict[face_input] = np.expand_dims(im, 0)
            self.fdict[logo] = np.expand_dims(advhat, 0)
            return self.fdict, sess.run(final_crop, feed_dict=self.fdict)

        def gen_random(self, im, advhat, batch=args.batch_size):
            # Batch of composites with random rotation/scale/offset jitter.
            alpha1 = np.random.uniform(-1., 1., size=(batch, 1)) / 180. * np.pi
            scale1 = np.random.uniform(args.scale - 0.02, args.scale + 0.02, size=(batch, 1))
            y1 = np.random.uniform(args.y - 600. / 112., args.y + 600. / 112., size=(batch, 1))
            x1 = np.random.uniform(args.x - 600. / 112., args.x + 600. / 112., size=(batch, 1))
            alpha2 = np.random.uniform(-1., 1., size=(batch, 1)) / 180. * np.pi
            scale2 = np.random.uniform(1. / 1.04, 1.04, size=(batch, 1))
            y2 = np.random.uniform(-1., 1., size=(batch, 1)) / 66.
            angle = np.random.uniform(args.ph - 2., args.ph + 2., size=(batch, 1))
            parab = np.random.uniform(args.param - 0.0002, args.param + 0.0002, size=(batch, 1))
            fdict = {ph: angle, param: parab, w_tv: args.w_tv,
                     theta: 1. / scale1 * np.hstack([np.cos(alpha1), np.sin(alpha1), -x1 / 450.,
                                                     -np.sin(alpha1), np.cos(alpha1), -y1 / 450.]),
                     theta2: scale2 * np.hstack([np.cos(alpha2), np.sin(alpha2), np.zeros((batch, 1)),
                                                 -np.sin(alpha2), np.cos(alpha2), y2]),
                     logo: np.ones((batch, 400, 900, 3)),
                     face_input: np.tile(np.expand_dims(im, 0), [batch, 1, 1, 1])}
            mask = sess.run(prepared, feed_dict=fdict)
            fdict[mask_input] = mask
            fdict[logo] = np.tile(np.expand_dims(advhat, 0), [batch, 1, 1, 1])
            return fdict, sess.run(final_crop, feed_dict=fdict)

    gener = Imgen()

    # Initialization of the sticker: mid-grey by default, optionally warmed
    # up to resemble a given face image or loaded from an existing logo.
    init_logo = np.ones((400, 900, 3)) * 127. / 255.
    if args.init_face != None:
        init_face = io.imread(args.init_face) / 255.
        init_loss = tv_loss + tf.reduce_sum(tf.abs(init_face - united[0]))
        init_grads = tf.gradients(init_loss, logo)
        init_logo = np.ones((400, 900, 3)) * 127. / 255.
        fdict, _ = gener.gen_fixed(init_face, init_logo)
        moments = np.zeros((400, 900, 3))
        print('Initialization from face, step 1/2')
        # Coarse pass: large signed-gradient steps with momentum.
        for i in tqdm(range(500)):
            fdict[logo] = np.expand_dims(init_logo, 0)
            grads = moments * 0.9 + sess.run(init_grads, feed_dict=fdict)[0][0]
            moments = moments * 0.9 + grads * 0.1
            init_logo = np.clip(init_logo - 1. / 51. * np.sign(grads), 0., 1.)
        print('Initialization from face, step 2/2')
        # Fine pass: smaller step size for refinement.
        for i in tqdm(range(500)):
            fdict[logo] = np.expand_dims(init_logo, 0)
            grads = moments * 0.9 + sess.run(init_grads, feed_dict=fdict)[0][0]
            moments = moments * 0.9 + grads * 0.1
            init_logo = np.clip(init_logo - 1. / 255. * np.sign(grads), 0., 1.)
        io.imsave(now + '_init_logo.png', init_logo)
    elif args.init_logo != None:
        init_logo[:] = io.imread(args.init_logo) / 255.

    # Embedding model: load the frozen face-recognition graph.
    with tf.gfile.GFile(args.model, "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        tf.import_graph_def(graph_def, input_map=None, return_elements=None, name="")
    image_input = tf.get_default_graph().get_tensor_by_name('input:0')
    embedding = tf.get_default_graph().get_tensor_by_name('embeddings:0')
    phase_train_placeholder = tf.placeholder_with_default(tf.constant(
        False, dtype=tf.bool), shape=None, name='phase_train')
    orig_emb = tf.placeholder(tf.float32, shape=[None, 128], name='orig_emb_input')
    # Cosine similarity to the anchor embedding — the quantity to minimise.
    cos_loss = tf.reduce_sum(tf.multiply(embedding, orig_emb), axis=1)
    grads2 = tf.gradients(cos_loss, image_input)
    fdict2 = {phase_train_placeholder: False}

    # Anchor embedding calculation: from an anchor face image, a saved
    # embedding file, or (by default) the attacked image itself.
    if args.anchor_face != None:
        print(io.imread(args.anchor_face).shape)
        anch_im = rescale(io.imread(args.anchor_face) / 255., 112. / 600.,
                          order=5, multichannel=True)
        print((io.imread(args.anchor_face) / 255.).shape)
        fdict2[image_input] = prep(anch_im)
        fdict2[orig_emb] = sess.run(embedding, feed_dict=fdict2)
    elif args.anchor_emb != None:
        fdict2[orig_emb] = np.load(args.anchor_emb)[-1:]
    else:
        anch_im = rescale(io.imread(args.image) / 255., 112. / 600., order=5)
        fdict2[image_input] = prep(anch_im)
        fdict2[orig_emb] = sess.run(embedding, feed_dict=fdict2)

    # Attack constants
    im0 = io.imread(args.image) / 255.
    regr = LR(n_jobs=4)  # linear fit over recent losses detects plateaus
    regr_len = 100
    regr_coef = -1.
    moments = np.zeros((400, 900, 3))
    moment_val = 0.9
    step_val = 1. / 51.
    stage = 1
    step = 0
    lr_thresh = 100
    ls = []
    t = time()
    while True:
        # Projecting sticker to the face and feeding it to the embedding model
        fdict, ims = gener.gen_random(im0, init_logo)
        fdict2[image_input] = prep(ims)
        grad_tmp = sess.run(grads2, feed_dict=fdict2)
        # Track the loss on the deterministic (fixed-placement) composite.
        fdict_val, im_val = gener.gen_fixed(im0, init_logo)
        fdict2[image_input] = prep(im_val)
        ls.append(sess.run(cos_loss, feed_dict=fdict2)[0])
        # Gradients to the original sticker image
        fdict[grads_input] = grad_tmp[0]
        grads_on_logo = np.mean(sess.run(grads1, feed_dict=fdict)[0], 0)
        grads_on_logo += sess.run(grads_tv, feed_dict=fdict)[0][0]
        moments = moments * moment_val + grads_on_logo * (1. - moment_val)
        init_logo -= step_val * np.sign(moments)
        init_logo = np.clip(init_logo, 0., 1.)
        # Logging
        step += 1
        if step % 20 == 0:
            print('Stage:', stage, 'Step:', step, 'Av. time:',
                  round((time() - t) / step, 2), 'Loss:', round(ls[-1], 2),
                  'Coef:', regr_coef)
        # Switching to the second stage: when the loss trend (slope of a
        # linear fit over the last 100 values) stops decreasing, either
        # switch to finer steps (stage 2) or terminate.
        if step > lr_thresh:
            regr.fit(np.expand_dims(np.arange(100), 1), np.hstack(ls[-100:]))
            regr_coef = regr.coef_[0]
            if regr_coef >= 0:
                if stage == 1:
                    stage = 2
                    moment_val = 0.995
                    step_val = 1. / 255.
                    step = 0
                    regr_coef = -1.
                    lr_thresh = 200
                    t = time()
                else:
                    break
    plt.plot(range(len(ls)), ls)
    plt.savefig(now + '_cosine.png')
    io.imsave(now + '_advhat.png', (init_logo * 255.).astype(np.uint8))
# In[8]: train_data = vectorizer.fit_transform(trn) print(train_data.shape) # In[9]: dev_data = vectorizer.transform(dev) print(dev_data.shape) test_data = vectorizer.transform(tst) print(test_data.shape) # In[11]: classifier = LR() classifier.fit(train_data, trn_label_int) # In[12]: Train_accuracy = classifier.score(train_data, trn_label_int) Dev_accuracy = classifier.score(dev_data, dev_label_int) # In[19]: Train_accuracy * 100 # In[20]: Dev_accuracy * 100
# Python 2 script: TF-IDF features + one-vs-rest logistic regression with
# 3-fold cross-validation (note the `print` statements and the long-removed
# sklearn.cross_validation module — this requires Python 2 and sklearn <0.20).
from pre_processing import PreProcess
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression as LR
import numpy as np
import matplotlib.pyplot as plt
from operator import add

preprocess = PreProcess("data/train", "data/test")
preprocess.read_train_test_data()
preprocess.getTfIdf()
#preprocess.add_pos_neg_feature()
#preprocess.polarity_POS_features()
softmax_clf = LR(multi_class='ovr', C=4)
# Cross-validated accuracy estimate on the training TF-IDF matrix.
scores = cross_val_score(softmax_clf, preprocess.traintfIdf,
                         preprocess.train_target, cv=3)
print "the cross validated accuracy on training is " + str(scores)
print("the cross validated accuracy(standard deviation) on training is: %0.4f (+/- %0.4f)" % (
    scores.mean(), scores.std() * 2))
# Final fit on all training data, then predict both splits.
softmax_clf.fit(preprocess.traintfIdf, preprocess.train_target)
train_pred_softmax = softmax_clf.predict(preprocess.traintfIdf)
test_pred_softmax = softmax_clf.predict(preprocess.testtfIdf)
# wrong_pred = np.where(preprocess.test_target!=test_pred_softmax)
# np.savetxt("data/softmax_wrong.dat", wrong_pred, delimiter=',', fmt="%d")
# c = test_pred_softmax!=preprocess.test_target
# print np.where(c==True)
profit : float """ print('\t{:.2f} ->\t{:.5f}'.format(ratio, profit)) X_train, X_test, y_train, y_test = get_train_test(filepath) model.fit(X_train, y_train) y_predict = model.predict(X_test) confusion_mat = standard_confusion_matrix(y_test, y_predict) profit = np.sum(confusion_mat * cost_benefit) / len(y_test) original_ratio = np.mean(y_train) print('Profit from original ratio:') print_ratio_profit(original_ratio, profit) for sampling_techinque, name in zip( [undersample, oversample, smote], ['undersampling', 'oversampling', 'smoting']): print('Profit when {} to ratio of:'.format(name)) for ratio in np.arange(*range_params): X_sampled, y_sampled = sampling_techinque(X_train, y_train, ratio) model.fit(X_sampled, y_sampled) y_predict = model.predict(X_test) confusion_mat = standard_confusion_matrix(y_test, y_predict) profit = np.sum(confusion_mat * cost_benefit) / float(len(y_test)) print_ratio_profit(ratio, profit) if __name__ == '__main__': churn_filepath = './data/churn.csv' cost_benefit = np.array([[79, -20], [0, 0]]) profit_curve_main(churn_filepath, cost_benefit) sampling_main(LR(), churn_filepath, cost_benefit)
# NOTE(review): this chunk begins mid-statement — `5, alphas)` closes a
# MultinomialNB.tune_hyperparams(...) call whose opening (producing
# scores_20news) lives in an earlier chunk.
                                               5, alphas)
index, max_acc = max(enumerate(scores_20news), key=operator.itemgetter(1))
best_alpha_20news = alphas[index]  # alpha with the best CV accuracy
print(best_alpha_20news)
# IMDB dataset
scores_imdb = MultinomialNB.tune_hyperparams(X_train_imdb, y_train_imdb,
                                             5, alphas)
index, max_acc = max(enumerate(scores_imdb), key=operator.itemgetter(1))
best_alpha_imdb = alphas[index]
print(best_alpha_imdb)
# hyperparameter tuning for Logistic Regression
# 20news dataset
# Randomised search over max_iter and C (10 draws, 5-fold CV).
rs = RandomizedSearchCV(
    LR(solver="lbfgs"),
    {
        "max_iter": np.arange(100, 500, 20),
        "C": [0.001, 0.01, 0.1, 1, 10, 100],
    },
    return_train_score=False,
    n_iter=10,
    cv=5,
)
rs.fit(X_train_20news, y_train_20news)
resuts = pd.DataFrame(rs.cv_results_)
print(resuts)
print("best parameters for logistic regression:", rs.best_params_)
print(SEPARATOR)
# 代码清单5-1 逻辑回归代码 import pandas as pd # 参数初始化 fileName = 'data/bankloan.xls' data = pd.read_excel(fileName) x = data.iloc[:, :8].as_matrix() y = data.iloc[:, 8].as_matrix() # 逻辑回归模型 from sklearn.linear_model import LogisticRegression as LR # 随机逻辑回归模型 from sklearn.linear_model import RandomizedLogisticRegression as RLR # 建立随机逻辑回归模型,筛选变量 rlr = RLR() # 训练模型 rlr.fit(x, y) # 获取特筛选结果,也可以通过.score_方法获取各个特征的分数 rlr.get_support() print(u'通过随机逻辑回归模型筛选特征结束。') print(u'有效特征为 %s' % '.'.join(data.columns[rlr.get_support()])) # 筛选好特征 x = data[data.columns[rlr.get_support()]].as_matrix() # 建立逻辑回归模型 lr = LR() # 用筛选后的特征数据来训练模型 lr.fit(x, y) print(u'逻辑回归模型训练结束。') # 给出模型的平均正确率,本例为81.48 print(u'模型的平均正确率为 %s' % lr.score(x, y))
# Notebook cells: plot the gradient-descent cost curve, then compare the
# learned line (`theta` from earlier cells) against sklearn's closed-form
# LinearRegression fit on the population/profit data.
plt.plot(Cost_i)
plt.xlim(0, 1500)
plt.ylabel('Cost J')
plt.xlabel('Iterations')
# In[10]:
# Evaluate the GD hypothesis on x = 1..24 (with a bias column).
xx = np.array(range(1, 25)).reshape([24, 1])
yy = np.c_[np.ones(xx.shape[0]), xx].dot(theta)
yy  # bare expression: displayed by the notebook, no-op as a script
plt.scatter(X[:, 1], y, c='r')
plt.plot(xx, yy, label='GD')
# Reference fit with sklearn on the single (non-bias) feature column.
regr = LR()
regr.fit(X[:, 1].reshape(-1, 1), y)
plt.plot(xx, regr.intercept_ + regr.coef_ * xx, label='LR')
plt.xlim(4, 24)
plt.xlabel('Population of City in 10,000s')
plt.ylabel('Profit in $10,000s')
plt.legend(loc=4)
# In[11]:
# Predict profit for a city with population of 35000 and 70000
print(theta.T.dot([1, 3.5]) * 10000)
print(theta.T.dot([1, 7]) * 10000)
# In[12]:
    def run_CV(self):
        """Run 10-fold cross-validated active learning and write accuracies.

        For each fold: fit a fresh LogisticRegression, seed the labelled pool
        via ``pretrainSelectInit``, then repeatedly (until ``rounds`` queries)
        refit, pick one unlabelled example with ``select_example``, move it to
        the labelled pool, and record test accuracy. Per-fold accuracy curves
        are written (tab-separated, one fold per line) to
        ``<modelVersion>_acc.txt`` under ``fileSrc``.

        Relies on instance state (``self.label``, ``self.fn`` — numpy arrays,
        ``self.m_multipleClass``) and on module-level ``rounds``,
        ``modelVersion`` and ``fileSrc``.
        """
        cvIter = 0
        totalInstanceNum = len(self.label)
        print("totalInstanceNum\t", totalInstanceNum)
        indexList = [i for i in range(totalInstanceNum)]
        totalTransferNumList = []
        # np.random.seed(3)
        random.shuffle(indexList)

        # Split the shuffled indices into 10 folds (the last fold absorbs
        # the remainder when the count is not divisible by 10).
        foldNum = 10
        foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
        foldInstanceList = []
        for foldIndex in range(foldNum - 1):
            foldIndexInstanceList = indexList[foldIndex * foldInstanceNum:(foldIndex + 1) * foldInstanceNum]
            foldInstanceList.append(foldIndexInstanceList)
        foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
        foldInstanceList.append(foldIndexInstanceList)
        # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
        cvIter = 0
        totalAccList = [[] for i in range(10)]
        totalNewClassFlagList = [[] for i in range(10)]
        for foldIndex in range(foldNum):
            # self.clf = LinearSVC(random_state=3)
            # self.m_clf = LR(random_state=3)
            # Fresh classifier per fold; multinomial solver for multi-class.
            if self.m_multipleClass:
                self.m_clf = LR(multi_class="multinomial", solver='lbfgs',
                                random_state=3, fit_intercept=False)
            else:
                self.m_clf = LR(random_state=3)

            # Train set = every fold except the held-out one.
            train = []
            for preFoldIndex in range(foldIndex):
                train.extend(foldInstanceList[preFoldIndex])
            test = foldInstanceList[foldIndex]
            for postFoldIndex in range(foldIndex + 1, foldNum):
                train.extend(foldInstanceList[postFoldIndex])

            trainNum = int(totalInstanceNum * 0.9)

            fn_test = self.fn[test]
            label_test = self.label[test]

            fn_train = self.fn[train]
            featureDim = len(fn_train[0])
            self.init_confidence_bound(featureDim)

            # Seed the labelled pool with the pre-training selection.
            initExList = []
            # initExList = [234, 366, 183]
            initExList = self.pretrainSelectInit(train, foldIndex)
            # initExList = [325, 287, 422]
            # random.seed(101)
            # initExList = random.sample(train, 3)
            fn_init = self.fn[initExList]
            label_init = self.label[initExList]
            print("initExList\t", initExList, label_init)

            queryIter = 3  # counter starts at 3: the seed already holds 3 labels
            labeledExList = []
            unlabeledExList = []
            ###labeled index
            labeledExList.extend(initExList)
            unlabeledExList = list(set(train) - set(labeledExList))

            # Active-learning loop: refit, query one example, score.
            while queryIter < rounds:
                fn_train_iter = []
                label_train_iter = []
                fn_train_iter = self.fn[labeledExList]
                label_train_iter = self.label[labeledExList]

                self.m_clf.fit(fn_train_iter, label_train_iter)

                idx = self.select_example(unlabeledExList)
                self.update_select_confidence_bound(idx)

                # print(queryIter, "idx", idx, self.label[idx])
                # self.update_select_confidence_bound(idx)

                labeledExList.append(idx)
                unlabeledExList.remove(idx)

                acc = self.get_pred_acc(fn_test, label_test, labeledExList)
                totalAccList[cvIter].append(acc)
                queryIter += 1

            cvIter += 1

        # Persist one accuracy curve per fold, tab-separated.
        totalACCFile = modelVersion + "_acc.txt"
        totalACCFile = os.path.join(fileSrc, totalACCFile)
        f = open(totalACCFile, "w")
        for i in range(10):
            totalAlNum = len(totalAccList[i])
            for j in range(totalAlNum):
                f.write(str(totalAccList[i][j]) + "\t")
            f.write("\n")
        f.close()
def optimCurveFit(strategy, method_clsf, ratio=0.8, NV_type='NVequals'):
    """Train and evaluate an ECG-segment identity classifier (Python 2).

    strategy    -- 'all_data' | 'allN_data' (normal beats only) |
                   'NV_data' (persons with both N and V beats) |
                   'combine_IDs' (merge every 4 records into one identity)
    method_clsf -- 'SVM' | 'Logit' | 'kNN' | 'DTC' | 'boosting' | 'GNB' | 'DL'
    ratio       -- train fraction used by the NV_data split
    NV_type     -- 'NVequals' (equal N/V train counts) or 'fixV'

    Returns [accuracy_on_V, accuracy_on_N, train_time] for 'NV_data',
    else [accuracy, train_time].
    """
    constrain_time = True
    ######################
    #TODO Step 1: Data input
    ######################
    data_set = 'mitdb'  # 'ecgiddb', 'mitdb'
    channel = 0
    records, IDs, fss, annss = mf.load_data(
        data_set, channel)  #, num_persons=60, record_time=20)
    fs = fss[0]  # assumes all records share one sampling rate — TODO confirm
    records = np.array(records)
    IDs = np.array(IDs)
    annss = np.array(annss)
    ######################

    ######################
    #TODO Step 2: Data selection
    ######################
    if (strategy == 'allN_data') or (strategy == 'all_data'):
        ''  # do nothing here
    elif strategy == 'NV_data':
        # Hand-picked record indices known to contain both N and V beats.
        NV_inds = [6, 15, 18, 23, 24, 26, 29, 31, 33, 35, 39, 41, 42, 46]
        #for i in NV_inds: #range(annss.shape[0]):
        #    # print i, Counter(annss[i][1])['V']
        records = records[NV_inds, :]
        IDs = IDs[NV_inds]
        annss = annss[NV_inds, :]
        ## re-numbering the IDs... wtf
        for i in range(len(NV_inds)):
            IDs[i] = i
    elif strategy == 'combine_IDs':
        # Give every group of 4 consecutive records the same identity.
        num_to_combine = 4
        print IDs
        for i in range(int(len(records) / num_to_combine)):
            for j in range(num_to_combine - 1):
                IDs[i * num_to_combine + j + 1] = IDs[i * num_to_combine + j]
                #IDs[i*2+1] = IDs[i*2]
        for i in range(len(IDs)):
            IDs[i] /= num_to_combine
    if constrain_time:
        # Keep only the first look_time seconds of every record.
        look_time = 600.  # in s
        look_ind = int(look_time * fs)
        records = records[:, :look_ind]
        annss = annss[:, :look_ind]
    recs = []
    for i in range(len(records)):
        curr_rec = Rec(records[i], fs, IDs[i], annss[i])
        recs.append(curr_rec)
    ######################

    ######################
    #TODO Step 3: Data filtering
    ######################
    ######################

    ######################
    #TODO Step 4: Data segmentation
    ######################
    USE_BIOSPPY_FILTERED = True
    # labels_bySegs entries look like "<id><mrk>", e.g. "12N" — last char
    # is the beat marker, the rest is the person id.
    sigs, labels_bySegs = mf.get_seg_data(records, IDs, fss,
                                          USE_BIOSPPY_FILTERED, annss=annss)
    sigs, labels_bySegs = np.array(sigs), np.array(labels_bySegs)
    mrks_bySegs = np.array([x[-1] for x in labels_bySegs])
    if strategy == 'allN_data':
        # Keep only normal ('N') beats.
        N_masks = (mrks_bySegs == 'N')
        sigs = sigs[N_masks, :]
        labels_bySegs = labels_bySegs[N_masks]
    IDs_bySegs = [int(x[:-1]) for x in labels_bySegs]
    mrks_bySegs = [x[-1] for x in labels_bySegs]
    IDs_bySegs, mrks_bySegs = np.array(IDs_bySegs), np.array(mrks_bySegs)
    segs = []
    for i in range(len(sigs)):
        curr_seg = Seg(sig=sigs[i], fs=fs, ID=IDs_bySegs[i],
                       mrk=mrks_bySegs[i])
        segs.append(curr_seg)
    segs = np.array(segs)
    ######################
    #for one_label in labels_all:
    #    if ('N' in one_label) or ('V' in one_label):
    #        print one_label
    #quit()
    #segs_all, labels_all = np.array(segs_all), np.array(labels_all)

    ######################
    #TODO Step 5: feature extraction
    ######################
    X_all = []
    y_all = []
    method_feat = 'PCA'  # 'template_matching'
    if method_feat == 'PCA':
        # Project every raw segment onto the first feat_dim PCA components
        # and cache the feature vector on the Seg object.
        feat_dim = 20
        pca = PCA(n_components=feat_dim)
        X_all = np.array([x.sig for x in segs])
        X_all = pca.fit(X_all).transform(X_all)
        for i in range(len(segs)):
            segs[i].feat = X_all[i, :]
    y_all = np.array([x.ID for x in segs])
    X_all = np.array(X_all)
    ######################

    ######################
    #TODO Step 6: Data split
    ######################
    if strategy != 'NV_data':
        X_train, X_test, y_train, y_test = train_test_split(
            X_all, y_all, test_size=0.2, random_state=42)
    else:
        # Per-person split controlling how many N and V beats go to train,
        # so V-beat generalization can be measured separately in Step 9.
        X_train, X_test, y_train, y_test = [], [], [], []
        y_test_mrks = []
        for i in range(len(NV_inds)):
            curr_mrks = mrks_bySegs[IDs_bySegs == i]  #current people's mrks\
            #print curr_mrks
            curr_segs = segs[IDs_bySegs == i]
            curr_labels = labels_bySegs[IDs_bySegs == i]
            curr_inds_Vs = np.where(curr_mrks == 'V')[0]
            curr_inds_Ns = np.where(curr_mrks == 'N')[0]
            curr_num_Vs = sum(np.array(curr_mrks) == 'V')  #all his Vs
            curr_num_Ns = sum(np.array(curr_mrks) == 'N')
            if NV_type == 'fixV':
                # 80% of V beats; N beats capped relative to V count.
                train_num_Vs = int(curr_num_Vs * .8)
                train_num_Ns = min(
                    [int(curr_num_Ns * .8), int(ratio * train_num_Vs)])
            elif NV_type == 'NVequals':
                # Equal numbers of N and V beats in training.
                train_num_Vs = int(curr_num_Vs * ratio)
                train_num_Ns = train_num_Vs
            train_inds_Vs = random.sample(curr_inds_Vs, train_num_Vs)
            test_inds_Vs = [
                x for x in curr_inds_Vs if not (x in train_inds_Vs)
            ]
            #test_inds_Vs = curr_inds_Vs[~ train_inds_Vs]
            train_inds_Ns = random.sample(curr_inds_Ns, train_num_Ns)
            test_inds_Ns = [
                x for x in curr_inds_Ns if not (x in train_inds_Ns)
            ]
            #print len(train_inds_Vs), len(test_inds_Vs)
            #print len(train_inds_Ns), len(test_inds_Ns)
            #test_inds_Ns = curr_inds_Vs[~ train_inds_Ns]
            # print train_inds_Ns
            # print test_inds_Ns
            curr_IDs = IDs_bySegs[IDs_bySegs == i]
            #print curr_IDs
            for one_seg in curr_segs[train_inds_Vs]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Vs]:
                y_train.append(one_lab)
            for one_seg in curr_segs[train_inds_Ns]:
                X_train.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[train_inds_Ns]:
                y_train.append(one_lab)
            for one_seg in curr_segs[test_inds_Vs]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Vs]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Vs]:
                y_test_mrks.append(one_mrk)
            for one_seg in curr_segs[test_inds_Ns]:
                X_test.append(one_seg.feat.tolist())
            for one_lab in curr_IDs[test_inds_Ns]:
                y_test.append(one_lab)
            for one_mrk in curr_mrks[test_inds_Ns]:
                y_test_mrks.append(one_mrk)
            #print i
            #print len(X_train), len(y_train), len(X_test), len(y_test)
        X_train, y_train, X_test, y_test = \
            np.array(X_train), np.array(y_train), \
            np.array(X_test), np.array(y_test)
    ######################
    #print X_train.shape, y_train.shape, X_test.shape, y_test.shape
    #quit()
    #print X_train
    #print X_test
    #y_train = [int(y[:-1]) for y in y_train]
    #y_test = [int(y[:-1]) for y in y_test]

    ######################
    #TODO Step 7: Model training
    ######################
    time_before_training = Time()
    if method_clsf == 'SVM':
        # not_trained toggles retraining vs. loading a pickled model.
        not_trained = True
        from sklearn.externals import joblib
        if not_trained:
            clf = svm.SVC(kernel='rbf', C=10., gamma=0.1)
            clf.fit(X_train, y_train)
            joblib.dump(clf, 'test_clf.pkl')
        else:
            clf = joblib.load('test_clf.pkl')
        res_pred = clf.predict(X_test)
    elif method_clsf == 'Logit':
        clf = LR(C=10.)
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'kNN':
        clf = KNC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DTC':
        clf = DTC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'boosting':
        clf = XGBC()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'GNB':
        clf = GNB()
        clf.fit(X_train, y_train)
        res_pred = clf.predict(X_test)
    elif method_clsf == 'DL':
        # Single-hidden-layer softmax network (Keras).
        not_trained = True
        from sklearn.externals import joblib
        if not_trained:
            model = Sequential()
            model.add(
                Dense(feat_dim, activation='relu',
                      input_shape=(feat_dim, )))
            #model.add(Dense(input_dim,activation='relu'))
            num_categs = len(set(y_train))
            print y_train, num_categs
            Y_train = np_utils.to_categorical(y_train, num_categs)
            Y_test = np_utils.to_categorical(y_test, num_categs)
            model.add(Dense(num_categs, activation='softmax'))
            model.compile(loss='categorical_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
            X_train = np.array(X_train)
            Y_train = np.array(Y_train)
            #print X_train.shape
            #print Y_train.shape
            model.fit(X_train, Y_train, validation_split=0.2,
                      batch_size=32, nb_epoch=50, verbose=0)
            #model.save('test_clf_DL.pkl')
        else:
            model = keras.models.load_model('test_clf_DL.pkl')
        #score = model.evaluate(X_test, Y_test, verbose=0)
    time_after_training = Time()

    ######################
    #TODO Step 8: Model testing
    ######################
    if method_clsf != 'DL':
        res_pred = clf.predict(X_test)
    else:
        res_pred = model.predict_classes(X_test)
    ######################

    ######################
    #TODO Step 9: Result output
    ######################
    train_time = time_after_training - time_before_training
    print_res = False
    if print_res:
        print ''
        print 'Parameters:'
        print 'strategy:', strategy
        print 'constrain_time:', constrain_time
        print 'ratio:', ratio
        print 'method_clsf:', method_clsf
        #print ''
        print 'Results:'
        print 'Used time for training:', time_after_training - time_before_training
    res_look = []
    for i in range(len(res_pred)):
        res_look.append((res_pred[i], y_test[i]))
    #print res_look
    if False:
        # Dead code kept for reference: split string predictions into
        # id / marker parts.
        res_pred_IDs = np.array([y[:-1] for y in res_pred])
        res_pred_mrks = np.array([y[-1] for y in res_pred])
        only_test_ID = True
        if only_test_ID:
            to_be_predct = res_pred_IDs
            to_be_tested = y_test
        else:
            to_be_predct = res_pred
            to_be_tested = y_test
    ##TODO: adjust accordingly
    if strategy == 'NV_data':
        # Score V-beat and N-beat test segments separately.
        look_stat = 'V'
        y_test_mrks = np.array(y_test_mrks)
        #print y_test_mrks
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]
        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested, type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg, '.3f')),
                   float(format(res_by_categ, '.3f')))
        accuBySeg_V = one_res[0]
        #print len(to_be_predct), one_res
        look_stat = 'N'
        to_be_predct = res_pred[y_test_mrks == look_stat]
        to_be_tested = y_test[y_test_mrks == look_stat]
        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested, type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg, '.3f')),
                   float(format(res_by_categ, '.3f')))
        accuBySeg_N = one_res[0]
        #print len(to_be_predct), one_res
        return [accuBySeg_V, accuBySeg_N, train_time]
    else:
        to_be_predct = res_pred
        to_be_tested = y_test
        res_by_seg = mf.get_corr_ratio(res_pred=to_be_predct,
                                       y_test=to_be_tested, type='by_seg')
        res_by_categ = mf.get_corr_ratio(res_pred=to_be_predct,
                                         y_test=to_be_tested,
                                         type='by_categ')
        one_res = (float(format(res_by_seg, '.3f')),
                   float(format(res_by_categ, '.3f')))
        return [one_res[0], train_time]
def run_CV(self):
    """10-fold CV of active learning with a weak-oracle auditor.

    Per fold: split train into train/valid, seed labels via
    pretrainSelectInit, then query examples one at a time.  Each queried
    label is compared with the transfer (weak-oracle) label to build the
    auditor's training set (flag 1.0 = oracle agreed with ground truth).
    Records auditor accuracy and the accuracy after adding extra weak
    labels; writes both to <modelVersion>_auditor_acc.txt and
    <modelVersion>_extra_acc.txt via writeFile.

    NOTE(review): relies on module-level `rounds`, `modelVersion`,
    `writeFile`, and on the m_* arrays supporting numpy fancy indexing.
    """
    cvIter = 0
    totalInstanceNum = len(self.m_targetLabel)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    totalTransferNumList = []
    # Seeded shuffle: fold assignment is reproducible across runs.
    np.random.seed(3)
    np.random.shuffle(indexList)

    # Split the shuffled indices into 10 folds; the last fold takes the
    # remainder.
    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    foldInstanceList = []
    for foldIndex in range(foldNum - 1):
        foldIndexInstanceList = indexList[foldIndex * foldInstanceNum:(foldIndex + 1) * foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
    foldInstanceList.append(foldIndexInstanceList)
    # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
    # random.seed(3)

    totalAccList = [[] for i in range(10)]
    humanAccList = [[] for i in range(10)]
    totalExtraAccList = []
    # self.get_base_learners()
    correctTransferRatioList = []
    totalTransferNumList = []
    correctTransferLabelNumList = []
    correctUntransferRatioList = []
    totalAuditorPrecisionList = []
    totalAuditorRecallList = []
    totalAuditorAccList = []

    for foldIndex in range(foldNum):
        if self.m_multipleClass:
            self.m_clf = LR(multi_class="multinomial", solver='lbfgs',
                            random_state=3)
        else:
            self.m_clf = LR(random_state=3)
        # The auditor predicts whether the weak oracle's label is correct.
        self.m_auditor = LR(random_state=3)

        # Training pool = all folds except the current test fold.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])

        # Hold out 20% of the training pool as a validation set.
        trainData, valid = train_test_split(train, random_state=3,
                                            test_size=0.2)
        train = trainData

        targetNameFeatureTrain = self.m_targetNameFeature[train]
        targetLabelTrain = self.m_targetLabel[train]
        targetNameFeatureValid = self.m_targetNameFeature[valid]
        targetLabelValid = self.m_targetLabel[valid]
        # targetDataFeatureTrain = self.m_targetDataFeature[train]
        targetNameFeatureTest = self.m_targetNameFeature[test]
        targetLabelTest = self.m_targetLabel[test]
        transferLabelTest = self.m_transferLabel[test]
        # targetDataFeatureTest = self.m_targetDataFeature[test]
        # sourceUniqueClass = np.unique(self.m_sourceLabel)

        initExList = []
        initExList = self.pretrainSelectInit(train, foldIndex)
        targetNameFeatureInit = self.m_targetNameFeature[initExList]
        targetLabelInit = self.m_targetLabel[initExList]
        transferLabelInit = self.m_transferLabel[initExList]
        print("initExList\t", initExList, targetLabelInit)

        queryIter = 0
        labeledExList = []
        unlabeledExList = []
        ### labeled index
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train) - set(labeledExList))

        # 3 seed labels already count toward the active-label budget.
        activeLabelNum = 3.0
        transferLabelNum = 0.0
        transferFeatureList = []
        transferFlagList = []

        featureDim = len(targetNameFeatureTrain[0])
        self.init_confidence_bound(featureDim, labeledExList,
                                   unlabeledExList)

        targetNameFeatureIter = targetNameFeatureInit
        targetLabelIter = targetLabelInit

        correctTransferLabelNum = 0.0
        wrongTransferLabelNum = 0.0
        correctUntransferLabelNum = 0.0
        wrongUntransferLabelNum = 0.0

        # auditorPrecisionList = []
        # auditorRecallList = []
        auditorAccList = []
        extraAccList = []

        self.m_clf.fit(targetNameFeatureInit, targetLabelInit)

        # Build the auditor's initial training set from the seed examples:
        # flag 1.0 iff the weak oracle agreed with the true label.
        # targetAuditorLabelInit = (targetLabelInit==transferLabelInit)
        for exId in initExList:
            if self.m_targetLabel[exId] == self.m_transferLabel[exId]:
                transferFlagList.append(1.0)
            else:
                transferFlagList.append(0.0)
            transferFeatureList.append(self.m_targetNameFeature[exId])

        # The auditor can only be fit once both flag classes are present.
        auditorScoreFlag = False
        if len(np.unique(transferFlagList)) > 1:
            self.m_auditor.fit(np.array(transferFeatureList),
                               np.array(transferFlagList))
            auditorScoreFlag = True

        while activeLabelNum < rounds:
            exId = self.select_example(unlabeledExList, auditorScoreFlag)
            exLabel = -1
            self.m_strongLabeledIDList.append(exId)
            # self.update_select_confidence_bound(exId)
            # self.update_judge_confidence_bound(exId)
            activeLabelNum += 1.0
            activeLabelFlag = True
            exLabel = self.m_targetLabel[exId]
            transferLabel = self.m_transferLabel[exId]
            # Grow the auditor's training data with this query's outcome.
            if transferLabel == exLabel:
                # correctUntransferLabelNum += 1.0
                transferFlagList.append(1.0)
                transferFeatureList.append(self.m_targetNameFeature[exId])
            else:
                # wrongUntransferLabelNum += 1.0
                transferFlagList.append(0.0)
                transferFeatureList.append(self.m_targetNameFeature[exId])
            # auditorPrecision = 0.0
            # if correctTransferLabelNum+wrongTransferLabelNum > 0.0:
            #     auditorPrecision = correctTransferLabelNum*1.0/(correctTransferLabelNum+wrongTransferLabelNum)
            auditorAcc = self.getAuditorMetric(transferFeatureList,
                                               transferFlagList,
                                               targetNameFeatureTest,
                                               transferLabelTest,
                                               targetLabelTest)
            # print("auditorAcc", auditorAcc)
            auditorAccList.append(auditorAcc)
            labeledExList.append(exId)
            unlabeledExList.remove(exId)
            # acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
            # totalAccList[cvIter].append(acc)
            extraAcc = self.addExtraWeakLabels(
                transferFeatureList, transferFlagList,
                targetNameFeatureValid, targetLabelValid,
                targetNameFeatureTest, transferLabelTest, targetLabelTest,
                queryIter)
            extraAccList.append(extraAcc)
            # humanAccList[cvIter].append(acc)
            queryIter += 1

        # totalAuditorPrecisionList.append(auditorPrecisionList)
        # totalAuditorRecallList.append(auditorRecallList)
        totalAuditorAccList.append(auditorAccList)
        totalExtraAccList.append(extraAccList)
        cvIter += 1

    # print("transfer num\t", np.mean(totalTransferNumList), np.sqrt(np.var(totalTransferNumList)))
    # print("extraList", extraAccList, np.mean(extraAccList), np.sqrt(np.var(extraAccList)))
    # print("correct ratio\t", np.mean(correctTransferRatioList), np.sqrt(np.var(correctTransferRatioList)))
    # print("untransfer correct ratio\t", np.mean(correctUntransferRatioList), np.sqrt(np.var(correctUntransferRatioList)))
    # AuditorPrecisionFile = modelVersion+"_auditor_precision.txt"
    # writeFile(totalAuditorPrecisionList, AuditorPrecisionFile)
    # AuditorRecallFile = modelVersion+"_auditor_recall.txt"
    # writeFile(totalAuditorRecallList, AuditorRecallFile)
    AuditorAccFile = modelVersion + "_auditor_acc.txt"
    writeFile(totalAuditorAccList, AuditorAccFile)
    # totalACCFile = modelVersion+"_acc.txt"
    # writeFile(totalAccList, totalACCFile)
    # humanACCFile = modelVersion+"_human_acc.txt"
    # writeFile(humanAccList, humanACCFile)
    extraACCFile = modelVersion + "_extra_acc.txt"
    writeFile(totalExtraAccList, extraACCFile)
# In[]: # 散点图 for i in Xtrain.columns: ft.con_data_scatter(Xtrain, i, Ytrain, "Y") # In[]: ft.con_data_scatter(Xtrain, 'AveRooms', Ytrain, "Y") # In[]: # 特征的 皮尔森相关度 ft.corrFunction(Xtrain) # In[]: ''' 测试 SKlearn 和 statsmodels,在 无超参数情况下,是相同的。 ''' reg = LR().fit(Xtrain, Ytrain) yhat = reg.predict(Xtrain) #预测我们的yhat print(reg.score(Xtrain, Ytrain)) predict = pd.DataFrame(yhat, columns=['Pred']) resid = pd.DataFrame((Ytrain["Y"] - predict["Pred"]), columns=['resid']) resid_1 = pd.concat([predict, resid], axis=1) resid_1.plot('Pred', 'resid', kind='scatter') print(ft.r2_score_customize(Ytrain, yhat, 2)) print(ft.adj_r2_customize(Ytrain, yhat, Xtrain.shape[1], 2)) # In[]: from statsmodels.formula.api import ols
# 1 - fit on the full dataset (no train/test split)
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression as LR
from sklearn.model_selection import train_test_split as tts

# Load the diabetes dataset into DataFrames; the target column is "QM".
diabetes = datasets.load_diabetes()
x = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
target = pd.DataFrame(diabetes.target, columns=["QM"])
y = target["QM"]

# Train on all rows and evaluate on the same rows.
lm = LR().fit(x, y)
pred = lm.predict(x)
MSE = ((y - pred) ** 2).mean()
R = lm.score(x, y)

print("【#1 不分割資料集】")
print("完整資料的 MSE:", MSE)
print("完整資料的 R^2:", R)
print()

# 2 - split 3:1 and refit on the training portion only
x_train, x_test, y_train, y_test = tts(x, y, test_size=0.25,
                                       random_state=100)
lm = LR().fit(x_train, y_train)
def roc_it(input_file=INPUT_FILE): beer = pd.read_csv(input_file, delimiter='\t').dropna() # add class label for top half / bottom half midpt = int(len(beer) / 2) beer['label'] = beer['Rank'].map(lambda k: 1 if k <= midpt else 0) # drop categorical columns features = beer[['ABV', 'Reviews']] labels = beer['label'] # create cv iterator (note: train pct is set implicitly by number of folds) num_recs = len(beer) kf = cv.KFold(n=num_recs, n_folds=NUM_FOLDS, shuffle=True) # initialize results sets all_fprs, all_tprs, all_aucs = (np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS), np.zeros(NUM_FOLDS)) for i, (train_index, test_index) in enumerate(kf): # initialize & train model model = LR() # debug! train_features = features.loc[train_index].dropna() train_labels = labels.loc[train_index].dropna() test_features = features.loc[test_index].dropna() test_labels = labels.loc[test_index].dropna() model.fit(train_features, train_labels) # predict labels for test features pred_labels = model.predict(test_features) # calculate ROC/AUC fpr, tpr, thresholds = roc_curve(test_labels, pred_labels, pos_label=1) roc_auc = auc(fpr, tpr) print '\nfpr = {0}'.format(fpr) print 'tpr = {0}'.format(tpr) print 'auc = {0}'.format(roc_auc) all_fprs[i] = fpr[1] all_tprs[i] = tpr[1] all_aucs[i] = roc_auc print '\nall_fprs = {0}'.format(all_fprs) print 'all_tprs = {0}'.format(all_tprs) print 'all_aucs = {0}'.format(all_aucs) # plot ROC curve pl.clf() pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc) pl.plot([0, 1], [0, 1], 'k--') pl.xlim([0.0, 1.0]) pl.ylim([0.0, 1.0]) pl.xlabel('False Positive Rate') pl.ylabel('True Positive Rate') pl.title('Receiver operating characteristic example') pl.legend(loc="lower right") pl.show()
print('[03]\n', X_train.head(5)) # 学習用データの説明変数Xの先頭5行を表示 print('[04]\n', y_train.head(5)) # 学習用データの目的変数Yの先頭5行を表示 # [04],[05]の最左列はCSVファイルの行番号です。 print('[05] 分離前の全データの説明変数Xの基本統計量\n', X.describe()) print('[06] 学習用データの説明変数Xの基本統計量\n', X_train.describe()) print('[07] 評価用データの説明変数Xの基本統計量\n', X_test.describe()) # count=データの個数、 mean=平均値、 std=標準偏差 # min=最小値、50%=中央値、max=最大値 # 欠損データの確認 # ※もし欠測データがあった場合には、dropna関数で当該行のデータを削除、もしくはfillna # 関数で仮の値を補充し、誤ったデータをもとに学習してしまわないようにします。 print("[08] 学習用データの欠測値の個数(X,Y)=\n", X_train.isnull().sum(), y_train.isnull().sum()) print("[09] 評価用データの欠測値の個数(X,Y)=\n", X_test.isnull().sum(), y_test.isnull().sum()) # 線形回帰による単回帰分析の実行 model = LR() # 線形回帰のための器をLRライブラリを使って変数modelに準備 model.fit(X_train, y_train) # 学習用データ(X_train, Y_train)を使って線形回帰分析 print( f"[11] 近似式は {MOKUTEKI} = {SETSUMEI} * {model.coef_} + {model.intercept_}") # 線形回帰分析の結果得られたモデル(model)を評価用データに適用して精度をチェック print("[12] 得られたモデルの評価用データへの適用時の決定係数R^2(最良=1)は", model.score(X_test, y_test)) # 線形近似ではなく、曲線近似をさせるにはscipyライブラリを使う方法があります。 # ここでは解説しませんが、興味があれば調べてみてください。
def run_CV(self):
    """Plain 10-fold cross-validation of a logistic-regression baseline.

    No active learning: each fold fits LR on the other nine folds and
    records test accuracy and the learned coefficient matrix.  Writes
    per-fold accuracies to <modelVersion>.txt and coefficients to
    <modelVersion>_coef.txt, then prints mean/std accuracy.

    NOTE(review): relies on module-level `modelVersion` and on self.fn /
    self.label being numpy arrays (fancy indexing with index lists).
    """
    cvIter = 0
    totalInstanceNum = len(self.label)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    print("featureNum", len(self.fn[0]))
    # print("non zero feature num", sum(self.fn[0]))
    totalTransferNumList = []
    # np.random.seed(3)
    # np.random.shuffle(indexList)
    # Unseeded shuffle: fold assignment differs between runs.
    random.shuffle(indexList)

    # Split the shuffled indices into 10 folds; the last fold takes the
    # remainder.
    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    foldInstanceList = []
    for foldIndex in range(foldNum - 1):
        foldIndexInstanceList = indexList[foldIndex * foldInstanceNum:(foldIndex + 1) * foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
    foldInstanceList.append(foldIndexInstanceList)
    # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)

    cvIter = 0
    # random.seed(3)
    totalAccList = [0 for i in range(10)]  # one accuracy per fold
    coefList = [0 for i in range(10)]      # one coef_ matrix per fold
    for foldIndex in range(foldNum):
        # self.m_clf = LinearSVC(random_state=3)
        # self.m_clf = LR(fit_intercept=False)
        self.m_clf = LR(random_state=3)

        # Training pool = all folds except the current test fold.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])
        trainNum = int(totalInstanceNum * 0.9)  # unused after this point

        # print(test)
        fn_test = self.fn[test]
        label_test = self.label[test]

        # Sampling the full pool here is a shuffle, not a subsample
        # (sampledTrainNum == len(train)); the commented line subsampled.
        sampledTrainNum = len(train)
        # sampledTrainNum = 100
        train_sampled = random.sample(train, sampledTrainNum)

        fn_train = self.fn[train_sampled]
        label_train = self.label[train_sampled]
        self.m_clf.fit(fn_train, label_train)
        coefList[cvIter] = self.m_clf.coef_

        label_preds = self.m_clf.predict(fn_test)
        acc = accuracy_score(label_test, label_preds)
        totalAccList[cvIter] = acc

        # Disabled active-learning loop kept for reference:
        # initExList = []
        # random.seed(3)
        # initExList = random.sample(train, 3)
        # fn_init = self.fn[initExList]
        # label_init = self.label[initExList]
        # print("initExList\t", initExList, label_init)
        # queryIter = 3
        # labeledExList = []
        # unlabeledExList = []
        # ###labeled index
        # labeledExList.extend(initExList)
        # unlabeledExList = list(set(train)-set(labeledExList))
        # featureDim = len(self.fn[0])
        # self.init_confidence_bound(featureDim)
        # while queryIter < rounds:
        #     fn_train_iter = []
        #     label_train_iter = []
        #     fn_train_iter = self.fn[labeledExList]
        #     label_train_iter = self.label[labeledExList]
        #     self.m_clf.fit(fn_train_iter, label_train_iter)
        #     idx = self.select_example(unlabeledExList)
        #     self.update_confidence_bound(idx)
        #     # print(queryIter, "idx", idx, self.label[idx])
        #     labeledExList.append(idx)
        #     unlabeledExList.remove(idx)
        #     acc = self.get_pred_acc(fn_test, label_test, labeledExList)
        #     totalAccList[cvIter].append(acc)
        #     queryIter += 1

        cvIter += 1

    # One accuracy value per line.
    totalACCFile = modelVersion + ".txt"
    f = open(totalACCFile, "w")
    for i in range(10):
        f.write(str(totalAccList[i]))
        # for j in range(totalAlNum):
        #     f.write(str(totalAccList[i][j])+"\t")
        f.write("\n")
    f.close()

    # One tab-separated coefficient row per class, folds concatenated.
    coefFile = modelVersion + "_coef.txt"
    f = open(coefFile, "w")
    for i in range(10):
        coef4Classifier = coefList[i]
        coefNum = len(coef4Classifier)
        for coefIndex in range(coefNum):
            f.write(str(coef4Classifier[coefIndex]) + "\t")
        f.write("\n")
    f.close()

    print(np.mean(totalAccList), np.sqrt(np.var(totalAccList)))
def run_CV(self):
    """10-fold CV of active learning with TWO weak oracles.

    Per fold: seed labels via pretrainSelectInit, then query one example
    per round.  Each queried label is compared with both weak oracles'
    labels (m_transferLabel0 / m_transferLabel1) to build per-oracle
    agreement-flag lists, which addExtraExample uses to inject extra weak
    labels.  The per-query auditor metric is stubbed to 0.0 here.  Writes
    results to <modelVersion>_auditor_acc.txt and
    <modelVersion>_extra_acc.txt via writeFile.

    NOTE(review): relies on module-level `rounds`, `modelVersion`,
    `writeFile`, and on the m_* arrays supporting numpy fancy indexing.
    """
    cvIter = 0
    totalInstanceNum = len(self.m_targetLabel)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    totalTransferNumList = []
    # Seeded shuffle: fold assignment is reproducible across runs.
    np.random.seed(3)
    np.random.shuffle(indexList)

    # Split the shuffled indices into 10 folds; the last fold takes the
    # remainder.
    foldNum = 10
    foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
    foldInstanceList = []
    for foldIndex in range(foldNum-1):
        foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
    foldInstanceList.append(foldIndexInstanceList)
    # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
    # random.seed(3)

    totalAccList = [[] for i in range(10)]
    humanAccList = [[] for i in range(10)]
    totalExtraAccList = []
    # self.get_base_learners()
    correctTransferRatioList = []
    totalTransferNumList = []
    correctUntransferRatioList = []
    totalAuditorPrecisionList = []
    totalAuditorRecallList = []
    totalAuditorAccList = []

    for foldIndex in range(foldNum):
        # self.clf = LinearSVC(random_state=3)
        self.m_clf = LR(multi_class="multinomial", solver='lbfgs',
                        random_state=3)
        # self.m_judgeClassifier = LR(random_state=3)
        # One auditor per weak oracle.
        self.m_auditor0 = LR(random_state=3)
        self.m_auditor1 = LR(random_state=3)

        # Training pool = all folds except the current test fold.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex+1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])
        trainNum = int(totalInstanceNum*0.9)  # unused after this point

        targetNameFeatureTrain = self.m_targetNameFeature[train]
        targetLabelTrain = self.m_targetLabel[train]
        # targetDataFeatureTrain = self.m_targetDataFeature[train]
        targetNameFeatureTest = self.m_targetNameFeature[test]
        targetLabelTest = self.m_targetLabel[test]
        # transferLabelTest = self.m_transferLabel[test]
        # targetDataFeatureTest = self.m_targetDataFeature[test]
        transferLabelTest = []
        # sourceUniqueClass = np.unique(self.m_sourceLabel)

        initExList = []
        initExList = self.pretrainSelectInit(train, foldIndex)
        # random.seed(101)
        # initExList = random.sample(train, 3)
        targetNameFeatureInit = self.m_targetNameFeature[initExList]
        targetLabelInit = self.m_targetLabel[initExList]
        print("initExList\t", initExList, targetLabelInit)

        queryIter = 0
        labeledExList = []
        unlabeledExList = []
        ### labeled index
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train)-set(labeledExList))

        # 3 seed labels already count toward the active-label budget.
        activeLabelNum = 3.0
        transferLabelNum = 0.0
        transferFeatureList = []
        transferFlagList0 = []  # agreement flags for weak oracle 0
        transferFlagList1 = []  # agreement flags for weak oracle 1

        featureDim = len(targetNameFeatureTrain[0])
        self.init_confidence_bound(featureDim, labeledExList,
                                   unlabeledExList)

        targetNameFeatureIter = targetNameFeatureInit
        targetLabelIter = targetLabelInit

        correctTransferLabelNum = 0.0
        wrongTransferLabelNum = 0.0
        correctUntransferLabelNum = 0.0
        wrongUntransferLabelNum = 0.0

        # auditorPrecisionList = []
        # auditorRecallList = []
        auditorAccList = []
        extraAccList = []

        self.m_clf.fit(targetNameFeatureInit, targetLabelInit)

        while activeLabelNum < rounds:
            # targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
            # targetLabelIter = self.m_targetLabel[labeledExList]
            # self.m_clf.fit(targetNameFeatureIter, targetLabelIter)
            exId = self.select_example(unlabeledExList)
            exLabel = -1
            self.m_strongLabeledIDList.append(exId)
            # self.update_select_confidence_bound(exId)
            self.update_judge_confidence_bound(exId)
            activeLabelNum += 1.0
            activeLabelFlag = True
            exLabel = self.m_targetLabel[exId]
            transferLabel0 = self.m_transferLabel0[exId]
            transferLabel1 = self.m_transferLabel1[exId]
            transferFeatureList.append(self.m_targetNameFeature[exId])
            # Record, per oracle, whether it agreed with the true label.
            if transferLabel0 == exLabel:
                # correctUntransferLabelNum += 1.0
                transferFlagList0.append(1.0)
            else:
                # wrongUntransferLabelNum += 1.0
                transferFlagList0.append(0.0)
            if transferLabel1 == exLabel:
                # correctUntransferLabelNum += 1.0
                transferFlagList1.append(1.0)
            else:
                # wrongUntransferLabelNum += 1.0
                transferFlagList1.append(0.0)
            # auditorPrecision = 0.0
            # if correctTransferLabelNum+wrongTransferLabelNum > 0.0:
            #     auditorPrecision = correctTransferLabelNum*1.0/(correctTransferLabelNum+wrongTransferLabelNum)
            # auditorAcc = self.getAuditorMetric(transferFeatureList, transferFlagList, targetNameFeatureTest, transferLabelTest, targetLabelTest)
            auditorAcc = 0.0  # auditor metric disabled in this variant
            # print("auditorAcc", auditorAcc)
            auditorAccList.append(auditorAcc)
            labeledExList.append(exId)
            unlabeledExList.remove(exId)
            # acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
            # totalAccList[cvIter].append(acc)
            extraAcc = self.addExtraExample(transferFeatureList,
                                            transferFlagList0,
                                            transferFlagList1,
                                            targetNameFeatureTest,
                                            targetLabelTest)
            extraAccList.append(extraAcc)
            # humanAccList[cvIter].append(acc)
            queryIter += 1

        # totalAuditorPrecisionList.append(auditorPrecisionList)
        # totalAuditorRecallList.append(auditorRecallList)
        totalAuditorAccList.append(auditorAccList)
        totalExtraAccList.append(extraAccList)
        cvIter += 1

    AuditorAccFile = modelVersion+"_auditor_acc.txt"
    writeFile(totalAuditorAccList, AuditorAccFile)
    # totalACCFile = modelVersion+"_acc.txt"
    # writeFile(totalAccList, totalACCFile)
    # humanACCFile = modelVersion+"_human_acc.txt"
    # writeFile(humanAccList, humanACCFile)
    extraACCFile = modelVersion+"_extra_acc.txt"
    writeFile(totalExtraAccList, extraACCFile)
def run_CV(self) -> None:
    """Run a 10-fold cross-validated active-learning experiment with two weak oracles.

    For each fold this method:
      1. Re-initializes the target classifier (``self.m_clf``) and two auditor
         models (``self.m_auditor0`` / ``self.m_auditor1``).
      2. Seeds the labeled pool via ``self.pretrainSelectInit``.
      3. Iterates queries until ``rounds`` strong (human) labels are spent:
         each selected example is either labeled by a weak oracle (when
         ``self.get_transfer_flag`` accepts the transfer) or by the true label,
         and the growing training set is refit each iteration.
      4. Records per-iteration auditor accuracy and prediction accuracy.

    Results are appended to text files named after the module-level
    ``modelVersion`` via the module-level ``writeFile`` helper.

    NOTE(review): relies on module-level globals ``rounds``, ``modelVersion``,
    ``writeFile``, ``LR`` (presumably sklearn's LogisticRegression — confirm)
    and ``np``. ``self.m_targetNameFeature`` / ``self.m_targetLabel`` are
    indexed with integer lists, so they are assumed to be numpy arrays —
    TODO confirm against the constructor.
    """
    cvIter = 0
    totalInstanceNum = len(self.m_targetLabel)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    totalTransferNumList = []
    # Fixed seed makes the fold split reproducible across runs.
    np.random.seed(3)
    np.random.shuffle(indexList)

    # Partition the shuffled indices into 10 folds; the last fold absorbs
    # the remainder when totalInstanceNum is not divisible by foldNum.
    foldNum = 10
    foldInstanceNum = int(totalInstanceNum*1.0/foldNum)
    foldInstanceList = []
    for foldIndex in range(foldNum-1):
        foldIndexInstanceList = indexList[foldIndex*foldInstanceNum:(foldIndex+1)*foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum*(foldNum-1):]
    foldInstanceList.append(foldIndexInstanceList)

    # Per-fold accuracy curves (one inner list per CV iteration).
    totalAccList = [[] for i in range(10)]
    humanAccList = [[] for i in range(10)]
    correctTransferRatioList = []
    totalTransferNumList = []
    correctUntransferRatioList = []
    totalAuditorPrecisionList = []
    totalAuditorRecallList = []
    totalAuditorAccList = []

    for foldIndex in range(foldNum):
        # Fresh models each fold; fixed random_state for reproducibility.
        self.m_clf = LR(multi_class="multinomial", solver='lbfgs',random_state=3)
        self.m_auditor0 = LR(random_state=3)
        self.m_auditor1 = LR(random_state=3)

        # Current fold is the test set; all other folds form the train pool.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex+1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])
        trainNum = int(totalInstanceNum*0.9)

        targetNameFeatureTrain = self.m_targetNameFeature[train]
        targetLabelTrain = self.m_targetLabel[train]
        # targetDataFeatureTrain = self.m_targetDataFeature[train]
        targetNameFeatureTest = self.m_targetNameFeature[test]
        targetLabelTest = self.m_targetLabel[test]
        # transferLabelTest = self.m_transferLabel[test]
        transferLabelTest = []

        # Seed the labeled pool with a pretrained initial selection.
        initExList = []
        initExList = self.pretrainSelectInit(train, foldIndex)
        # random.seed(101)
        # initExList = random.sample(train, 3)
        targetNameFeatureInit = self.m_targetNameFeature[initExList]
        targetLabelInit = self.m_targetLabel[initExList]
        print("initExList\t", initExList, targetLabelInit)

        queryIter = 0
        labeledExList = []
        unlabeledExList = []
        ###labeled index
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train)-set(labeledExList))

        # activeLabelNum starts at 3.0: the three initial examples count as
        # already-spent strong labels against the `rounds` budget.
        activeLabelNum = 3.0
        transferLabelNum = 0.0
        transferFeatureList = []
        # Per-oracle flags: 1.0 = weak oracle agreed with the true label.
        transferFlagList0 = []
        transferFlagList1 = []

        featureDim = len(targetNameFeatureTrain[0])
        self.init_confidence_bound(featureDim, labeledExList, unlabeledExList)

        # Incrementally-grown training set (starts from the init examples).
        targetNameFeatureIter = targetNameFeatureInit
        targetLabelIter = targetLabelInit

        correctTransferLabelNum = 0.0
        wrongTransferLabelNum = 0.0
        correctUntransferLabelNum = 0.0
        wrongUntransferLabelNum = 0.0
        # auditorPrecisionList = []
        # auditorRecallList = []
        auditorAccList = []

        # Query loop: runs until the strong-label budget `rounds` is spent.
        while activeLabelNum < rounds:
            # targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
            # targetLabelIter = self.m_targetLabel[labeledExList]
            # Refit on the current (incrementally grown) training set.
            self.m_clf.fit(targetNameFeatureIter, targetLabelIter)
            exId = self.select_example(unlabeledExList)
            # self.update_select_confidence_bound(exId)
            # print(idx)
            activeLabelFlag = False
            # Decide whether a weak oracle's label can be trusted for exId.
            transferLabelFlag, weakOracleIndex, transferLabel = self.get_transfer_flag(transferFeatureList, transferFlagList0, transferFlagList1, exId, activeLabelNum)
            exLabel = -1
            if transferLabelFlag:
                # Accept the weak-oracle label: no strong-label cost.
                self.m_weakLabeledIDList.append(exId)
                transferLabelNum += 1.0
                activeLabelFlag = False
                exLabel = transferLabel
                targetNameFeatureIter = np.vstack((targetNameFeatureIter, self.m_targetNameFeature[exId]))
                targetLabelIter = np.hstack((targetLabelIter, exLabel))
                # targetNameFeatureIter.append(self.m_targetNameFeature[exId])
                # targetLabelIter.append(exLabel)
                if exLabel == self.m_targetLabel[exId]:
                    correctTransferLabelNum += 1.0
                    print("queryIter\t", queryIter)
                else:
                    wrongTransferLabelNum += 1.0
                    print("query iteration", queryIter, "error transfer label\t", exLabel, "true label", self.m_targetLabel[exId])
            else:
                # Fall back to the true (strong/human) label: costs budget and
                # yields a new supervised example for both auditors.
                self.m_strongLabeledIDList.append(exId)
                self.update_judge_confidence_bound(exId)
                activeLabelNum += 1.0
                activeLabelFlag = True
                exLabel = self.m_targetLabel[exId]
                targetNameFeatureIter = np.vstack((targetNameFeatureIter, self.m_targetNameFeature[exId]))
                targetLabelIter = np.hstack((targetLabelIter, exLabel))
                # targetNameFeatureIter.append(self.m_targetNameFeature[exId])
                # targetLabelIter.append(exLabel)
                weakLabel0 = self.m_transferLabel0[exId]
                weakLabel1 = self.m_transferLabel1[exId]
                # Record, per weak oracle, whether it agreed with the truth;
                # this feeds get_transfer_flag on later iterations.
                transferFeatureList.append(self.m_targetNameFeature[exId])
                if weakLabel0 == exLabel:
                    correctUntransferLabelNum += 1.0
                    transferFlagList0.append(1.0)
                else:
                    wrongUntransferLabelNum += 1.0
                    transferFlagList0.append(0.0)
                if weakLabel1 == exLabel:
                    correctUntransferLabelNum += 1.0
                    transferFlagList1.append(1.0)
                else:
                    wrongUntransferLabelNum += 1.0
                    transferFlagList1.append(0.0)

            # NOTE(review): the following bookkeeping is placed at loop level
            # (runs every query regardless of branch) — reconstructed from the
            # collapsed source; a queried example must leave the unlabeled
            # pool in both branches or it could be re-selected. Confirm
            # against the original formatting.
            auditorAcc = self.getAuditorMetric(transferFeatureList, transferFlagList0, transferFlagList1, targetNameFeatureTest, transferLabelTest, targetLabelTest)
            print("auditorAcc", auditorAcc)
            auditorAccList.append(auditorAcc)
            labeledExList.append(exId)
            unlabeledExList.remove(exId)
            acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
            totalAccList[cvIter].append(acc)
            if activeLabelFlag:
                # Track accuracy only at iterations that consumed a human label.
                humanAccList[cvIter].append(acc)
            queryIter += 1

        totalAuditorAccList.append(auditorAccList)
        # Recompute from the ID list (cumulative across folds — note this
        # counts all weak-labeled IDs so far, not just this fold's).
        transferLabelNum = len(self.m_weakLabeledIDList)
        totalTransferNumList.append(transferLabelNum)
        cvIter += 1

    print("transfer num\t", np.mean(totalTransferNumList), np.sqrt(np.var(totalTransferNumList)))
    # Persist per-fold curves; file names derive from the global modelVersion.
    AuditorAccFile = modelVersion+"_auditor_acc.txt"
    writeFile(totalAuditorAccList, AuditorAccFile)
    totalACCFile = modelVersion+"_acc.txt"
    writeFile(totalAccList, totalACCFile)
    humanACCFile = modelVersion+"_human_acc.txt"
    writeFile(humanAccList, humanACCFile)