MALE_FEM.append(in_data[i][4]) Data = pd.DataFrame(index=range(1, 712)) Data['TOT_POP'] = pd.Series(TOT_POP, index=Data.index) Data['PCT_U18'] = pd.Series(PCT_U18, index=Data.index) Data['PC_18_65'] = pd.Series(PC_18_65, index=Data.index) Data['PCT_O65'] = pd.Series(PCT_O65, index=Data.index) Data['MALE_FEM'] = pd.Series(MALE_FEM, index=Data.index) X = np.log(Data[['TOT_POP', 'PCT_U18', 'PC_18_65', 'PCT_O65']]) y = np.log(Data['MALE_FEM']) from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42) from sklearn.linear_model import LinearRegression as LR lm = LR(normalize=True, fit_intercept=True) lm_fit = lm.fit(X_train, y_train) Predicts = lm.predict(np.array(X_test)) import sklearn.metrics as po print(po.r2_score(np.exp(y_test), np.exp(Predicts)))
# -*- coding:utf-8 -*-
import pandas as pd

# Bank-loan default data: first 8 columns are features, column 8 is the label.
filename = '../data/bankloan.xls'
data = pd.read_excel(filename)
# FIX: DataFrame.as_matrix() was removed in pandas 1.0; .values is equivalent.
x = data.iloc[:, :8].values
y = data.iloc[:, 8].values

from sklearn.linear_model import LogisticRegression as LR
# NOTE(review): RandomizedLogisticRegression was removed from scikit-learn in
# 0.21 — this import only works on older pinned versions. If upgrading,
# replace the stability-selection step (e.g. with SelectFromModel).
from sklearn.linear_model import RandomizedLogisticRegression as RLR

rlr = RLR()  # randomized logistic regression for stability-based feature selection
rlr.fit(x, y)  # fit the selector
rlr.get_support()  # boolean mask of the selected features
print(u'有效特征为:%s' % ','.join(data.columns[rlr.get_support()]))
# Keep only the selected feature columns (comment was mojibake in the original).
x = data[data.columns[rlr.get_support()]].values

lr = LR()  # plain logistic regression on the reduced feature set
lr.fit(x, y)
# Mean accuracy on the training data itself (no held-out split).
print(u'模型的平均正确率:%s' % lr.score(x, y))
def time_accuracy(subject, bands, ec, kwargs, has_data, folds=10):
    """
    Classify data independently at each point in time.

    For every CV fold, fits per-timepoint multinomial logistic regressions for
    consonant, vowel, and (optionally) joint CV prediction, and scores them on
    the fold's test set.

    Parameters
    ----------
    subject, bands : dataset selectors passed through to ``ec.ECoG``.
    ec : module/object exposing the ``ECoG`` dataset class.
    kwargs : dict of extra ``ECoG`` constructor arguments (deep-copied per fold).
    has_data : index/mask selecting valid consonant-vowel combinations in the
        flattened joint-probability matrix.
    folds : number of CV folds to evaluate (default 10).

    Returns
    -------
    (ca, va, cva, c_va) : arrays of shape (folds, n_time) with consonant,
    vowel, joint (currently disabled, stays zero), and combined accuracies.
    """
    def reshape_time(X):
        # (n_ex, ...) -> (n_ex, 1, 258, channels): expose time as its own axis.
        n_ex = X.shape[0]
        Xp = X.reshape(n_ex, 1, -1, 258)
        return np.transpose(Xp, (0, 1, 3, 2))

    ds = ec.ECoG(subject, bands, 'train', **kwargs)
    # NOTE(review): X_shape is unused; kept to preserve the dataset-load side effect.
    X_shape = ds.get_topological_view().shape
    n_time = 258
    # BUG FIX: accumulators were hard-coded to 10 rows; size them by `folds`
    # so values of folds other than 10 neither crash nor leave junk rows.
    ca = np.zeros((folds, n_time))
    va = np.zeros((folds, n_time))
    cva = np.zeros((folds, n_time))
    c_va = np.zeros((folds, n_time))
    for fold in range(folds):
        kwargs_copy = copy.deepcopy(kwargs)
        print('fold: {}'.format(fold))
        cv_ds = ec.ECoG(subject, bands, 'train', fold=fold, **kwargs_copy)
        kwargs_copy['consonant_prediction'] = True
        c_ds = ec.ECoG(subject, bands, 'train', fold=fold, **kwargs_copy)
        kwargs_copy['consonant_prediction'] = False
        kwargs_copy['vowel_prediction'] = True
        v_ds = ec.ECoG(subject, bands, 'train', fold=fold, **kwargs_copy)
        # Consonants: train = train + valid sets, evaluate on the test set.
        c_ts = c_ds.get_test_set()
        c_vs = c_ds.get_valid_set()
        c_train_X = reshape_time(np.concatenate(
            (c_ds.get_topological_view(), c_vs.get_topological_view()), axis=0))
        c_train_y = np.concatenate((c_ds.y, c_vs.y), axis=0)
        c_test_X = reshape_time(c_ts.get_topological_view())
        c_test_y = c_ts.y
        # Vowels
        v_ts = v_ds.get_test_set()
        v_vs = v_ds.get_valid_set()
        v_train_X = reshape_time(np.concatenate(
            (v_ds.get_topological_view(), v_vs.get_topological_view()), axis=0))
        v_train_y = np.concatenate((v_ds.y, v_vs.y), axis=0)
        v_test_X = reshape_time(v_ts.get_topological_view())
        v_test_y = v_ts.y
        # Joint consonant-vowel labels
        cv_ts = cv_ds.get_test_set()
        cv_vs = cv_ds.get_valid_set()
        cv_train_X = reshape_time(np.concatenate(
            (cv_ds.get_topological_view(), cv_vs.get_topological_view()), axis=0))
        cv_train_y = np.concatenate((cv_ds.y, cv_vs.y), axis=0)
        cv_test_X = reshape_time(cv_ts.get_topological_view())
        cv_test_y = cv_ts.y
        # All three datasets must expose identical features (only labels differ).
        assert np.all(c_train_X == v_train_X)
        assert np.all(c_train_X == cv_train_X)
        assert np.all(c_test_X == v_test_X)
        assert np.all(c_test_X == cv_test_X)
        for tt in range(n_time):
            X_train = c_train_X[:, 0, tt]
            c_cl = LR(solver='lbfgs',
                      multi_class='multinomial').fit(X_train, c_train_y.ravel())
            v_cl = LR(solver='lbfgs',
                      multi_class='multinomial').fit(v_train_X[:, 0, tt],
                                                     v_train_y.ravel())
            # Direct joint-CV classifier disabled (cva stays zero):
            # cv_cl = LR(solver='lbfgs', multi_class='multinomial').fit(
            #     cv_train_X[:, 0, tt], cv_train_y.ravel())
            # cva[fold, tt] = cv_cl.score(cv_test_X[:, 0, tt], cv_test_y.ravel())
            ca[fold, tt] = c_cl.score(c_test_X[:, 0, tt], c_test_y.ravel())
            pc = c_cl.predict_proba(c_test_X[:, 0, tt])
            va[fold, tt] = v_cl.score(v_test_X[:, 0, tt], v_test_y.ravel())
            pv = v_cl.predict_proba(v_test_X[:, 0, tt])
            # Combine independent consonant/vowel probabilities into joint CV
            # predictions, restricted to combinations that actually occur.
            pcv = (pc[:, np.newaxis, :] * pv[..., np.newaxis]).reshape(
                pc.shape[0], -1)[:, has_data].argmax(axis=1)
            c_va[fold, tt] = np.equal(pcv.ravel(), cv_test_y.ravel()).mean()
    return ca, va, cva, c_va
X = breast_data.data
y = breast_data.target
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=420)

# %%
# Train vs. test accuracy as the solver's iteration cap grows.
l2 = []
l2test = []
l2_big = []
for i in np.arange(1, 201, 10):
    lrl2 = LR(penalty="l2", solver="liblinear", C=0.9, max_iter=i).fit(
        X_train, y_train)
    train_acc = accuracy_score(lrl2.predict(X_train), y_train)
    test_acc = accuracy_score(lrl2.predict(X_test), y_test)
    l2.append(train_acc)
    l2test.append(test_acc)

# %%
# Reference curve: a fully converged model (max_iter=500) refit once per step,
# giving a flat baseline of the same length as the sweeps above.
for i in np.arange(1, 201, 10):
    lrl2 = LR(penalty="l2", solver="liblinear", C=0.9, max_iter=500).fit(
        X_train, y_train)
    # l2_big.append(accuracy_score(lrl2.predict(X_train), y_train))
    l2_big.append(accuracy_score(lrl2.predict(X_test), y_test))

# %%
graph = [l2, l2test, l2_big]
color = ["black", "gray", "red"]
label = ["L2", "L2test", "l2_big"]
def run_CV(self):
    """
    10-fold cross-validation: train a logistic regression on *transfer*
    labels and evaluate against the gold labels.

    Writes the per-fold accuracies to '<modelVersion>.txt' and prints the
    mean/std of the accuracy and of the transfer-label agreement ratio.
    Reads: self.fn (features), self.label (gold), self.transferLabel.
    """
    totalInstanceNum = len(self.label)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    print("featureNum", len(self.fn[0]))
    totalTransferNumList = []
    # Shuffle once so folds are random but share one permutation.
    random.shuffle(indexList)

    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    # First foldNum-1 folds get foldInstanceNum items; the last takes the rest.
    foldInstanceList = []
    for foldIndex in range(foldNum - 1):
        foldInstanceList.append(
            indexList[foldIndex * foldInstanceNum:(foldIndex + 1) * foldInstanceNum])
    foldInstanceList.append(indexList[foldInstanceNum * (foldNum - 1):])

    cvIter = 0
    totalAccList = [0 for i in range(10)]
    posRatioList = []
    for foldIndex in range(foldNum):
        self.m_clf = LR(random_state=3)
        # Training pool = every fold except the held-out one.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])

        fn_test = self.fn[test]
        label_test = self.label[test]

        # Sampling the whole pool keeps the hook for sub-sampling experiments.
        sampledTrainNum = len(train)
        train_sampled = random.sample(train, sampledTrainNum)
        fn_train = self.fn[train_sampled]
        # Train on the noisy *transfer* labels, not the gold labels.
        label_train = self.transferLabel[train_sampled]

        self.m_clf.fit(fn_train, label_train)
        label_preds = self.m_clf.predict(fn_test)
        acc = accuracy_score(label_test, label_preds)

        # Fraction of test items whose transfer label agrees with gold.
        testOneNum = np.sum(label_test == self.transferLabel[test])
        testNum = len(fn_test)
        posRatio = testOneNum * 1.0 / testNum
        posRatioList.append(posRatio)

        totalAccList[cvIter] = acc
        cvIter += 1

    totalACCFile = modelVersion + ".txt"
    # FIX: use a context manager so the file is closed even if a write fails.
    with open(totalACCFile, "w") as f:
        for i in range(10):
            f.write(str(totalAccList[i]))
            f.write("\n")

    print("posRatioList", posRatioList, np.mean(posRatioList),
          np.sqrt(np.var(posRatioList)))
    print("acc", np.mean(totalAccList), np.sqrt(np.var(totalAccList)))
# Author:马肖 # E-mail:[email protected] # Github:https://github.com/Albertsr import pickle from sklearn.datasets import load_breast_cancer from sklearn.linear_model import LogisticRegression as LR from sklearn.model_selection import train_test_split X, y = load_breast_cancer(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2018) clf = LR().fit(X_train, y_train) # 运用pickle序列化机器学习模型, 保存为字符串形式 s = pickle.dumps(clf) # 反序列化 clf_load = pickle.loads(s) # 输出模型预测精度 print(clf_load.score(X_test, y_test)) # 用dump(object, file) 将模型保存至磁盘 with open('clf_pickle', 'wb') as model: pickle.dump(clf, model) # 运用pickle调用模型,并输出模型结果 with open('clf_pickle', 'rb') as model:
else: new_sentence += " " data[sess].append(new_sentence) chronology = list(data.keys()) for i in range(len(data.keys())): for j in range(i + 1, len(data.keys())): if chronology[i] > chronology[j]: chronology[i], chronology[j] = chronology[j], chronology[i] date = [chronology[len(chronology) * i // 6 - 1] for i in range(1, 7)] clf_option = [ Boosting(), LR(n_jobs=-1), NB(), LinearSVC(), Neighbors(), RFC() ] mre_pred = [] for iter in tqdm(range(5)): query = "Select * from berita WHERE Date <= " + str( date[iter]) + " AND Title LIKE '%ekono%' " c.execute(query) train_data = c.fetchall() query = "Select * from berita WHERE Date <= " + str( date[iter]) + " AND NOT Title LIKE '%ekono%' " c.execute(query)
labels = np.zeros(numPoints) test_point = np.zeros(numDims) #loop variable to index data and labels i = 0 #read in the data and labels from the file for line in file_data: split_line = line.strip("\n").split(" ") labels[i] = int(split_line[numDims]) for j in range(0, numDims): data[i][j] = float(split_line[j]) i = i + 1 file_data.close() start_time = time.time() logistic = LR(n_jobs=8) logistic.fit(data, labels) end_time = time.time() - start_time print "Time take for logistic regression: ", end_time, " seconds" print "Number of Iterations: ", logistic.n_iter_ logistic.predict([test_point]) outfile = open('out8/out100k.txt', 'a') outfile.write(str(end_time)) outfile.write('\n') outfile.close()
def train_predict_lr_forward(train_file, test_file, predict_valid_file,
                             predict_test_file, C, n_fold=5):
    """
    Greedy forward feature selection with a logistic-regression CV objective,
    then out-of-fold and test predictions with the selected feature set.

    Parameters
    ----------
    train_file, test_file : paths understood by ``load_data``; the train file
        name (minus its last 8 characters) tags the model/log files.
    predict_valid_file, predict_test_file : output paths for the CSV
        predictions (one probability per row).
    C : inverse regularization strength for LogisticRegression.
    n_fold : number of stratified CV folds (default 5).

    Side effects: writes a .log file, a '<model>_selected.txt' feature list,
    and the two prediction files.
    """
    feature_name = os.path.basename(train_file)[:-8]
    algo_name = 'lr_forward_{}'.format(C)
    model_name = '{}_{}'.format(algo_name, feature_name)

    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG,
                        filename='{}.log'.format(model_name))

    logging.info("Loading training and test data...")
    X_trn, y_trn = load_data(train_file, dense=True)
    X_tst, _ = load_data(test_file, dense=True)

    logging.info('Normalizing data')
    scaler = StandardScaler()
    X_trn = scaler.fit_transform(X_trn)
    X_tst = scaler.transform(X_tst)

    # NOTE(review): StratifiedKFold(y, n_folds=...) is the pre-0.18 sklearn
    # API (iterable of (train, valid) index pairs) — confirm pinned version.
    cv = StratifiedKFold(y_trn, n_folds=n_fold, shuffle=True,
                         random_state=2015)

    selected_features = []
    features_to_test = [
        x for x in range(X_trn.shape[1]) if x not in selected_features
    ]

    auc_cv_old = .5  # baseline AUC; only keep features that beat this
    is_improving = True
    while is_improving:
        auc_cvs = []
        # Score every remaining candidate feature added to the current set.
        for feature in features_to_test:
            logging.info('{}'.format(selected_features + [feature]))
            X = X_trn[:, selected_features + [feature]]

            # Out-of-fold predictions for the candidate feature set.
            p_val = np.zeros_like(y_trn)
            for i, (i_trn, i_val) in enumerate(cv, start=1):
                # NOTE(review): random_state=2014 here vs. 2015 in the final
                # CV below — looks unintentional; confirm before changing.
                clf = LR(C=C, class_weight='auto', random_state=2014)
                clf.fit(X[i_trn], y_trn[i_trn])
                p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

            auc_cv = AUC(y_trn, p_val)
            logging.info('AUC CV: {:.6f}'.format(auc_cv))
            auc_cvs.append(auc_cv)

        # Keep the best candidate only if it improves on the previous round.
        auc_cv_new = max(auc_cvs)
        if auc_cv_new > auc_cv_old:
            auc_cv_old = auc_cv_new
            feature = features_to_test.pop(auc_cvs.index(auc_cv_new))
            selected_features.append(feature)
            logging.info('selected features: {}'.format(selected_features))
        else:
            is_improving = False
            logging.info(
                'final selected features: {}'.format(selected_features))

    logging.info('saving selected features as a file')
    with open('{}_selected.txt'.format(model_name), 'w') as f:
        f.write('{}\n'.format(selected_features))

    # Final out-of-fold predictions with the frozen feature set.
    X = X_trn[:, selected_features]
    logging.debug('feature matrix: {}x{}'.format(X.shape[0], X.shape[1]))
    p_val = np.zeros_like(y_trn)
    for i, (i_trn, i_val) in enumerate(cv, start=1):
        logging.info('Training CV #{}'.format(i))
        clf = LR(C=C, class_weight='auto', random_state=2015)
        clf.fit(X[i_trn], y_trn[i_trn])
        p_val[i_val] = clf.predict_proba(X[i_val])[:, 1]

    auc_cv = AUC(y_trn, p_val)
    logging.info('AUC CV: {:.6f}'.format(auc_cv))
    logging.info("Writing test predictions to file")
    np.savetxt(predict_valid_file, p_val, fmt='%.6f', delimiter=',')

    # Refit on all training data for the test-set predictions.
    logging.info('Retraining with 100% data...')
    clf.fit(X, y_trn)
    p_tst = clf.predict_proba(X_tst[:, selected_features])[:, 1]
    np.savetxt(predict_test_file, p_tst, fmt='%.6f', delimiter=',')
# 7.3 线性回归的诊断 # 7.3.1 残差分析 # In[13]: # ols类计算 线性回归模型 并得到 预测值 和 残差 ana1 = ols('avg_exp ~ Age + Income + dist_home_val', data=exp).fit() exp['Pred'] = ana1.predict(exp) exp['resid'] = ana1.resid # 残差随着x的增大呈现 喇叭口形状,出现异方差 exp.plot('Pred', 'resid', kind='scatter') # Pred = β*Income,随着预测值的增大,残差resid呈现 喇叭口形状 ana1.summary() # In[]: Xtrain = exp[['Age', 'Income', 'dist_home_val']] Ytrain = exp[['avg_exp']] reg = LR().fit(Xtrain, Ytrain) yhat = reg.predict(Xtrain) # 预测我们的yhat print(reg.score(Xtrain, Ytrain)) predict = pd.DataFrame(yhat, columns=['Pred']) print(Ytrain.dtypes, predict.dtypes) y = Ytrain.copy() ft.recovery_index([y]) # resid = pd.DataFrame((y['avg_exp'] - predict["Pred"]), columns=['resid']) resid = pd.DataFrame(y['avg_exp'].sub(predict["Pred"]), columns=['resid']) resid_1 = pd.concat([predict, resid], axis=1) resid_1.plot('Pred', 'resid', kind='scatter') print(ft.r2_score_customize(Ytrain, yhat, 1))
def fit(self, X, y):  # -1 for unlabeled
    """
    Fit the semi-supervised model: train on the labeled rows, then assign
    pessimistic soft labels to the unlabeled rows by minimizing the
    discriminative likelihood with a global (derivative-free) optimizer,
    and finally refit on labeled + pseudo-labeled data.

    X : feature matrix; y : labels where -1 marks an unlabeled row.
    Returns self.
    """
    unlabeledX = X[y == -1, :]
    labeledX = X[y != -1, :]
    labeledy = y[y != -1]
    M = unlabeledX.shape[0]  # number of unlabeled points = optimization dims

    # train on labeled data
    self.model.fit(labeledX, labeledy)
    unlabeledy = self.predict(unlabeledX)

    # re-train, labeling unlabeled instances pessimistically.
    # Soft labels ('weights') q for unlabeled points, q = P(k=0 | Xu).
    # NOTE(review): grad=[] is a mutable default shared across calls of this
    # lambda — harmless if nlopt never mutates it, but worth confirming.
    f = lambda softlabels, grad=[
    ]: self.discriminative_likelihood_objective(
        self.model,
        labeledX,
        labeledy=labeledy,
        unlabeledData=unlabeledX,
        unlabeledWeights=numpy.vstack((softlabels, 1 - softlabels)).T,
        gradient=grad)  # - supLL
    lblinit = numpy.random.random(len(unlabeledy))  # random soft-label start

    try:
        self.it = 0
        # Global derivative-free search over the M soft labels in [0, 1]^M.
        opt = nlopt.opt(nlopt.GN_DIRECT_L_RAND, M)
        opt.set_lower_bounds(numpy.zeros(M))
        opt.set_upper_bounds(numpy.ones(M))
        opt.set_min_objective(f)
        opt.set_maxeval(self.max_iter)
        self.bestsoftlbl = opt.optimize(lblinit)
        print(" max_iter exceeded.")
    except Exception as e:
        # On optimizer failure fall back to the best labels seen so far.
        print(e)
        self.bestsoftlbl = self.bestlbls

    # NOTE(review): this clamp always replaces the optimizer's result with
    # self.bestlbls whenever they differ at all — confirm that is intended.
    if numpy.any(self.bestsoftlbl != self.bestlbls):
        self.bestsoftlbl = self.bestlbls

    ll = f(self.bestsoftlbl)  # likelihood at the chosen soft labels

    # Harden soft labels: q < 0.5 -> class 1.
    unlabeledy = (self.bestsoftlbl < 0.5) * 1
    uweights = numpy.copy(
        self.bestsoftlbl
    )  # large prob. for k=0 instances, small prob. for k=1 instances
    uweights[unlabeledy == 1] = 1 - uweights[
        unlabeledy == 1]  # subtract from 1 for k=1 instances to reflect confidence
    weights = numpy.hstack((numpy.ones(len(labeledy)), uweights))
    labels = numpy.hstack((labeledy, unlabeledy))

    # Refit on labeled + pseudo-labeled data, optionally confidence-weighted.
    if self.use_sample_weighting:
        self.model.fit(numpy.vstack((labeledX, unlabeledX)),
                       labels,
                       sample_weight=weights)
    else:
        self.model.fit(numpy.vstack((labeledX, unlabeledX)), labels)

    if self.verbose > 1:
        print("number of non-one soft labels: ",
              numpy.sum(self.bestsoftlbl != 1), ", balance:",
              numpy.sum(self.bestsoftlbl < 0.5), " / ",
              len(self.bestsoftlbl))
        print("current likelihood: ", ll)

    # If the base model lacks predict_proba, calibrate with Platt scaling.
    if not getattr(self.model, "predict_proba", None):
        # Platt scaling
        self.plattlr = LR()
        preds = self.model.predict(labeledX)
        self.plattlr.fit(preds.reshape(-1, 1), labeledy)

    return self
# sns.distplot(df['Price']) # plt.show() # sns.heatmap(df.corr(),annot=True) # plt.show() # REGERESSION X = df[df.columns[range(5)]] y = df['Price'] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=101) lm = LR() lm.fit(X_train, y_train) print("Intecept: {}\n".format(lm.intercept_)) cdf = pd.DataFrame(lm.coef_, X.columns, columns=['Coeff']) print("Coefficients:\n{}\n".format(cdf)) pred = pd.DataFrame({'A': lm.predict(X_test), 'B': y_test}) # print(pred) print("Correlation:\n{}\n".format(pred.corr())) # BOSTON from sklearn.datasets import load_boston boston = load_boston()
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
from sklearn.preprocessing import PolynomialFeatures

data = pd.read_csv("positions.csv")
print(data.columns)
print("************\n")
# print(data.describe())
# X axis: position level (column 1), as a 2-D column vector.
Level = data.iloc[:, 1].values.reshape(-1, 1)
# Y axis: salary (column 2).
Salary = data.iloc[:, 2].values.reshape(-1, 1)
# Fit a plain linear model (a straight regression line).
regression = LR()
# Fit Level (X) against Salary (Y).
regression.fit(Level, Salary)
# Estimate the salary of someone at level 8.3.
tahmin = regression.predict([[8.3]])
# A straight line is a poor fit for this data, so the estimate is unreliable.
print("Tahmini Salary fiyatı:" + str(tahmin))
# Polynomial fit: expand Level into degree-4 polynomial features for a curve.
regressionPoly = PolynomialFeatures(degree=4)
levelPoly = regressionPoly.fit_transform(Level)
'/Users/jiangjiantao/Downloads/79079699_2_py大作业_调查_5_5.csv', encoding='GBK') print(sourceData) y = read_csv('/Users/jiangjiantao/Downloads/yyyyyyyyyyyyyyy.csv', encoding='GBK') print(y.shape) XTrain, XTest, YTrain, YTest = train_test_split(sourceData, y, test_size=0.6, random_state=420) for i in [XTrain, XTest]: i.index = range(i.shape[0]) XTrain.shape reg = LR().fit(XTrain, YTrain) yHat = reg.predict(XTest) print(yHat) reg.coef_ [*zip(XTrain.columns, reg.coef_)] print(reg.intercept_) from sklearn.metrics import mean_squared_error as MSE print(MSE(yHat, YTest)) y.max() y.min() import sklearn sorted(sklearn.metrics.SCORERS.keys()) print( cross_val_score(reg, sourceData, y, cv=5,
# Split: first nfit rows for fitting, the rest for validation.
xfit = Xo[:nfit]
yfit = Y[:nfit]
xval = Xo[nfit:]
yval = Y[nfit:]
'''2. 在已有的数据集上进行算法的验证和测试'''
# (Step 2: validate and test candidate algorithms on the prepared dataset.)
from sklearn.svm import SVR
from sklearn.svm import NuSVR
model = SVR()
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeRegressor as dtr
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
# NOTE(review): sklearn.cross_validation was removed in 0.20 — modern
# versions need sklearn.model_selection instead; confirm the pinned version.
from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression as LR
# Each assignment below overwrites `model`; only the last one survives for
# whatever downstream code consumes it (experiment scratchpad style).
model = LR(C=0.004)
model = LR(C=0.01, penalty='l1')
from sklearn.linear_model import BayesianRidge as BR
model = BR(alpha_1=1e2,
           alpha_2=3e2,
           lambda_1=1e-9,
           lambda_2=1e-9,
           compute_score=False)
from sklearn.linear_model import (LinearRegression, Lasso, RandomizedLasso,
                                  Ridge)
from sklearn.feature_selection import (RFE, f_regression)
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from sklearn.neighbors import NearestNeighbors
def whole_data_models(train_indep, train_dep, num_bootstraps, num_trees,
                      master_train, test_data):
    """
    Fit random-forest, logistic-regression and decision-tree churn models on
    master_train, report train/test accuracy and precision/recall, dump the
    test predictions and RF feature importances to Excel, and pickle the
    important columns.

    NOTE(review): train_indep, train_dep and num_bootstraps are unused in
    this body — confirm whether they are vestigial. Python 2 code (print
    statements, pickle file opened in text mode 'w').
    """
    model_rf = RF(n_estimators=num_trees, criterion='gini', bootstrap=True,
                  n_jobs=-1, min_samples_leaf=5)
    model_lr = LR()
    model_dt = DT()

    # Features = every column except the label and the identifier.
    indep_columns = master_train.columns.tolist()
    indep_columns.remove('churn_flag')
    indep_columns.remove('sl_uuid')
    #indep_columns = ['customer_life']
    '''try:
        columns = pk.load(open('columns.pk','r'))
        indep_columns = columns
    except:
        print ("not created yet")
    '''
    master_indep = master_train[indep_columns]
    master_dep = master_train['churn_flag'].tolist()

    model_rf.fit(master_indep, master_dep)
    model_lr.fit(master_indep, master_dep)
    model_dt.fit(master_indep, master_dep)

    #Printing out accuracies from train set
    print model_rf.score(master_indep, master_dep)
    print model_lr.score(master_indep, master_dep)
    print model_dt.score(master_indep, master_dep)

    train_pred_rf = model_rf.predict(master_indep)
    train_pred_lr = model_lr.predict(master_indep)
    train_pred_dt = model_dt.predict(master_indep)

    # Precision/recall/F-score on the training data (binary averaging).
    score_rf = PRFS(y_true=master_dep, y_pred=train_pred_rf, average='binary')
    score_lr = PRFS(y_true=master_dep, y_pred=train_pred_lr, average='binary')
    score_dt = PRFS(y_true=master_dep, y_pred=train_pred_dt, average='binary')

    print('Random forest precision and recall:\t' + str(score_rf[0]) + '\t' +
          str(score_rf[1]))
    print('Logistic regression precision and recall:\t' + str(score_lr[0]) +
          '\t' + str(score_lr[1]))
    print('Decision tree precision and recall:\t' + str(score_dt[0]) + '\t' +
          str(score_dt[1]))

    #Evaluating test data
    # Rebuild the same feature list and re-point master_indep/master_dep at
    # the held-out test frame; the models are NOT refit here.
    indep_columns = master_train.columns.tolist()
    indep_columns.remove('churn_flag')
    indep_columns.remove('sl_uuid')
    #indep_columns = ['customer_life']
    master_indep = test_data[indep_columns]
    master_dep = test_data['churn_flag'].tolist()
    #model_rf.fit(master_indep, master_dep)
    #model_lr.fit(master_indep, master_dep)
    #model_dt.fit(master_indep, master_dep)

    #Printing out accuracies from train set
    print('TEST DATA')
    print model_rf.score(master_indep, master_dep)
    print model_lr.score(master_indep, master_dep)
    print model_dt.score(master_indep, master_dep)

    train_pred_rf = model_rf.predict(master_indep)
    train_pred_lr = model_lr.predict(master_indep)
    train_pred_dt = model_dt.predict(master_indep)

    # Per-class precision/recall arrays (no averaging) on the test data.
    score_rf = PRFS(y_true=master_dep, y_pred=train_pred_rf)
    score_lr = PRFS(y_true=master_dep, y_pred=train_pred_lr)
    score_dt = PRFS(y_true=master_dep, y_pred=train_pred_dt)

    # Persist actual vs. predicted for inspection.
    # NOTE(review): assigning into master_indep mutates a slice of test_data
    # (pandas SettingWithCopy territory) — confirm this is acceptable.
    prediction = model_rf.predict(master_indep)
    master_indep['actual'] = master_dep
    master_indep['prediction'] = prediction
    master_indep.to_excel('master_indep.xlsx')

    print('Random forest precision and recall:\t' + str(score_rf[0]) + '\t' +
          str(score_rf[1]))
    print('Logistic regression precision and recall:\t' + str(score_lr[0]) +
          '\t' + str(score_lr[1]))
    print('Decision tree precision and recall:\t' + str(score_dt[0]) + '\t' +
          str(score_dt[1]))

    print('importance of variables:\t')
    col = indep_columns
    imp = model_rf.feature_importances_.tolist()
    di = {'columns': col, 'importance': imp}
    df = pd.DataFrame(di)
    df.to_excel('variable_importance.xlsx')
    # Keep only columns with non-negligible importance and pickle them.
    df = df[df['importance'] > 0.00001]
    columns = df['columns'].tolist()
    pk.dump(columns, open('columns.pk', 'w'))
    print(df)
#Y-axis setup. ax.set_ylabel("Price", fontsize=22) ax.set_ylim(0, 800000) ax.yaxis.set_major_formatter(SMF('${x:,.0f}')) ax.tick_params(axis='both', which='major', labelsize=14) #Legend setup. exp3._legend.remove() ax.legend(loc='upper left', title='Overall House Quality', labels=[ 'Very Poor', 'Poor', 'Fair', 'Below Average', 'Average', 'Above Average', 'Good', 'Very Good', 'Excellent', 'Very Excellent' ], ncol=2, title_fontsize=18, fontsize=14) ############## HYPOTHESIS TESTING ################## ### Significance Test x = df['Overall Qual'].to_numpy().reshape(-1, 1) y = df['SalePrice'].to_numpy().reshape(-1, ) reg = LR().fit(x, y) r_sq = reg.score(x, y) p_val = fr(x, y)[1][0] print("The R-squared is {}.".format(round(r_sq, 3))) print("The p_value is {}.".format(p_val))
def generatePredictWithLostic(x, y, x_predict):
    """
    Fit a logistic-regression classifier on (x, y) and return the class
    probabilities for x_predict as a DataFrame (one column per class).
    """
    classifier = LR().fit(x, y)
    probabilities = classifier.predict_proba(x_predict)
    return pd.DataFrame(probabilities)
def run_CV(self):
    """
    10-fold CV active-learning loop: seed each fold with pretrained initial
    examples, then repeatedly fit, pick one unlabeled example via
    select_example, and record test accuracy after each query.

    Reads: self.fn (features), self.label (gold labels), self.m_multipleClass.
    Writes per-query accuracies to '<fileSrc>/<modelVersion>_acc.txt'.
    """
    cvIter = 0
    totalInstanceNum = len(self.label)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    totalTransferNumList = []
    # np.random.seed(3)
    random.shuffle(indexList)

    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    # First foldNum-1 folds get foldInstanceNum items; the last takes the rest.
    foldInstanceList = []
    for foldIndex in range(foldNum - 1):
        foldIndexInstanceList = indexList[foldIndex *
                                          foldInstanceNum:(foldIndex + 1) *
                                          foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
    foldInstanceList.append(foldIndexInstanceList)
    # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
    cvIter = 0

    totalAccList = [[] for i in range(10)]  # per-fold accuracy curves
    totalNewClassFlagList = [[] for i in range(10)]
    for foldIndex in range(foldNum):
        # Multinomial LR for multi-class tasks, binary LR otherwise.
        if self.m_multipleClass:
            self.m_clf = LR(multi_class="multinomial",
                            solver='lbfgs',
                            random_state=3,
                            fit_intercept=False)
        else:
            self.m_clf = LR(random_state=3)

        # Training pool = every fold except the held-out one.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])

        fn_train = self.fn[train]
        test = foldInstanceList[foldIndex]
        fn_test = self.fn[test]
        label_test = self.label[test]

        featureDim = len(fn_train[0])
        self.init_confidence_bound(featureDim)

        # Seed the labeled pool with pretraining-selected examples.
        initExList = []
        initExList = self.pretrainSelectInit(train, foldIndex)
        fn_init = self.fn[initExList]
        label_init = self.label[initExList]
        print("initExList\t", initExList, label_init)

        # queryIter starts at 3 to account for the seed labels already spent.
        queryIter = 3
        labeledExList = []
        unlabeledExList = []
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train) - set(labeledExList))

        while queryIter < rounds:
            fn_train_iter = []
            label_train_iter = []
            fn_train_iter = self.fn[labeledExList]
            label_train_iter = self.label[labeledExList]

            # Refit on everything labeled so far, then query one example.
            self.m_clf.fit(fn_train_iter, label_train_iter)
            idx = self.select_example(unlabeledExList)
            self.update_select_confidence_bound(idx)
            # print(queryIter, "idx", idx, self.label[idx])
            # self.update_select_confidence_bound(idx)
            labeledExList.append(idx)
            unlabeledExList.remove(idx)

            acc = self.get_pred_acc(fn_test, label_test, labeledExList)
            totalAccList[cvIter].append(acc)
            queryIter += 1

        cvIter += 1

    # NOTE(review): the handle leaks if a write raises before f.close();
    # a context manager would be safer.
    totalACCFile = modelVersion + "_acc.txt"
    totalACCFile = os.path.join(fileSrc, totalACCFile)
    f = open(totalACCFile, "w")
    for i in range(10):
        totalAlNum = len(totalAccList[i])
        for j in range(totalAlNum):
            f.write(str(totalAccList[i][j]) + "\t")
        f.write("\n")
    f.close()
def run_CV(self):
    """
    10-fold CV active-learning loop with an auditor: per query, compare the
    transfer label against the gold label, grow an auditor training set of
    (feature, agreement-flag) pairs, and track auditor and extra-example
    accuracy curves.

    Reads: self.m_targetLabel, self.m_targetNameFeature, self.m_transferLabel,
    self.m_multipleClass. Writes '<modelVersion>_auditor_acc.txt' and
    '<modelVersion>_extra_acc.txt' via writeFile.
    """
    cvIter = 0
    totalInstanceNum = len(self.m_targetLabel)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    totalTransferNumList = []

    # Fixed seed so the fold assignment is reproducible.
    np.random.seed(3)
    np.random.shuffle(indexList)

    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    # First foldNum-1 folds get foldInstanceNum items; the last takes the rest.
    foldInstanceList = []
    for foldIndex in range(foldNum - 1):
        foldIndexInstanceList = indexList[foldIndex *
                                          foldInstanceNum:(foldIndex + 1) *
                                          foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
    foldInstanceList.append(foldIndexInstanceList)
    # kf = KFold(totalInstanceNum, n_folds=self.fold, shuffle=True)
    # random.seed(3)

    totalAccList = [[] for i in range(10)]
    humanAccList = [[] for i in range(10)]
    totalExtraAccList = []
    # self.get_base_learners()
    correctTransferRatioList = []
    totalTransferNumList = []
    correctTransferLabelNumList = []
    correctUntransferRatioList = []
    totalAuditorPrecisionList = []
    totalAuditorRecallList = []
    totalAuditorAccList = []

    for foldIndex in range(foldNum):
        # self.clf = LinearSVC(random_state=3)
        # Multinomial LR for multi-class tasks, binary LR otherwise.
        if self.m_multipleClass:
            self.m_clf = LR(multi_class="multinomial",
                            solver='lbfgs',
                            random_state=3,
                            fit_intercept=False)
        else:
            self.m_clf = LR(random_state=3)
        self.m_judgeClassifier = LR(random_state=3)

        # Training pool = every fold except the held-out one.
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])
        trainNum = int(totalInstanceNum * 0.9)

        targetNameFeatureTrain = self.m_targetNameFeature[train]
        targetLabelTrain = self.m_targetLabel[train]
        # targetDataFeatureTrain = self.m_targetDataFeature[train]

        targetNameFeatureTest = self.m_targetNameFeature[test]
        targetLabelTest = self.m_targetLabel[test]
        transferLabelTest = self.m_transferLabel[test]
        # targetDataFeatureTest = self.m_targetDataFeature[test]
        # sourceUniqueClass = np.unique(self.m_sourceLabel)

        # Seed the labeled pool with pretraining-selected examples.
        initExList = []
        initExList = self.pretrainSelectInit(train, foldIndex)
        targetNameFeatureInit = self.m_targetNameFeature[initExList]
        targetLabelInit = self.m_targetLabel[initExList]
        print("initExList\t", initExList, targetLabelInit)

        queryIter = 0
        labeledExList = []
        unlabeledExList = []
        ###labeled index
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train) - set(labeledExList))

        # activeLabelNum starts at 3.0 for the seed labels already spent.
        activeLabelNum = 3.0
        transferLabelNum = 0.0
        transferFeatureList = []
        transferFlagList = []

        featureDim = len(targetNameFeatureTrain[0])
        self.init_confidence_bound(featureDim, labeledExList, unlabeledExList)

        targetNameFeatureIter = targetNameFeatureInit
        targetLabelIter = targetLabelInit

        correctTransferLabelNum = 0.0
        wrongTransferLabelNum = 0.0
        correctUntransferLabelNum = 0.0
        wrongUntransferLabelNum = 0.0

        # auditorPrecisionList = []
        # auditorRecallList = []
        auditorAccList = []
        extraAccList = []

        # NOTE(review): the target classifier is fit once on the seed set and
        # never refit inside the query loop (the refit lines below are
        # commented out) — confirm this is intentional.
        self.m_clf.fit(targetNameFeatureInit, targetLabelInit)
        while activeLabelNum < rounds:
            # targetNameFeatureIter = self.m_targetNameFeature[labeledExList]
            # targetLabelIter = self.m_targetLabel[labeledExList]
            # self.m_clf.fit(targetNameFeatureIter, targetLabelIter)

            exId = self.select_example(unlabeledExList)
            exLabel = -1
            self.m_strongLabeledIDList.append(exId)
            self.update_select_confidence_bound(exId)
            self.update_judge_confidence_bound(exId)

            activeLabelNum += 1.0
            activeLabelFlag = True

            # Record whether the transfer label agrees with the gold label;
            # the auditor is trained on these (feature, flag) pairs.
            exLabel = self.m_targetLabel[exId]
            transferLabel = self.m_transferLabel[exId]
            if transferLabel == exLabel:
                # correctUntransferLabelNum += 1.0
                transferFlagList.append(1.0)
                transferFeatureList.append(self.m_targetNameFeature[exId])
            else:
                # wrongUntransferLabelNum += 1.0
                transferFlagList.append(0.0)
                transferFeatureList.append(self.m_targetNameFeature[exId])

            # auditorPrecision = 0.0
            # if correctTransferLabelNum+wrongTransferLabelNum > 0.0:
            #     auditorPrecision = correctTransferLabelNum*1.0/(correctTransferLabelNum+wrongTransferLabelNum)

            auditorAcc = self.getAuditorMetric(transferFeatureList,
                                               transferFlagList,
                                               targetNameFeatureTest,
                                               transferLabelTest,
                                               targetLabelTest)
            # print("auditorAcc", auditorAcc)
            auditorAccList.append(auditorAcc)

            labeledExList.append(exId)
            unlabeledExList.remove(exId)

            # acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest, targetNameFeatureIter, targetLabelIter)
            # totalAccList[cvIter].append(acc)
            extraAcc = self.addExtraExample(transferFeatureList,
                                            transferFlagList,
                                            targetNameFeatureTest,
                                            transferLabelTest,
                                            targetLabelTest)
            extraAccList.append(extraAcc)
            # humanAccList[cvIter].append(acc)
            queryIter += 1

        # totalAuditorPrecisionList.append(auditorPrecisionList)
        # totalAuditorRecallList.append(auditorRecallList)
        totalAuditorAccList.append(auditorAccList)
        totalExtraAccList.append(extraAccList)
        cvIter += 1

    # print("transfer num\t", np.mean(totalTransferNumList), np.sqrt(np.var(totalTransferNumList)))
    # print("extraList", extraAccList, np.mean(extraAccList), np.sqrt(np.var(extraAccList)))
    # print("correct ratio\t", np.mean(correctTransferRatioList), np.sqrt(np.var(correctTransferRatioList)))
    # print("untransfer correct ratio\t", np.mean(correctUntransferRatioList), np.sqrt(np.var(correctUntransferRatioList)))

    # AuditorPrecisionFile = modelVersion+"_auditor_precision.txt"
    # writeFile(totalAuditorPrecisionList, AuditorPrecisionFile)
    # AuditorRecallFile = modelVersion+"_auditor_recall.txt"
    # writeFile(totalAuditorRecallList, AuditorRecallFile)
    AuditorAccFile = modelVersion + "_auditor_acc.txt"
    writeFile(totalAuditorAccList, AuditorAccFile)

    # totalACCFile = modelVersion+"_acc.txt"
    # writeFile(totalAccList, totalACCFile)
    # humanACCFile = modelVersion+"_human_acc.txt"
    # writeFile(humanAccList, humanACCFile)
    extraACCFile = modelVersion + "_extra_acc.txt"
    writeFile(totalExtraAccList, extraACCFile)
import pandas as pd
from sklearn.linear_model import LogisticRegression as LR

# Bank-loan default spreadsheet (Windows path with a Chinese folder name).
inputpath = 'D:/PythonProject/python02\逻辑回归/data/bankloan.xls'
data = pd.read_excel(inputpath)
x = data.iloc[:, :8].values  # first 8 columns are the features
y = data.iloc[:, 8].values  # 9th column is the default flag
lr = LR(solver='liblinear')
lr = lr.fit(x, y)
# Mean accuracy on the training data itself (no held-out split).
print('模型的平均准确度为:%s' % lr.score(x, y))
import operator
from functools import reduce
from sklearn.linear_model import LogisticRegression as LR
from sklearn.ensemble import RandomForestClassifier as RF
import pickle

# Pre-vectorised training set: columns 0-8 are features, column 9 is the label.
# (pd / np are imported earlier in this module.)
data = np.array(pd.read_csv('Train_vec.csv'))
x_train = data[:, :9]
y_train = data[:, 9]
print('x_train[1] : ', x_train[1])
print('y_train[1] : ', y_train[1])

# One-vs-rest logistic regression, scored on the training data itself.
lr = LR(random_state=1, solver='lbfgs', C=1.0, multi_class='ovr')
lr.fit(x_train, y_train)
preds = lr.predict(x_train)
preds_prob = lr.predict_proba(x_train)
lr_score = lr.score(x_train, y_train)
print('LR - Predictions (' + str(len(preds)) + ') : ', preds)
print('LR - Scores : ', lr_score)

# Positions of the samples predicted as class 1.
idxs = [i for i, p in enumerate(preds) if p == 1]
# Keep the full matrix for now; use x_train[idxs] to restrict to class-1 rows.
x_train_2 = x_train
# =============================================================================
# linear regression
# =============================================================================
# case2)
# - split train data into X (features) and y (target)
# - fit on the training split, then predict y for the held-out split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)
print(len(train_x))
print(len(train_y))
print(len(test_x))
print(len(test_y))

linear = lr()
linear.fit(train_x, train_y)
# NOTE(review): rebinding the class alias LR to an instance shadows the class
# for the rest of the module, and the instance is never used afterwards.
LR = LR()
pd.Series(y)

# statsmodels OLS on the full data (with an explicit intercept column)
# to get a coefficient/significance summary.
x2 = sm.add_constant(X)
model = sm.OLS(y, x2)
result = model.fit()
print(result.summary())

y_pred = linear.predict(test_x)
print(y_pred)
print(list(test_y))
# BUG FIX: the original printed line was missing a comma (SyntaxError) and the
# duplicate below referenced an undefined name `y_test` — the held-out labels
# produced by train_test_split above are bound to `test_y`.
# NOTE(review): accuracy_score expects discrete class labels; if y is a
# continuous regression target this call will raise — consider r2_score.
print('정확도 :', metrics.accuracy_score(test_y, y_pred))
# Outcome: indicator for stage-2 AKI within the next 48 hours.
Y = data['future48_AKI2_overlap'].values
# Chronological split at `cutoff` (defined earlier): earlier rows train,
# later rows test.
Ytrain, Ytest = Y[:cutoff], Y[cutoff:]
IDtrain = data['PAT_ENC_CSN_ID'].values[:cutoff]
# Keep only rows whose outcome is defined (drops NaN labels).
selectTrain, selectTest = np.isfinite(Ytrain), np.isfinite(Ytest)
Xtrain, Ytrain, IDtrain = Xtrain[
    selectTrain, :], Ytrain[selectTrain], IDtrain[selectTrain]
Xtest, Ytest = Xtest[selectTest, :], Ytest[selectTest]
# One weight per row so every patient contributes equally in aggregate:
# each of a patient's `length` rows receives weight 1/length.
# (pIndexSub rows appear to be (start, stop, length) per patient —
# TODO confirm against getPatientIndices.)
pIndexSub = getPatientIndices(IDtrain)
sampleWeights = np.zeros(len(Xtrain))
for i in tqdm(range(len(pIndexSub))):
    start, stop, length = pIndexSub[i, :]
    sampleWeights[start:stop] = 1 / length
##############################################################################
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import roc_auc_score as AUC
from Helper.utilities import showCoef

# L2-regularised logistic regression; class_weight='balanced' compensates for
# the rare positive class. The per-patient weights above are currently unused.
model = LR(class_weight='balanced', C=1e-1)
model.fit(Xtrain, Ytrain)  # ,sample_weight = sampleWeights)
P = model.predict_proba(Xtest)[:, 1]
model.coef_  # no-op at script level; useful when run interactively
print(AUC(Ytest, P))  # , sample_weight = sampleWeights))
## performance is around 0.83 currently
target, features = targetFeatureSplit(data) ### training-testing split needed in regression, just like classification from sklearn.cross_validation import train_test_split feature_train, feature_test, target_train, target_test = train_test_split( features, target, test_size=0.5, random_state=42) train_color = "b" test_color = "r" ### Your regression goes here! ### Please name it reg, so that the plotting code below picks it up and ### plots it correctly. Don't forget to change the test_color above from "b" to ### "r" to differentiate training points from test points. from sklearn.linear_model import LinearRegression as LR reg = LR().fit(feature_train, target_train) print "Coeff: ", reg.coef_ print "intercept: ", reg.intercept_ print "Train Score: ", reg.score(feature_train, target_train) print "Test Score: ", reg.score(feature_test, target_test) ### draw the scatterplot, with color-coded training and testing points import matplotlib.pyplot as plt for feature, target in zip(feature_test, target_test): plt.scatter(feature, target, color=test_color) for feature, target in zip(feature_train, target_train): plt.scatter(feature, target, color=train_color) ### labels for the legend plt.scatter(feature_test[0], target_test[0], color=test_color, label="test") plt.scatter(feature_test[0], target_test[0], color=train_color, label="train")
def fit(self, X, y):  # -1 for unlabeled
    """Fit the model according to the given training data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vector, where n_samples in the number of samples and
        n_features is the number of features.
    y : array-like, shape = [n_samples]
        Target vector relative to X
        Must be 0 or 1 for labeled and -1 for unlabeled instances

    Returns
    -------
    self : object
        Returns self.
    """
    # QN-S3VM reference implementation:
    # http://www.fabiangieseke.de/index.php/code/qns3vm
    # Partition rows by label; QN_S3VM expects plain Python lists.
    unlabeledX = X[y == -1, :].tolist()
    labeledX = X[y != -1, :].tolist()
    labeledy = y[y != -1]
    print 1  # debug checkpoint
    # convert class 0 to -1 for tsvm (QN_S3VM uses {-1, +1} labels)
    labeledy[labeledy == 0] = -1
    labeledy = labeledy.tolist()
    print 2  # debug checkpoint
    # Choose the kernel: RBF needs an explicit bandwidth (sigma = self.gamma).
    if 'rbf' in self.kernel.lower():
        self.model = QN_S3VM(labeledX, labeledy, unlabeledX,
                             self.random_generator, lam=self.C,
                             lamU=self.lamU, kernel_type="RBF",
                             sigma=self.gamma)
    else:
        self.model = QN_S3VM(labeledX, labeledy, unlabeledX,
                             self.random_generator, lam=self.C,
                             lamU=self.lamU)
    print 3  # debug checkpoint
    self.model.train()
    print 4  # debug checkpoint
    # probabilities by Platt scaling: fit a logistic regression on the raw
    # decision values of the labeled points so predict_proba can be offered.
    if self.probability:
        self.plattlr = LR()
        preds = self.model.mygetPreds(labeledX)
        self.plattlr.fit(preds.reshape(-1, 1), labeledy)
def svd_accuracy(file_name, ec, kwargs, folds=10, max_svs=10, max_init=15):
    """
    Classify data based on svd features.

    For each CV fold, computes the SVD of the class/feature cross-covariance
    and evaluates softmax classifiers trained on projections onto windows of
    singular vectors (window size `n_svs`, starting offset `sv_init`).

    Parameters
    ----------
    file_name : dataset identifier passed to ec.ECoG.
    ec : module/object providing the ECoG dataset class.
    kwargs : dict of extra ECoG constructor arguments (mutated: 'condense'
        is forced to False; deep-copied per fold).
    folds : number of CV folds.
    max_svs : largest singular-vector window size (capped at n_classes).
    max_init : largest starting offset considered.

    Returns
    -------
    (pa, ma, va, u_s, s_s, v_s, init_list, nsvs_list): place/manner/vowel
    accuracy arrays of shape (folds, len(nsvs_list), len(init_list)) — cells
    never evaluated stay +inf — plus per-fold SVD factors and the grids.
    """
    kwargs['condense'] = False
    # Initial load only to size things (n_classes, feature dim).
    ds = ec.ECoG(file_name, which_set='train', **kwargs)
    n_classes = int(np.around(ds.y.max() + 1))
    max_svs = min(max_svs, n_classes)
    # Valid starting offsets: window must fit inside n_classes and start
    # before max_init.
    init_list = np.arange(0, n_classes - max_svs + 1)
    init_list = init_list[init_list < max_init]
    nsvs_list = np.arange(1, max_svs + 1)
    # +inf marks (fold, n_svs, init) combinations never filled in.
    pa = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    ma = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    va = np.inf * np.ones((folds, len(nsvs_list), len(init_list)))
    u_s = np.zeros((folds, n_classes, n_classes))
    s_s = np.zeros((folds, n_classes))
    v_s = np.zeros((folds, n_classes, ds.X.shape[1]))
    ohf = OneHotFormatter(n_classes)
    for fold in range(folds):
        # Deep copy so per-fold dataset construction cannot mutate kwargs.
        kwargs_copy = copy.deepcopy(kwargs)
        print('fold: {}'.format(fold))
        ds = ec.ECoG(file_name, which_set='train', fold=fold, center=False,
                     **kwargs_copy)
        # CV: train = train + valid sets, centered by the training mean;
        # the same mean is applied to the test set.
        ts = ds.get_test_set()
        vs = ds.get_valid_set()
        train_X = np.concatenate((ds.X, vs.X), axis=0)
        train_mean = train_X.mean(axis=0)
        train_X = train_X - train_mean
        train_y = np.concatenate((ds.y, vs.y), axis=0)
        test_X = ts.X - train_mean
        test_y = ts.y
        # Cross-covariance between one-hot labels and features; its right
        # singular vectors give class-discriminative feature directions.
        y_oh = ohf.format(train_y, mode='concatenate')
        c_yx = (y_oh - y_oh.mean(axis=0)).T.dot(train_X) / train_X.shape[0]
        u, s, v = np.linalg.svd(c_yx, full_matrices=False)
        u_s[fold] = u
        s_s[fold] = s
        v_s[fold] = v
        for ii, n_svs in enumerate(nsvs_list):
            for jj, sv_init in enumerate(init_list):
                # Project onto the window of singular vectors and fit a
                # multinomial softmax classifier on the projection.
                vp = v[sv_init:sv_init + n_svs]
                train_proj = train_X.dot(vp.T)
                test_proj = test_X.dot(vp.T)
                cl = LR(solver='lbfgs',
                        multi_class='multinomial').fit(train_proj,
                                                       train_y.ravel())
                y_hat = cl.predict(test_proj)
                # Per-aspect scoring: each *_equiv returns None when the
                # aspect is undefined for the pair, so those are skipped.
                p_results = []
                m_results = []
                v_results = []
                for y, yh in zip(test_y.ravel(), y_hat.ravel()):
                    pr = place_equiv(y, yh)
                    if pr is not None:
                        p_results.append(pr)
                    mr = manner_equiv(y, yh)
                    if mr is not None:
                        m_results.append(mr)
                    vr = vowel_equiv(y, yh)
                    if vr is not None:
                        v_results.append(vr)
                pa[fold, ii, jj] = np.array(p_results).mean()
                ma[fold, ii, jj] = np.array(m_results).mean()
                va[fold, ii, jj] = np.array(v_results).mean()
    return pa, ma, va, u_s, s_s, v_s, init_list, nsvs_list
def run_CV(self):
    """Run 10-fold CV of active learning with a weak (transfer) oracle.

    For each fold: fit a weak oracle on transferred labels, seed the
    classifier with a few pre-selected examples, then iteratively pick
    examples; each pick is either labeled for free by the transfer oracle
    or sent to the (simulated) human, until `rounds` human labels are
    spent. Writes auditor/overall/human accuracy traces to disk.
    Relies on module-level `rounds`, `modelVersion`, `writeFile`, `LR`, `np`.
    """
    cvIter = 0
    totalInstanceNum = len(self.m_targetLabel)
    print("totalInstanceNum\t", totalInstanceNum)
    indexList = [i for i in range(totalInstanceNum)]
    totalTransferNumList = []
    # Fixed seed so the fold partition is reproducible across runs.
    np.random.seed(3)
    np.random.shuffle(indexList)

    # Partition the shuffled indices into 10 folds; the last fold absorbs
    # the remainder when totalInstanceNum is not divisible by 10.
    foldNum = 10
    foldInstanceNum = int(totalInstanceNum * 1.0 / foldNum)
    foldInstanceList = []
    for foldIndex in range(foldNum - 1):
        foldIndexInstanceList = indexList[
            foldIndex * foldInstanceNum:(foldIndex + 1) * foldInstanceNum]
        foldInstanceList.append(foldIndexInstanceList)
    foldIndexInstanceList = indexList[foldInstanceNum * (foldNum - 1):]
    foldInstanceList.append(foldIndexInstanceList)

    # Per-fold accuracy traces (one inner list per fold).
    totalAccList = [[] for i in range(10)]
    humanAccList = [[] for i in range(10)]
    correctTransferRatioList = []
    totalTransferNumList = []
    correctUntransferRatioList = []
    totalAuditorPrecisionList = []
    totalAuditorRecallList = []
    totalAuditorAccList = []

    for foldIndex in range(foldNum):
        # Fresh models per fold (fixed random_state for reproducibility).
        self.m_clf = LR(random_state=3)
        self.m_judgeClassifier = LR(random_state=3)
        self.m_weakOracle = LR(random_state=3)

        # Train on every fold except the current one (the test fold).
        train = []
        for preFoldIndex in range(foldIndex):
            train.extend(foldInstanceList[preFoldIndex])
        test = foldInstanceList[foldIndex]
        for postFoldIndex in range(foldIndex + 1, foldNum):
            train.extend(foldInstanceList[postFoldIndex])

        trainNum = int(totalInstanceNum * 0.9)

        targetNameFeatureTrain = self.m_targetNameFeature[train]
        targetLabelTrain = self.m_targetLabel[train]
        transferLabelTrain = self.m_transferLabel[train]

        # Weak oracle learns to reproduce the transferred labels.
        self.m_weakOracle.fit(targetNameFeatureTrain, transferLabelTrain)

        targetNameFeatureTest = self.m_targetNameFeature[test]
        targetLabelTest = self.m_targetLabel[test]
        transferLabelTest = self.m_transferLabel[test]

        # Seed set chosen by the pre-training heuristic (not random).
        initExList = []
        initExList = self.pretrainSelectInit(train)
        targetNameFeatureInit = self.m_targetNameFeature[initExList]
        targetLabelInit = self.m_targetLabel[initExList]
        print("initExList\t", initExList, targetLabelInit)

        queryIter = 0
        labeledExList = []
        unlabeledExList = []
        ###labeled index
        labeledExList.extend(initExList)
        unlabeledExList = list(set(train) - set(labeledExList))

        # The seed examples count as 3 human labels already spent.
        activeLabelNum = 3.0
        transferLabelNum = 0.0
        transferFeatureList = []
        transferFlagList = []

        featureDim = len(targetNameFeatureTrain[0])
        self.init_confidence_bound(featureDim, labeledExList,
                                   unlabeledExList)

        targetNameFeatureIter = targetNameFeatureInit
        targetLabelIter = targetLabelInit

        correctTransferLabelNum = 0.0
        wrongTransferLabelNum = 0.0
        correctUntransferLabelNum = 0.0
        wrongUntransferLabelNum = 0.0

        auditorAccList = []
        # Spend human labels until the budget `rounds` is exhausted.
        while activeLabelNum < rounds:
            self.m_clf.fit(targetNameFeatureIter, targetLabelIter)
            exId = self.select_example(unlabeledExList)
            self.update_select_confidence_bound(exId)

            activeLabelFlag = False
            # Decide whether the transfer oracle's label is trusted.
            transferLabelFlag, transferLabel = self.get_transfer_flag(exId)
            exLabel = -1
            if transferLabelFlag:
                # Transferred (free) label accepted: no human cost.
                self.m_weakLabeledIDList.append(exId)
                transferLabelNum += 1.0
                activeLabelFlag = False
                exLabel = transferLabel
                targetNameFeatureIter = np.vstack(
                    (targetNameFeatureIter, self.m_targetNameFeature[exId]))
                targetLabelIter = np.hstack((targetLabelIter, exLabel))
                if exLabel == self.m_targetLabel[exId]:
                    print("correct transfer queryIter\t", queryIter)
                    correctTransferLabelNum += 1.0
                else:
                    wrongTransferLabelNum += 1.0
                    print("query iteration", queryIter,
                          "error transfer label\t", exLabel, "true label",
                          self.m_targetLabel[exId])
            else:
                # Human label requested: costs one unit of budget.
                self.m_strongLabeledIDList.append(exId)
                activeLabelNum += 1.0
                activeLabelFlag = True
                exLabel = self.m_targetLabel[exId]
                targetNameFeatureIter = np.vstack(
                    (targetNameFeatureIter, self.m_targetNameFeature[exId]))
                targetLabelIter = np.hstack((targetLabelIter, exLabel))
                # Record whether the (rejected) transfer label would have
                # agreed with the true label; this trains/scores the auditor.
                if transferLabel == exLabel:
                    correctUntransferLabelNum += 1.0
                    transferFlagList.append(1.0)
                    transferFeatureList.append(
                        self.m_targetNameFeature[exId])
                else:
                    wrongUntransferLabelNum += 1.0
                    transferFlagList.append(0.0)
                    transferFeatureList.append(
                        self.m_targetNameFeature[exId])
                auditorAcc = self.getAuditorMetric(transferFeatureList,
                                                   transferFlagList,
                                                   targetNameFeatureTest,
                                                   transferLabelTest,
                                                   targetLabelTest)
                print("auditorAcc", auditorAcc)
                auditorAccList.append(auditorAcc)

            labeledExList.append(exId)
            unlabeledExList.remove(exId)

            acc = self.get_pred_acc(targetNameFeatureTest, targetLabelTest,
                                    targetNameFeatureIter, targetLabelIter)
            totalAccList[cvIter].append(acc)
            if activeLabelFlag:
                humanAccList[cvIter].append(acc)
            queryIter += 1

        totalAuditorAccList.append(auditorAccList)
        # NOTE(review): transferLabelNum is recomputed from the cumulative
        # m_weakLabeledIDList, which spans all folds so far — confirm this is
        # intended rather than the per-fold count accumulated above.
        transferLabelNum = len(self.m_weakLabeledIDList)
        totalTransferNumList.append(transferLabelNum)
        cvIter += 1

    print("transfer num\t", np.mean(totalTransferNumList),
          np.sqrt(np.var(totalTransferNumList)))
    AuditorAccFile = modelVersion + "_auditor_acc.txt"
    writeFile(totalAuditorAccList, AuditorAccFile)
    totalACCFile = modelVersion + "_acc.txt"
    writeFile(totalAccList, totalACCFile)
    humanACCFile = modelVersion + "_human_acc.txt"
    writeFile(humanAccList, humanACCFile)
from sklearn.preprocessing import MinMaxScaler

# Scale every feature into [-1, 1]; keep the fitted min/max for reference.
scalar = MinMaxScaler(feature_range=(-1, 1))
scalar_fit = scalar.fit(X)
dmin = scalar.data_min_
dmax = scalar.data_max_
Xnorm = scalar.transform(X)

# sample weights: weight each class inversely to its frequency
# (class 0 rows get the positive rate, class 1 rows the negative rate).
yrat = np.sum(y == 1) / len(y)
xrat = 1 - yrat
s_weights = np.zeros(len(y))
s_weights[y == 0] = yrat
s_weights[y == 1] = xrat

# Logistic Regression (class_weight='balanced' already reweights the fit;
# s_weights are only used for the loss metric below).
clf = LR(penalty='l2', class_weight='balanced').fit(Xnorm, y)
preds = clf.predict_proba(Xnorm)[:, 1]
class_preds = np.round(preds)
# BUG FIX: log_loss's third positional parameter is NOT sample_weight
# (it was `eps` in older scikit-learn and is keyword-only in newer
# releases), so the weights were silently misapplied. Pass it by keyword.
ll = log_loss(y, preds, sample_weight=s_weights)
accuracy = np.sum(class_preds == y) / len(y)
prfs = precision_recall_fscore_support(class_preds, y, average='weighted')

# plot hist of preds
import matplotlib.pyplot as plt
plt.hist(preds, bins=20)
plt.title("Label prediction distribution (balanced dataset)")
plt.xlabel("Label prediction")
plt.ylabel("Count")
plt.show()

# feature ranking
def main():
    """Run the USPS-digits SVM homework: PCA, then linear/cubic/Gaussian
    SVMs (OvO and OvR), a logistic-regression baseline, and Gaussian-weighted
    kNN; saves comparison plots under hw7/ and prints per-model minima.

    Uses module-level C_VALS, OVO/OVR wrappers, err_plot, gaussian and the
    SIGMA global (set here from a nearest-neighbor distance heuristic).
    """
    # load data
    # training data (last column of zip.train is a trailing blank field)
    data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data',
                                    'usps', 'zip.train'),
                       header=None, delimiter=' ').iloc[:, :-1]
    y_train = data.pop(0).values
    X_train = data.values
    # test data
    data = pd.read_csv(os.path.join(os.path.dirname(__file__), 'data',
                                    'usps', 'zip.test'),
                       header=None, delimiter=' ')
    y_test = data.pop(0).values
    X_test = data.values

    # Keep enough principal components to explain 95% of the variance;
    # the test set is projected with the training-set fit.
    pca = PCA(n_components=.95)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    # Problem 1: linear SVM (built-in OvO decision function) over C grid.
    svm_errs = []
    with tqdm(desc="Problem 1", total=len(C_VALS)) as pbar:
        for C in C_VALS:
            svm = SVC(C=C, kernel='linear', decision_function_shape='ovo')
            svm.fit(X_train, y_train)
            pbar.update(1)
            svm_errs.append(1 - svm.score(X_test, y_test))

    # Logistic-regression baseline, one-vs-one.
    lr = OVO(LR(solver='lbfgs', max_iter=5000))
    lr.fit(X_train, y_train)
    lr_score = lr.score(X_test, y_test)
    err_plot([svm_errs], ["SVM"], lr=1. - lr_score,
             title="One vs. One Linear SVM", out='hw7/ovo_linear_svm.pdf')

    # Problem 2: cubic (degree-3 polynomial) SVM, one-vs-one.
    ovo_svm_errs = []
    with tqdm(desc="Problem 2", total=len(C_VALS)) as pbar:
        for C in C_VALS:
            svm = OVO(SVC(C=C, kernel='poly', degree=3, gamma='auto'))
            svm.fit(X_train, y_train)
            pbar.update(1)
            ovo_svm_errs.append(1 - svm.score(X_test, y_test))
    err_plot([ovo_svm_errs], ["OvO SVM"], lr=1. - lr_score,
             title="One vs. One Cubic SVM", out='hw7/ovo_cubic_svm.pdf')

    # Problem 3: cubic SVM, one-vs-rest, plotted against the OvO curve.
    ovr_svm_errs = []
    with tqdm(desc="Problem 3", total=len(C_VALS)) as pbar:
        for C in C_VALS:
            svm = OVR(SVC(C=C, kernel='poly', degree=3, gamma='auto'))
            svm.fit(X_train, y_train)
            pbar.update(1)
            ovr_svm_errs.append(1 - svm.score(X_test, y_test))
    err_plot([ovo_svm_errs, ovr_svm_errs], ["OvO SVM", "OvR SVM"],
             lr=1. - lr_score, title="One vs. Rest Cubic SVM/OvO Cubic",
             out='hw7/ovr_cubic_svm.pdf')

    n = 5
    # ensuring that we have at least n neighbors for all classes in the
    # sample (resample until every class appearing has >= n members)
    while True:
        index = np.random.choice(X_train.shape[0], 100, replace=False)
        X_sample = X_train[index]
        y_sample = y_train[index]
        # can use a list comprehension to check
        if all([
                len(X_sample[y_sample == y_i]) >= n
                for y_i in np.unique(y_sample)
        ]):
            break

    # Heuristic RBF bandwidth: mean distance to the n nearest same-class
    # neighbors over the sample.
    dists = []
    for X_i, y_i in zip(X_sample, y_sample):
        X_cls = X_sample[y_sample == y_i]
        nbrs = NearestNeighbors(n_neighbors=n)
        nbrs.fit(X_cls)
        # need to use reshape because kneighbors expects a 2-D single sample
        # (simplified: the original wrapped this in a try/except that only
        # re-raised the same ValueError, which is a no-op)
        distances, _ = nbrs.kneighbors(X_i.reshape(1, -1))
        # NOTE(review): distances has shape (1, n), so distances[-1] is the
        # whole row (including the zero self-distance) — confirm whether the
        # farthest-neighbor distance distances[0][-1] was intended.
        dists.append(distances[-1])
    global SIGMA
    SIGMA = np.mean(dists)

    # Problem 4: Gaussian (RBF) SVM, one-vs-one, gamma from SIGMA.
    ovo_gauss_svm_errs = []
    with tqdm(desc="Problem 4 (SVM)", total=len(C_VALS),
              file=sys.stdout) as pbar:
        for C in C_VALS:
            svm = OVO(SVC(C=C, kernel='rbf', gamma=1. / (2. * SIGMA**2)))
            svm.fit(X_train, y_train)
            score = svm.score(X_test, y_test)
            pbar.update(1)
            ovo_gauss_svm_errs.append(1 - score)

    # Problem 4 (kNN): Gaussian-weighted kNN over k = 3..10; store (k, err).
    knn_errs = []
    with tqdm(desc="Problem 4 (kNN)", total=len(np.arange(3, 11)),
              file=sys.stdout) as pbar:
        for k in np.arange(3, 11):
            knn = KNeighborsClassifier(n_neighbors=k, weights=gaussian)
            knn.fit(X_train, y_train)
            pbar.update(1)
            knn_errs.append((k, 1 - knn.score(X_test, y_test)))
    err_plot([ovo_gauss_svm_errs], ["OvO SVM"], knn=knn_errs,
             title="One vs. One Gaussian SVM with kNN",
             out='hw7/ovo_gaussian_svm_knn.pdf')

    # Problem 5: Gaussian SVM, one-vs-rest.
    ovr_gauss_svm_errs = []
    with tqdm(desc="Problem 5", total=len(C_VALS), file=sys.stdout) as pbar:
        for C in C_VALS:
            svm = OVR(SVC(C=C, kernel='rbf', gamma=1. / (2. * SIGMA**2)))
            svm.fit(X_train, y_train)
            score = svm.score(X_test, y_test)
            pbar.update(1)
            ovr_gauss_svm_errs.append(1 - score)
    err_plot([ovr_gauss_svm_errs], ["OvR SVM"], knn=knn_errs,
             title="One vs. Rest Gaussian SVM with kNN",
             out='hw7/ovr_gaussian_svm_knn.pdf')

    # Combined comparison plot of every SVM variant plus baselines.
    err_plot([
        svm_errs, ovo_svm_errs, ovr_svm_errs, ovo_gauss_svm_errs,
        ovr_gauss_svm_errs
    ], [
        "Linear SVM", "OvO Cubic SVM", "OvR Cubic SVM", "OvO Gaussian SVM",
        "OvR Gaussian SVM"
    ],
             lr=1. - lr_score,
             knn=knn_errs,
             title="Multiclass SVM Kernels",
             out='hw7/all_svm_knn.pdf')

    # Per-model minima over the C grid.
    min_idx = np.argmin(svm_errs)
    min_lin_err = svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min Linear SVM Error = {0:.4f}".format(min_lin_err))
    print("Min Linear SVM log2(C) = {0}".format(min_lin_c))
    print("LR Error = {0:.4f}".format(1. - lr_score))
    min_idx = np.argmin(ovo_svm_errs)
    min_lin_err = ovo_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvO Cubic SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvO Cubic SVM log2(C) = {0}".format(min_lin_c))
    min_idx = np.argmin(ovr_svm_errs)
    min_lin_err = ovr_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvR Cubic SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvR Cubic SVM log2(C) = {0}".format(min_lin_c))
    # BUG FIX: knn_errs holds (k, error) pairs, so np.argmin(knn_errs)
    # returned an index into the *flattened* (8, 2) view, which mis-indexes
    # (or overflows) the list. Minimize over the error column only, and
    # label the printed value as k rather than log2(C).
    min_idx = np.argmin([err for _, err in knn_errs])
    min_lin_k, min_lin_err = knn_errs[min_idx]
    print("Min kNN Error = {0:.4f}".format(min_lin_err))
    print("Min kNN k = {0}".format(min_lin_k))
    min_idx = np.argmin(ovo_gauss_svm_errs)
    min_lin_err = ovo_gauss_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvO Gaussian SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvO Gaussian SVM log2(C) = {0}".format(min_lin_c))
    min_idx = np.argmin(ovr_gauss_svm_errs)
    min_lin_err = ovr_gauss_svm_errs[min_idx]
    min_lin_c = np.log2(C_VALS[min_idx])
    print("Min OvR Gaussian SVM Error = {0:.4f}".format(min_lin_err))
    print("Min OvR Gaussian SVM log2(C) = {0}".format(min_lin_c))
    print("sigma = {0:.4f}".format(SIGMA))