def run_LR(lmbd, preprocess_method, train_validation_frame, test_frame, columns):
    """Select the best L2-penalised logistic regression over the lambda grid
    ``lmbd`` by cross-validated error, then refit it on the full
    train+validation data and print train/test error.

    Parameters
    ----------
    lmbd : iterable of float
        Candidate regularisation strengths; C is taken as ``1 / lambda``.
    preprocess_method :
        Forwarded to ``transform`` — presumably selects the preprocessing
        scheme; TODO confirm against ``transform``'s definition.
    train_validation_frame, test_frame :
        Data frames that must contain a 'y' label column.
    columns :
        Forwarded to ``split_train_set`` / ``transform``.

    NOTE(review): relies on ``split_train_set`` and ``transform`` defined
    elsewhere in this project.
    """
    result = {}  # lambda -> mean CV error rate
    minimum_cv_error = 1  # error rates are <= 1, so 1 is a safe initial bound
    for lamda in lmbd:
        cv_accuracy = []
        # 5 random re-splits to average out split noise
        for times in range(5):
            # generate train set and validation set
            train_frame, validation_frame = split_train_set(
                train_validation_frame, columns)
            train_set, validation_set, test_set = transform(
                preprocess_method, train_frame, validation_frame, test_frame,
                columns)
            # Logistic regression; C is the inverse regularisation strength
            L2_classify = LogisticRegression(C=1 / lamda,
                                             penalty='l2',
                                             solver='newton-cg')
            L2_classify.fit(train_set.drop(['y'], axis=1), train_set['y'])
            # cv for model selection (5-fold CV on the validation split)
            cv_result_list = cross_val_score(L2_classify,
                                             validation_set.drop(['y'],
                                                                 axis=1),
                                             validation_set['y'],
                                             cv=5)
            cv_accuracy.append(np.mean(cv_result_list))
        result[lamda] = 1 - np.mean(cv_accuracy)
        minimum_cv_error = min(minimum_cv_error, result[lamda])
    # print all lambda and cv error
    for key in result.keys():
        print('when lambda =', key, 'cv error rate =', result[key])
    # select the best model: the first lambda whose error equals the minimum
    for key in result.keys():
        if result[key] == minimum_cv_error:
            print('the best model is when lambda =', key)
            print('cv error rate =', minimum_cv_error)
            # refit on the whole train+validation data; the frame is passed
            # twice so the "validation" output (no_use_set) is discarded
            train_set, no_use_set, test_set = transform(
                preprocess_method, train_validation_frame,
                train_validation_frame, test_frame, columns)
            L2_classify = LogisticRegression(C=1 / key,
                                             penalty='l2',
                                             solver='newton-cg')
            L2_classify.fit(train_set.drop(['y'], axis=1), train_set['y'])
            # print test error
            print(
                'test error =', 1 -
                L2_classify.score(test_set.drop(['y'], axis=1), test_set['y']))
            # print train error for whole train set
            print(
                'train error =', 1 - L2_classify.score(
                    train_set.drop(['y'], axis=1), train_set['y']))
            break
    print('')
def logic_pca_standard(y, n):
    """Logistic regression + PCA dimensionality reduction + standardisation.

    Reduces the module-level ``train`` frame to ``n`` principal components,
    splits into train/test, standardises, fits a LogisticRegression and
    prints accuracy, the precision/recall report and the ROC AUC.

    NOTE(review): PCA is fitted on the full data *before* the split, so the
    test rows influence the projection — confirm this is intended.
    ``train`` is assumed to be the feature matrix aligned with ``y``.
    """
    pa = PCA(n_components=n)
    data = pa.fit_transform(train)
    # split the data (25% held out for testing)
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # standardise: fit on the training split, reuse the same scaler on test
    std = StandardScaler()
    print(std)
    x_train = std.fit_transform(x_train)
    x_test = std.transform(x_test)
    # estimator
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    # evaluate on the hold-out split
    pre_score = logic.score(x_test, y_test)
    print("准确率(逻辑回归+降维+标准化):{}".format(pre_score))
    print(
        "精确率和召回率:",
        classification_report(y_test,
                              logic.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    # class-membership probabilities (column 1 = positive class)
    predictions = logic.predict_proba(x_test)
    # Compute Receiver operating characteristic (ROC)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
class LogReg:
    """Logistic-regression pipeline: loads the CSV data, fits a
    class-balanced classifier and scores the test rows, all at
    construction time."""

    def __init__(self):
        self.load_data()
        self.clf = LogisticRegression(class_weight='balanced')
        self.train()
        self.predict()

    def load_data(self):
        """Read the train/test CSVs; column 0 holds the label (train) or
        the row id (test), the remaining columns the features."""
        frame_train = pd.read_csv('./data/train.csv', header=0)
        frame_test = pd.read_csv('./data/test.csv', header=0)
        matrix_train = frame_train.values
        matrix_test = frame_test.values
        self.train_X = matrix_train[:, 1:]
        self.train_Y = matrix_train[:, 0]
        self.test_X = matrix_test[:, 1:]
        self.test_ID = matrix_test[:, 0]

    def train(self):
        """Fit the classifier on the training matrix."""
        self.clf.fit(self.train_X, self.train_Y)

    def predict(self):
        """Store class probabilities for the test matrix."""
        self.test_Y = self.clf.predict_proba(self.test_X)

    def get_training_accuracy(self):
        """Return the mean accuracy on the training data."""
        return self.clf.score(self.train_X, self.train_Y)

    def store_result(self):
        """Write (Id, positive-class probability) rows to the results CSV."""
        submission = pd.DataFrame()
        submission['Id'] = self.test_ID
        submission['Action'] = self.test_Y[:, 1]
        submission.to_csv('./data/results/c1_result.csv', index=False)
def regressiontest(list0Posts, listClasses): digits = datasets.load_digits() digits.data = list0Posts digits.target = listClasses length = len(digits.data) # 多元线性回归 # model = LinearRegression() model = LogisticRegression() model.fit(digits.data, digits.target) joblib.dump(model, 'models/regressiontest/regressiontest.pkl') model_load = joblib.load('models/regressiontest/regressiontest.pkl') testInput = digits.data testResult = model_load.predict(testInput) realResult = digits.target testLen = len(testResult) errorCount = 0 totalError = 0 for i in range(len(testResult)): if fabs(testResult[i] - realResult[i]) > 10: errorCount += 1 # print i+2 totalError += float(errorCount) / testLen totalRight = 1 - totalError print "number of training:", length, "errorCount :", errorCount, "total error rate", totalError, "total right rate", totalRight print "R-squared:", model.score(testInput, testResult)
def score(id):
    """Evaluate a fixed logistic-regression model on the CSV file named ``id``.

    The file holds one sample per row: feature columns first, the 0/1 label
    in the last column.  Runs 10 random 95/5 splits and returns
    ``[auc, acc, f1]`` where each list contains the 10 per-split scores
    followed by their mean (all rounded to 3 decimals).

    NOTE(review): ``id`` shadows the builtin and is actually a file path;
    ``cross_validation`` is the pre-0.18 sklearn module — confirm the runtime
    environment still provides it.
    """
    data = []  # feature rows
    mark = []  # labels
    with open(id, 'r', encoding='utf-8_sig') as f:
        csv_reader = csv.reader(f)
        for x in csv_reader:
            data.append(list(map(float, x[0:-1])))
            mark.append(float(x[-1]))
    acc = []
    auc = []
    f1 = []
    for i in range(10):
        # a fresh 5% hold-out per iteration, seeded by the loop index
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            data, mark, test_size=0.05, random_state=i)
        clf = LogisticRegression(C=4.8, random_state=1113)
        clf.fit(X_train, y_train)
        # print('准确率:',clf.score(X_test, y_test))
        acc.append(round(clf.score(X_test, y_test), 3))
        y_pred = clf.predict(X_test)
        # print ('ACC: %.4f' % metrics.accuracy_score(y_test,y_pred))
        auc.append(round(metrics.roc_auc_score(y_test, y_pred), 3))
        # print ('F1-score: %.4f' %metrics.f1_score(y_test,y_predict))
        f1.append(round(metrics.f1_score(y_test, y_pred), 3))
    # append the mean of the 10 runs as the final element of each list
    acc.append(round(sum(acc) / len(acc), 3))
    auc.append(round(sum(auc) / len(auc), 3))
    f1.append(round(sum(f1) / len(f1), 3))
    return [auc, acc, f1]
def train_model(X, Y, name, plot=False):
    """
    train_model(vector, vector, name[, plot=False])

    Trains a LogisticRegression on random 70/30 shuffle-splits of (X, Y),
    saves the last fitted model to disk and returns
    ``(mean train error, mean test error, array of confusion matrices)``.

    Parameters
    ----------
    X, Y : indexable arrays of samples and labels.
    name, plot : unused here; kept for interface compatibility with callers.
    """
    # NOTE(review): pre-0.18 sklearn ShuffleSplit API (``n=``, iterated
    # directly); left unchanged to match the original runtime environment.
    cv = ShuffleSplit(n=len(X), test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []
    cms = []
    clf = None
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)

        train_errors.append(1 - clf.score(X_train, y_train))
        test_errors.append(1 - clf.score(X_test, y_test))

        y_pred = clf.predict(X_test)
        cms.append(confusion_matrix(y_test, y_pred))

    # save the trained model to disk (last fold's classifier).
    # NOTE(review): despite the .rar extension this is a plain joblib dump.
    joblib.dump(clf, 'C:\\Users\\hp\\Desktop\\project\\logregdata.rar')

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def model(train_data, new_label):
    """Fit a logistic regression on a random half of the data and return
    its accuracy on the other half (fixed seed for reproducibility)."""
    X_fit, X_eval, y_fit, y_eval = train_test_split(train_data,
                                                    new_label,
                                                    test_size=0.5,
                                                    random_state=0)
    classifier = LogisticRegression()
    classifier.fit(X_fit, y_fit)
    return classifier.score(X_eval, y_eval)
# Cross-validated forward-stepwise feature selection: per fold, pick the top
# features, fit the shared classifier on them, and pool predictions for
# overall accuracy / F1 / confusion-matrix reporting.
# NOTE(review): ``folder``, ``n_fold``, ``clf``, ``X``, ``y``, ``immu_cols``,
# ``sel_frequencies``, ``coefs``, ``accs``, ``all_sel_cols`` and
# ``fwd_stepwise_selection`` are defined by earlier code not shown here.
all_real_y = []
all_pred_y = []
for train_inds, test_inds in folder.split(X):
    print(n_fold)
    n_fold += 1
    sel_w_pvals = fwd_stepwise_selection(
        pd.DataFrame(X[train_inds, :], columns=immu_cols),
        y[train_inds],
        verbose=True,
        top_n=10)  # ~1/3 of available samples
    print('Forward-stepwise selection: ' + ' -> '.join(sel_w_pvals))
    # boolean mask over columns for the features chosen this fold
    sel_vars_inds = np.isin(immu_cols, sel_w_pvals)
    clf.fit(X[train_inds, :][:, sel_vars_inds], y[train_inds])
    sel_frequencies.append(sel_vars_inds)
    coefs.append(clf.coef_)
    accs.append(clf.score(X[test_inds, :][:, sel_vars_inds], y[test_inds]))
    # pool out-of-fold predictions for the global metrics below
    all_pred_y += list(clf.predict(X[test_inds, :][:, sel_vars_inds]))
    all_real_y += list(y[test_inds])
    all_sel_cols += sel_w_pvals
print(np.mean(accs))
# print(accs)
from sklearn.metrics import f1_score
print(f1_score(y_true=all_real_y, y_pred=all_pred_y))
#Confusion matrix, Accuracy, sensitivity and specificity
from sklearn.metrics import confusion_matrix
cm1 = confusion_matrix(y_true=all_real_y, y_pred=all_pred_y)
print('Confusion Matrix : \n', cm1)
def train_model(clf_factory, X, Y, name, plot=False):
    """ Trains and saves model to disk.

    Fits a LogisticRegression on shuffle-split folds of (X, Y), collects
    per-label precision/recall and ROC curves, optionally plots the
    median-score fold per label, dumps the last classifier with joblib and
    returns (mean train error, mean test error, confusion matrices).

    NOTE(review): Python 2-era code against the pre-0.18 sklearn
    ShuffleSplit API (``n=``/``n_iterations=``/``test_fraction=``, iterated
    directly) — raises TypeError on modern scikit-learn.  ``clf_factory`` is
    accepted but unused; ``genre_list``, ``plot_pr``/``plot_roc`` come from
    elsewhere in the project.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(
        n=len(X), n_iterations=1, test_fraction=0.3, indices=True,
        random_state=0)
    #print "cv = ",cv
    train_errors = []
    test_errors = []
    scores = []
    # per-label metric accumulators, keyed by label value
    pr_scores, precisions, recalls, thresholds = defaultdict(
        list), defaultdict(list), defaultdict(list), defaultdict(list)
    roc_scores, tprs, fprs = defaultdict(list), defaultdict(
        list), defaultdict(list)
    clfs = []  # just to later get the median
    cms = []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        # published module-wide via ``global`` — presumably so callers can
        # reuse the last fitted model; confirm against the rest of the file
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        # one-vs-rest curves per label; assumes labels are 0..k-1 ints so
        # they can index predict_proba's columns — TODO confirm
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]
            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
    if plot:
        for label in labels:
            #print("Plotting %s"%genre_list[label])
            scores_to_sort = roc_scores[label]
            # NOTE(review): `/ 2` is a float index under Python 3 (would
            # need `// 2`) — another Python 2 remnant.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            #plot_pr(pr_scores[label][median], desc, precisions[label][median],recalls[label][median], label='%s vs rest' % genre_list[label])
            #plot_roc(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])
    # NOTE(review): dict.values() is a view under Python 3; np.asarray on it
    # does not give a numeric array — Python 2 remnant.
    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    #save the trained model to disk
    joblib.dump(clf, 'saved_model_fft/my_model.pkl')
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def main():
    """End-to-end income-classification pipeline: clean and encode the
    module-level ``train``/``test`` frames, fit logistic-regression,
    decision-tree and random-forest models, pick the one with the highest
    AUC and write its predictions to CSV.

    NOTE(review): depends on module-level ``train``, ``test``,
    ``test_origin`` and the helper ``dumb_columns`` defined elsewhere in
    this file.
    """
    global train
    global test
    # store the training labels in y and drop them from the feature frame
    y = train['Y']
    del train['Y']
    # rename the Chinese column headers to English identifiers
    origin = [
        "年龄", "工作天数", "职业类型", "投资收入", "投资损失", "省份", "教育", "家庭角色", "婚姻状况",
        "教育时间", "民族", "工作情况", "性别"
    ]
    target = [
        "age", "work_days", "job", "invest_income", "invest_loss", "province",
        "education", "home_role", "marital_status", "education_time",
        "nation", "work_type", "gender"
    ]
    rename_dict = dict()
    for i in range(len(origin)):
        rename_dict[origin[i]] = target[i]
    train.rename(columns=rename_dict, inplace=True)
    test.rename(columns=rename_dict, inplace=True)
    # check both frames for missing data
    print("===================统计缺失数据-训练集====================")
    print(train.isnull().sum(axis=0))
    print(train.isnull().any())
    print("===================统计缺失数据-测试集====================")
    print(test.isnull().sum(axis=0))
    print(test.isnull().any())
    full_data = [train, test]
    # encode gender as 0/1
    for dataset in full_data:
        dataset['gender'] = dataset['gender'].map({'女': 0, '男': 1}).astype(int)
    # net investment income, bucketed into five classes
    for dataset in full_data:
        dataset['invest'] = dataset['invest_income'] - dataset['invest_loss']
    for dataset in full_data:
        # NOTE(review): these in-place remappings cascade — a row set to 0
        # is re-matched by the ``== 0`` mask and becomes 1, which then
        # satisfies ``> 0 & <= 5000`` and becomes 2, so buckets 0 and 1
        # never survive.  Confirm whether this is intended.
        dataset.loc[dataset['invest'] < 0, 'invest'] = 0
        dataset.loc[dataset['invest'] == 0, 'invest'] = 1
        dataset.loc[(dataset['invest'] > 0) & (dataset['invest'] <= 5000),
                    'invest'] = 2
        dataset.loc[(dataset['invest'] > 5000) & (dataset['invest'] <= 10000),
                    'invest'] = 3
        dataset.loc[dataset['invest'] > 10000, 'invest'] = 4
    # map "省份N" province labels to numbers (N / 2)
    for dataset in full_data:
        province_list = []
        for province_name in dataset['province']:
            province_list.append(int(province_name.replace("省份", "")) / 2)
        dataset['province'] = np.array(province_list)
    # dummy-encode the categorical columns
    dumb_columns('job')  # job type
    dumb_columns('education')  # education
    dumb_columns('nation')  # ethnicity
    dumb_columns('home_role')  # household role
    dumb_columns('marital_status')  # marital status
    dumb_columns('work_type')  # employment status
    # bucket age into five classes
    for dataset in full_data:
        # Mapping Age
        dataset.loc[dataset['age'] <= 22, 'age'] = 0
        dataset.loc[(dataset['age'] > 22) & (dataset['age'] <= 32), 'age'] = 1
        dataset.loc[(dataset['age'] > 32) & (dataset['age'] <= 48), 'age'] = 2
        dataset.loc[(dataset['age'] > 48) & (dataset['age'] <= 64), 'age'] = 3
        dataset.loc[dataset['age'] > 64, 'age'] = 4
    # scale work_days down to keep feature magnitudes comparable
    for dataset in full_data:
        dataset['work_days'] = dataset['work_days'] / 10
    # drop columns that are no longer needed
    drop_elements = ['invest_income', 'invest_loss', 'education']
    train = train.drop(drop_elements, axis=1)
    test = test.drop(drop_elements, axis=1)
    # show the engineered features
    print(train.head(3))
    print("===")
    print(test.head(3))
    # fitted models with their AUC, for the final comparison
    model_list = []
    # =====================Logistic regression===================
    # split the data (25% held out)
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # estimator
    logic = LogisticRegression()
    logic.fit(x_train, y_train)
    # evaluate
    print(
        "精确率和召回率(逻辑回归):",
        classification_report(y_test,
                              logic.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    pre_score = logic.score(x_test, y_test)
    print("准确率(逻辑回归):{}".format(pre_score))
    # class probabilities (column 1 = positive class)
    predictions = logic.predict_proba(x_test)
    # compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
    model_list.append({"model": logic, "auc": auc_value})
    # plot the ROC curve
    plt.title('LogisticRegression AUC')
    plt.plot(fpr, tpr, 'r', label='AUC_LOGIC = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    # plt.savefig("./LogisticRegression_auc.png")
    # ==============Decision tree===========
    # split the data again (same seed, so the same split)
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # vectorise rows as dicts for feature extraction
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train.to_dict(orient="records"))
    features = dc.get_feature_names()
    x_test = dc.transform(x_test.to_dict(orient="records"))
    # estimator
    dec = DecisionTreeClassifier(max_depth=4)
    dec.fit(x_train, y_train)
    # export the tree (render with: dot -Tpng -o tree.png tree.dot)
    export_graphviz(dec, out_file="./tree.dot", feature_names=features)
    # evaluate
    print(
        "精确率和召回率(决策树):",
        classification_report(y_test,
                              dec.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    pre_score = dec.score(x_test, y_test)
    print("准确率(决策树):{}".format(pre_score))
    # class probabilities
    predictions = dec.predict_proba(x_test)
    # compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
    model_list.append({"model": dec, "auc": auc_value})
    # plot the ROC curve
    plt.title('DecisionTreeClassifier AUC')
    plt.plot(fpr, tpr, 'b', label='AUC_DTC = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    # plt.savefig("./DecisionTreeClassifier_auc.png")
    # =============Random forest==============
    # split the data again
    x_train, x_test, y_train, y_test = train_test_split(train,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=24)
    # vectorise rows as dicts for feature extraction
    dc = DictVectorizer(sparse=False)
    x_train = dc.fit_transform(x_train.to_dict(orient="records"))
    # print(dc.get_feature_names())
    x_test = dc.transform(x_test.to_dict(orient="records"))
    # estimator
    rf = RandomForestClassifier(n_estimators=5)
    rf.fit(x_train, y_train)
    # evaluate
    print(
        "精确率和召回率(随机森林):",
        classification_report(y_test,
                              rf.predict(x_test),
                              labels=[0, 1],
                              target_names=["非高收入", "高收入"]))
    pre_score = rf.score(x_test, y_test)
    print("准确率(随机森林):{}".format(pre_score))
    # class probabilities
    predictions = rf.predict_proba(x_test)
    # compute AUC
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predictions[:, 1])
    auc_value = metrics.auc(fpr, tpr)
    print("auc值为:{}".format(auc_value))
    model_list.append({"model": rf, "auc": auc_value})
    # plot the ROC curve
    plt.title('RandomForestClassifier AUC')
    plt.plot(fpr, tpr, 'y', label='AUC_RF = %0.3f' % auc_value)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.ylabel('tpr')
    plt.xlabel('fpr')
    plt.savefig("./count_auc.png")
    # model comparison: keep the model with the largest AUC
    sorted_key_list = sorted(model_list, key=lambda x: x['auc'], reverse=True)
    model = sorted_key_list[0]['model']
    auc_v = sorted_key_list[0]['auc']
    print("选择模型 {}".format(model))
    print("AUC值为 {}".format(auc_v))
    # NOTE(review): column 0 of predict_proba is the probability of class 0
    # — confirm the submission format expects that, not the positive class.
    pre_data = model.predict_proba(test)
    # save the predictions only
    test['Y'] = pre_data[:, 0]
    test['Y'].to_csv('Results_1.csv',
                     encoding='utf-8',
                     index=False,
                     header=False)
    # save the full frame
    test_origin['Y'] = pre_data[:, 0]
    test_origin.to_csv("./my_results.csv", encoding='utf-8', index=False)
def test_fit_credit_backupsklearn():
    """Smoke test: h2o4gpu's LogisticRegression wrapper vs scikit-learn on
    the credit-card CSV — exercises the whole estimator API surface and
    asserts that coefficients, intercept, iteration count and predictions
    agree between the two implementations.

    NOTE(review): the ``sklearn.linear_model.logistic`` import below is a
    private path removed in modern scikit-learn; this targets an older
    version.
    """
    # last CSV column is the label, the rest are features (C-ordered float32)
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression
    # plain h2o4gpu solver
    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X,y))
    # scikit-compatible wrapper, with non-default params to exercise
    # parameter pass-through
    enet = Solver(dual=True, max_iter=100, tol=1E-4, intercept_scaling=0.99,
                  random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X,y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())
    # reference run with the same hyper-parameters on stock scikit-learn
    from sklearn.linear_model.logistic import LogisticRegression
    enet_sk = LogisticRegression(dual=True, max_iter=100, tol=1E-4,
                                 intercept_scaling=0.99, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X,y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())
    # round-trip the sklearn coefficients through a sparse matrix so both
    # sides are compared as float32 dense arrays
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
def train_model(clf_factory, X, Y, name, plot=False):
    """
    Trains and saves model to disk.

    Fits a LogisticRegression on shuffle-split folds of (X, Y), optionally
    plots per-label PR/ROC curves, dumps the last classifier with joblib and
    returns ``(mean train error, mean test error, confusion matrices)``.

    ``clf_factory`` is accepted but unused; ``name`` only appears in plot
    titles.  ``genre_list``, ``plot_pr`` and ``plot_roc`` come from
    elsewhere in the project.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)
    #print "cv = ",cv
    train_errors = []
    test_errors = []
    scores = []
    # BUG FIX: these were built as list(defaultdict(list)) — i.e. plain
    # empty lists — so pr_scores.values() below raised AttributeError and
    # indexing them by label in the plot branch could never work.  They must
    # be defaultdicts of lists keyed by label.
    pr_scores, precisions, recalls, thresholds = defaultdict(
        list), defaultdict(list), defaultdict(list), defaultdict(list)
    roc_scores, tprs, fprs = defaultdict(list), defaultdict(
        list), defaultdict(list)
    clfs = []  # just to later get the median
    cms = []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        global clf
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        # per-label PR/ROC bookkeeping intentionally disabled, as in the
        # original:
        """
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]
            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
        """
    if plot:
        for label in labels:
            print("Plotting %s" % genre_list[label])
            scores_to_sort = roc_scores[label]
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_pr(pr_scores[label][median],
                    desc,
                    precisions[label][median],
                    recalls[label][median],
                    label='%s vs rest' % genre_list[label])
            plot_roc(roc_scores[label][median],
                     desc,
                     tprs[label][median],
                     fprs[label][median],
                     label='%s vs rest' % genre_list[label])
    # BUG FIX: materialise the dict view before handing it to numpy
    # (Python 3 dict views are not array-like).
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores))
    print(summary)
    #save the trained model to disk
    joblib.dump(
        clf,
        r'C:\Users\Rag9704\Documents\GitHub\Music_Genre_Classification\my_model.pkl'
    )
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
#df_german=pd.read_excel("df_after_vif.xlsx") y = df_german["target"] #x=df_german.ix[:,"Account Balance":"Foreign Worker"] x = df_german.loc[:, "Account Balance":"Foreign Worker"] X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0) #solver='liblinear' classifier = LogisticRegression(solver='liblinear') classifier.fit(X_train, y_train) predictions = classifier.predict(X_test) print("accuracy on the training subset:{:.3f}".format( classifier.score(X_train, y_train))) print("accuracy on the test subset:{:.3f}".format( classifier.score(X_test, y_test))) ''' P0 = 50 PDO = 10 theta0 = 1.0/20 B = PDO/np.log(2) A = P0 + B*np.log(theta0) ''' def Score(probability): score = A - B * np.log(probability / (1 - probability)) return score
# Scatter the first two feature columns, then run PCA experiments on the
# iris data and visualise the logistic-regression projection.
# NOTE(review): Python 2 print statements; ``x`` and ``v`` must be defined
# by earlier code not shown here — ``v`` is presumably a projection matrix,
# confirm against the preceding chunk.
pl.plot(x[:, 0], x[:, 1], '.')
pl.show()
pca = PCA(n_components=1)
pca.fit(x)
print pca.explained_variance_
print np.cov(np.dot(x, v).T)
# reload iris and project it through the fitted LR coefficients
x = load_iris()['data']
y = load_iris()['target']
lr = LR()
lr.fit(x, y)
lr.score(x, y)  #accuracy
tx = np.dot(x, lr.coef_.T)
pl.plot(tx)
pl.show()
# 2-component PCA view of the three iris classes (50 samples each)
pca2 = PCA(n_components=2)
pca2.fit(x)
px = pca2.transform(x)
pl.plot(px[0:50, 0], px[0:50, 1], 'r.')
pl.plot(px[50:100, 0], px[50:100, 1], 'gx')
pl.plot(px[100:150, 0], px[100:150, 1], 'ko')
pl.show()
# Feature selection with RandomizedLogisticRegression on the bankloan data,
# then a LogisticRegression fit on the selected features only.
# NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0 and
# RandomizedLogisticRegression was removed from sklearn 0.21 — this targets
# older library versions; confirm the runtime environment.
print(root_path)
file_path = os.path.abspath(
    os.path.join(root_path, "python", "datasets", "datasets", "bankloan.xls"))
print(file_path)
data = pd.read_excel(file_path)
# first 8 columns are features, column 8 is the label
x = data.iloc[:, :8].as_matrix()
y = data.iloc[:, 8].as_matrix()
features_columns = data.columns[:len(data.columns) - 1]
rlr = RandomizedLogisticRegression()  # random logistics regression model
rlr.fit(x, y)  # training
rlr.get_support()  # get feature selection results
print(features_columns)
print(rlr.get_support())  # get feature selection result.
print(rlr.scores_)  # get each feature score
print(u'RandomizedLogisticRegression feature selection finished.')
print(u'The effective features are:\n\t %s' %
      ', '.join(features_columns[rlr.get_support()]))
x = data[features_columns[rlr.get_support()]].as_matrix()  # selected features
lr = LogisticRegression()  # create logistic regression model
lr.fit(x, y)  # using effective features training model
# resubstitution accuracy (evaluated on the training data itself)
print(u'Accuracy:%s' % lr.score(x, y))
from sklearn import datasets
# FIX: import from the public path — sklearn.linear_model.logistic is a
# private module that was removed in modern scikit-learn.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit a logistic regression on a random 67/33 iris split and report the
# hold-out accuracy.
iris = datasets.load_iris()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    test_size=0.33)
lgr = LogisticRegression()
lgr.fit(X_train, Y_train)
score = lgr.score(X_test, Y_test)
# BUG FIX: score() returns a single scalar — calling .mean()/.std() on it
# was meaningless (the std of one value is always 0) and breaks when the
# result is a plain Python float.  Print the accuracy once instead.
print(score)
# Evaluate a previously-fitted classifier ``clf`` on (XX, yy), then fit a
# fresh high-iteration LogisticRegression on the same data for comparison,
# and finally read the linear-regression data file.
# NOTE(review): ``clf``, ``XX`` and ``yy`` come from earlier code not shown
# in this chunk.
acc = clf.score(XX, yy) * 100
y_pre = clf.predict(XX)
d = np.equal(y_pre, yy)
dd = np.sum(np.equal(y_pre, yy) == True)
print('matchs:{0}/{1}'.format(np.sum(np.equal(y_pre, yy) == True),
                              yy.shape[0]))
print(dd / yy.shape[0])
t = np.array([1, 2, 3])
u = np.array([4, 5, 6])
print(t * u.T)  # elementwise product; .T is a no-op on a 1-D array
claa = LogisticRegression(max_iter=7000)
claa.fit(XX, yy)
acc2 = claa.score(XX, yy)
y_pre = claa.predict(XX)
d = np.equal(y_pre, yy)
dd = np.sum(np.equal(y_pre, yy) == True)
print('matchs:{0}/{1}'.format(np.sum(np.equal(y_pre, yy) == True),
                              yy.shape[0]))
print(dd / yy.shape[0])
data_reg = []
# obtain data for linear regression: comma-separated floats, one row per
# line; the loop stops at the first empty line
with open('linear-regression.txt', 'r') as file:
    line = file.readline().strip('\n').split(',')
    while line != ['']:
        line = list(map(float, line))
        data_reg.append(line)
        line = file.readline().strip('\n').split(',')
combine = [train_df, test_df]
"""
It's part of Model construction
"""
# Build the design matrices from the Titanic frames prepared above.
# NOTE(review): train_df / test_df come from earlier code not shown here.
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
# Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred1 = logreg.predict(X_test)
# training accuracy, as a percentage rounded to 2 decimals
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
print('acc_log:', acc_log)
# pair each feature name with its learned LR coefficient
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(
    logreg.coef_[0]
)  # Each logistic reg has a coef result and use this way could get values .
# print(coeff_df.sort_values(by='Correlation', ascending=False))
# Support Vector Machine, for comparison
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred2 = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
print('acc_svc:', acc_svc)
# Learning curves: test-set accuracy of Naive Bayes vs Logistic Regression
# as the number of training instances grows.
# NOTE(review): X and y are prepared by earlier code not shown here.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=0.2,
                                                    random_state=31)
train_sizes = range(10, len(X_train), 25)
lr = LogisticRegression()
nb = GaussianNB()
lr_scores = []
nb_scores = []
# Epoches: refit both models on increasingly large stratified slices
for train_size in train_sizes:
    X_slice, _, y_slice, _ = train_test_split(X_train,
                                              y_train,
                                              train_size=train_size,
                                              stratify=y_train,
                                              random_state=31)
    nb.fit(X_slice, y_slice)
    nb_scores.append((nb.score(X_test, y_test)))
    lr.fit(X_slice, y_slice)
    lr_scores.append((lr.score(X_test, y_test)))
# Figure
plt.figure()
plt.title("Naive Bayes and Logistic Regression Accuracies")
plt.xlabel("Number of training instances")
plt.ylabel("Test set accuracy")
plt.grid(True)
plt.plot(train_sizes, nb_scores, label='Naive Bayes')
plt.plot(train_sizes, lr_scores, label='Logistic Regression', linestyle='--')
plt.legend()
plt.savefig('Naive Bayes and Logistic Regression Accuracies.png')
# plt.show()
# print all lambda and cv error for key in result.keys(): print('when lambda =', key, 'cv error rate =', result[key]) # select the best model for key in result.keys(): if result[key] == minimum_cv_error: print('the best model is when lambda =', key) print('cv error rate =', minimum_cv_error) train, validation = train_test_split(train_data_frame, test_size=0.33, random_state=42) classify = LogisticRegression(C=1 / key, penalty='l1', solver='liblinear') classify.fit(train.drop(['default payment next month'], axis=1), train['default payment next month']) # print test error print('test error =', 1 - classify.score(test_data_frame.drop(['default payment next month'], axis=1), test_data_frame['default payment next month'])) # print train error for whole train set print('train error =', 1 - classify.score(train.drop(['default payment next month'], axis=1), train['default payment next month'])) break print('') # logis-l2 print('logis-l2') result = {} minimum_cv_error = 1 lmbd = [10 ** x for x in np.arange(-2, 2, 0.1)] for lamda in lmbd: cv_accuracy = []
def train_model(X, Y, name, plot=False):
    """ Training the model and saving it to disk.

    Fits a LogisticRegression on shuffle-split folds of (X, Y), collects
    per-label one-vs-rest ROC curves, optionally plots the median-ROC fold
    per label, dumps the last classifier and returns
    (mean train error, mean test error, confusion matrices).

    NOTE(review): written against the pre-0.18 sklearn ShuffleSplit API
    (``n=``/``n_iterations=``/``test_fraction=``, iterated directly) —
    raises TypeError on modern scikit-learn.  ``genre_list`` and
    ``plot_roc_curves`` come from elsewhere in the project.
    """
    labels = np.unique(Y)
    cv = ShuffleSplit(n=len(X),
                      n_iterations=1,
                      test_fraction=0.3,
                      indices=True,
                      random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    pr_scores = defaultdict(list)  # never filled here — summary means NaN
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []
    cms = []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        # one-vs-rest ROC per label; assumes integer labels usable as
        # predict_proba column indices — TODO confirm
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # NOTE(review): `/ 2` yields a float index under Python 3
            # (would need `// 2`) — Python 2 remnant.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])
    # NOTE(review): dict.values() is a view under Python 3; np.asarray on it
    # does not build a numeric array — another Python 2 remnant.
    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    #saving the trained model to disk
    joblib.dump(clf, 'saved_model/model_ceps.pkl')
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# Hold-out evaluation of a logistic regression on the digit features
# prepared above, followed by a confusion-matrix plot.
# NOTE(review): ``digits``, ``dig_label``, ``N_train`` and
# ``x_train``/``y_train`` come from earlier code not shown in this chunk.
x_test = digits[N_train:, :]
y_test = dig_label[N_train:]
# do logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
pred_train = lr.predict(x_train)
pred_test = lr.predict(x_test)
# calculate train/test accuracy
acc_train = accuracy_score(y_train, pred_train)
acc_test = accuracy_score(y_test, pred_test)
print("accuracy train = %f, accuracy_test = %f" % (acc_train, acc_test))
# score() is mean accuracy too, so these should match the values above
score_train = lr.score(x_train, y_train)
score_test = lr.score(x_test, y_test)
print("score_train = %f, score_test = %f" % (score_train, score_test))
# +
from sklearn.metrics import confusion_matrix
# plot confusion matrix
cm = confusion_matrix(y_test, pred_test)
plt.matshow(cm)
plt.title(u'Confusion Matrix')
plt.colorbar()
plt.ylabel(u'Groundtruth')
plt.xlabel(u'Predict')
plt.show()
# Build the bag-of-words design matrix (last column = sentiment label),
# shuffle, fit a logistic regression on all but the last 100 rows, and
# print the strongly-weighted vocabulary words.
# NOTE(review): ``positive_tokenized``, ``negative_tokenized``,
# ``word_index_map``, ``N`` and ``tokens_to_vector`` are defined by earlier
# code not shown in this chunk.
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)  # label 1 = positive
    data[i, :] = xy
    i += 1
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)  # label 0 = negative
    data[i, :] = xy
    i += 1
np.random.shuffle(data)
X = data[:, :-1]
Y = data[:, -1]
# last 100 rows held out for testing
Xtrain = X[:-100, ]
Ytrain = Y[:-100, ]
Xtest = X[-100:, ]
Ytest = Y[-100:, ]
model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Classification rate:", model.score(Xtest, Ytest))
# report words whose learned weight is far from zero in either direction
threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, " : ", weight)
def plotshow(solver, trainingSet, testSet):
    """Fit a one-vs-rest logistic regression with the given solver, draw its
    decision surface next to a sigmoid reference plot, save the figure, and
    return the accuracy on ``testSet``.

    Rows of both sets are sequences whose leading entries are feature
    strings and whose final entry encodes the label; cells are parsed with
    ``eval``.
    """
    data = [list(map(eval, row[:-1])) for row in trainingSet]
    label = [list(map(eval, row[-1])) for row in trainingSet]
    data_test = [list(map(eval, row[:-1])) for row in testSet]
    label_test = [list(map(eval, row[-1])) for row in testSet]

    # the fitted model is published module-wide, as in the original
    global clf
    clf = LogisticRegression(C=1000.0, solver=solver, multi_class="ovr")
    clf.fit(data, label)
    score = clf.score(data_test, label_test)

    data = np.array(data)
    label = np.array(label)

    # mesh over the feature plane, padded by 0.5 on every side
    x_min, x_max = data[:, 0].min() - .5, data[:, 0].max() + .5
    y_min, y_max = data[:, 1].min() - .5, data[:, 1].max() + .5
    step = .02  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    # reshape predictions back onto the grid for the colour plot
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.title('Logistic')
    plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
    # overlay the training points on the decision surface
    plt.scatter(data[:, 0],
                data[:, 1],
                c=np.squeeze(label),
                edgecolors='k',
                cmap=plt.cm.Paired)
    plt.xlabel('petal length')
    plt.ylabel('petal width')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    # sigmoid reference curve on the right-hand subplot
    plt.subplot(1, 2, 2)
    sig_in = np.arange(-5, 5, 0.05)
    sig_out = sigmoid(sig_in)
    plt.title('Sigmoid')
    plt.plot(sig_in, sig_out)
    plt.axvline(0.0, color='k')
    plt.ylim(-0.1, 1.1)
    plt.yticks([0.0, 0.5, 1.0])
    ax = plt.gca()
    ax.yaxis.grid(True)
    plt.tight_layout()
    plt.savefig(
        "E:/Anaconda/Scripts/CorsApi/snippets/static/picture/logistic.jpg")
    return score
def train_model(X, Y, name, plot=False):
    """
    train_model(vector, vector, name[, plot=False])

    Trains a LogisticRegression classifier on a single shuffled train/test
    split, optionally plots per-label one-vs-rest ROC curves, and saves the
    fitted model to disk.

    Returns (mean train error, mean test error, confusion matrices array).
    NOTE: Python 2 code (`print` statement, integer `/` below).
    """
    labels = np.unique(Y)
    print labels
    # One shuffled split (n_iter=1), 30% held out, fixed seed.
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=0.3, random_state=0)
    train_errors = []
    test_errors = []
    scores = []
    # NOTE(review): pr_scores is never populated, so the precision/recall
    # part of `summary` is computed over an empty array.
    pr_scores = defaultdict(list)
    # precisions/recalls/thresholds are declared but unused below.
    precisions, recalls, thresholds = defaultdict(list), defaultdict(list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []  # for the median
    cms = []
    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]
        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)
        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)
        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)
        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)
        # One-vs-rest ROC per label; assumes labels are usable as column
        # indices into predict_proba's output — TODO confirm.
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]
            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)
    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Python 2 integer division picks the median-scoring fold.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median], desc, tprs[label][median],fprs[label][median], label='%s vs rest' % genre_list[label])
    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores), np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    #save the trained model to disk
    joblib.dump(clf, 'saved_model/model_ceps.pkl')
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# Compare several classifiers on the same train/test split and print each
# model's test-set accuracy. Python 2 code (`print` statement).
clf = SVC()
clf.fit(X_train,y_train)
print "使用支持向量分类算法分类结果:"
print clf.score(X_test,y_test)  # support vector classification
#nusvm
clf=NuSVC()
clf.fit(X_train,y_train)
print "使用支持向量分类算法分类结果:"
print clf.score(X_test,y_test)  # nu-support vector classification
clf = GaussianNB()
clf.fit(X_train,y_train)
print "使用朴素贝叶斯分类算法分类结果:"
print clf.score(X_test,y_test)  # naive Bayes classification
classifier=LogisticRegression()
classifier.fit(X_train,y_train)
print "使用逻辑回归算法分类结果:"
print classifier.score(X_test,y_test)  # logistic regression
classifier=tree.DecisionTreeClassifier()
classifier.fit(X_train,y_train)
print "使用决策树算法分类结果:"
print classifier.score(X_test,y_test)
classifier=GradientBoostingClassifier(n_estimators=200)
classifier.fit(X_train,y_train)
print "使用GBDT算法分类结果:"
print classifier.score(X_test,y_test)
def train_model(X, Y, name, plot=False, outModelName=outModelName, testSize=0.3):
    """
    train_model(vector, vector, name[, plot=False])

    Trains a LogisticRegression classifier on a single shuffled train/test
    split, optionally plots one-vs-rest ROC curves, and saves the model.

    Parameters
    ----------
    X, Y : feature matrix and label vector, indexable by fold indices
    name : descriptive name used in ROC plot titles
    plot : when True, plot the ROC curve of the median-scoring fold per label
    outModelName : path to save the trained model (*.pkl); falsy skips saving
    testSize : fraction of the data used for testing

    Returns
    -------
    (mean train error, mean test error, confusion matrices as an ndarray)
    """
    labels = np.unique(Y)

    # One shuffled split (n_iter=1) with a fixed seed for reproducibility.
    cv = ShuffleSplit(n=len(X), n_iter=1, test_size=testSize, random_state=0)

    train_errors = []
    test_errors = []
    scores = []
    # NOTE(review): pr_scores is never populated, so the precision/recall
    # part of `summary` below is computed over an empty array.
    pr_scores = defaultdict(list)
    # Declared but unused below; kept for interface/debugging parity.
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # for the median
    cms = []

    for train, test in cv:
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = LogisticRegression()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        cms.append(cm)

        # One-vs-rest ROC/AUC per label; assumes labels are usable as
        # column indices into predict_proba's output — TODO confirm.
        for label in labels:
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba = clf.predict_proba(X_test)
            proba_label = proba[:, label]

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # BUGFIX: use integer division — `len(...) / 2` yields a float
            # on Python 3 and raises TypeError when used as an index.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) // 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])

    # BUGFIX: materialise the dict view before handing it to numpy —
    # np.asarray(dict.values()) builds a 0-d object array on Python 3.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    # Save the trained model to disk when an output path was supplied.
    if outModelName:
        joblib.dump(clf, outModelName)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
def Logistic_Regression(iter_argument=200):
    """Fit logistic regression on the module-level train split.

    `iter_argument` caps the solver's iterations (max_iter). Returns a
    (test accuracy, train accuracy) pair scored against the module-level
    X_test/y_test and X_train/y_train.
    """
    model = LogisticRegression(max_iter=iter_argument)
    model.fit(X_train, y_train)
    return model.score(X_test, y_test), model.score(X_train, y_train)
def test_fit_credit_backupsklearn():
    """Smoke-test h2o4gpu's LogisticRegression against scikit-learn on the
    creditcard dataset: exercise the full scikit-style API surface on both
    implementations and assert that coefficients, intercept, iteration
    counts and predictions agree.
    """
    # Last CSV column is the label; everything before it is the feature set.
    df = pd.read_csv("./open_data/creditcard.csv")
    X = np.array(df.iloc[:, :df.shape[1] - 1], dtype='float32', order='C')
    y = np.array(df.iloc[:, df.shape[1] - 1], dtype='float32', order='C')
    Solver = h2o4gpu.LogisticRegression

    # Plain h2o4gpu fit/predict/score with early stopping disabled.
    enet_h2o4gpu = Solver(glm_stop_early=False)
    print("h2o4gpu fit()")
    enet_h2o4gpu.fit(X, y)
    print("h2o4gpu predict()")
    print(enet_h2o4gpu.predict(X))
    print("h2o4gpu score()")
    print(enet_h2o4gpu.score(X, y))

    # Same hyper-parameters reused for both the wrapper and sklearn below
    # so the results are directly comparable.
    enet = Solver(dual=True, max_iter=100, tol=1E-4, random_state=1234)
    print("h2o4gpu scikit wrapper fit()")
    enet.fit(X, y)
    print("h2o4gpu scikit wrapper predict()")
    print(enet.predict(X))
    print("h2o4gpu scikit wrapper predict_proba()")
    print(enet.predict_proba(X))
    print("h2o4gpu scikit wrapper predict_log_proba()")
    print(enet.predict_log_proba(X))
    print("h2o4gpu scikit wrapper score()")
    print(enet.score(X, y))
    print("h2o4gpu scikit wrapper decision_function()")
    print(enet.decision_function(X))
    print("h2o4gpu scikit wrapper densify()")
    print(enet.densify())
    print("h2o4gpu scikit wrapper sparsify")
    print(enet.sparsify())

    # Reference run with scikit-learn proper (old private-module import
    # path; removed in modern sklearn — kept for the pinned version).
    from sklearn.linear_model.logistic import LogisticRegression
    enet_sk = LogisticRegression(dual=True, max_iter=100, tol=1E-4, random_state=1234)
    print("Scikit fit()")
    enet_sk.fit(X, y)
    print("Scikit predict()")
    print(enet_sk.predict(X))
    print("Scikit predict_proba()")
    print(enet_sk.predict_proba(X))
    print("Scikit predict_log_proba()")
    print(enet_sk.predict_log_proba(X))
    print("Scikit score()")
    print(enet_sk.score(X, y))
    print("Scikit decision_function()")
    print(enet_sk.decision_function(X))
    print("Scikit densify()")
    print(enet_sk.densify())
    print("Sciki sparsify")
    print(enet_sk.sparsify())

    # Round-trip sklearn's coefficients through a sparse matrix so both
    # sides are compared as float32 dense arrays.
    enet_sk_coef = csr_matrix(enet_sk.coef_, dtype=np.float32).toarray()
    print(enet_sk.coef_)
    print(enet_sk_coef)
    print(enet.coef_)
    print(enet_sk.intercept_)
    print("Coeffs, intercept, and n_iters should match")
    assert np.allclose(enet.coef_, enet_sk_coef)
    assert np.allclose(enet.intercept_, enet_sk.intercept_)
    assert np.allclose(enet.n_iter_, enet_sk.n_iter_)
    print("Preds should match")
    assert np.allclose(enet.predict_proba(X), enet_sk.predict_proba(X))
    assert np.allclose(enet.predict(X), enet_sk.predict(X))
    assert np.allclose(enet.predict_log_proba(X), enet_sk.predict_log_proba(X))
# get word vector x = np.zeros((len(raw_x), 100)) for i in range(len(raw_x)): x[i] = model[raw_x[i]] print(x.shape, y.shape) print(np.unique(y)) # training set and test set split x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y) # logistic regression lr = LogisticRegression(C=1.0, solver='sag', max_iter=400, n_jobs=-1) lr.fit(x_train, y_train) print('Logistic regression accuracy: ', lr.score(x_test, y_test)) y_pred = lr.predict(x_test) ac = np.sum([1 if y_pred[i] == y_test[i] else 0 for i in range(len(y_pred))]) / len(y_pred) # svm sigma = 0.1 # sigma影响聚集程度,从而影响泛化程度, sigma小则高斯分布基本作用于支持向量 # 附近,sigma大则可作用范围变大,泛化程度提高. sigma过小会造成过拟合 gamma = np.power(sigma, -2.) / 2. svm_model = SVC(C=1.0, kernel='rbf', gamma=gamma) svm_model.fit(x_train, y_train) print('svm accuracy: ', svm_model.score(x_test, y_test)) # neural networks # 先将y转化为one-hot形式, 即(len(y), np.unique(y).shape]) ==> (30804, 3)
vect__norm: 'l2'
 vect__use_idf: True """
# SMS spam classification: TF-IDF features + L2 logistic regression.
# Python 2 code; uses pre-0.18 sklearn module paths (cross_validation,
# metrics.metrics, linear_model.logistic).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix
__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')
# Default 75/25 train/test split over message text and labels.
X_train_r, X_test_r, y_train, y_test = train_test_split(
    df['message'], df['label'])
# Vectorizer settings mirror the grid-search winners quoted above.
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=None,
                             ngram_range=(1, 1),
                             norm='l2',
                             use_idf=True)
# Fit TF-IDF on training text only; reuse the vocabulary for the test set.
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)
classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print 'score', classifier.score(X_test, y_test)
print 'precision', precision_score(y_test, predictions)
print 'recall', recall_score(y_test, predictions)
print confusion_matrix(y_test, predictions)
#plot and show the confusion matrix with color bar plt.matshow(confusion_matrix) plt.title('Sentiment Analysis from reviews') plt.ylabel('True Values') plt.xlabel('Predicted Values') plt.colorbar() plt.show() # %% #recall precision accuracy and f1 scores print('Accuracy: %s' % accuracy_score(Y_test, regressor.predict(X_test))) #print('Recall: %s'%recall_score(Y_test, regressor.predict(X_test), average='macro')) #print('Precision: %s'%precision_score(Y_test, regressor.predict(X_test), average='macro')) #print('F1: %s'%f1_score(Y_test, regressor.predict(X_test), average='macro')) print('CR: %s' % classification_report(Y_test, regressor.predict(X_test))) print('R Square: %s' % regressor.score(X_test, Y_test)) print('Mean sqared error: %s' % msq(Y_test, regressor.predict(X_test))) ### USING GRID SEARCH ### (called only when the funtion main is called) # %% #CROSS VAL SCORE #print('Cross Val Score: %s'%cross_val_score(regressor, X_vec, Y, cv=5)) # %% def main(): pipeline = Pipeline([('vect', TfidfVectorizer(stop_words='english')), ('reg', LogisticRegression())]) parameters = { 'vect__max_df': (0.25, 0.5, 0.75), 'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
clf__C: 7.0
 clf__penalty: 'l2'
 vect__max_df: 0.5
 vect__max_features: None
 vect__ngram_range: (1, 2)
 vect__norm: 'l2'
 vect__use_idf: True """
# SMS spam classification: TF-IDF features + L2 logistic regression.
# Python 2 code; uses pre-0.18 sklearn module paths (cross_validation,
# metrics.metrics, linear_model.logistic).
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix
__author__ = 'gavin'
import pandas as pd

df = pd.read_csv('sms/sms.csv')
# Default 75/25 train/test split over message text and labels.
X_train_r, X_test_r, y_train, y_test = train_test_split(df['message'],
                                                        df['label'])
# Vectorizer settings mirror the grid-search winners quoted above
# (except ngram_range, which is left at unigrams here).
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=None,
                             ngram_range=(1, 1),
                             norm='l2',
                             use_idf=True)
# Fit TF-IDF on training text only; reuse the vocabulary for the test set.
X_train = vectorizer.fit_transform(X_train_r)
X_test = vectorizer.transform(X_test_r)
classifier = LogisticRegression(penalty='l2', C=7)
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print 'score', classifier.score(X_test, y_test)
print 'precision', precision_score(y_test, predictions)
print 'recall', recall_score(y_test, predictions)
print confusion_matrix(y_test, predictions)
# For each embedding size `num`, load its feature CSV (last column is the
# label), repeat a 10x random train/test evaluation of logistic regression,
# and record mean accuracy and AUC per configuration.
for num in number:
    data = []
    mark = []
    #with open('/Users/hhy/Desktop/1/node/final/cos/herb_only_meansum_'+str(threshold)+'.csv','r',encoding='utf-8_sig') as f:
    with open('/Users/hhy/Desktop/1/node/final/cmp/' + str(num) + 'emb' +
              str(threshold) + '.csv', 'r', encoding='utf-8_sig') as f:
        csv_reader = csv.reader(f)
        for x in csv_reader:
            # All columns but the last are features; the last is the label.
            data.append(list(map(float, x[0:-1])))
            mark.append(float(x[-1]))
    tmp_acc = []
    tmp_auc = []
    name = str(num) + 'emb' + str(threshold)
    # Ten random 95/5 splits, seeded by the loop index for reproducibility.
    for i in range(10):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            data, mark, test_size=0.05, random_state=i)
        clf = LogisticRegression(C=4.8, random_state=1113)
        clf.fit(X_train, y_train)
        tmp_acc.append(clf.score(X_test, y_test))
        y_predict = clf.predict_proba(X_test)[:, 1]
        tmp_auc.append(metrics.roc_auc_score(y_test, y_predict))  # AUC on the held-out split
    final_acc[name] = (sum(tmp_acc) / len(tmp_acc))
    final_auc[name] = (sum(tmp_auc) / len(tmp_auc))
# Rank configurations from best to worst by mean accuracy / mean AUC.
final_acc = sorted(final_acc.items(), key=lambda x: x[1], reverse=True)
final_auc = sorted(final_auc.items(), key=lambda x: x[1], reverse=True)
print('final acc:', final_acc)
print('final auc:', final_auc)
def train(data):
    """Vectorize `data`, fit a logistic-regression classifier on all of it,
    and print the training-set accuracy.
    """
    features, labels, _ = vectorize(data)
    model = LogisticRegression()
    model.fit(features, labels)
    print(model.score(features, labels))
def train_model(X_train, y_train, X_test, y_test, name, plot=False):
    """
    train_model(vector, vector, name[, plot=False])

    Trains a LogisticRegression classifier on the supplied train split,
    evaluates it on the test split, optionally plots per-label one-vs-rest
    ROC curves, and saves the fitted model to disk.

    Returns (mean train error, mean test error, confusion matrices array).
    NOTE: Python 2 code (`print` statement, integer `/` below).
    """
    labels = np.unique(y_train)
    train_errors = []
    test_errors = []
    scores = []
    # NOTE(review): pr_scores is never populated, so the precision/recall
    # part of `summary` is computed over an empty array.
    pr_scores = defaultdict(list)
    # Declared but unused below.
    precisions, recalls, thresholds = defaultdict(list), defaultdict(
        list), defaultdict(list)
    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)
    clfs = []  # for the median
    cms = []
    # print "X_train::"
    # print X_train
    # print "X_test::"
    # print X_test
    # print "y_train::"
    # print y_train
    # print "y_test::"
    # print y_test
    clf = LogisticRegression()
    #clf=GaussianNB()
    #clf=SVC(probability=True)
    clf.fit(X_train, y_train)
    clfs.append(clf)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print "train_score:: " + str(train_score)
    print "test_score:: " + str(test_score)
    scores.append(test_score)
    train_errors.append(1 - train_score)
    test_errors.append(1 - test_score)
    y_pred = clf.predict(X_test)
    print y_pred
    cm = confusion_matrix(y_test, y_pred)
    cms.append(cm)
    # cms = np.asarray(cms)
    # cm_avg = np.mean(cms, axis=0)
    # cm_norm = cm_avg / np.sum(cm_avg, axis=0)
    # plot_confusion_matrix(cm_norm, genre_list, "ceps","CEPS classifier - Confusion matrix")
    # One-vs-rest ROC/AUC per label; assumes labels are usable as column
    # indices into predict_proba's output — TODO confirm.
    for label in labels:
        #print "label "+str(label)
        y_label_test = np.asarray(y_test == label, dtype=int)
        #print "y_label_test "+str(y_label_test)
        proba = clf.predict_proba(X_test)
        #print str(len(proba))+"proba "+str(proba)
        proba_label = proba[:, label]
        fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
        roc_scores[label].append(auc(fpr, tpr))
        tprs[label].append(tpr)
        fprs[label].append(fpr)
    #sys.exit(1)
    if plot:
        for label in labels:
            scores_to_sort = roc_scores[label]
            # Python 2 integer division picks the median-scoring entry.
            median = np.argsort(scores_to_sort)[len(scores_to_sort) / 2]
            desc = "%s %s" % (name, genre_list[label])
            plot_roc_curves(roc_scores[label][median],
                            desc,
                            tprs[label][median],
                            fprs[label][median],
                            label='%s vs rest' % genre_list[label])
    all_pr_scores = np.asarray(pr_scores.values()).flatten()
    summary = (np.mean(scores), np.std(scores), np.mean(all_pr_scores),
               np.std(all_pr_scores))
    #print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)
    #save the trained model to disk
    joblib.dump(clf, 'saved_model/model_ceps.pkl')
    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)
# Binary classification of match situations from data.csv: logistic
# regression over kinematic features, with probability, score, RMSE and
# log-loss diagnostics.
data_frame=pd.read_csv('data.csv')
X = data_frame[['speed_p','speed_r','speed_d','distance_d_p','distance_d_r','distance_d1_p','distance_d2_r'
 ,'angle_d','angle_d1_p','angle_d2_r'
 , 'possTimePre','possessionTime']]
Y = data_frame[['flag']]
# Load the .csv with pandas, then use train_test_split to divide it into a
# training set (75%) and a test set (25%):
X_train, X_test, y_train, y_test = train_test_split(X,Y.values.T[0],random_state=1)
# LogisticRegression likewise implements the fit() and predict() methods
classifier=LogisticRegression()
classifier.fit(X_train,y_train)
predictions=classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
score = classifier.score(X_test,y_test)
print("probabilities" ,probabilities)
print("R-requested", score)
def rmse(y_test, y):
    # Root-mean-square error between two label vectors.
    return sp.sqrt(sp.mean((y_test - y) ** 2))
# Mean squared error and log-loss
# NOTE(review): log_loss is fed hard predictions rather than predict_proba
# probabilities — confirm this is intended.
print("rmse" ,rmse(predictions,y_test))
print("log_loss" ,log_loss(y_test,predictions))
# Array of linear-combination coefficients
coef = classifier.coef_
print(coef)
def classify(data_set_df, user_info_df, feat_set_name, features=None, label='gender', classifier=None,
             reg_param=1.0, selection=False, num_feat=20, sel_method='LR', cv=10):
    """K-fold cross-validated classification of users (columns of
    data_set_df) against a label from user_info_df.

    Appends per-fold results and confusion matrices to experiment CSVs and
    returns (mean fold score, fraction of instances dropped by filtering).
    NOTE: Python 2 code (`print` statement, list-returning map below).
    """
    instance_num = len(data_set_df.columns)
    # Project-specific filtering of the feature matrix and label vector.
    df_filtered, y_v = pc.get_filtered_x_y(data_set_df, user_info_df, label)
    x = df_filtered if features is None else df_filtered.loc[features]
    # Drop rows/columns that are entirely NaN.
    x = x.dropna(how='all', axis=0)
    x = x.dropna(how='all', axis=1)
    # Impute only when NaN/inf values remain after the filtering above.
    if x.isnull().any().any() or (x == np.inf).any().any() or (x == -np.inf).any().any():
        x_imp = pc.fill_nan_features(x)
        # x_imp = dense_df.loc[x.index, x.columns]
    else:
        x_imp = x
    # Column names are user ids; select the matching labels.
    # (map() returns a list here — Python 2 semantics.)
    y_filtered = y_v[(map(int, x.columns.values))]
    clf = LogisticRegression(C=reg_param) if classifier is None else classifier
    cv_num = min(len(y_filtered), cv)
    score_mean = 0.0
    miss_clf_rate = 1.0
    # Cross-validate only when there is more than one fold and class.
    if cv_num > 1 and len(y_filtered.unique()) > 1:
        kf = KFold(y_filtered.shape[0], n_folds=cv_num, shuffle=True)
        # skf = StratifiedKFold(y_filtered, n_folds=cv_num, shuffle=True)
        fold = 0
        result_str = ""
        matrix_str = ""
        for tr_index, te_index in kf:
            fold += 1
            # Instances are columns, so transpose before row-indexing folds.
            x_train, x_test = x_imp.T.iloc[tr_index], x_imp.T.iloc[te_index]
            y_train, y_test = y_filtered.iloc[tr_index], y_filtered.iloc[te_index]
            if selection:
                # Feature selection per fold; LR/RF methods work on the
                # imputed matrix, others on the raw (pre-imputation) one.
                if sel_method == 'LR' or 'RF' in sel_method:
                    feat_index = fimp.feature_selection(x_train.T, user_info_df, num_feat, method=sel_method, label=label)
                else:
                    x_tr_df, x_te_df = x.T.iloc[tr_index].T, x.T.iloc[te_index].T
                    feat_index = fimp.feature_selection(x_tr_df, user_info_df, num_feat, method=sel_method, label=label)
                x_train = x_train.loc[:, feat_index].values
                x_test = x_test.loc[:, feat_index].values
            # Folds that fail to fit (e.g. single-class fold) are skipped.
            try:
                clf.fit(x_train, y_train)
                score = clf.score(x_test, y_test)
                score_mean += score
                result_str += "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s\n" \
                              % (label, True if param.FILL_SUFFIX in feat_set_name else False,
                                 True if param.SCALING_SUFFIX in feat_set_name else False,
                                 selection, 'LR', reg_param, cv, fold, x_train.shape[1], score)
                cf_mat = confusion_matrix(y_test, clf.predict(x_test),
                                          labels=range(len(info.LABEL_CATEGORY[label])))
                matrix_str += np.array_str(cf_mat) + "\n"
            except ValueError:
                pass
                # traceback.print_exc()
                # print i, "why error? skip!"
        print result_str
        # Append results and confusion matrices to the experiment logs.
        file_name = "%s/new_%s.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(result_str)
        file_name = "%s/new_%s_mat.csv" % (param.EXPERIMENT_PATH, feat_set_name)
        with open(file_name, mode='a') as f:
            f.write(matrix_str)
        if fold > 0:
            score_mean = score_mean / fold
    # Fraction of original instances lost to filtering.
    miss_clf_rate = (float(instance_num - len(y_filtered)) / instance_num)
    return score_mean, miss_clf_rate
def Logistic_train(X_in, y_in, X_out, cs, file_log=None):
    """Model-select a logistic-regression classifier over the C values in
    `cs` via 10-fold stratified cross-validation, refit with the best C,
    and return predictions for X_out.

    Parameters: X_in/y_in are the labelled data (list-of-rows), X_out the
    unlabelled rows to predict, cs the candidate regularization strengths,
    file_log an optional open file for progress logging.
    NOTE: uses pre-0.18 sklearn APIs (Scaler, StratifiedKFold(y, k=...)).
    """
    if file_log:
        file_log.writelines('# of Samples: {}, # of Features: {}\n'.format(len(X_in), len(X_in[0])))
    M = len(X_in[0]) #Number of features
    seed(time())
    #To prevent data snooping, breakes the input set into train. cross validation and test sets, with sizes proportional to 8-1-1
    #First puts aside 10% of the data for the tests
    # NOTE(review): despite its name, X_scaler is the held-out *test* set;
    # the scaler itself is also fit on it (see below).
    test_indices, train_indices = split_indices(len(X_in), int(round(0.1*len(X_in))))
    X_scaler = [X_in[i] for i in test_indices]
    y_scaler = [y_in[i] for i in test_indices]
    X_in = [X_in[i] for i in train_indices]
    y_in = [y_in[i] for i in train_indices]
    #scale data first
    scaler = Scaler(copy=False) #in place modification
    #Normalize the data and stores as inner parameters the mean and standard deviation
    #To avoid data snooping, normalization is computed on training set only, and then reported on data
    # NOTE(review): the scaler is actually fit on X_scaler (the 10% test
    # slice), not on the training rows — confirm this is intended.
    scaler.fit(X_scaler, y_scaler)
    X_scaler = scaler.transform(X_scaler)
    X_in = scaler.transform(X_in)
    X_out = scaler.transform(X_out) #uses the same transformation (same mean_ and std_) fit before
    std_test = X_scaler.std(axis=0)
    f_indices = [j for j in range(M) if std_test[j] > 1e-7] #Removes feature with null variance
    # Re-project all three matrices onto the surviving feature columns.
    X_in = [[X_in[i][j] for j in f_indices] for i in range(len(X_in))]
    X_scaler = [[X_scaler[i][j] for j in f_indices] for i in range(len(X_scaler))]
    X_out = [[X_out[i][j] for j in f_indices] for i in range(len(X_out))]
    M = len(X_in[0])
    #Then, on the remaining data, performs a ten-fold cross validation over the number of features considered
    best_cv_accuracy = 0.
    best_c = 0.
    for c in cs:
        # Stratified 10-fold CV over the training rows for this C.
        kfold = cross_validation.StratifiedKFold(y_in, k=10)
        lrc = LogisticRegression(C=c, tol=1e-5)
        in_accuracy = 0.
        cv_accuracy = 0.
        for t_indices, cv_indices in kfold:
            X_train = array([X_in[i][:] for i in t_indices])
            y_train = [y_in[i] for i in t_indices]
            X_cv = array([X_in[i][:] for i in cv_indices])
            y_cv = [y_in[i] for i in cv_indices]
            lrc.fit(X_train, y_train)
            in_accuracy += lrc.score(X_train, y_train)
            cv_accuracy += lrc.score(X_cv, y_cv)
        # Average the per-fold accuracies over the k folds.
        in_accuracy /= kfold.k
        cv_accuracy /= kfold.k
        if file_log:
            file_log.writelines('C: {}\n'.format(c))
            file_log.writelines('\tEin= {}\n'.format(1. - in_accuracy))
            file_log.writelines('\tEcv= {}\n'.format(1. - cv_accuracy))
        if (cv_accuracy > best_cv_accuracy):
            best_c = c
            best_cv_accuracy = cv_accuracy
    #Now tests the out of sample error
    if file_log:
        file_log.writelines('\nBEST result: E_cv={}, C={}\n'.format(1. - best_cv_accuracy, best_c))
    # Refit on the full training portion with the selected C, report the
    # held-out error, and predict the unlabelled rows.
    lrc = LogisticRegression(C=best_c, tol=1e-5)
    lrc.fit(X_in, y_in)
    if file_log:
        file_log.writelines('Ein= {}\n'.format(1. - lrc.score(X_in, y_in)))
        file_log.writelines('Etest= {}\n'.format(1. - lrc.score(X_scaler, y_scaler)))
    y_out = lrc.predict(X_out)
    return y_out