class NormalSVCTrainer(AbstractLearner):
    """Probability-enabled ``SVC`` learner with optional feature standardisation.

    The constructor arguments are forwarded to ``SVC`` (``penalty`` as ``C``,
    ``cache`` as ``cache_size``, ``scheme`` as ``decision_function_shape``,
    ``class_w`` as ``class_weight``). When ``scale`` is true, a
    ``StandardScaler`` is fitted on the training data and re-applied to every
    input passed to the predict methods.
    """

    def __init__(self, kernel='linear', gamma='auto', penalty=1.0, cache=200,
                 scale=True, scheme='ovr', class_w='balanced'):
        self.kernel = kernel
        self.gamma = gamma
        self.penalty = penalty
        self.scheme = scheme
        self.scale = scale
        self.learner = SVC(
            C=penalty,
            kernel=kernel,
            gamma=gamma,
            probability=True,
            cache_size=cache,
            decision_function_shape=scheme,
            class_weight=class_w,
        )

    def _train(self, x_train, y_train):
        """Fit the scaler (when scaling is enabled) and then the classifier."""
        features = x_train
        if self.scale:
            self.scaler = preprocessing.StandardScaler().fit(x_train)
            features = self.scaler.transform(x_train)
        self.learner = self.learner.fit(features, y_train)

    def _predict(self, x):
        """Return predicted labels for ``x``, scaling it first when enabled."""
        features = self.scaler.transform(x) if self.scale else x
        return self.learner.predict(features)

    def _predict_proba(self, x):
        """Return class probabilities for ``x``, scaling it first when enabled."""
        features = self.scaler.transform(x) if self.scale else x
        return self.learner.predict_proba(features)

    def __str__(self):
        template = 'SVC (kernel=%s, penalty: %f, scheme: %s, gamma=%s)'
        return template % (self.kernel, self.penalty, self.scheme, str(self.gamma))
def pipeline(iteration, C, gamma, random_seed):
    """Train one RBF SVC on a random 60% subsample and export its scores.

    Reads the module-level ``train_x``/``train_y``, ``test_x``/``test_Idx``
    and ``val_x``/``val_Idx``. The second probability column is written to
    ``./test/svm_<iteration>.csv`` and ``./val/svm_<iteration>.csv``.
    """
    x_train, _, y_train, _ = train_test_split(
        train_x, train_y, test_size=0.4, random_state=random_seed
    )
    print(x_train.shape)

    clf = SVC(
        C=C,
        kernel="rbf",
        gamma=gamma,
        probability=True,
        cache_size=7000,
        class_weight="balanced",
        verbose=True,
        random_state=random_seed,
    )
    clf.fit(x_train, y_train)

    def export_scores(features, ids, path):
        # Score first, then build the two-column frame and write it out.
        pred = clf.predict_proba(features)
        frame = pd.DataFrame(columns=["Idx", "score"])
        frame["Idx"] = ids
        frame["score"] = pred[:, 1]
        frame.to_csv(path, index=None)

    # predict test set
    export_scores(test_x, test_Idx, "./test/svm_{0}.csv".format(iteration))
    # predict val set
    export_scores(val_x, val_Idx, "./val/svm_{0}.csv".format(iteration))
def svm_grid_search():
    """Grid-search an SVC over ``C`` and kernel type, scored by log loss.

    Data comes from ``prepare_input()``; only the first 100000 training rows
    are used. Prints the row counts, the best parameters, the training and
    validation log loss of the best estimator, and the elapsed time.

    Returns:
        The best estimator found by the grid search.
    """
    # get data
    training_input, training_target, validation_input, validation_target = prepare_input()

    # Log loss is an error, not a score, so greater_is_better is False,
    # and it needs class probabilities rather than hard labels.
    log_loss_scorer = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

    training_input = training_input[:100000]
    training_target = training_target[:100000]
    print(training_input.shape[0])
    print(training_target.shape[0])

    start = time.time()
    svm = SVC(random_state=31, probability=True)
    svm_parameters = {'C': [.001, .01, .1, 1, 10, 100], 'kernel': ["rbf", "sigmoid"]}
    # The scorer is passed by keyword: newer scikit-learn releases make every
    # GridSearchCV argument after the parameter grid keyword-only.
    svm_grid_obj = GridSearchCV(svm, svm_parameters, scoring=log_loss_scorer,
                                verbose=2, n_jobs=-1)
    svm_grid_obj = svm_grid_obj.fit(training_input, training_target)
    svm = svm_grid_obj.best_estimator_
    print("Best params: " + str(svm_grid_obj.best_params_))

    svm_train_error = log_loss(training_target, svm.predict_proba(training_input))
    svm_validation_error = log_loss(validation_target, svm.predict_proba(validation_input))
    print("Best SVM training error: {:02.4f}".format(svm_train_error))
    print("Best SVM validation error: {:02.4f}".format(svm_validation_error))
    end = time.time()
    # The message previously said "RF", a leftover from a random-forest search.
    print("SVM grid search took {:02.4f} seconds".format(end - start))
    return svm
def svm_solver(train_data, train_label, validation, test, dimreduce, convertbinary):
    """Train a fixed-parameter SVC and return its scores for ``test``.

    ``dimreduce`` is called with ``(train_data, train_label, validation,
    test)`` and ``convertbinary`` with the three matrices it returns; both
    must return a ``(train, validation, test)`` triple. The second probability
    column for ``validation`` is handed to ``evaluate.get_auc`` and the second
    probability column for ``test`` is returned.
    """
    logging.info('begin to train the svm classifier')

    train_feats, valid_feats, test_feats = dimreduce(train_data, train_label, validation, test)
    train_feats, valid_feats, test_feats = convertbinary(train_feats, valid_feats, test_feats)

    # A RandomizedSearchCV over an RBF grid (C in 0.1..10, gamma in 0.01..0.5,
    # tol in 1e-1..1e-4, cv=3, n_iter=30) used to run here; it is disabled and
    # the parameters below are hard-coded instead.
    svc = SVC(
        probability=True,
        kernel='poly',
        C=0.1,
        random_state=1000000007,
        tol=0.001,
        gamma=0.1,
        class_weight='auto',
    )
    svc.fit(train_feats, train_label)

    validation_scores = svc.predict_proba(valid_feats)[:, 1]
    evaluate.get_auc(validation_scores)
    return svc.predict_proba(test_feats)[:, 1]
def main():
    """Detect regions in an image, reduce them with PCA and classify them.

    The image path comes from the ``-i/--image`` command-line option. Every
    rectangle returned by ``detect`` is cropped, resized to 100x100, shown in
    its own window and flattened into a feature vector. The first seven
    vectors are used for training with hard-coded labels, and vectors 7..12
    are classified and printed.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True, help="Path to the image")
    args = vars(ap.parse_args())

    image = cv2.imread(args["image"])
    if image is None:
        # cv2.imread reports an unreadable file by returning None, which
        # would otherwise surface later as an obscure error inside detect().
        ap.error("could not read image: %s" % args["image"])
    rects, _ = detect(image)

    cropped = []
    for idx, (x1, y1, x2, y2) in enumerate(rects):
        crop_img = image[y1:y2, x1:x2]
        crop_img = cv2.resize(crop_img, (100, 100), interpolation=cv2.INTER_AREA)
        cv2.imshow("image" + str(idx), crop_img)
        cropped.append(crop_img.flatten())

    # reduce feature size
    pca = RandomizedPCA(n_components=100)
    cropped_pca = pca.fit_transform(cropped)

    # training (hardcoded for now): needs at least seven detected regions
    clf = SVC(probability=True)
    train = cropped_pca[:7]
    test = cropped_pca[7:13]
    clf.fit(train, [1, 2, 2, 1, 2, 1, 1])

    for item in test:
        # Estimators expect a 2-D (n_samples, n_features) array; a bare 1-D
        # sample is rejected by current scikit-learn releases.
        sample = item.reshape(1, -1)
        print(clf.predict_proba(sample))
        print(clf.predict(sample))
    cv2.waitKey(0)
def go_by_category_2(category):
    """Train an RBF SVC for ``category`` and score held-out and test data.

    Training data and its scaler come from ``TrainingFactory``; 10% of it is
    held out. Prints the held-out log loss, accuracy and confusion matrix,
    then the second-column probabilities for the scaled test data.

    Returns:
        ``(accuracy, output_targets, testing_output_proba)``: the held-out
        accuracy, the held-out predicted labels, and the list of
        second-column probabilities for the test data.
    """
    features, targets, scaler = TrainingFactory.get_training_data_by_category(category, 10000)
    input_train, input_test, target_train, target_test = train_test_split(
        features, targets, test_size=0.1
    )

    test_data_sparse = TestingFactory.get_test_data(limit=1000)
    test_data_scaled = scaler.transform(test_data_sparse)
    test_data = csr_matrix(test_data_scaled)

    classif = SVC(kernel='rbf', C=0.1, tol=0.001, probability=True)
    classif.fit(input_train, target_train)

    output_targets_proba = classif.predict_proba(input_test)
    output_targets = classif.predict(input_test)

    # Log loss is defined on predicted probabilities; feeding it the hard
    # labels (as before) only measures the clipping constant.
    print(log_loss(target_test, output_targets_proba))
    accuracy = accuracy_score(target_test, output_targets)
    print(accuracy)
    print(confusion_matrix(target_test, output_targets))

    testing_output = classif.predict_proba(test_data)
    testing_output_proba = [item[1] for item in testing_output]
    print(testing_output_proba)
    return accuracy, output_targets, testing_output_proba
def svm_classify(threshold):
    """Fit an RBF SVC on polynomial features of ``cot`` to predict ``brent`` moves.

    Reads the module-level ``cot`` and ``brent`` frames. Every ``cot`` column
    except the last is divided by its 5-period rolling mean, then squared,
    cubed and multiplied with the one or two preceding columns. The target is
    whether ``brent.Rate`` five rows ahead exceeds the rate one row ahead.

    Args:
        threshold: minimum top-class probability for a test sample to count
            in the confusion matrix and the weighted score.

    Returns:
        ``[score, confusion_matrix, message, brent]``, where ``score`` is the
        test accuracy weighted towards samples above ``threshold`` and
        ``message`` describes the prediction for the last row of the data.
    """
    data = pd.DataFrame()
    prev = None
    prev2 = None
    for x in cot.columns[:-1]:
        data[x] = cot[x] / pd.rolling_mean(cot[x], 5)
        data[x + '_polynomial2'] = data[x] * data[x]
        data[x + '_polynomial3'] = data[x] * data[x] * data[x]
        if prev is not None:
            data[x + '_polynomial_x_2'] = data[x] * data[prev]
        if prev2 is not None:
            data[x + '_polynomial_x_3'] = data[x] * data[prev2] * data[prev]
        # Shift the history oldest-first; assigning prev before prev2 (as the
        # code used to) made both names point at the same column.
        prev2 = prev
        prev = x

    data['return'] = ((brent.shift(-5).Rate / brent.shift(-1).Rate) - 1) > 0
    data = data[8:].dropna(axis=1)

    x_train, x_test, y_train, y_test = train_test_split(
        data.iloc[:-1, :-1], data.iloc[:-1, -1], test_size=0.5
    )
    gbc = SVC(kernel='rbf', probability=True, C=1)
    gbc.fit(x_train, y_train)

    # Confidence is the probability of the most likely class. The scaler
    # (`mms`) that used to post-process it was commented out, leaving a
    # NameError, so the raw probability is used.
    # NOTE(review): confirm no rescaling of the confidence is wanted.
    confidence = gbc.predict_proba(x_test).max(axis=1)

    Y = pd.DataFrame()
    Y['actual'] = y_test
    Y['predicted'] = gbc.predict(x_test)
    Y['P'] = confidence
    Y_filtered = Y[Y.P > threshold]
    cm = confusion_matrix(Y_filtered.actual, Y_filtered.predicted)

    # Weight 1.0 for confident samples and 0.0 for the rest.
    confident = (confidence > threshold).astype(float)
    score = gbc.score(x_test, y_test, confident)

    latest = data.iloc[-1:, :-1]
    message = 'Prediction of UP is %s; P = %s' % (
        gbc.predict(latest)[0],
        gbc.predict_proba(latest).max(axis=1)[0],
    )
    return [score, cm, message, brent]
def grid_searcher(self):
    """Grid-search RBF SVC parameters and write averaged test predictions.

    The last entry of ``self.cv_data`` is merged back into one training set
    and split into 10 stratified folds. For every ``(C, gamma)`` pair a model
    is trained per fold; its training log loss and its second-column
    probabilities for ``self.testMat`` are recorded. The pair with the lowest
    mean training loss is selected and its per-fold test predictions are
    averaged into ``sc_prediction.csv``, one value per line.
    """
    X_train, X_test, Y_train, Y_test = self.cv_data[-1]
    X_train = np.vstack((X_train, X_test))
    Y_train = np.concatenate((Y_train, Y_test))
    stratifiedCV = StratifiedKFold(Y_train, 10)

    C_range = 10.0 ** np.arange(-4, 9)
    gamma_range = 10.0 ** np.arange(-5, 4)

    # Parallel lists, one entry per (C, gamma) pair. The selection below
    # yields a position, so results must be addressed by position too; the
    # earlier Cantor-pair dict keys did not match those positions.
    train_losses = []
    test_preds = []
    for C in C_range:
        for gamma in gamma_range:
            fold_losses = []
            fold_preds = []
            for train, _ in stratifiedCV:
                # NOTE(review): only the training part of each fold is used,
                # so selection is on training loss - confirm that is intended.
                X_trainT = X_train[train, :]
                Y_trainT = Y_train[train, :]  # assumes 2-D labels, as before
                svc = SVC(kernel="rbf", C=C, gamma=gamma, probability=True,
                          class_weight="auto")
                svc.fit(X_trainT, Y_trainT)
                fold_losses.append(logloss(Y_trainT, svc.predict_proba(X_trainT)[:, 1]))
                fold_preds.append(svc.predict_proba(self.testMat)[:, 1])
            train_losses.append(fold_losses)
            test_preds.append(fold_preds)

    meanScores = np.array([np.mean(losses) for losses in train_losses])
    # Negative losses are replaced by 1.0 so they cannot win the selection.
    meanScores[meanScores < 0] = 1.0
    print(meanScores.min())

    best = int(np.argmin(meanScores))
    finalPred = np.vstack(test_preds[best]).mean(axis=0)

    with open("sc_prediction.csv", "w") as out:
        for value in finalPred:
            out.write(str(value) + "\n")
def svcmodel(d, X_2, y_2, X_3, y_3, X_test, y_test):
    """Cross-validate and fit an SVC, then score ``X_3``.

    Prints the mean 5-fold cross-validation accuracy on ``(X_2, y_2)`` and
    the accuracy on ``(X_test, y_test)``, each prefixed with ``d``.

    Args:
        d: label prepended to both printed lines.
        X_2, y_2: training features and labels.
        X_3: DataFrame to score; it is copied, never modified.
        y_3: unused.
        X_test, y_test: held-out features and labels.

    Returns:
        A Series named ``chance``, indexed like ``X_3``, holding the second
        probability column predicted for each row of ``X_3``.
    """
    # k-fold cross-validation
    scores = cross_val_score(SVC(), X_2, y_2, cv=5, scoring='accuracy')
    score_mean = scores.mean()
    print(d + '5折交互检验:' + str(score_mean))

    svc = SVC(probability=True).fit(X_2, y_2)

    # predict the test set
    answer_svc = svc.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, answer_svc)
    print(d + '预测:' + str(accuracy))

    # Assign the whole probability column at once instead of writing one
    # cell per row into an integer column.
    X_3_copy = X_3.copy(deep=True)
    X_3_copy['chance'] = svc.predict_proba(X_3)[:, 1]
    return X_3_copy['chance']
class LinearSVMPredictor(PredictorBase):
    '''
    Predictor backed by a linear-kernel SVC with probability estimates.
    '''

    def __init__(self, animal_type):
        self.animal_type = animal_type
        self.clf = SVC(kernel="linear", C=1.0, probability=True, random_state=0)

    def fit(self, X_train, y_train):
        '''Fit the underlying classifier.'''
        self.clf.fit(X_train, y_train)

    def predict(self, X_test):
        '''Return class probabilities wrapped by ``bundle_predictions``.'''
        probabilities = self.clf.predict_proba(X_test)
        return self.bundle_predictions(probabilities)

    def find_best_params(self):
        '''Grid-search ``C`` for a linear SVC on the training CSV and print the best parameters.'''
        search = grid_search.GridSearchCV(
            SVC(),
            {'kernel': ["linear"], 'C': [0.025, 1.0]},
        )
        train_data = select_features(get_data('../data/train.csv'), self.animal_type)
        features = train_data.drop(['OutcomeType'], axis=1)
        labels = train_data['OutcomeType']
        search.fit(features, labels)
        print(search.best_params_)
def svc(n_components=10):
    """
    Train a support vector classifier after dimensionality reduction with PCA.

    Reads ``train.csv``, builds features with ``raw_scaled_features`` and runs
    10 stratified folds. In each fold PCA is fitted on the training part
    only, and the log loss on the held-out part is printed.

    Each fold takes ~10 min. First fold gave log loss: 0.684875244651

    Args:
        n_components: number of principal components kept by PCA.
    """
    train = pandas.read_csv('train.csv')
    y = train['target'].values
    X = raw_scaled_features(train)
    folds = StratifiedKFold(train['target'], 10)
    for train_indices, test_indices in folds:
        X_train = X[train_indices]
        y_train = y[train_indices]
        X_test = X[test_indices]
        y_test = y[test_indices]

        pca = PCA(n_components=n_components)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

        # Named `clf` so the local does not shadow this function's own name.
        clf = SVC(probability=True, verbose=False)
        clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)
        # The class list must go in by keyword: log_loss's third positional
        # parameter is `eps`, not the labels.
        print(log_loss(y_test, y_prob, labels=clf.classes_))
def support_vector(XTrain, yTrain, XTest):
    """Fit a linear SVC and return ``(labels, scores)`` for ``XTest``.

    ``scores`` is the ``predict_proba`` output and ``labels`` the ``predict``
    output of the fitted classifier.
    """
    classifier = SVC(kernel='linear', probability=True)
    classifier.fit(XTrain, yTrain)
    probabilities = classifier.predict_proba(XTest)
    predicted = classifier.predict(XTest)
    return predicted, probabilities
class SVMPredictor(object):
    """A simple application of the SVM classifier.

    @author: Shaun
    """

    def __init__(self):
        self.clf = SVC(probability=True)

    # fit/predict are concrete implementations, so they are no longer marked
    # @abstractmethod; the marker was misleading and would force subclasses
    # that use an ABC metaclass to override working methods.
    def fit(self, X, y):
        """
        Method to fit the model.

        Parameters:
        X - 2d numpy array of training data
        y - 1d numpy array of training labels
        """
        self.clf = self.clf.fit(X, y)

    def predict(self, X):
        """
        Method to apply the model to data.

        Parameters:
        X - 2d numpy array of test data

        Returns the second probability column for every row of X.
        """
        return self.clf.predict_proba(X)[:, 1]
def test(self):
    """Fit a linear SVC on the stored data and run it on one test sample.

    Trains on ``self.dataMat``/``self.labelMat`` and predicts the label and
    class probabilities for row 1 of ``self.testData``. The results are not
    returned; the call only exercises the model.
    """
    X, y = self.dataMat, self.labelMat
    X_test = self.testData
    clf = SVC(kernel='linear', C=0.001, probability=True)
    clf.fit(X, y)
    # Slice rather than index so the sample stays 2-D (1, n_features);
    # current scikit-learn releases reject 1-D input.
    sample = X_test[1:2, :]
    y_pred = clf.predict(sample)
    y_predprob = clf.predict_proba(sample)
def predict_svc(X_train, y_train, X_test, sample_weight):
    """Fit a weighted RBF SVC and return class probabilities for ``X_test``.

    Args:
        X_train, y_train: training features and labels.
        X_test: features to score.
        sample_weight: per-sample weights passed to ``SVC.fit``.

    Returns:
        The ``predict_proba`` output for ``X_test``.
    """
    # gamma='auto' (1 / n_features) is the explicit spelling of the legacy
    # gamma=0.0, which newer scikit-learn releases no longer treat as
    # "pick automatically".
    clf = SVC(degree=3, gamma='auto', kernel='rbf', probability=True)
    clf.fit(X_train, y_train, sample_weight=sample_weight)
    return clf.predict_proba(X_test)
def svc(params):
    """Train and evaluate one SVC for a ``(C, gamma)`` pair.

    Uses the module-level ``X``, ``y``, ``border`` and ``q``: rows before
    ``border`` are the training set and the rest the test set.

    Args:
        params: a ``(C, gamma)`` pair. It is taken as one argument and
            unpacked inside the body, because tuple parameters in the
            signature are Python-2-only syntax.

    Returns:
        ``[C, gamma, accu, mean_bits, accu_by_q, train_time, test_time]``.
    """
    C, gamma = params
    s = SVC(C=C, gamma=gamma, probability=True)

    start = time.time()
    s.fit(X[:border], y[:border])
    train_time = time.time() - start
    pred = s.predict_proba(X[border:])[:, 0]
    test_time = (time.time() - start) - train_time

    y_test = y[border:]

    # This is the literal is-it-the-right-answer binary score.
    # This measure is what we try to maximize but its relation to question
    # accuracy is complicated.
    # Compare against the test labels only (not the whole of y), and use a
    # mean so the ratio is not truncated by integer division.
    accu = np.mean((pred > 0.5) == y_test)

    ### This is the actual question prediction error, in bits
    # First, find the probabilities
    pred_y = pred * y_test              # the probabilities for right answers
    pred_y = pred_y[pred_y.nonzero()]   # the same, stripped of 0's
    mean_bits = np.mean(-np.log(pred_y) / np.log(2))  # measured in mean bits

    ### This is the literal accuracy - it gets complicated
    # Sort the answers by probability (only getting the indices)
    confidence_order = np.argsort(pred)
    # This indexing trick always takes the last assignment for each index.
    # This will hold the index of the best answer for each question.
    best_answer = np.zeros(np.max(q.astype(int)) + 1)
    best_answer[q[confidence_order].astype(int)] = confidence_order
    # Take the average correctness of the best answer
    accu_by_q = y_test[best_answer.astype(int)].mean()

    return [C, gamma, accu, mean_bits, accu_by_q, train_time, test_time]
def train_validate_test(parameter_dict, X_train, X_validate, y_train, y_validate, scaler):
    """Train an SVC from ``parameter_dict`` and report accuracy and log loss.

    Prints progress, the mean accuracy and the multi-class log loss on the
    training and validation sets, then calls ``make_submission``.

    Args:
        parameter_dict: must contain every SVC keyword listed in ``svc_keys``.
        X_train, y_train: training features and labels.
        X_validate, y_validate: validation features and labels.
        scaler: passed through to ``make_submission``.

    Returns:
        A copy of ``parameter_dict`` extended with ``acc_train``,
        ``acc_validate``, ``logloss_train`` and ``logloss_validate``.

    Raises:
        ValueError: from ``LabelEncoder.transform`` if ``y_validate`` holds a
            label that does not occur in ``y_train``.
    """
    svc_keys = (
        "C", "kernel", "degree", "gamma", "coef0", "probability", "shrinking",
        "tol", "cache_size", "class_weight", "verbose", "max_iter", "random_state",
    )
    classifier = SVC(**{key: parameter_dict[key] for key in svc_keys})

    print("training ...")
    classifier.fit(X_train, y_train)
    print("trained.")

    print("testing ...")
    y_train_proba = classifier.predict_proba(X_train)
    y_validate_proba = classifier.predict_proba(X_validate)
    print("tested.")

    acc_train = classifier.score(X_train, y_train)
    acc_validate = classifier.score(X_validate, y_validate)
    print("mean accuracy on training set: %s" % str(acc_train))
    print("mean accuracy on validation set: %s" % str(acc_validate))

    # Fit the encoder once, on the training labels. Refitting on the
    # validation labels can shift the indices away from the classifier's
    # probability columns when a class is missing from the validation set.
    encoder = LabelEncoder()
    encoder.fit(y_train)
    logloss_train = logloss_mc(encoder.transform(y_train), y_train_proba)
    print("logarithmic loss on training set: %s" % str(logloss_train))
    logloss_validate = logloss_mc(encoder.transform(y_validate), y_validate_proba)
    print("logarithmic loss on validateion set: %s" % str(logloss_validate))

    info_dict = parameter_dict.copy()
    info_dict["acc_train"] = acc_train
    info_dict["acc_validate"] = acc_validate
    info_dict["logloss_train"] = logloss_train
    info_dict["logloss_validate"] = logloss_validate

    make_submission(classifier, encoder, scaler, info_dict)
    return info_dict
def svc(X, y, Z, test_data):
    """Fit a default SVC on ``(X, y)`` and write scores for ``Z`` to CSV.

    The second probability column for each row of ``Z`` is stored under
    ``truth``, indexed by ``test_data["enrollment_id"]``, in
    ``data\\result\\seventh_svc.csv``.
    """
    from sklearn.svm import SVC

    # Named `clf` so the local does not shadow this function's own name.
    clf = SVC(probability=True)
    clf.fit(X, y)
    # [:, 1] selects the second probability column for every row; the former
    # [:1] kept only the first row of the probability matrix.
    test_probs_svc = clf.predict_proba(Z)[:, 1]
    sub = pd.DataFrame({
        'enrollment_id': test_data["enrollment_id"],
        'truth': test_probs_svc,
    }).set_index("enrollment_id")
    sub.to_csv('data\\result\\seventh_svc.csv')
def __metric_pipeline(self, metric, params=None, in_data=None):
    """Check that a wrapped metric stage matches a direct call to ``metric``.

    Builds a pipeline (read -> train/test split -> SVC -> second probability
    column -> metric -> CSV) and, alongside it, a control computation using
    scikit-learn directly with the same ``random_state``. After the pipeline
    runs, each CSV output is compared with the matching control return value.

    Args:
        metric: the metric callable to wrap and to call for the control.
        params: keyword arguments for the wrapped metric stage; ``None``
            (the default) means none.
        in_data: passed to ``self.__process_in_data`` to obtain ``X``/``y``.
    """
    # A None default replaces the shared mutable `{}` default.
    params = {} if params is None else params

    X_in, y_in = self.__process_in_data(in_data)

    metric_stage = wrap_and_make_instance(metric, **params)
    in_keys = metric_stage.input_keys
    out_keys = metric_stage.output_keys

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X_in))
    node_y_in = p.add(NumpyRead(y_in))

    node_split = p.add(SplitTrainTest(2, random_state=0))
    # `>` connects an output port to an input port.
    node_X_in["output"] > node_split["input0"]
    node_y_in["output"] > node_split["input1"]

    ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = train_test_split(
        X_in, y_in, random_state=0
    )

    node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
    node_split["train0"] > node_clf["X_train"]
    node_split["train1"] > node_clf["y_train"]
    node_split["test0"] > node_clf["X_test"]

    ctrl_clf = SVC(random_state=0, probability=True)
    ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

    node_proba_1 = p.add(SplitY(1))
    node_clf["pred_proba"] > node_proba_1["input"]

    ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

    node_metric = p.add(metric_stage)

    ctrl_metric_args = {}
    if "y_true" in in_keys:
        node_split["test1"] > node_metric["y_true"]
        ctrl_metric_args["y_true"] = ctrl_y_test
    if "y_score" in in_keys:
        node_proba_1["y"] > node_metric["y_score"]
        ctrl_metric_args["y_score"] = ctrl_y_score
    if "probas_pred" in in_keys:
        node_proba_1["y"] > node_metric["probas_pred"]
        ctrl_metric_args["probas_pred"] = ctrl_y_score

    out_nodes = [
        p.add(CSVWrite(self._tmp_files("out_{}.csv".format(out_key))))
        for out_key in out_keys
    ]
    # A plain loop: the connections are made for their side effect only.
    for out_key, out_node in zip(out_keys, out_nodes):
        node_metric[out_key] > out_node["input"]

    self.run_pipeline(p)

    ctrl_returns = metric(**ctrl_metric_args)
    if len(out_keys) == 1:
        ctrl_returns = (ctrl_returns,)

    for i, out_key in enumerate(out_keys):
        control = ctrl_returns[i]
        result = self._tmp_files.csv_read("out_{}.csv".format(out_key), as_nd=True)
        self.assertTrue(result.shape == control.shape and np.allclose(result, control))
def inter_svm(train_data, test_data, num_fold):
    """Tune ``C`` for a precomputed-kernel SVC by cross-validation and test it.

    Column 0 of both arrays holds the labels. The remaining columns are
    replaced by ``inter_kernel`` values against the training samples, so
    column ``j + 1`` of the transformed data is the kernel value with
    training sample ``j``.

    Args:
        train_data: labelled training array.
        test_data: labelled test array.
        num_fold: number of stratified cross-validation folds.

    Returns:
        ``(best_train_ACC, best_valid_ACC, test_ACC, best_C)``. Despite the
        names, all three scores are ROC AUC values.
    """
    # The test kernel must be computed before train_data is overwritten.
    test_kernel = inter_kernel(test_data[:, 1:], train_data[:, 1:])
    test_data = numpy.hstack([test_data[:, :1], test_kernel])
    train_kernel = inter_kernel(train_data[:, 1:], train_data[:, 1:])
    train_data = numpy.hstack([train_data[:, :1], train_kernel])

    # The folds depend only on the labels, so build them once.
    skf = cross_validation.StratifiedKFold(train_data[:, 0], n_folds=num_fold)

    # Start below any reachable AUC so the first candidate is always
    # recorded and best_C / best_train_ACC can never be left unbound.
    best_valid_ACC = float('-inf')
    best_train_ACC = None
    best_C = None

    # Tune parameters
    for z in range(-8, 8):
        C = pow(2, z)
        result_train = []
        result_valid = []

        # C is passed by keyword: newer scikit-learn releases make the SVC
        # constructor arguments keyword-only.
        clf = SVC(C=C, kernel='precomputed', probability=True)

        # Do cross-validation
        for train_index, valid_index in skf:
            # +1 skips the label column when selecting kernel columns.
            train_block = train_data[train_index, :][:, train_index + 1]
            valid_block = train_data[valid_index, :][:, train_index + 1]
            clf.fit(train_block, train_data[train_index, 0])

            train_pred = clf.predict_proba(train_block)
            valid_pred = clf.predict_proba(valid_block)
            result_train.append(roc_auc_score(train_data[train_index, 0], train_pred[:, 1]))
            result_valid.append(roc_auc_score(train_data[valid_index, 0], valid_pred[:, 1]))

        # If the mean validation score beats the best so far, record it
        mean_valid = sum(result_valid) / num_fold
        if mean_valid > best_valid_ACC:
            best_valid_ACC = mean_valid
            best_train_ACC = sum(result_train) / num_fold
            best_C = C

    # Predict test data with best C
    clf = SVC(C=best_C, kernel='precomputed', probability=True)
    clf.fit(train_data[:, 1:], train_data[:, 0])
    test_pred = clf.predict_proba(test_data[:, 1:])
    test_ACC = roc_auc_score(test_data[:, 0], test_pred[:, 1])

    return best_train_ACC, best_valid_ACC, test_ACC, best_C
def main():
    """Compare a decision tree, a random forest and a linear SVM by ROC curve.

    Reads ``../data/main_data.csv`` and ``../data/target.csv``, holds out 25%
    of the rows for testing, trains the three models, and saves the ROC
    curves on the held-out rows to ``rocCurve.png``. The SVM is trained on a
    small random subset of the training rows because the full set is too
    slow.
    """
    # Load the data
    print('Reading data...')
    main_data = pd.read_csv('../data/main_data.csv')
    targets = pd.read_csv('../data/target.csv')
    print(len(main_data.index))

    # Split the data
    print('Splitting...')
    X_train, X_test, y_train, y_test = train_test_split(
        main_data, targets, test_size=0.25, random_state=42
    )

    # SVM subset: drawn from the training rows only. It used to be drawn
    # from all rows, so the SVM could be trained on rows it is scored on
    # below. The fraction keeps the subset at the former size (25% of a 1%
    # sample of all rows), expressed relative to the 75% training part.
    # presumably the label column 'TARGET' comes from target.csv - confirm
    svm_fraction = 0.010 * 0.25 / 0.75
    svm_train = X_train.sample(frac=svm_fraction)
    svm_y_train = y_train.loc[svm_train.index, 'TARGET']
    print(len(svm_train.index))

    # Train tree
    print('Training Tree...')
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)
    print('Predicting Tree...')
    tree_pred = tree.predict_proba(X_test)[:, 1]
    tree_fpr, tree_tpr, _ = roc_curve(y_test, tree_pred)

    # Train random forest
    print('Training Random Forest...')
    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    print('Predicting Random Forest...')
    rf_pred = rf.predict_proba(X_test)[:, 1]
    rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_pred)

    # Train SVM on the subset: the full data takes too long to run
    print('Training SVM...')
    svm_c = SVC(kernel='linear', probability=True)
    svm_c.fit(svm_train, svm_y_train)
    print('Predicting SVM...')
    svm_pred = svm_c.predict_proba(X_test)[:, 1]
    svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_pred)

    # Plot model comparison
    print('Creating Plot...')
    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.plot(tree_fpr, tree_tpr, label='Tree')
    plt.plot(rf_fpr, rf_tpr, label='RF')
    plt.plot(svm_fpr, svm_tpr, label='SVM')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    print('Saving Plot...')
    plt.savefig('rocCurve.png')
def predictSVC(train, labels, test):
    """Fit a default SVC and return ``(predictions, best_probabilities)``.

    ``best_probabilities`` holds, for every row of ``test``, the largest
    class probability reported by ``predict_proba``.
    """
    print('start SVC')
    model = SVC(probability=True)
    model.fit(train, labels)
    predicted_labels = model.predict(test)
    top_probability = model.predict_proba(test).max(axis=1)
    print('svc done!')
    return predicted_labels, top_probability
def predict_SVM():
    """Fit an RBF SVC on the data from ``loadData`` and export test scores.

    Writes a ``uid``/``score`` CSV to ``../result/result_SVM_rank.csv``,
    where ``score`` is the second probability column.
    """
    X, y, test_x, test_uid = loadData()

    model = SVC(C=2.5, gamma=0.03, kernel='rbf', probability=True, random_state=23333)
    model.fit(X, y)
    probabilities = model.predict_proba(test_x)

    result = pd.DataFrame(columns=["uid", "score"])
    result["uid"] = test_uid
    result["score"] = probabilities[:, 1]
    result.to_csv('../result/result_SVM_rank.csv', index=None, encoding='utf-8')
def pipeLine(iteration, C, gamma, random_seed):
    """Fit one RBF SVC with the given parameters and export test scores.

    Data comes from ``loadData``. The second probability column is written
    as ``score`` next to ``uid`` in ``../result/svm_pred<iteration>.csv``.
    """
    X, y, test_x, test_uid = loadData()

    model = SVC(C=C, kernel='rbf', gamma=gamma, probability=True,
                random_state=random_seed)
    model.fit(X, y)
    probabilities = model.predict_proba(test_x)

    frame = pd.DataFrame(columns=["uid", "score"])
    frame["uid"] = test_uid
    frame["score"] = probabilities[:, 1]
    out_path = '../result/svm_pred{0}.csv'.format(iteration)
    frame.to_csv(out_path, index=None, encoding='utf-8')
def SVM(X, Y, XTest, YTest):
    """Fit a fixed-parameter SVC, print train/test statistics and plot its ROC.

    For both the training and the test set this prints the prediction time,
    accuracy, binary precision and binary recall. The ROC curve is computed
    on the test set from the first probability column with ``pos_label=0``
    and drawn on the current matplotlib axes.
    """
    print('-----------------------------------------------------')

    # A GridSearchCV over C in [.001 .. 40], gamma in [.001 .. 7] and the
    # kernels rbf/sigmoid/linear used to run here; it is disabled and fixed
    # parameters are used instead.
    model = SVC(C=500, gamma=0.1, probability=True)

    # fit the model
    model.fit(X, Y)

    def report(split, features, labels):
        # Only the predict() call is timed; all metrics are computed before
        # anything is printed.
        started = time.time()
        predicted = model.predict(features)
        elapsed = time.time() - started
        accuracy = metrics.accuracy_score(labels, predicted)
        precision = metrics.precision_score(labels, predicted, average='binary')
        recall = metrics.recall_score(labels, predicted, average='binary')
        print("SVM RBF %s prediction time: " % split + str(elapsed))
        print("SVM RBF %s accuracy Score: " % split + str(accuracy))
        print("SVM RBF %s precision Score: " % split + str(precision))
        print("SVM RBF %s recall Score: " % split + str(recall))

    print("Computing training statistics")
    report("training", X, Y)

    print("Computing testing statistics")
    report("test", XTest, YTest)

    print("Creating ROC curve")
    probabilities = model.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=YTest, y_score=probabilities[:, 0],
                                          pos_label=0)
    plt.plot(fprSVM, trpSVM, 'b-', label='SVM')
def classify_SVC(train, test, kernel='rbf', verbose=False):
    """Fit an SVC on ``train`` and return class probabilities for ``test``.

    Args:
        train: ``(x, y)`` pair of training features and labels.
        test: ``(x, y)`` pair; only the features are used.
        kernel: kernel name passed to the ``SVC`` constructor.
        verbose: passed to the ``SVC`` constructor.

    Returns:
        The ``predict_proba`` output for the test features.
    """
    from sklearn.svm import SVC

    x_train, y_train = train
    # The kernel is a constructor option. It used to be passed to
    # predict_proba, which accepts no such argument and raised TypeError.
    clf = SVC(kernel=kernel, probability=True, verbose=verbose)
    clf.fit(x_train, y_train)

    x_test, _ = test
    return clf.predict_proba(x_test)
def pipeline(iteration, C, gamma, random_seed):
    """Fit one balanced RBF SVC, persist it and export test scores.

    Reads the module-level ``X``, ``y``, ``test_x`` and ``test_uid``. The
    fitted model is dumped to ``./model/svm<iteration>.pkl`` and the second
    probability column is written as ``score`` next to ``uid`` in
    ``./preds/svm_pred<iteration>.csv``.
    """
    clf = SVC(
        C=C,
        kernel='rbf',
        gamma=gamma,
        probability=True,
        cache_size=7000,
        class_weight='balanced',
        verbose=True,
        random_state=random_seed,
    )
    clf.fit(X, y)
    joblib.dump(clf, './model/svm{0}.pkl'.format(iteration))

    probabilities = clf.predict_proba(test_x)
    frame = pd.DataFrame(columns=["uid", "score"])
    frame["uid"] = test_uid
    frame["score"] = probabilities[:, 1]
    frame.to_csv('./preds/svm_pred{0}.csv'.format(iteration), index=None)
def svm_train(train_file, test_file):
    """Select and scale features, evaluate an SVC, then train it on all data.

    Keeps the 5000 best features by chi-squared, scales train and test
    features together (without centring), prints the ROC AUC on an 80/20
    split, and finally fits a model on every training row whose scores for
    the test file are written to ``/home/chuangxin/SVM_result.csv``.

    Returns:
        The model fitted on all training rows.
    """
    _, x, y = readFile(train_file)
    test_ids, tx = readFile(test_file)

    # feature selection
    from sklearn.feature_selection import SelectKBest, chi2
    fselect = SelectKBest(chi2, k=5000)
    x = fselect.fit_transform(x, y)
    tx = fselect.transform(tx)
    print(x.shape)
    print(tx.shape)

    # Scale train and test rows together, then split them apart again.
    from sklearn.preprocessing import scale
    n_train = x.shape[0]
    stacked = scale(np.concatenate((x, tx)), with_mean=False)
    x = stacked[:n_train]
    tx = stacked[n_train:]

    from sklearn.cross_validation import train_test_split
    train_i, test_i = train_test_split(np.arange(n_train), train_size=0.8,
                                       random_state=1024)
    train_x, train_y = x[train_i], y[train_i]
    test_x, test_y = x[test_i], y[test_i]

    from sklearn.svm import SVC
    from sklearn.metrics import roc_auc_score

    # Evaluate with a model that has not seen the held-out rows; scoring the
    # all-data model on them (as before) made the second AUC another
    # training score.
    eval_model = SVC(probability=True)
    eval_model.fit(train_x, train_y)
    res1 = eval_model.predict_proba(train_x)
    res2 = eval_model.predict_proba(test_x)
    score1 = roc_auc_score(train_y, res1[:, 1])
    score2 = roc_auc_score(test_y, res2[:, 1])
    print(score1)
    print(score2)

    # The returned model and the exported scores still use every training row.
    model = SVC(probability=True)
    model.fit(x, y)
    res = model.predict_proba(tx)
    output = pd.DataFrame(data={"id": test_ids, "sentiment": res[:, 1]})
    output.to_csv("/home/chuangxin/SVM_result.csv", index=False, quoting=3)
    return model
def fit_model_22(self, lol=2, toWrite=False):
    """Fit a sigmoid-kernel SVC on every split in ``self.cv_data``.

    Prints the log loss of the second probability column for each split.

    Args:
        lol: value passed to the SVC as ``coef0``.
        toWrite: when true, pickle the model (as fitted on the last split)
            to ``model22/model.pkl``.
    """
    model = SVC(probability=True, kernel='sigmoid', tol=1e-3, coef0=lol)

    for X_train, X_test, Y_train, Y_test in self.cv_data:
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 22 score: %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickles are binary data; `with` also guarantees the handle is
        # flushed and closed, which the bare open() never did.
        with open('model22/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def fit_model_20(self, lol=0.0025, toWrite=False):
    """Fit a linear-kernel SVC on every split in ``self.cv_data``.

    Prints the log loss of the second probability column for each split.

    Args:
        lol: unused; kept so existing callers keep working.
        toWrite: when true, pickle the model (as fitted on the last split)
            to ``model20/model.pkl``.
    """
    model = SVC(probability=True, kernel='linear', class_weight='auto', tol=1e-3)

    for X_train, X_test, Y_train, Y_test in self.cv_data:
        model.fit(X_train, Y_train)
        pred = model.predict_proba(X_test)[:, 1]
        print("Model 20 score: %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickles are binary data; `with` also guarantees the handle is
        # flushed and closed, which the bare open() never did.
        with open('model20/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
class PseudoRelevanceClassifierReranker:
    """Re-rank search results with classifiers trained on pseudo-relevance labels.

    The first ``r`` documents of a ranking are treated as positive examples
    and the last ``n`` as negative ones. A classifier trained on them scores
    every document, and that score is interpolated with the normalised
    search score using weight ``alpha``. With two classifier types, the two
    interpolated scores of each document are averaged.
    """

    def __init__(self, lucene_index: str, vectorizer_class: str, clf_type: List[ClassifierType], r=10, n=100, alpha=0.5):
        self.r = r
        self.n = n
        self.alpha = alpha
        self.clf_type = clf_type

        # get vectorizer
        module = importlib.import_module("pyserini.vectorizer")
        VectorizerClass = getattr(module, vectorizer_class)
        self.vectorizer = VectorizerClass(lucene_index, min_df=5)

        if len(clf_type) > 2:
            raise Exception('Re-ranker takes at most two classifiers')

    def _set_classifier(self, clf_type: ClassifierType):
        """Instantiate ``self.clf`` for the requested classifier type."""
        if clf_type == ClassifierType.LR:
            self.clf = LogisticRegression(random_state=42)
        elif clf_type == ClassifierType.SVM:
            self.clf = SVC(kernel='linear', probability=True, random_state=42)
        else:
            raise Exception("Invalid classifier type")

    def _get_prf_vectors(self, doc_ids: List[str]):
        """Return ``(train_vecs, train_labels, test_vecs)`` for a ranking."""
        train_docs = doc_ids[:self.r] + doc_ids[-self.n:]
        train_labels = [1] * self.r + [0] * self.n

        train_vecs = self.vectorizer.get_vectors(train_docs)
        test_vecs = self.vectorizer.get_vectors(doc_ids)

        return train_vecs, train_labels, test_vecs

    def _rerank_with_classifier(self, doc_ids: List[str], search_scores: List[float]):
        """Score all documents with ``self.clf`` and return them best-first."""
        train_vecs, train_labels, test_vecs = self._get_prf_vectors(doc_ids)

        # classification
        self.clf.fit(train_vecs, train_labels)
        pred = self.clf.predict_proba(test_vecs)
        classifier_scores = self._normalize([p[1] for p in pred])
        search_scores = self._normalize(search_scores)

        # interpolation
        interpolated_scores = [a * self.alpha + b * (1 - self.alpha)
                               for a, b in zip(classifier_scores, search_scores)]

        return self._sort_dual_list(interpolated_scores, doc_ids)

    def rerank(self, doc_ids: List[str], search_scores: List[float]):
        """Re-rank ``doc_ids``; returns ``(scores, doc_ids)``.

        With one classifier the result is sorted best-first. With two, the
        per-document average is returned in first-seen order, unsorted.
        """
        # one classifier
        if len(self.clf_type) == 1:
            self._set_classifier(self.clf_type[0])
            return self._rerank_with_classifier(doc_ids, search_scores)

        # two classifier with FusionMethod.AVG
        doc_score_dict = {}
        for i in range(2):
            self._set_classifier(self.clf_type[i])
            i_scores, i_doc_ids = self._rerank_with_classifier(doc_ids, search_scores)

            for score, doc_id in zip(i_scores, i_doc_ids):
                # A set, so two identical scores count once; the average is
                # that same value either way.
                doc_score_dict.setdefault(doc_id, set()).add(score)

        r_scores, r_doc_ids = [], []
        for doc_id, score in doc_score_dict.items():
            r_doc_ids.append(doc_id)
            r_scores.append(sum(score) / len(score))

        return r_scores, r_doc_ids

    def _normalize(self, scores: List[float]):
        """Min-max normalise ``scores`` into [0, 1].

        When every score is equal the range is zero and the normalisation is
        undefined; all scores then map to 0.0 instead of raising
        ZeroDivisionError. Equal scores carry no ranking information, so the
        constant chosen does not change the order.
        """
        low = min(scores)
        width = max(scores) - low
        if width == 0:
            return [0.0 for _ in scores]
        return [(s - low) / width for s in scores]

    # sort both list in decreasing order by using the list1 to compare
    def _sort_dual_list(self, list1, list2):
        """Sort the paired lists by ``(list1, list2)`` values, largest first."""
        pairs = sorted(zip(list1, list2))
        if not pairs:
            # Nothing to rank; unpacking an empty zip(*...) would raise.
            return [], []
        pairs.reverse()
        first, second = zip(*pairs)
        return list(first), list(second)
class SVM(RecognitionModel):
    """This is the Sci-Kit version of SVM.

    .. seealso:: `sklearn.svm.SVC <http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html>`_
    """

    def __init__(self, C=100, gamma=0.000001):
        """
        Initialize a SVM classifier. The default kernel is RBF(Radial Basis Function).

        :param C: The **C** parameter trades off misclassification of training examples
            against simplicity of the decision surface
        :param gamma: the gamma parameter defines how far the influence of a single
            training example reaches, with low values meaning 'far' and high values
            meaning 'close'

        .. seealso:: see more about **C** and **gamma** :
            `RBF SVM parameters <http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html>`_
        """
        super().__init__()
        # probability=True is required for predict_proba()/auto_predict().
        self.model = SVC(C=C, gamma=gamma, probability=True, cache_size=1024)
        self.parameters.append(('C', C))
        self.parameters.append(('gamma', gamma))

    @staticmethod
    def grid_search_c_gamma(samples, labels):
        """
        Grid Search for best combination of **C** and **gamma**.

        :param samples: the samples
        :param labels: the corresponding labels
        :return: the best combination of **C** and **gamma**
        :rtype: Dictionary

        .. note::
            The search runs on a default-constructed ``SVC()``; no kernel is
            overridden here, so the library's default kernel is the one tuned.
        """
        # GridSearch for gamma and C
        C_range = np.logspace(-2, 10, 13)
        gamma_range = np.logspace(-9, 3, 13)
        param_grid = dict(gamma=gamma_range, C=C_range)
        cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
        grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
        grid.fit(samples, labels)
        return grid.best_params_

    @staticmethod
    def reduce_class_3_4(samples, labels):
        """
        Keep samples labelled 0-2 unchanged, relabel class 5 as 3 and drop the rest.

        :param samples: the samples
        :param labels: the corresponding labels
        :return: ``(new_samples, new_labels)`` as two lists
        """
        new_samples = list()
        new_labels = list()
        for index, label in enumerate(labels):
            if label < 3:
                new_labels.append(label)
                new_samples.append(samples[index])
            if label == 5:
                new_labels.append(3)
                new_samples.append(samples[index])
        return new_samples, new_labels

    @staticmethod
    def reduce_class_1_5(samples, labels):
        """
        Keep class 0, shift classes 2-4 down by one and drop the rest.

        :param samples: the samples
        :param labels: the corresponding labels
        :return: ``(new_samples, new_labels)`` as two lists
        """
        new_samples = list()
        new_labels = list()
        for index, label in enumerate(labels):
            if label == 0:
                new_labels.append(0)
                new_samples.append(samples[index])
            if 1 < label < 5:
                new_labels.append(label - 1)
                new_samples.append(samples[index])
        return new_samples, new_labels

    @staticmethod
    def mixed_demo(samples, labels):
        """
        This is a demo trying to combine Feature Selection and Grid Search.
        But it's too slow to produce a good result. To be improved.

        :param samples: the samples
        :param labels: the corresponding labels
        """
        X, y = samples, labels

        # This dataset is way too high-dimensional. Better do PCA:
        pca = PCA(n_components=2)

        # Maybe some original features where good, too?
        selection = SelectKBest(k=1)

        # Build estimator from PCA and Univariate selection:
        combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])

        # Use combined features to transform dataset:
        X_features = combined_features.fit(X, y).transform(X)

        svm = SVC(kernel="linear")

        # Do grid search over k, n_components and C:
        pipeline = Pipeline([("features", combined_features), ("svm", svm)])

        param_grid = dict(features__pca__n_components=[1, 2, 3],
                          features__univ_select__k=[1, 2],
                          svm__C=[0.1, 1, 10])

        grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10)
        # NOTE(review): the search is fitted on the already-transformed
        # X_features although the pipeline applies the feature union again;
        # fitting on the raw X was the earlier variant. Confirm this is
        # intended before relying on the result.
        grid_search.fit(X_features, y)
        logging.info(grid_search.best_estimator_)

    def train(self, samples, labels):
        """
        Train the model.

        :param samples: the samples
        :type samples: [float]
        :param labels: the corresponding labels
        :type labels: [str]
        """
        self.model.fit(samples, labels)

    def predict(self, samples):
        """
        Predict the samples.

        :param samples: the samples
        :return: a list of label-result
        """
        return self.model.predict(samples)

    def predict_proba(self, samples):  # Seems unused
        """
        Predict the probabilities of each given sample to each label.

        :param samples: the samples
        :return: a list of probas-result
        """
        return self.model.predict_proba(samples)

    def evaluate_model(self, samples, labels):
        """
        Evaluate the model using the given samples and labels.

        :param samples: the samples
        :type samples: [float]
        :param labels: the corresponding labels
        :type labels: [str]
        :return: The Evaluation of the model
        :rtype: Evaluation
        """
        resp = self.predict(samples)
        reco = (labels == resp).mean()

        # The matrix axes must cover predicted labels as well: a prediction
        # that never occurs in ``labels`` would otherwise have no row/column.
        class_names = sorted(set(labels) | set(resp))
        index_of = {name: position for position, name in enumerate(class_names)}
        nb_classes = len(class_names)

        confusion = np.zeros((nb_classes, nb_classes), np.int32)
        # the vertical i is the correct answer,
        # while the horizon j is the prediction result
        for i, j in zip(labels, resp):
            confusion[index_of[i], index_of[j]] += 1

        return Evaluation(reco, confusion)

    def train_and_evaluate(self, samples_train, labels_train, samples_test, labels_test):
        """
        A combination of train() and evaluate_model()

        :param samples_train: the samples to train the model
        :param labels_train: the corresponding labels to train samples
        :param samples_test: the samples to evaluate the model
        :param labels_test: the corresponding labels to evaluate samples
        :return: the result of :meth:`evaluate_model` on the test data
        :rtype: Evaluation
        """
        self.model.fit(samples_train, labels_train)
        return self.evaluate_model(samples_test, labels_test)

    def auto_predict(self, samples):
        """This function will use the model to predict the samples, generate
        probability map and the final result.

        :param samples: the samples
        :return: ``(probs, labels)`` where ``probs`` covers every sample and
            ``labels`` is a one-element list holding the predicted label of
            the first sample only
        """
        probs = self.model.predict_proba(samples)
        labels = list()
        labels.append(self.model.predict(samples)[0])
        return probs, labels

    def set_parameters(self, parameters: {str, float}):
        """
        Replace the stored parameter list and update the model.

        :param parameters: mapping that provides the keys ``'C'`` and ``'gamma'``
        """
        self.parameters = list()
        self.model.C = parameters['C']
        self.model.gamma = parameters['gamma']
        self.parameters.append(('C', self.model.C))
        self.parameters.append(('gamma', self.model.gamma))

    def auto_config(self, samples, labels):
        """
        Run :meth:`grid_search_c_gamma` and apply the best parameters found.

        :param samples: the samples
        :param labels: the corresponding labels
        """
        best_parameters = self.grid_search_c_gamma(samples, labels)
        self.set_parameters(best_parameters)
# reduce the dimension # X_train_redu = tsne.fit_transform(X_train_std) # perform classification svm = SVC(kernel='rbf', random_state=0, gamma=GAMMA, C=C, verbose=True, probability=True) svm.fit(X_train_redu, y_train) # performance metrics y_train_pred = svm.predict(X_train_redu) # the training set predictions scores = svm.predict_proba(X_train_redu) print(np.shape(scores)) ACCURACY_SCORE = svm.score(X_train_redu, y_train, sample_weight=None) NMI = metrics.adjusted_mutual_info_score(y_train, y_train_pred) # print the summary print('\n') print('Time Points Used : ', DIV) print('Total Datapoints used : ', np.size(X_train_redu, 0)) print('Remove Low Variance Variables : ', REMOVE_LOW_VARIANCE_PARAMS) if (REMOVE_LOW_VARIANCE_PARAMS): print('Low Variance Threshold : ', LOW_VARIANCE_THRESHOLD) print('Remove Outliers : ', REMOVE_OUTLIERS) if (REMOVE_OUTLIERS): print('Absolute distance to the median : ', M) print('Perplexity of TSNE : ', PERPLEXITY)
# Scatter plot of r_mag against v_mag, coloured by the 'errors' column of dfe.
fig = plt.figure()
sns.scatterplot(data=dfe, x='r_mag', y='v_mag', hue='errors', markers=['o', 's', 'v', 'D'])
plt.savefig('./plots/mag_errors.png', bbox_inches="tight")
# Same two columns as a 2-D histogram.
plt.figure()
sns.histplot(dfe, x='r_mag', y='v_mag', hue='errors')
plt.savefig('./plots/error_hist_mag.png', bbox_inches="tight")
# %%
# Toggle for the probability-difference plot below.
run_proba = True
if run_proba:
    # Predict on every column of dfe except 'errors'.
    probs = best_model.predict_proba(dfe.drop(columns=['errors']))
    # Keep only the first two probability columns and plot their difference.
    probs = probs[:, 0:2]
    prob_diff = probs[:, 0] - probs[:, 1]
    dfp = pd.DataFrame()
    dfp['probability'] = prob_diff
    dfp['error_type'] = dfe.errors
    sns.catplot(x='error_type', y='probability', data=dfp)
    plt.xticks(rotation=45)
    plt.savefig('./plots/error_probs.png', bbox_inches="tight")
# %%
# Toggle for the t-SNE section below.
run_tsne = True
if run_tsne:
    cl_1, cl_2 = 0, 1
    # x_<true><pred>: test rows grouped by (true class, predicted class).
    x_11 = X_test[(y_test == cl_1) & (y_pred == cl_1)]
    x_12 = X_test[(y_test == cl_1) & (y_pred == cl_2)]
    x_21 = X_test[(y_test == cl_2) & (y_pred == cl_1)]
def Co_KNN_SVM(train_Y, train_X, test_Y, test_X, savepath=None):
    """Co-train a KNN and an SVM classifier by exchanging pseudo-labelled samples.

    Each iteration fits both classifiers on their own training pool and scores
    them on the fixed, unmodified test set. In every iteration but the last,
    each classifier then pseudo-labels its remaining test pool; the samples
    chosen by the ``utilities`` confidence helpers are moved into the *other*
    classifier's training pool and removed from the pool they came from.

    The SVM with the highest accuracy on the fixed test set is kept and, when
    ``savepath`` is given, dumped there with joblib.

    :param train_Y: training labels
    :param train_X: training features
    :param test_Y: test labels
    :param test_X: test features
    :param savepath: where to dump the best SVM; nothing is written when ``None``
    """
    model_max = None
    accuracy_max = 0
    # Number of samples handed to the other classifier's training set per iteration
    temp_num_svm = 44
    temp_num_knn = 44
    # Number of iterations
    loop_num = 10
    # K of the KNN classifier
    K = 4
    # Samples and labels used to score KNN and SVM (never modified)
    fixed_test_X = test_X.copy()
    fixed_test_Y = test_Y.copy()
    # Accuracy history of KNN
    accuracy_knn_list = []
    # Accuracy history of SVM
    accuracy_svm_list = []

    # (label, features) tuples of the KNN training pool
    train_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    # (label, features) tuples of the KNN test pool
    test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())
    # (label, features) tuples of the SVM training pool
    train_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        train_Y.copy(), train_X.copy())
    # (label, features) tuples of the SVM test pool
    test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
        test_Y.copy(), test_X.copy())

    # Co-training
    for h in range(1, loop_num + 1):
        print(len(train_knn_Y_X_tuple_list))
        print(len(test_knn_Y_X_tuple_list))
        print(len(train_svm_Y_X_tuple_list))
        print(len(test_svm_Y_X_tuple_list))

        # Labels and features of the SVM training pool
        train_Y_svm_from_tuple, train_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_svm_Y_X_tuple_list.copy())
        # Labels and features of the SVM test pool
        test_Y_svm_from_tuple, test_X_svm_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_svm_Y_X_tuple_list.copy())
        # Labels and features of the KNN training pool
        train_Y_knn_from_tuple, train_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            train_knn_Y_X_tuple_list)
        # Labels and features of the KNN test pool
        test_Y_knn_from_tuple, test_X_knn_from_tuple = utilities.get_Y_and_X_list_from_tuple(
            test_knn_Y_X_tuple_list)

        # KNN: fit and score on the fixed test set
        knn = KNeighborsClassifier(n_neighbors=K, weights='distance')
        knn.fit(train_X_knn_from_tuple, train_Y_knn_from_tuple)
        accuracy_knn = knn.score(fixed_test_X, fixed_test_Y)
        accuracy_knn_list.append(accuracy_knn * 100)
        print("预测结果(KNN)")
        print(h)
        print(accuracy_knn)

        # SVM: fit and score on the fixed test set
        svc = SVC(C=15, kernel='rbf', degree=3, gamma=2, probability=True)
        svc.fit(train_X_svm_from_tuple, train_Y_svm_from_tuple)
        accuracy_svm = svc.score(fixed_test_X, fixed_test_Y)
        accuracy_svm_list.append(accuracy_svm * 100)
        print("预测结果(SVM)")
        print(h)
        print(accuracy_svm)

        # Remember the best-scoring SVM seen so far
        if accuracy_svm > accuracy_max:
            accuracy_max = accuracy_svm
            model_max = svc

        # The last iteration only evaluates; no further sample exchange
        if h == loop_num:
            break

        # Semi-supervised exchange between KNN and SVM
        # ---------------- KNN: predict its test pool and compute confidences ----------------
        probility_knn = knn.predict_proba(test_X_knn_from_tuple)
        # Confidence of KNN for each sample of its test pool
        confidence_knn_list = []
        for i in range(0, probility_knn.shape[0]):
            probility_knn_temp = probility_knn[i]
            confidence_knn_list.append(
                utilities.get_confidence_knn(probility_knn_temp.copy()))
        # Predicted labels
        predict_Y_knn = knn.predict(test_X_knn_from_tuple)

        # ---------------- SVM: predict its test pool and compute confidences ----------------
        probility_svm = svc.predict_proba(test_X_svm_from_tuple)
        # Confidence of SVM for each sample of its test pool
        confidence_svm_list = []
        for i in range(0, probility_svm.shape[0]):
            probility_svm_temp = probility_svm[i]
            confidence_svm_list.append(
                utilities.get_confidence_svm(probility_svm_temp.copy()))
        # Predicted labels
        predict_Y_svm = svc.predict(test_X_svm_from_tuple)

        # Pseudo-label transfer
        # ---------------- SVM -> KNN ----------------
        index_svm_label_high_confidence = utilities.get_confidence_svm_index(
            confidence_svm_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_svm)

        temp_test_X_svm = []
        temp_test_Y_svm = []
        for i in index_svm_label_high_confidence:
            temp_test_X_svm.append(test_X_svm_from_tuple[i])
            temp_test_Y_svm.append(predict_Y_svm[i])
        temp_test_svm_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_svm.copy(), temp_test_X_svm.copy())

        # Add the samples selected from the SVM side to the KNN training pool
        train_knn_Y_X_tuple_list.extend(temp_test_svm_Y_X_tuple_list)

        # Remove the transferred samples from the SVM test pool
        index_all_test_svm_Y_X_tuple_list = np.arange(
            0, len(test_svm_Y_X_tuple_list))
        diff_index_test_svm_Y_X_tuple_list = np.setdiff1d(
            index_all_test_svm_Y_X_tuple_list,
            np.array(index_svm_label_high_confidence))
        diff_test_svm_Y_X_tuple_list = []
        for i in diff_index_test_svm_Y_X_tuple_list:
            diff_test_svm_Y_X_tuple_list.append(test_svm_Y_X_tuple_list[i])
        test_svm_Y_X_tuple_list = diff_test_svm_Y_X_tuple_list

        # ---------------- KNN -> SVM ----------------
        index_knn_label_high_confidence = utilities.get_confidence_knn_index(
            confidence_knn_list.copy(), predict_Y_svm.copy(),
            predict_Y_knn.copy(), temp_num_knn)

        temp_test_X_knn = []
        temp_test_Y_knn = []
        for i in index_knn_label_high_confidence:
            temp_test_X_knn.append(test_X_knn_from_tuple[i])
            temp_test_Y_knn.append(predict_Y_knn[i])
        temp_test_knn_Y_X_tuple_list = utilities.get_Y_X_tuple_list(
            temp_test_Y_knn.copy(), temp_test_X_knn.copy())

        # Add the samples selected from the KNN side to the SVM training pool
        train_svm_Y_X_tuple_list.extend(temp_test_knn_Y_X_tuple_list)

        # Remove the transferred samples from the KNN test pool
        index_all_test_knn_Y_X_tuple_list = np.arange(
            0, len(test_knn_Y_X_tuple_list))
        diff_index_test_knn_Y_X_tuple_list = np.setdiff1d(
            index_all_test_knn_Y_X_tuple_list,
            np.array(index_knn_label_high_confidence))
        diff_test_knn_Y_X_tuple_list = []
        for i in diff_index_test_knn_Y_X_tuple_list:
            diff_test_knn_Y_X_tuple_list.append(test_knn_Y_X_tuple_list[i])
        test_knn_Y_X_tuple_list = diff_test_knn_Y_X_tuple_list

    if model_max is not None:
        print(accuracy_max * 100)
        # ``savepath`` defaults to None; only persist the model when a
        # destination was actually supplied.
        if savepath is not None:
            joblib.dump(model_max, savepath)

    print("KNN的准确率:")
    print(accuracy_knn_list)
    print("SVM的准确率:")
    print(accuracy_svm_list)
# Smoke test: build one feature vector from the embeddings of a word and an
# aspect, then run it through the classifier `svc`.
w2vec_model = Word2Vec.load('data/raw/amazon/Electronics.bin')
alx = pd.read_csv('data/processed/lexicon_table_asp_raw_09.csv', index_col=['WORD', 'ASP'])
# reshape(1, -1) turns each embedding into a single-row 2-D array
_w = w2vec_model['killer'].reshape(1, -1)
_a = w2vec_model['performance'].reshape(1, -1)
# `variant` selects how the two embeddings are combined
if variant == 'avg':
    _x = (_w + _a) / 2
elif variant == 'add':
    _x = _w + _a
else:
    raise NotImplementedError
svc.predict(_x)
svc.predict_proba(_x)

# SCORE
# -----
conf_threshold = 0.7
w2vec_model = Word2Vec.load('data/raw/amazon/Electronics.bin')
lx_df = pd.read_csv('data/processed/lexicon_table_asp_raw_09.csv', index_col=['WORD', 'ASP'])
lx_words = lx_df.index.tolist()  # 119673 (13297x9)
vocabs = set(w2vec_model.wv.index2entity)  # 43750
# keep only (word, aspect) pairs whose word is in the embedding vocabulary
score_words = [(w, a) for w, a in lx_words if w in vocabs]  # 51183 (5687x9)
prediction = []
for w, a in score_words:
    _w = w2vec_model[w].reshape(1, -1)
activation='relu') net = tflearn.fully_connected(net, 2, activation='softmax') net = tflearn.regression(net) # Training clf = tflearn.DNN(net, tensorboard_verbose=0) clf.fit(X_train, labels_train) fprs = [] fnrs = [] thresholds = [] if THRESHOLD: if classifier == 'mlp': pred = clf.predict(X_test) else: pred = clf.predict_proba(X_test) for t in np.arange(0, 1, 0.0005): pred_t = prob_to_class_threshold(pred, t) print "Threshold:", t # print "Accuracy:", acc print "Accuracy:", accuracy_score(labels_test, pred_t) print "F1Score:", f1_score(labels_test, pred_t) print "Recall:", recall_score(labels_test, pred_t) print "Precision:", precision_score(labels_test, pred_t) cm = confusion_matrix(labels_test, pred_t) print cm tn = cm[0][0] fn = cm[1][0] tp = cm[1][1] fp = cm[0][1] f1 = f1_score(labels_test, pred_t)
def classifier(args, args_mode, dataset, sess):
    """Train a classifier on image embeddings, or classify images with a saved one.

    Embeddings for every image of ``dataset`` are computed with the graph
    loaded in ``sess``. In ``'TRAIN'`` mode the classifier selected by
    ``args.classifier`` is fitted on them and pickled, together with the class
    names, to ``args.classifier_filename``. In ``'CLASSIFY'`` mode that pickle
    is loaded, every image is classified, per-image results are printed and
    written to a timestamped CSV in ``args.data_dir``, and summary statistics
    are printed.

    :param args: options object; reads ``batch_size``, ``image_size``,
        ``classifier_filename``, ``classifier`` and ``data_dir``
    :param args_mode: ``'TRAIN'`` or ``'CLASSIFY'``
    :param dataset: iterable of classes exposing ``name`` and ``image_paths``
    :param sess: TensorFlow session used to run the embedding tensor
    :raises AssertionError: if a class has no images
    :raises ValueError: in TRAIN mode, if ``args.classifier`` is not recognised
    """

    def _mean(total, count):
        # Average that stays defined when a bucket is empty (e.g. no wrong
        # predictions at all) instead of raising ZeroDivisionError.
        return float(total) / count if count else 0.0

    # Check that there are at least one training image per class
    for cls in dataset:
        if len(cls.image_paths) < 1:
            print(cls.image_paths, "@@@@@@@@@@@@@@@@@@@@@@@")
        # The parenthesised ``assert (cond, msg)`` form asserts a non-empty
        # tuple and can never fail, so the check is written without parentheses.
        assert len(cls.image_paths) > 0, \
            'There must be at least one image for each class in the dataset'

    paths, labels, class_labels = get_image_paths_and_labels(dataset)

    print('Number of classes: %d' % len(dataset))
    print('Number of images: %d' % len(paths))

    # Get input and output tensors
    images_placeholder = tf.get_default_graph().get_tensor_by_name("input:0")
    embeddings = tf.get_default_graph().get_tensor_by_name("embeddings:0")
    phase_train_placeholder = tf.get_default_graph().get_tensor_by_name("phase_train:0")
    embedding_size = embeddings.get_shape()[1]

    # Run forward pass to calculate embeddings
    print('Calculating features for images')
    nrof_images = len(paths)
    nrof_batches_per_epoch = int(math.ceil(1.0 * nrof_images / args.batch_size))
    emb_array = np.zeros((nrof_images, embedding_size))
    for i in range(nrof_batches_per_epoch):
        start_index = i * args.batch_size
        end_index = min((i + 1) * args.batch_size, nrof_images)
        paths_batch = paths[start_index:end_index]
        images = facenet.load_data(paths_batch, False, False, args.image_size)
        feed_dict = {images_placeholder: images, phase_train_placeholder: False}
        emb_array[start_index:end_index, :] = sess.run(embeddings, feed_dict=feed_dict)

    classifier_filename_exp = os.path.expanduser(args.classifier_filename)

    if (args_mode == 'TRAIN'):
        # Train classifier
        print('Training classifier+++++++++++++++++++++++++', args.classifier)
        if args.classifier == 'LinearSvm':
            # clf = SVC(C=1, kernel='linear', probability=True)
            model = SVC(kernel='linear', probability=True)
        elif args.classifier == 'GridSearchSvm':
            print("""
        Warning: In our experiences, using a grid search over SVM hyper-parameters only
        gives marginally better performance than a linear SVM with C=1 and
        is not worth the extra computations of performing a grid search.
        """)
            param_grid = [
                {'C': [1, 10, 100, 1000],
                 'kernel': ['linear']},
                {'C': [1, 10, 100, 1000],
                 'gamma': [0.001, 0.0001],
                 'kernel': ['rbf']}
            ]
            model = GridSearchCV(SVC(C=1, probability=True), param_grid, cv=5)
        elif args.classifier == 'GMM':  # Doesn't work best
            # NOTE(review): nClasses is not defined in this function; it must
            # come from module scope - confirm.
            model = GMM(n_components=nClasses)
        # ref:
        # http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html#example-classification-plot-classifier-comparison-py
        elif args.classifier == 'RadialSvm':  # Radial Basis Function kernel
            # works better with C = 1 and gamma = 2
            model = SVC(C=1, kernel='rbf', probability=True, gamma=2)
        elif args.classifier == 'DecisionTree':  # Doesn't work best
            model = DecisionTreeClassifier(max_depth=20)
        elif args.classifier == 'GaussianNB':
            model = GaussianNB()
        # ref: https://jessesw.com/Deep-Learning/
        elif args.classifier == 'DBN':
            from nolearn.dbn import DBN
            # NOTE(review): labelsNum is not defined in this function; it must
            # come from module scope - confirm.
            model = DBN([embeddings.shape[1], 500, labelsNum[-1:][0] + 1],  # i/p nodes, hidden nodes, o/p nodes
                        learn_rates=0.3,
                        # Smaller steps mean a possibly more accurate result, but the
                        # training will take longer
                        learn_rate_decays=0.9,
                        # a factor the initial learning rate will be multiplied by
                        # after each iteration of the training
                        epochs=300,  # no of iternation
                        # dropouts = 0.25, # Express the percentage of nodes that
                        # will be randomly dropped as a decimal.
                        verbose=1)
        elif args.classifier == 'KNN':
            model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                                         metric_params=None, n_jobs=1, n_neighbors=5, p=2,
                                         weights='uniform')
        else:
            # Without this branch an unknown name only surfaced as an unbound
            # ``model`` at the fit call below.
            raise ValueError('Unknown classifier: %r' % (args.classifier,))

        model.fit(emb_array, labels)

        # Create a list of class names
        class_names = [cls.name.replace('_', ' ') for cls in dataset]

        # Saving classifier model
        with open(classifier_filename_exp, 'wb') as outfile:
            pickle.dump((model, class_names), outfile)
        print('Saved classifier model to file "%s"' % classifier_filename_exp)

    elif (args_mode == 'CLASSIFY'):
        # Classify images
        print('Testing classifier~~~~~~~~~~~~~~~~~~~~~~~~')
        # NOTE: pickle.load executes code from the file; only load classifier
        # files that come from a trusted source.
        with open(classifier_filename_exp, 'rb') as infile:
            (model, class_names) = pickle.load(infile)

        print('Loaded classifier model from file "%s"' % classifier_filename_exp)

        correctPrediction = 0
        inCorrectPrediction = 0
        sumConfidence = 0.0
        correctConfidence = 0.0
        inCorrectConfidence = 0.0

        predictions = model.predict_proba(emb_array)
        best_class_indices = np.argmax(predictions, axis=1)
        best_class_probabilities = predictions[np.arange(len(best_class_indices)), best_class_indices]

        results = {'name': [], 'bestname': [], 'probabilities': []}
        for i in range(len(best_class_indices)):
            best_name = class_names[best_class_indices[i]]
            best_probability = best_class_probabilities[i]
            print('%4d %s:%s: %.3f' % (i, class_labels[i], best_name, best_probability))
            results['name'].append(class_labels[i])
            results['bestname'].append(best_name)
            results['probabilities'].append(best_probability)

            sumConfidence += best_probability
            if (class_labels[i] == best_name):
                correctPrediction += 1
                correctConfidence += best_probability
            else:
                inCorrectPrediction += 1
                inCorrectConfidence += best_probability

        total_predictions = correctPrediction + inCorrectPrediction
        accuracy = _mean(correctPrediction, total_predictions)
        Avg_Confidence = _mean(sumConfidence, total_predictions)
        Avg_correctConfidence = _mean(correctConfidence, correctPrediction)
        Avg_inCorrectConfidence = _mean(inCorrectConfidence, inCorrectPrediction)

        # The overall accuracy is appended as a final row of the CSV.
        results['name'].append('Accuracy:')
        results['bestname'].append('Accuracy:')
        results['probabilities'].append(accuracy)

        dataname = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
        data_frame = pd.DataFrame(
            data={'name': results['name'], 'bestname': results['bestname'],
                  'probabilities': results['probabilities']})
        data_frame.to_csv(args.data_dir + '/results_' + dataname + '.csv')

        print("Correct Prediction :" + str(correctPrediction))
        print("In-correct Prediction: " + str(inCorrectPrediction))
        print('Accuracy: %.3f' % accuracy)
        print("Avg Confidence: " + str(Avg_Confidence))
        print("Avg CorrectConfidence: " + str(Avg_correctConfidence))
        print("Avg inCorrectConfidence: " + str(Avg_inCorrectConfidence))
clf_svm = SVC(probability=True) clf_rf = RandomForestClassifier(n_estimators=100) y_test_list = [] y_pred_list_dtcr = [] y_pred_list_svm = [] y_pred_list_rf = [] for i in range(100): DTCRS.Get_Train_Valid_Test() DTCRS.Train(use_only_seq=True, num_fc_layers=0, units_fc=256) y_pred_list_dtcr.append(DTCRS.y_pred) #Kmer clf_svm.fit(kmer_features[DTCRS.train[6]], np.argmax(DTCRS.train[-1], -1)) svm_pred = clf_svm.predict_proba(kmer_features[DTCRS.test[6]]) y_pred_list_svm.append(svm_pred) #RF clf_rf.fit(kmer_features[DTCRS.train[6]], np.argmax(DTCRS.train[-1], -1)) rf_pred = clf_rf.predict_proba(kmer_features[DTCRS.test[6]]) y_pred_list_rf.append(rf_pred) y_test_list.append(DTCRS.test[-1]) auc = [] method = [] antigen = [] for y_test, y_pred_svm, y_pred_dtcr, y_pred_rf in zip(y_test_list, y_pred_list_svm, y_pred_list_dtcr,
#%% #%% clf_svc = SVC(probability=True) #%% clf_lgbm = LGBMClassifier() #%% clf_svc.fit(prepedX, y) #%% clf_lgbm.fit(prepedX, y) #%% y_svc = clf_svc.predict_proba(prepedX) #%% y_lgbm = clf_lgbm.predict_proba(prepedX) #%% eclf = VotingClassifier(estimators=[ ('svc', clf_svc), ('lgbm', clf_lgbm), ], voting='soft') #%% eclf.fit(prepedX, y) #%%
''' from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score, roc_curve, roc_auc_score from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split from sklearn.svm import SVC import matplotlib.pyplot as plt data = load_iris() X_train, X_test, Y_train, Y_test = train_test_split(data.data, data.target, test_size=0.3) svm_model = SVC(kernel='rbf', random_state=0, probability=True) svm_model.fit(X_train, Y_train) y_pre = svm_model.predict(X_test) y_pre_ = svm_model.predict_proba(X_test) FPR, RECALL, thresholds = roc_curve(y_true=Y_test, y_score=y_pre_[:, 1], pos_label=1) # y_score=y_pre_[:,1] 也可以用svm_model.decision_function(X_test)作为参数,根据距离决策边界来分类 auc = roc_auc_score(Y_test, y_score=y_pre_, multi_class='ovo', labels=[0, 1, 2], max_fpr=1.0) #这里是多分类,用multi_class参数 print(auc) import matplotlib.pyplot as plt
def classificationResults(feature, results, featureDescription):
    """Fit the selected classifier on one train/test split and print its metrics.

    The model is chosen by the module-level ``classifier`` string
    ("Logistic Regression", "Multinomial Bayes" or "Random Forest"); any other
    value selects a linear-kernel SVC. Confusion counts, accuracy, precision,
    recall, false positive rate, AUC and balanced accuracy are printed.

    :param feature: feature rows, converted with ``np.array``
    :param results: labels, converted with ``np.array``; a truthy label is
        counted as the positive class
    :param featureDescription: text printed to identify the tested feature(s)
    """
    # Convert the lists passed into the function to arrays.
    X = np.array(feature)
    y = np.array(results)

    # Splits the data into training and testing sets using 5 split k fold.
    # NOTE: every fold overwrites the previous one, so only the LAST of the
    # five splits is actually used for training and evaluation below.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=12345)
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

    # Build only the model that was asked for; the SVC is the fallback rather
    # than being fitted first and then thrown away.
    if classifier == "Logistic Regression":
        model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=1000)
    elif classifier == "Multinomial Bayes":
        model = MultinomialNB()
    elif classifier == "Random Forest":
        model = RandomForestClassifier()
    else:
        model = SVC(gamma='scale', kernel='linear', probability=True)
    model.fit(X_train, y_train)

    # Generates a prediction for each sentence.
    predictions = model.predict(np.array(X_test))

    # Calculates true positives, true negatives, false positives and false negatives:
    truePositives = 0
    trueNegatives = 0
    falsePositives = 0
    falseNegatives = 0
    for actual, prediction in zip(y_test, predictions):
        if actual and prediction:
            # formal sentence predicted to be formal
            truePositives += 1
        elif not actual and not prediction:
            # informal sentence predicted to be informal
            trueNegatives += 1
        elif not actual and prediction:
            # informal sentence predicted to be formal
            falsePositives += 1
        else:
            # formal sentence predicted to be informal
            falseNegatives += 1
    numberInList = len(predictions)

    def _ratio(numerator, denominator):
        # None marks "undefined" (zero denominator); a genuine 0.0 is kept so
        # it is reported as a value rather than as N/A.
        return numerator / denominator if denominator > 0 else None

    # Performance metrics
    accuracy = _ratio(truePositives + trueNegatives,
                      truePositives + trueNegatives + falsePositives + falseNegatives)
    precision = _ratio(truePositives, truePositives + falsePositives)
    recall = _ratio(truePositives, truePositives + falseNegatives)
    # 'Fallout' is the same as the false positive rate.
    fallout = _ratio(falsePositives, trueNegatives + falsePositives)
    balAccuracy = balanced_accuracy_score(y_test, predictions)

    # Area under roc curve calculations (column 1 of predict_proba is the score)
    y_scores = model.predict_proba(X_test)
    y_scores = y_scores[:, 1]
    rocAreaUnderCurve = roc_auc_score(y_test, y_scores)

    # Console output
    print("\nRESULTS SUMMARY\n" + "---------------\n")
    print("Feature(s) tested: ", featureDescription)
    print("Classifier: " + classifier, "\n")
    print("Total predictions: ", numberInList)
    print("TRUE POSITIVES: ", truePositives)
    print("FALSE POSITIVES: ", falsePositives)
    print("TRUE NEGATIVES: ", trueNegatives)
    print("FALSE NEGATIVES: ", falseNegatives)

    # Division by zero is illegal, so if the denominator is zero, then 'N/A'
    # is given as the metric's value.
    if accuracy is not None:
        print("Accuracy: %3.2f" % accuracy)
    else:
        print("Accuracy: N/A")
    if precision is not None:
        print("Precision: %3.2f" % precision)
    else:
        print("Precision: N/A")
    if recall is not None:
        print("Recall: %3.2f" % recall)
    else:
        print("Recall: N/A")
    if fallout is not None:
        print("False positive rate: %3.2f" % fallout)
    else:
        print("False positive rate: N/A")
    print("AUC: %3.2f" % rocAreaUnderCurve)
    print("Balanced accuracy: %3.2f" % balAccuracy)
def active_most_proba_svm(difficulty='EASY', num_init_label=500, max_labels=2500):
    """Active learning with an SVM that queries the most-probably-positive point.

    Starting from ``num_init_label`` randomly labelled training points
    (extended until both classes are present), each round fits an SVC on the
    labelled points, then labels the unlabelled training point with the
    highest predicted probability for class 1. A second SVC trained on an
    equally sized, randomly chosen labelled set and a constant ``DefaultModel``
    are evaluated alongside it. Errors and F1 scores on the test set are
    printed every round and written to ``output/``; predictions for the
    blinded set are written with ``write_prediction``.

    :param difficulty: the difficulty as a string, 'EASY' or 'MODERATE';
        selects the ``<DIFFICULTY>_TRAIN/TEST/BLINDED.csv`` files
    :param num_init_label: number of randomly labelled points to start from
    :param max_labels: stop once this many training points are labelled
    :raises ValueError: if no model was trained because the initial labelled
        set already reaches the stopping size
    """
    random.seed(0)
    num_init_label_copy = num_init_label

    # Generate the data. y_train holds the true labels (0 or 1).
    X_train, y_train = read_train_test('{}_TRAIN.csv'.format(
        difficulty.upper()))
    X_test, y_test = read_train_test('{}_TEST.csv'.format(difficulty.upper()))
    num_samples = X_train.shape[0]
    num_test = X_test.shape[0]
    num_features = X_train.shape[1]

    assert y_train.shape == (num_samples, 1)
    assert X_test.shape == (num_test, num_features)
    assert y_test.shape == (num_test, 1)

    # The ``np.int`` alias no longer exists in NumPy; the builtin ``int``
    # is the same dtype.
    selected_label = np.full((num_samples, 1), -1, dtype=int)
    selected_mask = np.full((num_samples, 1), 0, dtype=int)

    # fill a base number of samples to selected
    for _ in range(num_init_label):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    # continue to fill until has at least a 1 and a 0
    while not (np.any(selected_label == 0) and np.any(selected_label == 1)):
        x = select_random_unlabeled_point(selected_mask)
        selected_mask[x, 0] = 1
        selected_label[x, 0] = y_train[x, 0]

    # Feature selection for the active learner, fitted on its initial labels.
    selector = SelectKBest(chi2, k=25)
    selector.fit(select(X_train, selected_mask),
                 select(selected_label, selected_mask))
    current_model = None

    # Random baseline: same number of labelled points, its own selector.
    r_label = np.full((num_samples, 1), -1, dtype=int)
    r_mask = np.full((num_samples, 1), 0, dtype=int)
    for _ in range(np.sum(selected_mask)):
        x = select_random_unlabeled_point(r_mask)
        r_mask[x, 0] = 1
        r_label[x, 0] = y_train[x, 0]
    r_selector = SelectKBest(chi2, k=25)
    r_selector.fit(select(X_train, r_mask), select(r_label, r_mask))

    hB = DefaultModel()
    B_predictions = hB.predict(X_test)

    # metrics needs to be recorded
    svm_errors = []
    random_errors = []
    blank_errors = []
    svm_f1s = []
    random_f1s = []
    blank_f1s = []

    t = np.sum(selected_mask)
    # Never ask for more labels than there are training points.
    label_budget = min(max_labels, num_samples)
    while np.sum(selected_mask) < label_budget:
        t = np.sum(selected_mask)
        model = SVC(class_weight='balanced', probability=True)
        labels_ = select(selected_label, selected_mask)
        model.fit(selector.transform(select(X_train, selected_mask)),
                  np.reshape(labels_, labels_.size))
        current_model = model

        predictions_with_proba = model.predict_proba(
            selector.transform(X_train))
        assert predictions_with_proba.shape == (num_samples, 2)
        classes = model.classes_
        assert classes.shape == (2, )
        pos_class_idx = np.where(classes == 1)[0][0]
        assert pos_class_idx == 0 or pos_class_idx == 1

        # Query the unlabelled point most likely to be positive. Restricting
        # the argmax to unlabelled indices guarantees a new point is labelled
        # every round, even if every candidate has probability 0.
        unlabeled_idx = np.flatnonzero(selected_mask[:, 0] == 0)
        max_idx = unlabeled_idx[np.argmax(
            predictions_with_proba[unlabeled_idx, pos_class_idx])]
        selected_mask[max_idx, 0] = 1
        selected_label[max_idx, 0] = y_train[max_idx, 0]

        predictions = model.predict(selector.transform(X_test))
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        svm_error = np.sum(np.absolute(np.subtract(predictions,
                                                   y_test))) / y_test.size
        print('SVM error after {} queries is {}'.format(t, svm_error))
        svm_errors.append(svm_error)
        svm_f1_score = f1_score(y_test, predictions)
        print('SVM F1 after {} queries is {}'.format(t, svm_f1_score))
        svm_f1s.append(svm_f1_score)

        # Random selection Model
        xr = select_random_unlabeled_point(r_mask)
        r_mask[xr, 0] = 1
        r_label[xr, 0] = y_train[xr, 0]
        r = np.sum(r_mask)
        t = np.sum(selected_mask)
        if r != t:
            print("r = {}, t = {}".format(r, t))
        train_r = select(X_train, r_mask)
        train_r_label = select(y_train, r_mask)
        assert train_r.shape == (r, num_features)
        assert train_r_label.shape == (r, 1)
        model_r = SVC(class_weight='balanced')
        labels_ = select(r_label, r_mask)
        model_r.fit(r_selector.transform(select(X_train, r_mask)),
                    np.reshape(labels_, labels_.size))
        assert model_r.classes_.size == 2
        # The baseline was trained on r_selector's features, so the test data
        # must go through r_selector as well (not the active learner's selector).
        predictions = model_r.predict(r_selector.transform(X_test))
        if len(predictions.shape) == 1:
            predictions = np.reshape(predictions, (predictions.size, 1))
        assert predictions.shape == (num_test, 1)
        random_error = np.sum(np.absolute(np.subtract(predictions,
                                                      y_test))) / y_test.size
        print('Random error after {} queries is {}'.format(r, random_error))
        random_errors.append(random_error)
        random_f1_score = f1_score(y_test, predictions)
        print('Random F1 after {} queries is {}'.format(t, random_f1_score))
        random_f1s.append(random_f1_score)

        # Blank Model (prediction all negative from the start)
        blank_error = np.sum(np.absolute(np.subtract(B_predictions,
                                                     y_test))) / y_test.size
        print('Blank learner error queries is {}'.format(blank_error))
        blank_errors.append(blank_error)
        blank_f1_score = f1_score(y_test, B_predictions)
        print('Blank F1 after {} queries is {}'.format(t, blank_f1_score))
        # Record the blank learner's own score (not the random learner's).
        blank_f1s.append(blank_f1_score)

    if current_model is None:
        raise ValueError(
            'no model was trained: {} points are already labelled, which '
            'reaches the stopping size {}'.format(t, label_budget))

    # Final writings
    predictions = current_model.predict(selector.transform(X_test))
    if len(predictions.shape) == 1:
        predictions = np.reshape(predictions, (predictions.size, 1))
    final_error = np.sum(np.absolute(np.subtract(predictions,
                                                 y_test))) / y_test.size
    print('final SVM error is {}'.format(final_error))
    final_f1_score = f1_score(y_test, predictions)
    print('final SVM F1 is {}'.format(final_f1_score))
    print('final number of queries is {}'.format(t))

    feature_matrix, id_vector = read_blind('{}_BLINDED.csv'.format(
        difficulty.upper()))
    blinded_predictions = current_model.predict(
        selector.transform(feature_matrix))
    blinded_predictions = np.reshape(blinded_predictions,
                                     blinded_predictions.size)
    write_prediction(
        'FS_AMP_{}_BLINDED_PREDICTION_{}.csv'.format(difficulty.upper(),
                                                     num_init_label_copy),
        id_vector, blinded_predictions)

    with open('output/FS_AMP_{}_metrics_{}.txt'.format(difficulty.upper(),
                                                       num_init_label_copy),
              mode='w') as f:
        f.write('SVM errors\n')
        f.write(str(svm_errors))
        f.write('\n')
        f.write('Random errors\n')
        f.write(str(random_errors))
        f.write('\n')
        f.write('Blank errors\n')
        f.write(str(blank_errors))
        f.write('\n')
        f.write('SVM F1 scores\n')
        f.write(str(svm_f1s))
        f.write('\n')
        f.write('Random F1 scores\n')
        f.write(str(random_f1s))
        f.write('\n')
        f.write('Blank F1 scores\n')
        f.write(str(blank_f1s))
        f.write('\n')
# Script entry point: fit an SVC on (x_train, y_train) and write one
# "UserID,Probability1" row per test sample to output_file.
if __name__ == "__main__":
    print "training..."

    # SVC hyper-parameters
    C = 0.19
    gamma = 0.0028
    shrinking = True
    #auto_class_weights = False

    # probability=True enables predict_proba, which is used below
    probability = True
    verbose = True

    svc = SVC(C=C, gamma=gamma, shrinking=shrinking, probability=probability, verbose=verbose)
    svc.fit(x_train, y_train)
    p = svc.predict_proba(x_test)
    # keep only the second probability column
    p = p[:, 1]

    # make sure both ids and p are of shape (n,1) and not (n,) so they can be
    # stacked side by side as two columns
    ids_and_p = np.hstack((ids.reshape((-1, 1)), p.reshape((-1, 1))))
    # comments='' keeps the header line free of the default '# ' prefix
    np.savetxt(output_file, ids_and_p, fmt=['%d', '%.10f'], delimiter=',', header='UserID,Probability1', comments='')
def onnx_test_svm_single_classreg(self, dtype, n_targets=1, debug=False,
                                  add_noise=False, runtime='python',
                                  target_opset=None, kind='reg', level=1,
                                  **kwargs):
    """Fit an SVM on iris, convert it with ``to_onnx`` and compare outputs.

    :param dtype: dtype the test inputs (and regression targets) are cast to
    :param n_targets: number of target columns; column ``i`` is ``y + i``
    :param debug: run the inference verbosely and wrap creation errors with
        the model definition
    :param add_noise: add gaussian noise (scaled by 10) to the features
    :param runtime: runtime name handed to ``OnnxInference``
    :param target_opset: opset forwarded to ``to_onnx``
    :param kind: ``'reg'`` (SVR), ``'bin'`` or ``'mcl'`` (SVC)
    :param level: verbosity used when ``debug`` is set
    :param kwargs: forwarded to the SVR/SVC constructor
    :raises AssertionError: if ``kind`` is not one of the three values above
    """
    dataset = load_iris()
    X, y = dataset.data, dataset.target
    if add_noise:
        X += numpy.random.randn(X.shape[0], X.shape[1]) * 10

    # Shape the target for the requested task.
    if kind not in ('reg', 'bin', 'mcl'):
        raise AssertionError("unknown '{}'".format(kind))
    if kind == 'reg':
        y = y.astype(dtype)
    else:
        if kind == 'bin':
            y = y % 2
        y = y.astype(numpy.int64)

    if n_targets != 1:
        stacked = numpy.empty((y.shape[0], n_targets), dtype=dtype)
        for column in range(n_targets):
            stacked[:, column] = y + column
        y = stacked

    X_train, X_test, y_train, _ = train_test_split(X, y, random_state=11)
    X_test = X_test.astype(dtype)

    # kind was validated above, so anything that is not 'reg' is a classifier.
    estimator_type = SVR if kind == 'reg' else SVC
    clr = estimator_type(**kwargs)
    clr.fit(X_train, y_train)

    model_def = to_onnx(clr, X_train.astype(dtype),
                        rewrite_ops=True, target_opset=target_opset)
    if 'onnxruntime' in runtime:
        model_def.ir_version = get_ir_version_from_onnx()

    try:
        oinf = OnnxInference(model_def, runtime=runtime)
    except RuntimeError as e:
        if not debug:
            raise e
        raise RuntimeError(
            "Unable to create a model\n{}".format(model_def)) from e

    run_options = {'verbose': level, 'fLOG': print} if debug else {}
    got = oinf.run({'X': X_test}, **run_options)

    expected = clr.predict(X_test)
    if kind == 'reg':
        self.assertEqual(sorted(got), ['variable'])
        if dtype == numpy.float32:
            self.assertEqualArray(expected.ravel(), got['variable'].ravel(),
                                  decimal=5)
        else:
            self.assertEqualArray(expected, got['variable'], decimal=5)
        return

    self.assertEqual(sorted(got), ['output_label', 'output_probability'])
    self.assertEqualArray(expected, got['output_label'])
    expected_proba = clr.predict_proba(X_test)
    self.assertEqualArray(expected_proba,
                          DataFrame(got['output_probability']).values,
                          decimal=5)
if auroc is not None: arq.write("{metric:<18}{value:.4f}\n".format(metric="AUROC:", value=auroc)) if aupr is not None: arq.write("{metric:<18}{value:.4f}\n".format(metric="AUPR:", value=aupr)) # %% for n in range(5): with gzip.open(caminho + ' ' + str(n), 'rb') as arquivo: treino, validacao, teste = pickle.load(arquivo) svc_clf = SVC(probability=True, verbose=True, random_state=n) # Modifique aqui os hyperparâmetros svc_clf.fit(treino.iloc[:, :-2], treino.iloc[:, -2]) svc_pred_class = svc_clf.predict(validacao.iloc[:, :-2]) svc_pred_scores = svc_clf.predict_proba(validacao.iloc[:, :-2]) accuracy, recall, precision, f1, auroc, aupr = compute_performance_metrics( validacao.iloc[:, -2], svc_pred_class, svc_pred_scores) print('Performance no conjunto de validação:') print_metrics_summary(accuracy, recall, precision, f1, auroc, aupr) with open('resultados.txt', 'a+') as arq_resul: arq_resul.write('Resultado utilizando o banco de dados ' + str(n) + ':\n') print_metrics_summary2(accuracy, recall, precision, f1, arq_resul, auroc, aupr) arq_resul.write('\n')
# print(X_train)
# print(X_test)

# Fit the kernel SVM on the training set; probability=True is what makes
# predict_proba available for the ROC analysis further down.
classifier = SVC(C=20, kernel='rbf', random_state=0, probability=True)
classifier.fit(X_train, y_train)

# Hard class predictions for the test set.
y_pred = classifier.predict(X_test)

# ROC analysis uses column 1 of the predicted probabilities as the score.
prob = classifier.predict_proba(X_test)[:, 1]
auc = metrics.roc_auc_score(y_test, prob)
print('AUC: {}\n'.format(auc))
fpr, tpr, thresholds = metrics.roc_curve(y_test, prob)

# Plot the curve, then the chance diagonal as a visual reference.
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = {}'.format(auc))
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
for set_limits in (plt.xlim, plt.ylim):
    set_limits([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Standardise the feature matrix.
shu = scale(data)

# Labels: 150 ones followed by 150 zeros.
label1 = np.ones((150, 1))  # Value can be changed
label2 = np.zeros((150, 1))
label = np.append(label1, label2)
X, y = shu, label

sepscores = []
# Accumulators for the stacked per-fold labels/scores; each starts with a
# single placeholder row of 0.5.
ytest = np.full((1, 2), 0.5)
yscore = np.full((1, 2), 0.5)

cv_clf = SVC(probability=True)
skf = StratifiedKFold(n_splits=5)
for train, test in skf.split(X, y):
    y_train = utils.to_categorical(y[train])
    hist = cv_clf.fit(X[train], y[train])

    # Score the held-out fold and append it to the running stacks.
    y_score = cv_clf.predict_proba(X[test])
    yscore = np.vstack((yscore, y_score))
    y_test = utils.to_categorical(y[test])
    ytest = np.vstack((ytest, y_test))

    # ROC/AUC is computed from column 0 of both the labels and the scores.
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)

    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = \
        utils.calculate_performace(len(y_class), y_class, y_test_tmp)

    fold_scores = [acc, precision, npv, sensitivity, specificity, mcc, f1,
                   roc_auc]
    sepscores.append(fold_scores)
    print(
        'SVC:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f'
        % tuple(fold_scores))

scores = np.array(sepscores)
# ==================================
# TESTING PART:
# ------------

# Extracting Testing Data:
(X_test, y_test) = ef.fetch_test()
# Normalizes the testing data with the mean and SD of the training set.
x_m, y_sd, X_test = ef.normalize(X_test, xm=x_m, ysd=y_sd)

# Report the classifier settings. (Parenthesised print with a single
# pre-formatted string behaves the same on Python 2 and Python 3.)
print('\nUsing %s estimators of depth %s.\n' % (str(num_est), str(tree_depth)))

# Deriving the ROC curve:
y_check = clf.predict(X_test)
y_hat = clf.predict_proba(X_test)
# Keep only entry 1 of every probability row as the ROC score.
y_hat = array([entry[1] for entry in y_hat])
fpr, tpr, thresholds = roc_curve(y_test, y_hat, pos_label=1)

# Plotting the result (with the chance diagonal as a dashed line):
plt.plot(fpr, tpr)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()

# Evaluating the AUC by trapezoidal integration of tpr over fpr:
area = trapz(tpr, fpr)
print('Area under the AUC curve: %s' % area)
print("getting accuracies %s" % i) #Use score() function to get accuracy npar_pred = np.array(prediction_data) pred_lin = clf.score(npar_pred, prediction_labels) print "linear: ", pred_lin accur_lin.append(pred_lin) #Store accuracy in a list #print("Mean value lin svm: %s" %np.mean(accur_lin)) #FGet mean accuracy of the 10 runs test_image = sys.argv[1] image = cv2.imread(test_image) #open image gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) #convert to grayscale clahe_image = clahe.apply(gray) get_landmarks(clahe_image) if data['landmarks_vectorised'] == "error": print("no face detected on this one") else: val = clf.predict_proba([data['landmarks_vectorised']]) anger.append(val[0][0]) fear.append(val[0][1]) happy.append(val[0][2]) sadness.append(val[0][3]) ''' if val==0: print("anger") elif val==1: print("contempt") elif val==2: print("disgust") elif val==3: print("fear") elif val==4: print("happy")
def main(args):
    """Train or evaluate an SVC on embeddings computed for a dataset.

    With ``args.mode == 'TRAIN'`` a linear SVC is fitted on the embeddings and
    pickled, together with the class names, to ``args.classifier_filename``.
    With ``args.mode == 'CLASSIFY'`` that pickle is loaded, every image is
    classified and the per-image result and overall accuracy are printed.

    :param args: parsed command-line namespace; the attributes read here are
        seed, use_split_dataset, data_dir, min_nrof_images_per_class,
        nrof_train_images_per_class, mode, model, batch_size, image_size and
        classifier_filename
    :raises ValueError: if a class of the dataset has no image
    """
    with tf.Graph().as_default():
        with tf.Session() as sess:
            np.random.seed(seed=args.seed)

            if args.use_split_dataset:
                dataset_tmp = facenet.get_dataset(args.data_dir)
                train_set, test_set = split_dataset(
                    dataset_tmp, args.min_nrof_images_per_class,
                    args.nrof_train_images_per_class)
                if args.mode == 'TRAIN':
                    dataset = train_set
                elif args.mode == 'CLASSIFY':
                    dataset = test_set
            else:
                dataset = facenet.get_dataset(args.data_dir)

            # Check that there is at least one image per class. The original
            # `assert(cond, msg)` asserted a two-element tuple, which is
            # always truthy, so the check could never fail.
            for cls in dataset:
                if len(cls.image_paths) == 0:
                    raise ValueError(
                        'There must be at least one image for each class in the dataset')

            paths, labels = facenet.get_image_paths_and_labels(dataset)

            print('Number of classes: %d' % len(dataset))
            print('Number of images: %d' % len(paths))

            # Load the model
            print('Loading feature extraction model')
            facenet.load_model(args.model)

            # Get input and output tensors
            graph = tf.get_default_graph()
            images_placeholder = graph.get_tensor_by_name("input:0")
            embeddings = graph.get_tensor_by_name("embeddings:0")
            phase_train_placeholder = graph.get_tensor_by_name("phase_train:0")
            embedding_size = embeddings.get_shape()[1]

            # Run forward pass to calculate embeddings, one batch at a time
            print('Calculating features for images')
            nrof_images = len(paths)
            nrof_batches_per_epoch = int(
                math.ceil(1.0 * nrof_images / args.batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))
            for i in range(nrof_batches_per_epoch):
                start_index = i * args.batch_size
                # The last batch may be shorter than batch_size.
                end_index = min((i + 1) * args.batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = facenet.load_data(paths_batch, False, False,
                                           args.image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)

            classifier_filename_exp = os.path.expanduser(
                args.classifier_filename)

            if args.mode == 'TRAIN':
                # Train classifier
                print('Training classifier')
                model = SVC(kernel='linear', probability=True)
                model.fit(emb_array, labels)

                # Create a list of class names
                class_names = [cls.name.replace('_', ' ') for cls in dataset]

                # Saving classifier model
                with open(classifier_filename_exp, 'wb') as outfile:
                    pickle.dump((model, class_names), outfile)
                print('Saved classifier model to file "%s"' %
                      classifier_filename_exp)

            elif args.mode == 'CLASSIFY':
                # Classify images
                print('Testing classifier')
                # NOTE(review): pickle.load executes arbitrary code from the
                # file; only load classifier files from a trusted source.
                with open(classifier_filename_exp, 'rb') as infile:
                    (model, class_names) = pickle.load(infile)

                print('Loaded classifier model from file "%s"' %
                      classifier_filename_exp)

                predictions = model.predict_proba(emb_array)
                best_class_indices = np.argmax(predictions, axis=1)
                best_class_probabilities = predictions[
                    np.arange(len(best_class_indices)), best_class_indices]

                for i in range(len(best_class_indices)):
                    print('%4d %s: %.3f' %
                          (i, class_names[best_class_indices[i]],
                           best_class_probabilities[i]))

                accuracy = np.mean(np.equal(best_class_indices, labels))
                print('Accuracy: %.3f' % accuracy)
# In[10]: y_test['target'].values # In[11]: error = 0 for i, v in enumerate(svm.predict(X_test_std)): if v != y_test['target'].values[i]: error += 1 print(error) # In[12]: svm.predict_proba(X_test_std) # In[13]: from matplotlib.colors import ListedColormap def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02): # setup marker generator and color map markers = ('s', 'x', 'o', '^', 'v') colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan') cmap = ListedColormap(colors[:len(np.unique(y))]) # 畫出決定的平面 x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
## New code starts here.
## An SVC is fitted on the cleaned data; clf.predict gives the hard class
## output (stored in 'bout') and clf.predict_proba gives the per-class
## probabilities (stored in 'prob_out') for the ROC curve.
t = .001  # tolerance parameter
kp = 'rbf'  # kernel parameter

print('\n\nSupport Vector Machine classifier\n')
clf = SVC(kernel=kp, probability=True, tol=t)
clf.fit(train_features, train_targets)

print("predictions for test set:")
bout = clf.predict(test_features)
## probability version of the same predictions
prob_out = clf.predict_proba(test_features)
print(bout)

print('actual class values:')
target_array = np.array(test_targets)
print(target_array)

print('The number of predictions that differ from actual')
mismatches = bout != target_array
print(sum(mismatches))

## Accuracy measures, following the L09-AccuracyMeasures tutorial
CM = confusion_matrix(target_array, bout)
print("\n\nConfusion matrix:\n", CM)
## ravel() flattens the 2x2 matrix row by row
tn, fp, fn, tp = CM.ravel()
print("\nTP, TN, FP, FN:", tp, ",", tn, ",", fp, ",", fn)

AR = accuracy_score(target_array, bout)
print("\nAccuracy rate:", AR)
def mipsMain(thresh):
    """Fit an SVC, then re-label the training rows with a binary program.

    The SVC class probabilities of every training row become the objective
    coefficients (``-log p``) of one binary variable per (row, class). For
    each row the number of selected classes is forced to equal the number of
    classes whose probability exceeds ``thresh``.

    :param thresh: probability threshold used to count the labels of a row
    :return: ``[final, prev_pred]`` where ``final[i]`` lists the selected
        class numbers (1-based, as floats) of row ``i`` and ``prev_pred`` is
        the plain SVC prediction
    """
    print(thresh)
    tex = loadtxt('/home/aditya/Project/csvFiles/trainNLP.csv', delimiter=',')
    tex_dev = loadtxt('/home/aditya/Project/csvFiles/devNLP.csv',
                      delimiter=',')
    vals = tex.shape
    vals_dev = tex_dev.shape

    # Last column is the target.
    # NOTE(review): the feature slice stops at column n-2, so the column just
    # before the target is dropped as well - confirm this is intended.
    ty = tex[:, vals[1] - 1]
    tx = tex[:, 0:vals[1] - 2]
    # Dev split is loaded but only used by the commented-out calls below.
    ty_dev = tex_dev[:, vals_dev[1] - 1]
    tx_dev = tex_dev[:, 0:vals_dev[1] - 2]

    # NOTE(review): scale_C is not accepted by current scikit-learn releases;
    # kept because the targeted library version is not visible here.
    clf = SVC(C=4.0,
              cache_size=200,
              coef0=0.0,
              degree=3,
              gamma=9.0,
              kernel='linear',
              probability=True,
              scale_C=False,
              shrinking=True,
              tol=0.0001)
    clf.fit(tx, ty)
    #prev_pred=clf.predict(tx_dev[:,0:vals[1]-2])
    #temp = clf.predict_proba(tx_dev[:,0:vals[1]-2])
    prev_pred = clf.predict(tx[:, 0:vals[1] - 2])
    temp = clf.predict_proba(tx[:, 0:vals[1] - 2])
    (ninst, nclass) = temp.shape
    print(temp.shape)
    obj_coeff = temp.tolist()

    # Mixed Integer Programming
    m = Model()

    # declare Variables: one binary per (instance, class), cost -log(p)
    label_vars = {}
    for i in range(ninst):
        for j in range(nclass):
            label_vars[i, j] = m.addVar(obj=-1 * math.log(obj_coeff[i][j]),
                                        vtype=GRB.BINARY,
                                        name='x' + str(i) + '_' + str(j))
    m.update()

    # Add Constraints
    # everything is a Regulation:1,
    # Activation:2 and Inhibition:3 do not occur together
    # Requirement:4 participates in Activation and Inhibition
    # Binding:5 may or may not coexist Transcription:6
    for i in range(ninst):
        # constraint on number of labels predicted
        num_labels = sum(
            np.array([obj_coeff[i][jo] for jo in range(nclass)]) > thresh)
        if num_labels > 0:
            # making sure Regulation always exist
            # m.addConstr(vars[i,0]==1)
            m.addConstr(
                quicksum(label_vars[i, j]
                         for j in range(nclass)) == num_labels)
            # making sure Activation and Inhibition do not occur together
            m.addConstr(label_vars[i, 1] + label_vars[i, 2] <= 1)
            # NOTE(review): `and`/`or` are evaluated by Python before the
            # solver sees anything, so this does not build a logical
            # constraint - left unchanged, needs a proper reformulation.
            m.addConstr((label_vars[i, 1] > 0 and label_vars[i, 0] >= 0)
                        or (label_vars[i, 1] <= 0))
            # adding a constraint restricting the co-existance of Inhibition and Transcription
            # m.addConstr(vars[i,2]+vars[i,5] <= 1)
            # m.addConstr(vars[i,1]+vars[i,3] == 2)
    m.update()

    # Optimize Model
    m._vars = label_vars
    m.optimize()

    # Binary variables are only integral up to the solver's tolerance
    # (e.g. 0.9999999), so test against 0.5 rather than `== 1.0`.
    final = []
    for i in range(ninst):
        final.append(
            [float(j + 1) for j in range(nclass) if m._vars[i, j].x > 0.5])
    return [final, prev_pred]
def _oneVsRestFrame(data, target, classLabel):
    '''Return a copy of data in which every target other than classLabel is 0.'''
    frame = data.copy()
    frame.loc[frame[target] != classLabel, target] = 0
    return frame


def _splitFold(frame, foldIdx, foldRows):
    '''Return (held-out rows of fold foldIdx, all remaining rows) of frame.'''
    lo, hi = foldIdx * foldRows, foldRows * (foldIdx + 1)
    return frame[lo:hi], frame.drop(frame.index[lo:hi])


def _normalizeAndSplit(trainFrame, testFrame, target):
    '''z-score trainFrame, apply its returned params to testFrame, split X / y.'''
    trainFrame, normParams, _ = pf.zScoreNormalization(trainFrame, target)
    testFrame, _, _ = pf.zScoreNormalization(testFrame, target,
                                             normParams=normParams)
    return (trainFrame.drop(target, axis=1), trainFrame[target],
            testFrame.drop(target, axis=1), testFrame[target])


def _positiveClassScore(scoreFn, yTrue, yPred, classLabel):
    '''Score the non-zero class of a one-vs-rest problem with scoreFn.'''
    # Class 1 keeps the original default (binary) call; the frames of classes
    # 2 and 3 hold labels {0, 2} / {0, 3}, so the per-label scores are
    # requested and the second entry is taken.
    if classLabel == 1:
        return scoreFn(yTrue, yPred)
    return scoreFn(yTrue, yPred, average=None)[1]


def _appendSummaryRows(table, valueColumns):
    '''Append an "Avg" and a "StdDev" row computed over the current rows.'''
    columnValues = [table[name].tolist() for name in valueColumns]
    table.loc[len(table)] = ["Avg"] + [st.mean(v) for v in columnValues]
    table.loc[len(table)] = ["StdDev"] + [st.stdev(v) for v in columnValues]


def nestedCrossValidation(data, cVals, kFolds=10, mFolds=5):
    '''
    Nested 10x5 crossvalidation by default, run as three one-vs-rest
    problems (classes 1, 2 and 3 of column 0) with a linear SVC.
    Tests the given cVals on the inner folds to pick the best C per class,
    then evaluates that C on the outer fold.

    :param data: Data being trained on and tested on; column 0 is the target
    :param cVals: cVals being tested
    :param kFolds: number of Outer folds
    :param mFolds: number of inner folds
    :return: results table of class 1, of class 2, of class 3, and the
        combined multiclass results; each ends with an "Avg" and a
        "StdDev" row
    '''
    target = 0
    classLabels = (1, 2, 3)
    resultColumns = ["OuterFold", "cVal", "TrainAccuracy", "TrainPrecision",
                     "TrainRecall", "TestAccuracy", "TestPrecision",
                     "TestRecall"]
    combinedColumns = ["OuterFold", "Combined TestAccuracy",
                       "Combined TestPrecision", "Combined TestRecall"]

    # Shuffle once, then derive one one-vs-rest frame per class.
    data = data.sample(frac=1).reset_index(drop=True)
    classFrames = [_oneVsRestFrame(data, target, label)
                   for label in classLabels]
    oneKthRows = round(len(data) / kFolds)

    resultsTables = [pd.DataFrame(columns=resultColumns) for _ in classLabels]
    combinedResults = pd.DataFrame(columns=combinedColumns)

    axes = [plt.subplots(1)[1] for _ in classLabels]
    for label, ax in zip(classLabels, axes):
        ax.set_title("ROC-AUC Curve for Class %d" % label)

    for k in range(kFolds):
        # Outer split, per class: (train, test) with fresh 0..n-1 indices.
        outerSplits = []
        for frame in classFrames:
            testK, trainK = _splitFold(frame, k, oneKthRows)
            outerSplits.append((trainK.reset_index(drop=True),
                                testK.reset_index(drop=True)))

        oneMthRows = round(len(outerSplits[0][0]) / mFolds)
        bestAccs = [0] * len(classLabels)
        bestCs = [0] * len(classLabels)

        for m in range(mFolds):
            # Inner split and normalisation do not depend on C: do them once.
            innerSets = []
            for trainK, _ in outerSplits:
                testM, trainM = _splitFold(trainK, m, oneMthRows)
                innerSets.append(_normalizeAndSplit(trainM, testM, target))

            # Try every C value on this inner fold. The best C is the one
            # with the highest single-fold accuracy seen so far.
            for c in cVals:
                for idx, (xTrain, yTrain, xTest, yTest) in enumerate(innerSets):
                    # linear kernel; settings identical to the final model
                    classifier = SVC(kernel='linear', C=c, probability=True)
                    classifier.fit(xTrain, yTrain)
                    acc = metrics.accuracy_score(yTest,
                                                 classifier.predict(xTest))
                    if acc > bestAccs[idx]:
                        bestAccs[idx] = acc
                        bestCs[idx] = c

            print("Kfold:", k + 1, "Mfold:", m + 1, "Class1BestC:", bestCs[0], \
                "Class2BestC:", bestCs[1], "Class3BestC:", bestCs[2])

        # Refit on the whole outer training part with the selected C values.
        rows, probs, truths, fitted = [], [], [], []
        for idx, label in enumerate(classLabels):
            trainK, testK = outerSplits[idx]
            xTrain, yTrain, xTest, yTest = _normalizeAndSplit(
                trainK, testK, target)

            classifier = SVC(kernel='linear', C=bestCs[idx], probability=True)
            classifier.fit(xTrain, yTrain)

            yPred = classifier.predict(xTest)
            yPredTr = classifier.predict(xTrain)
            # Column 1 of predict_proba, kept as an (n, 1) column vector.
            probs.append(classifier.predict_proba(xTest)[:, [1]])
            truths.append(np.transpose([np.array(yTest)]))

            rows.append([
                k + 1, bestCs[idx],
                metrics.accuracy_score(yTrain, yPredTr),
                _positiveClassScore(metrics.precision_score, yTrain, yPredTr,
                                    label),
                _positiveClassScore(metrics.recall_score, yTrain, yPredTr,
                                    label),
                metrics.accuracy_score(yTest, yPred),
                _positiveClassScore(metrics.precision_score, yTest, yPred,
                                    label),
                _positiveClassScore(metrics.recall_score, yTest, yPred, label)
            ])
            fitted.append((classifier, xTest, yTest))

        # Combine the three one-vs-rest outputs into multiclass labels.
        y_preds = multiclassPredict(*probs)
        y_tests = multiclassPredict(*truths)
        testAcc = metrics.accuracy_score(y_tests, y_preds)
        testPrec = metrics.precision_score(y_tests, y_preds,
                                           average='weighted')
        testRec = metrics.recall_score(y_tests, y_preds, average='weighted')

        for table, row in zip(resultsTables, rows):
            table.loc[len(table)] = row
        combinedResults.loc[len(combinedResults)] = [
            k + 1, testAcc, testPrec, testRec
        ]

        # NOTE(review): plot_roc_curve depends on the installed scikit-learn
        # version providing it - confirm against the project's pinned version.
        for (classifier, xTest, yTest), ax in zip(fitted, axes):
            metrics.plot_roc_curve(classifier, xTest, yTest,
                                   name="AUC fold#" + str(k + 1), ax=ax)

    for table in resultsTables:
        _appendSummaryRows(table, resultColumns[1:])
    _appendSummaryRows(combinedResults, combinedColumns[1:])

    return resultsTables[0], resultsTables[1], resultsTables[2], combinedResults
class SVCT(BaseEstimator):
    """
    2-phase SVC for SASSE ERA5 polygon classification

    Phase 1 (``model1``) is trained on a binarised target: 0 versus any
    positive label. Phase 2 (``model2``) is trained only on the rows whose
    label is positive.
    """

    def __init__(self,
                 args1={
                     'kernel': 'rbf',
                     'probability': True
                 },
                 args2={
                     'kernel': DotProduct(),
                     'probability': True
                 },
                 verbose=False):
        """
        :param args1: keyword arguments of the phase-1 ``SVC``
        :param args2: keyword arguments of the phase-2 ``SVC``
        :param verbose: log a message before each phase is fitted
        """
        # Every constructor argument is stored untouched under its own name:
        # BaseEstimator.get_params() and sklearn's clone() read them back
        # with getattr and fail when they are missing.
        # The default dicts are shared between instances and must therefore
        # never be mutated.
        self.args1 = args1
        self.args2 = args2
        self.verbose = verbose
        self.model1 = SVC(**args1)
        self.model2 = SVC(**args2)

    def fit(self, X, y):
        """
        Fit in two phases

        :param X: features; must provide ``.values`` and boolean-mask row
            selection (the code is written for a DataFrame)
        :param y: labels; values > 0 are treated as positive
        :return: self
        """
        # Phase 1 target: collapse every positive label to 1.
        y1 = y.copy()
        y1[(y1 > 0)] = 1

        # Phase 2 data: positive rows only, with their original labels.
        positive = (y > 0)
        X2 = X[positive]
        y2 = y[positive]

        if self.verbose:
            logging.info('Fitting model 1...')
        self.model1.fit(X.values, y1)

        if self.verbose:
            logging.info('Fitting model 2...')
        self.model2.fit(X2.values, y2)

        return self

    def predict(self, X):
        """
        Predict

        Rows that phase 1 marks as positive get the phase-2 label; all other
        rows keep the phase-1 output.
        """
        y1_ = self.model1.predict(X)
        positive = (y1_ > 0)
        # Skip phase 2 when nothing is positive: predicting on an empty
        # selection raises instead of returning an empty result.
        if positive.any():
            y1_[positive] = self.model2.predict(X[positive])
        return y1_

    def predict_proba(self, X):
        """
        Predict with probabilities

        :return: array of shape (len(X), 3) assembled from both phases
        """
        yp1_ = self.model1.predict_proba(X)
        yp2_ = self.model2.predict_proba(X)

        y_pred_proba = np.zeros((len(X), 3))
        y_pred_proba[:, 0] = yp1_[:, 0]
        # Where phase 1 gives >= .5 to its column 1, the phase-2
        # probabilities are used.
        # NOTE(review): otherwise columns 1 and 2 both receive yp1_[:, 1],
        # so a row does not necessarily sum to 1 - confirm this is intended.
        phase2 = yp1_[:, 1] >= .5
        y_pred_proba[:, 1] = np.where(phase2, yp2_[:, 0], yp1_[:, 1])
        y_pred_proba[:, 2] = np.where(phase2, yp2_[:, 1], yp1_[:, 1])

        return y_pred_proba
def Datasets():
    """
    Train an RBF SVM on two columns of the CSV at ``filename`` and report on it.

    Steps: read the CSV, plot the raw data, split 80/20 into train and test,
    standardise the features, fit the SVM, plot its support vectors and its
    decision regions on the training set, then print test accuracy, the
    confusion matrix, 5-fold cross-validation scores computed on the test
    set, and the predicted probabilities / classes of the test set.
    Returns nothing; all output is plots and prints.
    """
    # READ CSV
    frame = pd.read_csv(filename)

    # Model inputs: the CSV columns at positions 1 and 2
    # (presumably Glucose and BloodPressure - verify against the file).
    features = frame.iloc[:, [1, 2]].values
    # Value to predict: Outcome
    labels = frame['Outcome'].values

    # Visualize the dataset before scaling
    sns.lmplot('Glucose',
               'BloodPressure',
               data=frame,
               hue='Outcome',
               palette='Set1',
               fit_reg=False,
               scatter_kws={'s': 70})

    # Splitting the data into training and testing set
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        random_state=0)
    print("training set: ", X_train.shape)
    print("testing set: ", X_test.shape)

    # Feature scaling: fitted on the training set only, applied to both
    standardizer = StandardScaler()
    X_train = standardizer.fit_transform(X_train)
    X_test = standardizer.transform(X_test)

    # Build model
    model = SVC(kernel='rbf', probability=True)
    model.fit(X_train, y_train)

    # Visualize support vectors on top of the training points
    sv = model.support_vectors_
    plt.scatter(X_train[:, 0], X_train[:, 1])
    plt.scatter(sv[:, 0], sv[:, 1], color='blue')
    plt.title('Support Vectors')
    plt.xlabel('X')
    plt.ylabel('y')
    plt.show()

    # Decision regions; swap X_set / y_set to visualize another split
    X_set, y_set = X_train, y_train
    region_colors = ('red', 'green')
    first_axis = np.arange(start=X_set[:, 0].min() - 1,
                           stop=X_set[:, 0].max() + 1,
                           step=0.01)
    second_axis = np.arange(start=X_set[:, 1].min() - 1,
                            stop=X_set[:, 1].max() + 1,
                            step=0.01)
    X1, X2 = np.meshgrid(first_axis, second_axis)
    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    plt.contourf(X1,
                 X2,
                 model.predict(grid_points).reshape(X1.shape),
                 alpha=0.75,
                 cmap=ListedColormap(region_colors))
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        in_class = y_set == j
        plt.scatter(X_set[in_class, 0],
                    X_set[in_class, 1],
                    c=ListedColormap(region_colors)(i),
                    label=j)
    plt.title('SVM (training)')
    plt.xlabel('Glucose')
    plt.ylabel('Blood Pressure')
    plt.legend()
    plt.show()

    print("\n")
    # Predict and score the test set
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of prediction: ", test_accuracy)

    # Confusion matrix of the test predictions
    confusionmatrix = confusion_matrix(y_test, y_pred)
    print("The confusion matrix: ", confusionmatrix)
    print("\n")

    # 5-fold cross validation run on the test set itself
    test_scores = cross_val_score(model,
                                  X_test,
                                  y_test,
                                  scoring='accuracy',
                                  cv=5)
    print("Accuracy of testing set by using cross val : ", test_scores)
    print("Mean of test scores: ", test_scores.mean())
    print("\n")

    # Cross check: class probabilities next to the predicted classes
    predict_proba_value = model.predict_proba(X_test)
    y_pred_valid = model.predict(X_test)
    print(
        "what value of prediction probability so system can determine which class of them: ",
        predict_proba_value)
    print("what system predicts toward the validating set of svm: ",
          y_pred_valid)