def main():
    # load precomputed explanations and the 20 Newsgroups split
    data = pickle.load(open('../submodular_20.pickle', 'rb'))
    train, train_labels, test, test_labels = Load20NG()
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=True, lowercase=False)
    vectorizer.fit(train + test)
    train_vectors = vectorizer.transform(train)
    test_vectors = vectorizer.transform(test)
    svm = sklearn.svm.SVC(probability=True, kernel='rbf', C=10, gamma=0.001)
    svm.fit(train_vectors, train_labels)
    json_ret = {}
    json_ret['class_names'] = ['Atheism', 'Christianity']
    json_ret['instances'] = []
    explanations = data['explanations']['20ng']['svm']
    idxs = data['submodular_idx']['20ng']['svm'][:10]
    for i in idxs:
        json_obj = {}
        json_obj['id'] = i
        idx = i
        instance = test_vectors[idx]
        json_obj['true_class'] = test_labels[idx]
        json_obj['c1'] = {}
        json_obj['c1']['predict_proba'] = list(svm.predict_proba(test_vectors[idx])[0])
        exp = explanations[idx]
        json_obj['c1']['exp'] = exp
        json_obj['c1']['data'] = get_pretty_instance(test[idx], exp, vectorizer)
        json_ret['instances'].append(json_obj)
    import json
    open('static/exp2_local.json', 'w').write('data = %s' % json.dumps(json_ret))
def predict_sentiment(classifier, text):
    if classifier not in ['ffnn', 'svm', 'rf']:
        return jsonify(success=False, error="classifier_not_found")
    tokens = tokenize(text)
    doc_vector = d2v_model.infer_vector(tokens)
    # wrap the vector in a list: sklearn's predict_proba expects a 2D array
    prediction = {
        'svm': lambda doc_vector: svm.predict_proba([doc_vector])[0],
        'rf': lambda doc_vector: rf.predict_proba([doc_vector])[0],
        'ffnn': lambda doc_vector: ffnn.predict(np.array([doc_vector]), batch_size=1)[0]
    }[classifier](doc_vector)
    polarity = 'positive' if prediction[0] > prediction[1] else 'negative'
    return_json = {
        'success': True,
        'polarity': polarity,
        'score': prediction.tolist(),
        'word2vec_hit_rate': 1.00
    }
    return jsonify(**return_json)
def predict_proba(dataset):
    svm = joblib.load("hog_model/svm%d-%d-%d.model" % (WORD, S, E))
    test_data = np.float32([]).reshape(0, WORD)
    test_target = []
    dictIdx = 0
    # dict views are not sliceable in Python 3, so wrap in list()
    for name in list(dataset.keys())[S:E]:
        print(name)
        for i in dataset[name]:
            test_data = np.append(test_data, i.reshape(1, -1), axis=0)
        test_target += [dictIdx for i in range(len(dataset[name]))]
        dictIdx += 1
    p = svm.predict_proba(test_data)
    print(p)
    p = p.argsort()
    print(p)
    print(test_target)
    q = 0
    l = 0
    right = 0
    for name in list(dataset.keys())[S:E]:
        r = 0
        for i in range(l, l + len(dataset[name])):
            if test_target[i] in p[i][E - 5:E]:
                r += 1
        right += r
        print("%s : %d/%d" % (name, r, len(dataset[name])))
        l += len(dataset[name])
    print("%d / %d" % (right, l))
def move_particle(image, centroid, svm, radius, part_width, threshold):
    img = image.copy()
    img = np.pad(img, pad_width=((radius, radius), (radius, radius)), mode='median')
    particles = np.empty((0, part_width * part_width))
    # can I replace the following loop to increase the speed??
    x = centroid[0]
    y = centroid[1]
    # x is width, y is height
    if radius == 0:
        print("radius is zero")
        exit()
    for i in range(2 * radius + 1):
        for j in range(2 * radius + 1):
            left = x + i - int(part_width / 2)
            right = left + part_width
            top = y + j - int(part_width / 2)
            bottom = top + part_width
            segment = img[top:bottom, left:right]
            if segment.shape[0] == part_width and segment.shape[1] == part_width:
                segment = segment.reshape((1, -1))
                particles = np.append(particles, segment, axis=0)
            else:
                particles = np.append(particles, np.zeros((1, part_width * part_width)) - 1, axis=0)
    if particles.shape[0] != 0:
        pred_prob = svm.predict_proba(particles)
        # here assume the correct label of pred_prob is at 1 along axis 1
        best_fit_index = np.argmax(pred_prob[:, 1])
        best_fit_prob = pred_prob[best_fit_index, 1]
        bxx = int(best_fit_index / (2 * radius + 1))
        byy = int(best_fit_index % (2 * radius + 1))
        # here the best fit index probability is set to -1 to find the second fit index
        pred_prob[best_fit_index, 1] = -1
        second_fit_index = np.argmax(pred_prob[:, 1])
        second_fit_prob = pred_prob[second_fit_index, 1]
        sxx = int(second_fit_index / (2 * radius + 1))
        syy = int(second_fit_index % (2 * radius + 1))
        if second_fit_prob >= threshold:
            return np.array([2, bxx + x - radius, byy + y - radius, sxx + x - radius, syy + y - radius]).astype(int)
        elif best_fit_prob >= threshold:
            return np.array([1, bxx + x - radius, byy + y - radius, -1, -1]).astype(int)
        else:
            return np.array([0, -1, -1, -1, -1]).astype(int)
    else:
        return np.array([0, -1, -1, -1, -1]).astype(int)
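# Hypothetical usage sketch for move_particle() above (toy image and classifier,
# not from the original source): train a probabilistic SVM on flattened 5x5
# patches, then search a 3-pixel radius around a starting centroid.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
train_patches = rng.random((30, 25))              # 30 flattened 5x5 training patches
train_labels = rng.integers(0, 2, 30)             # 1 = particle, 0 = background
clf = SVC(probability=True).fit(train_patches, train_labels)

image = rng.random((50, 50))
result = move_particle(image, centroid=(25, 25), svm=clf,
                       radius=3, part_width=5, threshold=0.6)
print(result)   # [n_matches, best_x, best_y, second_x, second_y]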
def runSVM(x):
    # load svm
    with open("svm.pkl", 'rb') as f:
        svm = pickle.load(f)
    # normal classification with SVM
    y_pred = svm.predict(x)
    y_prob = svm.predict_proba(x)
    y_prob = y_prob[:, 1]
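# Hypothetical setup sketch for runSVM() above (toy data, not from the original
# source): it only creates the "svm.pkl" file that runSVM() expects to load.
import pickle
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
clf = SVC(probability=True).fit(rng.normal(size=(50, 4)), rng.integers(0, 2, 50))
with open("svm.pkl", "wb") as f:
    pickle.dump(clf, f)

runSVM(rng.normal(size=(5, 4)))   # predicts labels and positive-class probabilities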
def classify(test_features, svm):
    max_prob = 0
    max_choice = 0
    for j in range(4):
        instance = test_features[j]
        # print svm.predict_proba(instance)
        p = svm.predict_proba(instance)[0][0]
        if p > max_prob:
            max_prob = p
            max_choice = j
    return max_choice
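# Hypothetical usage sketch for classify() above (toy data, not from the
# original source): given a fitted probabilistic classifier and four candidate
# feature vectors, it returns the index of the candidate with the highest
# probability for class 0.
import numpy as np
from sklearn.svm import SVC

rng = np.random.default_rng(0)
clf = SVC(probability=True).fit(rng.normal(size=(40, 8)), rng.integers(0, 2, 40))
candidates = [rng.normal(size=(1, 8)) for _ in range(4)]
print(classify(candidates, clf))  # index of the most likely candidate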
def svm(x_train, y_train, x_test, threshold=0.5):
    # SVM with linear kernel (similar to LogisticRegression)
    from sklearn import svm
    # probability=True is required for predict_proba below
    svm = svm.SVC(kernel='linear', probability=True)
    print('Start training')
    svm.fit(x_train, y_train)
    print('training finished')
    print('start prediction')
    svm_predict = svm.predict_proba(x_test)
    print('prediction finished')
    # threshold = 0.0363  # set decision threshold for class imbalance (threshold = positive sample/negative sample)
    svm_predicted = (svm_predict[:, 1] >= threshold).astype('int')
    dp.save_result(svm_predicted, 'SVM_result13_threshold00363')  # save prediction
def predict_svm(X, svm, std_scaler=None, pca=None, probability=True):
    # Apply PCA if available
    if pca is not None:
        X = pca.transform(X)
        X = np.float32(X)
    # Standardize data
    if std_scaler is None:
        X_std = X
    else:
        X_std = std_scaler.transform(X)
    # Predict the labels
    if probability:
        return svm.predict_proba(X_std)
    else:
        return svm.predict(X_std)
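# Hypothetical usage sketch for predict_svm() above (toy data, not from the
# original source): fit a PCA, a scaler, and a probabilistic SVM on training
# data, then pass all three in to score new samples.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(1)
X_train = rng.normal(size=(100, 20))
y_train = rng.integers(0, 2, 100)

pca = PCA(n_components=5).fit(X_train)
X_train_red = np.float32(pca.transform(X_train))
scaler = StandardScaler().fit(X_train_red)
clf = SVC(probability=True).fit(scaler.transform(X_train_red), y_train)

X_new = rng.normal(size=(3, 20))
print(predict_svm(X_new, clf, std_scaler=scaler, pca=pca))  # class probabilities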
def predict(dataset):
    def result(p, test_target, dataset, num):
        print("###############################")
        # top num
        l = 0
        right = 0
        for name in list(dataset.keys())[S:E]:
            r = 0
            for i in range(l, l + len(dataset[name])):
                # print(test_target[i], p[i][E-num-1:E-1])
                if test_target[i] in p[i][E - num - 1:E - 1]:
                    r += 1
            right += r
            print("%s : %d/%d" % (name, r, len(dataset[name])))
            l += len(dataset[name])
        print("%d / %d" % (right, l))
        print("###############################")

    svm = joblib.load("model/%d_svm%d-%d-%d.model" % (V, WORD, S, E))
    clf = joblib.load("model/%d_vocab%d-%d-%d.pkl" % (V, WORD, S, E))
    # centers = clf.cluster_centers_
    test_data = np.float32([]).reshape(0, WORD)
    test_target = []
    target = {}
    for i, name in enumerate(list(dataset.keys())):
        target[name] = i
    for name in list(dataset.keys())[S:E]:
        idx = target[name]
        print(name)
        for i in dataset[name]:
            featVec = getFeatVec(i, clf)
            test_data = np.append(test_data, featVec, axis=0)
        test_target += [idx for i in range(len(dataset[name]))]
    p = svm.predict_proba(test_data)
    p = p.argsort()
    result(p, test_target, dataset, 5)
    result(p, test_target, dataset, 4)
    result(p, test_target, dataset, 3)
    result(p, test_target, dataset, 2)
    result(p, test_target, dataset, 1)
def predict(df, hate=10, threshold=0.6):
    '''Takes a dataset with comments labeled as "tweet" and predicts their hatefulness.
    Optional parameters are hate and threshold.
    Returns 4 elements in this order [first_hate, count_hate, count_comments, percentage]:
    - first_hate: first hate comments as dict
    - count_hate: total number of hate comments
    - count_comments: total number of comments and subcomments
    - percentage: count_hate / count_comments as a rounded percentage
    '''
    test = pd.DataFrame(df)
    test_tweet = clean_tweets(test["tweet"])
    test["clean_tweet"] = test_tweet
    x_test_vec = vectorizer.transform(test_tweet)
    y_pred_svm = svm.predict(x_test_vec)
    test["prediction"] = y_pred_svm
    y_pred_proba = svm.predict_proba(x_test_vec)
    proba = []
    for i in y_pred_proba:
        a = i[0]
        proba.append(a)
    test["proba"] = proba
    test_sort = test.sort_values(by=["proba"], ascending=False)
    hateful_comments = test_sort[(test_sort["prediction"] == 1) & (test_sort["proba"] >= threshold)]
    count_hate = len(hateful_comments)
    count_comments = len(y_pred_svm)
    hate_ratio = count_hate / count_comments
    percentage = round(hate_ratio * 100, 2)
    first_hate = hateful_comments[["tweet", "proba"]][:hate]
    return [
        first_hate.to_dict('records'),
        count_hate,
        count_comments,
        percentage
    ]
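# Hypothetical setup + usage sketch for predict() above (toy data, not from the
# original source); it defines stand-ins for the module-level `vectorizer`,
# `svm` and `clean_tweets` names that predict() relies on.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

def clean_tweets(series):          # stand-in for the project's real cleaner
    return series.str.lower()

train_texts = pd.Series(["love this", "awful hateful garbage", "nice work", "I hate you"])
train_labels = [0, 1, 0, 1]
vectorizer = TfidfVectorizer().fit(clean_tweets(train_texts))
svm = SVC(probability=True).fit(vectorizer.transform(clean_tweets(train_texts)), train_labels)

comments = pd.DataFrame({"tweet": ["have a nice day", "I hate you", "great video!"]})
first_hate, count_hate, count_comments, percentage = predict(comments)
print("%d/%d hateful (%s%%)" % (count_hate, count_comments, percentage))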
def testmodel(path):
    def getFeatVec(features, clf):
        featVec = np.zeros((1, 1000))
        res = clf.predict(features)
        for i in res:
            featVec[0][i] += 1
        return featVec

    def result(predict_y, test_y, num):
        print("###############################")
        right = 0
        res = []
        for i, tag in enumerate(test_y):
            if tag in predict_y[i][21 - num:21]:
                right += 1
                res.append(True)
            else:
                res.append(False)
        print("%d/%d = %f" % (right, len(test_y), right * 1.0 / len(test_y)))
        print("###############################")
        return res

    print('load model')
    svm = joblib.load("./05svm.model")
    clf = joblib.load("./05vocab.pkl")
    data, tags = loaddata(path)
    features, tags = extractfeature(data, tags)
    print('predict')
    test_x = np.float32([]).reshape(0, 1000)
    for feature in features:
        featVec = getFeatVec(feature, clf)
        test_x = np.append(test_x, featVec, axis=0)
    p = svm.predict_proba(test_x)
    p = p.argsort()
    res = result(p, tags, 5)
    return res
def silence_removal(signal, sampling_rate, st_win, st_step, smooth_window=0.5,
                    weight=0.5, plot=False):
    """
    Event Detection (silence removal)
    ARGUMENTS:
        - signal:          the input audio signal
        - sampling_rate:   sampling freq
        - st_win, st_step: window size and step in seconds
        - smooth_window:   (optional) smooth window (in seconds)
        - weight:          (optional) weight factor (0 < weight < 1)
                           the higher, the more strict
        - plot:            (optional) True if results are to be plotted
    RETURNS:
        - seg_limits:      list of segment limits in seconds (e.g. [[0.1, 0.9], [1.4, 3.0]]
                           means that the resulting segments are (0.1 - 0.9) seconds
                           and (1.4 - 3.0) seconds)
    """
    if weight >= 1:
        weight = 0.99
    if weight <= 0:
        weight = 0.01

    # Step 1: feature extraction
    # signal = audioBasicIO.stereo_to_mono(signal)
    st_feats, _ = feature_extraction(signal, sampling_rate,
                                     st_win * sampling_rate,
                                     st_step * sampling_rate)

    # Step 2: train binary svm classifier of low vs high energy frames
    # keep only the energy short-term sequence (2nd feature)
    st_energy = st_feats[1, :]
    en = np.sort(st_energy)
    # number of 10% of the total short-term windows
    st_windows_fraction = int(len(en) / 10)

    # compute "lower" 10% energy threshold
    low_threshold = np.mean(en[0:st_windows_fraction]) + 1e-15

    # compute "higher" 10% energy threshold
    high_threshold = np.mean(en[-st_windows_fraction:-1]) + 1e-15

    # get all features that correspond to low energy
    low_energy = st_feats[:, np.where(st_energy <= low_threshold)[0]]

    # get all features that correspond to high energy
    high_energy = st_feats[:, np.where(st_energy >= high_threshold)[0]]

    # form the binary classification task and ...
    features = [low_energy.T, high_energy.T]
    # normalize and train the respective svm probabilistic model
    # (ONSET vs SILENCE)
    features_norm, mean, std = normalize_features(features)
    svm = train_svm(features_norm, 1.0)

    # Step 3: compute onset probability based on the trained svm
    prob_on_set = []
    for index in range(st_feats.shape[1]):
        # for each frame
        cur_fv = (st_feats[:, index] - mean) / std
        # get svm probability (that it belongs to the ONSET class)
        prob_on_set.append(svm.predict_proba(cur_fv.reshape(1, -1))[0][1])
    prob_on_set = np.array(prob_on_set)

    # smooth probability:
    prob_on_set = smooth_moving_avg(prob_on_set, smooth_window / st_step)

    # Step 4A: detect onset frame indices:
    prog_on_set_sort = np.sort(prob_on_set)

    # find probability Threshold as a weighted average
    # of top 10% and lower 10% of the values
    nt = int(prog_on_set_sort.shape[0] / 10)
    threshold = (np.mean((1 - weight) * prog_on_set_sort[0:nt]) +
                 weight * np.mean(prog_on_set_sort[-nt::]))

    max_indices = np.where(prob_on_set > threshold)[0]
    # get the indices of the frames that satisfy the thresholding
    index = 0
    seg_limits = []
    time_clusters = []

    # Step 4B: group frame indices to onset segments
    while index < len(max_indices):
        # for each of the detected onset indices
        cur_cluster = [max_indices[index]]
        if index == len(max_indices) - 1:
            break
        while max_indices[index + 1] - cur_cluster[-1] <= 2:
            cur_cluster.append(max_indices[index + 1])
            index += 1
            if index == len(max_indices) - 1:
                break
        index += 1
        time_clusters.append(cur_cluster)
        seg_limits.append([cur_cluster[0] * st_step,
                           cur_cluster[-1] * st_step])

    # Step 5: Post process: remove very small segments:
    min_duration = 0.2
    seg_limits_2 = []
    for s_lim in seg_limits:
        if s_lim[1] - s_lim[0] > min_duration:
            seg_limits_2.append(s_lim)

    return seg_limits_2
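# Hypothetical usage sketch for silence_removal() above (the wav path and window
# settings are assumptions, not from the original source): read a mono recording
# and print the detected non-silent segments.
from scipy.io import wavfile

fs, x = wavfile.read("speech.wav")          # assumed mono input file
segments = silence_removal(x, fs, st_win=0.05, st_step=0.05,
                           smooth_window=1.0, weight=0.3)
for start, end in segments:
    print("voiced segment: %.2f s - %.2f s" % (start, end))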
def predictSVM(svm, x, y, x_test):
    print("[SVM] Testing model...")
    yPredito = svm.predict(x_test)
    return yPredito, svm.predict_proba(x_test)[:, 1]
for col in cat:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])

# no shirts, no shoes
train_X = train.drop(['year', 'oscar', 'movie_name', 'actor_name', 'href'], axis=1)
test_X = test.drop(['year', 'oscar', 'movie_name', 'actor_name', 'href'], axis=1)
train_Y = train['oscar']

# Fights will go on as long as they want to
# probability=True is required for predict_proba below (note: this rebinds the
# `svm` module name to the fitted classifier)
svm = svm.SVC(kernel='rbf', C=1, probability=True).fit(train_X, train_Y)
svm.score(train_X, train_Y)

# If this is your first night at Fight Club, you have to fight.
pred_svm = svm.predict_proba(test_X)[:, 1]
svm_prediction = pd.DataFrame(pred_svm, test['movie_name'])
def predict_svm(svm, hog_window):
    """Return the confidence of classifying as a car."""
    return svm.predict_proba(hog_window.reshape(1, -1))[:, 1]
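# Hypothetical usage sketch for predict_svm() above (random patches stand in for
# real car/non-car windows; the HOG parameters are assumptions, not from the
# original source).
import numpy as np
from skimage.feature import hog
from sklearn.svm import SVC

rng = np.random.default_rng(0)
# toy training set: HOG descriptors of random 64x64 patches with random labels
feats = np.array([hog(rng.random((64, 64)), pixels_per_cell=(8, 8),
                      cells_per_block=(2, 2)) for _ in range(20)])
labels = rng.integers(0, 2, 20)
car_svm = SVC(probability=True).fit(feats, labels)

hog_window = hog(rng.random((64, 64)), pixels_per_cell=(8, 8), cells_per_block=(2, 2))
print("car confidence:", predict_svm(car_svm, hog_window)[0])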
datax = scaler.transform(datax_temp)
xgb = pickle.load(open("xgboost.dat", "rb"))
svm = pickle.load(open("svm.dat", "rb"))
lr = pickle.load(open("lr.dat", "rb"))
randomforest = pickle.load(open("randomforest.dat", "rb"))

xgb_pred = pd.DataFrame(xgb.predict_proba(datax)[:, 1])
rf_pred = pd.DataFrame(randomforest.predict_proba(datax)[:, 1])
temp = pd.concat([xgb_pred, rf_pred], axis=1)
temp['avg'] = temp.mean(axis=1)

combined_df = pd.concat([
    pd.DataFrame(data_parent[['subject_id', 'datetime']]),
    pd.DataFrame(datax_temp),
    temp['avg']
], axis=1)
combined_df['patient_category'] = combined_df.apply(f, axis=1)
# critical_patients = combined_df.loc[combined_df['patient_category'].isin(['very-critical', 'critical', 'moderate-critical'])]
combined_df.to_csv('critical_patients_records.csv')

xgb_prob = pd.DataFrame(xgb.predict_proba(datax))
svm_prob = pd.DataFrame(svm.predict_proba(datax))
lr_prob = pd.DataFrame(lr.predict_proba(datax))
rf_prob = pd.DataFrame(randomforest.predict_proba(datax))
final_df = pd.concat([xgb_prob, rf_prob, lr_prob, svm_prob], axis=1)
final_df.to_csv("prob_predictions_model-level.csv")
lenzip = 0
count = 0
for func, name in zip([triangle(width=img_size, height=img_size),
                       rectangle(width=img_size, height=img_size),
                       trapazoid(width=img_size, height=img_size),
                       rhombus(width=img_size, height=img_size)],
                      ['triangle', 'rectangle', 'trapazoid', 'rhombus']):
    print(name)
    for shape in func:
        # print("tick")
        # FIXME: failed, fix this!
        intersect_list = radial_intercepts(shape, img_size)
        intersect_list = np.asarray(scaler.transform(intersect_list))
        svm_prediction = svm.predict(intersect_list)
        # dtree_prediction = dtree.predict(intersect_list)
        if svm_prediction != name:
            prob = svm.predict_proba(intersect_list)
            svm_errors.append([name, svm_prediction, shape, prob, intersect_list])
        else:
            count += 1
'''
img = cv2.imread('/home/lie/Desktop/p/cut1.jpg')
letter = cv2.Canny(img, 100, 200, apertureSize=3)
contours, hierarchy = cv2.findContours(letter.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
cnts = []
for cnt in contours:
    cnt = cv2.approxPolyDP(cnt, .05, True)
    if cv2.contourArea(cnt) > 2400:
        cnts.append(cnt)
test_est_p_ann = ann.predict_proba(x_test)[:, 1]
train_est_p_ann = ann.predict_proba(x_train)[:, 1]
fpr_test_ann, tpr_test_ann, th_test_ann = metrics.roc_curve(y_test, test_est_p_ann, pos_label=1)
fpr_train_ann, tpr_train_ann, th_train_ann = metrics.roc_curve(y_train, train_est_p_ann, pos_label=1)

# build an SVM model
from sklearn import svm
svm = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
              decision_function_shape=None, degree=3, gamma='auto',
              kernel='linear', max_iter=-1, probability=True,
              random_state=None, shrinking=True, tol=0.001, verbose=False)
svm.fit(x_train, y_train)
test_est_svm = svm.predict(x_test)
train_est_svm = svm.predict(x_train)
test_est_p_svm = svm.predict_proba(x_test)[:, 1]
train_est_p_svm = svm.predict_proba(x_train)[:, 1]
fpr_test_svm, tpr_test_svm, th_test_svm = metrics.roc_curve(y_test, test_est_p_svm, pos_label=1)
fpr_train_svm, tpr_train_svm, th_train_svm = metrics.roc_curve(y_train, train_est_p_svm, pos_label=1)

# build a random forest model
# (note: cross_validation / grid_search / externals.joblib are pre-0.20 sklearn module paths)
import json
from operator import itemgetter
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split, StratifiedShuffleSplit, StratifiedKFold

clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=5,
                             min_samples_split=1,
def computeLikelihood(patch):
    # reshape to a 2D row vector for sklearn, regardless of the shape
    # returned by hog.compute()
    X = hog.compute(patch).reshape(1, -1)
    prediction = svm.predict_proba(X)
    return prediction[0][1]
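# Hypothetical usage sketch for computeLikelihood() above (the module-level
# `hog` descriptor and `svm` classifier are assumptions about the surrounding
# script, not from the original source).
import cv2
import numpy as np
from sklearn.svm import SVC

hog = cv2.HOGDescriptor()                      # default 64x128 detection window
rng = np.random.default_rng(0)
patches = [rng.integers(0, 256, (128, 64), dtype=np.uint8) for _ in range(10)]
feats = np.vstack([hog.compute(p).reshape(1, -1) for p in patches])
svm = SVC(probability=True).fit(feats, rng.integers(0, 2, 10))

print(computeLikelihood(patches[0]))           # probability of the positive class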
dt.fit(X_train, y_train)
y_pred_dt = dt.predict_proba(X_test)[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred_dt)
precision_dt, recall_dt, _ = precision_recall_curve(y_test, y_pred_dt)
roc_auc_dt = auc(fpr_dt, tpr_dt)

mlp = MLPClassifier()
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict_proba(X_test)[:, 1]
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, y_pred_mlp)
precision_mlp, recall_mlp, _ = precision_recall_curve(y_test, y_pred_mlp)
roc_auc_mlp = auc(fpr_mlp, tpr_mlp)

svm = svm.SVC(probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict_proba(X_test)[:, 1]
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_pred_svm)
precision_svm, recall_svm, _ = precision_recall_curve(y_test, y_pred_svm)
roc_auc_svm = auc(fpr_svm, tpr_svm)

sgd = SGDClassifier(loss='log')
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict_proba(X_test)[:, 1]
fpr_sgd, tpr_sgd, _ = roc_curve(y_test, y_pred_sgd)
precision_sgd, recall_sgd, _ = precision_recall_curve(y_test, y_pred_sgd)
roc_auc_sgd = auc(fpr_sgd, tpr_sgd)

gb = GaussianNB()
gb.fit(X_train, y_train)
y_pred_gb = gb.predict_proba(X_test)[:, 1]
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_pred_gb)
Tree_proba1 = Tree_clf.predict_proba(X)
print("the f1 score of decision tree in the test data = ", f1_score(Y, Tree_pred1, average='weighted'))
print("the jaccard score of the decision tree in the test data = ", jaccard_score(Y, Tree_pred1, average='weighted'))
print("the log loss for decision tree in the test dataset = ", log_loss(Y, Tree_proba1))
f_lst.append(f1_score(Y, Tree_pred1, average='weighted'))
j_lst.append(jaccard_score(Y, Tree_pred1, average='weighted'))
l_lst.append(log_loss(Y, Tree_proba1))
n_lst.append('Decision Tree')

# SVM for the test data
SVM_pred1 = svm.predict(X)
SVM_proba = svm.predict_proba(X)
print("the f1 score for the svm in test data = ", f1_score(Y, SVM_pred1, average='weighted'))
print("the jaccard score for the svm in the test data = ", jaccard_score(Y, SVM_pred1, average="weighted"))
print("the log loss for SVM in the test dataset = ", log_loss(Y, SVM_proba))
f_lst.append(f1_score(Y, SVM_pred1, average='weighted'))
j_lst.append(jaccard_score(Y, SVM_pred1, average='weighted'))
l_lst.append(log_loss(Y, SVM_proba))
n_lst.append('SVM')

# Final Report
Report = pd.DataFrame(columns=['Algorithm', 'Jaccard', 'F1-Score', 'LogLoss'])
Report['Algorithm'] = n_lst
Report['Jaccard'] = j_lst
                              max_depth=3, random_state=57)
bayes = GaussianNB(priors=[0.25, 0.25, 0.25, 0.25])
svm = svm.SVC(probability=True, C=0.01, gamma=1, random_state=1289)

last_clf.fit(meta_train_data, meta_train_true_labels)
forest.fit(train_input, train_data['true_class'])
bayes.fit(train_input, train_data['true_class'])
# bayes.fit(train_input_iid, train_data['true_class'])
svm.fit(train_input, train_data['true_class'])

test_proba_forest = forest.predict_proba(test_input)
test_proba_bayes = bayes.predict_proba(test_input)
# test_proba_bayes = bayes.predict_proba(test_input_iid)
test_proba_svm = svm.predict_proba(test_input)
test_proba = np.concatenate((test_proba_forest, test_proba_bayes, test_proba_svm), axis=1)
# test_proba = np.concatenate((test_proba_forest, test_proba_svm), axis=1)
# test_proba = (test_proba_forest + test_proba_svm + test_proba_bayes)/3

pred = last_clf.predict(test_proba)
differ = abs(pred - test_data['true_class'])
accu = 1 - np.count_nonzero(differ) / test_data.shape[0]
print("Stacked accuracy:")
print(accu)
print("Stacked confusion matrix")
def score(svm, datax, datay):
    return np.mean(svm.predict(datax) == datay)


print("Linearly separable")
# linearly separable with a bit of noise
datax, datay = gen_arti(nbex=1000, data_type=0, epsilon=1)
testx, testy = gen_arti(nbex=1000, data_type=0, epsilon=1)

# linear kernel with default parameters
svm = sklearn.svm.SVC(probability=True, kernel='linear')
svm.fit(datax, datay)
plot_frontiere_proba(datax, lambda x: svm.predict_proba(x)[:, 0], step=50)
plot_data(datax, datay)
plt.show()
print("Default parameters: ", score(svm, testx, testy))

# linear kernel with a very large C
svm = sklearn.svm.SVC(probability=True, kernel='linear', C=99)
svm.fit(datax, datay)
plot_frontiere_proba(datax, lambda x: svm.predict_proba(x)[:, 0], step=50)
plot_data(datax, datay)
plt.show()
print("Large C: ", score(svm, testx, testy))

print("Non-linear")
# non-linearly separable with a bit of noise
                            max_leaf_nodes=None, min_samples_leaf=1,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            oob_score=False, verbose=0, warm_start=False)
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_test)
rf_prob = rf.predict_proba(x_test)

# SVM
svm = svm.SVC(gamma='scale', C=30000, probability=True)
svm.fit(x_train, y_train)
svm_pred = svm.predict(x_test)
svm_prob = svm.predict_proba(x_test)

# Gaussian Naive Bayes
gnb = GaussianNB()
gnb.fit(x_train, y_train)
gnb_pred = gnb.predict(x_test)
gnb_prob = gnb.predict_proba(x_test)

# Quadratic Discriminant Analysis
qda = QuadraticDiscriminantAnalysis()
qda = qda.fit(x_train, y_train)
qda_pred = qda.predict(x_test)
qda_prob = qda.predict_proba(x_test)

# Gaussian Process
# gp = GaussianProcessClassifier(1.0 * RBF(1.0))
    df_data['collection'], test_size=0.1, random_state=1)

# vectorize
vectorizer = TfidfVectorizer()
X_train_v = vectorizer.fit_transform(X_train)

# svm
svm = svm.SVC(C=1000, gamma='auto', probability=True)
svm.fit(X_train_v, y_train)

# test model
X_test_v = vectorizer.transform(X_test)
y_pred = svm.predict(X_test_v)
y_pred_proba = svm.predict_proba(X_test_v)  # ham probability

# summary
pp.pprint(confusion_matrix(y_test, y_pred))
score = svm.score(X_test_v, y_test)
print(score)  # 98%

# save results
df_results = pd.DataFrame.from_dict({
    'X_test': X_test,
    'y_test': y_test,
    'y_pred': y_pred,
    'y_pred_proba':
    output_class = np.array(classif.predict(prediction_features))
    return output_class

#%%
'''Cell 5: Train and Evaluate model on Test'''
y_pred = train_and_predict(X_train, y_train, X_test)
if y_pred is not None:
    print(metrics.accuracy_score(y_test, y_pred))

#%%
'''Cell 6: ROC Curve, AUC, Confusion Matrix'''
# predict probabilities for X_test using predict_proba
probabilities = svm.predict_proba(X_test)
# select the probabilities for label 1.0
y_proba = probabilities[:, 1]
# calculate false positive rate and true positive rate at different thresholds
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_proba, pos_label=1)
# calculate AUC
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.title('Receiver Operating Characteristic')
# plot the false positive rate on the x axis and the true positive rate on the y axis
roc_plot = plt.plot(false_positive_rate,
# logo = StratifiedKFold(n_splits=10)
fold = 1
for train_index, val_index in logo.split(train_input, train_true_class, train_group_id):
    X_train, X_test = train_input[train_index], train_input[val_index]
    y_train, y_test = train_true_class[train_index], train_true_class[val_index]
    # fit base classifiers on training data
    # and get predictions for test data
    forest.fit(X_train, y_train)
    pred_f = forest.predict_proba(X_test)
    bayes.fit(X_train, y_train)
    pred_b = bayes.predict_proba(X_test)
    svm.fit(X_train, y_train)
    pred_s = svm.predict_proba(X_test)
    # stack the base-model probabilities as meta-features
    pred_features = np.concatenate((pred_f, pred_b, pred_s), axis=1)
    # pred_features = np.concatenate((pred_f, pred_s), axis=1)
    # pred_features = (pred_f + pred_s + pred_b)/3
    train_pred_X.append(pred_features)
    train_true_labels.append(y_test)
    train_groups.append(train_group_id[val_index])
    print("Fold " + str(fold) + " done")
    fold = fold + 1

train_level_one_data = np.concatenate(train_pred_X, axis=0)
train_level_one_labels = np.concatenate(train_true_labels)
train_level_one_groups = np.concatenate(train_groups)

with open('..\saves\meta_train_data_mixed.pickle', 'wb') as f:
results_rect = []
results_label = []
results_proba = []
results_feature = []
for featureIndex, feature in enumerate(features):
    # init decision of current bounding box
    maxProba = -1
    labelInMaxProba = 0
    rectInMaxProba = []
    for svmIndex, svm in enumerate(svms):
        svmClassLabel = svmIndex + 1
        print("---> svm index :", svmClassLabel)
        pred = svm.predict_proba([feature.tolist()])
        probaArr = pred[0]
        # not background
        if (probaArr[0] < probaArr[1]):
            print(" +++ detect object in this childImg.")
            print("proba :", probaArr[1])
            if (probaArr[1] > maxProba):
                print("larger probability appear in this svm model.")
                maxProba = probaArr[1]
                labelInMaxProba = svmClassLabel
                rectInMaxProba = verts[featureIndex]
    # use predict result on max probability as the final result
    if (maxProba > 0):
        results_label.append(labelInMaxProba)
        results_rect.append(rectInMaxProba)
        results_proba.append(maxProba)
lr.fit(X_train1, y_train1)
y_pred1 = lr.predict(X_test1)
print('roc_auc_score:', metrics.roc_auc_score(y_test1, y_pred1))
y_predict_probabilities1 = lr.predict_proba(X_test1)[:, 1]
fpr1, tpr1, _ = roc_curve(y_test1, y_predict_probabilities1)
roc_auc1 = auc(fpr1, tpr1)

svm = svm.SVC(kernel='linear', C=10, probability=True)
svm.fit(X_train1, y_train1)
y_pred2 = svm.predict(X_test1)
print('roc_auc_score:', metrics.roc_auc_score(y_test1, y_pred2))
y_predict_probabilities2 = svm.predict_proba(X_test1)[:, 1]
fpr2, tpr2, _ = roc_curve(y_test1, y_predict_probabilities2)
roc_auc2 = auc(fpr2, tpr2)

data2 = pd.read_csv('English (unnormalized).csv')
y2 = data2['y']
x2 = data2.drop(labels=['y'], axis=1)
cv2 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1)
for train_index, test_index in cv2.split(x2, y2):
    X_train2, X_test2, y_train2, y_test2 = x2.iloc[train_index], x2.iloc[test_index], y2.iloc[train_index], y2.iloc[test_index]

dt = DecisionTreeClassifier(random_state=10, max_depth=5, min_samples_leaf=1, max_features=0.8)
dt.fit(X_train2, y_train2)
y_pred3 = dt.predict(X_test2)
    'time': time_list
})
resultDf['trueMove'] = resultDf['trueMove'].astype(int)
resultDf['equal'] = (resultDf.predictionSVM == resultDf.trueMove.astype(int))

print('--------------Plot ROC-AUC --------------')
from sklearn import metrics

print("NB Accuracy", metrics.accuracy_score(resultDf.trueMove, pred_list_NB))
plt.figure(figsize=(9, 7))

# note: roc_curve / roc_auc_score expect the ground-truth labels
# (e.g. resultDf.trueMove) as the first argument, not the model's predictions
y_pred_proba = nb.predict_proba(X_test_array)[::, 1]
fpr, tpr, _ = metrics.roc_curve(pred_list_NB, y_pred_proba)
auc = metrics.roc_auc_score(pred_list_NB, y_pred_proba)
plt.plot(fpr, tpr, label="NB auc=" + str('% 6.3f' % auc))

y_pred_proba2 = knn.predict_proba(X_test_array)[::, 1]
fpr, tpr, _ = metrics.roc_curve(pred_list_KNN, y_pred_proba2)
auc2 = metrics.roc_auc_score(pred_list_KNN, y_pred_proba2)
plt.plot(fpr, tpr, label="KNN auc=" + str('% 6.3f' % auc2))

y_pred_proba3 = svm.predict_proba(X_test_array)[::, 1]
fpr, tpr, _ = metrics.roc_curve(pred_list_SVM, y_pred_proba3)
auc3 = metrics.roc_auc_score(pred_list_SVM, y_pred_proba3)
plt.plot(fpr, tpr, label="SVM auc=" + str('% 6.3f' % auc3))

plt.xlabel("false positive")
plt.ylabel("true positive")
plt.legend(loc=4)
plt.show()
svm.fit(X_train, y_train)

# Save SVM object
pickle_dump(svm, path='./checkpoints/SVM_Model')

################### Evaluate Model ###################
# generate predictions
y_pred = svm.predict(X_valid)
# calculate accuracy
accuracy = accuracy_score(Y_valid, y_pred)
print('Model accuracy is: ', accuracy)

################### ROC curve & AUC ###################
# predict probabilities for X_valid using predict_proba
probabilities = svm.predict_proba(X_valid)
# select the probabilities for label 1.0
y_proba = probabilities[:, 1]
# calculate false positive rate and true positive rate at different thresholds
false_positive_rate, true_positive_rate, thresholds = roc_curve(Y_valid, y_proba, pos_label=1)
# calculate AUC
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.title('Receiver Operating Characteristic')
# plot the false positive rate on the x axis and the true positive rate on the y axis
roc_plot = plt.plot(false_positive_rate,