def train(self, X, y, val_size=0.2):
    """Fit ``self.model`` and print training (and optional validation) scores.

    Args:
        X: Feature matrix.
        y: Target values.
        val_size: Share (or count) of samples held out for validation, passed
            to ``train_test_split`` as ``test_size``. ``0`` disables the
            hold-out and trains on all the data.

    Regressors (``self.model_type`` ending in ``"_Regressor"``) are scored with
    MSE and R-squared; everything else gets a classification report.
    """
    from sklearn.model_selection import train_test_split

    has_validation = val_size != 0
    if has_validation:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=val_size)
    else:
        # train_test_split rejects an empty test split, so skip it entirely.
        X_train, y_train = X, y
        X_val = y_val = None

    self.model.fit(X_train, y_train)

    splits = [("Training Scores:", X_train, y_train)]
    if has_validation:
        splits.append(("Validation Scores:", X_val, y_val))

    if self.model_type.split("_")[-1] == "Regressor":
        from sklearn.metrics import mean_squared_error as mse
        from sklearn.metrics import r2_score as r2

        for title, features, target in splits:
            predicted = self.model.predict(features)
            print(title)
            print("MSE : " + str(mse(target, predicted)))
            print("R-Squared-Score : " + str(r2(target, predicted)))
    else:
        from sklearn.metrics import classification_report as cr

        for title, features, target in splits:
            predicted = self.model.predict(features)
            print(title)
            # Previously mislabelled as "MSE".
            print("Classification Report :\n" + str(cr(target, predicted)))
def compute_prf(predictions, true_labels, class_label):
    """Print a classification report using label names instead of ids.

    Args:
        predictions: Sequence of predicted label ids.
        true_labels: Sequence of gold label ids, aligned with ``predictions``.
        class_label: Mapping from label name to label id.

    Conversion stops at the first id that is missing from ``class_label``
    (the original best-effort behaviour), but both lists are now always kept
    the same length so the report itself cannot fail on a half-added pair.
    """
    reverse_label = {index: name for name, index in class_label.items()}

    new_predictions = []
    new_true_labels = []
    for predicted, expected in zip(predictions, true_labels):
        try:
            predicted_name = reverse_label[predicted]
            expected_name = reverse_label[expected]
        except (KeyError, TypeError):
            # Unknown (or unhashable) id: keep what was converted so far.
            break
        new_predictions.append(predicted_name)
        new_true_labels.append(expected_name)

    print(cr(new_true_labels, new_predictions, digits=3))
def classify(model, featureVectors):
    """Print a classification report for ``model`` on ``featureVectors``.

    Args:
        model: Fitted estimator exposing ``predict``.
        featureVectors: 2-D array whose last column holds the true class id
            and whose remaining columns are the features.
    """
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin ``int`` is equivalent.
    predicted = (model.predict(featureVectors[:, :-1])
                 .astype(int).reshape(-1).tolist())
    actual = featureVectors[:, -1].flatten().astype(int).tolist()
    print(cr(actual, predicted))
def classify(model, featureVectors):
    """Print a labelled classification report and confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        featureVectors: 2-D array whose last column holds the true class id
            and whose remaining columns are the features.
    """
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin ``int`` is equivalent.
    predicted = (model.predict(featureVectors[:, :-1])
                 .astype(int).reshape(-1).tolist())
    actual = featureVectors[:, -1].flatten().astype(int).tolist()

    labels = ['DOS', 'Normal', 'Probing', 'R2L', 'U2R']
    print(cr(actual, predicted, target_names=labels, digits=4))
    print_cm(confusion_matrix(actual, predicted), labels)
def fit(self, X, y, max_epochs=100):
    """Train the two-layer network on (X, y), printing a report per epoch.

    Args:
        X: 2-D array of training samples (``X.shape`` is unpacked into
            sample and feature counts).
        y: Training labels; binarized with ``LabelBinarizer`` before training.
        max_epochs: Number of passes over the training data.

    NOTE(review): the source was flattened, so the nesting of the per-epoch
    report was reconstructed — confirm it belongs inside the epoch loop.
    NOTE(review): the SGD branch still evaluates on the module-level
    ``xtest`` / ``ytest``; consider passing validation data explicitly.
    """
    print("Training is started")
    # Keep the raw labels: the gradient-descent report compares predictions
    # against them (the old code read an undefined global ``y1``).
    y_labels = y

    # uniform labels
    self.lb = LabelBinarizer()
    y = self.lb.fit_transform(y)

    # get all sizes
    n_samples, n_features = X.shape
    self.n_outs = y.shape[1]

    # initialize weights, scaled by the fan-in of each layer
    nO = np.sqrt(n_features)
    nH = np.sqrt(self.n_hidden)
    self.weights1_ = np.random.uniform(-1 / nO, 1 / nO,
                                       size=(n_features, self.n_hidden))
    self.bias1_ = np.zeros(self.n_hidden)
    self.weights2_ = np.random.uniform(-1 / nH, 1 / nH,
                                       size=(self.n_hidden, self.n_outs))
    self.bias2_ = np.zeros(self.n_outs)

    if self.SGD:
        # Stochastic gradient descent: one-sample work buffers.
        x_hidden = np.empty((1, self.n_hidden))
        delta_h = np.empty((1, self.n_hidden))
        x_output = np.empty((1, self.n_outs))
        delta_o = np.empty((1, self.n_outs))

        # A list (not a lazy range) so it can be shuffled in place on
        # both Python 2 and Python 3.
        nrange = list(range(n_samples))
        for it in range(1, max_epochs + 1):
            np.random.shuffle(nrange)
            for j in nrange:
                self._forward(X[j, None], x_hidden, x_output)
                self._backward(X[j, None], y[j, None], x_hidden, x_output,
                               delta_o, delta_h)
            pred = self.predict(xtest)
            print("1: ", cr(ytest, pred))
    else:
        # Batch gradient descent: full-dataset work buffers.
        x_hidden = np.empty((n_samples, self.n_hidden))
        delta_h = np.empty((n_samples, self.n_hidden))
        x_output = np.empty((n_samples, self.n_outs))
        delta_o = np.empty((n_samples, self.n_outs))

        # adjust weights by a forward pass and a backward error propagation
        for i in range(max_epochs):
            self._forward(X, x_hidden, x_output)
            self._backward(X, y, x_hidden, x_output, delta_o, delta_h)
            pred = self.predict(X)
            print("2: ", cr(y_labels, pred))
def classify(model, featureVectors):
    """Classify every row, then print a report and the accuracy percentage.

    Args:
        model: Model object handed to the module-level ``predict`` helper.
        featureVectors: 2-D array whose last column holds the true class id
            and whose remaining columns are the features.
    """
    true = 0
    total = 0
    z = []
    for feature in featureVectors:
        # Predict once per row (the old code ran the model twice).
        predicted = predict(model, feature[:-1])
        if feature[-1] == predicted:
            true += 1
        # ``np.int`` no longer exists in NumPy; builtin ``int`` is equivalent.
        # ``extend`` avoids re-copying the whole list on every row.
        z.extend(predicted.astype(int).tolist())
        total += 1

    data = featureVectors[:, -1].flatten().astype(int).tolist()
    print(cr(data, z))
    print("Accuracy: %s" % ((true * 100) / total))
def classify(model, featureVectors):
    """Classify every row, then print predictions, a report and the accuracy.

    Args:
        model: Model object handed to the module-level ``predict`` helper.
        featureVectors: 2-D array whose last column holds the true class id
            and whose remaining columns are the features.
    """
    true = 0
    total = 0
    z = []
    for feature in featureVectors:
        # Predict once per row (the old code ran the model twice).
        predicted = predict(model, feature[:-1])
        if feature[-1] == predicted:
            true += 1
        # ``np.int`` no longer exists in NumPy; builtin ``int`` is equivalent.
        # ``extend`` avoids re-copying the whole list on every row.
        z.extend(predicted.astype(int).tolist())
        total += 1

    data = featureVectors[:, -1].flatten().astype(int).tolist()
    print(z)
    print(cr(data, z))
    # One formatted call: ``print(a) / b`` is a TypeError on Python 3.
    print("Accuracy :  %s" % ((true * 100) / total))
def print_metrics(self, predicted_output):
    """
    Print some MVP metrics.

    sklearn is used for calculation of all the metric values: the confusion
    matrix cells (true positive, false negative, false positive and true
    negative), accuracy, and the classification report (precision, recall,
    f1-score).

    The actual labels come from ``self.y_test``; the predicted labels are
    passed in after running each algorithm.

    :param predicted_output: Predicted labels
    """
    res = cm(self.y_test, predicted_output)
    # sklearn puts true labels on rows and predictions on columns, with
    # labels sorted, so a binary matrix reads [[tn, fp], [fn, tp]].
    # The old code had the TP and TN cells swapped.
    # NOTE(review): assumes the second sorted label is the positive class.
    tn = res[0][0]
    fp = res[0][1]
    fn = res[1][0]
    tp = res[1][1]
    print("Accuracy: ", acs(self.y_test, predicted_output))
    print("TP: ", tp, ", FN: ", fn, ", FP: ", fp, "TN: ", tn)
    print(cr(self.y_test, predicted_output))
def predictResult(betterN, x_train, y_train, y_test, x_test):
    """Evaluate the shared ``knn`` model and predict the pending CSV sample.

    Args:
        betterN: Neighbour count to refit with. A non-positive value means
            the model was loaded already and is used as-is.
        x_train, y_train: Training data used when refitting.
        y_test, x_test: Hold-out data used for the printed metrics.

    Returns:
        Predictions for the rows of ``/tmp/predict_result.csv``.
    """
    # Load the sample(s) to classify; every column except the result column
    # is a feature.
    pending = pd.read_csv("/tmp/predict_result.csv", header=0)
    feature_columns = pending.columns[(pending.columns != columnResultName)]
    pending_features = np.array(pending[feature_columns])

    # No neighbour count supplied means the model was loaded, not retrained.
    if betterN > 0:
        knn.n_neighbors = betterN
        knn.fit(x_train, y_train)

    test_predictions = knn.predict(x_test)
    print("predicao: a", test_predictions)

    metric_table = (
        ("Matriz de Confusao NB:", cfm),
        ("F1 score NB:", f1s),
        ("Precision score NB:", ps),
        ("Recall score NB:", rs),
        ("Classification Report", cr),
    )
    for heading, metric in metric_table:
        print(heading)
        print(metric(y_test, test_predictions))

    single_prediction = knn.predict(pending_features)
    print("predico unica", int(single_prediction[0]))
    print("predicao unica score")
    print(single_prediction)
    return single_prediction
def check(input_file, gold_file):
    """Score a submission file against the gold labels.

    Args:
        input_file: Path of the submission, one label per line.
        gold_file: Path of the gold labels, one label per line.

    Returns:
        ``(0, 0, score, verdict)`` where ``verdict`` is ``"Accepted"`` or
        ``"Wrong Answer"``.

    Unknown labels are blanked, and the submission is padded or truncated to
    the gold length, before the report is computed.

    NOTE(review): the score is read out of the *text* of the classification
    report by token position, so it depends on the report layout of the
    installed sklearn version — confirm which columns are intended.
    """
    # Context managers close both files (they were leaked before).
    with open(gold_file, 'r') as gold:
        original = [line.strip() for line in gold]
    with open(input_file, 'r') as handle:
        submission = [line.strip() for line in handle]

    target_names = ['type1', 'type2', 'type4', 'type5', 'type3']
    submission = [label if label in target_names else ''
                  for label in submission]

    if len(submission) < len(original):
        submission += [''] * (len(original) - len(submission))
    elif len(submission) > len(original):
        submission = submission[:len(original)]

    tokens = cr(original, submission, digits=4).split()
    ind5 = tokens.index('type5')
    score = tokens[ind5 + 2]
    j = tokens.index('avg')
    precision = tokens[j + 3]

    if float(precision) > 0 and float(score) > 0:
        return 0, 0, (float(score) + float(precision)) * 50, "Accepted"
    return 0, 0, 0, "Wrong Answer"
def optimal_features_scores(model, n):
    """Refit ``model`` on its ``n`` most important features and print scores.

    The model is first fitted on the module-level ``X_train`` / ``y_train``
    to obtain ``feature_importances_``, then refitted on the top ``n``
    columns and evaluated on ``X_test`` / ``y_test``.

    Args:
        model: Estimator exposing ``feature_importances_`` after fitting.
        n: Number of top-ranked features to keep.
    """
    model.fit(X_train, y_train)

    # Rank the features by importance, highest first.
    ranking = pd.DataFrame({
        'feature': X_train.columns.values,
        'importance': model.feature_importances_,
    })
    ranking = ranking.sort_values(by=['importance'], ascending=False)
    top_columns = ranking['feature'].head(n)

    # Retrain and predict on the reduced column set only.
    model.fit(X_train.loc[:, top_columns], y_train)
    y_predict = model.predict(X_test.loc[:, top_columns])

    acc = accuracy_score(y_test, y_predict)
    conf_tree = pd.DataFrame(
        confusion_matrix(y_test, y_predict),
        columns=['Predicted Benign', 'Predicted Malignant'],
        index=['True Benign', 'True Malignant'],
    )
    print(conf_tree, "\n")
    print("Accuracy: ", acc)
    print("\n")

    # Precision / recall / f1 per class.
    print(cr(y_test, y_predict, target_names=['Benign', 'Malignant']))
def predictResult(x_train, y_train, y_test, x_test):
    """Train, persist and evaluate ``randomForest``; predict the pending CSV.

    Args:
        x_train, y_train: Training data.
        y_test, x_test: Hold-out data used for the printed metrics.

    Returns:
        Predictions for the rows of ``/tmp/predict_result.csv``.
    """
    # Load the sample(s) to classify; every column except the result column
    # is a feature. The features are normalized before prediction.
    pending = pd.read_csv("/tmp/predict_result.csv", header=0)
    feature_columns = pending.columns[(pending.columns != columnResultName)]
    pending_features = Normalizer().fit_transform(pending[feature_columns])

    # Fit, save, and evaluate the model as reloaded from disk.
    randomForest.fit(x_train, y_train)
    dump(randomForest, 'randomForest.model')
    randomForestLoaded = load('randomForest.model')

    test_predictions = randomForestLoaded.predict(x_test)
    print("predicao:", test_predictions)

    metric_table = (
        ("Matriz de Confusao LR:", cfm),
        ("F1 score LR:", f1s),
        ("Precision score LR:", ps),
        ("Recall score LR:", rs),
        ("Classification Report", cr),
    )
    for heading, metric in metric_table:
        print(heading)
        print(metric(y_test, test_predictions))

    single_prediction = randomForestLoaded.predict(pending_features)
    print("predico unica", single_prediction)
    return single_prediction
def lrw():
    """Write the classifier, its report and its confusion matrix to the log.

    Uses the module-level ``clfr``, ``y_test``, ``y_``, ``lw`` and ``log``.
    """
    lw(str(clfr) + '\n')
    lw(cr(y_test, y_))
    lw('\n\n')
    lw(str(cm(y_test, y_)))
    lw('\n\n')
    # The old close-then-reopen assigned ``log`` inside the function, which
    # made it a local name, so ``log.close()`` raised UnboundLocalError.
    # Flushing pushes the text out without rebinding the handle.
    # NOTE(review): assumes ``lw`` writes to ``log`` — confirm.
    log.flush()
def predictResult(x_train, y_train, y_test, x_test):
    """Cross-validate, train, persist and evaluate ``logisticR``.

    Prints the metrics, shows a confusion-matrix heatmap and a ROC curve,
    then predicts the rows of ``/tmp/predict_result.csv``.

    Args:
        x_train, y_train: Training data.
        y_test, x_test: Hold-out data used for metrics and plots.

    Returns:
        Predictions for the rows of ``/tmp/predict_result.csv``.
    """
    # Load the sample(s) to classify; every column except the result column
    # is a feature. The features are normalized before prediction.
    pending = pd.read_csv("/tmp/predict_result.csv", header=0)
    feature_columns = pending.columns[(pending.columns != columnResultName)]
    pending_features = Normalizer().fit_transform(pending[feature_columns])

    scores = cross_val_score(logisticR, x_train, y_train, n_jobs=30)
    print("scores cross val")
    print(scores)

    # Fit, save, and evaluate the model as reloaded from disk.
    logisticR.fit(x_train, y_train)
    dump(logisticR, 'logistic.model')
    logisticLoaded = load('logistic.model')

    test_predictions = logisticLoaded.predict(x_test)
    print("predicao:", test_predictions)

    metric_table = (
        ("Matriz de Confusao LR:", cfm),
        ("F1 score LR:", f1s),
        ("Precision score LR:", ps),
        ("Recall score LR:", rs),
        ("Classification Report", cr),
        ("Accuracy score", asc),
    )
    for heading, metric in metric_table:
        print(heading)
        print(metric(y_test, test_predictions))

    # Confusion-matrix heatmap.
    class_names = [0, 1]  # name of classes
    fig, ax = plt.subplots()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names)
    plt.yticks(tick_marks, class_names)
    confusion_frame = pd.DataFrame(cfm(y_test, test_predictions))
    sns.heatmap(confusion_frame, annot=True, cmap="YlGnBu", fmt='g')
    ax.xaxis.set_label_position("top")
    plt.tight_layout()
    plt.title('Confusion matrix', y=1.1)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.show()

    # ROC curve from the second column of predict_proba.
    y_pred_proba = logisticLoaded.predict_proba(x_test)[::, 1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
    plt.legend(loc=4)
    plt.show()

    single_prediction = logisticLoaded.predict(pending_features)
    print("predico unica", single_prediction)
    return single_prediction
def describe_video_data(video_data, answer, n=3):
    """Compare MTurk, citizen and researcher labels for every video.

    Args:
        video_data: Mapping of video id to a record; forwarded to
            ``add_system_labels`` together with ``answer``.
        answer: Forwarded to ``add_system_labels``.
        n: Forwarded to ``find_most_common`` when collapsing each record's
            ``"mturk"`` entry.

    Returns:
        Dict with three Cohen's kappa scores and two classification-report
        dicts (researcher labels are the reference).
    """
    video_data = add_system_labels(video_data, answer)

    # Collapse each video's MTurk entry in place.
    for record in video_data.values():
        record["mturk"] = find_most_common(record["mturk"], n=n)

    df_v = pd.DataFrame.from_dict(video_data, orient="index")
    mturk = df_v["mturk"]
    citizen = df_v["citizen"]
    researcher = df_v["researcher"]

    return {
        "Cohen's kappa (mturk and citizen)": cks(mturk, citizen),
        "Cohen's kappa (mturk and researcher)": cks(mturk, researcher),
        "Cohen's kappa (citizen and researcher)": cks(citizen, researcher),
        "Citizen data performance": cr(researcher, citizen,
                                       output_dict=True),
        "MTurk data performance": cr(researcher, mturk, output_dict=True),
    }
def logistic_regression(X, X1, Y, Y1):
    """Fit a liblinear logistic regression and print its test metrics.

    Args:
        X: Training features.
        X1: Test features.
        Y: Training labels.
        Y1: Test labels (must support ``.unique()``).

    Returns:
        The fitted ``LogisticRegression``. The test accuracy is also appended
        to the module-level ``accuracy_models`` list.
    """
    global accuracy_models
    lr = LogisticRegression(multi_class='auto', solver='liblinear')
    lr.fit(X, Y)
    pred = lr.predict(X1)

    accuracy = accuracy_score(Y1, pred)
    print(accuracy)
    accuracy_models.append(accuracy)

    # ``labels`` is keyword-only in current sklearn releases.
    cd = confusion_matrix(Y1, pred,
                          labels=list(range(len(Y1.unique()))))
    print(cd)
    # Report against the labels that were passed in, not the global Y_test.
    print(cr(Y1, pred))
    return lr
def trainModel(classifier, lTrX, lTrY, lTeX, lTeY, is_neural_net=False):
    """Fit ``classifier``, predict the validation set, return the accuracy.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        lTrX, lTrY: Training features and labels.
        lTeX, lTeY: Validation features and labels.
        is_neural_net: When true, predictions are reduced with ``argmax``
            over the last axis and no report is printed.

    Returns:
        ``accuracy_score`` of the predictions against ``lTeY``.

    NOTE(review): the source was flattened; the report is assumed to belong
    to the non-neural-net branch only.
    """
    classifier.fit(lTrX, lTrY)
    lPreds = classifier.predict(lTeX)

    if not is_neural_net:
        label_names = [str(label) for label in dictIdToLab.values()]
        print(cr(lTeY, lPreds, target_names=label_names))
    else:
        lPreds = lPreds.argmax(axis=-1)

    return accuracy_score(lPreds, lTeY)
def classifier(file_name):
    """Cross-validate a linear one-vs-rest SVC on bag-of-words features.

    Args:
        file_name: Input forwarded to ``bag_of_words``, which returns the
            feature matrix and the rating vector.

    Prints accuracy, precision, recall, the classification report and the
    confusion matrix of the 10-fold cross-validated predictions.
    """
    features, ratings = bag_of_words(file_name)

    # support vector classifier, one vs all
    svc_options = dict(C=1, kernel='linear', gamma=1, verbose=False,
                       probability=False, decision_function_shape='ovr')
    clf = SVC(**svc_options)
    # NOTE(review): this fit looks redundant if cross_val_predict refits a
    # clone per fold — confirm before removing.
    clf.fit(features, ratings)

    predicted = cv.cross_val_predict(clf, features, ratings, cv=10)

    # calculation of metrics
    print("accuracy_score\t", acc_score(ratings, predicted))
    print("precision_score\t", pre_score(ratings, predicted))
    print("recall_score\t", rc_score(ratings, predicted))
    print("\nclassification_report:\n\n", cr(ratings, predicted))
    print("\nconfusion_matrix:\n", cm(ratings, predicted))
def random_forest_classifier(X, X1, Y, Y1):
    """Fit a 200-tree random forest and print its test metrics.

    Args:
        X: Training features.
        X1: Test features.
        Y: Training labels.
        Y1: Test labels (must support ``.unique()``).

    Returns:
        The fitted ``RandomForestClassifier``. The test accuracy is also
        appended to the module-level ``accuracy_models`` list.
    """
    global accuracy_models
    rfc = RandomForestClassifier(
        criterion='gini',
        n_estimators=200,
        random_state=0,
        max_leaf_nodes=1000,
    )
    rfc.fit(X, Y)
    pred = rfc.predict(X1)

    accuracy = accuracy_score(Y1, pred)
    print(accuracy)
    accuracy_models.append(accuracy)

    # ``labels`` is keyword-only in current sklearn releases.
    cd = confusion_matrix(Y1, pred,
                          labels=list(range(len(Y1.unique()))))
    print(cd)
    # Report against the labels that were passed in, not the global Y_test.
    print(cr(Y1, pred))
    return rfc
def classification_report(y_true, y_pred, **kwargs):
    """
    Classification report for sequence labeling.

    Parameters
    ----------
    y_true: 2D np.array or list of lists
        The true/gold sequence labels.
    y_pred: 2D np.array or list of lists
        The predicted sequence labels.
    **kwargs:
        Forwarded unchanged to the underlying report function ``cr``.

    Returns
    -------
    Whatever ``cr`` returns for the flattened label sequences.
    """
    flat_true, flat_pred = flatten(y_true, y_pred)
    report = cr(flat_true, flat_pred, **kwargs)
    return report
def main(data_path='LabelledData (1).txt', cut=500):
    """Train a linear SVM question classifier and print its report.

    Args:
        data_path: Labelled data file with ``question ,,, type`` rows.
        cut: Number of leading rows used for training; the remaining rows
            are used for testing.
    """
    # -- get the data --#
    dfTrain = pd.read_csv(data_path, sep=' ,,, ', header=None,
                          engine='python')
    dfTrain.columns = ['ques', 'type']
    questions = dfTrain.ques.to_list()
    labels = dfTrain.type.to_list()

    # Simple positional split: first ``cut`` rows train, the rest test.
    trainData, testData = questions[:cut], questions[cut:]
    trainLabels, testLabels = labels[:cut], labels[cut:]

    # -- vectorizing and training --#
    vectorizer = TfidfVectorizer(min_df=4, max_df=0.9)
    trainVectors = vectorizer.fit_transform(trainData)
    testVectors = vectorizer.transform(testData)

    # -- performing the classification --#
    model = svm.SVC(kernel='linear')
    model.fit(trainVectors, trainLabels)
    prediction = model.predict(testVectors)
    print(cr(testLabels, prediction))
def benchmark(self, clf, X_train, y_train, X_test, y_test):
    """Fit ``clf``, time it, and report its scores on the test set.

    Args:
        clf: Estimator exposing ``fit`` and ``predict`` (``predict_proba``
            is used when available).
        X_train, y_train: Training data.
        X_test, y_test: Test data.

    Returns:
        ``(name, precision_avg, recall_avg, f_measure_avg, train_time,
        test_time, proba)`` where ``proba`` is ``None`` when the classifier
        cannot produce probabilities.
    """
    output(80 * '_')

    # fit
    output("Training:")
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    output("train time: %0.3fs" % train_time)

    # predict
    t0 = time()
    pred = clf.predict(X_test)
    try:
        proba = clf.predict_proba(X_test)
    except Exception:
        # Probabilities are optional; a bare ``except`` would also have
        # swallowed KeyboardInterrupt and SystemExit.
        proba = None
    test_time = time() - t0
    output("test time: %0.3fs" % test_time)

    # get metrics, averaged over the classes
    p_scores, r_scores, f_measures, _support = get_scores(
        y_test, pred, self.beta)
    p_score_avg = p_scores.mean()
    r_score_avg = r_scores.mean()
    f_measure_avg = f_measures.mean()
    output("precision: %0.3f \trecall: %0.3f" % (p_score_avg, r_score_avg))

    # output results
    output("Classification results:")
    output(cr(y_test, pred))
    output(cm(y_test, pred))

    # name of the classifier, taken from its repr()
    clf_descr = str(clf).split('(')[0]
    return (clf_descr, p_score_avg, r_score_avg, f_measure_avg,
            train_time, test_time, proba)
def k_nearest_neighbors(X, X1, Y, Y1):
    """Pick k in 3..10 by 5-fold CV, fit k-NN, and print its test metrics.

    Args:
        X: Training features.
        X1: Test features.
        Y: Training labels.
        Y1: Test labels (must support ``.unique()``).

    Returns:
        The fitted ``KNeighborsClassifier``. The test accuracy is also
        appended to the module-level ``accuracy_models`` list.
    """
    global accuracy_models
    nn = range(3, 11)

    # Mean 5-fold cross-validated accuracy for every candidate k.
    cv_scores = []
    for k in nn:
        knn = sk.neighbors.KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X, Y, cv=5, scoring='accuracy')
        cv_scores.append(scores.mean())

    # First k with the best score wins (same tie-break as before).
    optimal_k = nn[cv_scores.index(max(cv_scores))]
    knn = sk.neighbors.KNeighborsClassifier(n_neighbors=optimal_k)
    knn.fit(X, Y)
    pred = knn.predict(X1)

    accuracy = accuracy_score(Y1, pred)
    print(accuracy)
    accuracy_models.append(accuracy)

    # ``labels`` is keyword-only in current sklearn releases.
    cd = confusion_matrix(Y1, pred,
                          labels=list(range(len(Y1.unique()))))
    print(cd)
    # Report against the labels that were passed in, not the global Y_test.
    print(cr(Y1, pred))
    return knn
def evaluate(model, iterator, criterion):
    """Run ``model`` over ``iterator`` without gradients and score it.

    Args:
        model: Network called on ``batch.text[0]``.
        iterator: Iterable of batches exposing ``text`` and ``label``;
            must support ``len``.
        criterion: Loss called as ``criterion(predictions, target)``.

    Returns:
        ``(mean_loss, mean_acc, mean_batch_f1, f1_binary, f1_macro,
        precision_binary)``; the first three are averaged over the batches,
        the last three are computed on all collected labels.
    """
    loss_sum = 0
    acc_sum = 0
    f1_sum = 0
    all_targets = np.array([])
    all_preds = np.array([])

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text[0])

            # The criterion is fed a column vector of labels.
            labels = batch.label
            target = labels.reshape([labels.shape[0], 1])

            loss = criterion(predictions, target)
            acc, batch_f1, y_mini, pred_mini = binary_accuracy(predictions,
                                                               target)

            loss_sum += loss.item()
            acc_sum += acc.item()
            f1_sum += batch_f1

            all_targets = np.concatenate([all_targets, y_mini.flatten()])
            all_preds = np.concatenate([all_preds, pred_mini.flatten()])

    f1 = f1_score(all_targets, all_preds, average='binary')
    f1_macro = f1_score(all_targets, all_preds, average='macro')
    precision = precision_score(all_targets, all_preds, average='binary')

    print(len(all_targets))
    print(cr(all_targets, all_preds))
    print(cm(all_targets, all_preds))

    n_batches = len(iterator)
    return (loss_sum / n_batches, acc_sum / n_batches, f1_sum / n_batches,
            f1, f1_macro, precision)
# sklearn's confusion_matrix takes (y_true, y_pred) and ``labels`` by keyword;
# the arguments were swapped before, which transposed the matrix.
cf = confusion_matrix(test_y, pred_y, labels=labels)
print(cf)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_x, test_y)))

# confusion matrix with details
# -----------------------------
ty = list(test_y)
py = list(pred_y)
# NOTE(review): predictions are passed first here; confirm the argument
# order this ConfusionMatrix class expects.
cm1 = ConfusionMatrix(py, ty)
print(cm1)
cm1.print_stats()
cm1.plot()

# Classification report : precision, recall, F-score
# ---------------------------------------------------
print(cr(test_y, pred_y))

# model number 2
# RFE (recursive feature elimination)
# -----------------------------------
logreg = LogisticRegression()

# sklearn.feature_selection.RFE
# (estimator, n_features_to_select=None, step=1, verbose=0)
# keep the best 150 features
rfe = RFE(logreg, n_features_to_select=150)
rfe = rfe.fit(data[X], data[Y])
support = rfe.support_
ranking = rfe.ranking_
def start_split_data(data_list):
    """Run a 10-fold evaluation of the decision tree and plot its ROC curve.

    A shuffled copy of ``data_list`` is cut into ten consecutive test
    slices; for each slice a tree is built on the remaining rows and used
    to classify the slice. Prints the mean fold accuracy, the confusion
    matrix and the classification report, then draws the ROC curve.

    NOTE(review): the source was flattened onto one line, so the indentation
    below is a reconstruction — confirm the extent of the ``while``, the
    ``for``/``else`` and the final plotting ``else`` against the original.
    """
    random_list = dc(data_list)
    random.shuffle(random_list)
    predicted_list = []
    # Start index of the next test slice; carried across folds.
    mark = 0
    acc_list = []
    act_class_list = []
    for i in range(10):  # fold range
        test_list = []
        training_list = []
        # Acts as an ``if``: the body always ends in ``break``.
        while (mark < int(len(random_list))):
            # Rows before the test slice go to training.
            for train_ele in range(0, mark):
                training_list.append(random_list[train_ele])
            else:
                # The loop above never breaks, so this always runs: take the
                # next tenth as test data and everything after it as training.
                index = mark
                mark = int(len(random_list) / 10) + index
                for test_element in range(index, mark):
                    test_list.append(random_list[test_element])
                for training_element in range(mark, int(len(random_list))):
                    training_list.append(random_list[training_element])
            # print(training_list)
            # fold completion
            # Reset the class-level tree state before building a new tree.
            Node.children = []
            Node.leaf_children = []
            Node.temp_children = []
            Node.new_children = []
            Node.len_training_list = len(training_list)
            Node.old_pessi_err = (node_err_cal(training_list, max_class(
                training_list, class_column), class_column) + 1) / \
                Node.len_training_list
            root = Node(training_list)
            # print(root.data)
            root.node_type = 'root'
            build_tree(root)
            predicted_temp_list = []
            actual_list = []
            temp_root = dc(root)
            for test_element in test_list:
                actual_list.append(int(test_element[class_column]))
                found = int(class_finder(test_element, temp_root))
                predicted_temp_list.append(found)
                predicted_list.append(found)
            acc_list.append(
                accuracy(actual_list, predicted_temp_list, class_column))
            break
    print(mean(acc_list))
    act_class_list = class_list_gen(random_list)
    # print(len(act_class_list),len(predicted_list))
    # Drop trailing actual labels that no fold predicted.
    while (len(act_class_list) > len(predicted_list)):
        del act_class_list[-1]
    c_matrix = cm(act_class_list, predicted_list)
    print('Confusion matrix\n', c_matrix)
    c_report = cr(act_class_list, predicted_list)
    print("All Measures required for this data set \n", c_report)
    fpr, tpr, thd = rc(act_class_list, predicted_list)
    roc_auc = auc(fpr, tpr)
    if formula_input == 2:
        plt.title('ROC for %s with information gain(red) and gini(blue)' % file_name[0])
        plt.plot(fpr, tpr, label='%s AUC = %0.2f' % (formula_measure, roc_auc))
        plt.legend(loc='lower right')
    else:
        plt.title('ROC for %s ' % file_name[0])
        plt.plot(fpr, tpr, label='%s AUC = %0.2f' % (formula_measure, roc_auc))
        plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
    # Chance diagonal and axis setup.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
# Count predictions on each side of the 0.5 threshold.
c1 = len(p1[p1 <= 0.5])
c2 = len(p1[p1 > 0.5])
print("<=0.5 {} , >0.5 {} ".format(c1, c2))

# Hard 0/1 labels derived from the scores in p1.
predy = p1.copy(deep=True)
predy[predy <= 0.5] = 0
predy[predy > 0.5] = 1
predy.value_counts()

# confusion matrix
ConfusionMatrix(testy, predy)
print(cr(testy, predy))

# roc
from sklearn import metrics
# Use the continuous scores: thresholded labels collapse the curve to a
# single operating point.
fpr, tpr, threshold = metrics.roc_curve(testy, p1)

# auc
roc_auc = metrics.auc(fpr, tpr)

# plot
plt.title('Receiver Operating Characterstics')
plt.plot(fpr, tpr, 'b', label='AUC=%0.2f' % roc_auc)
# Legend location strings are lowercase; 'Lower right' is not a valid value.
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# Keep the raw scores for the ROC curve before thresholding overwrites them.
y_scores = list(y_results)

# Threshold the scores in place at 0.5.
for i in range(0, length):
    if y_results[i] <= 0.5:
        y_results[i] = 0
    else:
        y_results[i] = 1

# accuracy score
print(accuracy_score(test_y, y_results) * 100)

# confusion matrix
# NOTE(review): predictions are passed first here; confirm the argument
# order this ConfusionMatrix class expects.
cm = ConfusionMatrix(list(y_results), list(test_y))
print(cm)
cm.print_stats()

# Classification report : precision, recall, F-score
print(cr(test_y, y_results))

# draw the ROC curve
from sklearn import metrics
import matplotlib.pyplot as plt

# Use the continuous scores: thresholded labels collapse the curve to a
# single operating point.
fpr, tpr, threshold = metrics.roc_curve(test_y, y_scores)
roc_auc = metrics.auc(fpr, tpr)
print(roc_auc)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
tags.extend([list2]) list1 = [] list2 = [] i = 0 if __name__ == '__main__': words = [] #matrix of words each row containing sentence tags = [] # matrix of tags corresponding to each word process_data(sys.argv[1],words,tags) W2V = Build_W2V(words,tags) #WV.TrainModel() data_input = W2V.wordvec data_output = W2V.lisvec xtrain = data_input[:9000] ytrain = data_output[:9000] xtest = data_input[9000:] ytest = data_output[9000:] #X,y1 = collect_data(file_tra) #print(X.shape) clf = MLPClassifier(n_hidden=50, learning_rate=0.01, SGD=True) clf.fit(xtrain, ytrain, max_epochs=200) #D,O = collect_data(file_test) pred = clf.predict(xtest) #print("O: ",O) #print("pred: ",pred) print("0: ",cr(ytest, pred)) 319,1 Bot
@author: Mukul """ import os os.chdir("C:/Users/Mukul/Documents") import pandas as pd import numpy as np tel_data = pd.read_csv("Telecom_Data.csv") #set depe indep var tel_data.columns x = tel_data.drop(["phone number", "churn"], axis=1) y = tel_data[["churn"]] x = pd.get_dummies(x) #divide train test from sklearn.model_selection import train_test_split xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2) #random forest model from sklearn.ensemble import RandomForestClassifier as rf model = rf() model.fit(xtrain, ytrain) #apply to testset pred_churn = model.predict(xtest) from sklearn.metrics import classification_report as cr cr(ytest, pred_churn) #validation
partial_train_data, validation_data, test_data) ############################### SELECT CNN MODEL ############################## select_model = 0 while select_model < 1 or select_model > 2: select_model = int(input("Select CNN Model [1-2]: ")) if select_model == 1: model_1, result_1 = optimize(cnn_model_one()) # Train CNN Model 1 prediction_1 = model_1.predict(test_data) print("Evaluate Test") model_1.evaluate(test_data, test_label) print( cr(test_label.argmax(axis=1), prediction_1.argmax(axis=1), target_names=label_names)) # Classification Report # Training and Validation Curves training_and_validation_accuracy(result_1) training_and_validation_loss(result_1) # Confusion Matrix Visualization prediction_class_1 = np.argmax( prediction_1, axis=1) # Convert predictions classes to one hot vectors test_label_cfm = np.argmax( test_label, axis=1) # Convert validation observations to one hot vectors confusion_mtx = cfm(test_label_cfm, prediction_class_1) # Compute the confusion matrix plot_confusion_matrix(confusion_mtx,
#
#plt.figure(figsize=(11,6))
#sb.countplot(x="purpose", hue="not.fully.paid", data=df, palette="Set1")
#
#df = df.drop(["purpose"],axis=1)

# NOTE(review): ``c`` (all column positions except 12) is no longer used
# below; kept in case later code reads it.
c = np.arange(0, 19, dtype=int)
c = np.delete(c, 12)

# Features are every column except the target. The earlier positional
# ``iloc`` versions of X and y were overwritten immediately and are gone.
X = final_data.drop("not.fully.paid", axis=1)
y = final_data["not.fully.paid"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=256, n_jobs=-1)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report as cr
classification_report = cr(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
l = line.split("\t") if not l[0] == '\n': s = str(l[1]) s = s[0:len(s)-1] out.append(s) label=pd.Series(ans).unique() print "\nlabels:" + str(label) prf = pr(ans,out,labels=label,beta=1,average='weighted') print "\nPresicion:" + str(prf[0]) print "\nRecall:" + str(prf[1]) print "\nF Score:" + str(prf[2]) acp = acc(ans,out,True) act = acc(ans,out,False) acp = acp*100 print "\nAccuracy:"+str(acp)+"% "+str(act)+"/"+str(len(ans)) report = cr(ans,out,label) print str(report) prf = pr(ans,out,labels=label,beta=1,average=None) sc = pd.DataFrame(index=['Precision','Recall','F Score','Support'],columns=label) sc[:]=prf[:] sc=pd.DataFrame.transpose(sc) print "\n\n" print str(sc) arr = cm(ans,out,label) mat = pd.DataFrame(index=label,columns=label) mat[:]=arr[:] print "\n\n" print u"\u2193"+"Actual/Predicted-->" print (str(mat)) fA.close() fO.close()
elif r == 3: print("PCW") elif r == 4: print("Stoppage") res(10, 8.180509567, 77.418396) # In[50]: #Evaluation from sklearn.metrics import classification_report as cr, confusion_matrix as cm, accuracy_score as acc_s print("Confusion Matrix : \n", cm(Y_test, Y_pred)) print("\n\nClassification report : \n", cr(Y_test, Y_pred)) print("\n\nAccuracy : ", acc_s(Y_test, Y_pred) * 100) # In[51]: res(3, 8.178689957, 77.42429352) # In[18]: #Finding suitable k e = [] for i in range(1, 50): knn = KNC(n_neighbors=i) knn.fit(X_train, Y_train)
def main(train_file, test_file, load_method="csv", opti_method=None,
         maxiter=100, batch_size=-1, units=None, lmbda=0, alpha=100,
         beta=1000):
    """
    Manages files and operations for the neural network model creation,
    training, and testing.

    @parameters:
        load_method - the dataset file format, either "csv" or "hdf"
        opti_method - specifies the optimization method to use, "l-bfgs",
                      "cg", or None (defaults to SGD)
        maxiter     - the maximum number of iterations allowed for training
        batch_size  - the number of instances for each mini-batch, -1 implies
                      batch processing
        units       - a sequence of integers separated by '.' such that each
                      integer represents the number of units in a sequence of
                      hidden layers.
        lmbda       - the regularization term
        alpha       - the numerator for the learning rate schedule (relevant
                      for SGD only)
        beta        - the denominator for the learning rate schedule
                      (relevant for SGD only)

    @raises:
        ValueError - when load_method is neither "csv" nor "hdf"
    """
    # open and load the dataset files
    if load_method == "csv":
        X_train, y_train = mlu.load_csv(train_file, True)  # load and shuffle training set
        X_test, y_test = mlu.load_csv(test_file)
    elif load_method == "hdf":
        X_train, y_train = mlu.loadh(train_file, True)  # load and shuffle training set
        X_test, y_test = mlu.loadh(test_file)
    else:
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError("Dataset file type not recognized: acceptable "
                         "formats are 'csv' and 'hdf'.")

    # perform feature scaling
    X_train = mlu.scale_features(X_train, 0.0, 1.0)
    X_test = mlu.scale_features(X_test, 0.0, 1.0)

    # create the neural network classifier using the training data
    NNC = NeuralNetClassifier(opti_method, maxiter, batch_size, units,
                              lmbda, alpha, beta)
    # Single-argument print calls give the same output on Python 2 and 3.
    print("\nCreated a neural network classifier\n\t %s" % (NNC,))

    # fit the model to the loaded training data
    print("\nFitting the training data...")
    NNC.fit(X_train, y_train)

    # predict the results for the test data
    print("\nGenerating probability prediction for the test data...")
    y_pred = NNC.predict(X_test)

    ### output classification results ###
    # output class prediction probability for each instance in the test set
    print("\nThe probabilities for each instance in the test set are:\n")
    for prob in NNC.predict_proba(X_test):
        print(prob)

    # output accuracy
    print("Accuracy:  %s" % (mlu.compute_accuracy(y_test, y_pred),))

    # output sklearn style results if the module is available
    try:
        from sklearn.metrics import classification_report as cr
        from sklearn.metrics import confusion_matrix as cm
    except ImportError:
        pass
    else:
        try:
            print("")
            print("Classification results:")
            print(cr(y_test, y_pred))
            print(cm(y_test, y_pred))
        except Exception as err:
            # Reporting is best-effort: say what went wrong, but still
            # reach the model save below.
            print("Could not produce the sklearn report: %s" % (err,))

    # save model parameters as a pickle
    NNC.save_model("NNCModel.p")
def classify(model, featureVectors):
    """Print a labelled classification report for ``model``.

    Args:
        model: Fitted estimator exposing ``predict``.
        featureVectors: 2-D array whose last column holds the true class id
            and whose remaining columns are the features.
    """
    # ``np.int`` was only an alias of the builtin and no longer exists in
    # current NumPy releases; the builtin ``int`` is equivalent.
    predicted = (model.predict(featureVectors[:, :-1])
                 .astype(int).reshape(-1).tolist())
    actual = featureVectors[:, -1].flatten().astype(int).tolist()
    print(cr(actual, predicted,
             target_names=["DOS", "Normal", "Probing", "R2L", "U2R"],
             digits=4))
def fit_worker(self, rank, world_size, p_model, save_model_path, save_tensorboard_path, save_log_path,
        p_frame, p_metadata_train, p_metadata_validation, p_metadata_test):
    """
    Train and validate the model on one worker process.

    Every pass of the outer loop runs a training phase followed by a validation phase,
    and the loop stops once `steps` reaches `self.max_steps`. Gradients are accumulated
    over `self.num_steps_per_update` batches before each optimizer update, and one such
    update counts as one step.

    Parameters:
        rank: index of this worker; appended to the log path. Only rank 0 writes
            tensorboard scalars and saves checkpoints.
        world_size: number of workers; used to average the validation losses after
            `all_reduce` and to size the `all_gather` buffers.
        p_model: forwarded to `self.set_model`.
        save_model_path: checkpoint path prefix; checkpoints are saved as
            `save_model_path + str(steps) + ".pt"` after each validation phase.
        save_tensorboard_path: directory under which the "/train/" and "/validation/"
            tensorboard writers are created.
        save_log_path: log path prefix (the rank is appended).
        p_frame: forwarded to `self.set_dataloader`.
        p_metadata_train: metadata path for the training dataloader.
        p_metadata_validation: metadata path for the validation dataloader.
        p_metadata_test: only written to the log here.

    Returns:
        None. Returns early (before any writer is created) if `self.set_model` gives None.
    """
    # Set logger
    save_log_path += str(rank)
    self.create_logger(log_path=save_log_path)
    self.log("="*60)
    self.log("="*60)
    self.log("Use Two-Stream Inflated 3D ConvNet learner")
    self.log("save_model_path: " + save_model_path)
    self.log("save_tensorboard_path: " + save_tensorboard_path)
    self.log("save_log_path: " + save_log_path)
    self.log("p_metadata_train: " + p_metadata_train)
    self.log("p_metadata_validation: " + p_metadata_validation)
    self.log("p_metadata_test: " + p_metadata_test)
    self.log_parameters()

    # Set model
    model = self.set_model(rank, world_size, self.mode, p_model, self.can_parallel, phase="train")
    if model is None: return None

    # Load datasets
    metadata_path = {"train": p_metadata_train, "validation": p_metadata_validation}
    ts = self.get_transform(self.mode, image_size=self.image_size)
    transform = {"train": ts, "validation": ts}
    if self.augment:
        transform["train"] = self.get_transform(self.mode, phase="train", image_size=self.image_size)
    dataloader = self.set_dataloader(rank, world_size, metadata_path, p_frame,
            transform, self.batch_size_train, self.can_parallel)

    # Create tensorboard writer
    writer_t = SummaryWriter(save_tensorboard_path + "/train/")
    writer_v = SummaryWriter(save_tensorboard_path + "/validation/")

    # Set optimizer
    optimizer = optim.SGD(model.parameters(), lr=self.init_lr, momentum=self.momentum, weight_decay=self.weight_decay)
    lr_sche = optim.lr_scheduler.MultiStepLR(optimizer, milestones=self.milestones, gamma=self.gamma)

    # On recent PyTorch, get_lr() is meant to be called only from inside scheduler.step();
    # called from outside it warns and can report a wrong value at a milestone.
    # Prefer get_last_lr() and fall back to get_lr() where it does not exist.
    current_lr = getattr(lr_sche, "get_last_lr", lr_sche.get_lr)

    # Set logging format
    log_fm = "%s step: %d lr: %r loc_loss: %.4f cls_loss: %.4f loss: %.4f"

    # Train and validate
    steps = 0
    epochs = 0
    nspu = self.num_steps_per_update
    nspc = self.num_steps_per_check
    nspu_nspc = nspu * nspc
    accum = {} # counter for accumulating gradients
    tot_loss = {} # total loss
    tot_loc_loss = {} # total localization loss
    tot_cls_loss = {} # total classification loss
    pred_labels = {} # predicted labels
    true_labels = {} # true labels
    for phase in ["train", "validation"]:
        accum[phase] = 0
        tot_loss[phase] = 0.0
        tot_loc_loss[phase] = 0.0
        tot_cls_loss[phase] = 0.0
        pred_labels[phase] = []
        true_labels[phase] = []
    try:
        while steps < self.max_steps:
            # Each epoch has a training and validation phase
            for phase in ["train", "validation"]:
                self.log("-"*40)
                self.log("phase " + phase)
                if phase == "train":
                    epochs += 1
                    self.log("epochs: %d steps: %d/%d" % (epochs, steps, self.max_steps))
                    model.train(True) # set model to training mode
                    for param in model.parameters():
                        param.requires_grad = True
                else:
                    model.train(False) # set model to evaluate mode
                    for param in model.parameters():
                        param.requires_grad = False
                # NOTE(review): this also discards gradients left from an unfinished
                # accumulation window, while accum["train"] is only reset when it reaches
                # nspu, so the first update of the next epoch may use fewer than nspu batches.
                optimizer.zero_grad()
                # Iterate over batch data
                for d in tqdm.tqdm(dataloader[phase]):
                    if self.code_testing:
                        if phase == "train" and steps >= self.max_steps: break
                        if phase == "validation" and accum[phase] >= self.max_steps: break
                    accum[phase] += 1
                    # Get prediction
                    frames = self.to_variable(d["frames"])
                    labels = d["labels"]
                    true_labels[phase] += self.labels_to_list(labels)
                    labels = self.to_variable(labels)
                    pred = self.make_pred(model, frames)
                    pred_labels[phase] += self.labels_to_list(pred.cpu().detach())
                    # Compute localization loss
                    loc_loss = F.binary_cross_entropy_with_logits(pred, labels)
                    tot_loc_loss[phase] += loc_loss.data
                    # Compute classification loss (with max-pooling along time, batch x channel x time)
                    cls_loss = F.binary_cross_entropy_with_logits(torch.max(pred, dim=2)[0], torch.max(labels, dim=2)[0])
                    tot_cls_loss[phase] += cls_loss.data
                    # Backprop (the loss is divided by nspu because gradients of nspu batches are summed)
                    loss = (0.5*loc_loss + 0.5*cls_loss) / nspu
                    tot_loss[phase] += loss.data
                    if phase == "train":
                        loss.backward()
                    # Accumulate gradients during training
                    if (accum[phase] == nspu) and phase == "train":
                        steps += 1
                        if steps % nspc == 0:
                            # Log learning rate and loss
                            lr = current_lr()[0]
                            # nspu*nspc batches were summed since the last check; tot_loss is
                            # already scaled by 1/nspu, so it is only divided by nspc
                            tll = tot_loc_loss[phase]/nspu_nspc
                            tcl = tot_cls_loss[phase]/nspu_nspc
                            tl = tot_loss[phase]/nspc
                            self.log(log_fm % (phase, steps, lr, tll, tcl, tl))
                            # Add to tensorboard
                            if rank == 0:
                                writer_t.add_scalar("localization_loss", tll, global_step=steps)
                                writer_t.add_scalar("classification_loss", tcl, global_step=steps)
                                writer_t.add_scalar("loss", tl, global_step=steps)
                                writer_t.add_scalar("learning_rate", lr, global_step=steps)
                            # Reset loss
                            tot_loss[phase] = tot_loc_loss[phase] = tot_cls_loss[phase] = 0.0
                        # Reset gradient accumulation
                        accum[phase] = 0
                        # Update learning rate and optimizer
                        optimizer.step()
                        optimizer.zero_grad()
                        lr_sche.step()
                # END FOR LOOP
                if phase == "validation":
                    # Log learning rate and loss
                    lr = current_lr()[0]
                    tll = tot_loc_loss[phase]/accum[phase]
                    tcl = tot_cls_loss[phase]/accum[phase]
                    tl = (tot_loss[phase]*nspu)/accum[phase] # undo the 1/nspu scaling
                    # Sync losses for validation set
                    if self.can_parallel:
                        tll_tcl_tl = torch.Tensor([tll, tcl, tl]).cuda()
                        dist.all_reduce(tll_tcl_tl, op=dist.ReduceOp.SUM)
                        tll = tll_tcl_tl[0].item() / world_size
                        tcl = tll_tcl_tl[1].item() / world_size
                        tl = tll_tcl_tl[2].item() / world_size
                    self.log(log_fm % (phase, steps, lr, tll, tcl, tl))
                    # Add to tensorboard and save model
                    if rank == 0:
                        writer_v.add_scalar("localization_loss", tll, global_step=steps)
                        writer_v.add_scalar("classification_loss", tcl, global_step=steps)
                        writer_v.add_scalar("loss", tl, global_step=steps)
                        writer_v.add_scalar("learning_rate", lr, global_step=steps)
                        self.save(model, save_model_path + str(steps) + ".pt")
                    # Reset loss
                    tot_loss[phase] = tot_loc_loss[phase] = tot_cls_loss[phase] = 0.0
                    # Reset gradient accumulation
                    accum[phase] = 0
                    # Save precision, recall, and f-score to the log and tensorboard
                    for ps in ["train", "validation"]:
                        # Sync true_labels and pred_labels for validation set
                        if self.can_parallel and ps == "validation":
                            true_pred_labels = torch.Tensor([true_labels[ps], pred_labels[ps]]).cuda()
                            true_pred_labels_list = [torch.ones_like(true_pred_labels) for _ in range(world_size)]
                            dist.all_gather(true_pred_labels_list, true_pred_labels)
                            true_pred_labels = torch.cat(true_pred_labels_list, dim=1)
                            true_labels[ps] = true_pred_labels[0].cpu().numpy()
                            pred_labels[ps] = true_pred_labels[1].cpu().numpy()
                        self.log("Evaluate performance of phase: %s\n%s" % (ps, cr(true_labels[ps], pred_labels[ps])))
                        if rank == 0:
                            result = prfs(true_labels[ps], pred_labels[ps], average="weighted")
                            writer = writer_t if ps == "train" else writer_v
                            writer.add_scalar("precision", result[0], global_step=steps)
                            writer.add_scalar("recall", result[1], global_step=steps)
                            writer.add_scalar("weighted_fscore", result[2], global_step=steps)
                        # Reset
                        pred_labels[ps] = []
                        true_labels[ps] = []
    finally:
        # Flush and release the event files even if training stops with an error
        writer_t.close()
        writer_v.close()

    # Clean processors
    self.clean_mp()

    self.log("Done training")
# Settings for the (currently disabled) learning-curve plot
training_curve_title = 'Support vector classifier'
train_val_split_folds = 5
train_sizes = np.linspace(0.04, 1.0, num=20)
# plot_learning_curve(estimator, X_train_val, np.ravel(y_train_val),
#                     title=training_curve_title, cv=train_val_split_folds,
#                     train_sizes=train_sizes)
# plt.show()

# Settings for the (currently disabled) cross-validation curve
CV_curve_title = 'Support vector classifier'
CV_param_name = 'C'
# CV_param_name = 'gamma'
CV_pararam_range = np.array([1e-3, 1e-2, 1e-1, 1, 10, 100])
# plot_validation_curve(estimator, X_train_val, np.ravel(y_train_val),
#                       CV_param_name, CV_pararam_range, title=CV_curve_title,
#                       xlabel='Parameter', ylabel='Score')
# plt.show()

# VALIDATION
# NOTE(review): 'Diseased' may be a typo for 'Deceased' - confirm against the dataset.
class_names = ['Diseased', 'Survived']

# Report the error metrics on the training data
class_rep_train = cr(
    y_train,
    y_pred_train_self,
    target_names=class_names,
)
print("Performance on training data:")
print(class_rep_train)

# Report the error metrics on the validation data
# NOTE(review): y_val is compared with `y_pred_train`; presumably that holds the
# predictions made for the validation set - verify where it is computed.
class_rep_val = cr(
    y_val,
    y_pred_train,
    target_names=class_names,
)
print("Performance on validation data:")
print(class_rep_val)

# Generate predictions from the test set
# y_pred_test = estimator.predict(X_test)
def test_worker(self, rank, world_size, p_model, save_log_path, p_frame, save_viz_path, p_metadata_test):
    """
    Evaluate the model on the test set on one worker process.

    Logs a classification report and the ROC AUC score, then tries to write a video
    summary and a JSON file listing the file names in each cell of the confusion matrix.

    Parameters:
        rank: index of this worker; appended to the log path and to `save_viz_path`.
        world_size: number of workers; sizes the `all_gather` buffers when
            `self.can_parallel` is set.
        p_model: forwarded to `self.set_model`.
        save_log_path: log path prefix (the rank is appended).
        p_frame: forwarded to `self.set_dataloader` and `write_video_summary`.
        save_viz_path: output path prefix for the visualisation files.
        p_metadata_test: metadata path for the test dataloader.

    Returns:
        None. Returns early if `self.set_model` gives None.
    """
    # Set logger
    save_log_path += str(rank)
    self.create_logger(log_path=save_log_path)
    self.log("="*60)
    self.log("="*60)
    self.log("Use Two-Stream Inflated 3D ConvNet learner")
    self.log("Start testing with mode: " + self.mode)
    self.log("save_log_path: " + save_log_path)
    self.log("save_viz_path: " + save_viz_path)
    self.log("p_metadata_test: " + p_metadata_test)
    self.log_parameters()

    # Set model
    model = self.set_model(rank, world_size, self.mode, p_model, self.can_parallel, phase="test")
    if model is None: return None

    # Load dataset
    metadata_path = {"test": p_metadata_test}
    transform = {"test": self.get_transform(self.mode, image_size=self.image_size)}
    dataloader = self.set_dataloader(rank, world_size, metadata_path, p_frame,
            transform, self.batch_size_test, self.can_parallel)

    # Test
    model.train(False) # set the model to evaluation mode
    file_name = []
    true_labels = []
    pred_labels = []
    true_scores = []
    pred_scores = []
    with torch.no_grad():
        # Iterate over batch data
        for counter, d in enumerate(dataloader["test"]):
            if counter % 5 == 0:
                self.log("Process batch " + str(counter))
            file_name += d["file_name"]
            frames = self.to_variable(d["frames"])
            # The labels are only turned into lists here, so they are not passed
            # through to_variable
            labels = d["labels"]
            true_labels += self.labels_to_list(labels)
            true_scores += self.labels_to_score_list(labels)
            pred = self.make_pred(model, frames).cpu().detach()
            pred_labels += self.labels_to_list(pred)
            pred_scores += self.labels_to_score_list(pred)

    # Sync true_labels and pred_labels for testing set
    true_labels_all = np.array(true_labels)
    pred_labels_all = np.array(pred_labels)
    true_scores_all = np.array(true_scores)
    pred_scores_all = np.array(pred_scores)
    if self.can_parallel:
        # Stack the four per-worker lists into one tensor, gather it from every worker,
        # and concatenate along dim 1
        true_pred_labels = torch.Tensor([true_labels, pred_labels, true_scores, pred_scores]).cuda()
        true_pred_labels_list = [torch.ones_like(true_pred_labels) for _ in range(world_size)]
        dist.all_gather(true_pred_labels_list, true_pred_labels)
        true_pred_labels = torch.cat(true_pred_labels_list, dim=1)
        true_labels_all = true_pred_labels[0].cpu().numpy()
        pred_labels_all = true_pred_labels[1].cpu().numpy()
        true_scores_all = true_pred_labels[2].cpu().numpy()
        pred_scores_all = true_pred_labels[3].cpu().numpy()

    # Save precision, recall, and f-score to the log
    self.log("Evaluate performance of phase: test\n%s" % (cr(true_labels_all, pred_labels_all)))

    # Save roc curve and score
    self.log("roc_auc_score: %s" % str(roc_auc_score(true_scores_all, pred_scores_all, average=None)))

    # Generate video summary and show class activation map
    # TODO: this part will cause an error when using multiple GPUs
    # The per-worker lists (not the gathered arrays) are used here because the indices
    # are looked up in this worker's own `file_name` list.
    try:
        # Video summary
        cm = confusion_matrix_of_samples(true_labels, pred_labels, n=64)
        write_video_summary(cm, file_name, p_frame, save_viz_path + str(rank) + "/")
        # Save confusion matrix, with each sample index replaced by its file name
        cm_all = confusion_matrix_of_samples(true_labels, pred_labels)
        for u in cm_all:
            for v in cm_all[u]:
                for i in range(len(cm_all[u][v])):
                    idx = cm_all[u][v][i]
                    cm_all[u][v][i] = file_name[idx]
        save_json(cm_all, save_viz_path + str(rank) + "/confusion_matrix_of_samples.json")
    except Exception as ex:
        # Best-effort step: keep going, but log a readable message like every other log call
        self.log("Failed to generate the video summary or confusion matrix: %r" % (ex,))

    # Clean processors
    self.clean_mp()

    self.log("Done testing")