def testing(net):
    global fileNames_path_valid
    global map_fname_label_valid
    global maps_list
    global get_temp
    predicted_y = []
    actual_y = []
    if get_temp:
        random.shuffle(fileNames_path_valid)
        maps_list = manager.list([])
        pool = multi.Pool(processes=4)
        pool.map(runner, fileNames_path_valid[:500])
        get_temp = False
    for maps, path in maps_list:
        result = net.activate(np.ravel(np.array(maps)))
        label = map_label_int_label[map_fname_label_valid[path]]
        # if result[0] >= 0.5:
        #     predicted_y.append(1)
        # else:
        #     predicted_y.append(0)
        # print result
        # print result.argmax()
        predicted_y.append(result.argmax())
        actual_y.append(label)
    print accuracy_score(actual_y, predicted_y)
    print confusion_matrix(actual_y, predicted_y)
def run_ratio(self, dataset, set_size):
    '''
    Compare several competing methods while changing the ratio of the
    positive class in the dataset. We use a binary-class dataset for
    ease of interpretation.
    '''
    X_train_full, y_train_full, X_test, y_test = dataset
    X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
    test_set_original = (X_test, y_test)

    large = ENMLT(LinearSVC)
    large.fit(X_train, y_train)
    simple = LinearSVC()
    simple.fit(X_train, y_train)

    for r in numpy.arange(0.05, 1.0, 0.05):
        # Generate a new test set with the desired positive proportion.
        X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)
        y_pred = large.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc1 = self.accuracy(cm)
        y_pred = simple.predict(X_test_new)
        cm = confusion_matrix(y_test_new, y_pred)
        acc2 = self.accuracy(cm)
        print "%.2f, %f, %f" % (r, acc1, acc2)
def do_xgboost(x, y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    print "xgboost"
    xgb_model = xgb.XGBClassifier().fit(x_train, y_train)
    y_pred = xgb_model.predict(x_test)
    print(classification_report(y_test, y_pred))
    print metrics.confusion_matrix(y_test, y_pred)
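# A hedged usage sketch for do_xgboost: load_digits is an illustrative dataset
# choice, not from the original script; it assumes the same imports the
# function already relies on (train_test_split, xgb, metrics, classification_report).
from sklearn.datasets import load_digits

digits = load_digits()
do_xgboost(digits.data, digits.target)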
def printMeasuresOfEfficiency(yTest, y_pred):
    # I was having an issue where the confusion matrix would come out to [[100]] or [[40]],
    # as opposed to how it should be: a 2x2 matrix.
    # After some investigation, I came to the conclusion that the test set only contained one class!
    # Therefore, there is no false positive and no false negative; only the one class in the test set.
    if len(confusion_matrix(yTest, y_pred)) == 1:
        print("Test set contains one class only. There is no false positive or false negative; only the one class.")
        return
    tn, fp, fn, tp = confusion_matrix(yTest, y_pred).ravel()
    # Measures of efficiency
    # ppv: positive predictive value
    # npv: negative predictive value
    # sensitivity (recall): true positive rate
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    ppv = tp / (tp + fp)
    npv = tn / (tn + fn)
    print("\ttn: {} fp: {} fn: {} tp: {}".format(tn, fp, fn, tp))
    print("\tspecificity: {}".format(specificity))
    print("\tsensitivity: {}".format(sensitivity))
    print("\tppv: {}".format(ppv))
    print("\tnpv: {}".format(npv))
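# A hedged usage sketch for printMeasuresOfEfficiency with toy labels; the
# second call exercises the degenerate single-class case the early return guards.
printMeasuresOfEfficiency([0, 0, 1, 1, 1, 0], [0, 1, 1, 1, 0, 0])
printMeasuresOfEfficiency([1, 1, 1], [1, 1, 1])  # 1x1 matrix -> early return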
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1, verbose=1, class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw, ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw, y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print accuracy_score(y_true=ytest_raw, y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw, y_pred=ytest_predict)
def inspect():
    data = pd.DataFrame.from_csv(os.path.join(PATH, "obs.csv"), header=None, index_col=[0])
    features, labels = stat_calculator().get_stats_labels(data.values)
    f_train, f_test, l_train, l_test = train_test_split(features, labels, test_size=.5)
    clf = RandomForestClassifier(n_estimators=30).fit(f_train, l_train)
    out = clf.predict(f_test)
    out_p = clf.predict_proba(f_test)
    out_p = pd.DataFrame(out_p, columns=clf.classes_)
    # regular conf mat
    cm1 = confusion_matrix(l_test, out)
    # regular argmax confusion matrix
    predicted = out_p.apply(lambda s: s.argmax(), axis=1)
    cm2 = confusion_matrix(l_test, predicted)
    # thresholded argmax confusion matrix (stored separately so it no longer overwrites cm2)
    predicted2 = out_p.apply(lambda s: s.argmax() if s.max() > THETA else "zother", axis=1)
    cm3 = confusion_matrix(l_test, predicted2)
    print(cm1, cm2, cm3)
def print_prediction_results():
    results = []
    for c, Y_test in zip(classes, test_data):
        for y in Y_test:
            query = ma.masked_array(
                np.array([tuple(y) + (0,)], dtype=[('', bool)] * (D - 1) + [('', int)]),
                mask=[(False,) * (D - 1) + (True,)])[0]
            samples = [s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)]
            samples = np.bincount(samples, minlength=len(classes))
            prediction = np.argmax(samples)
            results.append((classmap[c], prediction, samples))
        print 'finished predictions for class', c

    Y_actual = np.array([a for a, _, _ in results], dtype=np.int)
    Y_pred = np.array([b for _, b, _ in results], dtype=np.int)
    print 'accuracy:', accuracy_score(Y_actual, Y_pred)
    print 'confusion matrix:'
    print confusion_matrix(Y_actual, Y_pred)

    # AUROC for one vs all (each class)
    for i, clabel in enumerate(classes):
        Y_true = np.copy(Y_actual)
        # treat class c as the "positive" example
        positive_examples = Y_actual == i
        negative_examples = Y_actual != i
        Y_true[positive_examples] = 1
        Y_true[negative_examples] = 0
        Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results])
        cls_auc = roc_auc_score(Y_true, Y_prob)
        print 'class', clabel, 'auc=', cls_auc
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short=False):
    accuracy_train = metrics.accuracy_score(y_train, model.predict(X_train))
    accuracy_test = metrics.accuracy_score(y_test, model.predict(X_test))
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))

    if not short:
        # confusion matrix
        # rows: actual group
        # columns: predicted group
        print('Confusion_matrix (training data):')
        print(metrics.confusion_matrix(y_train, model.predict(X_train)))
        print('Confusion_matrix (test data):')
        print(metrics.confusion_matrix(y_test, model.predict(X_test)))

        # precision = tp / (tp + fp)
        # recall = tp / (tp + fn) (= sensitivity)
        # F1 = 2 * (precision * recall) / (precision + recall)
        print('\nPrecision - recall (training data):')
        print(metrics.classification_report(y_train, model.predict(X_train)))
        print('\nPrecision - recall (test data):')
        print(metrics.classification_report(y_test, model.predict(X_test)))
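# A hedged usage sketch for assess_classification_performance; the dataset and
# LogisticRegression model are illustrative assumptions, not part of the
# original code, and the function's own `metrics` import is assumed in scope.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=5000).fit(X_tr, y_tr)
assess_classification_performance(clf, X_tr, y_tr, X_te, y_te)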
def main():
    # parameters to cross-validate over
    parameters = {
        'l2': np.logspace(-5, 0, num=6),
    }

    # load digits data, make a binary decision problem out of it
    data = load_digits()
    X = Array2Dict().fit_transform(data.data)
    y = 2 * (data.target >= 5) - 1
    i = int(0.8 * len(X))
    X_train, X_test = X[:i], X[i:]
    y_train, y_test = y[:i], y[i:]

    # do the actual learning
    gs = GridSearchCV(
        VW_Classifier(loss='logistic', moniker='example_sklearn',
                      passes=10, silent=True, learning_rate=10),
        param_grid=parameters,
        score_func=f1_score,
        cv=StratifiedKFold(y_train),
    ).fit(X_train, y_train)

    # print out results from cross-validation
    estimator = gs.best_estimator_
    score = gs.best_score_
    print 'Achieved a F1 score of %f using l2 == %f during cross-validation' % (score, estimator.l2)

    # print confusion matrix on test data
    y_est = estimator.fit(X_train, y_train).predict(X_test)
    print 'Confusion Matrix:'
    print confusion_matrix(y_test, y_est)
def detect_anomalies():
    encoded_X_train = np.load("resources/files/encoded_X_train.npy")
    encoded_X_test = np.load("resources/files/encoded_X_test.npy")
    print(encoded_X_train.shape)
    print(encoded_X_test.shape)

    clf = svm.OneClassSVM(nu=0.1, kernel="linear")
    clf.fit(encoded_X_train)
    y_pred_train = clf.predict(encoded_X_train)
    y_pred_test = clf.predict(encoded_X_test)
    y_pred_outliers = clf.predict(np.full((100, hidden_dimensions[1]), 4))
    # print y_pred_train[y_pred_train == -1].size
    # print y_pred_test[y_pred_test == -1].size
    # print y_pred_outliers[y_pred_outliers == -1].size
    # n_normal_points_test = X_test[y_pred_test == 1]
    # n_anomalies_test = X_test[y_pred_test == -1]
    # print(n_normal_points_test.shape)
    # print(n_anomalies_test.shape)

    print("Train Accuracy: %f" % (accuracy_score(Y_train, y_pred_train)))
    print("Test Accuracy: %f" % (accuracy_score(Y_test, y_pred_test)))
    print("Precision: %f" % (precision_score(Y_test, y_pred_test, pos_label=1)))
    # print("Recall: %f" % (recall_score(Y_test, y_pred_test, pos_label=-1)))
    print "Confusion Matrix: (Anomalies, Normal)"
    print confusion_matrix(Y_test, y_pred_test, labels=[-1, 1])
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_test, pos_label=1)
    print "AUC: %f" % metrics.auc(fpr, tpr)
def test_digits():
    from sklearn.cross_validation import train_test_split
    from sklearn.datasets import load_digits
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    from sklearn.preprocessing import LabelBinarizer

    digits = load_digits()
    X = digits.data
    y = digits.target  # labels
    X /= X.max()  # normalize
    nn = NeuralNetwork([64, 100, 10], 'logistic')  # 8x8 input, 10 outputs
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    labels_train = LabelBinarizer().fit_transform(y_train)  # convert number to vector
    labels_test = LabelBinarizer().fit_transform(y_test)
    nn.fit(X_train, labels_train, epochs=100)
    predictions = []
    for i in range(X_test.shape[0]):
        o = nn.predict(X_test[i])
        predictions.append(np.argmax(o))
    print confusion_matrix(y_test, predictions)
    print classification_report(y_test, predictions)
    print 'accuracy at %0.3f' % accuracy_score(y_test, predictions)
def modelfit(alg, train_data, train_label, cv_folds=5, early_stopping_rounds=1):
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(train_data, label=train_label)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds, metrics=['auc'],
                      early_stopping_rounds=early_stopping_rounds, show_progress=True)
    # Goal of CV is to tune the number of rounds, which is set here
    alg.set_params(n_estimators=cvresult.shape[0])

    # Note: can change to a different day to see what happens
    start = time.time()
    alg.fit(train_data, train_label, eval_metric='auc')
    print "Time to fit: %s" % (time.time() - start)

    # Save model
    pickle.dump(alg, open("/home/jche/Desktop/xgboost.p", "w+"))

    start = time.time()
    dtrain_predprob = alg.predict_proba(train_data)[:, 1]
    print "Time to predict: %s" % (time.time() - start)

    for cutoff in range(0, 41):
        cut = cutoff / float(100)  # Cutoff in decimal form
        dtrain_predictions = dtrain_predprob > cut  # If y values are greater than the cutoff
        # Print model report:
        print "\nModel Report for cutoff %s" % cut
        print "Accuracy : %.4g" % metrics.accuracy_score(train_label, dtrain_predictions)
        print "AUC Score (Train): %f" % metrics.roc_auc_score(train_label, dtrain_predprob)
        print "Recall is: %s" % metrics.recall_score(train_label, dtrain_predictions)
        print metrics.confusion_matrix(train_label, dtrain_predictions)
def getBestK(X_train, y_train, X_val, y_val, nns=[30], print_train=True, print_val=True):
    acc_train = np.zeros((1, len(nns)))
    acc_val = np.zeros((1, len(nns)))
    for j in range(0, len(nns)):
        print j
        sys.stdout.flush()
        knn = KNNClassifier(nns[j])
        knn.train(X_train, y_train)
        # acc_train[0, j] = np.mean(knn.predict(X_train) == y_train)
        print acc_train[0, j]
        sys.stdout.flush()
        y_pred = knn.predict(X_val)
        acc_val[0, j] = np.mean(y_pred == y_val)
        print acc_val[0, j]
        sys.stdout.flush()
        print "Confusion matrix:"
        print confusion_matrix(y_pred, y_val)
    if print_train:
        print(acc_train)
    if print_val:
        print(acc_val)
    best_val = np.max(acc_val)
    best_rate, best_reg = np.where(acc_val == np.amax(acc_val))
    return (best_rate[0], best_reg[0]), knn
def getScores(y, yPredTrain, yTest, yPredTest):
    scores = dict()
    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)
    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)
    # baseline: accuracy of always predicting the majority class
    proba = float(len(np.where(y == 1)[0])) / len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    return scores
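# A hedged usage sketch for getScores with toy predictions; it assumes the
# same sklearn.metrics and numpy imports the function itself relies on.
import numpy as np

y_train_true = np.array([0, 1, 1, 0, 1])
y_train_pred = np.array([0, 1, 0, 0, 1])
y_test_true = np.array([1, 0, 1, 1])
y_test_pred = np.array([1, 0, 0, 1])
scores = getScores(y_train_true, y_train_pred, y_test_true, y_test_pred)
print(scores['f1Test'])
print(scores['cMatrixTest'])
print(scores['random'])  # majority-class baseline accuracy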
def crossVal(positions, X, y, missedYFile):
    outF = open(missedYFile, 'w')
    posArray = np.array(positions)
    # Split into training and test
    sss = StratifiedShuffleSplit(y, 4, test_size=0.1, random_state=442)
    cvRound = 0
    for train_index, test_index in sss:
        clf = ExtraTreesClassifier(n_estimators=300,
                                   random_state=13,
                                   bootstrap=True,
                                   max_features=20,
                                   min_samples_split=1,
                                   max_depth=8,
                                   min_samples_leaf=13,
                                   n_jobs=4)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pos_test = posArray[test_index]
        clf = clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        print(metrics.confusion_matrix(y_test, preds))
        print(metrics.classification_report(y_test, preds))
        for loc, t, p in zip(pos_test, y_test, preds):
            if t == '0' and p == '1':
                print >> outF, loc + '\t' + str(cvRound)
        cvRound += 1
    outF.close()
def testdata_stats():
    test_dataset = datasets.load_files(project_root + "/testdata",
                                       encoding='utf-8', decode_error='ignore')
    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)
    print "*****BAYESIAN STATS****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_nb == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_nb,
                                        target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)
    print "*****SVM STATS*****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
                                        target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run normal SVM classification without cross-fold validation.
    '''
    # 30% reserved for validation
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs),
                         ('scaler', StandardScaler()),
                         ('estimator', estimator)])
    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)
    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
def pipeline_summary(concept_id, pos_concept, neg_concept, pipeline, test_set, predicted, store=False):
    vectorizer = pipeline.named_steps['vectorizer']
    classifier = pipeline.named_steps['classifier']
    fnames = vectorizer.get_feature_names()
    try:
        selector = pipeline.named_steps['selector']
        indices = selector.get_support(True)
        selected_terms = [fnames[i] for i in indices]
    except KeyError:
        print 'Selector not used'
        selected_terms = fnames

    show_most_informative_features(selected_terms, classifier, n=25)
    print classification_report(test_set, predicted)
    print confusion_matrix(test_set, predicted)

    if store:
        print 'Storing pipeline...'
        pickle.dump(pipeline, open(concept_pipeline(pos_concept, neg_concept), 'wb'))
        coefs = np.where(classifier.coef_ > 0)[1]
        concept_terms = [selected_terms[i] for i in coefs]
        term_weights = [classifier.coef_[0, i] for i in coefs]
        feature_weights = sorted(zip(term_weights, concept_terms), reverse=True)
        print 'Storing concept terms'
        save_concept_terms(concept_id, feature_weights)
def fitMdl(nFitObs=50):
    mdl = linear_model.LogisticRegression(verbose=1)
    mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs, :, :],
                       (nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])),
            glbObsTrnRsp[0:nFitObs])
    print mdl.get_params()
    print mdl.coef_.shape
    print '  coeff stats:'
    for lblIx in xrange(len(dspLabels)):
        print '    label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
            (dspLabels[lblIx],
             mdl.coef_[lblIx, :].argmin() / glbImgSz, mdl.coef_[lblIx, :].argmin() % glbImgSz,
             mdl.coef_[lblIx, :].min(),
             mdl.coef_[lblIx, :].argmax() / glbImgSz, mdl.coef_[lblIx, :].argmax() % glbImgSz,
             mdl.coef_[lblIx, :].max())

    train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs, :, :], (nFitObs, glbImgSz ** 2)))
    accuracy_train = metrics.accuracy_score(train_pred_labels, glbObsTrnRsp[0:nFitObs])
    print '  accuracy train:%0.4f' % (accuracy_train)
    print metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels)

    valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr, (glbObsVldFtr.shape[0], glbImgSz ** 2)))
    accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
    print '  accuracy valid:%0.4f' % (accuracy_valid)
    print metrics.confusion_matrix(glbObsVldRsp, valid_pred_labels)

    test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr, (glbObsNewFtr.shape[0], glbImgSz ** 2)))
    accuracy_test = metrics.accuracy_score(test_pred_labels, glbObsNewRsp)
    print '  accuracy test:%0.4f' % (accuracy_test)
    test_conf = pd.DataFrame(metrics.confusion_matrix(glbObsNewRsp, test_pred_labels),
                             index=dspLabels, columns=dspLabels)
    print test_conf
    return (mdl, (accuracy_train, accuracy_valid, accuracy_test))
def train_and_evaluate(X, y, clf):
    from sklearn.cross_validation import train_test_split
    train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7, random_state=1)
    clf.fit(train_X, train_y)
    pre = clf.predict(test_X)
    print metrics.classification_report(test_y, pre)
    print metrics.confusion_matrix(test_y, pre)
def report(self):
    from sklearn.metrics import roc_auc_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix

    y_pred_probas, y_true = self.make_predictions()[:2]
    y_pred = y_pred_probas.argmax(1)
    y_pred_probas = y_pred_probas[:, 1]
    y_true = y_true.reshape(-1)

    try:
        score = roc_auc_score(y_true, y_pred_probas)
    except ValueError:
        pass
    else:
        print
        print "AUC score:", score
        print "AUC score (binary):", roc_auc_score(y_true, y_pred)
        print

    print "Classification report:"
    print classification_report(y_true, y_pred)
    print
    print "Confusion matrix:"
    print confusion_matrix(y_true, y_pred)
    print
def a_b_classify_pca((f_train, t_train, f_test, t_test, n_components)):
    '''
    Uses an SVM to classify A and B sections based on the feature vectors
    built above, and returns some statistical results.
    '''
    print '{}: Starting PCA with {} components (this could take a while...)'.format(time.ctime(), n_components)
    pca = PCA(n_components=n_components)
    pca.fit(f_train)
    f_train_pca = list(pca.transform(f_train))
    f_test_pca = list(pca.transform(f_test))

    print '{0}: Training the SVM'.format(time.ctime())
    clf = svm.SVC()
    clf.fit(f_train_pca, t_train)

    print '{0}: Classifying using SVM'.format(time.ctime())
    t_predict = clf.predict(f_test_pca)
    t_train_predict = clf.predict(f_train_pca)

    print 'Confusion matrix is built so that C_ij is the number of observations known to be in group i but predicted to be in group j. In this case, group 0 corresponds to A sections and group 1 corresponds to B sections.'
    print 'Confusion matrix on test data:'
    test_confuse = confusion_matrix(t_test, t_predict)
    print test_confuse
    print 'Confusion matrix on training data:'
    train_confuse = confusion_matrix(t_train, t_train_predict)
    print train_confuse

    return train_confuse, test_confuse
def confusion_matrices(testX, testY):
    pred = [card.predict_number() for card in testX]
    labels = ["one", "two", "three"]
    cm = confusion_matrix(testY[:, NUMBER], pred, labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Number Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()

    pred = [card.predict_shading() for card in testX]
    labels = ["empty", "striped", "solid"]
    cm = confusion_matrix(testY[:, SHADING], pred, labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Shading Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()

    pred = [card.predict_shape() for card in testX]
    labels = ["rounded-rectangle", "squiggle", "diamond"]
    cm = confusion_matrix(testY[:, SHAPE], pred, labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Shape Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()

    pred = [card.predict_color() for card in testX]
    labels = ["red", "green", "purple"]
    cm = confusion_matrix(testY[:, COLOR], pred, labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Color Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()
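# The four blocks above repeat the same plotting code; a minimal refactor
# sketch (assuming the same plt/pl imports as above) that each block could call:
def plot_cm(cm, title, labels):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title(title)
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()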
def measure_performance(X, y, clf, show_accuracy=True,
                        show_classification_report=True, show_confusion_matrix=True):
    """
    Evaluate a model with several metrics.
    :param X: test set
    :param y: ground-truth labels
    :param clf: model
    :param show_accuracy: print the accuracy
    :param show_classification_report: print the classification report
    :param show_confusion_matrix: print the confusion matrix
    :return: the confusion matrix
    """
    y_pred = clf.predict(X)
    if show_accuracy:
        print "Accuracy:{0:.4f}".format(metrics.accuracy_score(y, y_pred)), "\n"
    if show_classification_report:
        print "Classification report:"
        print metrics.classification_report(y, y_pred, labels=[0, 1],
                                            target_names=['benign URL', 'malicious URL']), "\n"
    if show_confusion_matrix:
        print "Confusion matrix:"
        print metrics.confusion_matrix(y, y_pred), "\n"
    return metrics.confusion_matrix(y, y_pred)
def benchmark(clf):
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print
    score = 1 - metrics.f1_score(y_test, pred)
    print "error: %0.3f" % score

    if hasattr(clf, 'coef_'):
        print "top 10 keywords per class:"
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print "%s: %s" % (category, " ".join(feature_names[top10]))
        print

    print metrics.classification_report(y_test, pred, target_names=categories)
    print "confusion matrix:"
    print metrics.confusion_matrix(y_test, pred)
    print

    clf_descr = str(clf).split('(')[0]
    return clf_descr, score
def select_classifier(algo, label):
    model = algo
    model.fit(training_features, training_labels)
    expected = testing_labels
    predicted = model.predict(testing_features)

    print("----------------------------------------------------")
    print("|              Classification Report               |")
    print("----------------------------------------------------")
    print(metrics.classification_report(expected, predicted))
    print("")

    print("----------------------------------------------------")
    print("|                 Confusion Matrix                 |")
    print("----------------------------------------------------")
    print(metrics.confusion_matrix(expected, predicted))
    print("")

    cm_list = metrics.confusion_matrix(expected, predicted).tolist()
    list_total = float(sum(sum(x) for x in cm_list))

    print("----------------------------------------------------")
    print("|          False Positives and Negatives           |")
    print("----------------------------------------------------")
    # rows are actual labels, columns are predictions:
    # cm[0][1] = actual 0 predicted 1 (false positive),
    # cm[1][0] = actual 1 predicted 0 (false negative)
    print "False Positive: ", cm_list[0][1] / list_total
    print("")
    print "False Negative: ", cm_list[1][0] / list_total
    print("")

    plt.figure()
    plot_confusion_matrix(metrics.confusion_matrix(expected, predicted), label)
    plt.show()
def train_and_evaluate(clf, X_train, X_test, y_train, y_test, y_name):
    # Training
    clf.fit(X_train, y_train)

    # Prediction on the testing set
    y_pred = clf.predict(X_test)

    # Precision, recall and support (i.e. nr. of samples used for the testing)
    print "Classification Report:"
    print metrics.classification_report(y_test, y_pred)

    # Confusion Matrix
    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_pred)

    # Visualization of Categories / Assigned / Data
    print "Tested data => assigned category, data:"
    for i in range(len(X_test)):
        print str(i) + ") Real category: " + str(y_name[y_test[i]]) + ", Assigned category: " + \
            str(y_name[y_pred[i]]) + ", Data: " + str(X_test[i])

    # Assign names to the categories (defined by numbers)
    print "\n Categories: \n"
    categories = set()
    for cat in y_pred:
        categories.add(cat)
    categories = sorted(categories)
    for cat in categories:
        print str(cat) + " " + y_name[cat]
def gaussian_1d_2classes(x, y):
    regr = GaussianClassification1D()
    cv = KFold(len(x), n_folds=10)
    for train_idx, test_idx in cv:
        x_train = x[train_idx]
        x_test = x[test_idx]
        y_train = y[train_idx]
        y_test = y[test_idx]
        labels = mapping_labels(np.unique(y_train))

        # Training
        regr.fit(x_train, y_train, labels)

        # Predict over the training data and get the error
        predicted_y_training = regr.predict(x_train, labels)
        conf_matrix = confusion_matrix(y_train, predicted_y_training)
        precision = calculate_precision(conf_matrix)
        recall = calculate_recall(conf_matrix)
        accuracy = calculate_accuracy(conf_matrix)
        fmeasure = calculate_fmeasure(precision, recall)

        # Predict over the testing data and get the error
        predicted_y_testing = regr.predict(x_test, labels)
        conf_matrix = confusion_matrix(y_test, predicted_y_testing)
        precision = calculate_precision(conf_matrix)
        recall = calculate_recall(conf_matrix)
        accuracy = calculate_accuracy(conf_matrix)
        fmeasure = calculate_fmeasure(precision, recall)

        print 'Precision:', precision, ' Recall:', recall, ' Accuracy:', accuracy, ' F-Measure:', fmeasure
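# The calculate_* helpers above are defined elsewhere in the project; a
# plausible sketch of what they compute from a 2x2 confusion matrix
# (rows = true labels, columns = predictions), under the usual definitions:
def calculate_precision(cm):
    tn, fp, fn, tp = cm.ravel()
    return float(tp) / (tp + fp)

def calculate_recall(cm):
    tn, fp, fn, tp = cm.ravel()
    return float(tp) / (tp + fn)

def calculate_accuracy(cm):
    tn, fp, fn, tp = cm.ravel()
    return float(tn + tp) / (tn + fp + fn + tp)

def calculate_fmeasure(precision, recall):
    return 2 * precision * recall / (precision + recall)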
def learnCART(self):
    train_input_data = self.loadData(self.train_file)
    target = [x[1] for x in train_input_data]
    target = target[1:]
    features = [x[2:] for x in train_input_data]
    features = features[1:]

    # feature selection
    # features_new = self.doFeatureSelection(features, target)

    model = self.classify(features, target)

    test_input_data = self.loadData(self.test_file)
    actualOutput = [x[1] for x in test_input_data]
    actualOutput = actualOutput[1:]
    features = [x[2:] for x in test_input_data]
    features = features[1:]
    predictedOutput = model.predict(features)
    # print predictedOutput
    # print actualOutput

    self.computeAccuracy(predictedOutput, actualOutput)
    print "Precision recall Fscore support metrics for CART "
    print precision_recall_fscore_support(actualOutput, predictedOutput)
    print "\nconfusion matrix\n"
    print confusion_matrix(actualOutput, predictedOutput)
    self.printDTRules(model)

    X = []
    Y = []
    for a in predictedOutput:
        X.append(int(a))
    for a in actualOutput:
        Y.append(int(a))
    self.plotROC(Y, X)
    result = zip(Y, X)
    self.write_To_File(result, "cart-predictions.csv")
def forward(x_data, y_data, print_conf_matrix=False):
    '''
    Neural net architecture
    :param x_data:
    :param y_data:
    :param print_conf_matrix:
    :return:
    '''
    x, t = Variable(x_data), Variable(y_data)
    h1 = F.relu(model.l1(x))
    h1 = F.max_pooling_2d(h1, max_pool_window_1, stride=max_pool_stride_1)
    h2 = F.dropout(F.relu(model.l2(h1)))
    h2 = F.average_pooling_2d(h2, avg_pool_window_2, stride=avg_pool_stride_2)
    h2 = F.max_pooling_2d(h2, max_pool_window_2, stride=max_pool_stride_2)
    y = model.l3(h2)

    # display confusion matrix
    if print_conf_matrix:
        print confusion_matrix(cuda.to_cpu(t.data),
                               cuda.to_cpu(y.data).argmax(axis=1))

    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)
auc1[label] = auc(fpr[label], tpr[label])
plt.plot(tpr[label], fpr[label],
         label='%s tagger, AUC = %.1f%%' % (label.replace('j_', ''), auc1[label] * 100.))
plt.semilogy()
plt.xlabel("Signal Efficiency")
plt.ylabel("Background Efficiency")
plt.ylim(0.001, 1)
plt.grid(True)
plt.legend(loc='upper left')
plt.figtext(0.25, 0.90, '(Unpruned)', fontweight='bold',
            wrap=True, horizontalalignment='right', fontsize=14)
plt.savefig(options.outputDir + 'ROC_' + str(time) + '.png')

# Confusion matrix
conf_mat = confusion_matrix(lbllist.numpy(), predlist.numpy())
df_cm = pd.DataFrame(conf_mat,
                     index=[i for i in full_dataset.labels_list],
                     columns=[i for i in full_dataset.labels_list])
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True, fmt='g')
plt.savefig(options.outputDir + 'confMatrix_' + str(time) + '.png')
plt.show()
print(conf_mat)

class_accuracy = 100 * conf_mat.diagonal() / conf_mat.sum(1)
print(class_accuracy)

torch.save(current_model.state_dict(),
           options.outputDir + 'JetClassifyModel_' + str(time) + '.pt')

os.makedirs(options.outputDir + 'weight_dists/', exist_ok=True)
plot_weights.plot_kernels(current_model, text=" (Locally Pruned)",
                          output=options.outputDir + 'weight_dists/' + 'weight_dist_' + str(time) + '.png')
plt.plot(hist.history['val_loss'], label='val')
plt.title('CNN_In_8_Steps : Loss & Validation Loss')
plt.legend()
plt.show()

plt.plot(hist.history['accuracy'], label='train')
plt.plot(hist.history['val_accuracy'], label='val')
plt.title('CNN_In_8_Steps : Accuracy & Validation Accuracy')
plt.legend()
plt.show()

# Confusion Matrix & Precision & Recall & F1-Score
target_names = ['Abnormal', 'Normal']
label_names = [0, 1]
Y_pred = model.predict_generator(testdata)
y_pred = np.argmax(Y_pred, axis=1)
cm = confusion_matrix(testdata.classes, y_pred, labels=label_names)
print('Confusion Matrix')
print(cm)
print('Classification Report')
print(classification_report(testdata.classes, y_pred, target_names=target_names))

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp = disp.plot(cmap=plt.cm.Blues, values_format='g')
plt.show()
train_sentiments = (train_sentiments.replace({'positive': 1, 'negative': 0})).values
test_sentiments = (test_sentiments.replace({'positive': 1, 'negative': 0})).values

corpus_train = CleanUpData(train_reviews)
corpus_test = CleanUpData(test_reviews)
#corpus_train = CleanUpData(train)
#corpus_test = CleanUpData(test)

count_vec = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train = count_vec.fit_transform(corpus_train)
count_vec_test = count_vec.transform(corpus_test)

linear_svc_count = LinearSVC(C=0.5, random_state=42, max_iter=5000)
linear_svc_count.fit(count_vec_train, train_sentiments)
predict_count = linear_svc_count.predict(count_vec_test)

print("Classification Report: \n",
      classification_report(test_sentiments, predict_count,
                            target_names=['Negative', 'Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_sentiments, predict_count))
print("Accuracy: \n", accuracy_score(test_sentiments, predict_count))
#################################################
#=================Classification================#
# Perform a classification MLP on the Taxonomy  #
# data. It has a categorical target.            #
#################################################

#==========================
# Use a logistic function
#==========================
nnclass1 = MLPClassifier(activation='logistic', solver='sgd',
                         hidden_layer_sizes=(100, 100))
nnclass1.fit(taxon_data_train, taxon_train)
nnclass1_pred = nnclass1.predict(taxon_data_test)

cm = metrics.confusion_matrix(taxon_test, nnclass1_pred)
print(cm)
plt.matshow(cm)
plt.title('Confusion Matrix')
# rows of the matrix are actual values, columns are predictions
plt.xlabel('Predicted Value')
plt.ylabel('Actual Value')
plt.xticks([0, 1, 2, 3], ['I', 'II', 'III', 'IV'])
print(metrics.classification_report(taxon_test, nnclass1_pred))

#=====================================
# Use rectified linear unit function
#=====================================
nnclass2 = MLPClassifier(activation='relu', solver='sgd',
                         hidden_layer_sizes=(100, 100))
epochs=5,
#steps_per_epoch=steps_per_epoch,
validation_data=test_it,
callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)],
verbose=1,
workers=1,
validation_steps=40)

# evaluate model
#loss = model.evaluate_generator(test_it, steps=24)

# Confusion Matrix and Classification Report
Y_pred = model.predict_generator(test_it, 5480 // batch_size + 1)
y_pred = np.argmax(Y_pred, axis=1)
print('Confusion Matrix')
tn, fp, fn, tp = confusion_matrix(test_it.classes, y_pred).ravel()
print(tp)
print(tn)
print(fp)
print(fn)
specificity = tn / (tn + fp)
print(specificity)
print('Classification Report')
target_names = ['Negative', 'Positive']
print(classification_report(test_it.classes, y_pred, target_names=target_names))

# ROC
#from sklearn.metrics import roc_curve
#y_pred_keras = model.predict(test_it)
#fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_it, y_pred_keras)
from sklearn.linear_model import LogisticRegression

# creating local variable classifier
classifier = LogisticRegression()
# Training the model
classifier.fit(X_train, y_train)
# predicting the value of y
y_pred = classifier.predict(X_test)

# importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# summary of the model prediction
print(classification_report(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :', accuracy_score(y_test, y_pred))

"""### **K-Nearest Neighbour**"""

# K-Nearest Neighbour
# importing the library
from sklearn.neighbors import KNeighborsClassifier
# creating local variable classifier
classifier = KNeighborsClassifier(n_neighbors=8)
# Training the model
classifier.fit(X_train, y_train)
Classifier = LogisticRegression(random_state=0)
Classifier.fit(X_train, y_train)

# Predicting the Test Set Results
# y_pred --> Vector of predictions
y_pred = Classifier.predict(X_test)

# Making the Confusion Matrix
# Confusion Matrix -> To see whether our Logistic Regression made correct predictions or not
# This confusion matrix will contain the correct predictions made on the Test Set as well as the incorrect predictions
# For this we are importing a function and not a class
# Distinction --> A class name begins with capital letters
# Parameters of confusion_matrix -> (1) y_true = real values of the data set, (2) y_pred
# 65 + 24 = 89 -> Correct Predictions, 8 + 3 = 11 -> Incorrect Predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualizing the Training Set Results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
plt.contourf(X1, X2,
             Classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
data = pd.read_csv("lending_club_data01.csv.txt")
#print(data.head())
#print(data.tail())

data["good_loans"] = data["bad_loans"].apply(lambda y: 'yes' if y == 0 else 'no')
print(data.head())

x = data.drop(['bad_loans', 'good_loans'], axis=1)
y = data['good_loans']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=124)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)
prediction = model.predict(x_test)
print(prediction)
print(confusion_matrix(y_test, prediction))
print(classification_report(y_test, prediction))

# By using random forest classifier
rf_model = RandomForestClassifier(n_estimators=150)
rf_model.fit(x_train, y_train)
rf_prediction = rf_model.predict(x_test)
print(rf_prediction)
print(confusion_matrix(y_test, rf_prediction))
print(classification_report(y_test, rf_prediction))
# Instructions
# 100 XP
# Import the metrics module from sklearn and MultinomialNB from sklearn.naive_bayes.
# Instantiate a MultinomialNB classifier called nb_classifier.
# Fit the classifier to the training data.
# Compute the predicted tags for the test data.
# Calculate and print the accuracy score of the classifier.
# Compute the confusion matrix. To make it easier to read, specify the keyword argument labels=['FAKE', 'REAL'].

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)
X_test_all = df_test_ml

# scaled
X_train_all_sc = df_train_ml_sc
y_train_all_sc = df_train_ml['Survived']
X_test_all_sc = df_test_ml_sc

X_test_all.fillna(X_test_all.mean(), inplace=True)
print("*")

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print(confusion_matrix(y_test, pred_logreg))
print(classification_report(y_test, pred_logreg))
print(accuracy_score(y_test, pred_logreg))

logreg.fit(X_train_all, y_train_all)
pred_all_logreg = logreg.predict(X_test_all)

sub_logreg = pd.DataFrame()
sub_logreg['PassengerId'] = df_test['PassengerId']
sub_logreg['Survived'] = pred_all_logreg
#sub_logmodel.to_csv('logmodel.csv', index=False)

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_gnb = gnb.predict(X_test)
print(confusion_matrix(y_test, pred_gnb))
print(classification_report(y_test, pred_gnb))
print(accuracy_score(y_test, pred_gnb))
def LSTM_model_train(train_data, epochs, test_data, name, jump_per):

    def f1(y_true, y_pred):
        y_pred = K.round(y_pred)
        tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
        fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
        fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)
        p = tp / (tp + fp + K.epsilon())
        r = tp / (tp + fn + K.epsilon())
        f1 = 2 * p * r / (p + r + K.epsilon())
        f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
        return K.mean(f1)

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def f1_score_own(y_true, y_pred):
        """Computes the F1 score as a batch-wise average."""
        p = precision(y_true, y_pred)
        r = recall(y_true, y_pred)
        return (2 * p * r) / (p + r + K.epsilon())

    def matthews_correlation(y_true, y_pred):
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos
        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos
        tp = K.sum(y_pos * y_pred_pos)
        tn = K.sum(y_neg * y_pred_neg)
        fp = K.sum(y_neg * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)
        numerator = (tp * tn - fp * fn)
        denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        return numerator / (denominator + K.epsilon())

    # Get the values from the data #
    # train data and test data: remove NA #
    train_data = train_data.dropna(axis=0)
    test_data = test_data.dropna(axis=0)
    train_data.loc[train_data['jump_pred'] == -1, 'jump_pred'] = 1
    test_data.loc[test_data['jump_pred'] == -1, 'jump_pred'] = 1

    x_train = train_data.drop(columns=['jump_pred', 'utcsec', 'sec'])
    y_train = train_data['jump_pred']
    smt_dict_train = {0: len(y_train[y_train == 0]),
                      1: int(np.ceil(len(y_train[y_train == 0]) * jump_per))}
    smt_train = SMOTE(sampling_strategy=smt_dict_train)
    x_train, y_train = smt_train.fit_sample(x_train, y_train)
    #x_train = x_train.values
    #y_train = pd.get_dummies(y_train)
    #y_train = y_train.values

    # Test data #
    x_test = test_data.drop(columns=['jump_pred', 'utcsec', 'sec'])
    y_test = test_data['jump_pred']
    y_test_out = y_test
    #print('Distribution of jumps', pd.DataFrame(y_test_out)[0].value_counts())
    #x_test = x_test.values
    #y_test = pd.get_dummies(y_test)
    #y_test = y_test.values

    # LSTM #
    lstm_output_size = 40

    # Training #
    batch_size = 248
    print("Batch size:", batch_size)

    # Scale the values #
    min_max_scaler = preprocessing.StandardScaler()
    #x_train = x_train.values
    x_train = min_max_scaler.fit_transform(x_train)
    #x_train = pd.DataFrame(x_train_scaled)
    #x_test = x_test.values
    x_test = min_max_scaler.fit_transform(x_test)
    #x_test = pd.DataFrame(x_test_scaled)

    # Print shapes before reshaping #
    #print('------------------------------')
    #print('Shapes before reshaping')
    #print('x_train shape:', x_train.shape)
    #print('x_test shape:', x_test.shape)
    #print('y_train shape:', y_train.shape)
    #print('y_test shape:', y_test.shape)

    # Reshape to LSTM training format #
    (x_train, y_train) = (x_train.reshape(np.shape(x_train)[0], np.shape(x_train)[1], -1),
                          y_train.reshape(np.shape(y_train)[0], 1))
    (x_test, y_test) = (x_test.reshape(np.shape(x_test)[0], np.shape(x_test)[1], -1),
                        y_test.reshape(np.shape(y_test)[0], 1))
    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)

    print("Build model...")
    seq_length = x_train.shape[1]
    input_dims = x_train.shape[2]
    inputs = Input(shape=(seq_length, input_dims))
    dense_att = Dense(input_dims, activation='relu',
                      kernel_regularizer=regularizers.l2(0.01), name='dense_att')(inputs)
    attention_probs = Dense(input_dims, activation='sigmoid', name='attention_probs')(dense_att)
    attention_mul = multiply([dense_att, attention_probs], name='attention_mul')
    conv_1d = Conv1D(filters=16, kernel_size=4, name='conv_1d')(attention_mul)
    max_pool_1 = MaxPooling1D(pool_size=2, name='max_pool_1')(conv_1d)
    conv_1d_2 = Conv1D(filters=32, kernel_size=3, name='conv_1d_2')(max_pool_1)
    conv_1d_3 = Conv1D(filters=32, kernel_size=3, name='conv_1d_3')(conv_1d_2)
    max_pool_2 = MaxPooling1D(pool_size=2, name='max_pool_2')(conv_1d_3)
    lstm = LSTM(40, return_sequences=False, recurrent_dropout=0.25, name='lstm')(max_pool_2)
    dense_1 = Dense(40, activation='relu')(lstm)
    dense_out = Dense(1, activation='sigmoid', name='dense_out')(dense_1)

    model = Model(inputs=[inputs], outputs=dense_out)
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=[f1_score_own, precision, recall])

    fileName = './models/weights_best_' + name + '.hdf5'
    checkpointer = ModelCheckpoint(filepath=fileName, monitor='val_f1_score_own', verbose=1,
                                   save_best_only=True, save_weights_only=False,
                                   mode='max', period=1)

    print('Train the model...')
    # Early stopping #
    es = EarlyStopping(monitor='val_f1_score_own', mode='max', verbose=1, patience=5)
    # Note: validation_data overrides validation_split here, so the model
    # validates on the (oversampled) training data itself.
    model.fit(x_train, y_train,
              validation_split=0.2,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_train, y_train),
              verbose=1,
              callbacks=[checkpointer, es],
              use_multiprocessing=True)  # Maybe turn off multiprocessing #

    model.load_weights(fileName)
    loss, f1, precision, recall = model.evaluate(x_test, y_test, verbose=1)
    #loss, cat_loss = model.evaluate(x_test, y_test, verbose=1)
    #loss, tp, fp, tn, fn, ba, precision, recall, auc, recall_two = model.evaluate(x_test, y_test, verbose=1)
    y_pred = model.predict(x_test)

    f1_calc = f1_score(y_test_out, np.round(y_pred))
    precision_calc = precision_score(y_test_out, np.round(y_pred))
    recall_calc = recall_score(y_test_out, np.round(y_pred))
    cohens_kappa_calc = cohen_kappa_score(y_test_out, np.round(y_pred))
    mcc_calc = matthews_corrcoef(y_test_out, np.round(y_pred))

    print('------------------------------')
    #print('Test F1 def func:', f1)
    #print('Test prec def func:', precision)
    print('Test recall def func:', recall)
    #print('Test def MCC:', mcc)
    #print('Test recall def two func:', recall_two)
    print('Test F1 score:', f1_calc)
    print('Test precision:', precision_calc)
    print('Test recall:', recall_calc)
    print('Test Cohens kappa:', cohens_kappa_calc)
    print('Test MCC:', mcc_calc)
    print('Test loss:', loss)

    out_dict = {}
    out_dict['loss'] = loss
    out_dict['f1'] = f1_calc
    out_dict['precision'] = precision_calc
    out_dict['recall'] = recall_calc
    out_dict['y_pred'] = y_pred
    out_dict['y_test'] = y_test_out
    out_dict['con_mat'] = confusion_matrix(y_test_out, np.round(y_pred))
    out_dict['cohens_kappa'] = cohens_kappa_calc
    out_dict['mcc'] = mcc_calc
    return out_dict
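# A hedged usage sketch for LSTM_model_train; the file names, epoch count and
# oversampling ratio below are illustrative assumptions, not values from the
# original experiments.
# train_df = pd.read_csv('train_jumps.csv')
# test_df = pd.read_csv('test_jumps.csv')
# out = LSTM_model_train(train_df, epochs=30, test_data=test_df,
#                        name='jump_lstm', jump_per=0.25)
# print(out['con_mat'])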
def plot_confusion_matrix(step, y_true, y_pred, output_size):
    # check result directory
    result_dir = 'result'
    check_existing_dir(result_dir)
    print('plot confusion matrix start: ', end='')

    # preprocessing: drop the -1 padding values
    y_true = [x for x in y_true if x != -1]
    y_pred = [x for x in y_pred if x != -1]

    # compute confusion matrix
    cnf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)

    # configuration
    np.set_printoptions(precision=2)
    if output_size == 2:
        labels = ['benign', 'malware']
    else:
        # toy dataset label
        # labels = ['Virus', 'Worm', 'Trojan', 'not-a-virus:Downloader', 'Trojan-Ransom', 'Backdoor']
        labels = list(range(output_size))
    tick_marks = np.arange(len(labels))

    norm_flag = True
    plot_title = 'Confusion matrix'
    cmap = plt.cm.Blues

    if norm_flag:
        cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    for row in cnf_matrix:
        for val in row:
            print('{0:.2f}'.format(val), end=' ')
        print()

    # plotting start
    plt.figure()
    plt.imshow(cnf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(plot_title)
    plt.colorbar()
    # set the tick labels after the figure is created so they apply to it
    plt.xticks(tick_marks, labels, rotation=90)
    plt.yticks(tick_marks, labels)

    # information about each block's value
    fmt = '.3f' if norm_flag else 'd'
    thresh = cnf_matrix.max() / 2.
    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, format(cnf_matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

    # insert legend information
    # import matplotlib.patches as mpatches
    # patches = [mpatches.Patch(color='white', label='G{num} = {group}'.format(num=i+1, group=labels[i])) for i in range(len(labels))]
    # plt.legend(handles=patches, bbox_to_anchor=(-0.60, 1), loc=2, borderaxespad=0.)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # plt.show()
    plt.savefig(os.path.join(result_dir, 'conf_matrix{}'.format(step)))
    print('--plot confusion matrix finish--')
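# A hedged usage sketch for plot_confusion_matrix with toy labels; the -1
# entries stand in for the padding values the function filters out.
y_true_toy = [0, 1, 1, 0, 1, -1]
y_pred_toy = [0, 1, 0, 0, 1, -1]
plot_confusion_matrix(step=0, y_true=y_true_toy, y_pred=y_pred_toy, output_size=2)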
extension = path.split(".")[-1]
if extension == 'csv':
    result_df = pd.read_csv(path, header=[0, 1])
elif extension == 'tsv':
    result_df = pd.read_table(path, header=[0, 1])

for dataset in test_names:
    print(result_df.get(dataset) is None, result_df.get('gt') is None)
    if result_df.get(dataset) is not None:
        gt_column = result_df[dataset, "label"]
        pred_column = result_df[dataset, "predicted"]
    else:
        gt_column = result_df["gt"]
        pred_column = result_df["pred"]

    tn, fp, fn, tp = confusion_matrix(gt_column.dropna(),
                                      round_by_threshold(pred_column.dropna(), th)).ravel()
    print("Evaluation of the %s set " % dataset)
    sen = float(tp) / (fn + tp)
    pre = float(tp) / (tp + fp)
    spe = float(tn) / (tn + fp)
    acc = float(tn + tp) / (tn + fp + fn + tp)
    f1 = (2 * sen * pre) / (sen + pre)
    print("\tSen : ", sen)
    print("\tSpe : ", spe)
    print("\tAcc : ", acc)
    print("\tPrecision : ", pre)
    print("\tF1 : ", f1)
    result_dic = {"Acc": acc, "Sen": sen, "Pre": pre, "F1": f1, "Spe": spe}

    if args.no_threshold:
        fpr, tpr, thresholds_AUC = roc_curve(gt_column, pred_column)
        AUC = auc(fpr, tpr)
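# round_by_threshold is defined elsewhere in this script; a minimal sketch of
# the assumed behaviour (binarize predicted scores at threshold th):
def round_by_threshold(scores, th):
    return [1 if s >= th else 0 for s in scores]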
############### fit frequency based word embeddings to our data set to turn text into word vectors
vectorizer = TfidfVectorizer(lowercase=True, stop_words=STOPWORDS)
vectorizer.fit(x_train)
x_train_vect = vectorizer.transform(x_train)
x_test_vect = vectorizer.transform(x_test)

############# Build our classifier with a Linear Support Vector Machine
model = SVC(C=1, kernel='linear', class_weight='balanced')
model.fit(x_train_vect, y_train)
y_pred = model.predict(x_test_vect)

########## confusion matrix for test set
cm = confusion_matrix(y_test, y_pred)

#### save our model with the pipeline function for future analysis
pipeline = make_pipeline(vectorizer, model)

def predict(text):
    score = pipeline.predict([clean_text(text)])
    if score == 0:
        topic = 'real news'
    elif score == 1:
        topic = 'General spam'
    elif score == 2:
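# clean_text is assumed to be the same preprocessing applied before
# vectorizing; a minimal placeholder sketch (lowercase, strip non-letters):
import re

def clean_text(text):
    text = text.lower()
    return re.sub(r'[^a-z\s]', ' ', text).strip()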
bayes_clf.fit(x_train, y_train)

""" Predict the test dataset using Naive Bayes"""
predicted = bayes_clf.predict(x_test)
print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
# print F1 score, precision, recall and related metrics
print(metrics.classification_report(y_test, predicted, target_names=categories))

""" Support Vector Machine (SVM) classifier"""
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                          alpha=1e-3, max_iter=5, random_state=42)),
                    ])
svm_clf.fit(x_train, y_train)
predicted = svm_clf.predict(x_test)
print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted, target_names=categories))
# print the confusion matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predicted))
print('\n')

""" 10-fold cross-validation """
clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
clf_s = make_pipeline(CountVectorizer(), TfidfTransformer(),
                      SGDClassifier(loss='hinge', penalty='l2',
                                    alpha=1e-3, max_iter=5, random_state=42))
bayes_10_fold = cross_val_score(clf_b, x_text, y, cv=10)
svm_10_fold = cross_val_score(clf_s, x_text, y, cv=10)
print('Naive Bayes 10-fold correct prediction: {:4.4f}'.format(np.mean(bayes_10_fold)))
print('SVM 10-fold correct prediction: {:4.4f}'.format(np.mean(svm_10_fold)))
all_X = train[columns]
all_y = train['Survived']

train_X, test_X, train_y, test_y = train_test_split(all_X, all_y, test_size=0.2, random_state=0)
lr.fit(train_X, train_y)
predictions = lr.predict(test_X)
accuracy = accuracy_score(test_y, predictions)

from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(test_y, predictions)
conf_df = pd.DataFrame(conf_matrix,
                       columns=['Survived', 'Died'],
                       index=['Survived', 'Died'])
print(conf_df)

from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(lr, all_X, all_y, cv=10)

print('accuracy: ', accuracy)
print('mean :', np.mean(scores))

lr = LogisticRegression()
kfold = model_selection.KFold(n_splits=10)
cross_res = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
results.append((name, cross_res))

for name, res in results:
    print("{:6} {:2.4} {:2.4}".format(name, res.mean(), res.std()))

model = LinearDiscriminantAnalysis(solver='lsqr')
model.fit(X_train, Y_train)
predictions = model.predict(X_val)
print(accuracy_score(Y_val, predictions))
print(classification_report(Y_val, predictions))
print(confusion_matrix(Y_val, predictions))

model = LinearDiscriminantAnalysis(solver='eigen')
model.fit(X_train, Y_train)
predictions = model.predict(X_val)
print(accuracy_score(Y_val, predictions))
print(classification_report(Y_val, predictions))
print(confusion_matrix(Y_val, predictions))
axes[i].set_xlim(x_min, x_max)
axes[i].set_ylim(y_min, y_max)
plt.sca(axes[i])
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.prism)
ys = (-clf.intercept_[i] - xs * clf.coef_[i, 0]) / clf.coef_[i, 1]
plt.plot(xs, ys)

print(clf.predict(scaler.transform([[4.7, 3.1]])))
print(clf.decision_function(scaler.transform([[4.7, 3.1]])))

from sklearn import metrics

y_train_pred = clf.predict(X_train)
print(metrics.accuracy_score(y_train, y_train_pred))

y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

print(metrics.classification_report(y_test, y_pred, target_names=iris.target_names))
print(metrics.confusion_matrix(y_test, y_pred))

print("My name is Timothee Becker")
print("My NetID is: tbecker5")
print("I hereby certify that I have read the University policy on Academic Integrity and that I am not in violation.")
}
GridSearch_Log_Reg = GridSearchCV(model_Logistic_Regression, parameters, n_jobs=-1)
GridSearch_Log_Reg = GridSearch_Log_Reg.fit(X, y)

print('Training the Model, please be patient.')
print('\n')
print('GridSearch Logistic Regression best score: \n', GridSearch_Log_Reg.best_score_)
print('\n')
print('GridSearch Logistic Regression best parameters: \n', GridSearch_Log_Reg.best_params_)

# Fit the model on the training split
model_Logistic_Regression.fit(X_train, y_train)
pred = model_Logistic_Regression.predict(X_test)

# Confusion matrix on the test split (y_true first, then the predictions)
print('confusion matrix: \n', confusion_matrix(y_test, pred))

print('Fit X, y with Logistic Regression Algorithm ', model_Logistic_Regression.fit(X, y))
#model_Logistic_Regression.fit(X, y)

# Export pkl
joblib.dump(GridSearch_Log_Reg, "model_LogistikRegression.pkl")

# Import pkl
model_Logistic_Regression_loaded = joblib.load("model_LogistikRegression.pkl")

y_preds = model_Logistic_Regression.predict(X_test)
print('\n')
print('\n')
print('accuracy score: ', accuracy_score(y_test, y_preds))
print('\n')
print('confusion matrix: \n', confusion_matrix(y_test, y_preds))
print('\n')
print(classification_report(y_test, y_preds))
                       X_test, y_test,
                       name='ROC fold {}'.format(k_iteration),
                       alpha=0.3, lw=1, ax=ax)
interp_tpr = interp(mean_fpr, viz.fpr, viz.tpr)
interp_tpr[0] = 0.0
tprs.append(interp_tpr)
aucs.append(viz.roc_auc)

y_pred_train = trained_model.predict(X_train)
y_pred_test = trained_model.predict(X_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test).ravel()

__print(model_name + 'train_acc: {}'.format(accuracy_train))
__print(model_name + 'test_acc: {}'.format(accuracy_test))
__print(model_name + 'TN:{}'.format(TN))
__print(model_name + 'FP:{}'.format(FP))
__print(model_name + 'FN:{}'.format(FN))
__print(model_name + 'TP:{}'.format(TP))

result_dict[model_name]['train_acc'].append(accuracy_train)
result_dict[model_name]['test_acc'].append(accuracy_test)
result_dict[model_name]['TN'].append(TN)
result_dict[model_name]['TP'].append(TP)
result_dict[model_name]['FP'].append(FP)
result_dict[model_name]['FN'].append(FN)
def infer(self):
    # Earlier experiment, kept for reference: an IsolationForest fitted on
    # latent features that had been pre-computed and pickled to disk.
    # from sklearn import svm
    # from sklearn.ensemble import IsolationForest
    # from sklearn.metrics import confusion_matrix
    # import pickle as pk
    # train_labels, test_labels = pk.load(open(
    #     '/Users/badgod/badgod_documents/Datasets/covid19/processed_data/coswara_train_data_fbank_cough-shallow_labels.pkl',
    #     'rb')), pk.load(open(
    #     '/Users/badgod/badgod_documents/Datasets/covid19/processed_data/coswara_test_data_fbank_cough-shallow_labels.pkl',
    #     'rb'))
    # train_latent_features, test_latent_features = pk.load(
    #     open('/Users/badgod/badgod_documents/Datasets/covid19/processed_data/forced_train_latent.npy',
    #          'rb')), pk.load(
    #     open('/Users/badgod/badgod_documents/Datasets/covid19/processed_data/forced_test_latent.npy', 'rb'))
    # # for x, y in zip(train_latent_features, train_labels):
    # #     if y == 0:
    # #         print('Mean: ', np.mean(x), ' Std: ', np.std(x), ' | Label: ', y)
    # # for x, y in zip(train_latent_features, train_labels):
    # #     if y == 1:
    # #         print('Mean: ', np.mean(x), ' Std: ', np.std(x), ' | Label: ', y)
    # # exit()
    # self.logger.info(
    #     'Total train data len: ' + str(len(train_labels)) + ' | Positive samples: ' + str(sum(train_labels)))
    # self.logger.info(
    #     'Total test data len: ' + str(len(test_labels)) + ' | Positive samples: ' + str(sum(test_labels)))
    # # oneclass_svm = svm.OneClassSVM(kernel="rbf")
    # oneclass_svm = IsolationForest(random_state=0)
    # oneclass_svm.fit(train_latent_features)
    # oneclass_predictions = oneclass_svm.predict(train_latent_features)
    # masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
    # train_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(train_labels), threshold=self.threshold)
    # train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
    # self.logger.info('***** Train Metrics *****')
    # self.logger.info(
    #     f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
    #     f"| UAR: {'%.5f' % train_metrics['train_uar']} | F1: {'%.5f' % train_metrics['train_f1']} "
    #     f"| Precision: {'%.5f' % train_metrics['train_precision']} "
    #     f"| Recall: {'%.5f' % train_metrics['train_recall']} | AUC: {'%.5f' % train_metrics['train_auc']}")
    # self.logger.info('Train Confusion matrix - \n' + str(confusion_matrix(train_labels, masked_predictions)))
    # # Test
    # oneclass_predictions = oneclass_svm.predict(test_latent_features)
    # masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
    # test_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(test_labels), threshold=self.threshold)
    # test_metrics = {'test_' + k: v for k, v in test_metrics.items()}
    # self.logger.info('***** Test Metrics *****')
    # self.logger.info(
    #     f"Accuracy: {'%.5f' % test_metrics['test_accuracy']} "
    #     f"| UAR: {'%.5f' % test_metrics['test_uar']} | F1: {'%.5f' % test_metrics['test_f1']} "
    #     f"| Precision: {'%.5f' % test_metrics['test_precision']} "
    #     f"| Recall: {'%.5f' % test_metrics['test_recall']} | AUC: {'%.5f' % test_metrics['test_auc']}")
    # self.logger.info('Test Confusion matrix - \n' + str(confusion_matrix(test_labels, masked_predictions)))

    from sklearn import svm
    from sklearn.metrics import confusion_matrix
    import pickle

    self._min, self._max = -80.0, 3.8146973e-06
    train_data, train_labels = self.data_reader(self.data_read_path, [self.train_file], shuffle=False,
                                                train=True, only_negative_samples=False)
    test_data, test_labels = self.data_reader(self.data_read_path, [self.test_file], shuffle=False,
                                              train=False, only_negative_samples=False)

    # data_reader yields one list of labels per batch; flatten them once so
    # the metrics and the indexing below line up with the per-sample latents.
    train_labels_flat = [element for sublist in train_labels for element in sublist]
    test_labels_flat = [element for sublist in test_labels for element in sublist]

    train_latent_features, test_latent_features = [], []
    with torch.no_grad():
        for i, (audio_data, label) in enumerate(zip(train_data, train_labels)):
            audio_data = to_tensor(audio_data, device=self.device)
            train_predictions, train_latent = self.network(audio_data)
            train_latent_features.extend(to_numpy(train_latent.squeeze(1)))
    # Cache to disk (pickled, despite the .npy extension).
    pickle.dump(train_latent_features, open('ae_contrastive_train_latent.npy', 'wb'))

    oneclass_svm = svm.OneClassSVM(nu=0.1, kernel="poly", gamma=0.1)
    oneclass_svm.fit(train_latent_features)
    oneclass_predictions = oneclass_svm.predict(train_latent_features)
    masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
    train_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(train_labels_flat),
                                threshold=self.threshold)
    train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
    self.logger.info('***** Train Metrics *****')
    self.logger.info(
        f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
        f"| UAR: {'%.5f' % train_metrics['train_uar']} | F1: {'%.5f' % train_metrics['train_f1']} "
        f"| Precision: {'%.5f' % train_metrics['train_precision']} "
        f"| Recall: {'%.5f' % train_metrics['train_recall']} | AUC: {'%.5f' % train_metrics['train_auc']}")
    self.logger.info('Train Confusion matrix - \n' + str(confusion_matrix(train_labels_flat, masked_predictions)))

    # Test
    with torch.no_grad():
        for i, (audio_data, label) in enumerate(zip(test_data, test_labels)):
            audio_data = to_tensor(audio_data, device=self.device)
            test_predictions, test_latent = self.network(audio_data)
            test_latent_features.extend(to_numpy(test_latent.squeeze(1)))
    pickle.dump(test_latent_features, open('ae_contrastive_test_latent.npy', 'wb'))

    oneclass_predictions = oneclass_svm.predict(test_latent_features)
    masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
    test_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(test_labels_flat),
                               threshold=self.threshold)
    test_metrics = {'test_' + k: v for k, v in test_metrics.items()}
    self.logger.info('***** Test Metrics *****')
    self.logger.info(
        f"Accuracy: {'%.5f' % test_metrics['test_accuracy']} "
        f"| UAR: {'%.5f' % test_metrics['test_uar']} | F1: {'%.5f' % test_metrics['test_f1']} "
        f"| Precision: {'%.5f' % test_metrics['test_precision']} "
        f"| Recall: {'%.5f' % test_metrics['test_recall']} | AUC: {'%.5f' % test_metrics['test_auc']}")
    self.logger.info('Test Confusion matrix - \n' + str(confusion_matrix(test_labels_flat, masked_predictions)))

    # Per-class latent statistics. The indices must come from the flattened
    # labels; enumerating the per-batch lists (as the original did) compares
    # whole sublists against 0/1 and selects nothing.
    train_latent_features = np.array(train_latent_features)
    test_latent_features = np.array(test_latent_features)
    ones_idx = [i for i, x in enumerate(train_labels_flat) if x == 1]
    zeros_idx = [i for i, x in enumerate(train_labels_flat) if x == 0]
    print(train_latent_features[ones_idx].mean(), train_latent_features[ones_idx].std())
    print(train_latent_features[zeros_idx].mean(), train_latent_features[zeros_idx].std())
    ones_idx = [i for i, x in enumerate(test_labels_flat) if x == 1]
    zeros_idx = [i for i, x in enumerate(test_labels_flat) if x == 0]
    print(test_latent_features[ones_idx].mean(), test_latent_features[ones_idx].std())
    print(test_latent_features[zeros_idx].mean(), test_latent_features[zeros_idx].std())
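# The helper mask_preds_for_one_class() used above is not defined in this
# snippet. OneClassSVM and IsolationForest both return +1 for inliers and -1
# for outliers, so the helper presumably remaps those values to the 0/1
# labels the metrics expect. A minimal sketch, assuming -1 (outlier) maps to
# the positive class; this is an assumption, not the author's code:
import numpy as np

def mask_preds_for_one_class(predictions):
    # +1 (inlier) -> 0, -1 (outlier) -> 1
    return np.where(np.asarray(predictions) == -1, 1, 0)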
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

dataset = pd.read_csv(r'C:\Users\96251\Desktop\ML_code\files\100-Days-Of-ML-Code-master\datasets\Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
Y = dataset.iloc[:, 4].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)

# Visualise the training-set decision regions.
X_set, Y_set = X_train, Y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))
print('X_set[:,0].min()-1', X_set[:, 0].min() - 1)
print('X_set[:, 0].max()+1', X_set[:, 0].max() + 1)
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('LightCoral', 'MintCream')))
# Shape-debugging prints for the grid trick above.
print('a', np.array([X1.ravel(), X2.ravel()]))
print('b', classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).shape)
print('c', classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape).shape)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(Y_set)):
    # (loop body restored; the original snippet was cut off here -
    # this is the standard completion of the template: scatter each class.)
    plt.scatter(X_set[Y_set == j, 0], X_set[Y_set == j, 1],
                color=ListedColormap(('LightCoral', 'MintCream'))(i), label=j)
plt.legend()
plt.show()
# (Fragment from inside a per-diagnosis plotting loop: k indexes the subplot
# in a 2x3 grid, name tags the output file.)
ax.set_ylim([0.0, 1.02])
if k in [3, 4, 5]:
    ax.set_xlabel('Recall (Sensitivity)', fontsize=17)
if k in [0, 3]:
    ax.set_ylabel('Precision (PPV)', fontsize=17)
# plt.title('Precision-Recall curve (' + name + ')')
if k == 0:
    plt.legend(loc="lower left", fontsize=17)
else:
    ax.legend().remove()
plt.tight_layout()
plt.savefig('./outputs/figures/precision_recall_{0}.pdf'.format(name))

# %% Confusion matrices (Supplementary Table 1)
M = [[confusion_matrix(y_true[:, k], y_pred[:, k], labels=[0, 1])
      for k in range(nclasses)]
     for y_pred in [y_neuralnet, y_cardio, y_emerg, y_student]]
M_xarray = xr.DataArray(np.array(M),
                        dims=['predictor', 'diagnosis', 'true label', 'predicted label'],
                        coords={'predictor': ['DNN', 'cardio.', 'emerg.', 'stud.'],
                                'diagnosis': diagnosis,
                                'true label': ['not present', 'present'],
                                'predicted label': ['not present', 'present']})
confusion_matrices = M_xarray.to_dataframe('n')
confusion_matrices = confusion_matrices.reorder_levels([1, 2, 3, 0], axis=0)
confusion_matrices = confusion_matrices.unstack().unstack()
confusion_matrices = confusion_matrices['n']
confusion_matrices.to_excel("./outputs/tables/confusion matrices.xlsx", float_format='%.3f')
confusion_matrices.to_csv("./outputs/tables/confusion matrices.csv", float_format='%.3f')
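# The nested comprehension above builds one 2x2 matrix per (predictor,
# diagnosis) pair. For the per-class part, scikit-learn ships an equivalent
# helper; a small self-contained sketch with toy 0/1 indicator arrays:
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

y_true_toy = np.array([[1, 0], [1, 1], [0, 1], [0, 0]])
y_pred_toy = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
# Shape (n_classes, 2, 2); rows are true labels, columns predicted labels.
print(multilabel_confusion_matrix(y_true_toy, y_pred_toy))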
import numpy as np
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import confusion_matrix

# Reshape to (samples, timesteps, features): len(X_train) samples (8000 here),
# one timestep, 16 features per step. (The original comment said 13 features,
# but the reshape itself uses 16.)
X_train = X_train.reshape(len(X_train), 1, 16)
X_test = X_test.reshape(len(X_test), 1, 16)

model = Sequential()
model.add(LSTM(100, input_shape=(None, 16), activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # 'output_dim=1' is the deprecated Keras 1 spelling
print('Model loaded.')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Model compiled.')
print(model.summary())
model.fit(X_train, Y_train, epochs=20)

val_loss, val_acc = model.evaluate(X_test, Y_test)
print(val_loss, val_acc)

p = model.predict(X_test)
for i in range(0, 40):
    print(p[i], Y_test[i])

# Threshold the sigmoid outputs at 0.5 to get hard class labels.
p = (p > 0.5)
cm = confusion_matrix(Y_test, p)
print(cm)
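# A minimal shape check for the (samples, timesteps, features) convention the
# reshape above relies on; synthetic data, not the original dataset:
import numpy as np

X_toy = np.random.rand(8, 16)               # 8 samples, 16 flat features
X_toy3d = X_toy.reshape(len(X_toy), 1, 16)  # one timestep of 16 features each
print(X_toy3d.shape)                        # (8, 1, 16)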
# Fitting the classifier into the Training set
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=0)
classifier.fit(X_Train, Y_Train)

# Predicting the test set results
Y_Pred = classifier.predict(X_Test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_Test, Y_Pred)

# Visualising the Training set results
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
X_Set, Y_Set = X_Train, Y_Train
X1, X2 = np.meshgrid(
    np.arange(start=X_Set[:, 0].min() - 1, stop=X_Set[:, 0].max() + 1, step=0.01),
    np.arange(start=X_Set[:, 1].min() - 1, stop=X_Set[:, 1].max() + 1, step=0.01))
# (Call restored from the truncation: predict on the flattened grid, then
# reshape back to the grid shape. The colours are placeholders; the original
# snippet was cut off before the cmap argument.)
plt.contourf(X1, X2,
             classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=ListedColormap(('red', 'green')))
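# Both decision-region snippets hand-roll the meshgrid/ravel/predict/reshape
# dance. If scikit-learn >= 1.1 is available, DecisionBoundaryDisplay wraps
# it; a self-contained sketch on synthetic data:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import DecisionBoundaryDisplay

X_demo, y_demo = make_classification(n_features=2, n_informative=2,
                                     n_redundant=0, random_state=0)
clf_demo = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_demo, y_demo)

disp = DecisionBoundaryDisplay.from_estimator(clf_demo, X_demo,
                                              response_method='predict', alpha=0.75)
disp.ax_.scatter(X_demo[:, 0], X_demo[:, 1], c=y_demo, edgecolor='k')
plt.show()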
def specificity_score(y_true, y_pred):
    # Specificity = TN / (TN + FP), read off the first row of the confusion
    # matrix with a fixed [0, 1] label order.
    m = confusion_matrix(y_true, y_pred, labels=[0, 1])
    spc = m[0, 0] * 1.0 / (m[0, 0] + m[0, 1])
    return spc
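# Quick usage check (toy data, not from the source): specificity is the
# recall of the negative class, so it should agree with recall_score at
# pos_label=0.
from sklearn.metrics import recall_score

y_true_toy = [0, 0, 0, 1, 1, 0]
y_pred_toy = [0, 1, 0, 1, 0, 0]
print(specificity_score(y_true_toy, y_pred_toy))          # 3 TN / (3 TN + 1 FP) = 0.75
print(recall_score(y_true_toy, y_pred_toy, pos_label=0))  # same quantity: 0.75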
# Defining the training model and fitting it on the train/test split.
# Several algorithms were tried; linear_model.BayesianRidge() was the most
# accurate on the proposed training base.
import pandas as pd
from sklearn import linear_model, metrics

model = linear_model.BayesianRidge()
model.fit(X_train, Y_train)
preds = model.predict(X_test)

# Printing the model's accuracy. BayesianRidge is a regressor, so its
# continuous predictions are rounded to 0/1 before scoring.
accuracy = metrics.accuracy_score(preds.round(), Y_test)
print("Accuracy : %s" % "{0:.3%}".format(accuracy))

# Building the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, preds.round())

# Writing the model's predictions to a file, based on the generated
# train/test/prediction sets
createPrediction()

##################################################################################################
###### End of the data-mining model run on the already preprocessed dataset
##################################################################################################

#######################################################################################
############ Building the standings table from the prediction results
#######################################################################################
jogos = pd.read_csv('result.csv')
tabela = grupoTime[grupoTime['ano'] == 2018]
tabela = tabela.filter(items=['id', 'team', 'grupo'])
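# For comparison (a sketch, not the author's pipeline): with a 0/1 target the
# conventional route is a classifier, which predicts labels directly instead
# of rounding regression output. Reuses the split from the snippet above.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, Y_train)
labels = clf.predict(X_test)  # hard 0/1 labels, no rounding needed
print(accuracy_score(Y_test, labels))
print(confusion_matrix(Y_test, labels))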
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# LR fitted on the PCA-transformed data
classifier2 = LogisticRegression(random_state=0)
classifier2.fit(X_train2, y_train)

# predictions
y_pred = classifier.predict(X_test)
y_pred2 = classifier2.predict(X_test2)

from sklearn.metrics import confusion_matrix

# actual vs. the result without PCA
print('actual / without PCA')
cm = confusion_matrix(y_test, y_pred)
print(cm)

# actual vs. the result after PCA
print('actual / with PCA')
cm2 = confusion_matrix(y_test, y_pred2)
print(cm2)

# predictions without PCA vs. with PCA
print('without PCA vs. with PCA')
cm3 = confusion_matrix(y_pred, y_pred2)
print(cm3)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
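# X_train2 / X_test2 are presumably produced by an earlier PCA step that this
# fragment omits; a minimal sketch of what that step would look like:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train2 = pca.fit_transform(X_train)  # fit the components on train only
X_test2 = pca.transform(X_test)        # project test with the same components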
import numpy as np
import pandas as pd
from sklearn import linear_model, metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

# 1-Split the data into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

# 2-Let us build the model and validate the parameters
clf1 = linear_model.LogisticRegression(solver='lbfgs')
clf1.fit(X_train, Y_train)

# 3-Run the test data against the new model
probs = clf1.predict_proba(X_test)
print(probs)
predicted = clf1.predict(X_test)
print(predicted)

# 4-Check model accuracy
print(metrics.accuracy_score(Y_test, predicted))

# To avoid sampling bias run cross validation 10 times, as follows
scores = cross_val_score(linear_model.LogisticRegression(solver='lbfgs'), X, Y,
                         scoring='accuracy', cv=10)
print(scores)
print(scores.mean())

# Generate the confusion matrix as follows. Note the deliberately low 0.05
# probability cutoff: anything with at least a 5% positive probability is
# classified as positive.
prob = probs[:, 1]
prob_df = pd.DataFrame(prob)
prob_df['predict'] = np.where(prob_df[0] >= 0.05, 1, 0)
Y_A = Y_test.values
Y_P = np.array(prob_df['predict'])
# Store the result under a new name; assigning to `confusion_matrix`, as the
# original did, shadows the imported function.
cm = confusion_matrix(Y_A, Y_P)
print(cm)
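# Rather than hand-picking the 0.05 cutoff, precision_recall_curve sweeps
# every threshold at once (a sketch reusing probs / Y_test from above):
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(Y_test, probs[:, 1])
for t, p, r in zip(thresholds, precision[:-1], recall[:-1]):
    print("threshold %.3f -> precision %.3f, recall %.3f" % (t, p, r))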
import pandas as pd
import matplotlib.pyplot as plt

# Wrap each importance value in a list so the DataFrame has one column.
temp = []
for i in imptlist:
    temp.append([i])
temp = pd.DataFrame(temp, index=['Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
                                 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'Age',
                                 'Education_Bachelors', 'Education_Graduate Degree',
                                 'Education_High School', 'Education_Partial College',
                                 'Education_Partial High School', 'Occupation_Clerical',
                                 'Occupation_Management', 'Occupation_Manual',
                                 'Occupation_Professional', 'Occupation_Skilled Manual'])
temp.columns = ['Feature Importance']
print(temp)

import sklearn.metrics as metrics

yc_predict = fcmodel.predict(Xc_test)
print(metrics.confusion_matrix(yc_test, yc_predict))
print('Recall Score:', round(metrics.recall_score(yc_test, yc_predict) * 100, 3))
print('Accuracy Score:', round(metrics.accuracy_score(yc_test, yc_predict) * 100, 3))
print('Precision:', round(metrics.precision_score(yc_test, yc_predict) * 100, 3))
print('F1 Score:', round(metrics.f1_score(yc_test, yc_predict) * 100, 3))

# Note: the curve is built from hard 0/1 predictions, so it has a single
# interior operating point rather than a smooth sweep over thresholds.
fpr, tpr, threshold = metrics.roc_curve(yc_test, yc_predict)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
# The original passed both a format-string colour and a color= kwarg (the
# kwarg silently wins); keep one colour spec per call instead.
plt.plot(fpr, tpr, color='darkorange', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'b--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
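# To trace the full ROC sweep, feed probability scores rather than hard
# labels (a sketch; assumes fcmodel exposes predict_proba, as forest
# classifiers do):
yc_scores = fcmodel.predict_proba(Xc_test)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(yc_test, yc_scores)
print('AUC from scores:', metrics.auc(fpr, tpr))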