def testing(net):
	global fileNames_path_valid
	global map_fname_label_valid
	global maps_list
	global get_temp
	predicted_y = []
	actual_y = []
	if get_temp:
		random.shuffle(fileNames_path_valid)
		maps_list = manager.list([])
		pool = multi.Pool(processes=4)
		pool.map(runner, fileNames_path_valid[:500])
		get_temp = False
	
	for maps, path in maps_list:
		result =  net.activate(np.ravel(np.array(maps)))
		label = map_label_int_label[map_fname_label_valid[path]]
		# if result[0] >= 0.5:
		# 	predicted_y.append(1)
		# else:
		# 	predicted_y.append(0)
		# print result
		# print result.argmax()
		predicted_y.append(result.argmax())
		actual_y.append(label)

	print accuracy_score(actual_y, predicted_y)
	print confusion_matrix(actual_y, predicted_y)
Example #2
    def run_ratio(self, dataset, set_size):
        '''
        Compare several competing methods while changing the ratio of the positive
        class in the dataset. We use a binary-class dataset for ease of
        interpretation.
        '''
        X_train_full, y_train_full, X_test, y_test = dataset
        X_train, y_train = self.get_sub_set_with_size([X_train_full, y_train_full], set_size)
        test_set_original = (X_test, y_test)

        large = ENMLT(LinearSVC)
        large.fit(X_train, y_train)

        simple = LinearSVC()
        simple.fit(X_train, y_train)

        for r in numpy.arange(0.05, 1.0, 0.05):
            # Generate a new test set with desired positive proportions.
            X_test_new, y_test_new = SetGen.with_pos_ratio(test_set_original, r, pos_label=1)

            y_pred = large.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc1 = self.accuracy(cm)

            y_pred = simple.predict(X_test_new)
            cm = confusion_matrix(y_test_new, y_pred)
            acc2 = self.accuracy(cm)

            print "%.2f, %f, %f" % (r, acc1, acc2)
Example #3
def do_xgboost(x,y):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    print "xgboost"
    xgb_model = xgb.XGBClassifier().fit(x_train, y_train)
    y_pred = xgb_model.predict(x_test)
    print(classification_report(y_test, y_pred))
    print metrics.confusion_matrix(y_test, y_pred)
Example #4
def printMeasuresOfEfficiency(yTest, y_pred):
    # I was having an issue where the confusion matrix would come out as [[100]] or [[40]]
    # instead of the expected 2x2 matrix.
    # After some investigation, I came to the conclusion that the test set only contained one class!
    # In that case there is no false positive and no false negative; only the one class is in the test set.
    if(len(confusion_matrix(yTest, y_pred)) == 1):
        print("Test set contains one class only. There is no false positive or false negative; Only the one class.")
        return


    tn, fp, fn, tp = confusion_matrix(yTest, y_pred).ravel()

    # Measures of efficiency
    # ppv: positive predictive value
    # npv: negative predictive value
    # sensitivity (recall): true positive rate
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    ppv = tp / (tp + fp)
    npv = tn / (tn + fn)

    print("\ttn: {}  fp: {}  fn: {}  tp: {}".format(tn, fp, fn, tp))
    print("\tspecificity: {}".format(specificity, sensitivity, ppv, npv))
    print("\tsensitivity: {}".format(specificity, sensitivity, ppv, npv))
    print("\tppv: {}".format(specificity, sensitivity, ppv, npv))
    print("\tnpv: {}".format(specificity, sensitivity, ppv, npv))
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print accuracy_score(y_true=ytest_raw,y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
def inspect():
    data = pd.read_csv(os.path.join(PATH, "obs.csv"), header=None, index_col=[0])
    features, labels = stat_calculator().get_stats_labels(data.values)

    f_train, f_test, l_train , l_test = train_test_split(features, labels, test_size=.5)

    clf = RandomForestClassifier(n_estimators=30).fit(f_train, l_train)

    out = clf.predict(f_test)

    out_p = clf.predict_proba(f_test)
    out_p = pd.DataFrame(out_p, columns=clf.classes_)

    # regular conf mat
    cm1 = confusion_matrix(l_test, out)

    # regular argmax confusion matrix (idxmax returns the label of the most probable class)
    predicted = out_p.apply(lambda s: s.idxmax(), axis=1)
    cm2 = confusion_matrix(l_test, predicted)

    # thresholded argmax confusion matrix
    predicted2 = out_p.apply(lambda s: s.idxmax() if s.max() > THETA else "zother", axis=1)
    cm3 = confusion_matrix(l_test, predicted2)

    print(cm1, cm2, cm3)
Example #7
    def print_prediction_results():
        results = []
        for c, Y_test in zip(classes, test_data):
            for y in Y_test:
                query = ma.masked_array(
                    np.array([tuple(y) + (0,)],
                             dtype=[('', bool)] * (D - 1) + [('', int)]),
                    mask=[(False,) * (D - 1) + (True,)])[0]
                samples = [
                    s.sample_post_pred(query, r)[1][0][-1] for _ in xrange(30)]
                samples = np.bincount(samples, minlength=len(classes))
                prediction = np.argmax(samples)
                results.append((classmap[c], prediction, samples))
            print 'finished predictions for class', c

        Y_actual = np.array([a for a, _, _ in results], dtype=np.int)
        Y_pred = np.array([b for _, b, _ in results], dtype=np.int)
        print 'accuracy:', accuracy_score(Y_actual, Y_pred)
        print 'confusion matrix:'
        print confusion_matrix(Y_actual, Y_pred)

        # AUROC for one vs all (each class)
        for i, clabel in enumerate(classes):
            Y_true = np.copy(Y_actual)

            # treat class c as the "positive" example
            positive_examples = Y_actual == i
            negative_examples = Y_actual != i
            Y_true[positive_examples] = 1
            Y_true[negative_examples] = 0
            Y_prob = np.array([float(c[i]) / c.sum() for _, _, c in results])
            cls_auc = roc_auc_score(Y_true, Y_prob)
            print 'class', clabel, 'auc=', cls_auc
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short = False):
  
    accuracy_train = metrics.accuracy_score(y_train, model.predict(X_train))
    accuracy_test = metrics.accuracy_score(y_test, model.predict(X_test))
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))
    
    if not short:
    
      # confusion matrix
      # rows: actual group
      # columns: predicted group
      print('Confusion_matrix (training data):')
      print(metrics.confusion_matrix(y_train, model.predict(X_train)))
      
      print('Confusion_matrix (test data):')
      print(metrics.confusion_matrix(y_test, model.predict(X_test)))

      # precision =  tp / (tp + fp)
      # recall = tp / (tp + fn) (= sensitivity)
      # F1 = 2 * (precision * recall) / (precision + recall)
      print('\nPrecision - recall (training data):')
      print(metrics.classification_report(y_train, model.predict(X_train)))
      
      print('\nPrecision - recall (test data):')
      print(metrics.classification_report(y_test, model.predict(X_test)))
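
# Worked example (added illustration) of the formulas quoted above, for a hypothetical
# binary confusion matrix with rows = actual class and columns = predicted class:
#
#     [[50 10]     tn = 50, fp = 10
#      [ 5 35]]    fn = 5,  tp = 35
#
# precision = 35 / (35 + 10) = 0.778
# recall    = 35 / (35 + 5)  = 0.875
# F1        = 2 * 0.778 * 0.875 / (0.778 + 0.875) = 0.824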
def main():
    # parameters to cross-validate over
    parameters = {
        'l2': np.logspace(-5, 0, num=6),
    }

    # load the digits data in, make a binary decision problem out of it
    data = load_digits()

    X = Array2Dict().fit_transform(data.data)
    y = 2 * (data.target >= 5) - 1

    i = int(0.8 * len(X))
    X_train, X_test = X[:i], X[i:]
    y_train, y_test = y[:i], y[i:]

    # do the actual learning
    gs = GridSearchCV(
        VW_Classifier(loss='logistic', moniker='example_sklearn',
                      passes=10, silent=True, learning_rate=10),
        param_grid=parameters,
        score_func=f1_score,
        cv=StratifiedKFold(y_train),
    ).fit(X_train, y_train)

    # print out results from cross-validation
    estimator = gs.best_estimator_
    score = gs.best_score_
    print 'Achieved a F1 score of %f using l2 == %f during cross-validation' % (score, estimator.l2)

    # print confusion matrix on test data
    y_est = estimator.fit(X_train, y_train).predict(X_test)
    print 'Confusion Matrix:'
    print confusion_matrix(y_test, y_est)
Example #10
def detect_anomalies():

    encoded_X_train = np.load("resources/files/encoded_X_train.npy")
    encoded_X_test = np.load("resources/files/encoded_X_test.npy")
    print(encoded_X_train.shape)
    print(encoded_X_test.shape)

    clf = svm.OneClassSVM(nu=0.1, kernel="linear")
    clf.fit(encoded_X_train)
    y_pred_train = clf.predict(encoded_X_train)
    y_pred_test = clf.predict(encoded_X_test)
    y_pred_outliers = clf.predict(np.full((100,hidden_dimensions[1]),4))

    # print y_pred_train[y_pred_train == -1].size
    # print y_pred_test[y_pred_test == -1].size
    # print y_pred_outliers[y_pred_outliers == -1].size

    # n_normal_points_test = X_test[y_pred_test == 1]
    # n_anomalies_test = X_test[y_pred_test == -1]
    # print(n_normal_points_test.shape)
    # print(n_anomalies_test.shape)

    print("Train Accuracy: %f"%(accuracy_score(Y_train, y_pred_train)))
    print("Test Accuracy: %f"%( accuracy_score(Y_test, y_pred_test)))
    print("Precision: %f" % (precision_score(Y_test, y_pred_test,pos_label=1)))
    #print("Recall: %f" % (precision_score(Y_test, y_pred_test, pos_label=-1)))
    print "Confusion Matrix: (Anomalies, Normal)"
    print confusion_matrix(Y_test,y_pred_test,labels=[-1,1])
    fpr, tpr, thresholds = metrics.roc_curve(Y_test, y_pred_test, pos_label=1)
    print "AUC: %f"%metrics.auc(fpr, tpr)
def test_digits() :
    
    from sklearn.cross_validation import train_test_split 
    from sklearn.datasets import load_digits
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    from sklearn.preprocessing import LabelBinarizer
    
    digits = load_digits()
    X = digits.data
    y = digits.target   #labels
    X /= X.max()        #norm

    nn = NeuralNetwork([64,100,10],'logistic')  #8x8 input, 10 output
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    labels_train = LabelBinarizer().fit_transform(y_train)  #convert no to vector
    labels_test = LabelBinarizer().fit_transform(y_test)

    nn.fit(X_train,labels_train,epochs=100)
    predictions = []
    for i in range(X_test.shape[0]) :
        o = nn.predict(X_test[i])
        predictions.append(np.argmax(o))
    print confusion_matrix(y_test,predictions)
    print classification_report(y_test,predictions)
    print 'accuracy at %0.3f'%accuracy_score(y_test,predictions)
Example #12
def modelfit(alg, train_data, train_label, cv_folds=5, early_stopping_rounds=1):

    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(train_data, label=train_label)
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics=['auc'],
                      early_stopping_rounds=early_stopping_rounds,
                      show_progress=True)
    alg.set_params(n_estimators=cvresult.shape[0])   # Goal of CV is to tune the number of rounds, which is set here

    # Note: can change to a different day to see what happens
    start = time.time()
    alg.fit(train_data,
            train_label,
            eval_metric='auc')
    print "Time to fit: %s" % (time.time()-start)

    pickle.dump(alg, open("/home/jche/Desktop/xgboost.p", "wb"))   # Save model (pickle needs a binary-mode file)

    start = time.time()
    dtrain_predprob = alg.predict_proba(train_data)[:,1]
    print "Time to predict: %s" % (time.time() - start)

    for cutoff in range(0, 41):
        cut = cutoff/float(100)   # Cutoff in decimal form
        dtrain_predictions = dtrain_predprob > cut   # Predict positive when the probability exceeds the cutoff
        # Print model report:
        print "\nModel Report for cutoff %s" % cut
        print "Accuracy : %.4g" % metrics.accuracy_score(train_label, dtrain_predictions)
        print "AUC Score (Train): %f" % metrics.roc_auc_score(train_label, dtrain_predprob)
        print "Recall is: %s" % metrics.recall_score(train_label, dtrain_predictions)
        print metrics.confusion_matrix(train_label, dtrain_predictions)
Example #13
def getBestK(X_train, y_train, X_val, y_val, nns=[30], print_train=True, print_val=True):
    acc_train = np.zeros((1, len(nns)))
    acc_val = np.zeros((1, len(nns)))
    for j in range(0, len(nns)):
        print j
        sys.stdout.flush()
        knn = KNNClassifier(nns[j])
        knn.train(X_train, y_train)
        # acc_train[0, j] = np.mean(knn.predict(X_train) == y_train)
        print acc_train[0, j]
        sys.stdout.flush()
        y_pred = knn.predict(X_val)
        acc_val[0, j] = np.mean(y_pred == y_val)
        print acc_val[0, j]
        sys.stdout.flush()
        print "Confusion matrix:"
        print confusion_matrix(y_val, y_pred)

    if print_train:
        print (acc_train)
    if print_val:
        print (acc_val)

    best_val = np.max(acc_val)
    best_rate, best_reg = np.where(acc_val == np.amax(acc_val))
    return (best_rate[0], best_reg[0]), knn
Example #14
def getScores(y, yPredTrain, yTest, yPredTest):

    scores = dict()

    scores['f1Train'] = f1_score(y, yPredTrain)
    scores['f1Test'] = f1_score(yTest, yPredTest)


    scores['accTrain'] = accuracy_score(y, yPredTrain)
    scores['accTest'] = accuracy_score(yTest, yPredTest)
    

    scores['rocTrain'] = roc_auc_score(y, yPredTrain)
    scores['rocTest'] = roc_auc_score(yTest, yPredTest)
    

    scores['cMatrixTrain'] = confusion_matrix(y, yPredTrain)
    scores['cMatrixTest'] = confusion_matrix(yTest, yPredTest)

    proba = float(len(np.where(y==1)[0]))/len(y)
    if proba < 0.50:
        proba = 1 - proba
    scores['random'] = proba
    
    return scores
def crossVal(positions, X, y, missedYFile):
    outF = open(missedYFile, 'w')
    posArray = np.array(positions)
    # Split into training and test
    sss = StratifiedShuffleSplit(y, 4, test_size=0.1, random_state=442)
    cvRound = 0
    for train_index, test_index in sss:
        clf = ExtraTreesClassifier(n_estimators=300,
                                   random_state=13,
                                   bootstrap=True,
                                   max_features=20,
                                   min_samples_split=1,
                                   max_depth=8,
                                   min_samples_leaf=13,
                                   n_jobs=4
                                   )
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        pos_test = posArray[test_index]

        clf = clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        print( metrics.confusion_matrix(y_test, preds) )
        print( metrics.classification_report(y_test, clf.predict(X_test)) )
        for loc,t,p in zip(pos_test, y_test, preds):
            if t=='0' and p=='1':
                print >> outF, loc + '\t' + str(cvRound)
        cvRound += 1
    outF.close()
Example #16
def testdata_stats():
    test_dataset = datasets.load_files(project_root+"/testdata",
                                       encoding='utf-8',
                                       decode_error='ignore')

    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)

    print "*****BAYESIAN STATS****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_nb == test_dataset.target))

    print(metrics.classification_report(test_dataset.target, predicted_nb,
    target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)

    print "*****SVM STATS*****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
    target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run normal SVM classification without cross-fold validation.
    '''

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 30% reserved for validation

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)])

    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)

    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
Example #18
def pipeline_summary(concept_id, pos_concept, neg_concept, pipeline, test_set, predicted, store=False):
    vectorizer = pipeline.named_steps['vectorizer']
    classifier = pipeline.named_steps['classifier']

    fnames = vectorizer.get_feature_names() 
    try:    
        selector = pipeline.named_steps['selector']
        indices = selector.get_support(True)                 
        selected_terms = [ fnames[i] for i in indices ]
    except KeyError:
        print 'Selector not used'
        selected_terms = fnames
    
    show_most_informative_features(selected_terms, classifier, n=25)
    
    print classification_report(test_set, predicted)
    print confusion_matrix(test_set, predicted)    
    
    if store:
        print 'Storing pipeline...'
        pickle.dump(pipeline, open(concept_pipeline(pos_concept, neg_concept), 'wb'))

        coefs = np.where( classifier.coef_ > 0)[1]
        concept_terms = [ selected_terms[i] for i in coefs ]
        term_weights = [ classifier.coef_[0,i] for i in coefs ]
        
        feature_weights = sorted(zip(term_weights, concept_terms), reverse=True)        
        
        print 'Storing concept terms'
        save_concept_terms(concept_id, feature_weights)
def fitMdl(nFitObs = 50):
    mdl = linear_model.LogisticRegression(verbose = 1)
    mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs,:,:],
                       (nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])),
            glbObsTrnRsp[0:nFitObs])
    print mdl.get_params()
    print mdl.coef_.shape
    print '  coeff stats:'
    for lblIx in xrange(len(dspLabels)):
        print '  label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
            (dspLabels[lblIx],
             mdl.coef_[lblIx,:].argmin() / glbImgSz, mdl.coef_[lblIx,:].argmin() % glbImgSz, mdl.coef_[lblIx,:].min(),
             mdl.coef_[lblIx,:].argmax() / glbImgSz, mdl.coef_[lblIx,:].argmax() % glbImgSz, mdl.coef_[lblIx,:].max())

    train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs,:,:],
                                               (nFitObs, glbImgSz ** 2)))
    accuracy_train = metrics.accuracy_score(train_pred_labels, glbObsTrnRsp[0:nFitObs])
    print '  accuracy train:%0.4f' % (accuracy_train)
    print metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels)

    valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr,
                                               (glbObsVldFtr.shape[0], glbImgSz ** 2)))
    accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
    print '  accuracy valid:%0.4f' % (accuracy_valid)
    print metrics.confusion_matrix(glbObsVldRsp, valid_pred_labels)

    test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr,
                                              (glbObsNewFtr.shape[0], glbImgSz ** 2)))
    accuracy_test = metrics.accuracy_score( test_pred_labels,  glbObsNewRsp)
    print '  accuracy  test:%0.4f' % (accuracy_test)
    test_conf = pd.DataFrame(metrics.confusion_matrix(glbObsNewRsp, test_pred_labels),
                             index=dspLabels, columns=dspLabels)
    print test_conf
    
    return(mdl, (accuracy_train, accuracy_valid, accuracy_test))
Example #20
def train_and_evaluate(X, y, clf):
    from sklearn.cross_validation import train_test_split
    train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7, random_state=1)
    clf.fit(train_X, train_y)
    pre = clf.predict(test_X)
    print metrics.classification_report(test_y, pre)
    print metrics.confusion_matrix(test_y, pre)
Example #21
    def report(self):
        from sklearn.metrics import roc_auc_score
        from sklearn.metrics import classification_report
        from sklearn.metrics import confusion_matrix

        y_pred_probas, y_true = self.make_predictions()[:2]
        y_pred = y_pred_probas.argmax(1)
        y_pred_probas = y_pred_probas[:, 1]
        y_true = y_true.reshape(-1)

        try:
            score = roc_auc_score(y_true, y_pred_probas)
        except ValueError:
            pass
        else:
            print
            print "AUC score:", score
            print "AUC score (binary):", roc_auc_score(y_true, y_pred)
            print

        print "Classification report:"
        print classification_report(y_true, y_pred)
        print

        print "Confusion matrix:"
        print confusion_matrix(y_true, y_pred)
        print
Example #22
def a_b_classify_pca((f_train, t_train, f_test, t_test, n_components)):
    '''
    Uses an SVM to classify A and B sections based on the feature vectors
    built above, and returns some statistical results
    '''
    print '{}: Starting PCA with {} components (this could take a while...)'.format(time.ctime(), n_components)
    pca = PCA(n_components = n_components)
    pca.fit(f_train)
    f_train_pca = list(pca.transform(f_train))
    f_test_pca = list(pca.transform(f_test))

    print '{0}: Training the SVM'.format(time.ctime())
    clf = svm.SVC()
    clf.fit(f_train_pca, t_train)

    print '{0}: Classifying using SVM'.format(time.ctime())
    t_predict = clf.predict(f_test_pca)
    t_train_predict = clf.predict(f_train_pca)
    
    print 'Confusion matrix is built so that C_ij is the number of observations known to be in group i but predicted to be in group j. In this case, group 0 corresponds to A sections and group 1 corresponds to B sections.'
    
    print 'Confusion matrix on test data:'
    test_confuse = confusion_matrix(t_test, t_predict)
    print test_confuse

    print 'Confusion matrix on training data:'
    train_confuse = confusion_matrix(t_train, t_train_predict)
    print train_confuse
    return train_confuse, test_confuse
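
# Tiny illustration (added, not in the original) of the convention described above:
# C[i, j] counts observations whose true class is i and whose predicted class is j.
#
#     >>> confusion_matrix([0, 0, 1, 1, 1], [0, 1, 1, 1, 0])
#     array([[1, 1],
#            [1, 2]])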
Example #23
def confusion_matrices(testX, testY):
    pred = [card.predict_number() for card in testX]
    labels = ["one", "two", "three"]
    cm = confusion_matrix(testY[:,NUMBER], pred, labels=labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Number Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()

    pred = [card.predict_shading() for card in testX]
    labels = ["empty", "striped", "solid"]
    cm = confusion_matrix(testY[:,SHADING], pred, labels=labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Shading Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()

    pred = [card.predict_shape() for card in testX]
    labels = ["rounded-rectangle", "squiggle", "diamond"]
    cm = confusion_matrix(testY[:,SHAPE], pred, labels=labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Shape Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()

    pred = [card.predict_color() for card in testX]
    labels = ["red", "green", "purple"]
    cm = confusion_matrix(testY[:,COLOR], pred, labels=labels)
    print(cm)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    pl.title('Color Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    pl.xlabel('Predicted')
    pl.ylabel('True')
    pl.show()
def measure_performance(X,y,clf, show_accuracy=True,
                        show_classification_report=True,
                        show_confusion_matrix=True):
    """
    多指标来评估模型
    :param X: 测试集
    :param y: 真实结果
    :param clf: 模型
    :param show_accuracy: 显示正确率
    :param show_classification_report: 显示分类报告
    :param show_confusion_matrix:
    :return:
    """
    y_pred = clf.predict(X)
    if show_accuracy:
        print "Accuracy:{0:.4f}".format(metrics.accuracy_score(y,y_pred)), "\n"

    if show_classification_report:
        print "模型分类报告:"
        print metrics.classification_report(y,y_pred,labels=[0,1],target_names=['良性网址','恶意网址']), "\n"

    if show_confusion_matrix:
        print "混淆矩阵报告:"
        print metrics.confusion_matrix(y,y_pred),"\n"
    return metrics.confusion_matrix(y,y_pred)
def benchmark(clf):
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    print

    score = 1 - metrics.f1_score(y_test, pred)
    print "error:   %0.3f" % score

    if hasattr(clf, 'coef_'):
        print "top 10 keywords per class:"
        for i, category in enumerate(categories):
            top10 = np.argsort(clf.coef_[i])[-10:]
            print "%s: %s" % (category, " ".join(feature_names[top10]))
        print

    print metrics.classification_report(y_test, pred,
                                        target_names=categories)

    print "confusion matrix:"
    print metrics.confusion_matrix(y_test, pred)

    print
    clf_descr = str(clf).split('(')[0]
    return clf_descr, score
Example #26
def select_classifier(algo, label):
    model = algo
    model.fit(training_features, training_labels)
    expected = testing_labels
    predicted = model.predict(testing_features)

    print("----------------------------------------------------")
    print("|               Classification Report              |")
    print("----------------------------------------------------")
    print(metrics.classification_report(expected, predicted))
    print("")

    print("----------------------------------------------------")
    print("|                  Confusion Matrix                |")
    print("----------------------------------------------------")
    print(metrics.confusion_matrix(expected, predicted))
    print("")

    cm_list = metrics.confusion_matrix(expected, predicted).tolist()
    list_total = float(sum(sum(x) for x in cm_list))

    print("----------------------------------------------------")
    print("|           False Positives and Negatives          |")
    print("----------------------------------------------------")
    print "False Positive: ", cm_list[1][0] / list_total
    print("")
    print "False Negative: ", cm_list[0][1] / list_total
    print("")

    plt.figure()
    plot_confusion_matrix(metrics.confusion_matrix(expected, predicted), label)
    plt.show()
def train_and_evaluate(clf, X_train, X_test, y_train, y_test, y_name):

    # Training
    clf.fit(X_train, y_train)
    # Prediction of testing sets
    y_pred = clf.predict(X_test)

    # Precision, recall and support (i.e. nr. of samples used for the testing)
    print "Classification Report:"
    print metrics.classification_report(y_test, y_pred)
    # Confusion Matrix
    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_pred)

    # Visualization of Categories / Assigned / Data
    print "Tested data => assigned category,    data:"
    for i in range(len(X_test)):
        print str(i) + ")   Real category: " + str(y_name[y_test[i]]) + ",    Assigned category: " + \
            str(y_name[y_pred[i]]) + ",     Data: " + str(X_test[i])

    # Assign names to the categories (defined by numbers)
    print "\n Categories: \n"
    categories = set()
    for cat in y_pred:
        categories.add(cat)
    categories = sorted(categories)
    for cat in categories:
        print str(cat) + "    " + y_name[cat]
def gaussian_1d_2classes(x,y):	
	regr = GaussianClassification1D()
	cv = KFold(len(x), n_folds=10)
	for train_idx, test_idx in cv:

		x_train = x[train_idx]
		x_test = x[test_idx]
		y_train = y[train_idx]
		y_test = y[test_idx]

		labels = mapping_labels(np.unique(y_train))

		# Training
		regr.fit(x_train,y_train,labels)

		# Predict over the training data and getting the error
		predicted_y_training = regr.predict(x_train, labels)
		conf_matrix = confusion_matrix(y_train, predicted_y_training)
		precision = calculate_precision(conf_matrix)
		recall = calculate_recall(conf_matrix)
		accuracy = calculate_accuracy(conf_matrix)
		fmeasure = calculate_fmeasure(precision,recall)

		# Predict over the testing data and getting the error
		predicted_y_testing = regr.predict(x_test, labels)
		conf_matrix = confusion_matrix(y_test, predicted_y_testing)
		precision = calculate_precision(conf_matrix)
		recall = calculate_recall(conf_matrix)
		accuracy = calculate_accuracy(conf_matrix)
		fmeasure = calculate_fmeasure(precision,recall)

		print 'Precision:',precision, ' Recall:',recall, ' Accuracy:',accuracy,' F-Measure:',fmeasure
    def learnCART(self):
        train_input_data = self.loadData(self.train_file)
        target = [x[1] for x in train_input_data]
        target = target[1:]
        features = [x[2:] for x in train_input_data]
        features = features[1:]
        # feature selection
        #features_new = self.doFeatureSelection(features,target)
        model = self.classify(features,target)

        test_input_data = self.loadData(self.test_file)
        actualOutput = [x[1] for x in test_input_data]
        actualOutput = actualOutput[1:]
        features = [x[2:] for x in test_input_data]
        features = features[1:]

        predictedOutput = model.predict(features)
        #print predictedOutput
        #print actualOutput
        self.computeAccuracy(predictedOutput,actualOutput)
        print "Precision recall Fscore support metrics for CART "
        print precision_recall_fscore_support(actualOutput,predictedOutput)
        print "\nconfusion matrix\n"
        print confusion_matrix(actualOutput,predictedOutput)
        self.printDTRules(model)
        X= []
        Y=[]
        for a in predictedOutput:
            X.append(int(a))
        for a in actualOutput:
            Y.append(int(a))
        self.plotROC(Y,X)
        result = zip(Y,X)
        self.write_To_File(result,"cart-predictions.csv")
def forward(x_data, y_data, print_conf_matrix=False):
    '''
    Neural net forward pass.
    :param x_data:
    :param y_data:
    :param print_conf_matrix:
    :return:
    '''
    x, t = Variable(x_data), Variable(y_data)

    h1 = F.relu(model.l1(x))
    h1 = F.max_pooling_2d(h1,max_pool_window_1,stride=max_pool_stride_1)

    h2 = F.dropout(F.relu(model.l2(h1)))
    h2 = F.average_pooling_2d(h2, avg_pool_window_2, stride=avg_pool_stride_2)
    h2 = F.max_pooling_2d(h2,max_pool_window_2,stride=max_pool_stride_2)

    y = model.l3(h2)

    # display confusion matrix
    if print_conf_matrix:
        pdb.set_trace()
        print confusion_matrix(cuda.to_cpu(t.data), cuda.to_cpu(y.data).argmax(axis=1))

    return F.softmax_cross_entropy(y, t), F.accuracy(y, t)
Example #31
            auc1[label] = auc(fpr[label], tpr[label])

            plt.plot(tpr[label], fpr[label],
                     label='%s tagger, AUC = %.1f%%' % (label.replace('j_', ''), auc1[label] * 100.))
        plt.semilogy()
        plt.xlabel("Signal Efficiency")
        plt.ylabel("Background Efficiency")
        plt.ylim(0.001, 1)
        plt.grid(True)
        plt.legend(loc='upper left')
        plt.figtext(0.25, 0.90, '(Unpruned)', fontweight='bold', wrap=True, horizontalalignment='right', fontsize=14)
        plt.savefig(options.outputDir + 'ROC_' + str(time) + '.png')

    # Confusion matrix
    conf_mat = confusion_matrix(lbllist.numpy(), predlist.numpy())
    df_cm = pd.DataFrame(conf_mat, index=[i for i in full_dataset.labels_list],
                         columns=[i for i in full_dataset.labels_list])
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True,fmt='g')
    plt.savefig(options.outputDir + 'confMatrix_' + str(time) + '.png')
    plt.show()
    print(conf_mat)
    class_accuracy = 100 * conf_mat.diagonal() / conf_mat.sum(1)
    print(class_accuracy)

    torch.save(current_model.state_dict(), options.outputDir + 'JetClassifyModel_' + str(time) + '.pt')
    os.makedirs(options.outputDir+'weight_dists/',exist_ok=True)
    plot_weights.plot_kernels(current_model, text=" (Locally Pruned)",
                              output=options.outputDir+'weight_dists/'+'weight_dist_' + str(time) + '.png')
Example #32
plt.plot(hist.history['val_loss'], label='val')
plt.title('CNN_In_8_Steps :  Loss  &  Validation Loss')
plt.legend()
plt.show()

plt.plot(hist.history['accuracy'], label='train')
plt.plot(hist.history['val_accuracy'], label='val')
plt.title('CNN_In_8_Steps :  Accuracy  &  Validation Accuracy')
plt.legend()
plt.show()

# Confusion Matrix  & Pres  & Recall   & F1-Score

target_names = ['Abnormal', 'Normal']
label_names = [0, 1]

Y_pred = model.predict_generator(testdata)
y_pred = np.argmax(Y_pred, axis=1)

cm = confusion_matrix(testdata.classes, y_pred, labels=label_names)

print('Confusion Matrix')
print(confusion_matrix(testdata.classes, y_pred))

print('classification_Report')
print(
    classification_report(testdata.classes, y_pred, target_names=target_names))

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_names)
disp = disp.plot(cmap=plt.cm.Blues, values_format='g')
plt.show()
Example #33
train_sentiments = (train_sentiments.replace({
    'positive': 1,
    'negative': 0
})).values
test_sentiments = (test_sentiments.replace({
    'positive': 1,
    'negative': 0
})).values

corpus_train = CleanUpData(train_reviews)
corpus_test = CleanUpData(test_reviews)
#corpus_train = CleanUpData(train)
#corpus_test = CleanUpData(test)

count_vec = CountVectorizer(ngram_range=(1, 3), binary=False)
count_vec_train = count_vec.fit_transform(corpus_train)
count_vec_test = count_vec.transform(corpus_test)

linear_svc_count = LinearSVC(C=0.5, random_state=42, max_iter=5000)
linear_svc_count.fit(count_vec_train, train_sentiments)

predict_count = linear_svc_count.predict(count_vec_test)

print(
    "Classification Report: \n",
    classification_report(test_sentiments,
                          predict_count,
                          target_names=['Negative', 'Positive']))
print("Confusion Matrix: \n", confusion_matrix(test_sentiments, predict_count))
print("Accuracy: \n", accuracy_score(test_sentiments, predict_count))
#################################################
#=================Classification================#
# Perform a classification MLP on the Taxonomy  #
# data. It has a categorical target.            #
#################################################

#==========================
# Use a logistic function
#==========================
nnclass1 = MLPClassifier(activation='logistic', solver='sgd', 
                         hidden_layer_sizes=(100,100))
nnclass1.fit(taxon_data_train, taxon_train)

nnclass1_pred = nnclass1.predict(taxon_data_test)

cm = metrics.confusion_matrix(taxon_test, nnclass1_pred)
print(cm)

plt.matshow(cm)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Value')
plt.ylabel('Actual Value')
plt.xticks([0,1,2,3], ['I','II','III','IV'])

print(metrics.classification_report(taxon_test, nnclass1_pred))

#=====================================
# Use rectified linear unit function
#=====================================
nnclass2 = MLPClassifier(activation='relu', solver='sgd',
                         hidden_layer_sizes=(100,100))
Example #35
    epochs=5,
    #steps_per_epoch=steps_per_epoch,
    validation_data=test_it,
    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)],
    verbose=1,
    workers=1,
    validation_steps=40)

# evaluate model
#loss = model.evaluate_generator(test_it, steps=24)

# Confusion Matrix and Classification Report
Y_pred = model.predict_generator(test_it, 5480 // batch_size + 1)
y_pred = np.argmax(Y_pred, axis=1)
print('Confusion Matrix')
tn, fp, fn, tp = confusion_matrix(test_it.classes, y_pred).ravel()
print(tp)
print(tn)
print(fp)
print(fn)
specificity = tn / (tn + fp)
print(specificity)
print('Classification Report')
target_names = ['Negative', 'Positive']
print(classification_report(test_it.classes, y_pred,
                            target_names=target_names))

# ROC
#from sklearn.metrics import roc_curve
#y_pred_keras = model.predict(test_it)
#fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_it, y_pred_keras)
from sklearn.linear_model import LogisticRegression
#creating local variable classifier
classifier = LogisticRegression()
#Training the model
classifier.fit(X_train,y_train)

#predicting the value of Y
y_pred = classifier.predict(X_test)

#importing metrics for evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# summary of the model prediction
print(classification_report(y_test,y_pred))
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred))

#accuracy score of the model
from sklearn.metrics import accuracy_score
print('accuracy score :', accuracy_score(y_test, y_pred))

"""### **K-Nearest Neighbour**"""

#K-Nearest Neighbour
#importing the library
from sklearn.neighbors import KNeighborsClassifier
#creating local variable classifier
classifier = KNeighborsClassifier(n_neighbors=8)
#Training the model
classifier.fit(X_train,y_train)
Classifier = LogisticRegression(random_state=0)
Classifier.fit(X_train, y_train)

# Predicting the Test Set Results
# y_pred --> Vector of predictions
y_pred = Classifier.predict(X_test)

# Making the Confusion Matrix
# Confusion Matrix -> to see whether our Logistic Regression made correct predictions or not
# This confusion matrix will contain the correct predictions made on the Test Set as well as the incorrect predictions
# For this we are importing a function and not a class
# Distinction --> class names start with a capital letter, function names do not
# Parameters of confusion_matrix -> (1) y_true = the real values from the data set, (2) y_pred = the predicted values
# 65, 24 = 89 -> Correct Predictions; 8, 3 = 11 -> Incorrect Predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
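
# Added note: with scikit-learn's layout (rows = true class, columns = predicted class),
# the four binary counts can also be unpacked directly:
#
#     tn, fp, fn, tp = cm.ravel()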

# Visualizing the Training Set Results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
              stop=X_set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_set[:, 1].min() - 1,
              stop=X_set[:, 1].max() + 1,
              step=0.01))
plt.contourf(X1,
             X2,
             Classifier.predict(np.array([X1.ravel(),
                                          X2.ravel()]).T).reshape(X1.shape),
data = pd.read_csv("lending_club_data01.csv.txt")
#print(data.head())
#print(data.tail())
data["good_loans"] = data["bad_loans"].apply(lambda y: 'yes'
                                             if y == 0 else 'no')
print(data.head())
x = data.drop(['bad_loans', 'good_loans'], axis=1)
y = data['good_loans']
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=124)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

predication = model.predict(x_test)
print(predication)
print(confusion_matrix(y_test, predication))
print(classification_report(y_test, predication))

# By using random forest classifier

rf_model = RandomForestClassifier(n_estimators=150)
rf_model.fit(x_train, y_train)

rf_predication = rf_model.predict(x_test)
print(rf_predication)
print(confusion_matrix(y_test, rf_predication))
print(classification_report(y_test, rf_predication))
# Instructions
# Import the metrics module from sklearn and MultinomialNB from sklearn.naive_bayes.
# Instantiate a MultinomialNB classifier called nb_classifier.
# Fit the classifier to the training data.
# Compute the predicted tags for the test data.
# Calculate and print the accuracy score of the classifier.
# Compute the confusion matrix. To make it easier to read, specify the keyword argument labels=['FAKE', 'REAL'].

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=['FAKE', 'REAL'])
print(cm)
X_test_all = df_test_ml

# scaled
X_train_all_sc = df_train_ml_sc
y_train_all_sc = df_train_ml['Survived']
X_test_all_sc = df_test_ml_sc
X_test_all.fillna(X_test_all.mean(), inplace=True)
print("*")

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
pred_logreg = logreg.predict(X_test)
print(confusion_matrix(y_test, pred_logreg))
print(classification_report(y_test, pred_logreg))
print(accuracy_score(y_test, pred_logreg))
logreg.fit(X_train_all, y_train_all)
pred_all_logreg = logreg.predict(X_test_all)
sub_logreg = pd.DataFrame()
sub_logreg['PassengerId'] = df_test['PassengerId']
sub_logreg['Survived'] = pred_all_logreg
#sub_logmodel.to_csv('logmodel.csv',index=False)
from sklearn.naive_bayes import GaussianNB
gnb=GaussianNB()
gnb.fit(X_train,y_train)
pred_gnb = gnb.predict(X_test)
print(confusion_matrix(y_test, pred_gnb))
print(classification_report(y_test, pred_gnb))
print(accuracy_score(y_test, pred_gnb))
def LSTM_model_train(train_data, epochs, test_data, name, jump_per):
    
    def f1(y_true, y_pred):
        y_pred = K.round(y_pred)
        tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
        fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
        fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

        p = tp / (tp + fp + K.epsilon())
        r = tp / (tp + fn + K.epsilon())

        f1 = 2*p*r / (p+r+K.epsilon())
        f1 = tf.where(tf.is_nan(f1), tf.zeros_like(f1), f1)
        return K.mean(f1)

    def precision(y_true, y_pred):  
        """Precision metric.    
        Only computes a batch-wise average of precision. Computes the precision, a
        metric for multi-label classification of how many selected items are
        relevant.
        """ 
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))  
        precision = true_positives / (predicted_positives + K.epsilon())    
        return precision

    def recall(y_true, y_pred): 
        """Recall metric.   
        Only computes a batch-wise average of recall. Computes the recall, a metric
        for multi-label classification of how many relevant items are selected. 
        """ 
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))   
        recall = true_positives / (possible_positives + K.epsilon())    
        return recall

    def f1_score_own(y_true, y_pred):
        """Computes the F1 Score
        Only computes a batch-wise average of recall. Computes the recall, a metric
        for multi-label classification of how many relevant items are selected. 
        """
        p = precision(y_true, y_pred)
        r = recall(y_true, y_pred)
        return (2 * p * r) / (p + r + K.epsilon())
    
    def matthews_correlation(y_true, y_pred):
        y_pred_pos = K.round(K.clip(y_pred, 0, 1))
        y_pred_neg = 1 - y_pred_pos

        y_pos = K.round(K.clip(y_true, 0, 1))
        y_neg = 1 - y_pos

        tp = K.sum(y_pos * y_pred_pos)
        tn = K.sum(y_neg * y_pred_neg)

        fp = K.sum(y_neg * y_pred_pos)
        fn = K.sum(y_pos * y_pred_neg)

        numerator = (tp * tn - fp * fn)
        denominator = K.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

        return numerator / (denominator + K.epsilon())
    
    
    # Get the values from the data #
    
    # train data and test data remove NA #
    train_data = train_data.dropna(axis = 0)
    test_data = test_data.dropna(axis = 0)
    
    train_data.loc[train_data['jump_pred'] == -1, 'jump_pred'] = 1
    test_data.loc[test_data['jump_pred'] == -1, 'jump_pred'] = 1
    
    x_train = train_data.drop(columns = ['jump_pred', 'utcsec', 'sec'])
    y_train = train_data['jump_pred']
    
    
    smt_dict_train = {0: len(y_train[y_train == 0]), 
                      1: int(np.ceil(len(y_train[y_train == 0]) * jump_per))}
    
    smt_train = SMOTE(sampling_strategy = smt_dict_train)
    
    x_train, y_train = smt_train.fit_sample(x_train, y_train)
    
    #x_train = x_train.values
    #y_train = pd.get_dummies(y_train)
    #y_train = y_train.values
    
    # Test data #
    x_test = test_data.drop(columns = ['jump_pred', 'utcsec', 'sec'])
    y_test = test_data['jump_pred']
    
    y_test_out = y_test
    
    #print('Distribution of jumps', pd.DataFrame(y_test_out)[0].value_counts())
    
    #x_test = x_test.values
    #y_test = pd.get_dummies(y_test)
    #y_test = y_test.values
    
    # LSTM #
    lstm_output_size = 40
    
    # Training #
    batch_size = 248
    print("Batch size:", batch_size)
    
    # Scale the values #
    min_max_scaler = preprocessing.StandardScaler()
    
    #x_train = x_train.values
    x_train = min_max_scaler.fit_transform(x_train)
    #x_train = pd.DataFrame(x_train_scaled)
    
    #x_test = x_test.values
    x_test = min_max_scaler.transform(x_test)
    #x_test = pd.DataFrame(x_test_scaled)
    
    # Print shapes before reshaping #
    #print('------------------------------')
    #print('Shapes before reshaping')
    #print('x_train shape:', x_train.shape)
    #print('x_test shape:', x_test.shape)
    #print('y_train shape:', y_train.shape)
    #print('y_test shape:', y_test.shape)
    
    # Reshape to LSTM training format #
    (x_train, y_train) = x_train.reshape(np.shape(x_train)[0], np.shape(x_train)[1], -1), y_train.reshape(np.shape(y_train)[0],1)
    (x_test, y_test) = x_test.reshape(np.shape(x_test)[0], np.shape(x_test)[1], -1), y_test.reshape(np.shape(y_test)[0],1)
    
    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("y_train shape:", y_train.shape)
    print("y_test shape:", y_test.shape)
    
    print("Build model...")
    
    seq_length = x_train.shape[1]
    input_dims = x_train.shape[2]
    
    inputs = Input(shape = (seq_length, input_dims))
    dense_att = Dense(input_dims, activation='relu', kernel_regularizer=regularizers.l2(0.01), name='dense_att')(inputs)
    attention_probs = Dense(input_dims, activation='sigmoid', name='attention_probs')(dense_att)
    attention_mul = multiply([dense_att, attention_probs], name='attention_mul')
    conv_1d = Conv1D(filters = 16, kernel_size = 4,
                     name = 'conv_1d')(attention_mul)
    max_pool_1 = MaxPooling1D(pool_size = 2, name = 'max_pool_1')(conv_1d)
    conv_1d_2 = Conv1D(filters = 32, kernel_size = 3, name = 'conv_1d_2')(max_pool_1)
    conv_1d_3 = Conv1D(filters = 32, kernel_size = 3, name = 'conv_1d_3')(conv_1d_2)
    max_pool_2 = MaxPooling1D(pool_size = 2, name = 'max_pool_2')(conv_1d_3)
    lstm = LSTM(40, return_sequences = False, recurrent_dropout = 0.25, name = 'lstm')(max_pool_2)
    dense_1 = Dense(40, activation = 'relu')(lstm)
    dense_out = Dense(1, activation = 'sigmoid', name = 'dense_out')(dense_1)
    
    model = Model(inputs=[inputs], outputs=dense_out)
    
    model.compile(loss = 'binary_crossentropy',
                  optimizer = 'adam', 
                  metrics = [f1_score_own, precision, recall])
    
    fileName = './models/weights_best_'+name+'.hdf5'
    checkpointer = ModelCheckpoint(filepath = fileName, monitor = 'val_f1_score_own', 
                                   verbose = 1, save_best_only = True, 
                                   save_weights_only = False, mode = 'max', period = 1)
    
    print('Train the model...')
    # Early stopping #
    es = EarlyStopping(monitor='val_f1_score_own', mode='max', verbose=1, patience = 5)
    
    model.fit(x_train, y_train, validation_split = 0.2, batch_size = batch_size, epochs = epochs, 
              validation_data = (x_train, y_train),
              verbose = 1, callbacks = [checkpointer, es], use_multiprocessing = True)
    
    # Maybe turn off multiprocessing #
    
    model.load_weights(fileName)
    
    loss, f1, precision, recall = model.evaluate(x_test, y_test, verbose = 1)
    #loss, cat_loss = model.evaluate(x_test, y_test, verbose = 1)
    #loss, tp, fp, tn, fn, ba, precision, recall, auc, recall_two = model.evaluate(x_test, y_test, verbose = 1)
    
    y_pred = model.predict(x_test)
    
    #"""
    f1_calc = f1_score(y_test_out, np.round(y_pred))
    precision_calc = precision_score(y_test_out, np.round(y_pred))
    recall_calc = recall_score(y_test_out, np.round(y_pred))
    cohens_kappa_calc = cohen_kappa_score(y_test_out, np.round(y_pred))
    mcc_calc = matthews_corrcoef(y_test_out, np.round(y_pred))
    
    print('------------------------------')
    #print('Test F1 def func:', f1)
    #print('Test prec def func:', precision)
    print('Test recall def func:', recall)
    #print('Test def MCC:', mcc)
    #print('Test recall def two func:', recall_two)
    
    print('Test F1 score:', f1_calc)
    print('Test precision:', precision_calc)
    print('Test recall:', recall_calc)
    print('Test Cohens kappa:', cohens_kappa_calc)
    print('Test MCC:', mcc_calc)
    print('Test loss:', loss)
    
    out_dict = {}
    out_dict['loss'] = loss
    out_dict['f1'] = f1_calc
    out_dict['precision'] = precision_calc
    out_dict['recall'] = recall_calc
    out_dict['y_pred'] = y_pred
    out_dict['y_test'] = y_test_out
    out_dict['con_mat'] = confusion_matrix(y_test_out, np.round(y_pred))
    out_dict['cohens_kappa'] = cohens_kappa_calc
    out_dict['mcc'] = mcc_calc
    return out_dict
Example #42
def plot_confusion_matrix(step, y_true, y_pred, output_size):
    # check result directory
    result_dir = 'result'
    check_existing_dir(result_dir)

    print('plot confusion matrix start: ', end='')

    # preprocessing
    y_true = [x for x in y_true if x != -1]
    y_pred = [x for x in y_pred if x != -1]

    # compute confusion matrix
    cnf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred)

    # configuration
    np.set_printoptions(precision=2)

    if output_size == 2:
        labels = ['benign', 'malware']
    else:
        # toy dataset label
        # labels = ['Virus', 'Worm', 'Trojan', 'not-a-virus:Downloader', 'Trojan-Ransom', 'Backdoor']
        labels = list(range(output_size))
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=90)
    plt.yticks(tick_marks, labels)

    norm_flag = True
    plot_title = 'Confusion matrix'
    cmap = plt.cm.Blues

    if norm_flag:
        cnf_matrix = cnf_matrix.astype('float') / cnf_matrix.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    for row in cnf_matrix:
        for val in row:
            print('{0:.2f}'.format(val), end=' ')
        print()

    # plotting start
    plt.figure()
    plt.imshow(cnf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(plot_title)
    plt.colorbar()

    # information about each block's value
    fmt = '.3f' if norm_flag else 'd'
    thresh = cnf_matrix.max() / 2.
    for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
        plt.text(j, i, format(cnf_matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cnf_matrix[i, j] > thresh else "black")

    # insert legend information
    # import matplotlib.patches as mpatches
    # patches = [mpatches.Patch(color='white', label='G{num} = {group}'.format(num=i+1, group=labels[i])) for i in range(len(labels))]
    # plt.legend(handles=patches, bbox_to_anchor=(-0.60, 1), loc=2, borderaxespad=0.)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # plt.show()
    plt.savefig(os.path.join(result_dir, 'conf_matrix{}'.format(step)))
    print('--plot confusion matrix finish--')
    pass
Example #43
    extension = path.split(".")[-1]

    if extension == 'csv':
        result_df = pd.read_csv(path, header=[0, 1])
    elif extension == 'tsv':
        result_df = pd.read_table(path, header=[0, 1])

    for dataset in test_names:
        print(result_df.get(dataset) is None, result_df.get('gt') is None)
        if result_df.get(dataset) is not None:
            gt_column = result_df[dataset, "label"]
            pred_column = result_df[dataset, "predicted"]
        else:
            gt_column = result_df["gt"]
            pred_column = result_df["pred"]
        tn, fp, fn, tp = confusion_matrix(gt_column.dropna(), round_by_threshold(pred_column.dropna(), th)).ravel()
        print("Evaluation of the %s set " % dataset)
        sen = float(tp) / (fn + tp)
        pre = float(tp) / (tp + fp)
        spe = float(tn) / (tn + fp)
        acc = float(tn + tp) / (tn + fp + fn + tp)
        f1 = (2 * sen * pre) / (sen + pre)
        print("\tSen : ", sen)
        print("\tSpe : ", spe)
        print("\tAcc : ", acc)
        print("\tPrecision : ", pre)
        print("\tF1 : ", f1)
        result_dic = {"Acc": acc, "Sen": sen, "Pre": pre, "F1": f1, "Spe": spe}
        if args.no_threshold:
            fpr, tpr, thresholds_AUC = roc_curve(gt_column, pred_column)
            AUC = auc(fpr, tpr)
Example #44
############### fit frequency based word embeddings into our data set to turn text into wordvectors

vectorizer = TfidfVectorizer(lowercase=True, stop_words=STOPWORDS)
vectorizer.fit(x_train)
x_train_vect = vectorizer.transform(x_train)
x_test_vect = vectorizer.transform(x_test)

############# Build our classifier with Linear Support vector machine

model = SVC(C=1, kernel='linear', class_weight='balanced')
model.fit(x_train_vect, y_train)

y_pred = model.predict(x_test_vect)

cm = confusion_matrix(y_test, y_pred)  ########## confusion matrix for test set

pipeline = make_pipeline(
    vectorizer,
    model)  #### save our model with pipeline function for future analysis


def predict(text):

    score = pipeline.predict([clean_text(text)])

    if score == 0:
        topic = 'real news'
    elif score == 1:
        topic = 'General spam'
    elif score == 2:
Example #45
0
bayes_clf.fit(x_train, y_train)
""" Predict the test dataset using Naive Bayes"""
predicted = bayes_clf.predict(x_test)
print('Naive Bayes correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
# print F1 score, precision, recall and the other classification metrics
print(metrics.classification_report(y_test, predicted, target_names=categories))

""" Support Vector Machine (SVM) classifier"""
svm_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
                    ])
svm_clf.fit(x_train, y_train)
predicted = svm_clf.predict(x_test)
print('SVM correct prediction: {:4.4f}'.format(np.mean(predicted == y_test)))
print(metrics.classification_report(y_test, predicted, target_names=categories))
# print the confusion matrix
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test, predicted))
print('\n')

""" 10-折交叉验证 """
clf_b = make_pipeline(CountVectorizer(), TfidfTransformer(), MultinomialNB())
clf_s = make_pipeline(CountVectorizer(), TfidfTransformer(),
                      SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42))

bayes_10_fold = cross_val_score(clf_b, x_text, y, cv=10)
svm_10_fold = cross_val_score(clf_s, x_text, y, cv=10)

print('Naive Bayes 10-fold correct prediction: {:4.4f}'.format(np.mean(bayes_10_fold)))
print('SVM 10-fold correct prediction: {:4.4f}'.format(np.mean(svm_10_fold)))
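cross_val_score above only reports per-fold accuracies; if a pooled confusion matrix over the ten folds is also wanted, cross_val_predict is one way to get it. This is a sketch reusing x_text, y and clf_s from the snippet above, not part of the original script:

from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# out-of-fold predictions for the SVM pipeline, then one pooled matrix
y_oof = cross_val_predict(clf_s, x_text, y, cv=10)
print(confusion_matrix(y, y_oof))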
Example #46
0
all_X = train[columns]
all_y = train['Survived']

train_X, test_X, train_y, test_y = train_test_split(all_X,
                                                    all_y,
                                                    test_size=0.2,
                                                    random_state=0)

lr.fit(train_X, train_y)
predictions = lr.predict(test_X)
accuracy = accuracy_score(test_y, predictions)

from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(test_y, predictions)
pd.DataFrame(conf_matrix,
             columns=['Survived', 'Died'],
             index=['Survived', 'Died'])

from sklearn.model_selection import cross_val_score
import numpy as np

scores = cross_val_score(lr, all_X, all_y, cv=10)
np.mean(scores)

print('accuracy: ', accuracy)

print('mean :', np.mean(scores))

lr = LogisticRegression()
Example #47
0
    kfold = model_selection.KFold(n_splits=10)
    cross_res = model_selection.cross_val_score(model,
                                                X_train,
                                                Y_train,
                                                cv=kfold,
                                                scoring=scoring)
    results.append((name, cross_res))

for name, res in results:
    print("{:6} {:2.4} {:2.4}").format(name, res.mean(), res.std())

model = LinearDiscriminantAnalysis(solver='lsqr')

model.fit(X_train, Y_train)

predictions = model.predict(X_val)
print(accuracy_score(Y_val, predictions))

print(classification_report(Y_val, predictions))
print(confusion_matrix(Y_val, predictions))

model = LinearDiscriminantAnalysis(solver='eigen')

model.fit(X_train, Y_train)

predictions = model.predict(X_val)
print(accuracy_score(Y_val, predictions))

print(classification_report(Y_val, predictions))
print(confusion_matrix(Y_val, predictions))
    axes[i].set_xlim(x_min, x_max)
    axes[i].set_ylim(y_min, y_max)
    plt.sca(axes[i])
    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.prism)
    ys = (-clf.intercept_[i] - xs * clf.coef_[i, 0]) / clf.coef_[i, 1]
    plt.plot(xs, ys)

print(clf.predict(scaler.transform([[4.7, 3.1]])))

print(clf.decision_function(scaler.transform([[4.7, 3.1]])))

from sklearn import metrics
y_train_pred = clf.predict(X_train)
print(metrics.accuracy_score(y_train, y_train_pred))

y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

print(
    metrics.classification_report(y_test,
                                  y_pred,
                                  target_names=iris.target_names))

print(metrics.confusion_matrix(y_test, y_pred))

print("My name is Timothee Becker")
print("My NetID is: tbecker5")
print(
    "I hereby certify that I have read the University policy on Academic Integrity and that I am not in violation."
)
}

GridSearch_Log_Reg = GridSearchCV(model_Logistic_Regression, parameters, n_jobs=-1)
GridSearch_Log_Reg = GridSearch_Log_Reg.fit(X, y)
print('Training the Model please be patient.')
print('\n')
print('GridSearch Logistic Regression best score: \n',GridSearch_Log_Reg.best_score_)
print('\n')
print('GridSearch Logistic Regression best parameters: \n',GridSearch_Log_Reg.best_params_)

#Fit the model with Train
model_Logistic_Regression.fit(X_train, y_train)

pred = model_Logistic_Regression.predict(X_test)
#Confusion matrix for the held-out split (true labels first, predictions second)
cm = confusion_matrix(y_test, pred)
print('Fit X, y with Logistic Regression Algorithm ',model_Logistic_Regression.fit(X, y))
#model_Logistic_Regression.fit(X, y)
#Export pkl
joblib.dump(GridSearch_Log_Reg, "model_LogistikRegression.pkl")
#import pkl
model_Logistic_Regression_loaded = joblib.load("model_LogistikRegression.pkl")

y_preds = model_Logistic_Regression.predict(X_test)
print('\n')
print('\n')
print('accuracy score: ',accuracy_score(y_test, y_preds))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds))
print('\n')
print(classification_report(y_test, y_preds))
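The pickled GridSearchCV object is exported and reloaded above but never evaluated; a short sketch of scoring a reloaded copy on the same held-out split, reusing joblib, X_test and y_test from this snippet, might be:

# reload the exported estimator and evaluate it the same way as the plain model
reloaded = joblib.load("model_LogistikRegression.pkl")
loaded_preds = reloaded.predict(X_test)
print('reloaded model accuracy: ', accuracy_score(y_test, loaded_preds))
print('reloaded model confusion matrix: \n', confusion_matrix(y_test, loaded_preds))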
                             X_test,
                             y_test,
                             name='ROC fold {}'.format(k_iteration),
                             alpha=0.3,
                             lw=1,
                             ax=ax)
        interp_tpr = interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

        y_pred_train = trained_model.predict(X_train)
        y_pred_test = trained_model.predict(X_test)
        accuracy_train = accuracy_score(y_train, y_pred_train)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        TN, FP, FN, TP = confusion_matrix(y_test, y_pred_test).ravel()

        __print(model_name + 'train_acc: {}'.format(accuracy_train))
        __print(model_name + 'test_acc: {}'.format(accuracy_test))
        __print(model_name + 'TN:{}'.format(TN))
        __print(model_name + 'FP:{}'.format(FP))
        __print(model_name + 'FN:{}'.format(FN))
        __print(model_name + 'TP:{}'.format(TP))

        result_dict[model_name]['train_acc'].append(accuracy_train)
        result_dict[model_name]['test_acc'].append(accuracy_test)
        result_dict[model_name]['TN'].append(TN)
        result_dict[model_name]['TP'].append(TP)
        result_dict[model_name]['FP'].append(FP)
        result_dict[model_name]['FN'].append(FN)
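Not part of the original loop: once every fold has appended its counts to result_dict, the per-fold confusion-matrix cells can be pooled into an overall sensitivity and specificity for a given model_name, for example:

# pool the per-fold confusion-matrix cells accumulated above
pooled = {k: sum(result_dict[model_name][k]) for k in ('TP', 'TN', 'FP', 'FN')}
sensitivity = pooled['TP'] / float(pooled['TP'] + pooled['FN'])
specificity = pooled['TN'] / float(pooled['TN'] + pooled['FP'])
print('{} pooled sensitivity: {:.3f}, specificity: {:.3f}'.format(model_name, sensitivity, specificity))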
Example #51
0
    def infer(self):
        # from sklearn import svm
        # from sklearn.ensemble import IsolationForest
        # from sklearn.metrics import confusion_matrix
        # import pickle as pk
        # train_labels, test_labels = pk.load(open(
        #         '/Users/badgod/badgod_documents/Datasets/covid19/processed_data/coswara_train_data_fbank_cough-shallow_labels.pkl',
        #         'rb')), pk.load(open(
        #         '/Users/badgod/badgod_documents/Datasets/covid19/processed_data/coswara_test_data_fbank_cough-shallow_labels.pkl',
        #         'rb'))
        # train_latent_features, test_latent_features = pk.load(
        #         open('/Users/badgod/badgod_documents/Datasets/covid19/processed_data/forced_train_latent.npy',
        #              'rb')), pk.load(
        #         open('/Users/badgod/badgod_documents/Datasets/covid19/processed_data/forced_test_latent.npy', 'rb'))
        # # for x, y in zip(train_latent_features, train_labels):
        # #     if y == 0:
        # #         print('Mean: ', np.mean(x), ' Std: ', np.std(x), ' | Label: ', y)
        # # for x, y in zip(train_latent_features, train_labels):
        # #     if y == 1:
        # #         print('Mean: ', np.mean(x), ' Std: ', np.std(x), ' | Label: ', y)
        # #
        # # exit()
        # self.logger.info(
        #         'Total train data len: ' + str(len(train_labels)) + ' | Positive samples: ' + str(sum(train_labels)))
        # self.logger.info(
        #         'Total test data len: ' + str(len(test_labels)) + ' | Positive samples: ' + str(sum(test_labels)))
        # # oneclass_svm = svm.OneClassSVM(kernel="rbf")
        # oneclass_svm = IsolationForest(random_state=0)
        # oneclass_svm.fit(train_latent_features)
        # oneclass_predictions = oneclass_svm.predict(train_latent_features)
        # masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
        # train_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(train_labels), threshold=self.threshold)
        # train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
        # self.logger.info(f'***** Train Metrics ***** ')
        # self.logger.info(
        #         f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
        #         f"| UAR: {'%.5f' % train_metrics['train_uar']}| F1:{'%.5f' % train_metrics['train_f1']} "
        #         f"| Precision:{'%.5f' % train_metrics['train_precision']} "
        #         f"| Recall:{'%.5f' % train_metrics['train_recall']} | AUC:{'%.5f' % train_metrics['train_auc']}")
        # self.logger.info('Train Confusion matrix - \n' + str(confusion_matrix(train_labels, masked_predictions)))
        # # Test
        # oneclass_predictions = oneclass_svm.predict(test_latent_features)
        # masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
        # test_metrics = accuracy_fn(to_tensor(masked_predictions), to_tensor(test_labels), threshold=self.threshold)
        # test_metrics = {'test_' + k: v for k, v in test_metrics.items()}
        # self.logger.info(f'***** Test Metrics ***** ')
        # self.logger.info(
        #         f"Accuracy: {'%.5f' % test_metrics['test_accuracy']} "
        #         f"| UAR: {'%.5f' % test_metrics['test_uar']}| F1:{'%.5f' % test_metrics['test_f1']} "
        #         f"| Precision:{'%.5f' % test_metrics['test_precision']} "
        #         f"| Recall:{'%.5f' % test_metrics['test_recall']} | AUC:{'%.5f' % test_metrics['test_auc']}")
        # self.logger.info('Test Confusion matrix - \n' + str(confusion_matrix(test_labels, masked_predictions)))

        from sklearn import svm
        from sklearn.metrics import confusion_matrix
        import pickle
        self._min, self._max = -80.0, 3.8146973e-06
        train_data, train_labels = self.data_reader(self.data_read_path, [self.train_file],
                                                    shuffle=False,
                                                    train=True, only_negative_samples=False)

        test_data, test_labels = self.data_reader(self.data_read_path, [self.test_file],
                                                  shuffle=False,
                                                  train=False, only_negative_samples=False)
        train_latent_features, test_latent_features = [], []
        with torch.no_grad():
            for i, (audio_data, label) in enumerate(zip(train_data, train_labels)):
                audio_data = to_tensor(audio_data, device=self.device)
                train_predictions, train_latent = self.network(audio_data)
                train_latent_features.extend(to_numpy(train_latent.squeeze(1)))
        pickle.dump(train_latent_features,
                    open('ae_contrastive_train_latent.npy', 'wb'))

        oneclass_svm = svm.OneClassSVM(nu=0.1, kernel="poly", gamma=0.1)
        oneclass_svm.fit(train_latent_features)
        oneclass_predictions = oneclass_svm.predict(train_latent_features)
        masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
        train_metrics = accuracy_fn(to_tensor(masked_predictions),
                                    to_tensor([element for sublist in train_labels for element in sublist]),
                                    threshold=self.threshold)
        train_metrics = {'train_' + k: v for k, v in train_metrics.items()}
        self.logger.info(f'***** Train Metrics ***** ')
        self.logger.info(
                f"Accuracy: {'%.5f' % train_metrics['train_accuracy']} "
                f"| UAR: {'%.5f' % train_metrics['train_uar']}| F1:{'%.5f' % train_metrics['train_f1']} "
                f"| Precision:{'%.5f' % train_metrics['train_precision']} "
                f"| Recall:{'%.5f' % train_metrics['train_recall']} | AUC:{'%.5f' % train_metrics['train_auc']}")
        self.logger.info('Train Confusion matrix - \n' + str(
                confusion_matrix([element for sublist in train_labels for element in sublist], masked_predictions)))

        # Test
        with torch.no_grad():
            for i, (audio_data, label) in enumerate(zip(test_data, test_labels)):
                audio_data = to_tensor(audio_data, device=self.device)
                test_predictions, test_latent = self.network(audio_data)
                test_latent_features.extend(to_numpy(test_latent.squeeze(1)))
        pickle.dump(test_latent_features,
                    open('ae_contrastive_test_latent.npy', 'wb'))

        oneclass_predictions = oneclass_svm.predict(test_latent_features)
        masked_predictions = self.mask_preds_for_one_class(oneclass_predictions)
        test_metrics = accuracy_fn(to_tensor(masked_predictions),
                                   to_tensor([element for sublist in test_labels for element in sublist]),
                                   threshold=self.threshold)
        test_metrics = {'test_' + k: v for k, v in test_metrics.items()}
        self.logger.info(f'***** Test Metrics ***** ')
        self.logger.info(
                f"Accuracy: {'%.5f' % test_metrics['test_accuracy']} "
                f"| UAR: {'%.5f' % test_metrics['test_uar']}| F1:{'%.5f' % test_metrics['test_f1']} "
                f"| Precision:{'%.5f' % test_metrics['test_precision']} "
                f"| Recall:{'%.5f' % test_metrics['test_recall']} | AUC:{'%.5f' % test_metrics['test_auc']}")
        self.logger.info('Test Confusion matrix - \n' + str(
                confusion_matrix([element for sublist in test_labels for element in sublist], masked_predictions)))

        train_latent_features = np.array(train_latent_features)
        test_latent_features = np.array(test_latent_features)
        # labels are stored per batch (lists of lists), so flatten before indexing
        flat_train_labels = [element for sublist in train_labels for element in sublist]
        ones_idx = [i for i, x in enumerate(flat_train_labels) if x == 1]
        zeros_idx = [i for i, x in enumerate(flat_train_labels) if x == 0]
        print(train_latent_features[ones_idx].mean(), train_latent_features[ones_idx].std())
        print(train_latent_features[zeros_idx].mean(), train_latent_features[zeros_idx].std())

        flat_test_labels = [element for sublist in test_labels for element in sublist]
        ones_idx = [i for i, x in enumerate(flat_test_labels) if x == 1]
        zeros_idx = [i for i, x in enumerate(flat_test_labels) if x == 0]
        print(test_latent_features[ones_idx].mean(), test_latent_features[ones_idx].std())
        print(test_latent_features[zeros_idx].mean(), test_latent_features[zeros_idx].std())
dataset = pd.read_csv(r'C:\Users\96251\Desktop\ML_code\files\100-Days-Of-ML-Code-master\datasets\Social_Network_Ads.csv')
X = dataset.iloc[:, [2, 3]].values
Y = dataset.iloc[:, 4].values

X_train,X_test,Y_train, Y_test = train_test_split(X,Y,test_size=0.25, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

classifier = LogisticRegression()
classifier.fit(X_train,Y_train)

Y_pred = classifier.predict(X_test)

cm = confusion_matrix(Y_test,Y_pred)

X_set, Y_set = X_train, Y_train
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min()-1, stop=X_set[:, 0].max()+1, step=0.01),
                     np.arange(start=X_set[:, 1].min()-1, stop=X_set[:, 1].max()+1, step=0.01))
print('X_set[:,0].min()-1',X_set[:,0].min()-1)
print('X_set[:, 0].max()+1',X_set[:, 0].max()+1)
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('LightCoral', 'MintCream')))
print('a',np.array([X1.ravel(),X2.ravel()]))
print('b',classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).shape)
print('c',classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape).shape)

plt.xlim(X1.min(),X1.max())
plt.ylim(X2.min(),X2.max())
for i, j in enumerate(np.unique(Y_set)):
    ax.set_ylim([0.0, 1.02])
    if k in [3, 4, 5]:
        ax.set_xlabel('Recall (Sensitivity)', fontsize=17)
    if k in [0, 3]:
        ax.set_ylabel('Precision (PPV)', fontsize=17)
    # plt.title('Precision-Recall curve (' + name + ')')
    if k == 0:
        plt.legend(loc="lower left", fontsize=17)
    else:
        ax.legend().remove()
    plt.tight_layout()
    plt.savefig('./outputs/figures/precision_recall_{0}.pdf'.format(name))

# %% Confusion matrices (Supplementary Table 1)

M = [[confusion_matrix(y_true[:, k], y_pred[:, k], labels=[0, 1])
      for k in range(nclasses)] for y_pred in [y_neuralnet, y_cardio, y_emerg, y_student]]

M_xarray = xr.DataArray(np.array(M),
                        dims=['predictor', 'diagnosis', 'true label', 'predicted label'],
                        coords={'predictor': ['DNN', 'cardio.', 'emerg.', 'stud.'],
                                'diagnosis': diagnosis,
                                'true label': ['not present', 'present'],
                                'predicted label': ['not present', 'present']})
confusion_matrices = M_xarray.to_dataframe('n')
confusion_matrices = confusion_matrices.reorder_levels([1, 2, 3, 0], axis=0)
confusion_matrices = confusion_matrices.unstack()
confusion_matrices = confusion_matrices.unstack()
confusion_matrices = confusion_matrices['n']
confusion_matrices.to_excel("./outputs/tables/confusion matrices.xlsx", float_format='%.3f')
confusion_matrices.to_csv("./outputs/tables/confusion matrices.csv", float_format='%.3f')
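One way to read a single 2x2 table back out of M_xarray is a plain .sel call on the coords defined above; shown here only as a usage sketch (predictor and diagnosis names are taken from those coords):

# select the DNN confusion matrix for the first diagnosis;
# remaining dims are 'true label' (rows) and 'predicted label' (columns)
single_cm = M_xarray.sel(predictor='DNN', diagnosis=diagnosis[0])
print(single_cm.values)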
Example #54
0
#len of X_train here is the number of samples (8000).
#1 here is the single timestep per sample.
#16 is the number of features, matching the reshape below.
X_train=X_train.reshape(len(X_train),1,16)
X_test=X_test.reshape(len(X_test),1,16)
model.add(LSTM(100, input_shape=(None, 16), activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print('Model loaded.')
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('Model compiled.')

print(model.summary())

model.fit(X_train,Y_train,epochs=20)

val_loss,val_acc=model.evaluate(X_test,Y_test)
print(val_loss,val_acc)



p=model.predict([X_test])
 
import numpy as np
for i in range(0, 40):
    print(p[i], Y_test[i])

p = (p > 0.5)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, p)
print(cm)
Example #55
0
# Fitting the classifier into the Training set

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=200,
                                    criterion='entropy',
                                    random_state=0)
classifier.fit(X_Train, Y_Train)

# Predicting the test set results

Y_Pred = classifier.predict(X_Test)

# Making the Confusion Matrix

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_Test, Y_Pred)

# Visualising the Training set results

from matplotlib.colors import ListedColormap
X_Set, Y_Set = X_Train, Y_Train
X1, X2 = np.meshgrid(
    np.arange(start=X_Set[:, 0].min() - 1,
              stop=X_Set[:, 0].max() + 1,
              step=0.01),
    np.arange(start=X_Set[:, 1].min() - 1,
              stop=X_Set[:, 1].max() + 1,
              step=0.01))
plt.contourf(X1,
             X2,
             classifier.predict(np.array([X1.ravel(),
def specificity_score(y_true, y_pred):
    m = confusion_matrix(y_true, y_pred, labels=[0, 1])
    spc = m[0, 0] * 1.0 / (m[0, 0] + m[0, 1])
    return spc
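A toy usage check for specificity_score, together with a symmetric sensitivity helper (the labels below are illustrative only):

from sklearn.metrics import confusion_matrix

def sensitivity_score(y_true, y_pred):
    m = confusion_matrix(y_true, y_pred, labels=[0, 1])
    return m[1, 1] * 1.0 / (m[1, 1] + m[1, 0])

y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 1, 0, 1, 1, 0]
print(specificity_score(y_true, y_pred))   # 2 true negatives / 3 actual negatives = 0.667
print(sensitivity_score(y_true, y_pred))   # 2 true positives / 3 actual positives = 0.667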
Example #57
0
# defining the training model
# fitting the model on the training set and predicting on the test set
# Several algorithms were tried and linear_model.BayesianRidge() was the most accurate
# for the proposed training set
model = linear_model.BayesianRidge()
model.fit(X_train, Y_train)
preds = model.predict(X_test)

#Printing the model's accuracy
accuracy = metrics.accuracy_score(preds.round(), Y_test)
print("Accuracy : %s" % "{0:.3%}".format(accuracy))

# Creating the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, preds.round())

# writing the model's accuracy to a file, based on the generated train, test and prediction sets
createPrediction()

##################################################################################################
######      End of the data-mining model run on the already-preprocessed dataset
##################################################################################################

#######################################################################################
############ Building the standings table from the prediction results
#######################################################################################
jogos = pd.read_csv('result.csv')

tabela = grupoTime[grupoTime['ano'] == 2018]
tabela = tabela.filter(items=['id', 'team', 'grupo'])
Example #58
0
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train,y_train)

#LR fitted after the PCA transformation
classifier2 = LogisticRegression(random_state=0)
classifier2.fit(X_train2,y_train)

#predictions
y_pred = classifier.predict(X_test)

y_pred2 = classifier2.predict(X_test2)

from sklearn.metrics import confusion_matrix
#actual labels vs. predictions without PCA
print('actual / without PCA')
cm = confusion_matrix(y_test,y_pred)
print(cm)

#actual labels vs. predictions after PCA
print('actual / with PCA')
cm2 = confusion_matrix(y_test,y_pred2)
print(cm2)

#predictions without PCA vs. predictions with PCA
print('without PCA / with PCA')
cm3 = confusion_matrix(y_pred,y_pred2)
print(cm3)


from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
Example #59
0
                                                    Y,
                                                    test_size=0.3,
                                                    random_state=0)
# 2-Let us build the model and validate the parameters

clf1 = linear_model.LogisticRegression(solver='lbfgs')
clf1.fit(X_train, Y_train)
#3- Run the test data against the new model
probs = clf1.predict_proba(X_test)
print(probs)
predicted = clf1.predict(X_test)
print(predicted)
#4-Check model accuracy
print(metrics.accuracy_score(Y_test, predicted))
#To avoid sampling bias run cross validation for 10 times, as follows
scores = cross_val_score(linear_model.LogisticRegression(solver='lbfgs'),
                         X,
                         Y,
                         scoring='accuracy',
                         cv=10)
print(scores)
print(scores.mean())
#Generate the confusion matrix as follows:
prob = probs[:, 1]
prob_df = pd.DataFrame(prob)
prob_df['predict'] = np.where(prob_df[0] >= 0.05, 1, 0)
Y_A = Y_test.values
Y_P = np.array(prob_df['predict'])
conf_mat = confusion_matrix(Y_A, Y_P)  # avoid shadowing the sklearn function name
print(conf_mat)
Example #60
0
for i in imptlist:
    temp.append([i])
temp = pd.DataFrame(temp, index = ['Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned',
       'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'Age',
       'Education_Bachelors', 'Education_Graduate Degree',
       'Education_High School', 'Education_Partial College',
       'Education_Partial High School', 'Occupation_Clerical',
       'Occupation_Management', 'Occupation_Manual', 'Occupation_Professional',
       'Occupation_Skilled Manual'])
temp.columns = ['Feature Importance']
print(temp)

import sklearn.metrics as metrics

yc_predict = fcmodel.predict(Xc_test)
print(metrics.confusion_matrix(yc_test, yc_predict))
print('Recall Score:', round(metrics.recall_score(yc_test, yc_predict) * 100,3))
print('Accuracy Score:', round(metrics.accuracy_score(yc_test, yc_predict) * 100, 3))
print('Precision:', round(metrics.precision_score(yc_test, yc_predict) * 100, 3))
print('F1 Score:', round(metrics.f1_score(yc_test, yc_predict) * 100,3))
fpr, tpr, threshold = metrics.roc_curve(yc_test, yc_predict)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, label = 'AUC = %0.2f' % roc_auc, color='darkorange')
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1], 'b--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()