def test_topic_distribution(doc_topic_weights_filename, annotated_data_filename, k, test_indice, train_prop, column_of_interest):
	(X, Y, topic_to_index) = process_dataset(doc_topic_weights_filename, annotated_data_filename, k, column_of_interest)
	num_train = int(X.shape[0] * train_prop)  # use train_prop of the data for training

	# Callers repeat this over several test windows and average the returned scores

	# Divide the set into train and test sets
	X_train = X[:num_train]
	Y_train = Y[:num_train]

	test_start_index = test_indice[0]
	test_end_index = test_indice[1]
	X_test = X[test_start_index:test_end_index]
	Y_test = Y[test_start_index:test_end_index]

	# Build a classifier
	clf = LogisticRegression().fit(X_train, Y_train)

	# Make prediction
	predicted_labels = clf.predict(X_test)

	# Report the F1 score
	true_labels = Y_test
	score = f1_score(true_labels, predicted_labels)  # sklearn metrics expect (y_true, y_pred)
	print "---------------- %s --------------------" % str(test_indice)
	print classification_report(true_labels, predicted_labels)
	print topic_to_index
	print "-----------------------------------------"
	return score
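The train_prop and test_indice parameters suggest a driver that slides a test window across the corpus and averages the per-window F1. A minimal sketch of such a caller; the filenames, k, corpus size, and window size are placeholders, not values from the source:

# Hypothetical driver for test_topic_distribution; all constants are placeholders.
window = 100
scores = []
for start in range(0, 1000, window):
    scores.append(test_topic_distribution(
        'doc_topic_weights.txt', 'annotated_data.csv', k=50,
        test_indice=(start, start + window), train_prop=0.8,
        column_of_interest='label'))
print("Average F1: %.3f" % (sum(scores) / len(scores)))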
def test_classification_report_multiclass_with_string_label():
    y_true, y_pred, _ = make_prediction(binary=False)

    y_true = np.array(["blue", "green", "red"])[y_true]
    y_pred = np.array(["blue", "green", "red"])[y_pred]

    expected_report = """\
             precision    recall  f1-score   support

       blue       0.83      0.79      0.81        24
      green       0.33      0.10      0.15        31
        red       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)

    expected_report = """\
             precision    recall  f1-score   support

          a       0.83      0.79      0.81        24
          b       0.33      0.10      0.15        31
          c       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred,
                                   target_names=["a", "b", "c"])
    assert_equal(report, expected_report)
def test_classification_report_multiclass():
    """Test performance report"""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa       0.83      0.79      0.81        24
 versicolor       0.33      0.10      0.15        31
  virginica       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(
        y_true, y_pred, labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names)
    assert_equal(report, expected_report)

    # print classification report with label detection
    expected_report = """\
             precision    recall  f1-score   support

          0       0.83      0.79      0.81        24
          1       0.33      0.10      0.15        31
          2       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run a single train/test split classification without cross-fold validation.
    '''

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 30% reserved for validation

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)])

    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)

    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
def test_classification_report_multiclass_with_digits():
    """Test performance report with added digits in floating point values"""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa    0.82609   0.79167   0.80851        24
 versicolor    0.33333   0.09677   0.15000        31
  virginica    0.41860   0.90000   0.57143        20

avg / total    0.51375   0.53333   0.47310        75
"""
    report = classification_report(
        y_true, y_pred, labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names, digits=5)
    assert_equal(report, expected_report)

    # print classification report with label detection
    expected_report = """\
             precision    recall  f1-score   support

          0       0.83      0.79      0.81        24
          1       0.33      0.10      0.15        31
          2       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
Example #6
def model_search(estimator, tuned_params, scores, X_train, y_train, X_test, y_test):  
    
    cv = ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0)

    for score in scores:
        print "# Tuning hyper-parameters for %s" % score
        print

        clf = GridSearchCV(estimator, tuned_params, cv=cv,
                           scoring=score)
        clf.fit(X_train, y_train)

        print "Best parameters set found on development set:"
        print
        print clf.best_params_
        print
        print "Grid scores on development set:"
        print
        for params, mean_score, cv_scores in clf.grid_scores_:  # renamed to avoid shadowing the outer `scores`
            print "%0.3f (+/-%0.03f) for %r" % (mean_score, cv_scores.std() * 2, params)
        print

        print "Detailed classification report:"
        print
        print "The model is trained on the full development set."
        print "The scores are computed on the full evaluation set."
        print
        y_true, y_pred = y_test, clf.predict(X_test)
        print classification_report(y_true, y_pred)
        print
def main():
	# Get the data and targets
	df = pd.read_csv('train1.csv')
	df = df[df.rating != 'rating']
	corpus = [review for review in df.review]
	splitPoint = len(corpus)*2/3
	trainingCorpus = corpus[:splitPoint]
	testCorpus = corpus[splitPoint:]
	target = [rating for rating in df.rating]
	trainingTarget = np.array(target[:splitPoint])
	testTarget = np.array(target[splitPoint:])

	# Train the algorithm
	train_X, vocabList = createVectorizer(trainingCorpus, 'None', True)
	NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget)

	# Test the algorithm
	test_X = createVectorizer(testCorpus, vocabList, True)
	test_predict = NB_Bern_model.predict(test_X)
	print(np.mean(test_predict == testTarget))	
	print metrics.classification_report(testTarget, test_predict, target_names=['0', '1'])

	# Make Predictions
	predict_df = pd.read_csv('test2.csv')
	predictCorpus = [review for review in predict_df.review]
	member = [memberid for memberid in predict_df.ID]
	predict_X = createVectorizer(predictCorpus, vocabList, True)
	predictions = NB_Bern_model.predict(predict_X)
	predict_df.columns = ['ID', 'Predicted']
	for i in range(len(member)):
		predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i]
	predict_df.to_csv('submission1.csv', sep = ',', index=False)
  def print_metrics(self, y_true, y_pred, print_averages=True):
    print
    print "{:^30}".format("Confusion matrix")
    categories = sorted(self.categories)
    labels = " ".join("{:>10}".format(c) for c in categories)
    print "{:>10} {} {:>10}".format("gold\pred", labels, "total")
    for cat, predictions in zip(categories, metrics.confusion_matrix(y_true, y_pred)):
      vals = " ".join("{:>10d}".format(p) for p in predictions)
      print "{:>10} {} {:>10}".format(cat, vals, sum(predictions))
    print

    acc = metrics.accuracy_score(y_true, y_pred)
    print "Accuracy: {:.4f}".format(acc)

    # Map each category name to an integer index for the report
    d = {label: idx for idx, label in enumerate(self.categories)}

    print metrics.classification_report([d[y] for y in y_true],
                                        [d[y] for y in y_pred],
                                        target_names=self.categories)

    if print_averages:
      print "Macro averaging"
      self._print_metrics(y_true, y_pred, average='macro')

      print "Micro averaging"
      self._print_metrics(y_true, y_pred, average='micro')
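The label-to-index dict above reflects very old scikit-learn versions that expected integer labels; current versions accept string labels directly, as the string-label test earlier on this page shows. A minimal sketch, assuming a reasonably recent sklearn:

from sklearn.metrics import classification_report

y_true = ["cat", "dog", "dog", "cat", "bird"]
y_pred = ["cat", "dog", "cat", "cat", "bird"]
# String labels work directly; no label -> index mapping is needed.
print(classification_report(y_true, y_pred))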
Example #9
def test_classification_report():
    """Test performance report"""
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa       0.82      0.92      0.87        25
 versicolor       0.56      0.17      0.26        30
  virginica       0.47      0.90      0.62        20

avg / total       0.62      0.61      0.56        75
"""
    report = classification_report(
        y_true, y_pred, labels=range(len(iris.target_names)),
        target_names=iris.target_names)
    assert_equal(report, expected_report)

    # print classification report with label detection
    expected_report = """\
             precision    recall  f1-score   support

          0       0.82      0.92      0.87        25
          1       0.56      0.17      0.26        30
          2       0.47      0.90      0.62        20

avg / total       0.62      0.61      0.56        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
Example #10
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print "training data loaded"
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1,verbose=1,class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw,ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    print "OOB score:", rf.oob_score_

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict)
    print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict)

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print "accuracy:", accuracy_score(y_true=ytest_raw, y_pred=ytest_predict)
    print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
def main(train_data, test_data):
    print "Training"

    m = HiddenMarkovModelTagger.train(train_data)

    print "Predicting"
    predicted_labels = []

    for i, sent in enumerate(test_data):
        if i % 500 == 0:
            print "%d / %d" %(i, len(test_data))
        predicted_labels += [tag 
                             for _, tag in m.tag(
                                     [word for word, _ in sent]
                             )]

    correct_labels = [tag
                      for sent in test_data
                      for _, tag in sent]

    # print predicted_labels
    # print correct_labels

    from sklearn.metrics import classification_report

    print classification_report(correct_labels, predicted_labels)
        
    correct_n = len([1 
                     for p, c in zip(predicted_labels, correct_labels) 
                     if p == c])
        
    print "Item accuracy:", float(correct_n) / len(correct_labels)
Example #12
def separable_demo():
    """ Generate a linearly-separable dataset D, train a linear SVM on
    D, then output the resulting decision boundary on a figure.
    """
    from sklearn.datasets import make_blobs
    X, y = make_blobs(n_samples=200, n_features=2, 
                      centers=((0,0), (4, 4)),
                      cluster_std=1.0)
    plot_data(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    svc = svm.SVC(class_weight='auto')
    param_grid = {'kernel': ['linear'],
                  'C': [1e0, 1e1, 1e2, 1e3, 1e4]}
    strat_2fold = StratifiedKFold(y_train, k=2)
    print "    Parameters to be chosen through cross validation:"
    for name, vals in param_grid.iteritems():
        if name != 'kernel':
            print "        {0}: {1}".format(name, vals)
    clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold)
    clf.fit(X_train, y_train)
    print "== Best Parameters:", clf.best_params_
    y_pred = clf.predict(X_test)
    acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred))
    print "== Accuracy:", acc
    print classification_report(y_test, y_pred)
    plot_svm(clf.best_estimator_, X, y, X_test, y_test, 
             title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
def benchmark(clf_class, params, name):
    print "parameters:", params
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print "done in %fs" % (time() - t0)

    if hasattr(clf, 'coef_'):
        print "Percentage of non zeros coef: %f" % (
          np.mean(clf.coef_ != 0) * 100)

    print "Predicting the outcomes of the testing set"
    t0 = time()
    pred = clf.predict(X_test)
    print "done in %fs" % (time() - t0)

    print "Classification report on test set for classifier:"
    print clf
    print
    print classification_report(y_test, pred,
       target_names=news_test.target_names)

    cm = confusion_matrix(y_test, pred)
    print "Confusion matrix:"
    print cm

    # Show confusion matrix
    pl.matshow(cm)
    pl.title('Confusion matrix of the %s classifier' % name)
    pl.colorbar()
def score(train_labels, train_features, test_labels, test_features, save_file, use_tree=False):
    if use_tree:
        train_clf = Classifier(tree.DecisionTreeClassifier())
    else:
        train_clf = Classifier()

    print train_clf.clf
    print ''

    t_start = time.clock()
    train_clf.learn(train_features, train_labels)
    t_end = time.clock()
    if save_file:
        train_clf.save_to_file(open(save_file, 'w'))

    p_start = time.clock()
    predicted = train_clf.clf.predict(test_features)
    p_end = time.clock()

    test_labels_t = train_clf.labels.transform(test_labels)
    print classification_report(test_labels_t, predicted, target_names=train_clf.labels.classes_)
    print 'Training time: %fs' % (t_end - t_start)
    print 'Predicting time: %fs' % (p_end - p_start)
    print 'Mean squared error: %f' % mean_squared_error(test_labels_t, predicted)
    return train_clf.score(test_features, test_labels)
Example #15
def PredictAndAnalyze(data=data, target=target2, clf_cv=svm.SVC(kernel='linear', probability=True), checkauc=False, ifprint=False, balancing=True):
    kf = KFold(len(target), n_folds=10, shuffle=True)
    aucs = []
    y_true_all = []
    y_predict_all = []
    for train, val in kf:
        X_train, y_train = np.array(data)[train], np.array(target)[train]
        if balancing:
            # Undersample each class to the size of the smallest class
            length = min([len(y_train[y_train == 0]), len(y_train[y_train == 1]), len(y_train[y_train == 2])])
            X_train = np.r_[X_train[y_train == 0][0:length], X_train[y_train == 1][0:length], X_train[y_train == 2][0:length]]
            y_train = np.r_[y_train[y_train == 0][0:length], y_train[y_train == 1][0:length], y_train[y_train == 2][0:length]]
        X_test, y_test = np.array(data)[val], np.array(target)[val]
        clf_cv.fit(X_train, y_train)
        y_pred = clf_cv.predict(X_test)
        y_true_all = y_true_all + list(y_test)
        y_predict_all = y_predict_all + list(y_pred)
        if ifprint:
            print(classification_report(y_test, y_pred))
        if checkauc:
            y_pred_cv = clf_cv.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_pred_cv)
            aucs.append(auc)
    if checkauc:
        print np.mean(aucs), np.std(aucs)
    print(classification_report(y_true_all, y_predict_all))
    return y_true_all, y_predict_all
Example #16
def xgb_model(x_train, x_test, y_train, y_test):
    dtrain = xgb.DMatrix( x_train, label=y_train)
    del x_train
    dtest = xgb.DMatrix( x_test, label=y_test)
    del x_test
    param = {}
    param['eta'] = 0.1
    param['max_depth'] = 10
    param['silent'] = 1
    param['num_class'] = 4
    param['objective'] = 'multi:softmax'
    param['nthread'] = 2
    param['n_estimators']=100
    #param['eval_metric'] = 'auc'
    watchlist = [(dtrain, 'train'), (dtest, 'test')]

    #evallist  = [(dtest,'eval'), (dtrain,'train')]
    num_round = 237
    bst = xgb.train(param, dtrain, num_round, evals=watchlist)
    #xgb.plot_importance(bst)
    
    #rf = RandomForestClassifier(n_estimators=3000, max_depth=10,n_jobs=2)
    #pipe1 = Pipeline([('sel',ColumnSelector(range(col_count))),('clf',bst)])
    #pipe2 = Pipeline([('sel',ColumnSelector(range(col_count[:-4]))),('clf',rf)])
    
    y_pred = bst.predict(dtest)  # best_ntree_limit is only set when early stopping is used
    target_names = ['Start','Mid','End','Others']
    #eclf = EnsembleClassifier(clfs=[pipe1, pipe2],voting='soft',weights=[0.5,0.2])
    #eclf.fit(x_train,y_train)
    #y_pred = eclf.predict(x_test)
    print classification_report(y_test, y_pred, target_names=target_names)
    return bst
Example #17
def test_prf(fn1, fn2, sth, L):
    y_true = []
    y_score = []
    edges_1 = prep.read_edges(fn1)
    edges_2 = prep.read_edges(fn2)

    predict_set = {}
    for key in sth.keys():
        predict_set[key] = predict_set.get(key, 0.) + sth[key]
    predict_set = sorted(predict_set.iteritems(), key=lambda d: d[1], reverse=True)

    threshold = predict_set[L][1]
    for i in edges_1:
        if sth[i] > threshold:
            y_score.append(1)
        else:
            y_score.append(0)

    for i in edges_1:
        if i not in edges_2:
            y_true.append(0)
        else:
            y_true.append(1)

    print classification_report(y_true, y_score)
    print auc_score(y_true, y_score)
def nearest_centroid_classifier(X_train, categories, X_test, test_categories):
    from sklearn.neighbors import NearestCentroid
    clf = NearestCentroid().fit(X_train, categories)
    y_roccio_predicted = clf.predict(X_test)
    print "\n Here is the classification report for NearestCentroid classifier:"
    print metrics.classification_report(test_categories, y_roccio_predicted)
    to_latex(test_categories, y_roccio_predicted)  
Example #19
    def test(self):
        lenW = len(self.vectorizer.vocabulary_)
        W = 3*lenW
        Y_true = []
        Y_pred = []
        for i,line in enumerate(self.test_lines):
            if line['type'] == 'q':
                r = line['answer']
                id = line['id']-1
                indices = [idx for idx in range(i-id, i+1)]
                memory_list = self.L_test[indices]

                m_o1 = O_t([id], memory_list, self.s_Ot)
                m_o2 = O_t([id, m_o1], memory_list, self.s_Ot)

                bestVal = None
                best = None
                for w in self.vectorizer.vocabulary_:
                    val = self.sR([id, m_o1, m_o2], self.H[w], memory_list, self.V)
                    if bestVal is None or val > bestVal:
                        bestVal = val
                        best = w
                Y_true.append(r)
                Y_pred.append(best)
        print metrics.classification_report(Y_true, Y_pred)
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short = False):
  
    accuracy_train = metrics.accuracy_score(y_train, model.predict(X_train))
    accuracy_test = metrics.accuracy_score(y_test, model.predict(X_test))
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))
    
    if not short:
    
      # confusion matrix
      # rows: actual group
      # columns: predicted group
      print('Confusion_matrix (training data):')
      print(metrics.confusion_matrix(y_train, model.predict(X_train)))
      
      print('Confusion_matrix (test data):')
      print(metrics.confusion_matrix(y_test, model.predict(X_test)))

      # precision =  tp / (tp + fp)
      # recall = tp / (tp + fn) (= sensitivity)
      # F1 = 2 * (precision * recall) / (precision + recall)
      print('\nPrecision - recall (training data):')
      print(metrics.classification_report(y_train, model.predict(X_train)))
      
      print('\nPrecision - recall (test data):')
      print(metrics.classification_report(y_test, model.predict(X_test)))
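The precision/recall/F1 formulas quoted in the comments above can be verified by hand from raw confusion-matrix counts. A self-contained worked example with made-up binary counts (not from any dataset in this snippet):

# Worked example: precision/recall/F1 from hypothetical binary counts.
tp, fp, fn = 8, 2, 4
precision = tp / float(tp + fp)                     # 8 / 10 = 0.800
recall = tp / float(tp + fn)                        # 8 / 12 = 0.667
f1 = 2 * precision * recall / (precision + recall)  # harmonic mean = 0.727
print("precision=%.3f recall=%.3f f1=%.3f" % (precision, recall, f1))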
def svc_classifier(X_train, categories,X_test, test_categories):
    from sklearn.svm import SVC
    svm_classifier = SVC(C=100, gamma=0.1).fit(X_train, categories)
    y_svm_predicted = svm_classifier.predict(X_test)
    print '\n Here is the classification report for support vector machine classiffier:'
    print metrics.classification_report(test_categories, y_svm_predicted)
    to_latex(test_categories, y_svm_predicted)  
Example #22
def print_classification_metrics(estimated_labels, actual_labels):
    mapping = get_most_likely_class_map(estimated_labels, actual_labels)
    predicted_labels = []
    for i in estimated_labels:
        predicted_labels.append(mapping[i])

    # Report against the remapped labels, in (y_true, y_pred) order
    print metrics.classification_report(actual_labels, predicted_labels)
def faces():
  from os import walk,path
  import numpy as np
  import mahotas as mh 
  from sklearn.cross_validation import train_test_split
  from sklearn.cross_validation import cross_val_score
  from sklearn.preprocessing import scale 
  from sklearn.decomposition import PCA 
  from sklearn.linear_model import LogisticRegression
  from sklearn.metrics import classification_report
  X = []
  y = []
  for dir_path,dir_names,file_names in walk('./data/att_faces'):
    for fn in file_names:
      if fn[-3:] == 'pgm':
        image_filename = path.join(dir_path,fn)
        X.append(scale(mh.imread(image_filename,as_grey=True).reshape(10304).astype('float32')))
        y.append(dir_path)
  X = np.array(X)
  X_train,X_test,y_train,y_test = train_test_split(X,y)
  pca = PCA(n_components = 150)
  X_train_reduced = pca.fit_transform(X_train)
  X_test_reduced = pca.transform(X_test)
  print 'original dimensions were', X_train.shape
  print 'reduced dimensions are ', X_train_reduced.shape

  classifier = LogisticRegression()
  accuracies = cross_val_score(classifier,X_train_reduced,y_train)

  print 'cross val: ',np.mean(accuracies),accuracies
  classifier.fit(X_train_reduced,y_train)
  predictions = classifier.predict(X_test_reduced)
  print classification_report(y_test,predictions)
Example #24
def train_logreg(X, y, test_X, test_y, load_vec=True):
	""" 	
	Trains logistic regression on the feature set.
	"""
	full_y = y + test_y
	
	lb = LabelBinarizer()
	lb.fit(full_y)
	# Convert into 1-D array
	print len(X), len(test_X)
	model = LogisticRegression()
	big_X = X + test_X

	features = featurize(big_X)
	X, test_X = features[:4500], features[4500:]  # hard-coded split point: assumes exactly 4500 training examples
	print X.shape, X

	model.fit(X, y)

	y_pred = model.predict(X)
	print set(y_pred)
	print metrics.classification_report(y, y_pred, digits = 3)
	y_pred = model.predict(test_X)
	print set(y_pred)
	print metrics.classification_report(test_y, y_pred, digits = 3)
Example #25
def sklearn_lp(X, y,
            output=None,
            kernel='knn', 
            gamma=None,
            n_neighbors=10, 
            alpha=1, 
            max_iter=1000, 
            tol=0.00001):

    from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report
    from sklearn.metrics import accuracy_score

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3)
    label_prop_model = LabelPropagation(kernel=kernel, 
                                        gamma=gamma, 
                                        n_neighbors=n_neighbors, 
                                        alpha=alpha, 
                                        max_iter=max_iter, 
                                        tol=tol)
    label_prop_model.fit(X_train, y_train)

    y_predict = label_prop_model.predict(X_test)
    print 'y_train: ', y_train
    print 'y_predict: ', y_predict
    
    print '+--------------------------------------------------------+'
    print '|                         Report                         |'
    print '+--------------------------------------------------------+'
    print classification_report(y_test, y_predict)
    print 'accuracy: ' + str(accuracy_score(y_test, y_predict))
    print '\n\n'
Example #26
def main():
  data_id = 'B'
  data_path = '/broad/compbio/maxwshen/data/1-MAKETRAINTEST/complete/golf/'
  
  print 'train...', datetime.datetime.now()
  train_set = readin(data_id, 'train', data_path)
  print 'valid...', datetime.datetime.now()
  valid_set = readin(data_id, 'valid', data_path)
  print 'test...', datetime.datetime.now()
  test_set = readin(data_id, 'test', data_path)

  # Input to 300 node RBM to 2 node output
  dbn = DBN( \
    [dat_train.shape[1], 300, 2], \
    learn_rates = 5, \
    learn_rate_decays = 0.9, \
    epochs = 31, \
    verbose = 1)
  dbn.fit(dat_train, y_train)

  preds = dbn.predict(dat_test)
  print classification_report(y_test, preds)

  out_fn = 'dbn.pickle'
  with open(out_fn, 'w') as f:
    pickle.dump(dbn, f)

  return
def all_report(auto_narrative_entities, auto_original_entities, auto_negative, manual_positive, manual_negative):
    y_true = []
    y_pred = []

    for e in manual_positive:
        y_true.append(1)
        if e in auto_narrative_entities or e in auto_original_entities:
            y_pred.append(1)
        elif e in auto_negative:  # membership test, not truthiness of the list
            y_pred.append(0)
        else:
            y_pred.append(-1)

    for e in manual_negative:
        y_true.append(0)
        if e in auto_narrative_entities or e in auto_original_entities:
            y_pred.append(1)
        elif e in auto_negative:
            y_pred.append(0)
        else:
            y_pred.append(-1)

    print classification_report(y_true, y_pred)
Example #28
    def on_epoch_end(self, epoch, logs={}):
        print logs

        corr=0
        tot=0
        preds = self.model.predict(self.dev_data, verbose=1)
        preds_text=[]
        for l in preds:
            preds_text.append(self.index2label[np.argmax(l)])

        print "Micro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"micro")
        print "Macro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"macro")
        print "Macro recall:", recall_score(self.dev_labels_text,preds_text,average=u"macro")

        if self.best_mr < recall_score(self.dev_labels_text,preds_text,average=u"macro"):
            self.best_mr = recall_score(self.dev_labels_text,preds_text,average=u"macro")
            model.save_weights(self.model_name + '_full_' + str(epoch) + '_MR_' + str(self.best_mr) + '.hdf5')
            print 'Saved Weights!'


        print classification_report(self.dev_labels_text, preds_text)
        for i in xrange(len(self.dev_labels)):

        #    next_index = sample(preds[i])
            next_index = np.argmax(preds[i])
            # print preds[i],next_index,index2label[next_index]

            l = self.index2label[next_index]

            # print "correct:", index2label[np.argmax(dev_labels[i])], "predicted:",l
            if self.index2label[np.argmax(self.dev_labels[i])]==l:
                corr+=1
            tot+=1
        print corr,"/",tot
Example #29
def tune_parameters(X_train,X_test, y_train,y_test,param_grid):

    '''
    Function to tune an SVM classifier and choose its parameters

    :param X_train, y_train: training data and labels
    :param X_test, y_test: held-out data used for the final report
    :param param_grid: grid of parameters to try
    :return: clf.best_estimator_ the best classifier
    '''
    #X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.2, random_state=0)

    #X_train,X_test,y_train,y_test = split_data(feature_matrix,labels,params['split_percentage'])

    score = 'f1_weighted'

    clf = GridSearchCV(SVC(C=1), param_grid, cv=5, scoring=score, n_jobs=10)

    clf.fit(X_train, y_train)

    print "Best score during training: ", clf.best_score_
    print "Best estimator", clf.best_estimator_

    print "Classification report for validation set:"
    print classification_report(y_test,clf.predict(X_test))

    return clf.best_estimator_
def test_digits() :
    
    from sklearn.cross_validation import train_test_split 
    from sklearn.datasets import load_digits
    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    from sklearn.preprocessing import LabelBinarizer
    
    digits = load_digits()
    X = digits.data
    y = digits.target   #labels
    X /= X.max()        #norm

    nn = NeuralNetwork([64,100,10],'logistic')  #8x8 input, 10 output
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    labels_train = LabelBinarizer().fit_transform(y_train)  #convert no to vector
    labels_test = LabelBinarizer().fit_transform(y_test)

    nn.fit(X_train,labels_train,epochs=100)
    predictions = []
    for i in range(X_test.shape[0]) :
        o = nn.predict(X_test[i])
        predictions.append(np.argmax(o))
    print confusion_matrix(y_test,predictions)
    print classification_report(y_test,predictions)
    print 'accuracy at %0.3f'%accuracy_score(y_test,predictions)
Example #31
def main():
    global start_dens
    st.title("Multi Variate Contamination in Fuel")

    menu = ["Home", "Model Training", "Contaminant Prediction"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        st.subheader("Home")
        image_file = st.file_uploader("Upload Image",
                                      type=['png', 'jpeg', 'jpg'])
        if image_file is not None:
            # To See Details
            # st.write(type(image_file))
            # st.write(dir(image_file))
            file_details = {
                "Filename": image_file.name,
                "FileType": image_file.type,
                "FileSize": image_file.size
            }
            st.write(file_details)

            img = load_image(image_file)
            st.image(img, width=250)

    if choice == "Model Training":
        if st.button("Start Train"):
            allResults = glob.glob(
                'batch31-38_with_target_diff_prx_2000tampered/*.csv',
                recursive=True)
            allResults = sorted(allResults, key=lambda x: (x.split("/")[-1]))
            #st.write(allResults)

            newpath1 = 'batch31-38_with_target_diff_prx_2000tampered/'
            # newpath1='/content/drive/MyDrive/OIL SAMPLES DATA1/'

            folder = newpath1  ## data directory
            tag = str('.csv')  ## format to import
            initString = '-'  ## string in csv file name to search for category (normal, sludge, water, together)
            fileList = directorySearch(folder, tag)
            # print(fileList)
            final_filelist = pd.DataFrame(
                index=range(0, len(fileList)),
                columns=['file', 'Target', 'file_dir', 'window_id'])
            for i in range(0, (len(fileList))):
                fileName = fileList[i]
                res1 = fileName.find(initString)
                if res1 == -1:
                    res1 = fileName.find('_')
                if res1 == -1:
                    print(res1)
                    res1 = 5
                c1 = int(res1 + 1)
                c5 = int(res1 + 12)

                wloc = fileName.rfind('W', c1, c5)
                sloc = fileName.rfind('S', c1, c5)
                tloc = fileName.rfind('T', c1, c5)
                finalCat = max([wloc, sloc, tloc])
                strCat = fileName[finalCat]
                # print(strCat)

                classLabel = int(0)
                if strCat == 'S':
                    final_filelist['file'][i] = fileName
                    final_filelist['Target'][i] = strCat
                    # print(fileName,'---Sludge')
                    classLabel = int(1)
                if strCat == 'W':
                    final_filelist['file'][i] = fileName
                    final_filelist['Target'][i] = strCat
                    # print(fileName,'---Water')
                    classLabel = int(2)
                if strCat == 'T':
                    final_filelist['file'][i] = fileName
                    final_filelist['Target'][i] = strCat
                    # print(fileName,'--- Mix')
                    classLabel = int(3)
                if strCat not in ['S', 'T', 'W']:
                    final_filelist['file'][i] = fileName
                    final_filelist['Target'][i] = strCat

                final_filelist['file_dir'][i] = allResults[i]
                final_filelist['window_id'][i] = i + 1

            ll = []
            for i, j in enumerate(final_filelist['file']):
                # print(i,j)
                head, tail = os.path.split(j)
                r1 = re.split('_', tail)
                r2 = re.split('-', r1[0])
                print(r2)
                # if len(r2)==3 and int(r2[1]) < 37 and int(r2[1])<37 and not  'A' in r1[0] :
                if len(r2) == 3 and 'A' not in r2[2]:
                    ll.append(tail)
                elif len(r2) == 2 and 'A' not in (r2[1]):
                    ll.append(tail)
                elif len(r2) == 4 and 'A' not in (r2[3]):
                    ll.append(tail)

            dff = pd.DataFrame({'file': ll})
            dff['file'].count()

            df4 = pd.DataFrame()
            c = 0
            # for i,j in enumerate(allResults):
            for i, j in enumerate(dff['file']):
                # print(i,j)
                df = pd.read_csv(
                    'batch31-38_with_target_diff_prx_2000tampered/' + j)
                head, tail = os.path.split(j)
                # print(i,df.shape[1])
                df4[tail] = (df['Pressure_tmp'].rolling(300).std())

            df9 = pd.DataFrame(index=range(0, len(df4.columns)),
                               columns=[
                                   'file', 'pre-trans_mean', 'trans_mean',
                                   'post-trans_mean', 'transient_width'
                               ])

            for z, col in enumerate(df4.columns):
                start = 0
                end = 0

                a = df4[col]

                b = a.quantile(0.7)  # threshold set here : 70 percentile
                x = df4[col] > b  # find values greater than threshold

                # print(a)
                for i, j in enumerate(a):
                    if j > b:      # first value above the threshold
                        start = i  # mark where the transient begins
                        break
                # resume scanning from the marked start position
                for k, l in enumerate(a[start:]):
                    # the transient ends at the first value back below the
                    # threshold, checked only after 200 positions for a more
                    # robust estimate of the transient part
                    if l < b and abs(k) > 200:
                        end = start + k
                        break
                df9['file'][z] = col
                df9['pre-trans_mean'][z] = (df4[col].iloc[:start].mean())
                df9['trans_mean'][z] = (df4[col].iloc[start:end].mean())
                df9['post-trans_mean'][z] = (df4[col].iloc[end:].mean())
                if (end - start) > 0:
                    df9['transient_width'][z] = end - start
                else:
                    df9['transient_width'][z] = 0

            df5 = df4.describe().transpose()
            df5 = df5.reset_index()
            df10 = pd.merge(df9,
                            df5[['index', 'std', 'max']],
                            left_on='file',
                            right_on='index',
                            how='left')
            del df10['index']
            df10 = df10.set_index('file')

            df11 = pd.merge(df10,
                            final_filelist[['file', 'Target']],
                            left_on='file',
                            right_on='file',
                            how='left')
            df11 = df11.set_index('file')
            df11 = df11.astype({
                'pre-trans_mean': 'float64',
                'trans_mean': 'float64',
                'post-trans_mean': 'float64',
                'transient_width': 'float64'
            })

            df12 = pd.DataFrame()
            for i, j in enumerate(dff['file']):
                # print(i,j)
                df = pd.read_csv(
                    'batch31-38_with_target_diff_prx_2000tampered/' + j)
                head, tail = os.path.split(j)
                # print(i,df.shape[1])
                df12[tail] = (df['Density'].rolling(300).std())

            df13 = pd.DataFrame(index=range(0, len(df12.columns)),
                                columns=[
                                    'file', 'pre-trans_mean-density',
                                    'trans_mean-density',
                                    'post-trans_mean-density',
                                    'transient_width-density'
                                ])

            for z, col in enumerate(df12.columns):
                start = 0
                end = 0
                print(col)  # file name
                a = df12[col]

                b = a.quantile(0.7)  # threshold set here : 70 percentile
                x = df12[col] > b  # find values greater than threshold
                # print(a)
                for i, j in enumerate(a):
                    if j > b:      # first value above the threshold
                        start = i
                        break
                # resume scanning from the marked start position
                for k, l in enumerate(a[start:]):
                    # the transient ends at the first value back below the
                    # threshold, checked only after 200 positions
                    if l < b and abs(k) > 200:
                        end = start + k
                        break
                df13['file'][z] = col
                df13['pre-trans_mean-density'][z] = (
                    df12[col].iloc[:start].mean())
                df13['trans_mean-density'][z] = (
                    df12[col].iloc[start:end].mean())
                df13['post-trans_mean-density'][z] = (
                    df12[col].iloc[end:].mean())
                if (end - start) > 0:
                    df13['transient_width-density'][z] = end - start
                else:
                    df13['transient_width-density'][z] = 0
            df13 = df13.astype({
                'pre-trans_mean-density': 'float64',
                'trans_mean-density': 'float64',
                'post-trans_mean-density': 'float64',
                'transient_width-density': 'float64'
            })
            df11.drop(['std'], axis=1, inplace=True)

            df14 = df13[[
                'file', 'pre-trans_mean-density', 'post-trans_mean-density'
            ]].copy()  # copy to avoid SettingWithCopyWarning on the next line
            df14['pre-trans_mean-density'] = df14[
                'pre-trans_mean-density'].fillna(0)

            df11.dropna(inplace=True)

            le = preprocessing.LabelEncoder()
            df11['Target'] = le.fit_transform(df11['Target'])

            df15 = df11.merge(df14, how='inner', on='file')
            del df15['file']
            df15 = df15[[
                'pre-trans_mean', 'trans_mean', 'post-trans_mean',
                'transient_width', 'max', 'pre-trans_mean-density',
                'post-trans_mean-density', 'Target'
            ]]
            st.write(df15)
            col = df15.columns

            features = col.tolist()
            feature = features[:-1]
            target = features[-1]

            # x=dff_tr.loc[:,feature].values
            # y=dff_tr.loc[:,target].values
            x = df15.loc[:, feature].values
            y = df15.loc[:, target].values
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=98)
            ost = SMOTE()
            # Oversample the minority classes in the training split
            os_data_X, os_data_y = ost.fit_resample(x_train, y_train)
            clf_rf_bal = RandomForestClassifier(n_estimators=10,
                                                random_state=99)
            clf_rf_bal = clf_rf_bal.fit(os_data_X, os_data_y)

            #from sklearn.inspection import permutation_importance
            #results = permutation_importance(clf_rf_bal, x, y, scoring='accuracy')
            #importance = results.importances_mean
            # summarize feature importance
            #print('using permutaiton feature importance')
            #for i, v in enumerate(importance):
            #    print('Feature: %0d, Score: %.5f' % (i, v))
            #importance = clf_rf_bal.feature_importances_
            # summarize feature importance
            #print('using feature importance')
            #for i, v in enumerate(importance):
            #    print('Feature: %0d, Score: %.5f' % (i, v))

            bal_cm = confusion_matrix(y_test, clf_rf_bal.predict(x_test))
            y_pred_bal = clf_rf_bal.predict(x_test)

            print('balanced classification report')
            cls_rpt = classification_report(y_test, y_pred_bal)
            st.write(f'classification report : {cls_rpt}')

            bal_ac = accuracy_score(y_test, clf_rf_bal.predict(x_test))
            st.write(f'accuracy score : {bal_ac}')

            filename = 'finalized_model1.pkl'
            pickle.dump(clf_rf_bal,
                        open(os.path.join(os.getcwd(), filename), 'wb'))

    if choice == "Contaminant Prediction":
        st.subheader("Dataset")
        data_file = st.file_uploader("Upload CSV", type=['csv'])
        if st.button("Process") and data_file is not None:
            file_details = {
                "Filename": data_file.name,
                "FileType": data_file.type,
                "FileSize": data_file.size
            }
            st.write(file_details)

            df = pd.read_csv(data_file)
            st.dataframe(df)

            tag = str('.csv')  ## format to import
            initString = '-'  ## string in csv file name to search for category (normal, sludge, water, together)
            fileName = data_file.name
            # print(fileList)
            final_filelist = pd.DataFrame(columns=['file', 'Target'])
            res1 = fileName.find(initString)
            if res1 == -1:
                res1 = fileName.find('_')
            if res1 == -1:
                print(res1)
                res1 = 5
            c1 = int(res1 + 1)
            c5 = int(res1 + 12)

            wloc = fileName.rfind('W', c1, c5)
            sloc = fileName.rfind('S', c1, c5)
            tloc = fileName.rfind('T', c1, c5)
            finalCat = max([wloc, sloc, tloc])
            strCat = fileName[finalCat]

            st.write(f'FileName:{fileName}')
            if strCat not in ['S', 'T', 'W']:
                strCat = 'No Contaminant'
                st.write('No Contaminant')
            if strCat in ['S', 'T', 'W']:
                st.write('Contaminant Exists')
            if strCat == 'S':
                st.write('Type of Contaminant: Sludge')
            if strCat == 'W':
                st.write('Type of Contaminant: Water')
            if strCat == 'T':
                st.write('Type of Contaminant: Sludge+Water')

            df4 = pd.DataFrame()
            df4['roll_std'] = df['Pressure_tmp'].rolling(300).std()
            df5 = df4.describe().transpose()
            df5 = df5.reset_index()
            maxx = df5['max'][0]
            df9 = pd.DataFrame(columns=[
                'file', 'pre_trans_mean', 'trans_mean', 'post_trans_mean',
                'transient_width'
            ])

            # for col in df4.columns:
            # end = 0
            # print(col)  # file name
            # a = df4[col]
            a = df4['roll_std']
            st.write(a)

            b = a.quantile(0.7)  # threshold set here : 70 percentile
            # print(b)
            st.write(b)
            # x = df4[col] > b
            x = df4['roll_std'] > b  # find values greater than threshold
            # print(x.value_counts())
            # print(a)
            st.write(x)
            for i, j in enumerate(a):
                if j > b:      # first value above the threshold
                    start = i
                    break
            # resume scanning from the marked start position
            for k, l in enumerate(a[start:]):
                # the transient ends at the first value back below the
                # threshold, checked only after 200 positions
                if l < b and abs(k) > 200:
                    end = start + k
                    break

            file = data_file.name
            pre_trans_mean = (df4['roll_std'].iloc[:start].mean())
            trans_mean = (df4['roll_std'].iloc[start:end].mean())
            post_trans_mean = (df4['roll_std'].iloc[end:].mean())
            if (end - start) > 0:
                transient_width = end - start
            else:
                transient_width = 0

            df12 = pd.DataFrame()
            df12['roll_std_den'] = (df['Density'].rolling(300).std())

            df13 = pd.DataFrame()
            df13 = pd.DataFrame(columns=[
                'file', 'pre_trans_mean_dens', 'trans_mean_dens',
                'post_trans_mean_dens', 'transient_width_dens'
            ])

            # for col in df4.columns:
            # end = 0
            # print(col)  # file name
            # a = df4[col]
            p = df12['roll_std_den']

            q = p.quantile(0.7)  # threshold set here : 70 percentile
            # print(b)
            # st.write(b)
            # x = df4[col] > b
            xx = df12['roll_std_den'] > q  # find values greater than threshold
            # print(x.value_counts())
            # print(a)
            # st.write(xx)
            for i, j in enumerate(p):
                if j > q:           # first value above the threshold
                    start_dens = i
                    break
            # resume scanning from the marked start position
            for k, l in enumerate(p[start_dens:]):
                # the transient ends once values fall back below the
                # threshold, checked only after 200 positions
                if l < q and abs(k) > 200:
                    end_dens = start_dens + k
                    break

            pre_trans_mean_dens = df12['roll_std_den'].iloc[:start_dens].mean()
            trans_mean_dens = df12['roll_std_den'].iloc[
                start_dens:end_dens].mean()
            post_trans_mean_dens = df12['roll_std_den'].iloc[end_dens:].mean()
            if (end_dens - start_dens) > 0:
                transient_width_dens = end_dens - start_dens
            else:
                transient_width_dens = 0

            zz = {
                'file': file,
                'pre_trans_mean': pre_trans_mean,
                'trans_mean': trans_mean,
                'post_trans_mean': post_trans_mean,
                'pre_trans_mean_dens': pre_trans_mean_dens,
                'trans_mean_dens': trans_mean_dens,
                'post_trans_mean_dens': post_trans_mean_dens
            }

            pre_trans_mean1 = pre_trans_mean
            trans_mean1 = trans_mean
            post_trans_mean1 = post_trans_mean
            transient_width1 = transient_width
            max1 = maxx
            pre_trans_mean_dens1 = pre_trans_mean_dens
            post_trans_mean_dens1 = post_trans_mean_dens

            st.write(zz)
            # load the model from disk
            loaded_model = pickle.load(open('finalized_model1.pkl', 'rb'))

            result = loaded_model.predict([[
                pre_trans_mean1, trans_mean1, post_trans_mean1,
                transient_width1, max1, pre_trans_mean_dens1,
                post_trans_mean_dens1
            ]])

            if result == 0:
                st.write(f'Predicted Contaminant: Sludge')
            if result == 1:
                st.write(f'Predicted Contaminant: Water')
            if result == 2:
                st.write(f'Predicted Contaminant: Water+Sludge')
            if result == 3:
                st.write('No Contaminant')
Example #32
# define the 3072-1024-512-10 architecture using Keras
model = Sequential()
model.add(Dense(1024, input_shape=(3072,), activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dense(10, activation="softmax"))

# train the model using SGD
print("[INFO] training network...")
sgd = SGD(0.01)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
H = model.fit(trainX, trainY, validation_data=(testX, testY), epochs=100, batch_size=32)

# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=32)
print(classification_report(testY.argmax(axis=1), predictions.argmax(axis=1), target_names=labelNames))

# plot the training loss and accuracy
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(100), H.history["loss"], label="train_loss")
plt.plot(np.arange(100), H.history["val_loss"], label="val_loss")
plt.plot(np.arange(100), H.history["acc"], label="train_acc")
plt.plot(np.arange(100), H.history["val_acc"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()
plt.savefig(args["output"])
plt.show()
def evaluate_model(model, X_test, y_test, category_names):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=category_names))

from sklearn import datasets, svm, metrics
import matplotlib.pyplot as plt

digits = datasets.load_digits()

images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()
# image data machine learning
from sklearn import model_selection, svm, metrics
# Read the CSV file and convert each pixel of the image data into a real-valued vector
def load_csv(fname):
    labels = []
    images = []
    with open(fname, "r") as f:
        for line in f:
            cols = line.split(",")
            if len(cols) < 2: continue
            labels.append(int(cols.pop(0)))
            vals = list(map(lambda n: int(n) / 256, cols))
            images.append(vals)
    return {"labels":labels, "images":images}

data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

clf = svm.SVC()
clf.fit(data["images"], data["labels"])
predict = clf.predict(test["images"])
# Check the results
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("Accuracy =", ac_score)
print("Report =")
print(cl_report)
Example #36
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import VeriYukle
import numpy as np

print(' DECISION TREES '.center(50, '-'))

x_train, x_test, y_train, y_test = VeriYukle.egitimTestVeriSeti()

dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)

y_pred = dt.predict(x_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('Confusion matrix')
print(cm)

# Per-class one-vs-rest counts derived from the confusion matrix
FP = cm.sum(axis=0) - np.diag(cm)
FN = cm.sum(axis=1) - np.diag(cm)
TP = np.diag(cm)
TN = cm.sum() - (FP + FN + TP)

print('Accuracy values'.upper().center(50, '-'))
print(f"FP:{FP.sum()} FN:{FN.sum()} TP:{TP.sum()} TN:{TN.sum()}")
dogruluk = (TP.sum() + TN.sum()) / (TP.sum() + TN.sum() + FN.sum() +
                                    FP.sum()) * 100
print(f'Accuracy: {dogruluk} %')

print(classification_report(y_test, y_pred))
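Note that the one-vs-rest sums above differ from plain multiclass accuracy, which is the confusion-matrix diagonal over the total sample count. A quick cross-check against sklearn, reusing cm, y_test, and y_pred from above:

from sklearn.metrics import accuracy_score

# Multiclass accuracy = trace(cm) / total samples; both lines should agree.
print(np.trace(cm) / cm.sum() * 100)
print(accuracy_score(y_test, y_pred) * 100)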
from sklearn.datasets import fetch_mldata
from sklearn import linear_model
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

mnist = fetch_mldata('MNIST original')

X_train = mnist.data[:60000] / 255.0
Y_train = mnist.target[:60000]

X_test = mnist.data[60000:] / 255.0
Y_test = mnist.target[60000:]

Y_train[Y_train > 1.0] = 0.0
Y_test[Y_test > 1.0] = 0.0

clf = linear_model.LogisticRegression()
clf.fit(X_train, Y_train)

Y_pred = clf.predict(X_test)
print(classification_report(Y_test, Y_pred))

accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average="weighted")
recall = recall_score(Y_test, Y_pred, average="weighted")

print(accuracy, precision, recall)
Example #38
#yp=pd.read_csv(r'test.csv')
#yp=datos.to_numpy()
#yp=np.delete(yp, 0, axis=1)

#print(len(yp[1]))
from sklearn import model_selection
from sklearn.metrics import confusion_matrix

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33)

clasificador.fit(X_train, y_train)
print(len(X_train), len(X_test))
prediccion = clasificador.predict(X_test)
print(prediccion)
print()
print(confusion_matrix(y_test, prediccion))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, prediccion)))

# Run the tests 10 times

from sklearn.metrics import classification_report
print(' \n Classification report \n ')
print(
    classification_report(
        y_test,
        prediccion,
        target_names=['Class 0', 'Class 1', 'Class 2', 'Class 3']))
Example #39
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import sklearn

# generate the XOR data
tl = np.random.uniform(size=(100, 2)) + np.array([-2.0, 2.0])
tr = np.random.uniform(size=(100, 2)) + np.array([2.0, 2.0])
bl = np.random.uniform(size=(100, 2)) + np.array([-2.0, -2.0])
br = np.random.uniform(size=(100, 2)) + np.array([2.0, -2.0])
X = np.vstack([tl, tr, br, bl])
Y = np.hstack([[1] * len(tl), [-1] * len(tr), [1] * len(br), [-1] * len(bl)])

# split dataset
(trainData, testData, trainLabels,
 testLabels) = train_test_split(X, Y, test_size=0.25, random_state=42)

# train the linear SVM model
print('[RESULT] SVM w/ Linear Kernel')
model = SVC(kernel='linear')
model.fit(trainData, trainLabels)
print(classification_report(testLabels, model.predict(testData)))

print('[RESULT] SVM w/ Polynomial Kernel')
model = SVC(kernel='poly', degree=2, coef0=1)
model.fit(trainData, trainLabels)
print(classification_report(testLabels, model.predict(testData)))
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

rows, columns = df.shape
df.columns = ['TID', 'Text', 'Tag', 'Label']

# Label encoding: neu -> 0, neg -> 1, pos -> 2


text=[]
label=[]
for i in df.Text:
	text.append(i)
for i in df.Label:
	label.append(i)



X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.33, random_state=42)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100)),
                     ])

text_clf.fit(X_train, y_train)


predicted = text_clf.predict(X_test)

print(metrics.classification_report(y_test, predicted))
Example #41
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train_pca, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


###############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits"""
    pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        pl.subplot(n_row, n_col, i + 1)
        pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
        pl.title(titles[i], size=12)
        pl.xticks(())
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report


# In[106]:


# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(x_train,y_train)
predmnb = mnb.predict(x_test)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test,predmnb))
print("Score:",round(accuracy_score(y_test,predmnb)*100,2))
print("Classification Report:",classification_report(y_test,predmnb))


# **The performance score of the Naive Bayes classifier is 86.06. Since this is a high score, I will treat this model as my baseline.**

# # 5.4.2 Random Forest Classifier

# I chose a Random Forest Classifier because there is no simple linear relationship between our feature (text) and target (review_stars).
# For a Random Forest Classifier to make accurate class predictions, the trees of the forest, and more importantly their predictions, need to be uncorrelated (or at least have low correlations with each other).
# 
# Random forests are an ensemble learning method for classification. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees.
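
# As a quick illustration before the full evaluation code below, a minimal sketch of this comparison (assuming the same `x_train`/`x_test` feature splits used for the Naive Bayes baseline above; `n_estimators=100` and `random_state=42` are illustrative, untuned choices):

from sklearn.ensemble import RandomForestClassifier

rf_sketch = RandomForestClassifier(n_estimators=100, random_state=42)
rf_sketch.fit(x_train, y_train)
predrf = rf_sketch.predict(x_test)
print("Score:", round(accuracy_score(y_test, predrf) * 100, 2))
print("Classification Report:", classification_report(y_test, predrf))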

# In[107]:


# Random Forest
from sklearn import metrics

def display_classification_report(true_labels, predicted_labels, classes=[1, 0]):

    report = metrics.classification_report(y_true=true_labels, 
                                           y_pred=predicted_labels, 
                                           labels=classes) 
    print(report)
Example #44
# train the network
print("[INFO] training network...")
H = model.fit(trainX,
              trainY,
              validation_data=(testX, testY),
              batch_size=32,
              epochs=100,
              verbose=1)

# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=32)
print(
    classification_report(testY.argmax(axis=1),
                          predictions.argmax(axis=1),
                          target_names=["cat", "dog", "panda"]))

# plot the training loss and accuracy
plt.style.use("ggplot")
plt.figure()
plt.plot(np.arange(0, 100), H.history["loss"], label="train_loss")
plt.plot(np.arange(0, 100), H.history["val_loss"], label="val_loss")
plt.plot(np.arange(0, 100), H.history["acc"], label="train_acc")
plt.plot(np.arange(0, 100), H.history["val_acc"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()
#plt.show()
plt.savefig("output/shallownet_animals.png")
Example #45
        print("svm")
        svm_model.fit(svm_trainX_scaled, svm_trainy)
        print(svm_model.score(svm_testX_scaled, svm_testy))
        pred = svm_model.predict(svm_testX_scaled)

        score = indicator(pred, svm_testy)
        Accuracy, Precision, Recall, F_meature = score.getMetrics()
        Specific = score.getSpecific()
        TPR, FPR = score.getfprtpr()
        AUC, x, y = score.getAuc()
        sumx += x
        sumy += y
        MCC = score.getMCC()
        sumAccuracy += Accuracy
        print(classification_report(svm_testy, pred))
        print("Accuracy:", Accuracy)
        print("Precison:", Precision)
        print("Recall:", Recall)
        print("F-meature:", F_meature)
        print("Specific:", Specific)
        print("MCC:", MCC)
        print("AUC:", AUC)
        print("TPR:", TPR)
        print("FPR:", FPR)
        sumPrecision += Precision
        sumRecall += Recall
        sumF_meature += F_meature
        sumAUC += AUC
        sumSpecific += Specific
        sumMCC += MCC
dirpath = os.getcwd()
#"C:\\Users\\iAngelMx\\Documents\\GitHub\\nlp\\deteccionDeSentimientos"
[sampleTexts, y] = prepareRawText2Classify(dirpath,
                                           tipoRawText="review",
                                           reviewCategory=categoria)
print("(1-> Positive 0-> Negative)")
y = np.asarray(y)

# y <- labels for the texts
# X <- list of features
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(sampleTexts)
X = X_counts

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.5,
                                                    random_state=42)

import mord as m

clf = m.LogisticIT()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn import metrics
print("Precisión de prediccion: ", clf.score(X_test, y_test))
print("Matriz de confusión: \n", metrics.confusion_matrix(y_test, y_pred))
print("Classification report: \n",
      metrics.classification_report(y_test, y_pred))
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import classification_report


x_train, y_train, x_valid, y_valid, x_test, y_test = prepare_data(one_hot=False)

classifiers = [
    GaussianNB(),
    #  RidgeClassifier(tol=1e-2, solver="lsqr"),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=5),
    KNeighborsClassifier(3, n_jobs=-1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=-1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(gamma=2, C=1, probability=True),
    SVC(kernel="rbf", C=0.025, probability=True),
    MLPClassifier(alpha=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
]

for clf in classifiers:
    print('_' * 80)
    print(clf.__class__.__name__)
    clf.fit(x_train, y_train)
    print('Train/val/test accuracy: ', clf.score(x_train, y_train), clf.score(x_valid, y_valid), clf.score(x_test, y_test))
    print('Classification report of Test data')
    print(classification_report(y_test, clf.predict(x_test)))
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(dataset.drop('Purchased',axis=1))
scaled_features = scaler.transform(dataset.drop('Purchased',axis=1))

df_feat = pd.DataFrame(scaled_features,columns=['Age', 'EstimatedSalary','Male'])
df_feat.head()

#train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_feat,dataset['Purchased'],
                                                    test_size=0.30)

#import model
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

#training
classifier.fit(X_train, y_train)

#predicting
y_pred = classifier.predict(X_test)

#evaluation
from sklearn.metrics import classification_report,confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cr=classification_report(y_test,y_pred)
print("Confusion Matrix: \n",cm)
print("Classification Report: \n", cr)

print("Accuracy is micro avg in classification report")
Example #49
Test loss: 0.04145523567434866
Test accuracy: 0.9863

We define "most misclassified" as the 3 digits that have the lowest precision.
Precision: out of all digits that were classified as 'C', how many were actually 'C'?

For now these are digits 7, 0 and 4, ordered from worst to best.

"""

predictions = model.predict(x_test)
predictions = predictions.argmax(axis=1)
y_test2 = y_test.argmax(axis=1)

print(classification_report(y_test2, predictions, digits=5))
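
# A small sketch to extract the "most misclassified" digits programmatically
# instead of reading them off the report (assuming `y_test2` and `predictions`
# as computed above):
import numpy as np
from sklearn.metrics import precision_score

per_class_precision = precision_score(y_test2, predictions, average=None)
worst_three = np.argsort(per_class_precision)[:3]  # lowest precision first
print("Most misclassified digits (worst first):", worst_three)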


"""
For using mean squared error we get:

Test loss: 0.006500150103870692
Test accuracy: 0.9596

So it's worse than categorical cross-entropy cost.

Now the worst digits are 8, 3 and 0 in order of worst to best

Running it again gives:
Test loss: 0.007368972765450598
Test accuracy: 0.9501
"""
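
# Only the loss passed to compile() changes between the two runs compared
# above (a sketch, assuming the same Keras `model` architecture as before):
model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
# model.compile(loss='mean_squared_error', optimizer='adam',
#               metrics=['accuracy'])  # the MSE variant, which scored worse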
Example #50
        dimensions = class_names
    cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    fig, ax = plt.subplots(figsize=(10, 10))
    colors = ["orange", "green"]
    sn.heatmap(cmn,
               annot=True,
               fmt='.2f',
               xticklabels=dimensions,
               cmap=colors,
               yticklabels=dimensions)
    plt.title("Confusion Matrix")
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.show(block=False)

print(classification_report(y_true, y_pred))

print("\nAccuracy Metrics")

TP, FP, TN, FN = perf_measure(y_true, y_pred)

precision = TP / (TP + FP)
recall = TP / (TP + FN)

print('Accuracy score: ' + str(accuracy_score(y_true, y_pred)))
print("Precision Score: " + str(precision))
print("Recall: " + str(recall))

print("Logging...")

if record:
Example #51
0
    plt.tight_layout()

from sklearn.metrics import classification_report, confusion_matrix

# Make predictions and build the confusion matrix

Y_pred = model.predict(X_test)
print("Y_pred:", Y_pred)
y_pred = np.argmax(Y_pred, axis=1)
print("Y_pred:", y_pred)
''' 
                       # (or)

y_pred = model.predict_classes(X_test)
print("Y_pred:",y_pred)
'''
p = model.predict_proba(X_test)  # to predict probability

target_names = ['class 0(Flowers)', 'class 1(Dogs)']
print(
    classification_report(np.argmax(Y_test, axis=1),
                          y_pred,
                          target_names=target_names))
print(confusion_matrix(np.argmax(Y_test, axis=1), y_pred))

# Save the model

fname = "weights-Test-CNN.hdf5"
model.save(fname, overwrite=True)
import itertools

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


def calc_acc_proba(df_acc,
                   df_proba,
                   subid,
                   mask_name,
                   X,
                   ev_labels,
                   run_labels,
                   cv,
                   multi_class='ovr',
                   univariate_fsel_k=None,
                   pca_n=None,
                   upsampling=False,
                   undersampling=False,
                   conf_mat=False,
                   cm_group=None,
                   print_report=False,
                   compute_AUC=False,
                   df_auc=None,
                   cv_C=False,
                   repeated_ttest_fsel=None):
    """Classify 2D data & return accuracy + probabilities for each class
    ----------
    df_acc : pandas dataframe
        Must have columns=['subid', 'mask_name', 'category', 'classifier', 'accuracy'])
    df_proba : pandas dataframe
        Must have columns=['subid', 'mask_name', 'true_category', 'guess_category', 'classifier', 'probability'])
    subid : str
        Subject ID (e.g., 'ap01')
    mask_name : str
        Name of mask used when calling get_subj_data
    X : 2D numpy array
        Selected BOLD data (sample x voxel) for classification
    ev_labels : list/array of strings
        condition labels (length = # of samples)
    run_labels : list/array of ints
        run label by which to perform cross-validation (length = # of samples)
    cv : cross-validation generator (e.g., LeaveOneLabelOut(run_labels))
    multi_class : str
        In multiclass case, training uses one-vs-rest ('ovr') or multinomial ('multinomial')
    univariate_fsel_k : int
        Option to perform univariate (ANOVA) feature selection based on the training data; 
        take the k best features
    pca_n : int
        Option to perform PCA on the training set to reduce the number of features
    upsampling : bool
        Option to over-sample using SMOTE to deal with class imbalance
    undersampling : bool
        Option to under-sample using random under-sampling (randomly pick samples without 
        replacement) to deal with class imbalance
    cv_C : bool
        Option to select C via CV; only works for multinomial LR (multi_class = 'multinomial')
    repeated_ttest_fsel : int
        Option to select k features for each combination of t-tests (None otherwise)

    Returns
    -------
    df_acc : pandas dataframe
        Must have columns=['subid', 'mask_name', 'category', 'classifier', 'accuracy', 'count'])
    df_proba : pandas dataframe
        Must have columns=['subid', 'mask_name', 'true_category', 'guess_category', 'classifier', 'probability'])
    df_auc : pandas dataframe
        Must have columns=['subid', 'mask_name', 'category', 'classifier', 'auc'])
    """

    # quick double check
    if repeated_ttest_fsel and univariate_fsel_k:
        print('Cannot have both repeated_ttest_fsel and univariate_fsel_k; one needs to be set to None')
        return

    # Determine classifier
    if multi_class == 'ovr':
        lr_classifier = LogisticRegression(penalty='l2', C=1.)
    elif multi_class == 'multinomial':
        lr_classifier = LogisticRegression(penalty='l2',
                                           C=1.,
                                           multi_class='multinomial',
                                           solver='newton-cg')
    elif multi_class == 'balanced':  #useful if classes are unbalanced
        lr_classifier = LogisticRegression(penalty='l2',
                                           C=1.,
                                           class_weight='balanced')
    elif multi_class == 'KNeighbors':
        lr_classifier = KNeighborsClassifier(weights='distance')
    elif multi_class == 'BaggingClassifier':
        lr_classifier = BaggingClassifier(LogisticRegression(penalty='l2',
                                                             C=1.),
                                          max_samples=0.5,
                                          max_features=0.5)
    elif multi_class == 'GradientBoosting':
        lr_classifier = GradientBoostingClassifier(n_estimators=100)
    elif multi_class == 'GradientBoosted_LR':
        print('Add in the code for this classifier!')
        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html
    elif multi_class == 'AdaBoost':
        lr_classifier = AdaBoostClassifier(n_estimators=100)
    elif multi_class == 'MLP':  #multilayer perceptron
        lr_classifier = MLPClassifier(solver='lbfgs',
                                      random_state=1,
                                      hidden_layer_sizes=(100, 100, 50))
    else:
        print('Need a classifier!')
        return

    dummy_classifier = DummyClassifier()

    # initialize confusion matrix
    if conf_mat:
        num_cond = len(np.unique(ev_labels))
        cm_sub = np.zeros([num_cond, num_cond], dtype=int)

    for class_type, classifier in zip(['logreg', 'chance'],
                                      [lr_classifier, dummy_classifier]):

        # Calculate C in a CV manner, if requested
        if cv_C and multi_class == 'multinomial':
            calc_c = LogisticRegressionCV(penalty='l2',
                                          cv=cv,
                                          multi_class='multinomial',
                                          solver='newton-cg')
            calc_c.fit(X, ev_labels)
            print('Setting C to: ' + str(calc_c.C_))
            lr_classifier = LogisticRegression(penalty='l2',
                                               C=calc_c.C_,
                                               multi_class='multinomial',
                                               solver='newton-cg')

        # Go through cross-validation loops
        for train, test in cv:

            # univariate feature selection? t-test version comes later...
            if univariate_fsel_k:
                fsel = SelectKBest(f_classif, k=univariate_fsel_k).fit(
                    X[train], ev_labels[train])
                X_train = fsel.transform(X[train])
            else:
                X_train = X[train]

            # Feature decomposition?
            if pca_n:
                pca = PCA(n_components=pca_n).fit(X_train)
                X_train = pca.transform(X_train)

            # over/under sampling to balance classes during training?
            if upsampling:
                # Synthetic Minority Over-sampling Technique
                sm = SMOTE(random_state=42)
                X_train, ev_labels_train = sm.fit_sample(
                    X_train, ev_labels[train])
                # print X_train.shape
                # print ev_labels_train
            elif undersampling:
                rus = RandomUnderSampler(random_state=42, replacement=False)
                X_train, ev_labels_train = rus.fit_sample(
                    X_train, ev_labels[train])
            else:
                ev_labels_train = ev_labels[train]

            # If running feature selection using lowest pvals from combinations of classes
            if repeated_ttest_fsel:
                pvals = []  # initialize list for pvals across all combos

                for i, combo in enumerate(
                        itertools.combinations(list(set(ev_labels_train)), 2)):
                    print(i, combo)

                    # figure out which samples are of interest
                    mask = np.in1d(ev_labels_train, combo)

                    # get pvals, add on to pvals list
                    fval, pval = f_classif(X_train[mask],
                                           ev_labels_train[mask])
                    pvals.extend(list(pval.argsort()[:repeated_ttest_fsel]))

                # Now just grab relevant features from training data
                selected_voxels = list(set(pvals))
                print('Total of ' + str(len(selected_voxels)) + ' voxels.')
                X_train = X_train[:, selected_voxels]
                print(X_train.shape)

            # Fit classifier w/training data & labels
            classifier.fit(X_train, ev_labels_train)

            # Now prepare for testing!
            if univariate_fsel_k:
                X_test = fsel.transform(X[test])
            else:
                X_test = X[test]

            if repeated_ttest_fsel:
                X_test = X_test[:, selected_voxels]
                print(X_test.shape)

            if pca_n:
                X_test = pca.transform(X_test)

            # update confusion matrix if necessary
            if conf_mat and class_type != 'chance':
                y_pred = classifier.predict(X_test)
                cm_fold = confusion_matrix(ev_labels[test], y_pred)
                cm_sub = np.sum([cm_sub, cm_fold], axis=0)

            if print_report:
                y_pred = classifier.predict(X_test)
                print(
                    classification_report(ev_labels[test],
                                          y_pred,
                                          target_names=classifier.classes_))

            # get logits for all trials
            if compute_AUC and class_type != 'chance':
                y_score = classifier.decision_function(X_test)

            # Iterate through each class to get acc, proba, etc.
            for i, category in enumerate(classifier.classes_):

                # Get indices for the true category
                cat_ind = ev_labels[test] == category

                # if this trial exists
                if sum(cat_ind) > 0:
                    # Determine accuracy (TPR)
                    acc = classifier.score(X_test[cat_ind],
                                           ev_labels[test][cat_ind])

                    if compute_AUC and class_type != 'chance':
                        if len(classifier.classes_) > 2:
                            auc = roc_auc_score(cat_ind, y_score[:, i])
                        else:
                            auc = roc_auc_score(cat_ind, y_score)
                        row = {
                            'subid': subid,
                            'mask_name': mask_name,
                            'category': category,
                            'classifier': class_type,
                            'auc': auc
                        }
                        df_auc = df_auc.append(
                            pd.DataFrame.from_dict({0: row}, orient='index'))

                    # Determine probabilities & save out probabilities for each category guessed
                    probabilities = classifier.predict_proba(
                        X_test[cat_ind]).T  #class x sample
                    prob_byclass = np.mean(
                        probabilities, axis=1
                    )  # mean probability for each class for these samples

                    for guess_cat in classifier.classes_:
                        proba = prob_byclass[classifier.classes_ == guess_cat][
                            0]  # select the relevant column

                        row = {
                            'subid': subid,
                            'mask_name': mask_name,
                            'true_category': category,
                            'guess_category': guess_cat,
                            'classifier': class_type,
                            'probability': proba
                        }

                        df_proba = df_proba.append(
                            pd.DataFrame.from_dict({0: row}, orient='index'))
                else:
                    print('Nothing to score!')
                    acc = np.nan

                row = {
                    'subid': subid,
                    'mask_name': mask_name,
                    'category': category,
                    'classifier': class_type,
                    'accuracy': acc,
                    'count': sum(cat_ind)
                }

                df_acc = df_acc.append(
                    pd.DataFrame.from_dict({0: row}, orient='index'))

        # save confusion matrix, once iterated through CV folds
        if conf_mat and class_type != 'chance':
            print(classifier.classes_)

            print('Confusion matrix (raw counts):')
            print(cm_sub)

            # normalize, and add to group matrix
            cm_sub = cm_sub.astype('float') / cm_sub.sum(axis=1)[:, np.newaxis]
            print('Confusion matrix (normalized):')
            print(cm_sub)

            cm_group = np.append(cm_group, [cm_sub], axis=0)

    # Return calculations
    if compute_AUC:
        if conf_mat:
            return df_acc, df_proba, df_auc, cm_group
        else:
            return df_acc, df_proba, df_auc
    else:
        if conf_mat:
            return df_acc, df_proba, cm_group
        else:
            return df_acc, df_proba
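
# A hypothetical usage sketch with synthetic data (LeaveOneGroupOut stands in
# for the deprecated LeaveOneLabelOut generator the docstring mentions; note
# that DataFrame.append requires pandas < 2.0):
import numpy as np
import pandas as pd
from sklearn.model_selection import LeaveOneGroupOut

X_demo = np.random.randn(100, 500)          # 100 samples x 500 voxels
ev_demo = np.array(['face', 'scene'] * 50)  # 2 conditions
run_demo = np.repeat([1, 2, 3, 4], 25)      # 4 runs for cross-validation
cv_demo = list(LeaveOneGroupOut().split(X_demo, ev_demo, groups=run_demo))

df_acc = pd.DataFrame(columns=['subid', 'mask_name', 'category', 'classifier', 'accuracy', 'count'])
df_proba = pd.DataFrame(columns=['subid', 'mask_name', 'true_category', 'guess_category', 'classifier', 'probability'])

df_acc, df_proba = calc_acc_proba(df_acc, df_proba, 'ap01', 'demo_mask',
                                  X_demo, ev_demo, run_demo, cv_demo)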
Example #53
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)


# ## classification report

# In[61]:


from sklearn.metrics import classification_report


# In[82]:


print(classification_report(messages['label'],all_predictions))


# ## Train Test Split

# In[63]:


from sklearn.model_selection import train_test_split 


# In[72]:


from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# model
from sklearn.ensemble import RandomForestClassifier
# eval
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

raw_wine = datasets.load_wine()

X = raw_wine.data
y = raw_wine.target

X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=42)

std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

clf_rf = RandomForestClassifier(max_depth=2, random_state=42)
clf_rf.fit(X_tn_std, y_tn)

rf_pred = clf_rf.predict(X_te_std)

acc = accuracy_score(y_te, rf_pred)
print('acc : ', acc)

conf_matrix = confusion_matrix(y_te, rf_pred)
print('confusion matrix : ', conf_matrix)

class_report = classification_report(y_te, rf_pred)
print(class_report)
#Begin Pipeline Setup
select = SelectKBest()
bNb = BernoulliNB()
steps = [("feature_selection", select), ("bernouli_nb", bNb)]
pipeNb = Pipeline(steps)

paraGridBnb = dict(feature_selection__k=[20,25,30])

gsBnb = GridSearchCV(pipeNb, param_grid=paraGridBnb, scoring="f1_micro", n_jobs=-1)

gsBnb.fit(X_trainBnb, y_trainBnb)

BnbPreds = gsBnb.predict(X_testBnb)

BnbReport  = classification_report(BnbPreds, y_testBnb)
BnbScore = accuracy_score(BnbPreds, y_testBnb)
BnbMatrix = confusion_matrix(BnbPreds, y_testBnb)

bestModelBnb = gsBnb.best_estimator_
import joblib  # sklearn.externals.joblib was removed; use the standalone package
joblib.dump(bestModelBnb, 'ZIP_BnbBestModel.pkl', compress=9)

#==============================================================================
# If I had done feature selection, I believe the features removed for BernoulliNB
# would have had higher feature importances. Will need to revisit.
#              precision    recall  f1-score   support
# 
#           1       0.35      0.33      0.34      9042
#           2       0.00      0.11      0.00        18
#           3       0.01      0.31      0.02       282
Example #56
params = {'n_neighbors': [i for i in range(1, 30)],
          'weights': ['uniform', 'distance'],
          'p': [1, 2]}
gcv = GridSearchCV(knn, params, scoring='accuracy', cv=6)  # cv: split the data into 6 folds
gcv.fit(X_train, y_train)

# Inspect the best parameter combination found by GridSearchCV
# print(gcv.best_params_)     # best parameters
# print(gcv.best_estimator_)  # best estimator
# print(gcv.best_score_)      # best score

# Predicting with gcv directly gives the same result; compute the accuracy
# y_ = gcv.predict(X_test)
# print((y_ == y_test).mean())
# print(gcv.score(X_test, y_test))
# print(accuracy_score(y_test, y_))

# Take the best model and use it for prediction
knn_best = gcv.best_estimator_
y_ = knn_best.predict(X_test)
# print(accuracy_score(y_test, y_))  # best score

# print(pd.crosstab(index=y_test, columns=y_, rownames=['True'], colnames=['Predict'], margins=True))
# print(y_test.value_counts())      # true label counts
# print(Series(y_).value_counts())  # predicted label counts
# print(confusion_matrix(y_test, y_))
# print(np.round(6/9, 2))
# precision, recall, and f1-score (the harmonic mean of precision and recall)
print(classification_report(y_test, y_, target_names=['B', 'M']))
# Predicting the results for our test dataset
predicted_values = lr.predict(X_test)

# Printing the residuals: difference between real and predicted
for (real, predicted) in list(zip(y_test, predicted_values)):
    print(
        f'Value: {real}, pred: {predicted} {"is different" if real != predicted else ""}'
    )

# Printing accuracy score(mean accuracy) from 0 - 1
print(f'Accuracy score is {lr.score(X_test, y_test):.2f}/1 \n')

# Printing the classification report
from sklearn.metrics import classification_report, confusion_matrix, f1_score
print('Classification Report')
print(classification_report(y_test, predicted_values))

# Printing the classification confusion matrix (diagonal is true)
print('Confusion Matrix')
print(confusion_matrix(y_test, predicted_values))
print('Overall f1-score')
print(f1_score(y_test, predicted_values, average="macro"))

# #Printing the colormap
# from matplotlib.colors import ListedColormap
# from sklearn import neighbors, datasets
# # Create color maps for 3-class classification problem, as with wine
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
#
# wine = load_wine()
Example #58
def ModelEvaluator(model_name, data):
    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    from matplotlib.ticker import FixedLocator, FixedFormatter
    import numpy as np

    # partition processed data into vectors
    actualClass = data.label
    predictedClass = data.predicted
    probability = data.probability 
    
    #build a confusion matrix
    cm = confusion_matrix(actualClass, predictedClass, labels = [0,1])
    
    TruePositive = cm[1, 1]
    TrueNegative = cm[0,0]
    FalsePositive = cm[0,1]
    FalseNegative = cm[1,0]
    
    numberOfPositives = TruePositive + FalseNegative
    numberOfNegatives = TrueNegative + FalsePositive
    
    #calculate the Null accuracy
    null_accuracy = 1 - actualClass.mean()
    
    #define the model accuracy
    model_accuracy = metrics.accuracy_score(actualClass, predictedClass)
    
    #Generate a metrics report
    report = metrics.classification_report(actualClass, predictedClass, output_dict = True)
  
    #calculate the model performance over the null accuracy
    performance_over_null = model_accuracy - null_accuracy
    
    #Calculate the Specificity of the model 
    specificity = TrueNegative / (TrueNegative + FalsePositive)
    
    #Calculate the true positive rate, false positive rate, and thresholds to plot a ROC curve
    fpr, tpr, thresholds = metrics.roc_curve(actualClass, probability)
    
    #Calculate the Area under the ROC Curve
    rocAuc = metrics.roc_auc_score(actualClass, probability)
    
    #Calculate the Matthews Correlation Coefficient
    mcc = metrics.matthews_corrcoef(actualClass, predictedClass)
    
    #generate figure
    fig = plt.figure(figsize = (10, 5))
    spec = gridspec.GridSpec(ncols=2, nrows=2, wspace=0.5, hspace = 0.8, width_ratios=[1, 1], height_ratios = [1, 20],  figure=fig)
    
    text = fig.add_subplot(spec[0,0])
    text.axis('off')
    text.set_title('%s' % (model_name), fontweight = 'bold', fontsize = 16) 
    text.text(0,0,'The performance of this model over the null accuracy is %2.2f%%\nModel Sensitivity: %2.6f%% \nModel Specificity: %2.6f%% \nModel F1 Score: %2.6f \nMatthews Correlation Coefficient: %2.6f' 
      % ((performance_over_null *100), (report['1.0']['recall']*100), (specificity*100), (report['1.0']['f1-score']), mcc), bbox=dict(facecolor='white'), verticalalignment="top")
    
    
    #plot confusion matrix in pos 0,0
    confusionMatrixLabels = ['Normal Traffic', 'Intrusion']
    confusionMatrixColourMap = plt.cm.Blues
    confusionMatrix = fig.add_subplot(spec[1,0])
    confusionMatrix.set_aspect('equal')
    confusionMatrix.imshow(cm, interpolation = 'nearest', cmap = confusionMatrixColourMap)
    confusionMatrix.set(ylabel ='True class', xlabel ='Predicted class')
    #confusionMatrix.xlabel(labelpad=5)
            
    confusionMatrix.set_xticks(np.arange(0,2))
    formatter = FixedFormatter(['Normal Traffic', 'Intrusion'])
    locator = FixedLocator([0,1])
    
    confusionMatrix.yaxis.set_major_formatter(formatter)
    confusionMatrix.yaxis.set_major_locator(locator)
    confusionMatrix.xaxis.set_major_formatter(formatter)
    confusionMatrix.xaxis.set_major_locator(locator)
    
    #confusionMatrix.set_yticks(np.arange(0,2))
    #confusionMatrix.set_xticklabels(np.arange(0,1), confusionMatrixLabels, fontdict = None)
    
    tot = sum(data.label)
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            confusionMatrix.text(j, i, (format(cm[i, j])),ha ='center', va="baseline", color="white" if cm[i,j] > (0.5*tot) else 'black', size = 'larger')
    
   
    cmLabels = ['TN', 'FP', 'FN', 'TP' ]
    a = 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            confusionMatrix.text(j + 0.3, i + 0.4, (cmLabels[a]), ha ='center', va="baseline", color="white" if cm[i,j] >(0.5*tot) else 'black', size = 'larger')
            if a < 4:
                a += 1
    a=0          
    for i in range(cm.shape[a]):
        if a == 0:
            confusionMatrix.text(j+0.8, i, ('Total:\n %d' % (numberOfNegatives)), ha ='center', va="center", color = 'black', size = 'larger')
            a += 1
        else:
            confusionMatrix.text(j+0.8, i, ('Total:\n %d' % (numberOfPositives)), ha ='center', va="center", color = 'black', size = 'larger')
    
    #plot roc curve in position 0,1 
    rocCurve = fig.add_subplot(spec[1, 1])
    rocCurve.set_aspect('equal')
    rocCurve.plot(fpr, tpr, color='red', lw=2, label = 'ROC area = %0.5f)' % rocAuc )
    rocCurve.set(xlabel = 'False Positive Rate (1-Specifcity)', ylabel = 'True Positive Rate (Sensitivity)' )
    rocCurve.legend(loc="lower right")
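
# A hypothetical usage sketch (the function indexes the report with the key
# '1.0', so the labels must be float-typed 0.0/1.0; values are illustrative):
import pandas as pd

scored = pd.DataFrame({
    'label':       [0.0, 0.0, 1.0, 1.0, 0.0, 1.0],
    'predicted':   [0.0, 1.0, 1.0, 1.0, 0.0, 0.0],
    'probability': [0.10, 0.60, 0.90, 0.80, 0.20, 0.40],
})
ModelEvaluator('Demo Model', scored)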
import pickle
import random
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

data = pickle.load(pick_in)
pick_in.close()
random.shuffle(data)
features = []
labels = []

#Split the elements in data into features and labels
for feature, label in data:
    features.append(feature)
    labels.append(label)

#Split the data into train (70%) and test data (30%)
xtrain, xtest, ytrain, ytest = train_test_split(features,
                                                labels,
                                                test_size=0.3)

decision_trees_model = tree.DecisionTreeClassifier()
decision_trees_model.fit(xtrain, ytrain)
prediction = decision_trees_model.predict(xtest)
score = decision_trees_model.score(xtest, ytest)

print(classification_report(ytest, prediction))
print("depth: ", decision_trees_model.get_depth())
print("prediction", prediction)
print("Testing accuracy ", score)
print("Numpy accuracy ", np.mean(ytest == prediction))

#Save the model to the file 'decision_trees_model.sav'
pick = open('decision_trees_model.sav', 'wb')
pickle.dump(decision_trees_model, pick)
pick.close()
print(Y_test.value_counts())
print()

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)

lr=LogisticRegression()
sgdc=SGDClassifier()
lr.fit(X_train,Y_train)
lr_y_predict=lr.predict(X_test)

sgdc.fit(X_train,Y_train)
sgdc_y_predict=sgdc.predict(X_test)


from sklearn.metrics import classification_report

print('Accuracy of LR Classifier:', lr.score(X_test, Y_test))

print(classification_report(Y_test, lr_y_predict, target_names=['Benign', 'Malignant']))


print('Accuracy of SGD Classifier:', sgdc.score(X_test, Y_test))

print(classification_report(Y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))