Example #1
def benchmark(clf):
    print 80 * '_'
    print "Training: "
    print clf
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print "train time: %0.3fs" % train_time

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print "test time:  %0.3fs" % test_time

    score = metrics.f1_score(y_test, pred)
    print "f1-score:   %0.3f" % score

    if hasattr(clf, 'coef_'):
        nnz = clf.coef_.nonzero()[0].shape[0]
        print "non-zero coef: %d" % nnz
        print

    if print_report:
        print "classification report:"
        print metrics.classification_report(y_test, pred,
                                            target_names=categories)

    if print_cm:
        print "confusion matrix:"
        print metrics.confusion_matrix(y_test, pred)

    print
    return score, train_time, test_time
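For comparison, here is a minimal sketch of the same benchmarking pattern against the modern scikit-learn API; it assumes pre-split X_train, y_train, X_test, y_test arrays as in the original.

# Hedged sketch: Python 3 / modern scikit-learn version of benchmark().
# Assumes X_train, y_train, X_test, y_test already exist.
from time import time
from sklearn import metrics

def benchmark_modern(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)

    # average='macro' is an assumption; the old default averaged differently
    score = metrics.f1_score(y_test, pred, average='macro')
    print("f1-score:   %0.3f" % score)
    return score, train_time, test_time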
Example #2
def benchmark(clf_class, params, name):
    print "parameters:", params
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print "done in %fs" % (time() - t0)

    if hasattr(clf, 'coef_'):
        print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)

    print "Predicting the outcomes of the testing set"
    t0 = time()
    pred = clf.predict(X_test)
    print "done in %fs" % (time() - t0)
    
    print "Classification report on test set for classifier:"
    print clf
    print
    print classification_report(y_test, pred, target_names=news_test.target_names)
    
    cm = confusion_matrix(y_test, pred)
    print "Confusion matrix:"
    print cm
    
    # Show confusion matrix
    pl.matshow(cm)
    pl.title('Confusion matrix of the %s classifier' % name)
    pl.colorbar()
Example #3
def benchmark(clf_class, params, name):
    print "parameters:", params
    t0 = time()
    clf = clf_class(**params).fit(X_train, y_train)
    print "done in %fs" % (time() - t0)

    if hasattr(clf, 'coef_'):
        print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100)

    print "Predicting the outcomes of the testing set"
    t0 = time()
    pred = clf.predict(X_test)
    print "done in %fs" % (time() - t0)
    
    print "Classification report on test set for classifier:"
    print clf
    print
    print classification_report(y_test, pred, target_names=news_test.target_names)
    
    cm = confusion_matrix(y_test, pred)
    print "Confusion matrix:"
    print cm
    
    # Show confusion matrix
    pl.matshow(cm)
    pl.title('Confusion matrix of the %s classifier' % name)
    pl.colorbar()
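The pl.matshow call above uses the old pylab interface; a rough modern equivalent for the end of this function, assuming scikit-learn >= 0.22 for ConfusionMatrixDisplay, might look like:

# Hedged sketch: plot the confusion matrix cm with current APIs.
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.title('Confusion matrix of the %s classifier' % name)
plt.show()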
Example #4
def train(*filenames):
    """Returns a classifier that """
    data = None
    answers = None
    all_images = []
    for filename in filenames:
        print filename
        if not (os.path.exists(filename) and os.path.exists(filename + '.code')):
            return False
        keys = getTrainingKey(filename + '.code')
        images = getImDictFromImage(filename)
        this_data = images.reshape(images.shape[0], -1)
        this_answers = numpy.array(keys)
        all_images.extend(images)
        if data is None:
            answers = this_answers
            data = this_data
        else:
            data = numpy.concatenate([data, this_data], 0)
            answers = numpy.concatenate([answers, this_answers], 0)

    print 'image shape', images.shape
    print 'data shape', data.shape
    print 'answers shape', answers.shape

    from scikits.learn import svm
    from scikits.learn.metrics import classification_report
    from scikits.learn.metrics import confusion_matrix
    classifier = svm.SVC()

    divider = 400

    classifier.fit(data[:divider], answers[:divider])

    expected = answers[divider:]
    predicted = classifier.predict(data[divider:])

    print "check:"
    print classifier
    print 'predicted', predicted
    print
    print classification_report(expected, predicted)

    print confusion_matrix(expected, predicted)
    print 'len of all_images:', len(all_images)

    for index, (image, prediction) in enumerate(
            zip(all_images[divider:], predicted)[:25]):
        #for index, (image, prediction) in enumerate(zip(all_images, answers)[50:75]):
        print index, prediction

        pylab.subplot(5, 5, index + 1)
        pylab.imshow(image, cmap=pylab.cm.gray_r)
        pylab.title('Prediction: ' + numToTile(prediction))

    pylab.show()
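Both train() variants split with a hard-coded divider = 400; a sketch of the same split done with train_test_split (modern sklearn namespace, shuffled rather than the original's ordered split) could be:

# Hedged sketch: replace the fixed divider with a randomized split.
from sklearn.model_selection import train_test_split

data_train, data_test, ans_train, ans_test = train_test_split(
    data, answers, test_size=0.25, random_state=0)
classifier.fit(data_train, ans_train)
predicted = classifier.predict(data_test)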
Example #5
def train(*filenames):
    """Returns a classifier that """
    data = None
    answers = None
    all_images = []
    for filename in filenames:
        print filename
        if not (os.path.exists(filename) and os.path.exists(filename+'.code')):
            return False
        keys = getTrainingKey(filename+'.code')
        images = getImDictFromImage(filename)
        this_data = images.reshape(images.shape[0], -1)
        this_answers = numpy.array(keys)
        all_images.extend(images)
        if data is None:
            answers = this_answers
            data = this_data
        else:
            data = numpy.concatenate([data, this_data], 0)
            answers = numpy.concatenate([answers, this_answers], 0)

    print 'image shape', images.shape
    print 'data shape', data.shape
    print 'answers shape', answers.shape

    from scikits.learn import svm
    from scikits.learn.metrics import classification_report
    from scikits.learn.metrics import confusion_matrix
    classifier = svm.SVC()

    divider = 400

    classifier.fit(data[:divider], answers[:divider])


    expected = answers[divider:]
    predicted = classifier.predict(data[divider:])

    print "check:"
    print classifier
    print 'predicted', predicted
    print
    print classification_report(expected, predicted)

    print confusion_matrix(expected, predicted)
    print 'len of all_images:', len(all_images)

    for index, (image, prediction) in enumerate(zip(all_images[divider:], predicted)[:25]):
    #for index, (image, prediction) in enumerate(zip(all_images, answers)[50:75]):
        print index, prediction

        pylab.subplot(5, 5, index+1)
        pylab.imshow(image, cmap=pylab.cm.gray_r)
        pylab.title('Prediction: '+numToTile(prediction))

    pylab.show()
Example #6
def evaluate(model, testX, testY, testTitles=None, testLabels=None):
    """ Shows all the performance of `model` at predicting
        the testY from the testX
    """

    dir = '/CurrentPorjects/LatentDirichletAllocation/data/NIPS1-17/'
    cnamesf = open(os.path.join(dir, 'NIPS_category_names.txt'))
    names_of_categories = dict(enumerate(
        [w.strip().split(" ", 1)[1] for w in cnamesf.readlines()]))
    cnamesf.close()
    n_of_cats = names_of_categories

    lnamesf = open(os.path.join(dir, 'NIPS_label_names.txt'))
    label_names = dict(enumerate([w.strip() for w in lnamesf.readlines()]))
    lnamesf.close()
    predicted = model.predict(testX)

    print metrics.confusion_matrix(testY, predicted)
    print metrics.classification_report(testY, predicted, target_names=names_of_categories.values()[1:9])

    size = len(testY)

    if testTitles is not None:
        
        if testLabels is not None:
            label_heading = "NIPS Label"
        else:
            label_heading = ""
        
        print ("_"*80),                "_________", "_________"
        print "Paper title".ljust(80), "predicted", "true     ", label_heading
        print ("_"*80),                "_________", "_________"
        for i in range(0, size):
            pred = predicted[i]
            true = testY[i]
            if pred != true:
                title = testTitles[i][0:80].ljust(80)
                pred_str = n_of_cats[pred][0:9].ljust(9)
                true_str = n_of_cats[true][0:9].ljust(9)
                if testLabels is not None:
                    label_id = testLabels[i]
                    label = label_names[label_id]
                else:
                    label = ""
                print title, pred_str, true_str, label
Example #7
def benchmark(clf):
    print 80 * '_'
    print "Training: "
    print clf
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print "train time: %0.3fs" % train_time

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print "test time:  %0.3fs" % test_time

    score = metrics.f1_score(y_test, pred)
    print "f1-score:   %0.3f" % score

    if hasattr(clf, 'coef_'):
        nnz = clf.coef_.nonzero()[0].shape[0]
        print "non-zero coef: %d" % nnz

        if opts.print_top10:
            print "top 10 keywords per class:"
            for i, category in enumerate(categories):
                top10 = np.argsort(clf.coef_[i, :])[-10:]
                print trim("%s: %s" % (category, " ".join(vocabulary[top10])))
        print

    if opts.print_report:
        print "classification report:"
        print metrics.classification_report(y_test, pred,
                                            target_names=categories)

    if opts.print_cm:
        print "confusion matrix:"
        print metrics.confusion_matrix(y_test, pred)

    print
    return score, train_time, test_time
Example #8
  def _svm():

    clf = svm.SVC()

    t = time.time()
    clf.fit(data_train, label_train)
    print "SVM: time elapsed in fitting: %f secs" % (time.time()-t)

    t = time.time()
    predicted = clf.predict(data_test)
    print "SVM: time elapsed in predicting: %f secs" % (time.time()-t)

    print "Classification report for SVM:\n%s\n" % (metrics.classification_report(label_test, predicted))
    print "Confusion matrix:\n%s" % metrics.confusion_matrix(label_test, predicted)
Example #9
    def _svm():

        clf = svm.SVC()

        t = time.time()
        clf.fit(data_train, label_train)
        print "SVM: time elapsed in fitting: %f secs" % (time.time() - t)

        t = time.time()
        predicted = clf.predict(data_test)
        print "SVM: time elapsed in predicting: %f secs" % (time.time() - t)

        print "Classification report for SVM:\n%s\n" % (
            metrics.classification_report(label_test, predicted))
        print "Confusion matrix:\n%s" % metrics.confusion_matrix(
            label_test, predicted)
Example #10
  def opf():

    # OPF only supports 32-bit labels at the moment
    label_train_32 = label_train.astype(numpy.int32)
    label_test_32  = label_test.astype(numpy.int32)

    O = libopf_py.OPF()

    t = time.time()
    O.fit(dist_train, label_train_32, precomputed_distance=True)
#    O.fit(dist_train, label_train_32, precomputed_distance=True, learning="agglomerative", split=0.8)
    print "OPF: time elapsed in fitting: %f secs" % (time.time()-t)

    t = time.time()
    predicted = O.predict(dist_test)
    print "OPF: time elapsed in predicting: %f secs" % (time.time()-t)

    print "Classification report for OPF:\n%s\n" % (metrics.classification_report(label_test_32, predicted))
    print "Confusion matrix:\n%s" % metrics.confusion_matrix(label_test_32, predicted)
Example #11
def test(model, X, y, output_path):
    print "Evaluating svm"
    y_pred = model.predict(X)
    #try:
    if True:
        acc = (y == y_pred).mean()
        print "Accuracy ",acc
        f = open(output_path,'w')
        f.write('Accuracy: '+str(acc)+'\n')
        if classification_report:
            cr =  classification_report(y, y_pred)#, labels=selected_target,
                                #class_names=category_names[selected_target])
            print cr
            f.write(str(cr))
        if confusion_matrix:
            cm =  confusion_matrix(y, y_pred)#, labels=selected_target)
            print cm
            f.write(str(cm))
        f.close()
    """except:
Example #12
def test(model, X, y, output_path):
    print "Evaluating svm"
    y_pred = model.predict(X)
    #try:
    if True:
        acc = (y == y_pred).mean()
        print "Accuracy ", acc
        f = open(output_path, 'w')
        f.write('Accuracy: ' + str(acc) + '\n')
        if classification_report:
            cr = classification_report(y, y_pred)  #, labels=selected_target,
            #class_names=category_names[selected_target])
            print cr
            f.write(str(cr))
        if confusion_matrix:
            cm = confusion_matrix(y, y_pred)  #, labels=selected_target)
            print cm
            f.write(str(cm))
        f.close()
    """except:
Example #13
    def opf():

        # OPF only supports 32-bit labels at the moment
        label_train_32 = label_train.astype(numpy.int32)
        label_test_32 = label_test.astype(numpy.int32)

        O = libopf_py.OPF()

        t = time.time()
        O.fit(dist_train, label_train_32, precomputed_distance=True)
        #    O.fit(dist_train, label_train_32, precomputed_distance=True, learning="agglomerative", split=0.8)
        print("OPF: time elapsed in fitting: %f secs" % (time.time() - t))

        t = time.time()
        predicted = O.predict(dist_test)
        print("OPF: time elapsed in predicting: %f secs" % (time.time() - t))

        print("Classification report for OPF:\n%s\n" %
              (metrics.classification_report(label_test_32, predicted)))
        print("Confusion matrix:\n%s" %
              metrics.confusion_matrix(label_test_32, predicted))
Example #14
pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train)
eigenfaces = pca.components_.reshape((n_components, h, w))

X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

# Train an SVM classification model
param_grid = dict(C=[1, 5, 10, 50, 100],
                  gamma=[0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1])
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'},
                   verbose=1)
clf = clf.fit(X_train_pca, y_train)
print clf.best_estimator

# Quantitative evaluation of the model quality on the test set
from scikits.learn import metrics
y_pred = clf.predict(X_test_pca)
print metrics.classification_report(y_test, y_pred, target_names=target_names)
print metrics.confusion_matrix(y_test, y_pred,
                               labels=range(len(target_names)))


# Plot the results
import pylab as pl
for index, (img, label_true, label_pred) in enumerate(
                zip(X_test[:8], y_test[:8], y_pred[:8])):
    pl.subplot(2, 4, index+1).imshow(img.reshape(h, w), cmap=pl.cm.gray)
    pl.title('%s, prediction: %s' % (label_true, label_pred))

Example #15
# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

# import pylab as pl
#pl.matshow(cm)
#pl.show()

# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
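The snippet is cut off before sentences is used; a plausible continuation in the same style, assuming the fitted clf pipeline and dataset.target_names from above, is:

# Hypothetical continuation: classify the short sentences and map the
# predicted label indices back to language names.
predicted = clf.predict(sentences)
for s, p in zip(sentences, predicted):
    print u'The language of "%s" is "%s"' % (s, dataset.target_names[p])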
Example #16
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95, ),
}

# Fit the pipeline on the training set using grid search for the parameters
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test,
                                    y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print cm

# import pylab as pl
#pl.matshow(cm)
#pl.show()
Example #17
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
print clf
yp = clf.predict(X)
print classification_report(y, yp, targets, target_names)

#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")
#pl.xlabel("n-gram maximum size")
#pl.ylabel("successful classification rate")
#ns = range(1, 11)
#scores = [grid_search.grid_points_scores_[(('extr__n', i),)] for i in ns]
#pl.plot(ns, scores, 'o-')
#pl.show()

## Now we take apart the pipeline to do the plot
#X = clf.named_steps['extr'].transform(X)
#pca = RandomizedPCA(n_components=2).fit(X)
#Xpca = pca.transform(X)
#svc = clf.named_steps['svc']
Example #18
clf = GridSearchCV(SVC(kernel="rbf"), param_grid, fit_params={"class_weight": "auto"})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator


################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, class_names=class_names)
print confusion_matrix(y_test, y_pred, labels=range(n_classes))


################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, class_names, i):
    pred_name = class_names[y_pred[i]].rsplit(" ", 1)[-1]
    true_name = class_names[y_test[i]].rsplit(" ", 1)[-1]
    return "predicted: %s\ntrue:      %s" % (pred_name, true_name)
Example #19
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})

#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')

clf.fit(np.vstack([moto_vq_train, plane_vq_train]),
        np.array(labels))

print "Best estimator found by grid search:"
#print clf.best_estimator

###############################################################################
# Evaluation 

moto_vq_eval, plane_vq_eval = [np.load(f) for f
                               in ['moto_vq_eval.npy', 'plane_vq_eval.npy']]

y_name = ['moto'] * moto_vq_eval.shape[0] + ['plane'] * plane_vq_eval.shape[0]
y_test = [0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0]
y_test = np.array(y_test)


y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval]))


print classification_report(y_test, y_pred, labels=labels, class_names=y_name)
print confusion_matrix(y_test, y_pred)
Example #20
# split the dataset into two equal parts, respecting label proportions
train, test = iter(StratifiedKFold(y, 2)).next()

################################################################################
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
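The score_func keyword used above disappeared from later scikit-learn releases; a rough modern rewrite of the loop, assuming the same tuned_parameters and the X, y, train, test split, is:

# Hedged sketch: scoring strings replace score_func, and best_estimator
# gained a trailing underscore in modern scikit-learn.
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

for scoring in ('precision_macro', 'recall_macro'):
    clf = GridSearchCV(SVC(C=1), tuned_parameters, scoring=scoring, cv=5)
    clf.fit(X[train], y[train])
    print("Best estimator tuned for %s:" % scoring)
    print(clf.best_estimator_)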
Example #21
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape

print "Predicting the outcomes of the testing set"
t0 = time()
pred = clf.predict(X_test)
print "done in %fs" % (time() - t0)
print "precision: %0.3f" % precision(y_test, pred)
print "recall: %0.3f" % recall(y_test, pred)
print "f1_score: %0.3f" % f1_score(y_test, pred)

print "Classification report on test set:"
print classification_report(news_test.target, pred,
                            class_names=news_test.target_names)

cm = confusion_matrix(y_test, pred)
print "Confusion matrix:"
print cm

# Show confusion matrix
pl.matshow(cm)
pl.title('Confusion matrix')
pl.colorbar()
pl.show()
Example #22
param_grid = {
    'C': [1, 5, 10, 100],
    'gamma': [0.0001, 0.001, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'},
                   n_jobs=-1)
clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator


# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target,
                            class_names=target_names[selected_target])

print confusion_matrix(y_test, y_pred, labels=selected_target)


# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4

def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit('_', 1)[-1]
    true_name = target_names[y_test[i]].rsplit('_', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

Example #23
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}

clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})

#clf = SVC(kernel='rbf')
#clf = SVC(kernel='linear')

clf.fit(np.vstack([moto_vq_train, plane_vq_train]), np.array(labels))

print "Best estimator found by grid search:"
#print clf.best_estimator

###############################################################################
# Evaluation

moto_vq_eval, plane_vq_eval = [
    np.load(file) for file in ['moto_vq_eval.npy', 'plane_vq_eval.npy']
]

y_name = ['moto'] * moto_vq_eval.shape[0] + ['plane'] * plane_vq_eval.shape[0]
y_test = [0] * moto_vq_eval.shape[0] + [1] * plane_vq_eval.shape[0]
y_test = np.array(y_test)

y_pred = clf.predict(np.vstack([moto_vq_eval, plane_vq_eval]))

print classification_report(y_test, y_pred, labels=labels, class_names=y_name)
print confusion_matrix(y_test, y_pred)
Example #24
data = digits.images.reshape((n_samples, -1))

# Import a classifier:
from scikits.learn import svm
from scikits.learn.metrics import classification_report
from scikits.learn.metrics import confusion_matrix
classifier = svm.SVC()

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples/2], digits.target[:n_samples/2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples/2:]
predicted = classifier.predict(data[n_samples/2:])

print "Classification report for classifier:"
print classifier
print
print classification_report(expected, predicted)
print
print "Confusion matrix:"
print confusion_matrix(expected, predicted)

for index, (image, prediction) in enumerate(
    zip(digits.images[n_samples/2:], predicted)[:4]):
    pl.subplot(2, 4, index+5)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Prediction: %i' % prediction)

pl.show()
Example #25
param_grid = {
    'C': [1, 5, 10, 50, 100],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'), param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator


################################################################################
# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target,
                            class_names=category_names[selected_target])

print confusion_matrix(y_test, y_pred, labels=selected_target)


################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4

pl.figure(figsize=(2 * n_col, 2.3 * n_row))
pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15)
for i in range(n_row * n_col):
    pl.subplot(n_row, n_col, i + 1)
    pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)
print "Predicting the labels of the test set..."
print "%d documents" % len(news_test.filenames)
print "%d categories" % len(news_test.target_names)

print "Extracting features from the dataset using the same vectorizer"
t0 = time()
X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
y_test = news_test.target
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_test.shape

print "Predicting the outcomes of the testing set"
t0 = time()
pred = clf.predict(X_test)
print "done in %fs" % (time() - t0)

print "Classification report on test set for classifier:"
print clf
print
print classification_report(y_test, pred, class_names=news_test.target_names)

cm = confusion_matrix(y_test, pred)
print "Confusion matrix:"
print cm

# Show confusion matrix
pl.matshow(cm)
pl.title('Confusion matrix')
pl.colorbar()
pl.show()
Example #27
## }
## print("Training LinearSVC on training set")
## clf = LinearSVC(**parameters)
print("Training SGD with alpha=0.001 and n_iter=2")
clf = SGD(alpha=0.001, n_iter=2)
t0 = time()
clf.fit(X_train, y_train)
print "done in %fs" % (time() - t0)

print "Predicting the outcomes of the testing set"
t0 = time()
pred = clf.predict(X_test)
print "done in %fs" % (time() - t0)

print "Classification performance:"
print
print metrics.classification_report(
    y_test,
    pred,
    labels=[-1, 1],
    class_names=['any other types', 'cover type 1'])
print ""

err = metrics.zero_one(y_test, pred) / float(pred.shape[0])
print "Error rate: %.4f" % err
print ""

cm = metrics.confusion_matrix(y_test, pred)
print "Confusion matrix:"
print cm
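SGD here is the early scikits.learn class name; a rough modern counterpart, assuming the same X_train, y_train, X_test, y_test arrays, is SGDClassifier:

# Hedged sketch: modern equivalent of the old SGD class.
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(alpha=0.001, max_iter=2)  # n_iter became max_iter
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
err = (pred != y_test).mean()  # replaces the old metrics.zero_one helper
print("Error rate: %.4f" % err)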
Example #28
tuned_parameters = [{
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000]
}, {
    'kernel': ['linear'],
    'C': [1, 10, 100, 1000]
}]

scores = [
    ('precision', precision_score),
    ('recall', recall_score),
]

for score_name, score_func in scores:
    clf = GridSearchCV(SVC(C=1), tuned_parameters, score_func=score_func)
    clf.fit(X[train], y[train], cv=StratifiedKFold(y[train], 5))
    y_true, y_pred = y[test], clf.predict(X[test])

    print "Classification report for the best estimator: "
    print clf.best_estimator
    print "Tuned for '%s' with optimal value: %0.3f" % (
        score_name, score_func(y_true, y_pred))
    print classification_report(y_true, y_pred)
    print "Grid scores:"
    pprint(clf.grid_scores_)
    print

# Note the problem is too easy: the hyperparameter plateau is too flat and the
# output model is the same for precision and recall with ties in quality
Example #29
                   param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print "done in %0.3fs" % (time() - t0)
print "Best estimator found by grid search:"
print clf.best_estimator

################################################################################
# Quantitative evaluation of the model quality on the test set

print "Predicting the people names on the testing set"
t0 = time()
y_pred = clf.predict(X_test_pca)
print "done in %0.3fs" % (time() - t0)

print classification_report(y_test, y_pred, target_names=target_names)
print confusion_matrix(y_test, y_pred, labels=range(n_classes))

################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1]
    true_name = target_names[y_test[i]].rsplit(' ', 1)[-1]
    return 'predicted: %s\ntrue:      %s' % (pred_name, true_name)

Example #30
    'extr__n': (3, 4, 5, 6),
    'svc__C': (1e-1, 1e-2, 1e9)
}
grid_search = GridSearchCV(pipeline, parameters)

print "Loading data..."
X, y = load_data()
print "Searching for the best model..."
t0 = time()
grid_search.fit(X, y)
print "Done in %0.3f" % (time() - t0)
print "Best score: %0.3f" % grid_search.best_score
clf = grid_search.best_estimator
print clf
yp = clf.predict(X)
print classification_report(y, yp, targets, target_names)

#pl.figure()
#pl.title("Classification rate for 3-fold stratified CV")
#pl.xlabel("n-gram maximum size")
#pl.ylabel("successful classification rate")
#ns = range(1, 11)
#scores = [grid_search.grid_points_scores_[(('extr__n', i),)] for i in ns]
#pl.plot(ns, scores, 'o-')
#pl.show()

## Now we take apart the pipeline to do the plot
#X = clf.named_steps['extr'].transform(X)
#pca = RandomizedPCA(n_components=2).fit(X)
#Xpca = pca.transform(X)
#svc = clf.named_steps['svc']

Example #31
# Train an SVM classification model

print "Fitting the classifier to the training set"
param_grid = {"C": [1, 5, 10, 100], "gamma": [0.0001, 0.001, 0.01, 0.1]}
clf = GridSearchCV(SVC(kernel="rbf"), param_grid, fit_params={"class_weight": "auto"}, n_jobs=-1)
clf = clf.fit(X_train_pca, y_train)
print "Best estimator found by grid search:"
print clf.best_estimator


# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print classification_report(y_test, y_pred, labels=selected_target, target_names=target_names[selected_target])

print confusion_matrix(y_test, y_pred, labels=selected_target)


# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4


def title(y_pred, y_test, target_names, i):
    pred_name = target_names[y_pred[i]].rsplit("_", 1)[-1]
    true_name = target_names[y_test[i]].rsplit("_", 1)[-1]
    return "predicted: %s\ntrue:      %s" % (pred_name, true_name)
Example #32
    pl.subplot(2, 4, index + 1)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Training: %i' % label)

# To apply a classifier to this data, we need to flatten the images,
# turning the data into a (samples, features) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC()

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples / 2], digits.target[:n_samples / 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples / 2:]
predicted = classifier.predict(data[n_samples / 2:])

print "Classification report for classifier %s:\n%s\n" % (
    classifier, metrics.classification_report(expected, predicted))
print "Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)

for index, (image, prediction) in enumerate(
        zip(digits.images[n_samples / 2:], predicted)[:4]):
    pl.subplot(2, 4, index + 5)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Prediction: %i' % prediction)

pl.show()
Example #33
def evaluate(clf, Xt, yt, Xv, yv, title):
    print title
    clf.fit(Xt, yt)
    pred = clf.predict(Xv)
    print metrics.classification_report(yv, pred)
Example #34
    pl.subplot(2, 4, index+1)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Training: %i' % label)

# To apply a classifier to this data, we need to flatten the images,
# turning the data into a (samples, features) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC()

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples/2], digits.target[:n_samples/2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples/2:]
predicted = classifier.predict(data[n_samples/2:])

print "Classification report for classifier %s:\n%s\n" % (
    classifier, metrics.classification_report(expected, predicted))
print "Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)

for index, (image, prediction) in enumerate(
    zip(digits.images[n_samples/2:], predicted)[:4]):
    pl.subplot(2, 4, index+5)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Prediction: %i' % prediction)

pl.show()
Example #35
data = digits.images.reshape((n_samples, -1))

# Import a classifier:
from scikits.learn import svm
from scikits.learn.metrics import classification_report
from scikits.learn.metrics import confusion_matrix
classifier = svm.SVC()

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples / 2], digits.target[:n_samples / 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples / 2:]
predicted = classifier.predict(data[n_samples / 2:])

print "Classification report for classifier:"
print classifier
print
print classification_report(expected, predicted)
print
print "Confusion matrix:"
print confusion_matrix(expected, predicted)

for index, (image, prediction) in enumerate(
        zip(digits.images[n_samples / 2:], predicted)[:4]):
    pl.subplot(2, 4, index + 5)
    pl.imshow(image, cmap=pl.cm.gray_r)
    pl.title('Prediction: %i' % prediction)

pl.show()
Example #36
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(SVC(kernel='rbf'),
                   param_grid,
                   fit_params={'class_weight': 'auto'})
clf = clf.fit(X_train_pca, y_train)
print("Best estimator found by grid search:")
print(clf.best_estimator)

################################################################################
# Quantitative evaluation of the model quality on the test set

y_pred = clf.predict(X_test_pca)
print(
    classification_report(y_test,
                          y_pred,
                          labels=selected_target,
                          class_names=category_names[selected_target]))

print(confusion_matrix(y_test, y_pred, labels=selected_target))

################################################################################
# Qualitative evaluation of the predictions using matplotlib

n_row = 3
n_col = 4

pl.figure(figsize=(2 * n_col, 2.3 * n_row))
pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.95, hspace=.15)
for i in range(n_row * n_col):
    pl.subplot(n_row, n_col, i + 1)
    pl.imshow(X_test[i].reshape((64, 64)), cmap=pl.cm.gray)