예제 #1
0
def optimize_kernel(x, args):
    """Objective function for a minimizer: one minus the NI score of ``x``.

    Args:
        x: candidate parameter vector; converted to a NumPy array and
            handed to ``aCL`` (presumably abstention thresholds -- confirm
            against ``aCL``).
        args: indexable triple ``(P, y, verbose)``. ``P`` is forwarded to
            ``aCL``, ``y`` is the target passed to ``NI``, and a truthy
            ``verbose`` prints the parameters being evaluated.

    Returns:
        ``1 - NI(y, c, 2).NI()``, so that minimizing this value maximizes
        the NI score.
    """
    probabilities = args[0]
    candidate = np.array(x)
    abstaining_pred = aCL(probabilities, candidate)

    verbose = args[2]
    if verbose:
        print("params", x)

    # Information of the (possibly abstaining) predictions with respect to
    # the target; the literal 2 is passed through to NI unchanged.
    score = NI(args[1], abstaining_pred, 2).NI()
    return 1 - score
예제 #2
0
def grid_search(P, y, verbose=False):
    """Exhaustively score a 25x25 grid of parameter pairs and keep the best.

    Every pair ``(t1, t2)`` drawn from 25 evenly spaced values in ``[0, 1]``
    is passed to ``aCL(P, [t1, t2])`` and the result is scored with
    ``NI(y, c, 2).NI()``. The full score surface is kept so the 2D space can
    be visualized by the caller.

    Args:
        P: first argument forwarded to ``aCL`` (presumably per-class
            probabilities -- confirm against ``aCL``).
        y: target passed to ``NI``.
        verbose: when true, print each evaluated pair with its score.

    Returns:
        A tuple ``(best_NI, best_T, img)``:
        ``best_NI`` is the highest score found (0 if nothing scored above 0),
        ``best_T`` is the 2-element array that produced it, or ``None`` when
        no pair scored above 0, and ``img`` is the 25x25 array where
        ``img[i1, i2]`` holds the score of the pair ``(r[i1], r[i2])``.
    """
    steps = 25
    img = np.zeros([steps, steps])
    r = np.linspace(0, 1, steps)
    best_NI = 0
    # Stays None when no pair beats the initial score of 0; previously the
    # name was left unbound in that case and the summary print crashed.
    best_T = None

    for i1, t1 in enumerate(r):
        for i2, t2 in enumerate(r):
            c = aCL(P, np.array([t1, t2]))
            this_NI = NI(y, c, 2).NI()
            img[i1, i2] = this_NI
            if this_NI > best_NI:
                best_NI = this_NI
                best_T = np.array([t1, t2])
            if verbose:
                # Reuse the score computed above instead of re-evaluating it.
                print("%f %f --- %f" % (t1, t2, this_NI))

    if best_T is None:
        print("Optimization Result (Grid Search): no pair scored above 0")
    else:
        print("Optimization Result (Grid Search):%f %f --- %f" %
              (best_T[0], best_T[1], best_NI))
    return best_NI, best_T, img
예제 #3
0
def vote(X_train, y_train, X_test, y_test):
    """Train a fixed set of classifiers and yield abstaining predictions.

    This is a generator. For each classifier in the hard-coded list it:

    1. fits on ``(X_train, y_train)`` and prints classification reports for
       the training and the test data,
    2. takes ``predict_proba(X_test)`` and searches, with Powell's method,
       for the parameters that minimize ``optimize_kernel``,
    3. turns the probabilities into predictions with ``aCL``; the label ``2``
       is reported as "abstained",
    4. prints the abstained report and confusion matrix, and
    5. yields per-class scores together with the abstaining predictions.

    The score is selected by the module-level ``opts.score``, which must be
    one of ``"precision"``, ``"f1"``, ``"f1squared"`` or ``"f1exp"``.

    Args:
        X_train: training features accepted by the classifiers' ``fit``.
        y_train: training labels.
        X_test: test features.
        y_test: test labels; must support ``len`` and ``.shape``.

    Yields:
        ``(ps, pred)`` where ``ps`` holds one score per class
        (``average=None``) and ``pred`` is the output of ``aCL``.

    Raises:
        ValueError: if ``opts.score`` is not one of the supported names.
            The check runs after the first classifier has been evaluated.
    """
    for clf, name in (
        (MultinomialNB(alpha=.001), "Multinomial Naive Bayes"),
        (MultinomialNB(alpha=.01), "Multinomial Naive Bayes"),
        (MultinomialNB(alpha=.1), "Multinomial Naive Bayes"),
        (BernoulliNB(alpha=.001), "Bernoulli Bayes"),
        (BernoulliNB(alpha=.01), "Bernoulli Bayes"),
        (BernoulliNB(alpha=.1), "Bernoulli Bayes"),
        # Disabled alternatives, kept for reference:
        #-        (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
        #-        (Perceptron(n_iter=50), "Perceptron"),
        #-        (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
        #            (KNeighborsClassifier(n_neighbors=10), "kNN"),
        #            (RandomForestClassifier(n_estimators=100), "Random forest"),
        #-       (ExtraTreesClassifier(n_estimators=100), "ExtraTree"),
        (SGDClassifier(alpha=.001,
                       max_iter=500,
                       loss="modified_huber",
                       penalty="l2"), "SGD-l2"),
        (SGDClassifier(alpha=.001,
                       max_iter=500,
                       loss="modified_huber",
                       penalty="l1"), "SGD-l1"),
        (LogisticRegression(penalty="l2",
                            dual=False,
                            tol=0.0001,
                            C=1.0,
                            fit_intercept=True,
                            intercept_scaling=1,
                            class_weight=None,
                            random_state=None,
                            solver="liblinear",
                            max_iter=100,
                            multi_class="ovr",
                            verbose=0,
                            warm_start=False,
                            n_jobs=1), "MaxEnt"),
        #            (SGDClassifier(alpha=.001, n_iter=500,loss="log",penalty="elasticnet"), "SGD-elastic"),
        #            (CalibratedClassifierCV(SGDClassifier(alpha=.001, n_iter=500,penalty="elasticnet")), "SGD-elastic"),
        #            (CalibratedClassifierCV(LinearSVC(penalty="l2", dual=False,tol=1e-3)),"L-SVC-l2"),  # turns decision_function to predict_proba
    ):
        print(clf)
        clf.fit(X_train, y_train)

        # Performance on the data the model was fitted on.
        pred = clf.predict(X_train)
        print("Training error (BIAS)")
        print(metrics.classification_report(y_train, pred))

        # Performance on held-out data, before any abstention.
        pred = clf.predict(X_test)
        print("Validation")
        print(pred.shape)
        print(y_test.shape)
        print(metrics.classification_report(y_test, pred))

        P = clf.predict_proba(X_test)

        # Random initial direction set, so results vary between runs unless
        # NumPy's global seed is fixed.
        # NOTE(review): 10 direction rows for a 2-parameter search -- confirm
        # this shape is what the Powell implementation expects.
        direc = np.random.rand(10, 2)
        res = minimize(optimize_kernel, [0.01, 0.01], [P, y_test, False],
                       method='Powell',
                       tol=1e-4,
                       options={
                           'disp': False,
                           'direc': direc
                       })

        pred = aCL(P, res.x)

        print("Abstained Validation")
        print(metrics.classification_report(y_test, pred))

        n_abstained = np.sum(pred == 2)
        print(
            "abstained in %d of %d cases (%f)" %
            (n_abstained, len(y_test), n_abstained / len(y_test)))
        print(metrics.confusion_matrix(y_test, pred))

        if opts.score == "precision":
            ps = metrics.precision_score(y_test, pred, average=None)
        elif opts.score == "f1":
            ps = metrics.f1_score(y_test, pred, average=None)
        elif opts.score == 'f1squared':
            ps = metrics.f1_score(y_test, pred, average=None)
            ps = [x * x for x in ps]
        elif opts.score == 'f1exp':
            ps = metrics.f1_score(y_test, pred, average=None)
            ps = [exp(x) for x in ps]
        else:
            # Raising a plain string is itself a TypeError in Python 3, which
            # hid the intended message; raise a real exception instead.
            raise ValueError("unknown score " + str(opts.score))
        yield ps, pred
예제 #4
0
# Report how often the current predictions abstain (label 2 is counted as an
# abstention) and show the confusion matrix against the test labels.
# NOTE(review): `pred`, `P` and `y_test` are defined above this fragment.
print("Abstaining Rate %f" % (float(np.sum(pred == 2)) / len(y_test)))
print(metrics.confusion_matrix(y_test, pred))

# Now abstain from the ensemble: search for the parameters that minimize
# optimize_kernel on P using Powell's method.
# The initial direction set is random, so results vary between runs unless
# NumPy's global seed is fixed.
direc = np.random.rand(10, 2)
# The list [P, y_test, False] is the extra argument for optimize_kernel:
# (P, y, verbose).
res = minimize(optimize_kernel, [0.01, 0.01], [P, y_test, False],
               method='Powell',
               tol=1e-4,
               options={
                   'disp': False,
                   'direc': direc
               })
# Turn P into predictions using the parameters found by the optimizer.
pred = aCL(P, res.x)

print("Abstained Ensemble of Abstaining Classifiers")
print(metrics.classification_report(y_test, pred))

# Same abstention statistics as above, now for the re-thresholded predictions.
print("abstained in %d of %d cases" % (np.sum(pred == 2), len(y_test)))
print("Abstaining Rate %f" % (float(np.sum(pred == 2)) / len(y_test)))
print(metrics.confusion_matrix(y_test, pred))

## Finally, resolve these to actual classifications of tweets that can be rendered in the end.
#
#len(data_test.filenames[pred==1])
#
#
#
#