def optimize_kernel(x, args):
    '''Kernel to be minimized; args is (P, y, verbose).'''
    c = aCL(args[0], np.array(x))  # abstaining classification at thresholds x
    if args[2]:
        print("params", x)
    ni = NI(args[1], c, 2)  # information with respect to the target
    return 1 - ni.NI()  # minimizing this maximizes the normalized information
def grid_search(P, y, verbose=False):
    '''A pretty simple grid search over the 2D threshold space; also returns an
    image of the space for visualization.'''
    img = np.zeros([25, 25])
    r = np.linspace(0, 1, 25)
    best_T = None
    best_NI = 0
    for i1, t1 in enumerate(r):
        for i2, t2 in enumerate(r):
            c = aCL(P, np.array([t1, t2]))
            ni = NI(y, c, 2)
            this_NI = ni.NI()
            img[i1, i2] = this_NI
            if this_NI > best_NI:
                best_NI = this_NI
                best_T = np.array([t1, t2])
            if verbose:
                print("%f %f --- %f" % (t1, t2, this_NI))
    print("Optimization Result (Grid Search): %f %f --- %f" % (best_T[0], best_T[1], best_NI))
    return best_NI, best_T, img
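# Sketch (not part of the original script): one way to visualize the NI surface that
# grid_search returns. matplotlib is an assumed extra dependency, and plot_grid_search
# is an illustrative helper name; it is never called by the pipeline below.
def plot_grid_search(P, y):
    import matplotlib.pyplot as plt
    best_NI, best_T, img = grid_search(P, y)
    plt.imshow(img, origin='lower', extent=[0, 1, 0, 1], aspect='auto')
    plt.xlabel("threshold t2")
    plt.ylabel("threshold t1")
    plt.colorbar(label="normalized information")
    plt.scatter([best_T[1]], [best_T[0]], marker="x", color="red")  # mark the best thresholds
    plt.show()
    return best_NI, best_T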
def vote(X_train, y_train, X_test, y_test):
    for clf, name in (
            (MultinomialNB(alpha=.001), "Multinomial Naive Bayes"),
            (MultinomialNB(alpha=.01), "Multinomial Naive Bayes"),
            (MultinomialNB(alpha=.1), "Multinomial Naive Bayes"),
            (BernoulliNB(alpha=.001), "Bernoulli Bayes"),
            (BernoulliNB(alpha=.01), "Bernoulli Bayes"),
            (BernoulliNB(alpha=.1), "Bernoulli Bayes"),
            #- (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
            #- (Perceptron(n_iter=50), "Perceptron"),
            #- (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive"),
            # (KNeighborsClassifier(n_neighbors=10), "kNN"),
            # (RandomForestClassifier(n_estimators=100), "Random forest"),
            #- (ExtraTreesClassifier(n_estimators=100), "ExtraTree"),
            (SGDClassifier(alpha=.001, max_iter=500, loss="modified_huber", penalty="l2"), "SGD-l2"),
            (SGDClassifier(alpha=.001, max_iter=500, loss="modified_huber", penalty="l1"), "SGD-l1"),
            (LogisticRegression(penalty="l2", dual=False, tol=0.0001, C=1.0,
                                fit_intercept=True, intercept_scaling=1, class_weight=None,
                                random_state=None, solver="liblinear", max_iter=100,
                                multi_class="ovr", verbose=0, warm_start=False, n_jobs=1), "MaxEnt"),
            # (SGDClassifier(alpha=.001, n_iter=500, loss="log", penalty="elasticnet"), "SGD-elastic"),
            # (CalibratedClassifierCV(SGDClassifier(alpha=.001, n_iter=500, penalty="elasticnet")), "SGD-elastic"),
            # (CalibratedClassifierCV(LinearSVC(penalty="l2", dual=False, tol=1e-3)), "L-SVC-l2"),  # turns decision_function into predict_proba
    ):
        print(clf)
        clf.fit(X_train, y_train)

        pred = clf.predict(X_train)
        print("Training error (BIAS)")
        print(metrics.classification_report(y_train, pred))

        pred = clf.predict(X_test)
        print("Validation")
        print(pred.shape)
        print(y_test.shape)
        print(metrics.classification_report(y_test, pred))

        # Optimize the abstention thresholds on this classifier's class probabilities.
        # The list is passed as a single object; scipy wraps non-tuple args in a
        # one-element tuple, so optimize_kernel receives it as its `args` parameter.
        P = clf.predict_proba(X_test)
        direc = np.random.rand(10, 2)  # random initial direction set for Powell
        res = minimize(optimize_kernel, [0.01, 0.01], [P, y_test, False],
                       method='Powell', tol=1e-4,
                       options={'disp': False, 'direc': direc})
        pred = aCL(P, res.x)
        print("Abstained Validation")
        print(metrics.classification_report(y_test, pred))
        print("abstained in %d of %d cases (%f)"
              % (np.sum(pred == 2), len(y_test), np.sum(pred == 2) / len(y_test)))
        print(metrics.confusion_matrix(y_test, pred))

        # Per-class score selected via opts.score, yielded alongside the predictions.
        if opts.score == "precision":
            ps = metrics.precision_score(y_test, pred, average=None)
        elif opts.score == "f1":
            ps = metrics.f1_score(y_test, pred, average=None)
        elif opts.score == 'f1squared':
            ps = metrics.f1_score(y_test, pred, average=None)
            ps = [x * x for x in ps]
        elif opts.score == 'f1exp':
            ps = metrics.f1_score(y_test, pred, average=None)
            ps = [exp(x) for x in ps]
        else:
            raise ValueError("unknown score " + opts.score)
        yield ps, pred
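# Sketch (assumption, not the original aggregation code, which is outside this excerpt):
# one plausible way to consume vote() and turn the per-classifier (ps, pred) pairs into
# ensemble class probabilities P. weighted_vote and n_classes are illustrative names;
# abstentions (label 2) simply contribute no weight.
def weighted_vote(X_train, y_train, X_test, y_test, n_classes=2):
    votes = np.zeros((len(y_test), n_classes))
    for ps, pred in vote(X_train, y_train, X_test, y_test):
        for c in range(n_classes):
            # add this classifier's per-class score to the samples it assigned to class c
            votes[pred == c, c] += ps[c]
    P = votes / np.maximum(votes.sum(axis=1, keepdims=True), 1e-12)
    return P, np.argmax(P, axis=1)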
print("Abstaining Rate %f" % (float(np.sum(pred == 2)) / len(y_test)))
print(metrics.confusion_matrix(y_test, pred))

#### now abstain from the ensemble
direc = np.random.rand(10, 2)
res = minimize(optimize_kernel, [0.01, 0.01], [P, y_test, False],
               method='Powell', tol=1e-4,
               options={'disp': False, 'direc': direc})
pred = aCL(P, res.x)
print("Abstained Ensemble of Abstaining Classifiers")
print(metrics.classification_report(y_test, pred))
print("abstained in %d of %d cases" % (np.sum(pred == 2), len(y_test)))
print("Abstaining Rate %f" % (float(np.sum(pred == 2)) / len(y_test)))
print(metrics.confusion_matrix(y_test, pred))

## Finally, resolve these to actual classifications of tweets that can be rendered in the end.
# len(data_test.filenames[pred==1])
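# Sketch (assumption, following the hint above: data_test is a sklearn Bunch with a
# .filenames attribute). One way to resolve the abstained ensemble predictions back to
# the individual tweets for rendering; resolve_predictions is a hypothetical helper,
# not part of the original script.
def resolve_predictions(data_test, pred, abstain_label=2):
    resolved = {}
    for label in np.unique(pred):
        if label == abstain_label:
            continue  # skip the cases the ensemble abstained on
        resolved[int(label)] = list(np.asarray(data_test.filenames)[pred == label])
    return resolved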