def run(self, args, opts):
    if len(args) < 1:
        raise UsageError()
    elif len(args) > 1:
        raise UsageError(
            "running 'scrapy benchmark' with more than one argument is not supported")

    classifier_name = args[0]
    status = Status()
    CF = ClassifierFactory(status.classifiers[classifier_name])

    # Build the data set from reviewed items, unreviewed items, or both.
    if opts.reviewed and opts.unreviewed:
        CF.create_data_set("both")
    elif opts.reviewed:
        CF.create_data_set("reviewed")
    elif opts.unreviewed:
        CF.create_data_set("unreviewed")

    results = []
    # Benchmark a logistic regression classifier on the selected data set.
    lc = CF.create_classifier(
        LogisticRegression(C=1e5),
        status.classifiers[classifier_name]['features']())
    lc.benchmark(opts.topn, opts.print_cm, opts.print_report, verbose=True)

    ## for clf, name in (
    ##         (RidgeClassifier(tol=1e-2, solver="lsqr"), "Ridge Classifier"),
    ##         (Perceptron(n_iter=50), "Perceptron"),
    ##         (PassiveAggressiveClassifier(n_iter=50), "Passive-Aggressive")
    ## ):
    ##     print('=' * 80)
    ##     print(name)
    ##     c = CF.create_classifier(clf, status.classifiers[classifier_name]['features']())
    ##     results.append(c.benchmark(opts.topn, opts.print_cm, opts.print_report, verbose=True))

    # Multiple classifier comparison
    # results.append(classifier)
    ## indices = np.arange(len(results))
    ## results = [[x[i] for x in results] for i in range(4)]
    ## clf_names, score, training_time, test_time = results
    ## training_time = np.array(training_time) / np.max(training_time)
    ## test_time = np.array(test_time) / np.max(test_time)
    ## plt.figure(figsize=(12, 8))
    ## plt.title("Score")
    ## plt.barh(indices, score, .2, label="score", color='r')
    ## plt.barh(indices + .3, training_time, .2, label="training time", color='g')
    ## plt.barh(indices + .6, test_time, .2, label="test time", color='b')
    ## plt.yticks(())
    ## plt.legend(loc='best')
    ## plt.subplots_adjust(left=.25)
    ## plt.subplots_adjust(top=.95)
    ## plt.subplots_adjust(bottom=.05)
    ##
    ## for i, c in zip(indices, clf_names):
    ##     plt.text(-.3, i, c)
    ## plt.show()
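# A hypothetical invocation of this command, assuming it is registered under the
# name "benchmark" and that its add_options() exposes the opts used above as
# --reviewed/--unreviewed, --topn, --print_cm and --print_report (the flag
# spellings are an assumption, not confirmed by this file):
#
#     scrapy benchmark price_trend --reviewed --topn 10 --print_report
#
# "price_trend" stands in for any key present in Status().classifiers.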
def __init__(self):
    self.status = Status()
    self.classifiers = []
    self.exporters = {}

    # Fit one logistic regression classifier per configured classifier type,
    # using both reviewed and unreviewed items as training data.
    for classifier in self.status.classifiers.keys():
        CF = ClassifierFactory(self.status.classifiers[classifier])
        CF.create_data_set("both")
        lc = CF.create_classifier(
            LogisticRegression(C=1e5),
            self.status.classifiers[classifier]['features']())
        lc.fit()
        self.classifiers.append((classifier, lc))

    # Sort by cross-validated accuracy so the weakest classifier comes first.
    self.classifiers = sorted(
        self.classifiers,
        key=lambda a: a[1].estimate_accuracy(5, verbose=True))
    print("Classifier {0} needs the most improvement; selected for export".format(
        self.classifiers[0][0]))

    # Open one JSON exporter per classification handled by that classifier.
    for classification in self.status.classifiers[self.classifiers[0][0]]['classifications']:
        f = open("{0}.json".format(classification), "wb")
        self.exporters[classification] = JsonItemExporter(f)
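# A minimal sketch of how these per-classification exporters would typically be
# driven from the rest of the pipeline. open_spider/process_item/close_spider
# and item['classification'] are assumptions for illustration, not part of the
# original class; the exporter methods themselves are standard Scrapy API.
#
#     def open_spider(self, spider):
#         for exporter in self.exporters.values():
#             exporter.start_exporting()
#
#     def process_item(self, item, spider):
#         exporter = self.exporters.get(item['classification'])
#         if exporter is not None:
#             exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         for exporter in self.exporters.values():
#             exporter.finish_exporting()
#             exporter.file.close()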