def train(model_name, category_type, dump=False):
    """Fit one model, report its test metrics and optionally pickle it.

    The pipeline comes from ``tfidf_pipeline.make(model_name)`` and the label
    set from ``names.categories[category_type]``.  The 'full' data set is
    loaded and divided with ``data_loader.split(data, 0.1)`` (presumably a
    10% hold-out -- confirm against ``data_loader``).

    For the 'svr' and 'linreg' models the raw predictions are rounded and
    clipped into [0, 7] and scored with ``scorers.err1``; every other model
    is scored with ``scorers.err0``.  A classification report and the
    confusion matrix are printed, and the matrix is handed to
    ``plot.plot_confusion_matrix``.

    Args:
        model_name: Key understood by ``tfidf_pipeline.make``.
        category_type: Key into ``names.categories``; also used in the
            pickle file name and passed to the plotting helper.
        dump: When true, save the fitted pipeline under
            ``dumps/<category_type>_<model_name>_classifier.pkl``.

    Returns:
        The fitted pipeline.
    """
    # NOTE: every print below is the single-argument parenthesised form, which
    # produces the same output as a Python 2 print statement.
    pipeline = tfidf_pipeline.make(model_name)
    categories = names.categories[category_type]

    print('Loading data...')
    train_X, train_y, test_X, test_y = data_loader.split(
        data_loader.load('full', categories), 0.1)
    print('Done.')

    print('Training...')
    pipeline.fit(train_X, train_y)
    print('Done.')

    print('Testing...')
    predictions = pipeline.predict(test_X)
    if model_name in ('svr', 'linreg'):
        # Continuous outputs: snap to the nearest integer label and keep the
        # result inside the 0..7 range before scoring.
        predictions = np.clip(np.round(predictions), 0, 7)
        score_fn, score_label = scorers.err1, 'Off-by-one accuracy: '
    else:
        score_fn, score_label = scorers.err0, 'Exact accuracy: '
    accuracy = score_fn(test_y, predictions)
    print(score_label + str(accuracy))

    print(classification_report(test_y, predictions, target_names=categories))
    matrix = confusion_matrix(test_y, predictions)
    print(matrix)
    plot.plot_confusion_matrix(matrix, category_type)

    if dump:
        print('Saving classifier...')
        dump_dir = 'dumps'
        if not exists(dump_dir):
            makedirs(dump_dir)
        pickle_name = category_type + '_' + model_name + '_classifier.pkl'
        joblib.dump(pipeline, join(dump_dir, pickle_name))
        print('Done.')

    return pipeline
import sys
from sklearn.externals.joblib import dump
import data_loader
import names
import tfidf_pipeline
import model_presets

# Script entry point: for each (category, model) pair, fit a pipeline on the
# training portion of the 'split' data set and pickle it as
# web_clf_<category>.pkl in the current working directory.
if __name__ == '__main__':
    jobs = [('stars', 'linreg'), ('binary', 'svc')]
    for category_name, model_name in jobs:
        print('Loading ' + category_name + ' data')
        # load('split', ...) yields a pair; only the first element is used.
        train_set, _ = data_loader.load('split', names.categories[category_name])

        print('Training ' + model_name)
        pipeline = tfidf_pipeline.make(model_name)
        pipeline.fit(train_set.data, train_set.target)

        print('Dumping ' + model_name)
        output_name = 'web_clf_' + category_name + '.pkl'
        dump(pipeline, output_name)