class SVM:

    def __init__(self, training, classes, vocabulary):
        vocabulary = load(vocabulary)
        self.cv = CountVectorizer(vocabulary=vocabulary.tolist())
        self.samples = load(training).tolist()
        self.classes = load(classes)
        self.classifier = LinearSVC()
        self.classifier.fit(self.samples, self.classes)

    def classify(self, text):
        features = self.cv.transform([text])
        return self.classifier.predict(features)[0]
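A minimal usage sketch for the wrapper above, assuming `load` is `numpy.load` and that the three `.npy` file names are hypothetical placeholders (the import paths follow modern scikit-learn and are likewise assumptions):

# Hypothetical usage; file names, loader and import paths are assumptions.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer  # needed by SVM
from sklearn.svm import LinearSVC  # needed by SVM

load = np.load  # assumed loader used by the class above

svm = SVM('training.npy', 'classes.npy', 'vocabulary.npy')
print svm.classify("free pizza and beer tonight")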
def test_dense_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()),
                         ('svc', LinearSVC())])

    parameters = {
        'vect__analyzer__max_n': (1, 2),
        'svc__loss': ('l1', 'l2'),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    pred = grid_search.fit(list(train_data), y_train).predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset every candidate converges to a 100% accuracy model,
    # so the first parameter setting tried (the unigram representation,
    # max_n=1) wins the tie and is kept as the best estimator
    assert_equal(grid_search.best_score, 1.0)

    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
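This test leans on two document fixtures not shown in the excerpt. Illustrative stand-ins that match the shapes the assertions imply (six junk-food documents and five others, eleven in total, consistent with `y[:6] = -1` here and the `(11, 1000000)` shape asserted in `test_sparse_tf_idf` below):

# Stand-in fixtures; the real test module defines its own document strings.
JUNK_FOOD_DOCS = (
    "the pizza burger beer",
    "the pizza burger coke",
    "the pizza beer beer",
    "the burger beer coke",
    "the coke burger burger",
    "the beer beer pizza",
)

NOTJUNK_FOOD_DOCS = (
    "the salad celery water",
    "the salad water tomato",
    "the tomato celery water",
    "the celery water salad",
    "the water salad tomato",
)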
def test_sparse_tf_idf():
    hv = SparseHashingVectorizer(dim=1000000, probes=3)
    hv.vectorize(JUNK_FOOD_DOCS)
    hv.vectorize(NOTJUNK_FOOD_DOCS)

    # extract the TF-IDF data
    X = hv.get_tfidf()
    assert_equal(X.shape, (11, 1000000))

    # label junk food as -1, the others as +1
    y = np.ones(X.shape[0])
    y[:6] = -1

    # train and test a classifier
    clf = SparseLinearSVC(C=10).fit(X[1:-1], y[1:-1])
    assert_equal(clf.predict(X[0, :]), [-1])
    assert_equal(clf.predict(X[-1, :]), [1])
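For readers on later scikit-learn releases: `SparseHashingVectorizer` (where `probes=3` presumably means three hash probes per token to soften collisions) has no direct descendant, but the same hashing trick survives as `HashingVectorizer`. A rough, non-drop-in sketch of the equivalent feature extraction, offered as an assumption:

# Rough modern-API analogue (assumption; not a drop-in replacement for the
# deprecated SparseHashingVectorizer above).
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer

hv = HashingVectorizer(n_features=1000000)
counts = hv.transform(JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS)
X = TfidfTransformer().fit_transform(counts)  # TF-IDF on the hashed counts
print X.shape  # (11, 1000000) with the stand-in fixtures above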
@classmethod
def train(cls, labeled_featuresets):
    # unzip the (featureset, label) pairs and encode labels as integers
    train, target_labels = zip(*labeled_featuresets)
    target_names = sorted(set(target_labels))
    targets = [target_names.index(l) for l in target_labels]

    pipeline = Pipeline([
        ('bow', BagOfWordsVectorizer()),
        ('clf', LinearSVC(C=1000)),
    ])
    pipeline.fit(train, targets)
    return cls(pipeline, target_names)
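A hedged usage sketch, assuming NLTK-style input (a list of `(featureset, label)` pairs) and that `cls` is a classifier wrapper storing the fitted pipeline together with the label names; both the wrapper name and the feature keys below are hypothetical:

# Hypothetical input and wrapper name; only the train() contract above is real.
labeled_featuresets = [
    ({'contains(pizza)': True, 'contains(beer)': True}, 'junk'),
    ({'contains(salad)': True, 'contains(water)': True}, 'healthy'),
]
classifier = PipelineClassifier.train(labeled_featuresets)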
print "done in %fs" % (time() - t0) print "n_samples: %d, n_features: %d" % X_train.shape assert sp.issparse(X_train) y_train = news_train.target print "Training a linear SVM (hinge loss and L2 regularizer)..." parameters = { 'loss': 'l2', 'penalty': 'l2', 'C': 10, 'dual': False, 'eps': 1e-4, } print "parameters:", parameters t0 = time() clf = LinearSVC(**parameters).fit(X_train, y_train) print "done in %fs" % (time() - t0) print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100) print "Loading 20 newsgroups test set... " news_test = load_mlcomp('20news-18828', 'test') t0 = time() print "done in %fs" % (time() - t0) print "Predicting the labels of the test set..." print "%d documents" % len(news_test.filenames) print "%d categories" % len(news_test.target_names) print "Extracting features from the dataset using the same vectorizer" t0 = time() X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))
    print metrics.confusion_matrix(y_test, pred)
    print
    return score, train_time, test_time


for clf, name in ((RidgeClassifier(tol=1e-1), "Ridge Classifier"),
                  (NeighborsClassifier(n_neighbors=10), "kNN")):
    print 80 * '='
    print name
    results = benchmark(clf)

for penalty in ["l2", "l1"]:
    print 80 * '='
    print "%s penalty" % penalty.upper()

    # Train Liblinear model
    liblinear_results = benchmark(LinearSVC(loss='l2', penalty=penalty,
                                            C=1000, dual=False, tol=1e-3))

    # Train SGD model
    sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                          penalty=penalty))

# Train SGD with Elastic Net penalty
print 80 * '='
print "Elastic-Net penalty"
sgd_results = benchmark(SGDClassifier(alpha=.0001, n_iter=50,
                                      penalty="elasticnet"))

# Train sparse Naive Bayes classifiers
print 80 * '='
print "Naive Bayes"
mnnb_results = benchmark(MultinomialNB(alpha=.01))
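Both benchmark loops (here and in the variant below) call a `benchmark` helper whose head is truncated in this excerpt; only its tail, printing the confusion matrix and returning `score, train_time, test_time`, is visible above. A sketch of the assumed head, every detail an assumption:

# Assumed shape of the truncated helper; only its tail appears above.
def benchmark(clf):
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0

    score = metrics.f1_score(y_test, pred)
    # ... tail as shown above: print the confusion matrix, then
    # return score, train_time, test_time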
# split the dataset in training and test sets:
n_samples_total = dataset.filenames.shape[0]
split = (n_samples_total * 3) / 4

docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Grid search the parameters on a 200-document subsample of the training set
# to keep the search affordable
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
clf = grid_search.best_estimator.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)
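A short evaluation sketch for the refitted model, mirroring the reporting used in the character n-gram exercise below (the old `class_names` keyword is taken from that snippet):

# Evaluation sketch; reuses the reporting style of the exercise below.
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)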
    if print_cm:
        print "confusion matrix:"
        print metrics.confusion_matrix(y_test, pred)
    print
    return score, train_time, test_time


for clf, name in ((RidgeClassifier(), "Ridge Classifier"), ):
    print 80 * '='
    print name
    results = benchmark(clf)

for penalty in ["l2", "l1"]:
    print 80 * '='
    print "%s penalty" % penalty.upper()

    # Train Liblinear model
    liblinear_results = benchmark(
        LinearSVC(loss='l2', penalty=penalty, C=1000, dual=False, tol=1e-3))

    # Train SGD model
    sgd_results = benchmark(
        SGDClassifier(alpha=.0001, n_iter=50, penalty=penalty))

# Train SGD with Elastic Net penalty
print 80 * '='
print "Elastic-Net penalty"
sgd_results = benchmark(
    SGDClassifier(alpha=.0001, n_iter=50, penalty="elasticnet"))
y_train = dataset.target[:n_samples_total / 2]
y_test = dataset.target[n_samples_total / 2:]

# Build an analyzer that splits strings into sequences of 1 to 3 characters
# after applying the previous preprocessor
analyzer = CharNGramAnalyzer(
    min_n=1,
    max_n=3,
    preprocessor=LowerCasePreprocessor(),
)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print metrics.classification_report(y_test, y_predicted,
                                    class_names=dataset.target_names)

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
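The matrix computed above is typically rendered with pylab in examples of this vintage; a visualization sketch, assuming matplotlib is installed:

# Visualization sketch (assumes matplotlib's pylab interface).
import pylab as pl
pl.matshow(cm)
pl.title('Confusion matrix')
pl.colorbar()
pl.show()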
# The documents have been hashed into TF-IDF (Term Frequency times Inverse
# Document Frequency) vectors of a fixed dimension.
print "n_samples: %d, n_features: %d" % news_train.data.shape

print "Training a linear SVM (squared hinge loss and L2 regularizer)..."
parameters = {
    'loss': 'l2',
    'penalty': 'l2',
    'C': 10,
    'dual': False,
    'eps': 1e-4,
}
print "parameters:", parameters
t0 = time()
clf = LinearSVC(**parameters).fit(news_train.data, news_train.target)
print "done in %fs" % (time() - t0)
print "Percentage of non-zero coefficients: %f" % (np.mean(clf.coef_ != 0) * 100)

print "Loading 20 newsgroups test set... "
t0 = time()
news_test = load_mlcomp('20news-18828', 'test', sparse=True)
print "done in %fs" % (time() - t0)

print "Predicting the labels of the test set..."
t0 = time()
pred = clf.predict(news_test.data)
print "done in %fs" % (time() - t0)
print "Classification accuracy: %f%%" % (np.mean(pred == news_test.target) * 100)

cm = confusion_matrix(news_test.target, pred)
print "%d categories" % len(news_train.target_names) print "Extracting features from the dataset using a sparse vectorizer" t0 = time() vectorizer = Vectorizer() X_train = vectorizer.fit_transform((open(f).read() for f in news_train.filenames)) print "done in %fs" % (time() - t0) print "n_samples: %d, n_features: %d" % X_train.shape assert sp.issparse(X_train) y_train = news_train.target print "Training a linear SVM (hinge loss and L2 regularizer)..." parameters = {"loss": "l2", "penalty": "l2", "C": 10, "dual": False, "eps": 1e-4} print "parameters:", parameters t0 = time() clf = LinearSVC(**parameters).fit(X_train, y_train) print "done in %fs" % (time() - t0) print "Percentage of non zeros coef: %f" % (np.mean(clf.coef_ != 0) * 100) print "Loading 20 newsgroups test set... " news_test = load_mlcomp("20news-18828", "test") t0 = time() print "done in %fs" % (time() - t0) print "Predicting the labels of the test set..." print "%d documents" % len(news_test.filenames) print "%d categories" % len(news_test.target_names) print "Extracting features from the dataset using the same vectorizer" t0 = time() X_test = vectorizer.transform((open(f).read() for f in news_test.filenames))