Esempio n. 1
def test_countvectorizer_custom_vocabulary():
    # A vectorizer built from an explicit vocabulary must expose exactly the
    # supplied terms and produce one output column per term.
    favourites = ["pizza", "beer"]
    vectorizer = CountVectorizer(vocabulary=favourites)
    assert_equal(set(vectorizer.vocabulary), set(favourites))

    n_columns = vectorizer.transform(JUNK_FOOD_DOCS).shape[1]
    assert_equal(n_columns, len(favourites))
Esempio n. 2
def test_dense_vectorizer_pipeline_grid_selection():
    # Grid-search a CountVectorizer + LinearSVC pipeline on a toy corpus and
    # check the predictions as well as the selected parameters.

    # raw documents
    # NOTE(review): `data` is not assigned inside this function; presumably it
    # is a module-level corpus whose first six documents are the junk food
    # ones (see the labelling below) -- confirm against the full file.

    # simulate iterables
    train_data = iter(data[1:-1])
    test_data = iter([data[0], data[-1]])

    # label junk food as -1, the others as +1
    y = np.ones(len(data))
    y[:6] = -1
    y_train = y[1:-1]
    y_test = np.array([y[0], y[-1]])

    pipeline = Pipeline([('vect', CountVectorizer()), ('svc', LinearSVC())])

    parameters = {'vect__analyzer__max_n': (1, 2), 'svc__loss': ('l1', 'l2')}

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # cross-validation doesn't work if the length of the data is not known,
    # hence use lists instead of iterators
    fitted_search = grid_search.fit(list(train_data), y_train)
    pred = fitted_search.predict(list(test_data))
    assert_array_equal(pred, y_test)

    # on this toy dataset the best score is expected to be a perfect 1.0 and
    # the selected vectorizer is expected to use max_n == 1.
    # NOTE(review): an earlier version of this comment said the bigram
    # representation was selected, which contradicts the assertion below --
    # confirm which one is intended.
    assert_equal(grid_search.best_score, 1.0)
    best_vectorizer = grid_search.best_estimator.named_steps['vect']
    assert_equal(best_vectorizer.analyzer.max_n, 1)
Esempio n. 3
def test_pickle():
    # Each estimator must come back from a pickle round trip as an object of
    # the same class.
    estimators = (
        CountVectorizer(),
        SparseCountVectorizer(),
        TfidfTransformer(),
        SparseTfidfTransformer(),
        Vectorizer(),
        SparseVectorizer(),
    )
    for estimator in estimators:
        restored = pickle.loads(pickle.dumps(estimator))
        assert_equal(type(restored), estimator.__class__)
Esempio n. 4
 def __init__(self, training, classes, vocabulary):
     """Load the training material and fit a ``LinearSVC`` on it.

     ``training``, ``classes`` and ``vocabulary`` are each handed to
     ``load``; the loaded vocabulary and samples are turned into lists
     with ``.tolist()``.
     """
     vocabulary = load(vocabulary)
     # NOTE(review): the assignment target was missing from this line in the
     # reviewed copy (it read ``load(vocabulary) = CountVectorizer(...)``,
     # which is a syntax error); ``self.vectorizer`` is an assumed name --
     # confirm against the original.
     self.vectorizer = CountVectorizer(vocabulary=vocabulary.tolist())
     self.samples = load(training).tolist()
     self.classes = load(classes)
     # NOTE(review): the ``.fit(...)`` call was truncated; vectorizing the
     # samples with the fixed vocabulary before fitting is assumed.
     self.classifier = LinearSVC().fit(
         self.vectorizer.transform(self.samples), self.classes)
Esempio n. 5
def test_vectorizer_max_df():
    # A term present in every document must be kept with max_df=1.0 and
    # dropped from the vocabulary once max_df is lowered to 0.5.
    test_data = [u'abc', u'dea']  # the letter a occurs in all strings
    vect = CountVectorizer(CharNGramAnalyzer(min_n=1, max_n=1), max_df=1.0)
    # the vocabulary has to be learned from the data before it is inspected
    vect.fit(test_data)
    assert u'a' in vect.vocabulary
    assert_equals(len(vect.vocabulary), 5)

    # refit so that the stricter threshold is actually applied
    vect.max_df = 0.5
    vect.fit(test_data)
    assert u'a' not in vect.vocabulary  # 'a' is ignored
    assert_equals(len(vect.vocabulary), 4)  # the others remain
Esempio n. 6
class SVM:
    """Text classifier pairing a fixed-vocabulary ``CountVectorizer`` with a
    ``LinearSVC`` fitted at construction time."""

    def __init__(self, training, classes, vocabulary):
        """Load the training material and fit the classifier.

        ``training``, ``classes`` and ``vocabulary`` are each handed to
        ``load``; the loaded vocabulary and samples are turned into lists
        with ``.tolist()``.
        """
        vocabulary = load(vocabulary)
        # NOTE(review): the assignment target was missing from this line in
        # the reviewed copy (a syntax error); ``self.vectorizer`` is an
        # assumed name -- confirm against the original.
        self.vectorizer = CountVectorizer(vocabulary=vocabulary.tolist())
        self.samples = load(training).tolist()
        self.classes = load(classes)
        # NOTE(review): the ``.fit(...)`` call was truncated; vectorizing the
        # samples with the fixed vocabulary before fitting is assumed.
        self.classifier = LinearSVC().fit(
            self.vectorizer.transform(self.samples), self.classes)

    def classify(self, text):
        """Return the class predicted for the single document ``text``."""
        # The text is wrapped in a list so it is handled as a one-document
        # batch; the first (only) prediction is returned.
        # NOTE(review): the call producing ``features`` was truncated to
        # ``=[text])``; transforming with the vectorizer is assumed.
        features = self.vectorizer.transform([text])
        return self.classifier.predict(features)[0]
Esempio n. 7
def get_clf(n=3, binarize=True):
    """Build a vectorizer + naive Bayes classification pipeline.

    When ``binarize`` is true a ``Binarizer`` step is added and the
    classifier is ``BernoulliNB``; otherwise the classifier is
    ``MultinomialNB``. Exactly one step named ``'clf'`` is appended.

    NOTE(review): the arguments of the vectorizer step were cut off in the
    reviewed copy, so a default ``CountVectorizer()`` is used and ``n`` is
    currently unused; it presumably configured the n-gram size -- restore
    once the original call is confirmed.
    """
    steps = [('vectorizer', CountVectorizer())]
    if binarize:
        steps.append(('binarizer', Binarizer(copy=False)))
        steps.append(('clf', naive_bayes.BernoulliNB()))
    else:
        # without this branch both classifiers were appended under the same
        # step name whenever ``binarize`` was true, and none otherwise
        steps.append(('clf', naive_bayes.MultinomialNB()))

    return Pipeline(steps)
Esempio n. 9
def test_vectorizer():
    # Exercise CountVectorizer, TfidfTransformer and the combined Vectorizer
    # on the food corpus, training on all documents but the last one.

    # results to be compared
    # NOTE(review): nothing is ever appended to `res`; it is kept only
    # because the function returns it.
    res = []

    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    # (the transformer is fitted on the training counts: the assertions below
    # need `idf_` to exist and expect one row per training document)
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    # (`counts_test` is the value left over from the last loop iteration)
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    # NOTE(review): the assignment target was missing here in the reviewed
    # copy; `tv.tc` (the Vectorizer's inner count vectorizer) is assumed so
    # that both pipelines use the same max_df -- confirm.
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    return res
Esempio n. 10
 def __init__(self, analyzer=BOWAnalyzer, max_df=1.0):
     """Initialise the ``CountVectorizer`` base with the given settings.

     ``analyzer`` and ``max_df`` are forwarded by keyword, unchanged.
     """
     init_kwargs = {'analyzer': analyzer, 'max_df': max_df}
     CountVectorizer.__init__(self, **init_kwargs)
Esempio n. 11
def test_vectorizer():
    # Exercise CountVectorizer, TfidfTransformer and the combined Vectorizer
    # on the food corpus, training on all documents but the last one.

    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, 'tocsr'):
        counts_train = counts_train.tocsr()
    assert_equal(counts_train[0, v1.vocabulary[u"pizza"]], 2)

    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary)

    # compare that the two vectorizer give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, 'tocsr'):
            counts_test = counts_test.tocsr()

        assert_equal(counts_test[0, v.vocabulary[u"salad"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"tomato"]], 1)
        assert_equal(counts_test[0, v.vocabulary[u"water"]], 1)

        # stop word from the fixed list
        assert_false(u"the" in v.vocabulary)

        # stop word found automatically by the vectorizer DF thresholding
        # words that are high frequent across the complete corpus are likely
        # to be not informative (either real stop words or extraction
        # artifacts)
        assert_false(u"copyright" in v.vocabulary)

        # not present in the sample
        assert_equal(counts_test[0, v.vocabulary[u"coke"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"burger"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"beer"]], 0)
        assert_equal(counts_test[0, v.vocabulary[u"pizza"]], 0)

    # test tf-idf
    # (the transformer is fitted on the training counts: the assertions below
    # need `idf_` to exist and expect one row per training document)
    t1 = TfidfTransformer(norm='l1')
    tfidf = toarray(t1.fit(counts_train).transform(counts_train))
    assert_equal(len(t1.idf_), len(v1.vocabulary))
    assert_equal(tfidf.shape, (n_train, len(v1.vocabulary)))

    # test tf-idf with new data
    # (`counts_test` is the value left over from the last loop iteration)
    tfidf_test = toarray(t1.transform(counts_test))
    assert_equal(tfidf_test.shape, (len(test_data), len(v1.vocabulary)))

    # test tf alone
    t2 = TfidfTransformer(norm='l1', use_idf=False)
    tf = toarray(t2.fit(counts_train).transform(counts_train))
    assert_equal(t2.idf_, None)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = Vectorizer(norm='l1')
    # NOTE(review): the assignment target was missing here in the reviewed
    # copy; `tv.tc` (the Vectorizer's inner count vectorizer) is assumed so
    # that both pipelines use the same max_df -- confirm.
    tv.tc.max_df = v1.max_df
    tfidf2 = toarray(tv.fit_transform(train_data))
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = toarray(tv.transform(test_data))
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    assert_raises(ValueError, v3.transform, train_data)
Esempio n. 12
def test_fit_countvectorizer_twice():
    # Calling fit_transform a second time on a different slice of the corpus
    # must give a matrix with a different number of columns than the first.
    vectorizer = CountVectorizer()
    first_width = vectorizer.fit_transform(ALL_FOOD_DOCS[:5]).shape[1]
    second_width = vectorizer.fit_transform(ALL_FOOD_DOCS[5:]).shape[1]
    assert_not_equal(first_width, second_width)
Esempio n. 13
# NOTE(review): `dataset`, `n_samples_total` and `docs_train` are used below
# but are not assigned in this fragment; presumably the part of the script
# that precedes it sets them up -- confirm.
half = n_samples_total // 2  # floor division: same split on Python 2 and 3
docs_test = [open(f).read() for f in dataset.filenames[half:]]

# NOTE(review): the object being sliced was missing in the reviewed copy;
# `dataset.target` (the label array that goes with `dataset.filenames`) is
# assumed -- confirm.
y_train = dataset.target[:half]
y_test = dataset.target[half:]

# Build a an analyzer that split strings into sequence of 1 to 3 characters
# after using the previous preprocessor
# NOTE(review): the arguments of this call were cut off; the 1..3 range comes
# from the comment above. That comment also implies a preprocessor argument
# whose name is not visible here -- restore it once confirmed.
analyzer = CharNGramAnalyzer(min_n=1, max_n=3)

# Build a vectorizer / classifier pipeline using the previous analyzer
clf = Pipeline([
    ('vec', CountVectorizer(analyzer=analyzer)),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', LinearSVC(loss='l2', penalty='l1', dual=False, C=100)),
])

# Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# Predict the outcome on the testing set
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(y_test, y_predicted))
dataset = load_files(movie_reviews_data_folder)

# split the dataset in training and test set:
n_samples_total = dataset.filenames.shape[0]

# first three quarters for training, the rest for testing
# (floor division: same split on Python 2 and 3)
split = (n_samples_total * 3) // 4

docs_train = [open(f).read() for f in dataset.filenames[:split]]
docs_test = [open(f).read() for f in dataset.filenames[split:]]

# NOTE(review): the object being sliced was missing in the reviewed copy;
# `dataset.target` (the label array that goes with `dataset.filenames`) is
# assumed -- confirm.
y_train = dataset.target[:split]
y_test = dataset.target[split:]

# Build a vectorizer / classifier pipeline using the previous analyzer
pipeline = Pipeline([
    ('vect', CountVectorizer(max_features=100000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1000)),
])

parameters = {
    'vect__analyzer__max_n': (1, 2),
    'vect__max_df': (.95,),
}

# Fit the pipeline on the training set using grid search for the parameters
# (only the first 200 training documents are used for the search)
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
grid_search.fit(docs_train[:200], y_train[:200])

# Refit the best parameter set on the complete training set
# NOTE(review): the receiver of this call was missing in the reviewed copy;
# `grid_search.best_estimator` is assumed -- confirm.
clf = grid_search.best_estimator.fit(docs_train, y_train)
Esempio n. 15
# Uncomment the following to do the analysis on all the categories
#categories = None

# NOTE(review): `categories` is not assigned in this fragment; presumably it
# is defined just above in the full script -- confirm.
# Single-argument print(...) calls give the same output under Python 2's
# print statement and Python 3's print function.
print("Loading 20 newsgroups dataset for categories:")
print(categories)

data = fetch_20newsgroups(subset='train', categories=categories)
print("%d documents" % len(data.filenames))
print("%d categories" % len(data.target_names))

# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

parameters = {
    # uncommenting more parameters will give better exploring power but will
    # increase processing time in a combinatorial way
    'vect__max_df': (0.5, 0.75, 1.0),
    #    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__analyzer__max_n': (1, 2),  # words or bigrams
    #    'tfidf__use_idf': (True, False),
    #    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l2', 'elasticnet'),
    #    'clf__n_iter': (10, 50, 80),