Beispiel #1
0
def test_word_analyzer_unigrams_and_bigrams():
    """min_n=1, max_n=2 must yield all unigrams followed by all bigrams."""
    wa = WordNGramAnalyzer(min_n=1, max_n=2, stop_words=None)

    text = u"J'ai mang\xe9 du kangourou  ce midi, c'\xe9tait pas tr\xeas bon."
    # Accents are stripped and apostrophes split tokens, so the unigram
    # sequence is plain-ASCII words in document order.
    unigrams = [u"ai", u"mange", u"du", u"kangourou", u"ce",
                u"midi", u"etait", u"pas", u"tres", u"bon"]
    # Bigrams are adjacent unigram pairs joined by a single space.
    bigrams = [u" ".join(pair) for pair in zip(unigrams, unigrams[1:])]
    assert_equal(wa.analyze(text), unigrams + bigrams)
Beispiel #2
0
def test_word_analyzer_unigrams_and_bigrams():
    # The analyzer should emit unigrams first, then space-joined bigrams,
    # with accents folded to ASCII and punctuation removed.
    analyzer = WordNGramAnalyzer(min_n=1, max_n=2, stop_words=None)

    text = u"J'ai mang\xe9 du kangourou  ce midi, c'\xe9tait pas tr\xeas bon."
    tokens = u'ai mange du kangourou ce midi etait pas tres bon'.split()
    pairs = [tokens[i] + u' ' + tokens[i + 1]
             for i in range(len(tokens) - 1)]
    assert_equal(analyzer.analyze(text), tokens + pairs)
Beispiel #3
0
def test_word_analyzer_unigrams_and_bigrams():
    """Unigram + bigram extraction on accented French text."""
    wa = WordNGramAnalyzer(min_n=1, max_n=2, stop_words=None)

    text = u"J'ai mang\xe9 du kangourou  ce midi, c'\xe9tait pas tr\xeas bon."
    expected = [
        # unigrams, accent-stripped, in document order
        u'ai', u'mange', u'du', u'kangourou', u'ce', u'midi', u'etait',
        u'pas', u'tres', u'bon',
        # bigrams: each adjacent pair joined by a space
        u'ai mange', u'mange du', u'du kangourou', u'kangourou ce',
        u'ce midi', u'midi etait', u'etait pas', u'pas tres', u'tres bon',
    ]
    assert_equal(wa.analyze(text), expected)
Beispiel #4
0
def test_word_analyzer_unigrams():
    """Unigram analysis must accept unicode, byte-string and file-like input."""
    wa = WordNGramAnalyzer(min_n=1, max_n=1, stop_words=None)

    cases = [
        # unicode input: accents stripped, apostrophes split, stop chars gone
        (u"J'ai mang\xe9 du kangourou  ce midi, c'\xe9tait pas tr\xeas bon.",
         u'ai mange du kangourou ce midi etait pas tres bon'.split()),
        # plain string with punctuation and embedded newlines
        ("This is a test, really.\n\n I met Harry yesterday.",
         u'this is test really met harry yesterday'.split()),
        # file-like object: analyzer should read from it transparently
        (StringIO("This is a test with a file-like object!"),
         u'this is test with file like object'.split()),
    ]
    for text, expected in cases:
        assert_equal(wa.analyze(text), expected)
Beispiel #5
0
def test_word_analyzer_unigrams():
    # One analyzer instance must handle unicode text, a plain string,
    # and a file-like object alike.
    analyzer = WordNGramAnalyzer(min_n=1, max_n=1, stop_words=None)

    french = u"J'ai mang\xe9 du kangourou  ce midi, c'\xe9tait pas tr\xeas bon."
    assert_equal(
        analyzer.analyze(french),
        [u'ai', u'mange', u'du', u'kangourou', u'ce', u'midi',
         u'etait', u'pas', u'tres', u'bon'])

    english = "This is a test, really.\n\n I met Harry yesterday."
    assert_equal(
        analyzer.analyze(english),
        [u'this', u'is', u'test', u'really', u'met', u'harry',
         u'yesterday'])

    stream = StringIO("This is a test with a file-like object!")
    assert_equal(
        analyzer.analyze(stream),
        [u'this', u'is', u'test', u'with', u'file', u'like',
         u'object'])
Beispiel #6
0
def test_word_analyzer_unigrams():
    """Check unigram tokenization across the three supported input kinds."""
    wa = WordNGramAnalyzer(min_n=1, max_n=1, stop_words=None)

    # Unicode input: accents are transliterated to ASCII equivalents.
    assert_equal(
        wa.analyze(u"J'ai mang\xe9 du kangourou  ce midi, "
                   u"c'\xe9tait pas tr\xeas bon."),
        u'ai mange du kangourou ce midi etait pas tres bon'.split())

    # Byte-string input with punctuation and blank lines.
    assert_equal(
        wa.analyze("This is a test, really.\n\n I met Harry yesterday."),
        u'this is test really met harry yesterday'.split())

    # File-like input is consumed the same way as strings.
    assert_equal(
        wa.analyze(StringIO("This is a test with a file-like object!")),
        u'this is test with file like object'.split())
Beispiel #7
0
def test_unicode_decode_error():
    """A mismatched charset must raise UnicodeDecodeError, not mangle text.

    decode_error defaults to 'strict', so feeding UTF-8 bytes to an
    analyzer configured for ASCII has to fail loudly.
    """
    # Encode a unicode string containing non-ASCII characters as UTF-8.
    text = u"J'ai mang\xe9 du kangourou  ce midi, c'\xe9tait pas tr\xeas bon."
    text_bytes = text.encode('utf-8')

    # Both the word-level and char-level analyzers must refuse to decode
    # these bytes as ASCII.
    for analyzer in (
            WordNGramAnalyzer(min_n=1, max_n=2, stop_words=None,
                              charset='ascii'),
            CharNGramAnalyzer(min_n=1, max_n=2, charset='ascii')):
        assert_raises(UnicodeDecodeError, analyzer.analyze, text_bytes)
Beispiel #8
0
#    'svc': [{'probability': True}],
}

# split a training set and a test set
# NOTE(review): `iter` shadows the builtin iter(); consider renaming it
# (e.g. `cv_split`) together with the uses in the loop below.
# ShuffleSplit yields (train_index, test_index) boolean masks
# (indices=False) over the num_posts samples, one iteration only, with
# 15% of the data held out for testing.
iter = ShuffleSplit(num_posts, n_iterations=1, test_fraction=0.15, indices=False)
for (iter_no, (train_index, test_index)) in enumerate(iter):
    print 'Iteration no. %d' %(iter_no + 1)
    y_train = np.array([ x for (x, y) in zip(all_data['target'], train_index) if y ])
    y_test  = np.array([ x for (x, y) in zip(all_data['target'], test_index) if y ])
    print 'Sampled %d training and %d test posts' %(len(y_train), len(y_test))

    print "Extracting features from the training dataset using a sparse vectorizer"
    t0 = time()
    title_vectorizer = Vectorizer(
        analyzer=WordNGramAnalyzer(
            charset='utf-8', 
            stop_words=set(['a', 'an', 'and', 'in', 'is', 'of', 'on', 'the', 'to']),
            )
        )
    title_train = title_vectorizer.fit_transform([ x for (x, y) in zip(all_data['title'], train_index) if y ])
    
    domain_vectorizer = extract.SimpleVectorizer()
    domain_train = domain_vectorizer.fit_transform([ x for (x, y) in zip(all_data['domain'], train_index) if y ])
    X_train = title_train
    print "done in %fs" % (time() - t0)
    print "n_samples: %d, n_features: %d" % X_train.shape
    print

    print "Extracting features from the test dataset using the same vectorizer"
    t0 = time()
    title_test = title_vectorizer.transform([ x for (x, y) in zip(all_data['title'], test_index) if y ])
    domain_test = domain_vectorizer.transform([ x for (x, y) in zip(all_data['domain'], test_index) if y ])