def test_load_nips_raw():
    """Check ``data.load_nips(raw=True)`` for one year and for the default call.

    Restricting to 2008 is expected to give 250 documents; calling without
    ``years`` is expected to give 2261. In both cases the first document
    must be a string (``basestring``).
    """
    # (extra keyword arguments, expected number of documents)
    cases = (
        ({'years': [2008]}, 250),
        ({}, 2261),
    )
    for selection, expected_count in cases:
        papers = data.load_nips(raw=True, **selection)
        assert_equal(len(papers), expected_count)
        assert_true(isinstance(papers[0], basestring))
def test_load_nips_tokenized():
    """Check ``data.load_nips(raw=False)`` for one year and for the default call.

    Restricting to 2008 is expected to give 250 documents; calling without
    ``years`` is expected to give 2261. In both cases the first document
    must be a ``list`` (presumably of tokens -- the element type is not
    checked here).
    """
    # (extra keyword arguments, expected number of documents)
    cases = (
        ({'years': [2008]}, 250),
        ({}, 2261),
    )
    for selection, expected_count in cases:
        papers = data.load_nips(raw=False, **selection)
        assert_equal(len(papers), expected_count)
        assert_true(isinstance(papers[0], list))
def test_label_finder():
    """Check the top five PMI bigram labels found in the 2009 NIPS documents.

    The finder is built with ``measure='pmi'`` and ``pos=None`` and run on
    ``load_nips(years=[2009])`` with ``top_n=5``; the result must equal the
    expected list below, in order.

    This test used to be defined twice under the same name with identical
    logic; the later definition silently replaced the earlier one, so only
    a single copy is kept.
    """
    finder = BigramLabelFinder(measure='pmi', pos=None)
    labels = finder.find(load_nips(years=[2009]), top_n=5)

    expected = [
        (u'monte', u'carlo'),
        (u'high', u'dimensional'),
        (u'does', u'not'),  # not so good
        (u'experimental', u'results'),
        (u'nonparametric', u'bayesian'),
    ]
    assert_equal(labels, expected)
def test_label_finder_with_pos():
    """Check PMI bigram labels when the finder is given POS patterns.

    The 2009 NIPS documents are passed through ``CorpusPOSTagger.transform``
    and then searched by a finder built with the patterns ``('NN', 'NN')``
    and ``('JJ', 'NN')``. The top five labels are verified twice: once with
    ``strip_tags=False`` (each word paired with its tag) and once with the
    default call (plain word pairs).
    """
    expected_tagged = [
        ((u'monte', 'NN'), (u'carlo', 'NN')),
        ((u'nonparametric', 'JJ'), (u'bayesian', 'NN')),
        ((u'active', 'JJ'), (u'learning', 'NN')),
        ((u'machine', 'NN'), (u'learning', 'NN')),
        ((u'semi-supervised', 'JJ'), (u'learning', 'NN')),
    ]
    expected_plain = [
        (u'monte', u'carlo'),
        (u'nonparametric', u'bayesian'),
        (u'active', u'learning'),
        (u'machine', u'learning'),
        (u'semi-supervised', u'learning'),
    ]

    tagger = CorpusPOSTagger()
    finder = BigramLabelFinder(
        measure='pmi',
        pos=[('NN', 'NN'), ('JJ', 'NN')],
    )
    raw_docs = load_nips(years=[2009])
    tagged_docs = tagger.transform(raw_docs)

    # Tags kept in the output.
    tagged_labels = finder.find(tagged_docs, top_n=5, strip_tags=False)
    assert_equal(tagged_labels, expected_tagged)

    # Default call: the same bigrams come back as plain word pairs.
    plain_labels = finder.find(tagged_docs, top_n=5)
    assert_equal(plain_labels, expected_plain)