class DocMethodsTestCase(unittest.TestCase):
    """Exercise the public accessors and exporters of ``Doc``.

    NOTE(review): a second class with this same name is defined later in
    this file and will shadow this one at import time — confirm whether
    this older copy should be removed.
    """

    def setUp(self):
        # Parse the shared fixture text once per test with the small
        # English spacy pipeline.
        self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')

    def test_n_tokens_and_sents(self):
        """Token and sentence counts match the known fixture values."""
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        """``count`` handles unigrams, bigrams, and absent terms."""
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        """``tokenized_text`` is a per-sentence list of unicode tokens."""
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        """``pos_tagged_text`` is a per-sentence list of (token, tag) pairs."""
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        """``to_terms_list`` honors ``as_strings``/``normalize``/``ngrams``."""
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        # FIX: spacy string-store ids are ``long`` on Python 2, so check
        # against ``compat.int_types`` (as the rest of the file does)
        # rather than the too-narrow ``int``.
        self.assertIsInstance(full_terms_list_ids[0], compat.int_types)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        # Restricting the ngram range must shrink the terms list.
        # (A verbatim-duplicated ``ngrams=False`` assertion was removed.)
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        """``to_bag_of_words`` supports count/binary/freq weightings and string keys."""
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        # FIX: use the ``compat.int_types`` alias consistently instead of
        # the bare ``int_types`` name used only in this method.
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        # Binary weighting caps every value at 1.
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)
class DocMethodsTestCase(unittest.TestCase):
    """Behavioral tests for ``Doc``: accessors, exporters, and (de)serialization."""

    def setUp(self):
        # Keep scratch files next to this test module so tearDown can
        # remove the whole directory in one call.
        here = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_doc', dir=here)
        self.doc = Doc(TEXT.strip(), lang='en', metadata={'foo': 'bar!'})

    def test_n_tokens_and_sents(self):
        """The fixture parses into the expected token/sentence counts."""
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        """``count`` matches unigrams and bigrams; unknown terms count 0."""
        for term, n_expected in (('statistical', 3),
                                 ('machine learning', 2),
                                 ('foo', 0)):
            self.assertEqual(self.doc.count(term), n_expected)

    def test_tokenized_text(self):
        """Each sentence becomes a list of unicode token strings."""
        sents = self.doc.tokenized_text
        self.assertIsInstance(sents, list)
        self.assertIsInstance(sents[0], list)
        self.assertIsInstance(sents[0][0], compat.unicode_)
        self.assertEqual(len(sents), self.doc.n_sents)

    def test_pos_tagged_text(self):
        """Each sentence becomes a list of (token, tag) unicode tuples."""
        tagged = self.doc.pos_tagged_text
        self.assertIsInstance(tagged, list)
        self.assertIsInstance(tagged[0], list)
        self.assertIsInstance(tagged[0][0], tuple)
        self.assertIsInstance(tagged[0][0][0], compat.unicode_)
        self.assertEqual(len(tagged), self.doc.n_sents)

    def test_to_terms_list(self):
        """``to_terms_list`` respects as_strings/normalize/ngrams options."""
        as_strs = list(self.doc.to_terms_list(as_strings=True))
        as_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(as_strs), len(as_ids))
        self.assertIsInstance(as_strs[0], compat.unicode_)
        self.assertIsInstance(as_ids[0], compat.int_types)
        unnormalized = list(
            self.doc.to_terms_list(as_strings=True, normalize=False))
        self.assertNotEqual(as_strs[0], unnormalized[0])
        # Any restriction of the ngram range must yield fewer terms than
        # the full list (same four calls as before, expressed as a loop).
        for ngrams in (False, 1, (1, 2), False):
            restricted = list(self.doc.to_terms_list(ngrams=ngrams))
            self.assertLess(len(restricted), len(as_strs))

    def test_to_bag_of_words(self):
        """Count/binary/freq weightings and string keys all behave as documented."""
        counts = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(counts, dict)
        self.assertIsInstance(next(iter(counts.keys())), compat.int_types)
        self.assertIsInstance(next(iter(counts.values())), int)
        binary = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(binary, dict)
        self.assertIsInstance(next(iter(binary.keys())), compat.int_types)
        self.assertIsInstance(next(iter(binary.values())), int)
        # Binary weighting caps every value at 1.
        for val in list(binary.values())[:10]:
            self.assertLess(val, 2)
        freqs = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(freqs, dict)
        self.assertIsInstance(next(iter(freqs.keys())), compat.int_types)
        self.assertIsInstance(next(iter(freqs.values())), float)
        strings = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(strings, dict)
        self.assertIsInstance(next(iter(strings.keys())), compat.unicode_)

    def test_doc_save_and_load(self):
        """A saved-then-loaded Doc round-trips length, lang, and metadata."""
        path = os.path.join(self.tempdir, 'test_doc_save_and_load.pkl')
        self.doc.save(path)
        reloaded = Doc.load(path)
        self.assertIsInstance(reloaded, Doc)
        self.assertEqual(len(reloaded), len(self.doc))
        self.assertEqual(reloaded.lang, self.doc.lang)
        self.assertEqual(reloaded.metadata, self.doc.metadata)

    def tearDown(self):
        # Remove the scratch directory (and any saved doc files) created
        # in setUp.
        shutil.rmtree(self.tempdir)