class DocMethodsTestCase(unittest.TestCase):
    """Exercise the public accessors and exporters of ``Doc``.

    NOTE(review): a second class with this same name is defined later in
    this file and will shadow this one at import time — confirm whether
    this older copy should be removed.
    """

    def setUp(self):
        # Parse the shared fixture text once per test with the small
        # English spacy pipeline.
        self.doc = Doc(TEXT.strip(), lang='en_core_web_sm')

    def test_n_tokens_and_sents(self):
        """Token and sentence counts match the known fixture values."""
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        """``count`` handles unigrams, bigrams, and absent terms."""
        self.assertEqual(self.doc.count('statistical'), 3)
        self.assertEqual(self.doc.count('machine learning'), 2)
        self.assertEqual(self.doc.count('foo'), 0)

    def test_tokenized_text(self):
        """``tokenized_text`` is a per-sentence list of unicode tokens."""
        tokenized_text = self.doc.tokenized_text
        self.assertIsInstance(tokenized_text, list)
        self.assertIsInstance(tokenized_text[0], list)
        self.assertIsInstance(tokenized_text[0][0], compat.unicode_)
        self.assertEqual(len(tokenized_text), self.doc.n_sents)

    def test_pos_tagged_text(self):
        """``pos_tagged_text`` is a per-sentence list of (token, tag) pairs."""
        pos_tagged_text = self.doc.pos_tagged_text
        self.assertIsInstance(pos_tagged_text, list)
        self.assertIsInstance(pos_tagged_text[0], list)
        self.assertIsInstance(pos_tagged_text[0][0], tuple)
        self.assertIsInstance(pos_tagged_text[0][0][0], compat.unicode_)
        self.assertEqual(len(pos_tagged_text), self.doc.n_sents)

    def test_to_terms_list(self):
        """``to_terms_list`` honors ``as_strings``/``normalize``/``ngrams``."""
        full_terms_list = list(self.doc.to_terms_list(as_strings=True))
        full_terms_list_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(full_terms_list), len(full_terms_list_ids))
        self.assertIsInstance(full_terms_list[0], compat.unicode_)
        # FIX: spacy string-store ids are ``long`` on Python 2, so check
        # against ``compat.int_types`` (as the rest of the file does)
        # rather than the too-narrow ``int``.
        self.assertIsInstance(full_terms_list_ids[0], compat.int_types)
        self.assertNotEqual(
            full_terms_list[0],
            list(self.doc.to_terms_list(as_strings=True, normalize=False))[0])
        # Restricting the ngram range must shrink the terms list.
        # (A verbatim-duplicated ``ngrams=False`` assertion was removed.)
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=False))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=1))),
                        len(full_terms_list))
        self.assertLess(len(list(self.doc.to_terms_list(ngrams=(1, 2)))),
                        len(full_terms_list))

    def test_to_bag_of_words(self):
        """``to_bag_of_words`` supports count/binary/freq weightings and string keys."""
        bow = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(bow, dict)
        # FIX: use the ``compat.int_types`` alias consistently instead of
        # the bare ``int_types`` name used only in this method.
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        bow = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], int)
        # Binary weighting caps every value at 1.
        for value in list(bow.values())[0:10]:
            self.assertLess(value, 2)
        bow = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.int_types)
        self.assertIsInstance(list(bow.values())[0], float)
        bow = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(bow, dict)
        self.assertIsInstance(list(bow.keys())[0], compat.unicode_)
class DocMethodsTestCase(unittest.TestCase):
    """Behavioral tests for ``Doc``: accessors, exporters, and (de)serialization."""

    def setUp(self):
        # Keep scratch files next to this test module so tearDown can
        # remove the whole directory in one call.
        here = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_doc', dir=here)
        self.doc = Doc(TEXT.strip(), lang='en', metadata={'foo': 'bar!'})

    def test_n_tokens_and_sents(self):
        """The fixture parses into the expected token/sentence counts."""
        self.assertEqual(self.doc.n_tokens, 241)
        self.assertEqual(self.doc.n_sents, 8)

    def test_term_count(self):
        """``count`` matches unigrams and bigrams; unknown terms count 0."""
        for term, n_expected in (('statistical', 3),
                                 ('machine learning', 2),
                                 ('foo', 0)):
            self.assertEqual(self.doc.count(term), n_expected)

    def test_tokenized_text(self):
        """Each sentence becomes a list of unicode token strings."""
        sents = self.doc.tokenized_text
        self.assertIsInstance(sents, list)
        self.assertIsInstance(sents[0], list)
        self.assertIsInstance(sents[0][0], compat.unicode_)
        self.assertEqual(len(sents), self.doc.n_sents)

    def test_pos_tagged_text(self):
        """Each sentence becomes a list of (token, tag) unicode tuples."""
        tagged = self.doc.pos_tagged_text
        self.assertIsInstance(tagged, list)
        self.assertIsInstance(tagged[0], list)
        self.assertIsInstance(tagged[0][0], tuple)
        self.assertIsInstance(tagged[0][0][0], compat.unicode_)
        self.assertEqual(len(tagged), self.doc.n_sents)

    def test_to_terms_list(self):
        """``to_terms_list`` respects as_strings/normalize/ngrams options."""
        as_strs = list(self.doc.to_terms_list(as_strings=True))
        as_ids = list(self.doc.to_terms_list(as_strings=False))
        self.assertEqual(len(as_strs), len(as_ids))
        self.assertIsInstance(as_strs[0], compat.unicode_)
        self.assertIsInstance(as_ids[0], compat.int_types)
        unnormalized = list(
            self.doc.to_terms_list(as_strings=True, normalize=False))
        self.assertNotEqual(as_strs[0], unnormalized[0])
        # Any restriction of the ngram range must yield fewer terms than
        # the full list (same four calls as before, expressed as a loop).
        for ngrams in (False, 1, (1, 2), False):
            restricted = list(self.doc.to_terms_list(ngrams=ngrams))
            self.assertLess(len(restricted), len(as_strs))

    def test_to_bag_of_words(self):
        """Count/binary/freq weightings and string keys all behave as documented."""
        counts = self.doc.to_bag_of_words(weighting='count')
        self.assertIsInstance(counts, dict)
        self.assertIsInstance(next(iter(counts.keys())), compat.int_types)
        self.assertIsInstance(next(iter(counts.values())), int)
        binary = self.doc.to_bag_of_words(weighting='binary')
        self.assertIsInstance(binary, dict)
        self.assertIsInstance(next(iter(binary.keys())), compat.int_types)
        self.assertIsInstance(next(iter(binary.values())), int)
        # Binary weighting caps every value at 1.
        for val in list(binary.values())[:10]:
            self.assertLess(val, 2)
        freqs = self.doc.to_bag_of_words(weighting='freq')
        self.assertIsInstance(freqs, dict)
        self.assertIsInstance(next(iter(freqs.keys())), compat.int_types)
        self.assertIsInstance(next(iter(freqs.values())), float)
        strings = self.doc.to_bag_of_words(as_strings=True)
        self.assertIsInstance(strings, dict)
        self.assertIsInstance(next(iter(strings.keys())), compat.unicode_)

    def test_doc_save_and_load(self):
        """A saved-then-loaded Doc round-trips length, lang, and metadata."""
        path = os.path.join(self.tempdir, 'test_doc_save_and_load.pkl')
        self.doc.save(path)
        reloaded = Doc.load(path)
        self.assertIsInstance(reloaded, Doc)
        self.assertEqual(len(reloaded), len(self.doc))
        self.assertEqual(reloaded.lang, self.doc.lang)
        self.assertEqual(reloaded.metadata, self.doc.metadata)

    def tearDown(self):
        # Remove the scratch directory (and any saved doc files) created
        # in setUp.
        shutil.rmtree(self.tempdir)