def test_get_term_idx_and_x(self):
		"""Build a two-document corpus and check its term index and count matrix.

		Asserts that the factory's term index store holds seven entries with
		indices 0..6 (the unigrams and bigrams listed below), that the counts
		stored in ``_X`` for each (document, term) pair match the expected
		values, and that ``build()`` returns a ``ParsedCorpus``.
		"""
		docs = [whitespace_nlp('aa aa bb.'),
		        whitespace_nlp('bb aa a.')]
		df = pd.DataFrame({'category': ['a', 'b'],
		                   'parsed': docs})
		corpus_fact = CorpusFromParsedDocuments(df, category_col='category', parsed_col='parsed')
		corpus = corpus_fact.build()

		# items() yields (index, term) pairs, as the two assertions below rely on.
		index_term_pairs = list(corpus_fact._term_idx_store.items())
		indices = [idx for idx, _ in index_term_pairs]
		terms = [term for _, term in index_term_pairs]
		self.assertEqual(sorted(indices), list(range(7)))
		self.assertEqual(sorted(terms),
		                 ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

		def assert_word_in_doc_cnt(doc, word, count):
			# Look up the term's column and compare the stored count.
			self.assertEqual(corpus_fact._X[doc, corpus_fact._term_idx_store.getidx(word)], count)

		# (document row, term, expected count)
		expected_counts = [
			(0, 'aa', 2),
			(0, 'bb', 1),
			(0, 'aa aa', 1),
			(0, 'aa bb', 1),
			(0, 'bb aa', 0),
			(1, 'bb', 1),
			(1, 'aa', 1),
			(1, 'a', 1),
			(1, 'bb aa', 1),
			(1, 'aa aa', 0),
			(1, 'aa a', 1),
		]
		for doc_row, term, expected in expected_counts:
			assert_word_in_doc_cnt(doc_row, term, expected)

		# assertIsInstance reports the actual type on failure.
		self.assertIsInstance(corpus, ParsedCorpus)
Esempio n. 2
0
	def test_get_term_idx_and_x(self):
		"""Build a two-document corpus and check its term index and count matrix.

		Asserts that the factory's term index store holds seven entries with
		indices 0..6, that the counts stored in ``_X`` for each
		(document, term) pair match the expected values, and that ``build()``
		returns a ``ParsedCorpus``.
		"""
		docs = [whitespace_nlp('aa aa bb.'),
		        whitespace_nlp('bb aa a.')]
		df = pd.DataFrame({'category': ['a', 'b'],
		                   'parsed': docs})
		corpus_fact = CorpusFromParsedDocuments(df, category_col='category', parsed_col='parsed')
		corpus = corpus_fact.build()

		# items() yields (index, term) pairs, as the two assertions below rely on.
		index_term_pairs = list(corpus_fact._term_idx_store.items())
		indices = [idx for idx, _ in index_term_pairs]
		terms = [term for _, term in index_term_pairs]
		self.assertEqual(sorted(indices), list(range(7)))
		self.assertEqual(sorted(terms),
		                 ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

		def assert_word_in_doc_cnt(doc, word, count):
			# Look up the term's column and compare the stored count.
			self.assertEqual(corpus_fact._X[doc, corpus_fact._term_idx_store.getidx(word)], count)

		# Document 0: 'aa aa bb.'
		assert_word_in_doc_cnt(0, 'aa', 2)
		assert_word_in_doc_cnt(0, 'bb', 1)
		assert_word_in_doc_cnt(0, 'aa aa', 1)
		assert_word_in_doc_cnt(0, 'aa bb', 1)
		assert_word_in_doc_cnt(0, 'bb aa', 0)

		# Document 1: 'bb aa a.'
		assert_word_in_doc_cnt(1, 'bb', 1)
		assert_word_in_doc_cnt(1, 'aa', 1)
		assert_word_in_doc_cnt(1, 'a', 1)
		assert_word_in_doc_cnt(1, 'bb aa', 1)
		assert_word_in_doc_cnt(1, 'aa aa', 0)
		assert_word_in_doc_cnt(1, 'aa a', 1)

		# assertIsInstance reports the actual type on failure.
		self.assertIsInstance(corpus, ParsedCorpus)
Esempio n. 3
0
 def test_entity_tags(self):
     """Check features when entity types and tag types are censored.

     The document is built with 'bb' mapped to 'BAD' and 'Bob' mapped to
     'NNP'. With only the 'BAD' entity type censored, the expected features
     contain '_BAD' and no 'bb'. With 'NNP' tags censored as well, the
     expected features contain 'NNP' in place of 'bob'.
     """
     doc = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})

     # Entity censoring only.
     expected_entity_only = Counter({
         'a': 2,
         'a _BAD': 1,
         '_BAD cc': 1,
         'cc': 1,
         'a a': 1,
         '_BAD': 1,
         'bob': 1,
         'cc bob': 1
     })
     entity_only_getter = FeatsFromSpacyDoc(entity_types_to_censor={'BAD'})
     term_freq = entity_only_getter.get_feats(doc)
     self.assertEqual(expected_entity_only, term_freq)

     # Entity and tag censoring together.
     expected_entity_and_tag = Counter({
         'a': 2,
         'a _BAD': 1,
         '_BAD cc': 1,
         'cc': 1,
         'a a': 1,
         '_BAD': 1,
         'NNP': 1,
         'cc NNP': 1
     })
     entity_and_tag_getter = FeatsFromSpacyDoc(entity_types_to_censor={'BAD'},
                                               tag_types_to_censor={'NNP'})
     term_freq = entity_and_tag_getter.get_feats(doc)
     self.assertEqual(expected_entity_and_tag, term_freq)
Esempio n. 4
0
 def test_main(self):
     """Check term and metadata features from FeatsFromSpacyDocAndEmpath.

     A getter is built with ``mock_empath_analyze`` injected, the ``empath``
     module is replaced in ``sys.modules`` by a Mock, and the default
     constructor is then called once with no arguments. The test asserts
     the unigram/bigram term counts for the document, that the metadata has
     ``'ridicule'`` equal to 1, and that ``'empath_fashion'`` is absent.
     """
     # Fall back to the stdlib mock only when the third-party package is
     # missing; a bare except would also hide unrelated errors.
     try:
         from mock import Mock
     except ImportError:
         from unittest.mock import Mock
     feat_getter = FeatsFromSpacyDocAndEmpath(
         empath_analyze_function=mock_empath_analyze)
     sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
     # Default construction after the module stub is installed.
     FeatsFromSpacyDocAndEmpath()
     doc = whitespace_nlp('Hello this is a document.')
     term_freq = feat_getter.get_feats(doc)
     expected_terms = {
         'document': 1,
         'hello': 1,
         'is': 1,
         'this': 1,
         'a document': 1,
         'hello this': 1,
         'is a': 1,
         'a': 1,
         'this is': 1
     }
     self.assertEqual(set(term_freq.items()), set(expected_terms.items()))
     metadata_freq = feat_getter.get_doc_metadata(doc)
     self.assertEqual(metadata_freq['ridicule'], 1)
     self.assertNotIn('empath_fashion', metadata_freq)
    def test_hamlet(self):
        """Build a corpus from the Hamlet documents and check counts for 'play'.

        Asserts the per-category frequencies of 'play' in the term frequency
        DataFrame, that every search hit for 'play' contains the string
        'play' (and none contains the misspelling 'plfay'), and that the
        'play' column of the factory's term-document matrix sums to the same
        total.
        """
        raw_docs = get_hamlet_docs()
        categories = [
            get_hamlet_snippet_binary_category(doc) for doc in raw_docs
        ]
        docs = [whitespace_nlp(doc) for doc in raw_docs]
        df = pd.DataFrame({'category': categories, 'parsed': docs})
        corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
        corpus = corpus_fact.build()
        tdf = corpus.get_term_freq_df()
        # DataFrame.ix was removed in pandas 1.0; .loc is the label-based
        # equivalent for a string row label.
        self.assertEqual(list(tdf.loc['play']), [37, 5])
        self.assertFalse(
            any(
                corpus.search('play').apply(
                    lambda x: 'plfay' in str(x['parsed']), axis=1)))
        self.assertTrue(
            all(
                corpus.search('play').apply(
                    lambda x: 'play' in str(x['parsed']), axis=1)))

        # TODO: verify the full term-document matrix, not just one column.
        play_term_idx = corpus_fact._term_idx_store.getidx('play')
        play_X = corpus_fact._X.todok()[:, play_term_idx]

        self.assertEqual(play_X.sum(), 37 + 5)
Esempio n. 6
0
	def setUp(cls):
		"""Prepare categories, parsed documents, a DataFrame and a corpus factory.

		The factory is stored unbuilt on ``corpus_fact``.
		NOTE(review): the receiver is named ``cls`` although the method is
		called ``setUp``; the name is kept unchanged.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
		frame_columns = {'category': cls.categories,
		                 'parsed': cls.parsed_docs}
		cls.df = pd.DataFrame(frame_columns)
		cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
	def setUp(cls):
		"""Load the sample documents and create an unbuilt corpus factory.

		Stores the categories, raw documents, parsed documents, the DataFrame
		built from them, and the ``CorpusFromParsedDocuments`` factory.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = list(map(whitespace_nlp, cls.documents))
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'parsed': cls.parsed_docs,
		})
		cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
	def setUp(cls):
		"""Build two corpora from the same sample documents.

		``parsed_corpus`` is built from the pre-parsed 'parsed' column;
		``corpus`` is built from the upper-cased 'orig' text column with
		``whitespace_nlp`` passed as the parser.
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
		upper_cased = [text.upper() for text in cls.documents]
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'parsed': cls.parsed_docs,
			'orig': upper_cased,
		})
		parsed_factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
		cls.parsed_corpus = parsed_factory.build()
		pandas_factory = CorpusFromPandas(cls.df, 'category', 'orig', nlp=whitespace_nlp)
		cls.corpus = pandas_factory.build()
	def setUp(cls):
		"""Build a corpus whose DataFrame also carries author and length columns.

		The 'author' column is a hard-coded list of ten labels, so this
		presumably relies on ``get_docs_categories`` returning ten documents
		(TODO confirm).
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
		authors = ['a', 'a', 'c', 'c', 'c',
		           'c', 'd', 'd', 'e', 'e']
		lengths = [len(text) for text in cls.documents]
		cls.df = pd.DataFrame({
			'category': cls.categories,
			'author': authors,
			'parsed': cls.parsed_docs,
			'document_lengths': lengths,
		})
		factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
		cls.corpus = factory.build()
Esempio n. 10
0
	def setUp(cls):
		"""Create the shared corpus with author and document-length columns.

		The ten hard-coded author labels presumably match the number of
		documents returned by ``get_docs_categories`` (verify against that
		helper).
		"""
		cls.categories, cls.documents = get_docs_categories()
		cls.parsed_docs = list(map(whitespace_nlp, cls.documents))
		columns = {
			'category': cls.categories,
			'author': ['a', 'a', 'c', 'c', 'c',
			           'c', 'd', 'd', 'e', 'e'],
			'parsed': cls.parsed_docs,
			'document_lengths': [len(text) for text in cls.documents],
		}
		cls.df = pd.DataFrame(columns)
		cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
Esempio n. 11
0
 def test_main(self):
     """Check the default unigram and bigram counts for a short document."""
     expected = Counter({
         'a': 2,
         'bb': 1,
         'a bb': 1,
         'cc': 1,
         'a a': 1,
         'bb cc': 1
     })
     parsed = whitespace_nlp("A a bb cc.")
     feature_getter = FeatsFromSpacyDoc()
     self.assertEqual(expected, feature_getter.get_feats(parsed))
Esempio n. 12
0
 def test_lemmas(self):
     """Check feature counts when ``use_lemmas=True``.

     The expected counts contain 'dd' rather than the surface token
     'ddddd'.
     """
     expected = Counter({
         'a': 2,
         'bb': 1,
         'a bb': 1,
         'dd': 1,
         'a a': 1,
         'bb dd': 1
     })
     parsed = whitespace_nlp("A a bb ddddd.")
     lemma_getter = FeatsFromSpacyDoc(use_lemmas=True)
     self.assertEqual(expected, lemma_getter.get_feats(parsed))
 def _make_political_corpus(self):
     """Return a corpus built from the cleaned, parsed party speeches.

     Speeches whose cleaned text is falsy or equal to the empty string are
     skipped. Each remaining speech becomes a row with its party in 'party'
     and its parsed text in 'text'.
     """
     clean = clean_function_factory()
     rows = []
     for party, speech in iter_party_speech_pairs():
         cleaned = clean(speech)
         # Guard clause: nothing to parse for an empty cleaning result.
         if not cleaned or cleaned == '':
             continue
         rows.append({'party': party, 'text': whitespace_nlp(cleaned)})
     speech_frame = pd.DataFrame(rows)
     factory = CorpusFromParsedDocuments(speech_frame,
                                         category_col='party',
                                         parsed_col='text')
     return factory.build()
	def _make_political_corpus(self):
		"""Build and return the corpus of party speeches.

		Each speech is cleaned first; speeches whose cleaned text is falsy or
		the empty string are dropped, and the rest are parsed with
		``whitespace_nlp``.
		"""
		clean = clean_function_factory()
		rows = []
		for party, speech in iter_party_speech_pairs():
			cleaned = clean(speech)
			# Skip speeches that clean down to nothing.
			if not cleaned or cleaned == '':
				continue
			parsed = whitespace_nlp(cleaned)
			rows.append({'party': party,
			             'text': parsed})
		speech_frame = pd.DataFrame(rows)
		return CorpusFromParsedDocuments(speech_frame,
		                                 category_col='party',
		                                 parsed_col='text').build()
	def test_entity_tags(self):
		"""Check features when entity types and tag types are censored.

		With only the 'BAD' entity type censored, the expected features hold
		'_BAD' and no 'bb'. With 'NNP' tags censored too, 'NNP' appears in
		place of 'bob'.
		"""
		doc = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})

		# Entity censoring only.
		expected_entity_only = Counter({
			'a': 2, 'a _BAD': 1,
			'_BAD cc': 1, 'cc': 1,
			'a a': 1, '_BAD': 1, 'bob': 1, 'cc bob': 1,
		})
		entity_only_getter = FeatsFromSpacyDoc(entity_types_to_censor={'BAD'})
		self.assertEqual(expected_entity_only, entity_only_getter.get_feats(doc))

		# Entity and tag censoring together.
		expected_entity_and_tag = Counter({
			'a': 2, 'a _BAD': 1,
			'_BAD cc': 1, 'cc': 1,
			'a a': 1, '_BAD': 1, 'NNP': 1, 'cc NNP': 1,
		})
		entity_and_tag_getter = FeatsFromSpacyDoc(entity_types_to_censor={'BAD'},
		                                          tag_types_to_censor={'NNP'})
		self.assertEqual(expected_entity_and_tag, entity_and_tag_getter.get_feats(doc))
    def test_main(self):
        """Check that FeatsFromOnlyEmpath yields metadata but no term features.

        The ``empath`` module is replaced in ``sys.modules`` by a Mock and the
        default constructor is called once; the getter under test is built
        with ``mock_empath_analyze`` injected.
        """
        empath_stub = Mock(analyze=mock_empath_analyze)
        sys.modules['empath'] = empath_stub
        # Default construction after the module stub is installed.
        FeatsFromOnlyEmpath()
        feat_getter = FeatsFromOnlyEmpath(
            empath_analyze_function=mock_empath_analyze)
        doc = whitespace_nlp('Hello this is a document.')
        term_freq = feat_getter.get_feats(doc)
        metadata_freq = feat_getter.get_doc_metadata(doc)

        # No term features at all.
        self.assertEqual(term_freq, Counter())
        self.assertEqual(metadata_freq['ridicule'], 1)
        # None of these keys may appear in the metadata.
        for absent_key in ('fashion', 'document', 'a document'):
            self.assertNotIn(absent_key, metadata_freq)
Esempio n. 17
0
 def setUp(cls):
     """Build two corpora from the same sample documents.

     ``parsed_corpus`` comes from the pre-parsed 'parsed' column;
     ``corpus`` comes from the upper-cased 'orig' text column, with
     ``whitespace_nlp`` passed as the parser.
     """
     cls.categories, cls.documents = get_docs_categories()
     cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
     upper_cased = [text.upper() for text in cls.documents]
     cls.df = pd.DataFrame({
         'category': cls.categories,
         'parsed': cls.parsed_docs,
         'orig': upper_cased
     })
     parsed_factory = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
     cls.parsed_corpus = parsed_factory.build()
     pandas_factory = CorpusFromPandas(cls.df, 'category', 'orig',
                                       nlp=whitespace_nlp)
     cls.corpus = pandas_factory.build()
	def test_main(self):
		"""Check term and metadata features from FeatsFromSpacyDocAndEmpath.

		A getter is built with ``mock_empath_analyze`` injected, the ``empath``
		module is replaced in ``sys.modules`` by a Mock, and the default
		constructor is then called once. The test asserts the unigram/bigram
		term counts, that the metadata has ``'ridicule'`` equal to 1, and that
		``'empath_fashion'`` is absent.
		"""
		# Only a missing third-party ``mock`` package should trigger the
		# stdlib fallback; a bare except would also hide unrelated errors.
		try:
			from mock import Mock
		except ImportError:
			from unittest.mock import Mock
		feat_getter = FeatsFromSpacyDocAndEmpath(empath_analyze_function=mock_empath_analyze)
		sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
		# Default construction after the module stub is installed.
		FeatsFromSpacyDocAndEmpath()
		doc = whitespace_nlp('Hello this is a document.')
		term_freq = feat_getter.get_feats(doc)
		expected_terms = {'document': 1, 'hello': 1, 'is': 1, 'this': 1,
		                  'a document': 1, 'hello this': 1, 'is a': 1,
		                  'a': 1, 'this is': 1}
		self.assertEqual(set(term_freq.items()), set(expected_terms.items()))
		metadata_freq = feat_getter.get_doc_metadata(doc)
		self.assertEqual(metadata_freq['ridicule'], 1)
		self.assertNotIn('empath_fashion', metadata_freq)
	def test_main(self):
		"""Check that FeatsFromOnlyEmpath yields metadata but no term features.

		The ``empath`` module is replaced in ``sys.modules`` by a Mock and the
		default constructor is called once; the getter under test is built
		with ``mock_empath_analyze`` injected.
		"""
		# Only a missing third-party ``mock`` package should trigger the
		# stdlib fallback; a bare except would also hide unrelated errors.
		try:
			from mock import Mock
		except ImportError:
			from unittest.mock import Mock

		sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
		# Default construction after the module stub is installed.
		FeatsFromOnlyEmpath()
		feat_getter = FeatsFromOnlyEmpath(empath_analyze_function=mock_empath_analyze)
		doc = whitespace_nlp('Hello this is a document.')
		term_freq = feat_getter.get_feats(doc)
		metadata_freq = feat_getter.get_doc_metadata(doc)

		# No term features at all.
		self.assertEqual(term_freq, Counter())
		self.assertEqual(metadata_freq['ridicule'], 1)
		self.assertNotIn('fashion', metadata_freq)
		self.assertNotIn('document', metadata_freq)
		self.assertNotIn('a document', metadata_freq)
	def test_hamlet(self):
		"""Build a corpus from the Hamlet documents and check counts for 'play'.

		Asserts the per-category frequencies of 'play' in the term frequency
		DataFrame, that every search hit for 'play' contains the string 'play'
		(and none contains the misspelling 'plfay'), and that the 'play'
		column of the factory's term-document matrix sums to the same total.
		"""
		raw_docs = get_hamlet_docs()
		categories = [get_hamlet_snippet_binary_category(doc) for doc in raw_docs]
		docs = [whitespace_nlp(doc) for doc in raw_docs]
		df = pd.DataFrame({'category': categories,
		                   'parsed': docs})
		corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
		corpus = corpus_fact.build()
		tdf = corpus.get_term_freq_df()
		# DataFrame.ix was removed in pandas 1.0; .loc is the label-based
		# equivalent for a string row label.
		self.assertEqual(list(tdf.loc['play']), [37, 5])
		self.assertFalse(any(corpus.search('play').apply(lambda x: 'plfay' in str(x['parsed']), axis=1)))
		self.assertTrue(all(corpus.search('play').apply(lambda x: 'play' in str(x['parsed']), axis=1)))

		# TODO: verify the full term-document matrix, not just one column.
		play_term_idx = corpus_fact._term_idx_store.getidx('play')
		play_X = corpus_fact._X.todok()[:, play_term_idx]

		self.assertEqual(play_X.sum(), 37 + 5)
	def test_lemmas(self):
		"""Check feature counts when ``use_lemmas=True``.

		The expected counts contain 'dd' rather than the surface token 'ddddd'.
		"""
		expected = Counter({
			'a': 2, 'bb': 1, 'a bb': 1,
			'dd': 1, 'a a': 1, 'bb dd': 1,
		})
		parsed = whitespace_nlp("A a bb ddddd.")
		lemma_getter = FeatsFromSpacyDoc(use_lemmas=True)
		self.assertEqual(expected, lemma_getter.get_feats(parsed))
	def test_main(self):
		"""Check the default unigram and bigram counts for a short document."""
		expected = Counter({
			'a': 2, 'bb': 1, 'a bb': 1,
			'cc': 1, 'a a': 1, 'bb cc': 1,
		})
		parsed = whitespace_nlp("A a bb cc.")
		feature_getter = FeatsFromSpacyDoc()
		self.assertEqual(expected, feature_getter.get_feats(parsed))
Esempio n. 23
0
 def test_entity_types_to_censor_not_a_set(self):
     """Expect an AssertionError when ``entity_types_to_censor`` is a str."""
     parsed = whitespace_nlp("A a bb cc.", {'bb': 'A'})
     with self.assertRaises(AssertionError):
         # Construction and feature extraction both sit inside the
         # context, so either one may raise.
         getter = FeatsFromSpacyDoc(entity_types_to_censor='A')
         getter.get_feats(parsed)
Esempio n. 24
0
 def test_empty(self):
     """Check that an empty document yields an empty feature Counter."""
     empty_doc = whitespace_nlp("")
     feature_getter = FeatsFromSpacyDoc()
     self.assertEqual(Counter(), feature_getter.get_feats(empty_doc))
	def test_empty(self):
		"""Check that an empty document yields an empty feature Counter."""
		no_features = Counter()
		empty_doc = whitespace_nlp("")
		self.assertEqual(no_features, FeatsFromSpacyDoc().get_feats(empty_doc))
	def test_entity_types_to_censor_not_a_set(self):
		"""Expect an AssertionError when ``entity_types_to_censor`` is a str."""
		parsed = whitespace_nlp("A a bb cc.", {'bb': 'A'})
		with self.assertRaises(AssertionError):
			# Construction and feature extraction both sit inside the
			# context, so either one may raise.
			getter = FeatsFromSpacyDoc(entity_types_to_censor='A')
			getter.get_feats(parsed)