def test_get_term_idx_and_x(self):
    """The term index store and doc-term count matrix match a hand count."""
    parsed = [whitespace_nlp('aa aa bb.'), whitespace_nlp('bb aa a.')]
    frame = pd.DataFrame({'category': ['a', 'b'], 'parsed': parsed})
    factory = CorpusFromParsedDocuments(frame,
                                        category_col='category',
                                        parsed_col='parsed')
    corpus = factory.build()
    idx_term_pairs = list(factory._term_idx_store.items())
    self.assertEqual(sorted(idx for idx, _ in idx_term_pairs),
                     list(range(7)))
    self.assertEqual(sorted(term for _, term in idx_term_pairs),
                     ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])

    def check_cell(doc_i, term, expected):
        # One cell of the sparse doc-term matrix equals the expected count.
        term_i = factory._term_idx_store.getidx(term)
        self.assertEqual(factory._X[doc_i, term_i], expected)

    check_cell(0, 'aa', 2)
    check_cell(0, 'bb', 1)
    check_cell(0, 'aa aa', 1)
    check_cell(0, 'aa bb', 1)
    check_cell(0, 'bb aa', 0)
    check_cell(1, 'bb', 1)
    check_cell(1, 'aa', 1)
    check_cell(1, 'a', 1)
    check_cell(1, 'bb aa', 1)
    check_cell(1, 'aa aa', 0)
    check_cell(1, 'aa a', 1)
    self.assertTrue(isinstance(corpus, ParsedCorpus))
def test_get_term_idx_and_x(self):
    """Exercise the term index store and count matrix of a tiny two-doc corpus."""
    df = pd.DataFrame({
        'category': ['a', 'b'],
        'parsed': [whitespace_nlp('aa aa bb.'), whitespace_nlp('bb aa a.')],
    })
    corpus_fact = CorpusFromParsedDocuments(df,
                                            category_col='category',
                                            parsed_col='parsed')
    corpus = corpus_fact.build()
    pairs = list(corpus_fact._term_idx_store.items())
    self.assertEqual(sorted(k for k, _ in pairs), list(range(7)))
    self.assertEqual(sorted(v for _, v in pairs),
                     ['a', 'aa', 'aa a', 'aa aa', 'aa bb', 'bb', 'bb aa'])
    expected_cells = [
        (0, 'aa', 2), (0, 'bb', 1), (0, 'aa aa', 1), (0, 'aa bb', 1),
        (0, 'bb aa', 0), (1, 'bb', 1), (1, 'aa', 1), (1, 'a', 1),
        (1, 'bb aa', 1), (1, 'aa aa', 0), (1, 'aa a', 1),
    ]
    for doc_i, word, count in expected_cells:
        # Each cell of the sparse doc-term matrix matches the hand count.
        self.assertEqual(
            corpus_fact._X[doc_i, corpus_fact._term_idx_store.getidx(word)],
            count)
    self.assertTrue(isinstance(corpus, ParsedCorpus))
def test_entity_tags(self):
    """Censored entity types become _TYPE tokens; tag censoring swaps in the POS tag."""
    doc = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})
    feats = FeatsFromSpacyDoc(
        entity_types_to_censor=set(['BAD'])).get_feats(doc)
    expected = Counter({'a': 2, 'a _BAD': 1, '_BAD cc': 1, 'cc': 1,
                        'a a': 1, '_BAD': 1, 'bob': 1, 'cc bob': 1})
    self.assertEqual(expected, feats)
    # Adding tag censoring replaces 'bob' with its tag 'NNP' in all features.
    feats = FeatsFromSpacyDoc(
        entity_types_to_censor=set(['BAD']),
        tag_types_to_censor=set(['NNP'])).get_feats(doc)
    expected = Counter({'a': 2, 'a _BAD': 1, '_BAD cc': 1, 'cc': 1,
                        'a a': 1, '_BAD': 1, 'NNP': 1, 'cc NNP': 1})
    self.assertEqual(expected, feats)
def test_main(self):
    """FeatsFromSpacyDocAndEmpath yields unigram/bigram term counts plus empath metadata."""
    # Bare `except:` also swallowed KeyboardInterrupt/SystemExit; catch only
    # the expected failure when the third-party `mock` package is absent.
    try:
        from mock import Mock
    except ImportError:
        from unittest.mock import Mock
    feat_getter = FeatsFromSpacyDocAndEmpath(
        empath_analyze_function=mock_empath_analyze)
    # Stub out the empath module so the default constructor can import it.
    sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
    FeatsFromSpacyDocAndEmpath()
    doc = whitespace_nlp('Hello this is a document.')
    term_freq = feat_getter.get_feats(doc)
    self.assertEqual(
        set(term_freq.items()),
        set({'document': 1, 'hello': 1, 'is': 1, 'this': 1,
             'a document': 1, 'hello this': 1, 'is a': 1,
             'a': 1, 'this is': 1}.items()))
    metadata_freq = feat_getter.get_doc_metadata(doc)
    self.assertEqual(metadata_freq['ridicule'], 1)
    self.assertNotIn('empath_fashion', metadata_freq)
def test_hamlet(self):
    """End-to-end check on the Hamlet corpus: term frequencies, search, matrix sums."""
    raw_docs = get_hamlet_docs()
    categories = [
        get_hamlet_snippet_binary_category(doc) for doc in raw_docs
    ]
    docs = [whitespace_nlp(doc) for doc in raw_docs]
    df = pd.DataFrame({'category': categories, 'parsed': docs})
    corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
    corpus = corpus_fact.build()
    tdf = corpus.get_term_freq_df()
    # `DataFrame.ix` was deprecated in pandas 0.20 and removed in 1.0;
    # `.loc` is the label-based replacement.
    self.assertEqual(list(tdf.loc['play']), [37, 5])
    self.assertFalse(
        any(
            corpus.search('play').apply(
                lambda x: 'plfay' in str(x['parsed']), axis=1)))
    self.assertTrue(
        all(
            corpus.search('play').apply(
                lambda x: 'play' in str(x['parsed']), axis=1)))
    # The 'play' column of the term-document matrix must sum to the
    # per-category totals asserted above.
    play_term_idx = corpus_fact._term_idx_store.getidx('play')
    play_X = corpus_fact._X.todok()[:, play_term_idx]
    self.assertEqual(play_X.sum(), 37 + 5)
def setUp(cls):
    """Build the parsed-document corpus factory shared by the tests."""
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
    cls.df = pd.DataFrame({'category': cls.categories,
                           'parsed': cls.parsed_docs})
    cls.corpus_fact = CorpusFromParsedDocuments(cls.df, 'category', 'parsed')
def setUp(cls):
    """Create matching parsed and raw-text corpora over the shared documents."""
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
    cls.df = pd.DataFrame({'category': cls.categories,
                           'parsed': cls.parsed_docs,
                           'orig': [text.upper() for text in cls.documents]})
    cls.parsed_corpus = CorpusFromParsedDocuments(
        cls.df, 'category', 'parsed').build()
    cls.corpus = CorpusFromPandas(
        cls.df, 'category', 'orig', nlp=whitespace_nlp).build()
def setUp(cls):
    """Build a corpus whose frame carries author and document-length columns."""
    cls.categories, cls.documents = get_docs_categories()
    cls.parsed_docs = [whitespace_nlp(text) for text in cls.documents]
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'author': ['a', 'a', 'c', 'c', 'c', 'c', 'd', 'd', 'e', 'e'],
        'parsed': cls.parsed_docs,
        'document_lengths': [len(text) for text in cls.documents],
    })
    cls.corpus = CorpusFromParsedDocuments(cls.df, 'category', 'parsed').build()
def test_main(self):
    """Unigrams and bigrams are counted, lowercased, with the final period dropped."""
    feats = FeatsFromSpacyDoc().get_feats(whitespace_nlp("A a bb cc."))
    expected = Counter({'a': 2, 'bb': 1, 'a bb': 1,
                        'cc': 1, 'a a': 1, 'bb cc': 1})
    self.assertEqual(expected, feats)
def test_lemmas(self):
    """With use_lemmas=True, each token is lemmatized before feature counting."""
    doc = whitespace_nlp("A a bb ddddd.")
    feats = FeatsFromSpacyDoc(use_lemmas=True).get_feats(doc)
    self.assertEqual(
        Counter({'a': 2, 'bb': 1, 'a bb': 1, 'dd': 1, 'a a': 1, 'bb dd': 1}),
        feats)
def _make_political_corpus(self):
    """Clean each party speech, parse it, and assemble a party-labeled corpus.

    Returns the corpus built by CorpusFromParsedDocuments over the
    non-empty cleaned speeches.
    """
    clean = clean_function_factory()
    data = []
    for party, speech in iter_party_speech_pairs():
        cleaned_speech = clean(speech)
        # A non-empty string is truthy, so the original extra
        # `!= ''` comparison was redundant.
        if cleaned_speech:
            data.append({'party': party,
                         'text': whitespace_nlp(cleaned_speech)})
    corpus = CorpusFromParsedDocuments(pd.DataFrame(data),
                                       category_col='party',
                                       parsed_col='text').build()
    return corpus
def test_entity_tags(self):
    """Entity censoring emits _TYPE placeholders; tag censoring substitutes the tag."""
    doc = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})
    entity_censored = FeatsFromSpacyDoc(
        entity_types_to_censor=set(['BAD'])).get_feats(doc)
    self.assertEqual(
        Counter({'a': 2, 'a _BAD': 1, '_BAD cc': 1, 'cc': 1,
                 'a a': 1, '_BAD': 1, 'bob': 1, 'cc bob': 1}),
        entity_censored)
    both_censored = FeatsFromSpacyDoc(
        entity_types_to_censor=set(['BAD']),
        tag_types_to_censor=set(['NNP'])).get_feats(doc)
    self.assertEqual(
        Counter({'a': 2, 'a _BAD': 1, '_BAD cc': 1, 'cc': 1,
                 'a a': 1, '_BAD': 1, 'NNP': 1, 'cc NNP': 1}),
        both_censored)
def test_main(self):
    """FeatsFromOnlyEmpath emits empath metadata only, with no term features."""
    # Stub the empath module so the default constructor can import it.
    sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
    FeatsFromOnlyEmpath()
    analyzer = FeatsFromOnlyEmpath(
        empath_analyze_function=mock_empath_analyze)
    doc = whitespace_nlp('Hello this is a document.')
    term_freq = analyzer.get_feats(doc)
    metadata_freq = analyzer.get_doc_metadata(doc)
    self.assertEqual(term_freq, Counter())
    self.assertEqual(metadata_freq['ridicule'], 1)
    for absent_key in ('fashion', 'document', 'a document'):
        self.assertNotIn(absent_key, metadata_freq)
def setUp(cls):
    """Prepare a parsed-document corpus and a pandas-built corpus over the same frame."""
    cls.categories, cls.documents = get_docs_categories()
    # 'orig' holds an upper-cased copy so the two corpora differ in raw text.
    cls.parsed_docs = [whitespace_nlp(document) for document in cls.documents]
    cls.df = pd.DataFrame({
        'category': cls.categories,
        'parsed': cls.parsed_docs,
        'orig': [document.upper() for document in cls.documents],
    })
    cls.parsed_corpus = CorpusFromParsedDocuments(cls.df,
                                                  'category',
                                                  'parsed').build()
    cls.corpus = CorpusFromPandas(cls.df,
                                  'category',
                                  'orig',
                                  nlp=whitespace_nlp).build()
def test_main(self):
    """Term features and empath metadata are both produced for a parsed doc."""
    try:
        from mock import Mock
    except ImportError:  # narrowed from bare `except:` — don't mask SystemExit
        from unittest.mock import Mock
    feat_getter = FeatsFromSpacyDocAndEmpath(
        empath_analyze_function=mock_empath_analyze)
    sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
    FeatsFromSpacyDocAndEmpath()  # default ctor imports the stubbed module
    doc = whitespace_nlp('Hello this is a document.')
    expected_terms = {'document': 1, 'hello': 1, 'is': 1, 'this': 1,
                      'a document': 1, 'hello this': 1, 'is a': 1,
                      'a': 1, 'this is': 1}
    self.assertEqual(set(feat_getter.get_feats(doc).items()),
                     set(expected_terms.items()))
    metadata_freq = feat_getter.get_doc_metadata(doc)
    self.assertEqual(metadata_freq['ridicule'], 1)
    self.assertNotIn('empath_fashion', metadata_freq)
def test_main(self):
    """FeatsFromOnlyEmpath returns no term features, only empath metadata."""
    # Catch only ImportError: a bare `except:` would also mask
    # KeyboardInterrupt and SystemExit.
    try:
        from mock import Mock
    except ImportError:
        from unittest.mock import Mock
    sys.modules['empath'] = Mock(analyze=mock_empath_analyze)
    FeatsFromOnlyEmpath()  # default constructor must find the stubbed module
    feat_getter = FeatsFromOnlyEmpath(
        empath_analyze_function=mock_empath_analyze)
    doc = whitespace_nlp('Hello this is a document.')
    term_freq = feat_getter.get_feats(doc)
    metadata_freq = feat_getter.get_doc_metadata(doc)
    self.assertEqual(term_freq, Counter())
    self.assertEqual(metadata_freq['ridicule'], 1)
    self.assertNotIn('fashion', metadata_freq)
    self.assertNotIn('document', metadata_freq)
    self.assertNotIn('a document', metadata_freq)
def test_hamlet(self):
    """Smoke-test the Hamlet corpus: frequencies, search results, and matrix sums."""
    raw_docs = get_hamlet_docs()
    categories = [get_hamlet_snippet_binary_category(d) for d in raw_docs]
    df = pd.DataFrame({'category': categories,
                       'parsed': [whitespace_nlp(d) for d in raw_docs]})
    corpus_fact = CorpusFromParsedDocuments(df, 'category', 'parsed')
    corpus = corpus_fact.build()
    tdf = corpus.get_term_freq_df()
    # `DataFrame.ix` was removed from pandas (1.0); use label-based `.loc`.
    self.assertEqual(list(tdf.loc['play']), [37, 5])
    # Hoist the search so it runs once for both assertions.
    hits = corpus.search('play')
    self.assertFalse(
        any(hits.apply(lambda x: 'plfay' in str(x['parsed']), axis=1)))
    self.assertTrue(
        all(hits.apply(lambda x: 'play' in str(x['parsed']), axis=1)))
    # The 'play' column of the term-doc matrix must sum to both category counts.
    play_term_idx = corpus_fact._term_idx_store.getidx('play')
    play_X = corpus_fact._X.todok()[:, play_term_idx]
    self.assertEqual(play_X.sum(), 37 + 5)
def test_lemmas(self):
    """Lemma mode folds inflected tokens down to their lemmas before counting."""
    feats = FeatsFromSpacyDoc(use_lemmas=True).get_feats(
        whitespace_nlp("A a bb ddddd."))
    expected = Counter({'a': 2, 'bb': 1, 'a bb': 1,
                        'dd': 1, 'a a': 1, 'bb dd': 1})
    self.assertEqual(expected, feats)
def test_main(self):
    """Default feature extraction counts lowercased unigrams and bigrams."""
    doc = whitespace_nlp("A a bb cc.")
    term_freq = FeatsFromSpacyDoc().get_feats(doc)
    self.assertEqual(
        Counter({'a': 2, 'bb': 1, 'a bb': 1, 'cc': 1, 'a a': 1, 'bb cc': 1}),
        term_freq)
def test_entity_types_to_censor_not_a_set(self):
    """A non-set entity_types_to_censor argument triggers an AssertionError."""
    parsed_doc = whitespace_nlp("A a bb cc.", {'bb': 'A'})
    extractor = FeatsFromSpacyDoc(entity_types_to_censor='A')
    self.assertRaises(AssertionError, extractor.get_feats, parsed_doc)
def test_empty(self):
    """An empty document yields an empty feature counter."""
    self.assertEqual(Counter(),
                     FeatsFromSpacyDoc().get_feats(whitespace_nlp("")))