def test_main(self):
    """Smoke-test the featurize -> TF-IDF -> classify pipeline end to end.

    Builds a term-document matrix with 'GPE' entities censored, fits a
    passive-aggressive classifier on its L1-normalised TF-IDF features, and
    scores one unseen sentence.  There are no assertions; the test passes
    as long as nothing raises.
    """
    labels, texts = get_docs_categories()

    def drop_bracketed(text):
        # Documents that begin with '[' are blanked out entirely.
        return '' if text.startswith('[') else text

    censored_entities = {'GPE'}

    corpus_featurizer = FeatsFromSpacyDoc(
        entity_types_to_censor=censored_entities)
    factory = TermDocMatrixFactory(
        category_text_iter=zip(labels, texts),
        clean_function=drop_bracketed,
        nlp=_testing_nlp,
        feats_from_spacy_doc=corpus_featurizer,
    )
    term_doc_mat = factory.build()

    # NOTE(review): `n_iter` was replaced by `max_iter` in newer
    # scikit-learn releases -- confirm against the pinned version.
    classifier = PassiveAggressiveClassifier(
        n_iter=5, C=0.5, n_jobs=-1, random_state=0)

    # A second, independent featurizer that reuses the corpus vocabulary.
    single_doc_featurizer = FeatsFromSpacyDoc(
        entity_types_to_censor=censored_entities)
    doc_to_feats = FeatsFromDoc(
        term_doc_mat._term_idx_store,
        clean_function=drop_bracketed,
        feats_from_spacy_doc=single_doc_featurizer,
    ).set_nlp(_testing_nlp)

    tfidf = TfidfTransformer(norm='l1')
    train_X = tfidf.fit_transform(term_doc_mat._X)
    classifier.fit(train_X, term_doc_mat._y)

    unseen_X = doc_to_feats.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = classifier.predict(tfidf.transform(unseen_X))
    # NOTE(review): the decision function receives the raw (untransformed)
    # features, unlike predict() above -- confirm this is intentional.
    dec = classifier.decision_function(unseen_X)
def test_entity_tags(self):
    """Check that censored entity types and tag types replace their tokens.

    With only the 'BAD' entity type censored, 'bb' is expected to appear as
    '_BAD'.  When the 'NNP' tag is censored as well, 'Bob' is expected to
    appear as 'NNP' instead of 'bob'.
    """
    doc = whitespace_nlp("A a bb cc Bob.", {'bb': 'BAD'}, {'Bob': 'NNP'})

    # Terms expected in both configurations.
    shared_counts = {
        'a': 2,
        'a _BAD': 1,
        '_BAD cc': 1,
        'cc': 1,
        'a a': 1,
        '_BAD': 1,
    }

    entity_censored = FeatsFromSpacyDoc(
        entity_types_to_censor={'BAD'}).get_feats(doc)
    self.assertEqual(
        Counter({**shared_counts, 'bob': 1, 'cc bob': 1}),
        entity_censored)

    entity_and_tag_censored = FeatsFromSpacyDoc(
        entity_types_to_censor={'BAD'},
        tag_types_to_censor={'NNP'}).get_feats(doc)
    self.assertEqual(
        Counter({**shared_counts, 'NNP': 1, 'cc NNP': 1}),
        entity_and_tag_censored)
def __init__(self,
             topic_model,
             use_lemmas=False,
             entity_types_to_censor=None,
             entity_types_to_use=None,
             tag_types_to_censor=None,
             strip_final_period=False,
             keyword_processor_args=None):
    """Set up a keyword processor holding every keyphrase of a topic model.

    Parameters
    ----------
    topic_model : dict
        Mapping whose values are iterables of keyphrases.  Every keyphrase
        is registered with the keyword processor.  An empty mapping is
        allowed and registers nothing.
    use_lemmas : bool
        Forwarded to ``FeatsFromSpacyDoc.__init__``.
    entity_types_to_censor : set, optional
        Forwarded to ``FeatsFromSpacyDoc.__init__``; defaults to an empty
        set.
    entity_types_to_use : optional
        Accepted for signature compatibility; this constructor does not
        use it or forward it.
    tag_types_to_censor : set, optional
        Forwarded to ``FeatsFromSpacyDoc.__init__``; defaults to an empty
        set.
    strip_final_period : bool
        Forwarded to ``FeatsFromSpacyDoc.__init__``.
    keyword_processor_args : dict, optional
        Keyword arguments for ``flashtext.KeywordProcessor``; defaults to
        ``{'case_sensitive': False}``.
    """
    # Imported here so flashtext is only needed when this class is used.
    from flashtext import KeywordProcessor

    # Fresh objects per call: avoids sharing mutable defaults between
    # instances.
    if entity_types_to_censor is None:
        entity_types_to_censor = set()
    if tag_types_to_censor is None:
        tag_types_to_censor = set()
    if keyword_processor_args is None:
        keyword_processor_args = {'case_sensitive': False}

    self._keyword_processor = KeywordProcessor(**keyword_processor_args)
    self._topic_model = topic_model

    # Union of all topics' keyphrases.  Unlike a reduce() without an
    # initial value, this also works when the topic model is empty.
    for keyphrase in set().union(*topic_model.values()):
        self._keyword_processor.add_keyword(keyphrase)

    FeatsFromSpacyDoc.__init__(self, use_lemmas, entity_types_to_censor,
                               tag_types_to_censor, strip_final_period)
    FeatsFromTopicModelBase.__init__(self, topic_model)
def test_lemmas(self):
    """With use_lemmas=True, features are counted over lemmas.

    The expected counts contain 'dd' where the raw text has 'ddddd'.
    """
    doc = whitespace_nlp("A a bb ddddd.")
    lemma_featurizer = FeatsFromSpacyDoc(use_lemmas=True)
    observed = lemma_featurizer.get_feats(doc)

    expected = Counter({
        'a': 2,
        'bb': 1,
        'a bb': 1,
        'dd': 1,
        'a a': 1,
        'bb dd': 1,
    })
    self.assertEqual(expected, observed)
def test_main(self):
    """A default featurizer yields lower-cased unigram and bigram counts."""
    doc = whitespace_nlp("A a bb cc.")
    observed = FeatsFromSpacyDoc().get_feats(doc)

    expected = Counter({
        'a': 2,
        'bb': 1,
        'a bb': 1,
        'cc': 1,
        'a a': 1,
        'bb cc': 1,
    })
    self.assertEqual(expected, observed)
def test_build_censor_entities(self):
    """Censoring 'GPE' entities puts '_GPE' in the vocabulary, not 'brooklyn'."""
    labels, texts = get_docs_categories()

    def drop_bracketed(text):
        # Documents that begin with '[' are blanked out entirely.
        return '' if text.startswith('[') else text

    gpe_censoring_featurizer = FeatsFromSpacyDoc(
        entity_types_to_censor={'GPE'})
    factory = TermDocMatrixFactory(
        category_text_iter=zip(labels, texts),
        clean_function=drop_bracketed,
        nlp=_testing_nlp,
        feats_from_spacy_doc=gpe_censoring_featurizer,
    )
    term_doc_mat = factory.build()

    self.assertIn('_GPE', set(term_doc_mat.get_term_freq_df().index))
    self.assertNotIn('brooklyn', set(term_doc_mat.get_term_freq_df().index))
def test_entity_types_to_censor_not_a_set(self):
    """A plain string for entity_types_to_censor must raise AssertionError."""
    doc = whitespace_nlp("A a bb cc.", {'bb': 'A'})

    def build_and_featurize():
        # Construction and featurization both sit inside the checked call,
        # so the error may come from either step.
        return FeatsFromSpacyDoc(entity_types_to_censor='A').get_feats(doc)

    self.assertRaises(AssertionError, build_and_featurize)
def test_empty(self):
    """An empty document yields an empty feature counter."""
    empty_doc = whitespace_nlp("")
    observed = FeatsFromSpacyDoc().get_feats(empty_doc)
    self.assertEqual(Counter(), observed)
def test_strip_final_period(self):
    """strip_final_period=True removes the trailing '.' from tokens.

    By default the sentence-final tokens keep their period ('question.',
    'enough.'); with the flag on, the same terms are expected without it.
    """
    doc = bad_whitespace_nlp(
        "I CAN'T ANSWER THAT QUESTION. "
        "I HAVE NOT ASKED THEM SPECIFICALLY IF THEY HAVE ENOUGH.")

    # Terms that are identical in both configurations.
    unaffected = {
        'i': 2,
        'have': 2,
        'answer': 1,
        'i have': 1,
        'them specifically': 1,
        'not asked': 1,
        'they have': 1,
        'have not': 1,
        'specifically': 1,
        'answer that': 1,
        "can't": 1,
        'if': 1,
        'they': 1,
        "can't answer": 1,
        'asked': 1,
        'them': 1,
        'if they': 1,
        'asked them': 1,
        'that': 1,
        'not': 1,
        "i can't": 1,
        'specifically if': 1,
    }
    with_periods = {
        'that question.': 1,
        'question.': 1,
        'enough.': 1,
        'have enough.': 1,
        'question. i': 1,
    }
    without_periods = {
        'that question': 1,
        'question': 1,
        'enough': 1,
        'have enough': 1,
        'question i': 1,
    }

    feats = FeatsFromSpacyDoc().get_feats(doc)
    print(feats)
    self.assertEqual(feats, Counter({**unaffected, **with_periods}))

    feats = FeatsFromSpacyDoc(strip_final_period=True).get_feats(doc)
    print(feats)
    self.assertEqual(feats, Counter({**unaffected, **without_periods}))