def test_no_options(self):
    corpus = burr_sir_corpus()
    cleaner = TextCleaner(text_cleaner=lambda text: 'cleaned text')
    cleaned_corpus = cleaner.transform(deepcopy(corpus))
    for utterance in cleaned_corpus.iter_utterances():
        self.assertEqual(utterance.text, 'cleaned text')

def test_dont_replace_text(self):
    corpus = burr_sir_corpus()
    cleaner = TextCleaner(text_cleaner=lambda text: 'cleaned text',
                          replace_text=False)
    cleaned_corpus = cleaner.transform(deepcopy(corpus))
    for original_utterance, cleaned_utterance in zip(
            corpus.iter_utterances(), cleaned_corpus.iter_utterances()):
        # With replace_text=False, the utterance text is left untouched and
        # the cleaned text is stored in the 'cleaned' metadata field instead.
        self.assertEqual(original_utterance.text, cleaned_utterance.text)
        self.assertEqual(cleaned_utterance.meta['cleaned'], 'cleaned text')

def test_save_original(self):
    corpus = burr_sir_corpus()
    cleaner = TextCleaner(text_cleaner=lambda text: 'cleaned text',
                          replace_text=True, save_original=True)
    cleaned_corpus = cleaner.transform(deepcopy(corpus))
    for original_utterance, cleaned_utterance in zip(
            corpus.iter_utterances(), cleaned_corpus.iter_utterances()):
        # With save_original=True, the replaced text is preserved in the
        # 'original' metadata field.
        self.assertEqual(cleaned_utterance.text, 'cleaned text')
        self.assertEqual(original_utterance.text,
                         cleaned_utterance.meta['original'])

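# A minimal sketch of the shared burr_sir_corpus() fixture these tests assume;
# the real helper (defined elsewhere in the test module) may differ. The
# utterance texts are recoverable from the expected parses below, but the
# speaker and utterance ids here are illustrative. Assumes ConvoKit's
# Corpus/Utterance/Speaker constructors.
from convokit import Corpus, Speaker, Utterance

BURR_SIR_TEXT_1 = 'Pardon me. Are you Aaron Burr, sir?'
BURR_SIR_TEXT_2 = "That depends. Who's asking?"

def burr_sir_corpus():
    return Corpus(utterances=[
        Utterance(id='0', speaker=Speaker(id='hamilton'), text=BURR_SIR_TEXT_1),
        Utterance(id='1', speaker=Speaker(id='burr'), text=BURR_SIR_TEXT_2),
    ])
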
def test_transform_utterances(self):
    corpus = burr_sir_corpus()
    transformer = BoWTransformer(obj_type='utterance',
                                 vectorizer=FakeVectorizer())
    corpus = transformer.fit_transform(corpus)
    expected_vectors = [
        burr_sir_sentence_1_vector(),
        burr_sir_sentence_2_vector()
    ]
    for expected_vector, utterance in zip(expected_vectors,
                                          corpus.iter_utterances()):
        actual_vector = utterance.get_vector('bow_vector')
        assert_sparse_matrices_equal(expected_vector, actual_vector)

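# Minimal sketches of the FakeVectorizer and assert_sparse_matrices_equal
# helpers the test above assumes; the real helpers may differ. FakeVectorizer
# mimics the sklearn vectorizer interface (fit/transform over a list of
# document texts) that BoWTransformer drives, returning one fixed sparse row
# per known text.
from scipy.sparse import vstack

class FakeVectorizer:
    def fit(self, docs, y=None):
        return self

    def transform(self, docs):
        text_to_vector = {
            BURR_SIR_TEXT_1: burr_sir_sentence_1_vector(),
            BURR_SIR_TEXT_2: burr_sir_sentence_2_vector(),
        }
        return vstack([text_to_vector[doc] for doc in docs])

def assert_sparse_matrices_equal(expected, actual):
    # Sparse matrices have no boolean truth value under ==, so count the
    # positions where the two matrices differ instead.
    assert (expected != actual).nnz == 0
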
def test_process_text_parse_mode(self):
    def fake_spacy_nlp(input_text):
        text_to_doc = {
            BURR_SIR_TEXT_1: burr_sir_doc_1(),
            BURR_SIR_TEXT_2: burr_sir_doc_2()
        }
        return text_to_doc[input_text]

    parser = TextParser(spacy_nlp=fake_spacy_nlp, mode='parse')
    corpus = burr_sir_corpus()
    actual = [utterance.meta['parsed']
              for utterance in parser.transform(corpus).iter_utterances()]
    expected = [
        [
            {
                'rt': 0,
                'toks': [
                    {'tok': 'Pardon', 'tag': 'VB', 'dep': 'ROOT', 'dn': [1, 2]},
                    {'tok': 'me', 'tag': 'PRP', 'dep': 'dobj', 'up': 0, 'dn': []},
                    {'tok': '.', 'tag': '.', 'dep': 'punct', 'up': 0, 'dn': []}
                ]
            },
            {
                'rt': 0,
                'toks': [
                    {'tok': 'Are', 'tag': 'VBP', 'dep': 'ROOT', 'dn': [1, 3, 4, 5, 6]},
                    {'tok': 'you', 'tag': 'PRP', 'dep': 'nsubj', 'up': 0, 'dn': []},
                    {'tok': 'Aaron', 'tag': 'NNP', 'dep': 'compound', 'up': 3, 'dn': []},
                    {'tok': 'Burr', 'tag': 'NNP', 'dep': 'attr', 'up': 0, 'dn': [2]},
                    {'tok': ',', 'tag': ',', 'dep': 'punct', 'up': 0, 'dn': []},
                    {'tok': 'sir', 'tag': 'NN', 'dep': 'npadvmod', 'up': 0, 'dn': []},
                    {'tok': '?', 'tag': '.', 'dep': 'punct', 'up': 0, 'dn': []}
                ]
            }
        ],
        [
            {
                'rt': 1,
                'toks': [
                    {'tok': 'That', 'tag': 'DT', 'dep': 'nsubj', 'up': 1, 'dn': []},
                    {'tok': 'depends', 'tag': 'VBZ', 'dep': 'ROOT', 'dn': [0, 2]},
                    {'tok': '.', 'tag': '.', 'dep': 'punct', 'up': 1, 'dn': []}
                ]
            },
            {
                'rt': 2,
                'toks': [
                    {'tok': 'Who', 'tag': 'WP', 'dep': 'nsubj', 'up': 2, 'dn': []},
                    {'tok': "'s", 'tag': 'VBZ', 'dep': 'aux', 'up': 2, 'dn': []},
                    {'tok': 'asking', 'tag': 'VBG', 'dep': 'ROOT', 'dn': [0, 1, 3]},
                    {'tok': '?', 'tag': '.', 'dep': 'punct', 'up': 2, 'dn': []}
                ]
            }
        ]
    ]
    self.assertListEqual(expected, actual)

def test_process_text_tokenize_mode(self):
    class FakeSentenceTokenizer:
        def tokenize(self, input_text):
            text_to_sentences = {
                BURR_SIR_TEXT_1: ['Pardon me.', 'Are you Aaron Burr, sir?'],
                BURR_SIR_TEXT_2: ['That depends.', "Who's asking?"]
            }
            return text_to_sentences[input_text]

    def fake_spacy_nlp(input_text):
        text_to_doc = {
            BURR_SIR_SENTENCE_1: burr_sir_sentence_doc_1(),
            BURR_SIR_SENTENCE_2: burr_sir_sentence_doc_2(),
            BURR_SIR_SENTENCE_3: burr_sir_sentence_doc_3(),
            BURR_SIR_SENTENCE_4: burr_sir_sentence_doc_4()
        }
        return text_to_doc[input_text]

    parser = TextParser(spacy_nlp=fake_spacy_nlp,
                        sent_tokenizer=FakeSentenceTokenizer(),
                        mode='tokenize')
    corpus = burr_sir_corpus()
    actual = [utterance.meta['parsed']
              for utterance in parser.transform(corpus).iter_utterances()]
    expected = [
        [
            {'toks': [{'tok': 'Pardon'}, {'tok': 'me'}, {'tok': '.'}]},
            {'toks': [{'tok': 'Are'}, {'tok': 'you'}, {'tok': 'Aaron'},
                      {'tok': 'Burr'}, {'tok': ','}, {'tok': 'sir'},
                      {'tok': '?'}]}
        ],
        [
            {'toks': [{'tok': 'That'}, {'tok': 'depends'}, {'tok': '.'}]},
            {'toks': [{'tok': 'Who'}, {'tok': "'s"}, {'tok': 'asking'},
                      {'tok': '?'}]}
        ]
    ]
    self.assertListEqual(expected, actual)

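# A minimal sketch of one of the parsed-Doc fixtures the two parser tests
# assume; the real helpers may differ. Assumes spaCy 3.x, whose Doc constructor
# accepts pre-assigned tags/deps/heads (heads are absolute token indices, with
# the root pointing at itself), so no trained model is needed. The values below
# mirror the first expected sentence in test_process_text_parse_mode.
import spacy
from spacy.tokens import Doc

BURR_SIR_SENTENCE_1 = 'Pardon me.'

def burr_sir_sentence_doc_1():
    return Doc(
        spacy.blank('en').vocab,
        words=['Pardon', 'me', '.'],
        spaces=[True, False, False],
        tags=['VB', 'PRP', '.'],
        deps=['ROOT', 'dobj', 'punct'],
        heads=[0, 0, 0],
    )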