def test_no_options(self):
        corpus = burr_sir_corpus()
        cleaner = TextCleaner(text_cleaner=lambda text: 'cleaned text')
        cleaned_corpus = cleaner.transform(deepcopy(corpus))

        for utterance in cleaned_corpus.iter_utterances():
            self.assertEqual(utterance.text, 'cleaned text')
    def test_dont_replace_text(self):
        """With replace_text=False the utterance text is left untouched and
        the cleaner's output lands in the 'cleaned' metadata field instead."""
        source_corpus = burr_sir_corpus()
        transformer = TextCleaner(text_cleaner=lambda _text: 'cleaned text',
                                  replace_text=False)
        result = transformer.transform(deepcopy(source_corpus))

        # Walk the original and transformed corpora in lockstep.
        pairs = zip(source_corpus.iter_utterances(), result.iter_utterances())
        for before, after in pairs:
            self.assertEqual(before.text, after.text)
            self.assertEqual(after.meta['cleaned'], 'cleaned text')
    def test_save_original(self):
        """With save_original=True the cleaned text replaces the utterance
        text while the pre-cleaning text is preserved under meta['original']."""
        source_corpus = burr_sir_corpus()
        transformer = TextCleaner(text_cleaner=lambda _text: 'cleaned text',
                                  replace_text=True,
                                  save_original=True)
        result = transformer.transform(deepcopy(source_corpus))

        # Walk the original and transformed corpora in lockstep.
        pairs = zip(source_corpus.iter_utterances(), result.iter_utterances())
        for before, after in pairs:
            self.assertEqual(after.text, 'cleaned text')
            self.assertEqual(before.text, after.meta['original'])
    # NOTE(review): removed a web-scrape artifact ("コード例 #4" / "0") that
    # was pasted between test methods and broke the file's syntax.
    def test_transform_utterances(self):
        """BoWTransformer in 'utterance' mode attaches a 'bow_vector'
        vector to each utterance, matching the vectorizer's output."""
        transformer = BoWTransformer(obj_type='utterance',
                                     vectorizer=FakeVectorizer())
        fitted_corpus = transformer.fit_transform(burr_sir_corpus())

        expected_vectors = [
            burr_sir_sentence_1_vector(),
            burr_sir_sentence_2_vector(),
        ]

        for want, utt in zip(expected_vectors, fitted_corpus.iter_utterances()):
            assert_sparse_matrices_equal(want, utt.get_vector('bow_vector'))
    def test_process_text_parse_mode(self):
        """In 'parse' mode, TextParser stores a full dependency parse for
        each utterance under meta['parsed']: one dict per sentence with the
        root token index ('rt') and a token list ('toks') carrying the token
        text, POS tag, dependency label, and head/child arc indices.
        """
        def fake_spacy_nlp(input_text):
            # Stand-in for a spaCy pipeline: maps each known utterance text
            # to a pre-built Doc fixture so no real model is loaded.
            text_to_doc = {
                BURR_SIR_TEXT_1: burr_sir_doc_1(),
                BURR_SIR_TEXT_2: burr_sir_doc_2()
            }

            return text_to_doc[input_text]
        
        parser = TextParser(spacy_nlp=fake_spacy_nlp, mode='parse')
        corpus = burr_sir_corpus()
        actual = [utterance.meta['parsed'] for utterance in parser.transform(corpus).iter_utterances()]
        # Expected structure: one list per utterance, one dict per sentence.
        # 'rt' = root token index; per-token 'up' = head index (absent on the
        # root), 'dn' = indices of dependent tokens.
        expected = [
            [
                {
                    'rt': 0,
                    'toks': [
                        {
                            'tok': 'Pardon',
                            'tag': 'VB',
                            'dep': 'ROOT',
                            'dn': [1, 2]
                        },
                        {
                            'tok': 'me',
                            'tag': 'PRP',
                            'dep': 'dobj',
                            'up': 0,
                            'dn': []
                        },
                        {
                            'tok': '.',
                            'tag': '.',
                            'dep': 'punct',
                            'up': 0,
                            'dn': []
                        }
                    ]
                },
                {
                    'rt': 0,
                    'toks': [
                        {
                            'tok': 'Are',
                            'tag': 'VBP',
                            'dep': 'ROOT',
                            'dn': [1, 3, 4, 5, 6]
                        },
                        {
                            'tok': 'you',
                            'tag': 'PRP',
                            'dep': 'nsubj',
                            'up': 0,
                            'dn': []
                        },
                        {
                            'tok': 'Aaron',
                            'tag': 'NNP',
                            'dep': 'compound',
                            'up': 3,
                            'dn': []
                        },
                        {
                            'tok': 'Burr',
                            'tag': 'NNP',
                            'dep': 'attr',
                            'up': 0,
                            'dn': [
                                2
                            ]
                        },
                        {
                            'tok': ',',
                            'tag': ',',
                            'dep': 'punct',
                            'up': 0,
                            'dn': []
                        },
                        {
                            'tok': 'sir',
                            'tag': 'NN',
                            'dep': 'npadvmod',
                            'up': 0,
                            'dn': []
                        },
                        {
                            'tok': '?',
                            'tag': '.',
                            'dep': 'punct',
                            'up': 0,
                            'dn': []
                        }
                    ]
                }
            ],
            [
                {
                    'rt': 1,
                    'toks': [
                        {
                            'tok': 'That',
                            'tag': 'DT',
                            'dep': 'nsubj',
                            'up': 1,
                            'dn': []
                        },
                        {
                            'tok': 'depends',
                            'tag': 'VBZ',
                            'dep': 'ROOT',
                            'dn': [
                                0,
                                2
                            ]
                        },
                        {
                            'tok': '.',
                            'tag': '.',
                            'dep': 'punct',
                            'up': 1,
                            'dn': []
                        }
                    ]
                },
                {
                    'rt': 2,
                    'toks': [
                        {
                            'tok': 'Who',
                            'tag': 'WP',
                            'dep': 'nsubj',
                            'up': 2,
                            'dn': []
                        },
                        {
                            'tok': "'s",
                            'tag': 'VBZ',
                            'dep': 'aux',
                            'up': 2,
                            'dn': []
                        },
                        {
                            'tok': 'asking',
                            'tag': 'VBG',
                            'dep': 'ROOT',
                            'dn': [0, 1, 3]
                        },
                        {
                            'tok': '?',
                            'tag': '.',
                            'dep': 'punct',
                            'up': 2,
                            'dn': []
                        }
                    ]
                }
            ]
        ]

        self.assertListEqual(expected, actual)
    def test_process_text_tokenize_mode(self):
        """In 'tokenize' mode, TextParser stores only tokenization under
        meta['parsed']: one dict per sentence whose 'toks' list carries just
        the token text — no tags, dependency labels, or arc indices.
        """
        class FakeSentenceTokenizer:
            # Stand-in for a sentence tokenizer: maps each known utterance
            # text to its pre-split sentences so no real model is loaded.
            def tokenize(self, input_text):
                text_to_sentences = {
                    BURR_SIR_TEXT_1: [
                        'Pardon me.',
                        'Are you Aaron Burr, sir?'
                    ],
                    BURR_SIR_TEXT_2: [
                        'That depends.',
                        "Who's asking?"
                    ]
                }

                return text_to_sentences[input_text]


        def fake_spacy_nlp(input_text):
            # Stand-in for a spaCy pipeline: maps each individual sentence
            # to a pre-built Doc fixture.
            text_to_doc = {
                BURR_SIR_SENTENCE_1: burr_sir_sentence_doc_1(),
                BURR_SIR_SENTENCE_2: burr_sir_sentence_doc_2(),
                BURR_SIR_SENTENCE_3: burr_sir_sentence_doc_3(),
                BURR_SIR_SENTENCE_4: burr_sir_sentence_doc_4()
            }

            return text_to_doc[input_text]
        
        parser = TextParser(spacy_nlp=fake_spacy_nlp, sent_tokenizer=FakeSentenceTokenizer(), mode='tokenize')
        corpus = burr_sir_corpus()
        actual = [utterance.meta['parsed'] for utterance in parser.transform(corpus).iter_utterances()]
        # Expected structure: one list per utterance, one dict per sentence,
        # each token dict holding only the token text.
        expected = [
            [
                {
                    "toks": [
                        {
                            "tok": "Pardon"
                        },
                        {
                            "tok": "me"
                        },
                        {
                            "tok": "."
                        }
                    ]
                },
                {
                    "toks": [
                        {
                            "tok": "Are"
                        },
                        {
                            "tok": "you"
                        },
                        {
                            "tok": "Aaron"
                        },
                        {
                            "tok": "Burr"
                        },
                        {
                            "tok": ","
                        },
                        {
                            "tok": "sir"
                        },
                        {
                            "tok": "?"
                        }
                    ]
                }
            ],
            [
                {
                    "toks": [
                        {
                            "tok": "That"
                        },
                        {
                            "tok": "depends"
                        },
                        {
                            "tok": "."
                        }
                    ]
                },
                {
                    "toks": [
                        {
                            "tok": "Who"
                        },
                        {
                            "tok": "'s"
                        },
                        {
                            "tok": "asking"
                        },
                        {
                            "tok": "?"
                        }
                    ]
                }
            ]
        ]

        self.assertListEqual(expected, actual)