Exemple #1
0
def get_sentence_tokenizer(language):
    """
    Return the sentence tokenizer callable.

    A previously pickled tokenizer is loaded from
    ``sentence_tokenizer.pickle`` (relative to the current working
    directory) when that file exists. Otherwise a Punkt tokenizer is trained
    on the corpus files for ``language``, written to that path, and returned.

    :param language: Object exposing ``ENGLISH_NAME``; it is only consulted
        when no pickled tokenizer exists.
    :returns: The unpickled or newly trained tokenizer.
    """

    # NOTE(review): the cache path does not include the language, so the
    # first tokenizer that gets pickled is returned for every later call
    # regardless of ``language`` -- confirm this is intended.
    pickle_path = 'sentence_tokenizer.pickle'

    try:
        # NOTE(review): unpickling can execute arbitrary code; the cache
        # file must come from a trusted location.
        # The context manager closes the handle even if unpickling fails.
        with open(pickle_path, 'rb') as input_file:
            sentence_tokenizer = load(input_file)
    except FileNotFoundError:

        sentences = []

        try:
            # Get the paths to each file the bot will be trained with
            corpus_files = list_corpus_files('chatterbot.corpus.{language}'.format(
                language=language.ENGLISH_NAME.lower()
            ))
        except LookupError:
            # Fall back to English sentence splitting rules if a language is not supported
            corpus_files = list_corpus_files('chatterbot.corpus.{language}'.format(
                language=languages.ENG.ENGLISH_NAME.lower()
            ))

        for corpus, _categories, _file_path in load_corpus(*corpus_files):
            for conversation in corpus:
                for text in conversation:
                    # Train on both casings of every line
                    sentences.append(text.upper())
                    sentences.append(text.lower())

        trainer = PunktTrainer()
        trainer.INCLUDE_ALL_COLLOCS = True
        trainer.train('\n'.join(sentences))

        sentence_tokenizer = PunktSentenceTokenizer(trainer.get_params())

        # Pickle the sentence tokenizer for future use; the context manager
        # closes the file even if serialization fails.
        with open(pickle_path, 'wb') as output_file:
            dump(sentence_tokenizer, output_file, -1)

    return sentence_tokenizer
Exemple #2
0
    def test_load_corpus_english_greetings(self):
        """
        Loading the English greetings file by path yields exactly one entry.
        """
        greetings_path = os.path.join(
            corpus.DATA_DIRECTORY, 'english', 'greetings.yml'
        )
        loaded = list(
            corpus.load_corpus(*corpus.list_corpus_files(greetings_path))
        )

        self.assertEqual(len(loaded), 1)
Exemple #3
0
    def test_load_corpus_file(self):
        """
        Test that a file path can be specified for a corpus.

        A small YAML corpus holding two two-line conversations is written to
        ``./test_corpus.yml``, loaded through the corpus module, and removed
        again even when listing or loading raises.
        """
        file_path = './test_corpus.yml'
        yml_data = u'\n'.join([
            'conversations:', '- - Hello', '  - Hi', '- - Hi', '  - Hello'
        ])

        try:
            # Create a file for testing
            with io.open(file_path, 'w') as test_corpus:
                test_corpus.write(yml_data)

            data_files = corpus.list_corpus_files(file_path)
            corpus_data = list(corpus.load_corpus(*data_files))
        finally:
            # Remove the test file on success and on failure, so an error
            # while loading cannot leave it behind in the working directory
            if os.path.exists(file_path):
                os.remove(file_path)

        self.assertEqual(len(corpus_data), 1)

        # Load the content from the corpus
        conversations, _categories, _file_path = corpus_data[0]

        self.assertEqual(len(conversations[0]), 2)
Exemple #4
0
    def test_load_english_corpus(self):
        """
        The English greetings file loads as one entry whose conversations
        include the "Hi" / "Hello" exchange.
        """
        greeting_files = corpus.list_corpus_files(
            'chatterbot_corpus/data/english/greetings.yml'
        )
        loaded = list(corpus.load_corpus(*greeting_files))

        self.assertEqual(len(loaded), 1)

        conversations = loaded[0][0]
        self.assertIn(['Hi', 'Hello'], conversations)
Exemple #5
0
    def test_load_corpus_file(self):
        """
        Test that a file path can be specified for a corpus.

        The temporary corpus file is deleted in a ``finally`` clause so that
        a failure while listing or loading it does not leave the file on
        disk.
        """
        file_path = './test_corpus.yml'

        try:
            # Create a file for testing
            with io.open(file_path, 'w') as test_corpus:
                yml_data = u'\n'.join(
                    ['conversations:', '- - Hello', '  - Hi', '- - Hi', '  - Hello']
                )
                test_corpus.write(yml_data)

            data_files = corpus.list_corpus_files(file_path)
            corpus_data = list(corpus.load_corpus(*data_files))
        finally:
            # Remove the test file regardless of the outcome above
            if os.path.exists(file_path):
                os.remove(file_path)

        self.assertEqual(len(corpus_data), 1)

        # Load the content from the corpus
        conversations, _categories, _file_path = corpus_data[0]

        self.assertEqual(len(conversations[0]), 2)
Exemple #6
0
    def test_load_corpus(self):
        """
        Test loading the entire corpus of languages.
        """
        loaded = list(
            corpus.load_corpus(*corpus.list_corpus_files('chatterbot_corpus'))
        )
        entry_count = len(loaded)

        # At least one corpus entry must have been loaded
        self.assertTrue(entry_count)
Exemple #7
0
    def test_load_corpus(self):
        """
        Test loading the entire corpus of languages.
        """
        every_file = corpus.list_corpus_files('chatterbot.corpus')
        loaded = list(corpus.load_corpus(*every_file))

        # A non-zero number of entries is expected
        self.assertTrue(len(loaded))
Exemple #8
0
    def test_load_corpus_english_categories(self):
        """
        The English greetings corpus loads as one entry labeled "greetings".
        """
        greeting_files = corpus.list_corpus_files(
            'chatterbot.corpus.english.greetings'
        )
        loaded = list(corpus.load_corpus(*greeting_files))

        self.assertEqual(len(loaded), 1)

        # Test that each conversation gets labeled with the correct category
        for entry in loaded:
            _conversation, categories, _file_path = entry
            self.assertIn('greetings', categories)
    def test_load_english_corpus_categories(self):
        """
        The greetings file loads as one entry whose second element (the
        categories) contains "greetings".
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files(
                'chatterbot_corpus/data/english/greetings.yml'
            )
        ))

        self.assertEqual(len(loaded), 1)

        # Test that each conversation gets labeled with the correct category
        for entry in loaded:
            categories = entry[1]
            self.assertIn('greetings', categories)
    def test_conversation_format(self):
        """
        Fail if any statement in any corpus conversation is not a string.
        """
        all_files = corpus.list_corpus_files('chatterbot_corpus')
        loaded = corpus.load_corpus(*all_files)

        for dialog_corpus, _categories, _file_path in loaded:
            for conversation in dialog_corpus:
                for text in conversation:
                    if isinstance(text, str):
                        continue

                    message = '"{}" must be a string, not {}.'.format(
                        str(text), type(text)
                    )
                    self.fail(message)
Exemple #11
0
    def test_load_corpus_english_greetings(self):
        """
        Check the conversations, categories and file path returned for the
        English greetings corpus.
        """
        greeting_files = corpus.list_corpus_files(
            'chatterbot.corpus.english.greetings'
        )
        loaded = list(corpus.load_corpus(*greeting_files))

        self.assertEqual(len(loaded), 1)

        only_entry = loaded[0]
        conversations, categories, file_path = only_entry

        self.assertIn(['Hi', 'Hello'], conversations)
        self.assertEqual(['greetings'], categories)
        self.assertIn('chatterbot_corpus/data/english/greetings.yml', file_path)
Exemple #12
0
    def train(self, *corpus_paths):
        """
        Train the chat bot with the conversations found under each corpus path.

        Every line of a conversation is turned into a statement whose
        ``in_response_to`` is set from the text of the preceding statement in
        the same conversation (``None`` for the first line). The records
        collected for one corpus file are handed to
        ``self.chatbot.storage.create_many`` in a single call.

        :param corpus_paths: Corpus paths expanded with ``list_corpus_files``.
        """
        from chatterbot.corpus import load_corpus, list_corpus_files

        # Get the paths to each file the bot will be trained with
        data_file_paths = [
            data_file
            for corpus_path in corpus_paths
            for data_file in list_corpus_files(corpus_path)
        ]

        for corpus, categories, file_path in load_corpus(*data_file_paths):

            pending_records = []

            # Train the chat bot with each statement and response pair
            for position, conversation in enumerate(corpus, start=1):

                if self.show_training_progress:
                    progress_label = 'Training ' + str(os.path.basename(file_path))
                    utils.print_progress_bar(progress_label, position, len(corpus))

                # The first line of a conversation has no predecessor
                last_text = None
                last_search_text = ''

                for text in conversation:

                    search_text = self.stemmer.stem(text)

                    raw_statement = Statement(
                        text=text,
                        search_text=search_text,
                        in_response_to=last_text,
                        search_in_response_to=last_search_text,
                        conversation='training')
                    raw_statement.add_tags(*categories)

                    processed = self.get_preprocessed_statement(raw_statement)

                    last_text = processed.text
                    last_search_text = search_text

                    record = {
                        'text': processed.text,
                        'in_response_to': processed.in_response_to,
                        'conversation': processed.conversation,
                        'tags': processed.tags
                    }
                    pending_records.append(record)

            self.chatbot.storage.create_many(pending_records)
    def test_character_count(self):
        """
        Test that no line in the corpus exceeds the maximum number of characters.
        """
        all_files = corpus.list_corpus_files('chatterbot_corpus')
        loaded = corpus.load_corpus(*all_files)

        for dialog_corpus, _categories, _file_path in loaded:
            for conversation in dialog_corpus:
                for text in conversation:
                    if len(text) <= STATEMENT_TEXT_MAX_LENGTH:
                        continue

                    message = '"{}" cannot be longer than {} characters'.format(
                        text, STATEMENT_TEXT_MAX_LENGTH
                    )
                    self.fail(message)
Exemple #14
0
    def train(self, *corpus_paths):
        """
        Train the chat bot with the conversations found under each corpus path.

        Statements are built line by line; each one is created in response
        to the text of the previous statement of the same conversation
        (``None`` for the first line). All statements built from one corpus
        file are passed to ``self.chatbot.storage.create_many`` together.

        :param corpus_paths: Corpus paths expanded with ``list_corpus_files``.
        """
        from chatterbot.corpus import load_corpus, list_corpus_files

        # Get the paths to each file the bot will be trained with
        data_file_paths = [
            data_file
            for corpus_path in corpus_paths
            for data_file in list_corpus_files(corpus_path)
        ]

        for corpus, categories, file_path in load_corpus(*data_file_paths):

            new_statements = []

            # Train the chat bot with each statement and response pair
            for position, conversation in enumerate(corpus, start=1):

                if self.show_training_progress:
                    utils.print_progress_bar(
                        'Training ' + str(os.path.basename(file_path)),
                        position,
                        len(corpus)
                    )

                # Nothing precedes the first line of a conversation
                last_text = None
                last_search_text = ''

                for text in conversation:

                    search_text = self.stemmer.get_bigram_pair_string(text)

                    candidate = Statement(
                        text=text,
                        search_text=search_text,
                        in_response_to=last_text,
                        search_in_response_to=last_search_text,
                        conversation='training'
                    )
                    candidate.add_tags(*categories)

                    processed = self.get_preprocessed_statement(candidate)

                    last_text = processed.text
                    last_search_text = search_text

                    new_statements.append(processed)

            self.chatbot.storage.create_many(new_statements)
Exemple #15
0
    def test_list_english_corpus_files(self):
        """
        The first file listed for the English corpus has ".yml" in its path.
        """
        english_files = corpus.list_corpus_files('chatterbot.corpus.english')
        first_file = english_files[0]

        self.assertIn('.yml', first_file)
Exemple #16
0
    def test_load_corpus_spanish(self):
        """
        Loading the Spanish corpus produces at least one entry.
        """
        spanish_files = corpus.list_corpus_files('chatterbot.corpus.spanish')
        loaded = list(corpus.load_corpus(*spanish_files))

        self.assertTrue(len(loaded))
Exemple #17
0
    def test_load_corpus_telugu(self):
        """
        Loading the Telugu corpus produces at least one entry.
        """
        telugu_files = corpus.list_corpus_files('chatterbot.corpus.telugu')
        loaded = list(corpus.load_corpus(*telugu_files))

        self.assertTrue(len(loaded))
Exemple #18
0
    def test_load_corpus_portuguese(self):
        """
        Loading the Portuguese corpus produces at least one entry.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot.corpus.portuguese')
        ))

        self.assertTrue(len(loaded))
Exemple #19
0
    def test_load_corpus_russian(self):
        """
        Loading the Russian corpus produces at least one entry.
        """
        russian_files = corpus.list_corpus_files('chatterbot.corpus.russian')
        entry_count = len(list(corpus.load_corpus(*russian_files)))

        self.assertTrue(entry_count)
Exemple #20
0
    def test_load_corpus_indonesia(self):
        """
        Loading the "indonesia" corpus produces at least one entry.
        """
        indonesia_files = corpus.list_corpus_files('chatterbot.corpus.indonesia')
        loaded = list(corpus.load_corpus(*indonesia_files))

        self.assertTrue(len(loaded))
Exemple #21
0
    def test_load_corpus_marathi(self):
        """
        Loading the Marathi corpus produces at least one entry.
        """
        marathi_files = corpus.list_corpus_files('chatterbot.corpus.marathi')
        loaded = list(corpus.load_corpus(*marathi_files))

        self.assertTrue(len(loaded))
Exemple #22
0
    def test_load_corpus_marathi(self):
        """
        The Marathi corpus must load with a non-zero number of entries.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot.corpus.marathi')
        ))
        entry_count = len(loaded)

        self.assertTrue(entry_count)
Exemple #23
0
    def test_list_english_corpus_files(self):
        """
        Every file listed for the English corpus has ".yml" in its path.
        """
        for listed_path in corpus.list_corpus_files('chatterbot.corpus.english'):
            self.assertIn('.yml', listed_path)
Exemple #24
0
    def test_load_corpus_portuguese(self):
        """
        Loading the Portuguese data directory produces at least one entry.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot_corpus/data/portuguese')
        ))
        entry_count = len(loaded)

        self.assertTrue(entry_count)
Exemple #25
0
    def test_load_corpus_chinese(self):
        """
        Loading the Chinese corpus produces at least one entry.
        """
        chinese_files = corpus.list_corpus_files('chatterbot.corpus.chinese')
        loaded = list(corpus.load_corpus(*chinese_files))

        self.assertTrue(len(loaded))
Exemple #26
0
    def test_load_corpus_russian(self):
        """
        The Russian corpus must load with a non-zero number of entries.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot.corpus.russian')
        ))

        self.assertTrue(len(loaded))
Exemple #27
0
    def test_load_corpus_spanish(self):
        """
        The Spanish corpus must load with a non-zero number of entries.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot.corpus.spanish')
        ))
        entry_count = len(loaded)

        self.assertTrue(entry_count)
Exemple #28
0
    def test_load_corpus_telugu(self):
        """
        The Telugu corpus must load with a non-zero number of entries.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot.corpus.telugu')
        ))

        self.assertTrue(len(loaded))
Exemple #29
0
    def train(self, *corpus_paths):
        """
        Train the chat bot from corpus files whose conversation steps may
        offer several alternative lines.

        Each step of a conversation is either a single string or a sequence
        of strings. For every statement created at the previous step, one
        statement is created per line of the current step, in response to
        that previous statement. The statements built from one corpus file
        are stored with ``self.chatbot.storage.create_many`` when there are
        any.

        :param corpus_paths: Corpus paths expanded with ``list_corpus_files``.
        """
        from chatterbot.corpus import load_corpus, list_corpus_files

        # Get the paths to each file the bot will be trained with
        data_file_paths = [
            data_file
            for corpus_path in corpus_paths
            for data_file in list_corpus_files(corpus_path)
        ]

        for corpus, categories, file_path in load_corpus(*data_file_paths):

            new_statements = []

            # Train the chat bot with each statement and response pair
            for position, conversations in enumerate(corpus, start=1):

                if self.show_training_progress:
                    progress_label = 'Training ' + str(os.path.basename(file_path))
                    utils.print_progress_bar(progress_label, position, len(corpus))

                # (text, search_text) of every statement created at the
                # previous step; the first step responds to nothing
                previous_pairs = [(None, '')]

                for step in conversations:

                    # A bare string is a step with a single alternative
                    alternatives = [step] if isinstance(step, str) else step

                    current_pairs = []

                    for prior_text, prior_search_text in previous_pairs:
                        for text in alternatives:
                            search_text = self.chatbot.storage.tagger.get_bigram_pair_string(
                                text)

                            candidate = Statement(
                                text=text,
                                search_text=search_text,
                                in_response_to=prior_text,
                                search_in_response_to=prior_search_text,
                                conversation='training')
                            candidate.add_tags(*categories)

                            processed = self.get_preprocessed_statement(candidate)

                            current_pairs.append((processed.text, search_text))
                            new_statements.append(processed)

                    previous_pairs = current_pairs

            if new_statements:
                self.chatbot.storage.create_many(new_statements)
Exemple #30
0
    def test_load_corpus_english_greetings(self):
        """
        The greetings file under the English data directory loads as a
        single corpus entry.
        """
        english_directory = os.path.join(corpus.DATA_DIRECTORY, 'english')
        greetings_path = os.path.join(english_directory, 'greetings.yml')

        greeting_files = corpus.list_corpus_files(greetings_path)
        loaded = list(corpus.load_corpus(*greeting_files))

        self.assertEqual(len(loaded), 1)
Exemple #31
0
    def test_load_corpus_traditional_chinese(self):
        """
        Loading the "tchinese" data directory produces at least one entry.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot_corpus/data/tchinese')
        ))
        entry_count = len(loaded)

        self.assertTrue(entry_count)
Exemple #32
0
    def test_load_corpus_english_trailing_slash(self):
        """
        A directory path ending in "/" still loads more than one entry.
        """
        english_directory = os.path.join(corpus.DATA_DIRECTORY, 'english')
        english_files = corpus.list_corpus_files(english_directory + '/')
        loaded = list(corpus.load_corpus(*english_files))

        self.assertGreater(len(loaded), 1)
Exemple #33
0
    def train(self, *corpus_paths):
        """
        Train the chat bot from corpus files, with special handling for
        affirmative/negative answers and suggestion prefixes.

        For every line of every conversation:

        * a line that, after stripping surrounding punctuation, is found in
          ``constants.AFFIRMATIVES`` or ``constants.NEGATIVES`` is replaced
          by the marker text ``'AFF'`` or ``'NEG'``;
        * a line starting with ``^`` is split at the first whitespace into a
          suggestion prefix and the remaining text; the prefix (without the
          ``^``) is split on ``/`` and each non-empty part becomes a
          ``SUGGESTION:<part>`` tag on the statement;
        * the statement that follows an ``AFF``/``NEG`` marker is recorded as
          an ``AFF:<text>`` / ``NEG:<text>`` tag on the statement that
          preceded the marker in the same conversation, and is itself stored
          without an ``in_response_to``.

        Every statement is written with ``self.chatbot.storage.update``.

        :param corpus_paths: Corpus paths expanded with ``list_corpus_files``.
        :raises ValueError: If a ``^`` line has no text after the prefix.
        """
        from chatterbot.corpus import load_corpus, list_corpus_files

        markers = ('AFF', 'NEG')
        punctuation = '.?!/;:\'\"'

        data_file_paths = []

        # Get the paths to each file the bot will be trained with
        for corpus_path in corpus_paths:
            data_file_paths.extend(list_corpus_files(corpus_path))

        for corpus, categories, file_path in load_corpus(*data_file_paths):

            statements_to_create = []

            # Train the chat bot with each statement and response pair
            for conversation_count, conversation in enumerate(corpus):

                if self.show_training_progress:
                    utils.print_progress_bar(
                        'Training ' + str(os.path.basename(file_path)),
                        conversation_count + 1, len(corpus))

                previous_statement_text = None
                previous_statement_search_text = ''

                # Statements of the current conversation only, so that an
                # AFF/NEG marker opening a conversation can never tag a
                # statement that belongs to an earlier conversation.
                conversation_statements = []

                for text in conversation:
                    suggestion_tags = []
                    stripped_text = text.strip(punctuation)

                    if stripped_text in constants.AFFIRMATIVES:
                        text = 'AFF'
                    elif stripped_text in constants.NEGATIVES:
                        text = 'NEG'
                    elif text.startswith('^'):
                        # Compare by value (not identity) and tolerate empty
                        # lines; the unpacking raises ValueError when no text
                        # follows the suggestion prefix.
                        (suggestion, text) = text.split(maxsplit=1)
                        suggestion_tags.extend(
                            part for part in suggestion[1:].split('/') if part)

                    statement_search_text = self.chatbot.storage.tagger.get_bigram_pair_string(
                        text)

                    statement = Statement(
                        text=text,
                        search_text=statement_search_text,
                        in_response_to=previous_statement_text,
                        search_in_response_to=previous_statement_search_text,
                        conversation='training')

                    # YesNoLogicAdapter deals with responses to AFF/NEG via statement tags.
                    # No need for statements in_response_to = AFF/NEG   In fact, it was causing
                    # erroneous responses
                    if statement.in_response_to in markers:
                        statement.in_response_to = None
                        statement.search_in_response_to = None

                    statement.add_tags(*categories)

                    for suggestion in suggestion_tags:
                        statement.add_tags('SUGGESTION:' + suggestion)

                    # The marker is the last statement of this conversation;
                    # the one it answered sits just before it, if it exists.
                    if (previous_statement_text in markers
                            and len(conversation_statements) >= 2):
                        conversation_statements[-2].add_tags(
                            previous_statement_text + ':' + statement.text)

                    statement = self.get_preprocessed_statement(statement)
                    previous_statement_text = statement.text
                    previous_statement_search_text = statement_search_text

                    conversation_statements.append(statement)
                    statements_to_create.append(statement)

            # Using update() because create_many() makes duplicate statements. AFF/NEG tag data was lost on some.
            for stmnts in statements_to_create:
                self.chatbot.storage.update(stmnts)
Exemple #34
0
    def test_load_corpus_italian(self):
        """
        Loading the Italian data directory produces at least one entry.
        """
        italian_files = corpus.list_corpus_files('chatterbot_corpus/data/italian')
        entry_count = len(list(corpus.load_corpus(*italian_files)))

        self.assertTrue(entry_count)
Exemple #35
0
    def test_load_corpus_english(self):
        """
        Loading the English data directory produces at least one entry.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot_corpus/data/english')
        ))
        entry_count = len(loaded)

        self.assertTrue(entry_count)
Exemple #36
0
    def test_load_corpus_english_trailing_slash(self):
        """
        Listing the English directory with a trailing "/" must still yield
        more than one loaded entry.
        """
        directory_with_slash = os.path.join(corpus.DATA_DIRECTORY, 'english') + '/'
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files(directory_with_slash)
        ))
        entry_count = len(loaded)

        self.assertGreater(entry_count, 1)
Exemple #37
0
    def test_load_corpus_marathi(self):
        """
        Loading the Marathi data directory produces at least one entry.
        """
        marathi_files = corpus.list_corpus_files('chatterbot_corpus/data/marathi')
        entry_count = len(list(corpus.load_corpus(*marathi_files)))

        self.assertTrue(entry_count)
Exemple #38
0
    def test_load_corpus_indonesia(self):
        """
        The "indonesia" corpus must load with a non-zero number of entries.
        """
        loaded = list(corpus.load_corpus(
            *corpus.list_corpus_files('chatterbot.corpus.indonesia')
        ))
        entry_count = len(loaded)

        self.assertTrue(entry_count)