Example #1
0
    def __init__(self, chatbot, **kwargs):
        """
        Keep a reference to the chat bot and read optional settings.

        :param chatbot: Object stored on the instance as ``self.chatbot``.
        :param kwargs: Optional settings. ``show_training_progress``
            defaults to ``True``; ``stemmer_language`` defaults to
            ``'english'`` and is handed to ``SimpleStemmer``.
        """
        self.chatbot = chatbot

        show_progress = kwargs.get('show_training_progress', True)
        self.show_training_progress = show_progress

        stemmer_language = kwargs.get('stemmer_language', 'english')
        self.stemmer = SimpleStemmer(language=stemmer_language)
Example #2
0
    def __init__(self, chatbot, **kwargs):
        """
        Keep a reference to the chat bot and read optional settings.

        The default for ``show_training_progress`` is taken from the
        ``CHATTERBOT_SHOW_TRAINING_PROGRESS`` environment variable. An
        unset variable means ``True``; an empty value or one of ``0``,
        ``false``, ``no`` or ``off`` (case-insensitive) means ``False``;
        anything else means ``True``. An explicit
        ``show_training_progress`` keyword argument always wins.

        :param chatbot: Object stored on the instance as ``self.chatbot``.
        :param kwargs: Optional settings: ``show_training_progress`` and
            ``stemmer_language`` (defaults to ``'english'``).
        """
        self.chatbot = chatbot

        # os.getenv returns a string whenever the variable is set, so using
        # it directly would treat values such as "0" or "false" as truthy.
        environment_value = os.getenv('CHATTERBOT_SHOW_TRAINING_PROGRESS')
        if environment_value is None:
            environment_default = True
        else:
            environment_default = environment_value.strip().lower() not in (
                '', '0', 'false', 'no', 'off',
            )

        self.show_training_progress = kwargs.get('show_training_progress',
                                                 environment_default)

        self.stemmer = SimpleStemmer(
            language=kwargs.get('stemmer_language', 'english'))
Example #3
0
    def __init__(self, *args, **kwargs):
        """
        Set up the attributes that every storage adapter has in common.

        :param kwargs: Kept as ``self.kwargs``. ``logger`` overrides the
            module-level logger; ``stemmer_language`` (default
            ``'english'``) is handed to ``SimpleStemmer``.
        """
        default_logger = logging.getLogger(__name__)
        stemmer_language = kwargs.get('stemmer_language', 'english')

        self.kwargs = kwargs
        self.logger = kwargs.get('logger', default_logger)
        self.adapter_supports_queries = True
        self.stemmer = SimpleStemmer(language=stemmer_language)
Example #4
0
    def train(self):
        """
        Parse the corpus ``.tsv`` files in worker processes and store the
        results batch by batch.

        The corpus is downloaded and extracted when needed. The matching
        files are split into groups of 10000 and each group is handed to
        ``read_file`` together with a shared queue, the chat bot's
        preprocessors and a stemmer. Every item that appears on the queue
        is passed to ``self.chatbot.storage.create_many`` while progress
        and a time estimate are printed.

        An exception raised inside a worker is re-raised here once the
        queue has been drained. On any failure the pool is terminated
        instead of being left running.
        """
        import glob
        from chatterbot.stemming import SimpleStemmer

        stemmer = SimpleStemmer()

        # Download and extract the Ubuntu dialog corpus if needed
        corpus_download_path = self.download(self.data_download_url)

        # Extract if the directory does not already exist
        if not self.is_extracted(self.extracted_data_directory):
            self.extract(corpus_download_path)

        extracted_corpus_path = os.path.join(
            self.extracted_data_directory,
            '**', '**', '*.tsv'
        )

        def chunks(items, items_per_chunk):
            # Yield consecutive slices of at most items_per_chunk items.
            for start_index in range(0, len(items), items_per_chunk):
                yield items[start_index:start_index + items_per_chunk]

        file_list = glob.glob(extracted_corpus_path)

        file_groups = list(chunks(file_list, 10000))

        manager = Manager()
        queue = manager.Queue()

        arguments = [
            (file_group, queue, self.chatbot.preprocessors, stemmer)
            for file_group in file_groups
        ]

        batch_number = 0
        remaining_batches = len(arguments)

        pool = Pool()

        try:
            map_result = pool.starmap_async(read_file, arguments)

            start_time = time.time()

            print('After map call')

            while True:

                if not queue.empty():
                    queued_statements = queue.get()

                    batch_number += 1
                    remaining_batches -= 1

                    print('Training with batch {} with {} batches remaining..'.format(
                        batch_number,
                        remaining_batches
                    ))

                    elapsed_time = time.time() - start_time
                    time_per_batch = elapsed_time / batch_number
                    remaining_time = time_per_batch * remaining_batches

                    # Hours are not wrapped at 24: there is no "days" field,
                    # so wrapping would under-report long runs.
                    print('{:.0f} hours {:.0f} minutes {:.0f} seconds elapsed.'.format(
                        elapsed_time // 3600,
                        elapsed_time // 60 % 60,
                        elapsed_time % 60
                    ))

                    print('{:.0f} hours {:.0f} minutes {:.0f} seconds remaining.'.format(
                        remaining_time // 3600,
                        remaining_time // 60 % 60,
                        remaining_time % 60
                    ))
                    print('---')

                    self.chatbot.storage.create_many(queued_statements)

                # Stop only when the workers are finished and everything
                # they produced has been consumed.
                if map_result.ready() and queue.empty():
                    break

                time.sleep(0.1)

            # Re-raise an exception from a worker; without this call a
            # failed worker would go unnoticed.
            map_result.get()

            print('Pool about to close')

            pool.close()

            print('Pool closed')
        except BaseException:
            # Do not leave worker processes running after a failure.
            pool.terminate()
            raise
        finally:
            # join() is valid after either close() or terminate().
            pool.join()

        print('Pool joined')

        print('Training took', time.time() - start_time, 'seconds.')
Example #5
0
    def train(self):
        """
        Parse the corpus ``.tsv`` files in worker processes and store the
        results batch by batch.

        The corpus is downloaded and extracted when needed. The matching
        files are split into groups of 10000, and the groups are handed to
        the pool nine at a time. After each pool run, every item on the
        shared queue is passed to ``self.chatbot.storage.create_many`` and
        elapsed and estimated remaining time are printed.
        """
        import glob
        from chatterbot.stemming import SimpleStemmer

        stemmer = SimpleStemmer()

        # Download and extract the Ubuntu dialog corpus if needed
        corpus_download_path = self.download(self.data_download_url)

        # Extract if the directory does not already exist
        if not self.is_extracted(self.extracted_data_directory):
            self.extract(corpus_download_path)

        extracted_corpus_path = os.path.join(self.extracted_data_directory,
                                             '**', '**', '*.tsv')

        manager = Manager()
        queue = manager.Queue()

        def chunks(items, items_per_chunk):
            # Yield consecutive slices of at most items_per_chunk items.
            for start_index in range(0, len(items), items_per_chunk):
                end_index = start_index + items_per_chunk
                yield items[start_index:end_index]

        file_list = glob.glob(extracted_corpus_path)

        file_groups = tuple(chunks(file_list, 10000))

        argument_groups = tuple((
            file_names,
            queue,
            self.chatbot.preprocessors,
            stemmer,
        ) for file_names in file_groups)

        pool_batches = chunks(argument_groups, 9)

        total_batches = len(file_groups)
        batch_number = 0

        start_time = time.time()

        with Pool() as pool:
            for pool_batch in pool_batches:
                # starmap blocks until every worker in this batch is done.
                pool.starmap(read_file, pool_batch)

                while not queue.empty():
                    batch_number += 1

                    print(
                        'Training with batch {} with {} batches remaining...'
                        .format(batch_number, total_batches - batch_number))

                    self.chatbot.storage.create_many(queue.get())

                if batch_number == 0:
                    # Nothing has been dequeued yet, so there is no
                    # per-batch time to extrapolate from (and dividing by
                    # zero would abort training).
                    continue

                elapsed_time = time.time() - start_time
                time_per_batch = elapsed_time / batch_number
                remaining_time = time_per_batch * (total_batches -
                                                   batch_number)

                # Hours are not wrapped at 24: there is no "days" field, so
                # wrapping would under-report long runs.
                print('{:.0f} hours {:.0f} minutes {:.0f} seconds elapsed.'
                      .format(elapsed_time // 3600,
                              elapsed_time // 60 % 60, elapsed_time % 60))

                print('{:.0f} hours {:.0f} minutes {:.0f} seconds remaining.'
                      .format(remaining_time // 3600,
                              remaining_time // 60 % 60, remaining_time % 60))
                print('---')

        print('Training took', time.time() - start_time, 'seconds.')