def __init__(self, chatbot, **kwargs):
    """
    Store the chat bot being trained and configure trainer options.

    Keyword arguments:
    * show_training_progress -- print progress output while training (default True)
    * stemmer_language -- language passed to ``SimpleStemmer`` (default 'english')
    """
    self.chatbot = chatbot

    # Progress output stays on unless the caller explicitly disables it.
    self.show_training_progress = kwargs.get('show_training_progress', True)

    stemmer_language = kwargs.get('stemmer_language', 'english')
    self.stemmer = SimpleStemmer(language=stemmer_language)
def __init__(self, chatbot, **kwargs):
    """
    Store the chat bot being trained and configure trainer options.

    Keyword arguments:
    * show_training_progress -- print progress output while training;
      defaults to the CHATTERBOT_SHOW_TRAINING_PROGRESS environment
      variable when set, otherwise True
    * stemmer_language -- language passed to ``SimpleStemmer`` (default 'english')
    """
    self.chatbot = chatbot

    # BUG FIX: os.getenv returns a *string* whenever the variable is set,
    # and any non-empty string (including 'False') is truthy. Parse the
    # common false-y spellings so e.g. CHATTERBOT_SHOW_TRAINING_PROGRESS=false
    # actually disables progress output.
    environment_setting = os.getenv('CHATTERBOT_SHOW_TRAINING_PROGRESS')
    if environment_setting is None:
        environment_default = True
    else:
        environment_default = environment_setting.strip().lower() not in (
            '', '0', 'false', 'no', 'off'
        )

    # An explicit keyword argument always overrides the environment.
    self.show_training_progress = kwargs.get(
        'show_training_progress', environment_default
    )

    self.stemmer = SimpleStemmer(
        language=kwargs.get('stemmer_language', 'english')
    )
def __init__(self, *args, **kwargs):
    """
    Initialize common attributes shared by all storage adapters.
    """
    # Keep the raw keyword arguments around for subclasses to inspect.
    self.kwargs = kwargs

    # Fall back to a module-level logger when none is supplied.
    default_logger = logging.getLogger(__name__)
    self.logger = kwargs.get('logger', default_logger)

    # Adapters that cannot run queries may flip this off.
    self.adapter_supports_queries = True

    stemmer_language = kwargs.get('stemmer_language', 'english')
    self.stemmer = SimpleStemmer(language=stemmer_language)
def train(self):
    """
    Download, extract, and train on the Ubuntu dialog corpus.

    Files are processed by a multiprocessing pool (via the module-level
    ``read_file`` worker — defined elsewhere in this file); finished
    batches of statements are pulled off a shared queue and written to
    storage while the workers keep running.
    """
    import glob
    from chatterbot.stemming import SimpleStemmer

    stemmer = SimpleStemmer()

    # Download and extract the Ubuntu dialog corpus if needed
    corpus_download_path = self.download(self.data_download_url)

    # Extract if the directory does not already exist
    if not self.is_extracted(self.extracted_data_directory):
        self.extract(corpus_download_path)

    # Glob pattern matching the corpus .tsv files two levels deep.
    extracted_corpus_path = os.path.join(
        self.extracted_data_directory,
        '**', '**', '*.tsv'
    )

    # A managed queue so pool workers can hand results back to this process.
    manager = Manager()
    queue = manager.Queue()

    pool = Pool()

    def chunks(l, n):
        # Yield successive n-sized slices of l.
        for i in range(0, len(l), n):
            yield l[i:i + n]

    file_list = glob.glob(extracted_corpus_path)

    # Each worker call handles up to 10000 files.
    file_groups = list(chunks(file_list, 10000))

    arguments = [
        (file_group, queue, self.chatbot.preprocessors, stemmer)
        for file_group in file_groups
    ]

    batch_number = 0
    remaining_batches = len(arguments)

    # Kick off all batches asynchronously; results arrive via the queue.
    map_result = pool.starmap_async(read_file, arguments)

    start_time = time.time()

    print('After map call')

    # Drain the queue while workers run; exit once the map is done and
    # the queue is empty.
    while True:
        if not queue.empty():
            queue_statemens = queue.get()
            batch_number += 1
            remaining_batches -= 1
            print('Training with batch {} with {} batches remaining..'.format(
                batch_number, remaining_batches
            ))
            # Estimate remaining time from the average time per batch so far.
            elapsed_time = time.time() - start_time
            time_per_batch = elapsed_time / batch_number
            remaining_time = time_per_batch * remaining_batches
            print('{:.0f} hours {:.0f} minutes {:.0f} seconds elapsed.'.format(
                elapsed_time // 3600 % 24,
                elapsed_time // 60 % 60,
                elapsed_time % 60
            ))
            print('{:.0f} hours {:.0f} minutes {:.0f} seconds remaining.'.format(
                remaining_time // 3600 % 24,
                remaining_time // 60 % 60,
                remaining_time % 60
            ))
            print('---')
            # Persist the batch of statements produced by the worker.
            self.chatbot.storage.create_many(queue_statemens)
        if map_result.ready() and queue.empty():
            break
        # Avoid a tight busy-wait while polling the queue.
        time.sleep(0.1)

    print('Pool about to close')
    pool.close()
    print('Pool closed')
    pool.join()
    print('Pool joined')

    print('Training took', time.time() - start_time, 'seconds.')
def train(self):
    """
    Download, extract, and train on the Ubuntu dialog corpus.

    File groups are dispatched to a multiprocessing pool in batches
    (via the module-level ``read_file`` worker — defined elsewhere in
    this file); after each synchronous batch completes, the result
    queue is drained into storage.
    """
    import glob
    from chatterbot.stemming import SimpleStemmer

    stemmer = SimpleStemmer()

    # Download and extract the Ubuntu dialog corpus if needed
    corpus_download_path = self.download(self.data_download_url)

    # Extract if the directory does not already exist
    if not self.is_extracted(self.extracted_data_directory):
        self.extract(corpus_download_path)

    # Glob pattern matching the corpus .tsv files two levels deep.
    extracted_corpus_path = os.path.join(self.extracted_data_directory, '**', '**', '*.tsv')

    # A managed queue so pool workers can hand results back to this process.
    manager = Manager()
    queue = manager.Queue()

    def chunks(items, items_per_chunk):
        # Yield successive items_per_chunk-sized slices of items.
        for start_index in range(0, len(items), items_per_chunk):
            end_index = start_index + items_per_chunk
            yield items[start_index:end_index]

    file_list = glob.glob(extracted_corpus_path)

    # Each worker call handles up to 10000 files.
    file_groups = tuple(chunks(file_list, 10000))

    # One argument tuple per worker call.
    argument_groups = tuple((
        file_names,
        queue,
        self.chatbot.preprocessors,
        stemmer,
    ) for file_names in file_groups)

    # Dispatch at most 9 worker calls to the pool at a time.
    pool_batches = chunks(argument_groups, 9)

    total_batches = len(file_groups)
    batch_number = 0
    start_time = time.time()

    with Pool() as pool:
        for pool_batch in pool_batches:
            # Blocks until every call in this batch has finished.
            pool.starmap(read_file, pool_batch)
            # Drain all results queued by the batch that just completed.
            while True:
                if queue.empty():
                    break
                batch_number += 1
                print(
                    'Training with batch {} with {} batches remaining...'.
                    format(batch_number, total_batches - batch_number))
                # Persist the batch of statements produced by the worker.
                self.chatbot.storage.create_many(queue.get())
                # Estimate remaining time from the average time per batch so far.
                elapsed_time = time.time() - start_time
                time_per_batch = elapsed_time / batch_number
                remaining_time = time_per_batch * (total_batches - batch_number)
                print('{:.0f} hours {:.0f} minutes {:.0f} seconds elapsed.'.
                      format(elapsed_time // 3600 % 24,
                             elapsed_time // 60 % 60, elapsed_time % 60))
                print('{:.0f} hours {:.0f} minutes {:.0f} seconds remaining.'.
                      format(remaining_time // 3600 % 24,
                             remaining_time // 60 % 60, remaining_time % 60))
                print('---')

    print('Training took', time.time() - start_time, 'seconds.')