def _create_lookup_from_states(
    self,
    trackers_as_states: List[List[State]],
    trackers_as_actions: List[List[Text]],
) -> Dict[Text, Text]:
    """Builds the feature-key -> action lookup from the given trackers.

    Each tracker contributes one candidate entry: the key returned by
    `self._create_feature_key` for its states, mapped to its single action.
    Trackers whose key is falsy are skipped. A key that shows up with two
    different actions is removed from the lookup and never added again.

    Args:
        trackers_as_states: representation of the trackers as a list of states
        trackers_as_actions: representation of the trackers as a list of
            actions; the first entry is asserted to hold exactly one action

    Returns:
        the lookup dictionary (empty when `trackers_as_states` is empty)
    """
    lookup = {}

    if not trackers_as_states:
        return lookup

    assert len(trackers_as_actions[0]) == 1, (
        f"The second dimension of trackers_as_action should be 1, "
        f"instead of {len(trackers_as_actions[0])}"
    )

    # keys that have been seen with more than one action
    contradicting_keys = set()

    progress = tqdm(
        zip(trackers_as_states, trackers_as_actions),
        desc="Processed actions",
        disable=is_logging_disabled(),
    )
    for tracker_states, tracker_actions in progress:
        action = tracker_actions[0]

        key = self._create_feature_key(tracker_states)
        if not key:
            continue

        if key not in contradicting_keys:
            if key not in lookup:
                lookup[key] = action
            elif lookup[key] != action:
                # contradicting example created by partial history
                # augmentation: forget it and never memorize it again
                contradicting_keys.add(key)
                del lookup[key]

        progress.set_postfix({"# examples": "{:d}".format(len(lookup))})

    return lookup
def fit(
    self,
    model_data: RasaModelData,
    epochs: int,
    batch_size: Union[List[int], int],
    evaluate_on_num_examples: int,
    evaluate_every_num_epochs: int,
    batch_strategy: Text,
    silent: bool = False,
    loading: bool = False,
    eager: bool = False,
) -> None:
    """Fit model data.

    Args:
        model_data: data to train on; when `evaluate_on_num_examples` is
            positive it is split into a training and an evaluation part
            via `model_data.split`.
        epochs: number of epochs to iterate over.
        batch_size: forwarded, together with the current epoch and `epochs`,
            to `self.linearly_increasing_batch_size` to obtain the batch
            size of each epoch.
        evaluate_on_num_examples: validation is only set up and run when
            this is greater than 0; it is the first argument of the split.
        evaluate_every_num_epochs: forwarded to `self._should_evaluate` to
            decide whether validation runs in a given epoch.
        batch_strategy: forwarded to `self._get_tf_train_functions`.
        silent: if true (or logging is disabled) the progress bar and the
            "validation"/"finished" info messages are suppressed.
        loading: if true, the tensorboard writers are not set up.
        eager: forwarded to the helpers that build the train and evaluation
            functions.
    """
    if not loading:
        # don't setup tensorboard writers when training during loading
        self._set_up_tensorboard_writer()

    tf.random.set_seed(self.random_seed)
    np.random.seed(self.random_seed)

    quiet = silent or is_logging_disabled()
    with_validation = evaluate_on_num_examples > 0

    training_model_data = model_data
    evaluation_model_data = None
    if with_validation:
        if not quiet:
            logger.info(
                f"Validation accuracy is calculated every "
                f"{evaluate_every_num_epochs} epochs."
            )
        training_model_data, evaluation_model_data = model_data.split(
            evaluate_on_num_examples, self.random_seed
        )

    train_dataset_function, tf_train_on_batch_function = (
        self._get_tf_train_functions(eager, training_model_data, batch_strategy)
    )
    evaluation_dataset_function, tf_evaluation_on_batch_function = (
        self._get_tf_evaluation_functions(eager, evaluation_model_data)
    )

    # validation is not performed every epoch, so the latest validation
    # results are carried over between epochs
    val_results = {}
    training_steps = 0

    epoch_bar = tqdm(range(epochs), desc="Epochs", disable=quiet)
    for epoch in epoch_bar:
        epoch_batch_size = self.linearly_increasing_batch_size(
            epoch, batch_size, epochs
        )

        # training pass
        training_steps = self._batch_loop(
            train_dataset_function,
            tf_train_on_batch_function,
            epoch_batch_size,
            True,
            training_steps,
            self.train_summary_writer,
        )
        if self.tensorboard_log_on_epochs:
            self._log_metrics_for_tensorboard(epoch, self.train_summary_writer)

        shown_metrics = self._get_metric_results()

        if with_validation:
            if self._should_evaluate(evaluate_every_num_epochs, epochs, epoch):
                # validation pass
                self._batch_loop(
                    evaluation_dataset_function,
                    tf_evaluation_on_batch_function,
                    epoch_batch_size,
                    False,
                    training_steps,
                    self.test_summary_writer,
                )
                if self.tensorboard_log_on_epochs:
                    self._log_metrics_for_tensorboard(
                        epoch, self.test_summary_writer
                    )

                val_results = self._get_metric_results(prefix="val_")
                self._save_model_checkpoint(
                    current_results=val_results, epoch=epoch
                )

            shown_metrics.update(val_results)

        epoch_bar.set_postfix(shown_metrics)

    if self.checkpoint_model:
        logger.info(
            f"The model of epoch {self.best_model_epoch} "
            f"(out of {epochs} in total) will be stored!"
        )
    if self.model_summary_file is not None:
        self._write_model_summary()

    # training phase should be defined when building a graph
    self._training = None
    if not quiet:
        logger.info("Finished training.")
def _generate(self, story_steps: List[StoryStep],
              is_rule_data: bool = False) -> List[TrackerWithCachedStates]:
    """Generates trackers by repeatedly traversing the given story steps.

    One "phase" is a full traversal of `story_steps`. Trackers are kept in
    `active_trackers`, keyed by checkpoint name, and are handed from step to
    step through the steps' start/end checkpoints. Phases are repeated until
    no new checkpoints are reached (a heuristic, see the comment above the
    main loop); afterwards up to three further augmentation phases are run
    if `self.config.augmentation_factor > 0` and the data is not rule data.

    Args:
        story_steps: the steps to traverse; an empty value yields `[]`.
        is_rule_data: passed to the initial tracker as `is_rule_tracker`,
            switches the wording of log/progress messages, and disables the
            augmentation phases.

    Returns:
        The finished trackers. When `self.config.augmentation_factor > 0`
        the list is ordered as non-augmented trackers followed by the
        subsampled augmented trackers.
    """
    if not story_steps:
        logger.debug(
            f"No {'rules' if is_rule_data else 'story blocks'} found.")
        return []

    if self.config.remove_duplicates and self.config.unique_last_num_states:
        logger.debug("Generated trackers will be deduplicated "
                     "based on their unique last {} states."
                     "".format(self.config.unique_last_num_states))

    self._mark_first_action_in_story_steps_as_unpredictable()

    # checkpoint name -> trackers currently waiting at that checkpoint;
    # a defaultdict so that unknown checkpoint names read as an empty list
    active_trackers = defaultdict(list)

    # every traversal starts from a single empty tracker at STORY_START
    init_tracker = TrackerWithCachedStates(
        "",
        self.domain.slots,
        max_event_history=self.config.tracker_limit,
        domain=self.domain,
        is_rule_tracker=is_rule_data,
    )
    active_trackers[STORY_START].append(init_tracker)

    # trackers that are sent to a featurizer
    finished_trackers = []
    # keep story end trackers separately for augmentation
    story_end_trackers = []

    phase = 0  # one phase is one traversal of all story steps.

    # do not augment rule data
    if not is_rule_data:
        min_num_aug_phases = 3 if self.config.augmentation_factor > 0 else 0
        logger.debug(
            f"Number of augmentation rounds is {min_num_aug_phases}")
    else:
        min_num_aug_phases = 0

    # placeholder to track gluing process of checkpoints
    used_checkpoints = set()
    previous_unused = set()
    everything_reachable_is_reached = False

    # we will continue generating data until we have reached all
    # checkpoints that seem to be reachable. This is a heuristic,
    # if we did not reach any new checkpoints in an iteration, we
    # assume we have reached all and stop.
    # Once that flag is set, `phase` is reset to 0 (see below), so the
    # second half of the condition counts augmentation phases only.
    while not everything_reachable_is_reached or phase < min_num_aug_phases:
        phase_name = self._phase_name(everything_reachable_is_reached,
                                      phase)

        num_active_trackers = self._count_trackers(active_trackers)

        if num_active_trackers:
            logger.debug("Starting {} ... (with {} trackers)"
                         "".format(phase_name, num_active_trackers))
        else:
            # nothing left to process: stop, even if further
            # augmentation phases were still planned
            logger.debug(f"There are no trackers for {phase_name}")
            break

        # track unused checkpoints for this phase
        unused_checkpoints: Set[Text] = set()

        desc = f"Processed {'rules' if is_rule_data else 'story blocks'}"
        pbar = tqdm(story_steps, desc=desc, disable=is_logging_disabled())
        for step in pbar:
            # collect the trackers waiting at any of this step's
            # start checkpoints
            incoming_trackers: List[TrackerWithCachedStates] = []
            for start in step.start_checkpoints:
                if active_trackers[start.name]:
                    ts = start.filter_trackers(active_trackers[start.name])
                    incoming_trackers.extend(ts)
                    used_checkpoints.add(start.name)
                elif start.name not in used_checkpoints:
                    # need to skip - there was no previous step that
                    # had this start checkpoint as an end checkpoint
                    # it will be processed in next phases
                    unused_checkpoints.add(start.name)
            if not incoming_trackers:
                # if there are no trackers,
                # we can skip the rest of the loop
                continue

            # these are the trackers that reached this story
            # step and that need to handle all events of the step
            if self.config.remove_duplicates:
                incoming_trackers, end_trackers = self._remove_duplicate_trackers(
                    incoming_trackers)
                # append end trackers to finished trackers
                finished_trackers.extend(end_trackers)

            if everything_reachable_is_reached:
                # augmentation round
                incoming_trackers = self._subsample_trackers(
                    incoming_trackers,
                    self.config.max_number_of_augmented_trackers)

            # update progress bar
            pbar.set_postfix(
                {"# trackers": "{:d}".format(len(incoming_trackers))})

            trackers, end_trackers = self._process_step(
                step, incoming_trackers)

            # add end trackers to finished trackers
            finished_trackers.extend(end_trackers)

            # update our tracker dictionary with the trackers
            # that handled the events of the step and
            # that can now be used for further story steps
            # that start with the checkpoint this step ended with
            for end in step.end_checkpoints:

                start_name = self._find_start_checkpoint_name(end.name)

                active_trackers[start_name].extend(trackers)

                if start_name in used_checkpoints:
                    # add end checkpoint as unused
                    # if this checkpoint was processed as
                    # start one before
                    unused_checkpoints.add(start_name)

            if not step.end_checkpoints:
                # the step ends a story: remember its (deduplicated)
                # trackers as seeds for the augmentation phases
                unique_ends = self._remove_duplicate_story_end_trackers(
                    trackers)
                story_end_trackers.extend(unique_ends)

        num_finished = len(finished_trackers) + len(story_end_trackers)
        logger.debug(
            f"Finished phase ({num_finished} training samples found).")

        # prepare next round
        phase += 1

        if not everything_reachable_is_reached:
            # check if we reached all nodes that can be reached
            # if we reached at least one more node this round
            # than last one, we assume there is still
            # something left to reach and we continue
            unused_checkpoints = self._add_unused_end_checkpoints(
                set(active_trackers.keys()), unused_checkpoints,
                used_checkpoints)
            active_trackers = self._filter_active_trackers(
                active_trackers, unused_checkpoints)
            num_active_trackers = self._count_trackers(active_trackers)

            # done when the unused set stopped changing between two
            # phases, or when there is nothing left to continue with
            everything_reachable_is_reached = (unused_checkpoints
                                               == previous_unused or
                                               num_active_trackers == 0)
            previous_unused = unused_checkpoints

            if everything_reachable_is_reached:
                # should happen only once
                # NOTE(review): `previous_unused` and `unused_checkpoints`
                # are the same object here, so if it is a plain set this
                # `-=` mutates both; `unused_checkpoints` is rebound at
                # the start of the next phase.
                previous_unused -= used_checkpoints
                # add trackers with unused checkpoints
                # to finished trackers
                for start_name in previous_unused:
                    finished_trackers.extend(active_trackers[start_name])

                logger.debug("Data generation rounds finished.")
                logger.debug("Found {} unused checkpoints".format(
                    len(previous_unused)))
                # restart the counter so that `min_num_aug_phases`
                # augmentation phases follow
                phase = 0
            else:
                logger.debug("Found {} unused checkpoints "
                             "in current phase."
                             "".format(len(unused_checkpoints)))
                logger.debug("Found {} active trackers "
                             "for these checkpoints."
                             "".format(num_active_trackers))

        if everything_reachable_is_reached:
            # augmentation round, so we process only
            # story end checkpoints
            # reset used checkpoints
            used_checkpoints: Set[Text] = set()

            # generate active trackers for augmentation
            active_trackers = self._create_start_trackers_for_augmentation(
                story_end_trackers)

    # story end trackers were kept apart during the loop; they are part of
    # the result as well
    finished_trackers.extend(story_end_trackers)
    self._issue_unused_checkpoint_notification(previous_unused)
    logger.debug("Found {} training trackers.".format(
        len(finished_trackers)))

    if self.config.augmentation_factor > 0:
        # only the augmented trackers are subsampled; trackers whose
        # `is_augmented` is falsy are all kept
        augmented_trackers, original_trackers = [], []
        for t in finished_trackers:
            if t.is_augmented:
                augmented_trackers.append(t)
            else:
                original_trackers.append(t)
        augmented_trackers = self._subsample_trackers(
            augmented_trackers,
            self.config.max_number_of_augmented_trackers)
        logger.debug("Subsampled to {} augmented training trackers."
                     "".format(len(augmented_trackers)))
        logger.debug("There are {} original trackers.".format(
            len(original_trackers)))
        finished_trackers = original_trackers + augmented_trackers

    return finished_trackers