def __init__(self, output_series: str, decoder: MP) -> None:
    """Remember the output series name and the wrapped decoder.

    Arguments:
        output_series: Stored as ``self.output_series``.
        decoder: Stored as ``self._decoder``; the result of its
            ``get_dependencies()`` call is kept in ``self.all_coders``.
    """
    self.output_series = output_series
    self._decoder = decoder
    self.all_coders = decoder.get_dependencies()

    # A decoder with a 'data_id' attribute needs no further attention.
    if hasattr(decoder, "data_id"):
        return

    # The missing attribute is only reported, it is not an error here.
    message = "Top-level decoder {} does not have the 'data_id' attribute"
    notice(message.format(decoder.name))
def log_after_validation(self,
                         val_examples: int,
                         train_examples: int) -> None:
    """Report the duration and throughput of the latest validation.

    The most recent entries of ``self.inter_val_times`` and
    ``self.validation_times`` are used as the training and validation
    durations.

    Arguments:
        val_examples: Number of examples divided by the validation duration
            to get the validation speed.
        train_examples: Number of examples divided by the inter-validation
            duration to get the training speed.
    """
    train_duration = self.inter_val_times[-1]
    val_duration = self.validation_times[-1]

    train_speed = train_examples / train_duration
    val_speed = val_examples / val_duration

    template = ("Validation time: {:.2f}s ({:.1f} instances/sec), "
                "inter-validation: {:.2f}s, ({:.1f} instances/sec)")
    report = template.format(
        val_duration, val_speed, train_duration, train_speed)
    log(report, color="blue")

    # Training between validations should last at least twice as long as
    # the validation itself.
    if 2 * val_duration > train_duration:
        notice("Validation period setting is inefficient.")
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainer: GenericTrainer,  # TODO better annotate
                  batch_size: int,
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  runners: List[BaseRunner],
                  train_dataset: Dataset,
                  val_dataset: Union[Dataset, List[Dataset]],
                  test_datasets: Optional[List[Dataset]] = None,
                  logging_period: Union[str, int] = 20,
                  validation_period: Union[str, int] = 500,
                  val_preview_input_series: Optional[List[str]] = None,
                  val_preview_output_series: Optional[List[str]] = None,
                  val_preview_num_examples: int = 15,
                  train_start_offset: int = 0,
                  runners_batch_size: Optional[int] = None,
                  initial_variables: Optional[Union[str, List[str]]] = None,
                  postprocess: Postprocess = None) -> None:
    """Execute the training loop for given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: number of examples in one mini-batch
        log_directory: Directory where the TensorBoard log will be
            generated. If empty, no summary writer is created.
        evaluators: List of evaluators. The last evaluator is used as the
            main. An evaluator is a tuple of the name of the generated
            series, the name of the dataset series the generated one is
            evaluated with and the evaluation function. If only one series
            name is provided, it means the generated and dataset series
            have the same name.
        runners: List of runners for logging and evaluation runs
        train_dataset: Dataset used for training
        val_dataset: used for validation. Can be Dataset or a list of
            datasets. The last dataset is used as the main one for storing
            best results. When using multiple datasets, it is recommended
            to name them for better TensorBoard visualization.
        test_datasets: List of datasets used for testing
        logging_period: after how many batches should the logging happen.
            It can also be defined as a time period in format like:
            3s; 4m; 6h; 1d; 3m15s; 3seconds; 4minutes; 6hours; 1days
        validation_period: after how many batches should the validation
            happen. It can also be defined as a time period in same format
            as logging
        val_preview_input_series: which input series to preview in
            validation
        val_preview_output_series: which output series to preview in
            validation
        val_preview_num_examples: how many examples should be printed
            during validation
        train_start_offset: how many lines from the training dataset should
            be skipped. The training starts from the next batch.
        runners_batch_size: batch size of runners. It is the same as
            batch_size if not specified
        initial_variables: variables used for initialization, for example
            for continuation of training. Provide it with a path to your
            model directory and its checkpoint file group common prefix,
            e.g. "variables.data", or "variables.data.3" in case of
            multiple checkpoints per experiment.
        postprocess: A function which takes the dataset with its output
            series and generates additional series from them.

    Raises:
        ValueError: If no evaluators are given (the loss of the last runner
            becomes the main metric) while ``tf_manager.minimize_metric``
            is not set.
        KeyboardInterrupt: Re-raised after the final evaluation when the
            training was interrupted from the keyboard.
    """
    check_argument_types()

    if isinstance(val_dataset, Dataset):
        val_datasets = [val_dataset]
    else:
        val_datasets = val_dataset

    log_period_batch, log_period_time = _resolve_period(logging_period)
    val_period_batch, val_period_time = _resolve_period(validation_period)

    _check_series_collisions(runners, postprocess)

    _log_model_variables(var_list=trainer.var_list)

    if runners_batch_size is None:
        runners_batch_size = batch_size

    # Expand two-item evaluators so that every evaluator is a triple
    # (generated series, dataset series, evaluation function).
    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    if evaluators:
        main_metric = "{}/{}".format(evaluators[-1][0],
                                     evaluators[-1][-1].name)
    else:
        main_metric = "{}/{}".format(runners[-1].decoder_data_id,
                                     runners[-1].loss_names[0])

        if not tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")

    step = 0
    seen_instances = 0
    last_seen_instances = 0

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + [trainer], save=True)  # type: ignore
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.")

    # Bind the writer unconditionally so that the logging calls below do
    # not fail with UnboundLocalError when no log directory is given.
    # NOTE(review): assumes _log_continuous_evaluation tolerates a None
    # writer -- confirm against its implementation.
    tb_writer = None
    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(log_directory,
                                          tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    last_log_time = time.process_time()
    last_val_time = time.process_time()
    interrupt = None
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} begins".format(epoch_n), color="red")

            train_dataset.shuffle()
            train_batched_datasets = train_dataset.batch_dataset(batch_size)

            if epoch_n == 1 and train_start_offset:
                if not isinstance(train_dataset, LazyDataset):
                    warn("Not skipping training instances with "
                         "shuffled in-memory dataset")
                else:
                    _skip_lines(train_start_offset, train_batched_datasets)

            for batch_n, batch_dataset in enumerate(train_batched_datasets):
                step += 1
                seen_instances += len(batch_dataset)

                if _is_logging_time(step, log_period_batch,
                                    last_log_time, log_period_time):
                    trainer_result = tf_manager.execute(
                        batch_dataset, [trainer], train=True,
                        summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch_dataset,
                        postprocess, write_out=False,
                        batch_size=runners_batch_size)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v) for k, v in train_outputs.items()
                    }
                    train_evaluation = evaluation(
                        evaluators, batch_dataset, runners,
                        train_results, train_outputs)

                    _log_continuous_evaluation(
                        tb_writer, main_metric, train_evaluation,
                        seen_instances, epoch_n, epochs, trainer_result,
                        train=True)
                    last_log_time = time.process_time()
                else:
                    tf_manager.execute(batch_dataset, [trainer],
                                       train=True, summaries=False)

                if _is_logging_time(step, val_period_batch,
                                    last_val_time, val_period_time):
                    log_print("")
                    val_duration_start = time.process_time()
                    val_examples = 0
                    for val_id, valset in enumerate(val_datasets):
                        val_examples += len(valset)

                        val_results, val_outputs = run_on_dataset(
                            tf_manager, runners, valset,
                            postprocess, write_out=False,
                            batch_size=runners_batch_size)
                        # ensure val outputs are iterable more than once
                        val_outputs = {
                            k: list(v) for k, v in val_outputs.items()
                        }
                        val_evaluation = evaluation(
                            evaluators, valset, runners, val_results,
                            val_outputs)

                        valheader = (
                            "Validation (epoch {}, batch number {}):".format(
                                epoch_n, batch_n))
                        log(valheader, color="blue")
                        _print_examples(
                            valset, val_outputs, val_preview_input_series,
                            val_preview_output_series,
                            val_preview_num_examples)
                        log_print("")
                        log(valheader, color="blue")

                        # The last validation set is selected to be the main
                        if val_id == len(val_datasets) - 1:
                            this_score = val_evaluation[main_metric]
                            tf_manager.validation_hook(this_score, epoch_n,
                                                       batch_n)

                            if this_score == tf_manager.best_score:
                                best_score_str = colored("{:.4g}".format(
                                    tf_manager.best_score), attrs=["bold"])

                                # store also graph parts
                                all_coders = set.union(*[
                                    rnr.all_coders
                                    for rnr in runners + [trainer]
                                ])  # type: ignore
                                for coder in all_coders:
                                    for session in tf_manager.sessions:
                                        coder.save(session)
                            else:
                                best_score_str = "{:.4g}".format(
                                    tf_manager.best_score)

                            log("best {} on validation: {} (in epoch {}, "
                                "after batch number {})".format(
                                    main_metric, best_score_str,
                                    tf_manager.best_score_epoch,
                                    tf_manager.best_score_batch),
                                color="blue")

                        v_name = (valset.name
                                  if len(val_datasets) > 1 else None)
                        _log_continuous_evaluation(
                            tb_writer, main_metric, val_evaluation,
                            seen_instances, epoch_n, epochs, val_results,
                            train=False, dataset_name=v_name)

                    # how long was the training between validations
                    training_duration = val_duration_start - last_val_time
                    val_duration = time.process_time() - val_duration_start

                    # Per-instance times; guarded so that empty validation
                    # sets or empty batches cannot cause a division by zero
                    # in the middle of training.
                    instances_since_val = seen_instances - last_seen_instances
                    steptime = (training_duration / instances_since_val
                                if instances_since_val else 0.0)
                    valtime = (val_duration / val_examples
                               if val_examples else 0.0)
                    last_seen_instances = seen_instances
                    log("Validation time: {:.2f}s, inter-validation: {:.2f}s, "
                        "per-instance (train): {:.2f}s, per-instance (val): "
                        "{:.2f}s".format(val_duration, training_duration,
                                         steptime, valtime),
                        color="blue")

                    # the training should take at least twice the time of val.
                    if training_duration < 2 * val_duration:
                        notice("Validation period setting is inefficient.")

                    log_print("")
                    last_val_time = time.process_time()

    except KeyboardInterrupt as ex:
        # Remember the interrupt so that the final evaluation still runs;
        # it is re-raised at the very end.
        interrupt = ex

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}".
        format(main_metric, tf_manager.best_score,
               tf_manager.best_score_epoch))

    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batch_size=runners_batch_size)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")

    if interrupt is not None:
        raise interrupt  # pylint: disable=raising-bad-type
def from_wordlist(path: str,
                  encoding: str = "utf-8",
                  contains_header: bool = True,
                  contains_frequencies: bool = True) -> "Vocabulary":
    """Load a vocabulary from a wordlist.

    The file can contain either a list of words with no header, or words
    and their counts separated by a tab, with a header on the first line.

    The first ``len(SPECIAL_TOKENS)`` lines after the optional header are
    compared with the special tokens. A line that matches the expected
    special token is not added to the word list; a line that does not
    match is reported with a notice and added as an ordinary word.

    Arguments:
        path: The path to the wordlist file
        encoding: The encoding of the wordlist file (defaults to UTF-8)
        contains_header: if the file has a header on the first line
        contains_frequencies: if the file contains a second column

    Returns:
        The new Vocabulary instance.

    Raises:
        ValueError: If ``contains_frequencies`` is set and a non-empty line
            does not consist of exactly two tab-separated columns.
    """
    check_argument_types()
    vocabulary = []  # type: List[str]

    header_lines = int(contains_header)

    with open(path, encoding=encoding) as wordlist:
        if contains_header:
            # Skip the header. The default value keeps a completely empty
            # file from leaking StopIteration out of this function.
            next(wordlist, None)

        # Line numbers are 1-based and count the header line as well.
        for line_number, line in enumerate(wordlist,
                                           start=1 + header_lines):
            line = line.strip()

            # check if line is empty
            if not line:
                warn("Vocabulary file {}:{}: line empty"
                     .format(path, line_number))
                continue

            if contains_frequencies:
                info = line.split("\t")
                if len(info) != 2:
                    raise ValueError(
                        "Vocabulary file {}:{}: line does not have two columns"
                        .format(path, line_number))
                word = info[0]
            else:
                if "\t" in line:
                    warn("Vocabulary file {}:{}: line contains a tabulator"
                         .format(path, line_number))
                word = line

            if line_number <= len(SPECIAL_TOKENS) + header_lines:
                should_be = SPECIAL_TOKENS[line_number - 1 - header_lines]
                if word != should_be:
                    notice("Expected special token {} but encountered a "
                           "different word: {}".format(should_be, word))
                    vocabulary.append(word)
                # A matching special token is intentionally not appended.
                continue

            vocabulary.append(word)

    log("Vocabulary from wordlist loaded, containing {} words"
        .format(len(vocabulary)))
    log_sample(vocabulary)
    return Vocabulary(vocabulary)