Code example #1
    def test_buckets_similar_size(self):
        # testing dataset is 3 x 6 sequences of lengths 0 - 5
        iterators = {
            "sentences":
            lambda: [["word" for _ in range(l)] for l in range(6)] * 3
        }

        dataset = Dataset("dataset", iterators=iterators, shuffled=True)

        # we use batch size 6 and bucket span 2
        scheme = BatchingScheme(6, 2, False, None)

        # we process the dataset and record what the batches look like
        batches = []
        for batch in dataset.batches(scheme):
            batches.append(list(batch.get_series("sentences")))

        # this setup should divide the data into 3 batches
        self.assertEqual(len(batches), 3)

        for batch in batches:
            # each batch should contain 6 values
            self.assertEqual(len(batch), 6)

            lengths = set(len(b) for b in batch)

            # the batch should contain values of exactly two distinct lengths
            self.assertEqual(len(lengths), 2)

            # the lengths should differ by one
            self.assertEqual(max(lengths) - min(lengths), 1)
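
The three expected batches follow from simple counting: with bucket span 2,
lengths 0-5 fall into three buckets, and each bucket collects exactly six
sequences. A minimal stand-alone sketch of this arithmetic (it assumes buckets
are formed as length // bucket_span, which the test itself does not state):

# group the 18 sequence lengths into buckets of span 2
lengths = list(range(6)) * 3  # 0..5, three times
buckets = {}
for l in lengths:
    buckets.setdefault(l // 2, []).append(l)

# buckets == {0: [0, 1, 0, 1, 0, 1],
#             1: [2, 3, 2, 3, 2, 3],
#             2: [4, 5, 4, 5, 4, 5]}
# Each bucket holds exactly 6 sequences, i.e. one full batch of size 6 whose
# lengths differ by at most 1, which is what the assertions above verify.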
Code example #2
    def test_bucketing_no_leftovers(self):

        # testing dataset is 49 sequences of lengths 1 - 49
        iterators = {
            "sentences": lambda:
            (["word" for _ in range(l)] for l in range(1, 50))
        }

        dataset = Dataset("dataset", iterators=iterators, shuffled=False)

        # we use batch size 7 and bucket span 10
        scheme = BatchingScheme(7, 10, False, None, False)

        # we process the dataset and record what the batches look like
        batches = []
        for batch in dataset.batches(scheme):
            batches.append(list(batch.get_series("sentences")))

        ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                       [["word" for _ in range(l)] for l in range(10, 17)],
                       [["word" for _ in range(l)] for l in range(20, 27)],
                       [["word" for _ in range(l)] for l in range(30, 37)],
                       [["word" for _ in range(l)] for l in range(40, 47)]]

        self.assertSequenceEqual(ref_batches, batches)
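
The reference batches above can be reproduced with a small stand-alone sketch
of span-based bucketing. This only illustrates the behaviour the test encodes
(bucket by length // bucket_span, emit a batch whenever a bucket fills up,
drop incomplete buckets); it is not the library implementation:

from collections import defaultdict
from typing import Iterable, Iterator, List, Sequence


def bucketed_batches(sequences: Iterable[Sequence],
                     batch_size: int,
                     bucket_span: int,
                     drop_leftovers: bool = True
                     ) -> Iterator[List[Sequence]]:
    """Yield full batches of similarly long sequences, optionally dropping
    buckets that never fill up."""
    buckets = defaultdict(list)  # bucket index -> sequences collected so far
    for seq in sequences:
        bucket = buckets[len(seq) // bucket_span]
        bucket.append(seq)
        if len(bucket) == batch_size:
            yield list(bucket)
            bucket.clear()
    if not drop_leftovers:
        for leftover in buckets.values():
            if leftover:
                yield list(leftover)


data = [["word"] * l for l in range(1, 50)]
batches = list(bucketed_batches(data, batch_size=7, bucket_span=10))
# matches ref_batches: lengths 1-7, 10-16, 20-26, 30-36 and 40-46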
Code example #3
    def test_batching_lazy_shuffle(self):
        iterators = {"a": lambda: range(5), "b": lambda: range(5, 10)}

        dataset = Dataset("dataset",
                          iterators=iterators,
                          shuffled=True,
                          buffer_size=(3, 5))

        batches = []
        for _ in range(2):
            epoch_batches = []
            for batch in dataset.batches(DEFAULT_BATCHING_SCHEME):
                epoch_batches.append(
                    {s: list(batch.get_series(s)) for s in iterators})

            batches.append(epoch_batches)

        epoch_data = []
        epoch_data.append(
            [c for batch in batches[0] for b in batch.values() for c in b])
        epoch_data.append(
            [c for batch in batches[1] for b in batch.values() for c in b])

        self.assertEqual(set(epoch_data[0]), set(range(10)))
        self.assertEqual(set(epoch_data[0]), set(epoch_data[1]))
        self.assertNotEqual(epoch_data[0], epoch_data[1])
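
The test only checks that a lazily shuffled dataset yields every value exactly
once per epoch while the order differs between epochs. Conceptually this is
what a bounded shuffle buffer provides. The sketch below is a generic
illustration of that idea; it is not the Dataset implementation, and the exact
meaning of the buffer_size=(3, 5) tuple is internal to the library:

import random
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")


def shuffle_buffer(items: Iterable[T], max_size: int) -> Iterator[T]:
    """Yield items in pseudo-random order while keeping at most max_size
    items in memory at any time."""
    buf = []
    for item in items:
        buf.append(item)
        if len(buf) > max_size:
            yield buf.pop(random.randrange(len(buf)))
    random.shuffle(buf)
    yield from buf


# every item appears exactly once, but repeated runs differ in order
assert sorted(shuffle_buffer(range(10), max_size=5)) == list(range(10))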
Code example #4
    def test_batching_lazy_noshuffle(self):
        iterators = {"a": lambda: range(5), "b": lambda: range(10, 15)}

        dataset = Dataset("dataset",
                          iterators=iterators,
                          shuffled=False,
                          buffer_size=(3, 5))

        batches = []
        for _ in range(2):
            epoch_batches = []
            for batch in dataset.batches(DEFAULT_BATCHING_SCHEME):
                epoch_batches.append(
                    {s: list(batch.get_series(s)) for s in iterators})

            batches.append(epoch_batches)

        self.assertEqual(batches, [[{
            "a": [0, 1, 2],
            "b": [10, 11, 12]
        }, {
            "a": [3, 4],
            "b": [13, 14]
        }], [{
            "a": [0, 1, 2],
            "b": [10, 11, 12]
        }, {
            "a": [3, 4],
            "b": [13, 14]
        }]])
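
With shuffling disabled the expected output is plain fixed-size chunking;
judging from the reference batches, DEFAULT_BATCHING_SCHEME uses a batch size
of 3 here (an inference from the test, not a documented constant). A
stand-alone sketch of such chunking:

from itertools import islice
from typing import Iterable, Iterator, List, TypeVar

T = TypeVar("T")


def fixed_size_batches(items: Iterable[T],
                       batch_size: int) -> Iterator[List[T]]:
    """Split an iterable into consecutive chunks of at most batch_size
    items."""
    iterator = iter(items)
    while True:
        chunk = list(islice(iterator, batch_size))
        if not chunk:
            return
        yield chunk


assert list(fixed_size_batches(range(5), 3)) == [[0, 1, 2], [3, 4]]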
Code example #5
    def test_bucketing(self):

        # testing dataset is 49 sequences of lengths 1 - 49
        iterators = {
            "sentences": lambda:
            (["word" for _ in range(l)] for l in range(1, 50))
        }

        # we use batch size 7 for every bucket; the bucket boundaries are
        # at lengths 9, 19, 29, 39 and 49
        scheme = BatchingScheme(bucket_boundaries=[9, 19, 29, 39, 49],
                                bucket_batch_sizes=[7, 7, 7, 7, 7, 7])

        dataset = Dataset("dataset",
                          iterators=iterators,
                          batching=scheme,
                          shuffled=False)

        # we process the dataset and record what the batches look like
        batches = []
        for batch in dataset.batches():
            batches.append(list(batch.get_series("sentences")))

        ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                       [["word" for _ in range(l)] for l in range(10, 17)],
                       [["word" for _ in range(l)] for l in range(20, 27)],
                       [["word" for _ in range(l)] for l in range(30, 37)],
                       [["word" for _ in range(l)] for l in range(40, 47)],
                       [["word" for _ in range(l)] for l in range(8, 10)],
                       [["word" for _ in range(l)] for l in range(17, 20)],
                       [["word" for _ in range(l)] for l in range(27, 30)],
                       [["word" for _ in range(l)] for l in range(37, 40)],
                       [["word" for _ in range(l)] for l in range(47, 50)]]

        self.assertSequenceEqual(ref_batches, batches)
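
Here the buckets are defined by explicit boundaries instead of a span: with
bucket_boundaries=[9, 19, 29, 39, 49], a sequence of length 9 still belongs
to the first bucket, and bucket_batch_sizes needs one more entry than there
are boundaries because the last entry covers the overflow bucket. A minimal
sketch of that boundary lookup, assuming inclusive upper boundaries as the
reference batches imply:

import bisect
from typing import List


def bucket_index(length: int, boundaries: List[int]) -> int:
    """Index of the first boundary >= length; anything longer than the last
    boundary falls into the final overflow bucket."""
    return bisect.bisect_left(boundaries, length)


boundaries = [9, 19, 29, 39, 49]
assert bucket_index(9, boundaries) == 0    # lengths 1-9   -> bucket 0
assert bucket_index(10, boundaries) == 1   # lengths 10-19 -> bucket 1
assert bucket_index(50, boundaries) == 5   # overflow bucket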
Code example #6
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainers: List[Trainer],
                  batching_scheme: BatchingScheme,
                  runners_batching_scheme: BatchingScheme,
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  main_metric: str,
                  runners: List[BaseRunner],
                  train_dataset: Dataset,
                  val_datasets: List[Dataset],
                  test_datasets: Optional[List[Dataset]],
                  log_timer: Callable[[int, float], bool],
                  val_timer: Callable[[int, float], bool],
                  val_preview_input_series: Optional[List[str]],
                  val_preview_output_series: Optional[List[str]],
                  val_preview_num_examples: int,
                  postprocess: Optional[Postprocess],
                  train_start_offset: int,
                  initial_variables: Optional[Union[str, List[str]]],
                  final_variables: str) -> None:
    """Execute the training loop for given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algoritm will learn.
        trainer: The trainer object containg the TensorFlow code for computing
            the loss and optimization operation.
        batch_size: Number of examples in one mini-batch.
        batching_scheme: Batching scheme specification. Cannot be provided when
            batch_size is specified.
        log_directory: Directory where the TensordBoard log will be generated.
            If None, nothing will be done.
        evaluators: List of evaluators. The last evaluator is used as the main.
            An evaluator is a tuple of the name of the generated
            series, the name of the dataset series the generated one is
            evaluated with and the evaluation function. If only one
            series names is provided, it means the generated and
            dataset series have the same name.
        runners: List of runners for logging and evaluation runs
        train_dataset: Dataset used for training
        val_dataset: used for validation. Can be Dataset or a list of datasets.
            The last dataset is used as the main one for storing best results.
            When using multiple datasets. It is recommended to name them for
            better Tensorboard visualization.
        test_datasets: List of datasets used for testing
        logging_period: after how many batches should the logging happen. It
            can also be defined as a time period in format like: 3s; 4m; 6h;
            1d; 3m15s; 3seconds; 4minutes; 6hours; 1days
        validation_period: after how many batches should the validation happen.
            It can also be defined as a time period in same format as logging
        val_preview_input_series: which input series to preview in validation
        val_preview_output_series: which output series to preview in validation
        val_preview_num_examples: how many examples should be printed during
            validation
        train_start_offset: how many lines from the training dataset should be
            skipped. The training starts from the next batch.
        runners_batch_size: batch size of runners. Reuses the training batching
            scheme with bucketing turned off.
        initial_variables: variables used for initialization, for example for
            continuation of training. Provide it with a path to your model
            directory and its checkpoint file group common prefix, e.g.
            "variables.data", or "variables.data.3" in case of multiple
            checkpoints per experiment.
        postprocess: A function which takes the dataset with its output series
            and generates additional series from them.
    """
    check_argument_types()

    _check_series_collisions(runners, postprocess)

    _log_model_variables(
        var_list=list(set().union(*[t.var_list for t in trainers])))

    step = 0
    seen_instances = 0
    last_seen_instances = 0

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + trainers, save=True)  # type: ignore
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.)")

    # Ignoring type. Mypy complains about summing runner and trainer lists.
    feedables = set.union(
        *[ex.feedables for ex in runners + trainers])  # type: ignore

    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(
            log_directory, tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    last_log_time = time.process_time()
    last_val_time = time.process_time()
    interrupt = None
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} begins".format(epoch_n), color="red")

            train_batches = train_dataset.batches(batching_scheme)

            if epoch_n == 1 and train_start_offset:
                if train_dataset.shuffled and not train_dataset.lazy:
                    warn("Not skipping training instances with shuffled "
                         "non-lazy dataset")
                else:
                    _skip_lines(train_start_offset, train_batches)

            for batch_n, batch in enumerate(train_batches):
                step += 1
                seen_instances += len(batch)

                if log_timer(step, last_log_time):
                    trainer_result = tf_manager.execute(
                        batch, feedables, trainers, train=True, summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch, postprocess,
                        write_out=False,
                        batching_scheme=runners_batching_scheme)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v) for k, v in train_outputs.items()}
                    train_evaluation = evaluation(
                        evaluators, batch, runners, train_results,
                        train_outputs)

                    _log_continuous_evaluation(
                        tb_writer, main_metric, train_evaluation,
                        seen_instances, epoch_n, epochs, trainer_result,
                        train=True)
                    last_log_time = time.process_time()
                else:
                    tf_manager.execute(batch, feedables, trainers, train=True,
                                       summaries=False)

                if val_timer(step, last_val_time):
                    log_print("")
                    val_duration_start = time.process_time()
                    val_examples = 0
                    for val_id, valset in enumerate(val_datasets):
                        val_examples += len(valset)

                        val_results, val_outputs = run_on_dataset(
                            tf_manager, runners, valset,
                            postprocess, write_out=False,
                            batching_scheme=runners_batching_scheme)
                        # ensure val outputs are iterable more than once
                        val_outputs = {k: list(v)
                                       for k, v in val_outputs.items()}
                        val_evaluation = evaluation(
                            evaluators, valset, runners, val_results,
                            val_outputs)

                        valheader = ("Validation (epoch {}, batch number {}):"
                                     .format(epoch_n, batch_n))
                        log(valheader, color="blue")
                        _print_examples(
                            valset, val_outputs, val_preview_input_series,
                            val_preview_output_series,
                            val_preview_num_examples)
                        log_print("")
                        log(valheader, color="blue")

                        # The last validation set is selected to be the main
                        if val_id == len(val_datasets) - 1:
                            this_score = val_evaluation[main_metric]
                            tf_manager.validation_hook(this_score, epoch_n,
                                                       batch_n)

                            if this_score == tf_manager.best_score:
                                best_score_str = colored(
                                    "{:.4g}".format(tf_manager.best_score),
                                    attrs=["bold"])

                                # store also graph parts
                                rnrs = runners + trainers  # type: ignore
                                # TODO: refactor trainers/runners so that they
                                # have the same API predecessor
                                parameterizeds = set.union(
                                    *[rnr.parameterizeds
                                      for rnr in rnrs])
                                for coder in parameterizeds:
                                    for session in tf_manager.sessions:
                                        coder.save(session)
                            else:
                                best_score_str = "{:.4g}".format(
                                    tf_manager.best_score)

                            log("best {} on validation: {} (in epoch {}, "
                                "after batch number {})"
                                .format(main_metric, best_score_str,
                                        tf_manager.best_score_epoch,
                                        tf_manager.best_score_batch),
                                color="blue")

                        v_name = valset.name if len(val_datasets) > 1 else None
                        _log_continuous_evaluation(
                            tb_writer, main_metric, val_evaluation,
                            seen_instances, epoch_n, epochs, val_results,
                            train=False, dataset_name=v_name)

                    # how long was the training between validations
                    training_duration = val_duration_start - last_val_time
                    val_duration = time.process_time() - val_duration_start

                    # the training should take at least twice the time of val.
                    steptime = (training_duration
                                / (seen_instances - last_seen_instances))
                    valtime = val_duration / val_examples
                    last_seen_instances = seen_instances
                    log("Validation time: {:.2f}s, inter-validation: {:.2f}s, "
                        "per-instance (train): {:.2f}s, per-instance (val): "
                        "{:.2f}s".format(val_duration, training_duration,
                                         steptime, valtime), color="blue")
                    if training_duration < 2 * val_duration:
                        notice("Validation period setting is inefficient.")

                    log_print("")
                    last_val_time = time.process_time()

    except KeyboardInterrupt as ex:
        interrupt = ex

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}"
        .format(main_metric, tf_manager.best_score,
                tf_manager.best_score_epoch))

    log("Saving final variables in {}".format(final_variables))
    tf_manager.save(final_variables)

    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batching_scheme=runners_batching_scheme)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")

    if interrupt is not None:
        raise interrupt  # pylint: disable=raising-bad-type
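
log_timer and val_timer are plain callables taking the global step and the
time.process_time() stamp of the last event and returning whether to fire. A
sketch of two such callables (hypothetical helpers written for illustration,
not part of the library), one periodic in steps and one in seconds:

import time
from typing import Callable


def every_n_steps(period: int) -> Callable[[int, float], bool]:
    """Fire on every period-th training step."""
    return lambda step, _last_time: step > 0 and step % period == 0


def every_n_seconds(period: float) -> Callable[[int, float], bool]:
    """Fire when at least `period` seconds of process time have passed since
    the last logging/validation event."""
    return lambda _step, last_time: time.process_time() - last_time >= period


# e.g. log every 20 batches, validate every 10 minutes of process time
log_timer = every_n_steps(20)
val_timer = every_n_seconds(600)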
Code example #7
def run_on_dataset(tf_manager: TensorFlowManager,
                   runners: List[BaseRunner],
                   dataset: Dataset,
                   postprocess: Postprocess,
                   batching_scheme: BatchingScheme,
                   write_out: bool = False,
                   log_progress: int = 0) -> Tuple[
                       List[ExecutionResult], Dict[str, List[Any]]]:
    """Apply the model on a dataset and optionally write outputs to files.

    This function processes the dataset in batches and optionally prints out
    the execution progress.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A function that runs the code
        dataset: The dataset on which the model will be executed.
        evaluators: List of evaluators that are used for the model
            evaluation if the target data are provided.
        postprocess: Dataset-level postprocessors
        write_out: Flag whether the outputs should be printed to a file defined
            in the dataset object.
        batching_scheme: Scheme used for batching.
        log_progress: log progress every X seconds

        extra_fetches: Extra tensors to evaluate for each batch.

    Returns:
        Tuple of resulting sentences/numpy arrays, and evaluation results if
        they are available which are dictionary function -> value.

    """
    # If the dataset contains the target series, compute also losses.
    contains_targets = all(dataset.has_series(runner.decoder_data_id)
                           for runner in runners
                           if runner.decoder_data_id is not None)

    last_log_time = time.process_time()
    batch_results = [[] for _ in runners]  # type: List[List[ExecutionResult]]

    feedables = set.union(*[runner.feedables for runner in runners])

    processed_examples = 0
    for batch in dataset.batches(batching_scheme):
        if 0 < log_progress < time.process_time() - last_log_time:
            log("Processed {} examples.".format(processed_examples))
            last_log_time = time.process_time()

        execution_results = tf_manager.execute(
            batch, feedables, runners, compute_losses=contains_targets)
        processed_examples += len(batch)

        for script_list, ex_result in zip(batch_results, execution_results):
            script_list.append(ex_result)

    # Transpose runner interim results.
    all_results = [reduce_execution_results(res) for res in batch_results]

    # Convert execution results to dictionary.
    result_data = {runner.output_series: result.outputs
                   for runner, result in zip(runners, all_results)}

    # Run dataset-level postprocessing.
    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(dataset, result_data)
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # Check output series lengths.
    for series_id, data in result_data.items():
        if len(data) != len(dataset):
            warn("Output '{}' for dataset '{}' has length {}, but "
                 "len(dataset) == {}".format(series_id, dataset.name,
                                             len(data), len(dataset)))

    if write_out and dataset.outputs is not None:
        for series_id, data in result_data.items():
            if series_id in dataset.outputs:
                path, writer = dataset.outputs[series_id]
                writer(path, data)
            else:
                log("There is no file for output series '{}' in dataset: '{}'"
                    .format(series_id, dataset.name), color="red")
    elif write_out:
        log("Dataset does not have any outputs, nothing to write out.",
            color="red")

    return all_results, result_data
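
The postprocess argument is iterated as (series_name, postprocessor) pairs,
and each postprocessor receives the dataset together with the dictionary of
already produced output series. A hypothetical example (the series names
"target" and "target_detokenized" are made up for illustration):

def detokenize(dataset, result_data):
    """Join the token lists of the generated series into plain strings."""
    return [" ".join(tokens) for tokens in result_data["target"]]


postprocess = [("target_detokenized", detokenize)]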
Code example #8
def run_on_dataset(
    tf_manager: TensorFlowManager,
    runners: List[BaseRunner],
    dataset_runner: DatasetRunner,
    dataset: Dataset,
    postprocess: Postprocess,
    write_out: bool = False,
    log_progress: int = 0
) -> Tuple[List[ExecutionResult], Dict[str, List], Dict[str, List]]:
    """Apply the model on a dataset and optionally write outputs to files.

    This function processes the dataset in batches and optionally prints out
    the execution progress.

    Args:
        tf_manager: TensorFlow manager with initialized sessions.
        runners: A function that runs the code
        dataset_runner: A runner object that fetches the data inputs
        dataset: The dataset on which the model will be executed.
        evaluators: List of evaluators that are used for the model
            evaluation if the target data are provided.
        postprocess: Dataset-level postprocessors
        write_out: Flag whether the outputs should be printed to a file defined
            in the dataset object.
        log_progress: log progress every X seconds

        extra_fetches: Extra tensors to evaluate for each batch.

    Returns:
        Tuple of resulting sentences/numpy arrays, and evaluation results if
        they are available which are dictionary function -> value.

    """
    # If the dataset contains the target series, compute also losses.
    contains_targets = all(runner.decoder_data_id in dataset
                           for runner in runners
                           if runner.decoder_data_id is not None)

    last_log_time = time.process_time()
    batch_results = [[] for _ in runners]  # type: List[List[ExecutionResult]]
    batch_results.append([])  # For dataset runner

    feedables = set.union(*[runner.feedables for runner in runners])
    feedables |= dataset_runner.feedables

    fetched_input = {s: [] for s in dataset.series}  # type: Dict[str, List]

    processed_examples = 0
    for batch in dataset.batches():
        if 0 < log_progress < time.process_time() - last_log_time:
            log("Processed {} examples.".format(processed_examples))
            last_log_time = time.process_time()

        executors = []  # type: List[GraphExecutor]
        executors.extend(runners)
        executors.append(dataset_runner)

        execution_results = tf_manager.execute(batch,
                                               feedables,
                                               executors,
                                               compute_losses=contains_targets)

        processed_examples += len(batch)

        for script_list, ex_result in zip(batch_results, execution_results):
            script_list.append(ex_result)

        for s_id in batch.series:
            fetched_input[s_id].extend(batch.get_series(s_id))

    # Transpose runner interim results.
    all_results = [join_execution_results(res) for res in batch_results[:-1]]

    # TODO uncomment this when dataset runner starts outputting the dataset
    # input_transposed = join_execution_results(batch_results[-1]).outputs
    # fetched_input = {
    #     k: [dic[k] for dic in input_transposed] for k in input_transposed[0]}

    fetched_input_lengths = {s: len(fetched_input[s]) for s in dataset.series}

    if len(set(fetched_input_lengths.values())) != 1:
        warn("Fetched input dataset series are not of the same length: {}".
             format(str(fetched_input_lengths)))

    dataset_len = fetched_input_lengths[dataset.series[0]]

    # Convert execution results to dictionary.
    result_data = {}  # type: Dict[str, Union[List, np.ndarray]]
    for s_id, data in (pair for res in all_results
                       for pair in res.outputs.items()):
        if s_id in result_data:
            raise ValueError("Overwriting output series forbidden.")
        result_data[s_id] = data

    # Run dataset-level postprocessing.
    if postprocess is not None:
        for series_name, postprocessor in postprocess:
            postprocessed = postprocessor(fetched_input, result_data)
            if not hasattr(postprocessed, "__len__"):
                postprocessed = list(postprocessed)

            result_data[series_name] = postprocessed

    # Check output series lengths.
    for series_id, data in result_data.items():
        if len(data) != dataset_len:
            warn("Output '{}' for dataset '{}' has length {}, but input "
                 "dataset size is {}".format(series_id, dataset.name,
                                             len(data), dataset_len))

    if write_out and dataset.outputs is not None:
        for series_id, data in result_data.items():
            if series_id in dataset.outputs:
                path, writer = dataset.outputs[series_id]
                writer(path, data)
            else:
                log("There is no file for output series '{}' in dataset: '{}'".
                    format(series_id, dataset.name),
                    color="red")
    elif write_out:
        log("Dataset does not have any outputs, nothing to write out.",
            color="red")

    return all_results, result_data, fetched_input
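
Compared to the previous version, this variant also returns the input series
fetched through the DatasetRunner, so callers unpack a triple. A small
hypothetical usage sketch (the variable names are illustrative only):

results, outputs, fetched = run_on_dataset(
    tf_manager, runners, dataset_runner, dataset, postprocess=None)

# one ExecutionResult per runner, plus output series and fetched input series
for series_id, data in outputs.items():
    print("{}: {} items".format(series_id, len(data)))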