Example #1
    def test_bucketing_no_leftovers(self):

        # testing dataset is 49 sequences of lengths 1 - 49
        iterators = {
            "sentences": lambda:
            (["word" for _ in range(l)] for l in range(1, 50))
        }

        dataset = Dataset("dataset", iterators=iterators, shuffled=False)

        # we use batch size 7 and bucket span 10
        scheme = BatchingScheme(7, 10, False, None, False)

        # we process the dataset and record what the batches look like
        batches = []
        for batch in dataset.batches(scheme):
            batches.append(list(batch.get_series("sentences")))

        ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                       [["word" for _ in range(l)] for l in range(10, 17)],
                       [["word" for _ in range(l)] for l in range(20, 27)],
                       [["word" for _ in range(l)] for l in range(30, 37)],
                       [["word" for _ in range(l)] for l in range(40, 47)]]

        self.assertSequenceEqual(ref_batches, batches)
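
For reference, the positional BatchingScheme call above presumably maps onto the keyword parameters used elsewhere on this page; the mapping below is an assumption inferred from those examples, not something this snippet confirms.

# assumed keyword equivalent of BatchingScheme(7, 10, False, None, False)
scheme = BatchingScheme(batch_size=7,
                        batch_bucket_span=10,
                        token_level_batching=False,
                        bucketing_ignore_series=None,
                        use_leftover_buckets=False)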
Example #2
    def test_buckets_similar_size(self):
        # testing dataset is 3 x 6 sequences of lengths 0 - 5
        iterators = {
            "sentences":
            lambda: [["word" for _ in range(l)] for l in range(6)] * 3
        }

        dataset = Dataset("dataset", iterators=iterators, shuffled=True)

        # we use batch size 6 and bucket span 2
        scheme = BatchingScheme(6, 2, False, None)

        # we process the dataset and record what the batches look like
        batches = []
        for batch in dataset.batches(scheme):
            batches.append(list(batch.get_series("sentences")))

        # this setup should divide the data into 3 batches
        self.assertEqual(len(batches), 3)

        for batch in batches:
            # each batch should contain 6 values
            self.assertEqual(len(batch), 6)

            lengths = set(len(b) for b in batch)

            # the values in the batch should have two lengths
            self.assertEqual(len(lengths), 2)

            # the lengths should differ by one
            self.assertEqual(max(lengths) - min(lengths), 1)
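
One plausible reading of the bucket span of 2 (an assumption about the grouping rule, though consistent with the assertions above and with Example #1) is that sequence lengths are grouped by length // span, so every bucket ends up with exactly six sequences whose lengths differ by one.

# illustrative grouping by length // bucket_span (assumed semantics)
buckets = {}
for length in list(range(6)) * 3:
    buckets.setdefault(length // 2, []).append(length)
assert all(len(v) == 6 and max(v) - min(v) == 1 for v in buckets.values())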
Example #3
    def test_bucketing(self):

        # testing dataset is 49 sequences of lengths 1 - 49
        iterators = {
            "sentences": lambda:
            (["word" for _ in range(l)] for l in range(1, 50))
        }

        # we use batch size 7 with bucket boundaries at lengths 9, 19, 29, 39, 49
        scheme = BatchingScheme(bucket_boundaries=[9, 19, 29, 39, 49],
                                bucket_batch_sizes=[7, 7, 7, 7, 7, 7])

        dataset = Dataset("dataset",
                          iterators=iterators,
                          batching=scheme,
                          shuffled=False)

        # we process the dataset and record what the batches look like
        batches = []
        for batch in dataset.batches():
            batches.append(list(batch.get_series("sentences")))

        ref_batches = [[["word" for _ in range(l)] for l in range(1, 8)],
                       [["word" for _ in range(l)] for l in range(10, 17)],
                       [["word" for _ in range(l)] for l in range(20, 27)],
                       [["word" for _ in range(l)] for l in range(30, 37)],
                       [["word" for _ in range(l)] for l in range(40, 47)],
                       [["word" for _ in range(l)] for l in range(8, 10)],
                       [["word" for _ in range(l)] for l in range(17, 20)],
                       [["word" for _ in range(l)] for l in range(27, 30)],
                       [["word" for _ in range(l)] for l in range(37, 40)],
                       [["word" for _ in range(l)] for l in range(47, 50)]]

        self.assertSequenceEqual(ref_batches, batches)
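
The expected batches follow from the bucket boundaries: a sequence belongs to the first bucket whose boundary is at least its length, full batches of seven are emitted per bucket, and the undersized leftovers appear after all full batches. A small hypothetical helper (not part of the library) that reproduces the bucket assignment:

def bucket_index(length, boundaries):
    """Return the index of the first bucket whose boundary covers `length`.

    Lengths beyond the last boundary fall into one extra bucket, which is
    why six bucket_batch_sizes accompany five boundaries.
    """
    for idx, boundary in enumerate(boundaries):
        if length <= boundary:
            return idx
    return len(boundaries)

assert bucket_index(7, [9, 19, 29, 39, 49]) == 0   # lengths 1-9: bucket 0
assert bucket_index(10, [9, 19, 29, 39, 49]) == 1  # lengths 10-19: bucket 1
assert bucket_index(50, [9, 19, 29, 39, 49]) == 5  # beyond the last boundary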
Example #4
def normalize_configuration(cfg: Namespace, train_mode: bool) -> None:
    """Given a configuration namespace, normalize the values it contains.

    Arguments:
        cfg: The namespace object returned by `Configuration.make_namespace`
        train_mode: Boolean flag controlling normalization of parameters only
            used during training.
    """
    if train_mode:
        _normalize_train_cfg(cfg)

    if cfg.tf_manager is None:
        cfg.tf_manager = get_default_tf_manager()

    if (cfg.batch_size is None) == (cfg.batching_scheme is None):
        raise ValueError("You must specify either batch_size or "
                         "batching_scheme (not both).")

    if cfg.batch_size is not None:
        assert cfg.batching_scheme is None
        cfg.batching_scheme = BatchingScheme(batch_size=cfg.batch_size)
    else:
        assert cfg.batching_scheme is not None
        cfg.batch_size = cfg.batching_scheme.batch_size

    if cfg.runners_batch_size is None:
        cfg.runners_batch_size = cfg.batching_scheme.batch_size

    cfg.runners_batching_scheme = BatchingScheme(
        batch_size=cfg.runners_batch_size,
        token_level_batching=cfg.batching_scheme.token_level_batching,
        use_leftover_buckets=True)

    cfg.evaluation = [(e[0], e[0], e[1]) if len(e) == 2 else e
                      for e in cfg.evaluation]
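    # each two-element entry (series, evaluator) was just expanded to
    # (series, series, evaluator), i.e. the generated series and the
    # reference series are assumed to share the same name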

    if cfg.evaluation:
        cfg.main_metric = "{}/{}".format(cfg.evaluation[-1][0],
                                         cfg.evaluation[-1][-1].name)
    else:
        cfg.main_metric = "{}/{}".format(cfg.runners[-1].decoder_data_id,
                                         cfg.runners[-1].loss_names[0])

        if not cfg.tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")
Example #5
def run(data):  # pragma: no cover
    exp = APP.config["experiment"]
    dataset = Dataset("request",
                      data,
                      BatchingScheme(batch_size=1), {},
                      preprocessors=APP.config["preprocess"])

    _, response_data, _ = exp.run_model(dataset, write_out=False)

    return response_data
Example #6
    def process_images():
        dataset = Dataset("dataset", {"images": np.array(images)},
                          BatchingScheme(batch_size=1), {})
        feed_dict = imagenet.feed_dict(dataset)

        fetch = imagenet.encoded if args.vector else imagenet.spatial_states
        feature_maps = session.run(fetch, feed_dict=feed_dict)

        for features, rel_path in zip(feature_maps, image_paths):
            npz_path = os.path.join(args.output_prefix, rel_path + ".npz")
            os.makedirs(os.path.dirname(npz_path), exist_ok=True)
            np.savez(npz_path, features)
            print(npz_path)
Example #7
    def run_model(
        self,
        dataset: Dataset,
        write_out: bool = False,
        batch_size: int = None,
        log_progress: int = 0
    ) -> Tuple[List[ExecutionResult], Dict[str, List[Any]]]:
        """Run the model on a given dataset.

        Args:
            dataset: The dataset on which the model will be executed.
            write_out: Flag whether the outputs should be printed to a file
                defined in the dataset object.
            batch_size: size of the minibatch
            log_progress: log progress every X seconds

        Returns:
            A list of `ExecutionResult`s and a dictionary of the output series.
        """
        if not self._model_built:
            self.build_model()
        if not self._vars_loaded:
            self.load_variables()

        toklevel = self.model.runners_batching_scheme.token_level_batching
        assert self.model.runners_batching_scheme.batch_bucket_span is None

        batching_scheme = BatchingScheme(batch_size=batch_size
                                         or self.model.runners_batch_size,
                                         batch_bucket_span=None,
                                         token_level_batching=toklevel,
                                         bucketing_ignore_series=[])

        with self.graph.as_default():
            # TODO: check_dataset_and_coders(dataset, self.model.runners)
            return run_on_dataset(self.model.tf_manager,
                                  self.model.runners,
                                  dataset,
                                  self.model.postprocess,
                                  write_out=write_out,
                                  log_progress=log_progress,
                                  batching_scheme=batching_scheme)
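
A minimal call sketch based on the signature above; `exp` and `test_data` are hypothetical Experiment and Dataset instances, and note that other snippets on this page unpack three return values, so the exact return shape may differ between library versions.

# hypothetical usage of run_model
results, output_series = exp.run_model(test_data,
                                       write_out=True,
                                       batch_size=32,
                                       log_progress=20)
for name, outputs in output_series.items():
    print(name, len(outputs))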
Example #8
def main() -> None:
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("datasets",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("--beam",
                        metavar="BEAM_SIZE",
                        type=int,
                        default=10,
                        help="Beam size.")
    parser.add_argument("--kenlm",
                        type=str,
                        default=None,
                        help="Path to a KenLM model arpa file.")
    parser.add_argument("--lm-weight",
                        type=float,
                        help="Weight of the language model.")
    parser.add_argument("--null-trail-weight",
                        type=float,
                        help="Weight of the null-trailing feature.")
    parser.add_argument("--nt-ratio-weight",
                        type=float,
                        help="Weight of the null-token ratio feature.")
    parser.add_argument("--out", type=str, help="Path to the output file.")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("batch_size", cond=lambda x: x > 0)
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    ctc_decoder = None
    for runner in exp.model.runners:
        if (isinstance(runner, PlainRunner)
                and isinstance(runner.decoder, CTCDecoder)):
            ctc_decoder = runner.decoder
            break

    if ctc_decoder is None:
        raise ValueError(
            "Was not able to detect CTC decoder in the configuration.")

    logits_runner = RepresentationRunner(output_series="logits",
                                         encoder=ctc_decoder,
                                         attribute="logits")
    exp.model.runners = [logits_runner]

    dataset = datasets_model.test_datasets[0]
    singleton_batches = dataset.batches(BatchingScheme(1))
    print("Loading language model")
    lm = NGramModel(args.kenlm)
    print("LM loaded")

    weights = {}

    if args.lm_weight:
        weights['lm_score'] = args.lm_weight

    if args.null_trail_weight:
        weights['null_trailing'] = args.null_trail_weight

    if args.nt_ratio_weight:
        weights['null_token_ratio'] = args.nt_ratio_weight

    print("Weights:", weights)

    i = 0
    stats = []

    with open(args.out, 'w') as out_file:
        for sent_dataset in singleton_batches:

            t1 = timeit.default_timer()
            ctc_model_result = exp.run_model(sent_dataset,
                                             write_out=False,
                                             batch_size=1)
            t2 = timeit.default_timer()

            logits = np.squeeze(ctc_model_result[1]['logits'], axis=1)

            t3 = timeit.default_timer()
            best_hyp = decode_beam(logits,
                                   args.beam,
                                   ctc_decoder.vocabulary,
                                   lm=lm,
                                   weights=weights)
            t4 = timeit.default_timer()

            stats.append([len(best_hyp.tokens), t2 - t1, t4 - t3])

            output = " ".join([best_hyp.tokens][0])
            out_file.write(output + "\n")

            if i % 10 == 0:
                print("[{}] {}".format(i, output))
            i += 1

    with open(args.out + ".stats", 'w') as stats_file:
        for line in stats:
            stats_file.write("{} {:.3f} {:.3f}\n".format(*line))

    for session in exp.config.model.tf_manager.sessions:
        session.close()
Example #9
def training_loop(tf_manager: TensorFlowManager,
                  epochs: int,
                  trainer: Union[Trainer, List[Trainer]],
                  log_directory: str,
                  evaluators: EvalConfiguration,
                  runners: List[BaseRunner],
                  final_variables: str,
                  train_dataset: Dataset,
                  val_dataset: Union[Dataset, List[Dataset]],
                  test_datasets: List[Dataset] = None,
                  logging_period: Union[str, int] = 20,
                  validation_period: Union[str, int] = 500,
                  val_preview_input_series: List[str] = None,
                  val_preview_output_series: List[str] = None,
                  val_preview_num_examples: int = 15,
                  train_start_offset: int = 0,
                  batch_size: int = None,
                  batching_scheme: BatchingScheme = None,
                  runners_batch_size: int = None,
                  initial_variables: Union[str, List[str]] = None,
                  postprocess: Postprocess = None) -> None:
    """Execute the training loop for given graph and data.

    Args:
        tf_manager: TensorFlowManager with initialized sessions.
        epochs: Number of epochs for which the algorithm will learn.
        trainer: The trainer object containing the TensorFlow code for
            computing the loss and optimization operation.
        batch_size: Number of examples in one mini-batch.
        batching_scheme: Batching scheme specification. Cannot be provided when
            batch_size is specified.
        log_directory: Directory where the TensorBoard log will be generated.
            If None, nothing will be done.
        evaluators: List of evaluators. The last evaluator is used as the
            main one. An evaluator is a tuple of the name of the generated
            series, the name of the dataset series the generated one is
            evaluated with, and the evaluation function. If only one
            series name is provided, the generated and dataset series are
            assumed to have the same name.
        runners: List of runners for logging and evaluation runs
        train_dataset: Dataset used for training
        val_dataset: Dataset used for validation. Can be a Dataset or a list
            of datasets. The last dataset is used as the main one for storing
            the best results. When using multiple datasets, it is recommended
            to name them for better TensorBoard visualization.
        test_datasets: List of datasets used for testing
        logging_period: after how many batches the logging should happen. It
            can also be defined as a time period in a format like: 3s; 4m;
            6h; 1d; 3m15s; 3seconds; 4minutes; 6hours; 1days
        validation_period: after how many batches the validation should
            happen. It can also be defined as a time period in the same
            format as logging_period.
        val_preview_input_series: which input series to preview in validation
        val_preview_output_series: which output series to preview in validation
        val_preview_num_examples: how many examples should be printed during
            validation
        train_start_offset: how many lines from the training dataset should be
            skipped. The training starts from the next batch.
        runners_batch_size: batch size of runners. Reuses the training batching
            scheme with bucketing turned off.
        initial_variables: variables used for initialization, for example for
            continuation of training. Provide it with a path to your model
            directory and its checkpoint file group common prefix, e.g.
            "variables.data", or "variables.data.3" in case of multiple
            checkpoints per experiment.
        postprocess: A function which takes the dataset with its output series
            and generates additional series from them.
    """
    check_argument_types()

    if (batch_size is None) == (batching_scheme is None):
        raise ValueError("You must specify either batch_size or "
                         "batching_scheme (not both).")

    if batch_size is not None:
        assert batching_scheme is None
        batching_scheme = BatchingScheme(batch_size=batch_size)

    assert batching_scheme is not None

    if runners_batch_size is None:
        runners_batch_size = batching_scheme.batch_size

    runners_batching_scheme = BatchingScheme(
        batch_size=runners_batch_size,
        token_level_batching=batching_scheme.token_level_batching,
        use_leftover_buckets=True)

    if isinstance(val_dataset, List):
        val_datasets = val_dataset
    else:
        val_datasets = [val_dataset]

    log_period_batch, log_period_time = _resolve_period(logging_period)
    val_period_batch, val_period_time = _resolve_period(validation_period)

    _check_series_collisions(runners, postprocess)

    if isinstance(trainer, List):
        trainers = trainer
    else:
        trainers = [trainer]

    _log_model_variables(
        var_list=list(set().union(*[t.var_list for t in trainers])))

    evaluators = [(e[0], e[0], e[1]) if len(e) == 2 else e
                  for e in evaluators]

    if evaluators:
        main_metric = "{}/{}".format(evaluators[-1][0],
                                     evaluators[-1][-1].name)
    else:
        main_metric = "{}/{}".format(runners[-1].decoder_data_id,
                                     runners[-1].loss_names[0])

        if not tf_manager.minimize_metric:
            raise ValueError("minimize_metric must be set to True in "
                             "TensorFlowManager when using loss as "
                             "the main metric")

    if log_period_batch is not None and isinstance(
            trainer, DelayedUpdateTrainer):
        if log_period_batch % trainer.batches_per_update != 0:
            raise ValueError("When using delayed update trainer, the logging "
                             "period must be divisible by batches_per_update.")

    if val_period_batch is not None and isinstance(
            trainer, DelayedUpdateTrainer):
        if val_period_batch % trainer.batches_per_update != 0:
            raise ValueError("When using delayed update trainer, validation "
                             "period must be divisible by batches_per_update.")

    step = 0
    seen_instances = 0
    last_seen_instances = 0

    def _is_logging_time(period_batch: Optional[int],
                         period_time: Optional[float],
                         last_time: float) -> bool:
        if step == 0:
            return False

        if period_batch is not None:
            return step % period_batch == 0

        assert period_time is not None

        # deal with delayed trainer
        if isinstance(trainer, DelayedUpdateTrainer):
            if step % trainer.batches_per_update != 0:
                return False

        return last_time + period_time < time.process_time()

    if initial_variables is None:
        # Assume we don't look at coder checkpoints when global
        # initial variables are supplied
        tf_manager.initialize_model_parts(
            runners + trainers, save=True)  # type: ignore
    else:
        try:
            tf_manager.restore(initial_variables)
        except tf.errors.NotFoundError:
            warn("Some variables were not found in checkpoint.)")

    if log_directory:
        log("Initializing TensorBoard summary writer.")
        tb_writer = tf.summary.FileWriter(
            log_directory, tf_manager.sessions[0].graph)
        log("TensorBoard writer initialized.")

    log("Starting training")
    last_log_time = time.process_time()
    last_val_time = time.process_time()
    interrupt = None
    try:
        for epoch_n in range(1, epochs + 1):
            log_print("")
            log("Epoch {} begins".format(epoch_n), color="red")

            train_batches = train_dataset.batches(batching_scheme)

            if epoch_n == 1 and train_start_offset:
                if train_dataset.shuffled and not train_dataset.lazy:
                    warn("Not skipping training instances with shuffled "
                         "non-lazy dataset")
                else:
                    _skip_lines(train_start_offset, train_batches)

            for batch_n, batch in enumerate(train_batches):
                step += 1
                seen_instances += len(batch)

                if _is_logging_time(log_period_batch, log_period_time,
                                    last_log_time):
                    trainer_result = tf_manager.execute(
                        batch, trainers, train=True, summaries=True)
                    train_results, train_outputs = run_on_dataset(
                        tf_manager, runners, batch, postprocess,
                        write_out=False,
                        batching_scheme=runners_batching_scheme)
                    # ensure train outputs are iterable more than once
                    train_outputs = {
                        k: list(v) for k, v in train_outputs.items()}
                    train_evaluation = evaluation(
                        evaluators, batch, runners, train_results,
                        train_outputs)

                    _log_continuous_evaluation(
                        tb_writer, main_metric, train_evaluation,
                        seen_instances, epoch_n, epochs, trainer_result,
                        train=True)
                    last_log_time = time.process_time()
                else:
                    tf_manager.execute(
                        batch, trainers, train=True, summaries=False)

                if _is_logging_time(val_period_batch, val_period_time,
                                    last_val_time):
                    log_print("")
                    val_duration_start = time.process_time()
                    val_examples = 0
                    for val_id, valset in enumerate(val_datasets):
                        val_examples += len(valset)

                        val_results, val_outputs = run_on_dataset(
                            tf_manager, runners, valset,
                            postprocess, write_out=False,
                            batching_scheme=runners_batching_scheme)
                        # ensure val outputs are iterable more than once
                        val_outputs = {k: list(v)
                                       for k, v in val_outputs.items()}
                        val_evaluation = evaluation(
                            evaluators, valset, runners, val_results,
                            val_outputs)

                        valheader = ("Validation (epoch {}, batch number {}):"
                                     .format(epoch_n, batch_n))
                        log(valheader, color="blue")
                        _print_examples(
                            valset, val_outputs, val_preview_input_series,
                            val_preview_output_series,
                            val_preview_num_examples)
                        log_print("")
                        log(valheader, color="blue")

                        # The last validation set is selected to be the main
                        if val_id == len(val_datasets) - 1:
                            this_score = val_evaluation[main_metric]
                            tf_manager.validation_hook(this_score, epoch_n,
                                                       batch_n)

                            if this_score == tf_manager.best_score:
                                best_score_str = colored(
                                    "{:.4g}".format(tf_manager.best_score),
                                    attrs=["bold"])

                                # store also graph parts
                                rnrs = runners + trainers  # type: ignore
                                # TODO: refactor trainers/runners so that they
                                # have the same API predecessor
                                parameterizeds = set.union(
                                    *[rnr.parameterizeds
                                      for rnr in rnrs])
                                for coder in parameterizeds:
                                    for session in tf_manager.sessions:
                                        coder.save(session)
                            else:
                                best_score_str = "{:.4g}".format(
                                    tf_manager.best_score)

                            log("best {} on validation: {} (in epoch {}, "
                                "after batch number {})"
                                .format(main_metric, best_score_str,
                                        tf_manager.best_score_epoch,
                                        tf_manager.best_score_batch),
                                color="blue")

                        v_name = valset.name if len(val_datasets) > 1 else None
                        _log_continuous_evaluation(
                            tb_writer, main_metric, val_evaluation,
                            seen_instances, epoch_n, epochs, val_results,
                            train=False, dataset_name=v_name)

                    # how long was the training between validations
                    training_duration = val_duration_start - last_val_time
                    val_duration = time.process_time() - val_duration_start

                    # the training should take at least twice the time of val.
                    steptime = (training_duration
                                / (seen_instances - last_seen_instances))
                    valtime = val_duration / val_examples
                    last_seen_instances = seen_instances
                    log("Validation time: {:.2f}s, inter-validation: {:.2f}s, "
                        "per-instance (train): {:.2f}s, per-instance (val): "
                        "{:.2f}s".format(val_duration, training_duration,
                                         steptime, valtime), color="blue")
                    if training_duration < 2 * val_duration:
                        notice("Validation period setting is inefficient.")

                    log_print("")
                    last_val_time = time.process_time()

    except KeyboardInterrupt as ex:
        interrupt = ex

    log("Training finished. Maximum {} on validation data: {:.4g}, epoch {}"
        .format(main_metric, tf_manager.best_score,
                tf_manager.best_score_epoch))

    log("Saving final variables in {}".format(final_variables))
    tf_manager.save(final_variables)

    if test_datasets:
        tf_manager.restore_best_vars()

        for dataset in test_datasets:
            test_results, test_outputs = run_on_dataset(
                tf_manager, runners, dataset, postprocess,
                write_out=True, batching_scheme=runners_batching_scheme)
            # ensure test outputs are iterable more than once
            test_outputs = {k: list(v) for k, v in test_outputs.items()}
            eval_result = evaluation(evaluators, dataset, runners,
                                     test_results, test_outputs)
            print_final_evaluation(dataset.name, eval_result)

    log("Finished.")

    if interrupt is not None:
        raise interrupt  # pylint: disable=raising-bad-type
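
The logging_period and validation_period arguments accept either a batch count or a time string such as "3m15s". The sketch below is only a hypothetical illustration of how such a specification might be interpreted; the actual _resolve_period used above is not shown in this excerpt.

import re

def parse_period(spec):
    """Hypothetical period interpretation: an int means a number of
    batches, a string such as "3m15s" or "6hours" means seconds."""
    if isinstance(spec, int):
        return spec, None
    units = {"s": 1, "m": 60, "h": 3600, "d": 86400,
             "second": 1, "minute": 60, "hour": 3600, "day": 86400}
    seconds = 0.0
    for value, unit in re.findall(r"(\d+)([a-zA-Z]+)", spec):
        key = unit if len(unit) == 1 else unit.rstrip("s").lower()
        seconds += int(value) * units[key]
    return None, seconds

# parse_period(20) -> (20, None); parse_period("3m15s") -> (None, 195.0)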
Example #10
def main() -> None:
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("datasets",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("--beam",
                        metavar="BEAM_SIZE",
                        type=int,
                        default=10,
                        help="Beam size.")
    parser.add_argument("--kenlm",
                        type=str,
                        help="Path to a KenLM model arpa file.")
    parser.add_argument("--prefix",
                        type=str,
                        help="Path used as a prefix of stored checkpoints.")
    parser.add_argument("--lm-weight",
                        type=float,
                        help="Default weight of the language model.")
    parser.add_argument("--null-trail-weight",
                        type=float,
                        help="Default weight of the null-trailing feature.")
    parser.add_argument("--nt-ratio-weight",
                        type=float,
                        help="Default weight of the null-token ratio feature.")

    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("batch_size", cond=lambda x: x > 0)
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    weights = {}

    if args.lm_weight is not None:
        weights['lm_score'] = args.lm_weight

    if args.null_trail_weight is not None:
        weights['null_trailing'] = args.null_trail_weight

    if args.nt_ratio_weight is not None:
        weights['null_token_ratio'] = args.nt_ratio_weight

    if not weights:
        raise ValueError("No default weights specified, nothing to train.")

    ctc_decoder = None
    for runner in exp.model.runners:
        if (isinstance(runner, PlainRunner)
                and isinstance(runner.decoder, CTCDecoder)):
            ctc_decoder = runner.decoder
            break

    if ctc_decoder is None:
        raise ValueError(
            "Was not able to detect CTC decoder in the configuration.")

    print("Loading language model")
    lm = NGramModel(args.kenlm)
    print("LM loaded")

    logits_runner = RepresentationRunner(output_series="logits",
                                         encoder=ctc_decoder,
                                         attribute="logits")
    exp.model.runners = [logits_runner]

    dataset = datasets_model.test_datasets[0]
    singleton_batches = dataset.batches(BatchingScheme(1))

    DATASET_SIZE = dataset.length
    CHECKPOINTS = 5
    CHECKPOINT_ITERS = int(DATASET_SIZE / CHECKPOINTS)

    print(
        "{} sentences in the dataset, checkpoint every {} sentences ({} checkpoints in total)."
        .format(DATASET_SIZE, CHECKPOINT_ITERS, CHECKPOINTS))

    for i, sent_dataset in enumerate(singleton_batches):
        ctc_model_result = exp.run_model(sent_dataset,
                                         write_out=False,
                                         batch_size=1)

        logits = np.squeeze(ctc_model_result[1]['logits'], axis=1)
        target = ctc_model_result[2]['target'][0]

        train_weights(logits, args.beam, ctc_decoder.vocabulary, target,
                      weights, lm)

        print(
            "[{}] Weights:".format(i + 1), ", ".join([
                "{}: {:.3f}".format(key, value)
                for key, value in weights.items()
            ]))

        if i != 0 and (i + 1) % CHECKPOINT_ITERS == 0:
            with open("{}.{}".format(args.prefix, int(i / CHECKPOINT_ITERS)),
                      "w") as f:
                for key, value in weights.items():
                    f.write("{}={:.3f}\n".format(key.upper(), value))

            print("\nCheckpoint saved.\n")

    for session in exp.config.model.tf_manager.sessions:
        session.close()
Example #11
#!/usr/bin/env python3.5

from typing import Iterable, List
import os
import tempfile
import unittest

from neuralmonkey.dataset import Dataset, from_files, load, BatchingScheme
from neuralmonkey.readers.plain_text_reader import UtfPlainTextReader

DEFAULT_BATCHING_SCHEME = BatchingScheme(batch_size=3,
                                         batch_bucket_span=None,
                                         token_level_batching=False,
                                         bucketing_ignore_series=[])


class TestDataset(unittest.TestCase):
    def test_nonexistent_file(self):
        with self.assertRaises(FileNotFoundError):
            load(name="name",
                 series=["source"],
                 data=[(["some_nonexistent_file"], UtfPlainTextReader)],
                 buffer_size=5)

    def test_nonexistent_file_deprec(self):
        with self.assertRaises(FileNotFoundError):
            from_files(name="name",
                       s_source=(["some_nonexistent_file"],
                                 UtfPlainTextReader),
                       lazy=True)
Example #12
#!/usr/bin/env python3.5

from typing import Iterable, List
import os
import tempfile
import unittest

from neuralmonkey.dataset import Dataset, load, BatchingScheme
from neuralmonkey.readers.plain_text_reader import UtfPlainTextReader

DEFAULT_BATCHING_SCHEME = BatchingScheme(batch_size=3)


class TestDataset(unittest.TestCase):
    def test_nonexistent_file(self) -> None:
        with self.assertRaises(FileNotFoundError):
            load(name="name",
                 series=["source"],
                 data=[(["some_nonexistent_file"], UtfPlainTextReader)],
                 batching=DEFAULT_BATCHING_SCHEME,
                 buffer_size=5)

    def test_lazy_dataset(self) -> None:
        i = 0  # iteration counter

        def reader(files: List[str]) -> Iterable[List[str]]:
            del files
            nonlocal i
            for i in range(10):  # pylint: disable=unused-variable
                yield ["foo"]