Example #1
0
def main() -> None:
    """Start the Neural Monkey web server from command-line options.

    Reads ``--port``, ``--host``, ``--configuration`` and the optional
    ``--preprocess`` INI file, stores the preprocessors and the built
    experiment in ``APP.config`` and then runs ``APP``.
    """
    cli = argparse.ArgumentParser(
        description="Runs Neural Monkey as a web server.")
    cli.add_argument("--port", type=int, default=5000)
    cli.add_argument("--host", type=str, default="127.0.0.1")
    cli.add_argument("--configuration", type=str, required=True)
    cli.add_argument("--preprocess", type=str, required=False, default=None)
    options = cli.parse_args()

    print("")

    # When no preprocessing file is given, the app gets an empty list.
    preprocessors = []
    if options.preprocess is not None:
        prep_config = Configuration()
        prep_config.add_argument("preprocess")
        prep_config.load_file(options.preprocess)
        prep_config.build_model()
        preprocessors = prep_config.model.preprocess
    APP.config["preprocess"] = preprocessors

    experiment = Experiment(config_path=options.configuration)
    experiment.build_model()
    APP.config["experiment"] = experiment
    APP.run(port=options.port, host=options.host)
Example #2
0
def main() -> None:
    """Run (or evaluate) a built experiment on the configured test datasets.

    Command-line arguments:
        config: The configuration file of the experiment.
        datasets: The configuration of the test datasets; it must provide
            ``test_datasets`` and a list of ``variables``.
        -g / --grid: Process only the slice of the (single) dataset that
            is described by the ``SGE_TASK_*`` environment variables.

    Raises:
        ValueError: If ``--grid`` is used with more than one test dataset.
        EnvironmentError: If ``--grid`` is used and any of the required
            ``SGE_TASK_*`` variables is missing.
    """
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("datasets",
                        metavar="INI-TEST-DATASETS",
                        help="the configuration of the test datasets")
    parser.add_argument("-g",
                        "--grid",
                        dest="grid",
                        action="store_true",
                        help="look at the SGE variables for slicing the data")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    # The sessions are closed in ``finally`` so that they are released even
    # when validation, running or evaluation raises.
    try:
        if args.grid and len(datasets_model.test_datasets) > 1:
            raise ValueError(
                "Only one test dataset supported when using --grid")

        for dataset in datasets_model.test_datasets:
            if args.grid:
                if ("SGE_TASK_FIRST" not in os.environ
                        or "SGE_TASK_LAST" not in os.environ
                        or "SGE_TASK_STEPSIZE" not in os.environ
                        or "SGE_TASK_ID" not in os.environ):
                    raise EnvironmentError(
                        "Some SGE environment variables are missing")

                step = int(os.environ["SGE_TASK_STEPSIZE"])
                start = int(os.environ["SGE_TASK_ID"]) - 1
                end = int(os.environ["SGE_TASK_LAST"]) - 1

                # The last slice may be shorter than the step size.
                length = min(step, end - start + 1)

                # The task index is derived from the full step size; using
                # the truncated length would misnumber the last task.
                log("Running grid task {} starting at {} with step {}"
                    .format(start // step, start, length))

                dataset = dataset.subset(start, length)

            if exp.config.args.evaluation is None:
                exp.run_model(dataset, write_out=True)
            else:
                exp.evaluate(dataset, write_out=True)
    finally:
        for session in exp.config.model.tf_manager.sessions:
            session.close()
Example #3
0
def main() -> None:
    """Build the experiment named on the command line and serve it.

    The built experiment is stored under ``APP.config["experiment"]`` and
    ``APP`` is run on the requested host and port.
    """
    cli = argparse.ArgumentParser(
        description="Runs Neural Monkey as a web server.")

    # Each entry is an option flag with its ``add_argument`` keyword args.
    option_specs = (
        ("--port", {"type": int, "default": 5000}),
        ("--host", {"type": str, "default": "127.0.0.1"}),
        ("--configuration", {"type": str, "required": True}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    options = cli.parse_args()

    print("")

    experiment = Experiment(config_path=options.configuration)
    experiment.build_model()
    APP.config["experiment"] = experiment
    APP.run(port=options.port, host=options.host)
Example #4
0
def _main() -> None:
    """Create an experiment in training mode and train it.

    Parses the command line, creates the experiment, writes the invoking
    command line to the experiment's ``args`` file and then either only
    initializes the experiment directory (``--init``) or runs the training.

    The process exits with status 1 when ``--init`` is requested but the
    experiment directory already exists, or when training raises an
    exception (its traceback is logged). It exits with status 0 after a
    successful ``--init``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config", metavar="INI-FILE",
                        help="the configuration file for the experiment")
    parser.add_argument("-s", "--set", type=str, metavar="SETTING",
                        action="append", dest="config_changes", default=[],
                        help="override an option in the configuration; the "
                        "syntax is [section.]option=value")
    parser.add_argument("-v", "--var", type=str, metavar="VAR", default=[],
                        action="append", dest="config_vars",
                        help="set a variable in the configuration; the syntax "
                        "is var=value (shorthand for -s vars.var=value)")
    parser.add_argument("-i", "--init", dest="init_only", action="store_true",
                        help="initialize the experiment directory and exit "
                        "without building the model")
    parser.add_argument("-f", "--overwrite", action="store_true",
                        help="force overwriting the output directory; can be "
                        "used to start an experiment created with --init")
    args = parser.parse_args()

    # ``-v var=value`` is a shorthand for ``-s vars.var=value``.
    args.config_changes.extend("vars.{}".format(s) for s in args.config_vars)

    exp = Experiment(config_path=args.config,
                     config_changes=args.config_changes,
                     train_mode=True,
                     overwrite_output_dir=args.overwrite)

    # Record the exact command line used for this run.
    with open(exp.get_path("args", exp.cont_index + 1), "w") as args_file:
        print(" ".join(shlex.quote(a) for a in sys.argv), file=args_file)

    if args.init_only:
        if exp.cont_index >= 0:
            log("The experiment directory already exists.", color="red")
            # ``sys.exit`` is used instead of the ``exit`` helper, which is
            # installed by the ``site`` module and may be unavailable.
            sys.exit(1)

        exp.config.save_file(exp.get_path("experiment.ini", 0))
        copyfile(args.config, exp.get_path("original.ini", 0))

        log("Experiment directory initialized.")

        cmd = [os.path.basename(sys.argv[0]), "-f",
               exp.get_path("experiment.ini", 0)]
        log("To start experiment, run: {}".format(" ".join(shlex.quote(a)
                                                           for a in cmd)))
        sys.exit(0)

    # KeyboardInterrupt is not a subclass of Exception, so it propagates
    # unchanged; every other error is logged before exiting with status 1.
    try:
        exp.train()
    except Exception:  # pylint: disable=broad-except
        log(traceback.format_exc(), color="red")
        sys.exit(1)
Example #5
0
def _main() -> None:
    """Entry point for training an experiment from the command line.

    Steps performed:
        1. Parse the command line (``-v`` values are turned into
           ``vars.``-prefixed configuration changes).
        2. Create the experiment in training mode.
        3. Write the quoted command line to the experiment's ``args`` file.
        4. With ``--init``: save the configuration files, log how to start
           the experiment and exit with status 0 (status 1 if the
           experiment directory already exists).
        5. Otherwise train; if training raises an exception, log its
           traceback and exit with status 1.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config", metavar="INI-FILE",
                        help="the configuration file for the experiment")
    parser.add_argument("-s", "--set", type=str, metavar="SETTING",
                        action="append", dest="config_changes", default=[],
                        help="override an option in the configuration; the "
                        "syntax is [section.]option=value")
    parser.add_argument("-v", "--var", type=str, metavar="VAR", default=[],
                        action="append", dest="config_vars",
                        help="set a variable in the configuration; the syntax "
                        "is var=value (shorthand for -s vars.var=value)")
    parser.add_argument("-i", "--init", dest="init_only", action="store_true",
                        help="initialize the experiment directory and exit "
                        "without building the model")
    parser.add_argument("-f", "--overwrite", action="store_true",
                        help="force overwriting the output directory; can be "
                        "used to start an experiment created with --init")
    args = parser.parse_args()

    args.config_changes.extend("vars.{}".format(s) for s in args.config_vars)

    exp = Experiment(config_path=args.config,
                     config_changes=args.config_changes,
                     train_mode=True,
                     overwrite_output_dir=args.overwrite)

    with open(exp.get_path("args", exp.cont_index + 1), "w") as args_file:
        print(" ".join(shlex.quote(a) for a in sys.argv), file=args_file)

    if args.init_only:
        if exp.cont_index >= 0:
            log("The experiment directory already exists.", color="red")
            # The ``exit`` builtin comes from the ``site`` module and is
            # meant for interactive use; ``sys.exit`` is always available.
            sys.exit(1)

        exp.config.save_file(exp.get_path("experiment.ini", 0))
        copyfile(args.config, exp.get_path("original.ini", 0))

        log("Experiment directory initialized.")

        cmd = [os.path.basename(sys.argv[0]), "-f",
               exp.get_path("experiment.ini", 0)]
        log("To start experiment, run: {}".format(" ".join(shlex.quote(a)
                                                           for a in cmd)))
        sys.exit(0)

    # No explicit KeyboardInterrupt clause is needed: it derives from
    # BaseException and is therefore not caught by ``except Exception``.
    try:
        exp.train()
    except Exception:  # pylint: disable=broad-except
        log(traceback.format_exc(), color="red")
        sys.exit(1)
    def __init__(self,
                 runs_on_features,
                 config_path,
                 vars_path,
                 data_series="",
                 src_caption_series="",
                 caption_series="",
                 alignments_series="",
                 bs_graph_series="bs_target",
                 name=None):
        """Build a Neural Monkey experiment and load its variables.

        Series name mapping:
            caption_series -> GreedyRunner output_series
            alignments_series -> WordAlignmentRunner output_series
            bs_graph_series -> BeamSearchRunner output_series

        Raises:
            ValueError: If ``config_path`` is not an existing file.
        """
        super(NeuralMonkeyModelWrapper, self).__init__(name, runs_on_features)

        config_exists = os.path.isfile(config_path)
        if not config_exists:
            raise ValueError("File {} does not exist.".format(config_path))

        # Paths of the experiment configuration and of the stored variables.
        self._config_path, self._vars_path = config_path, vars_path

        # Names of the data series used by this wrapper.
        self._data_series = data_series
        self._src_caption_series = src_caption_series
        self._caption_series = caption_series
        self._alignments_series = alignments_series
        self._bs_graph_series = bs_graph_series

        self._exp = Experiment(config_path=config_path)
        self._exp.build_model()
        self._exp.load_variables([vars_path])

        # The wrapper is multimodal when a source caption series is given.
        self.multimodal = bool(self._src_caption_series)
Example #7
0
def main() -> None:
    """Serve a Neural Monkey experiment over HTTP.

    The preprocessors (loaded from the optional ``--preprocess`` INI file,
    otherwise an empty list) and the built experiment are placed into
    ``APP.config`` before ``APP`` is run.
    """
    arg_parser = argparse.ArgumentParser(
        description="Runs Neural Monkey as a web server.")
    arg_parser.add_argument("--port", type=int, default=5000)
    arg_parser.add_argument("--host", type=str, default="127.0.0.1")
    arg_parser.add_argument("--configuration", type=str, required=True)
    arg_parser.add_argument(
        "--preprocess", type=str, required=False, default=None)
    parsed = arg_parser.parse_args()

    print("")

    if parsed.preprocess is None:
        APP.config["preprocess"] = []
    else:
        # Build the preprocessors from their own configuration file.
        prep = Configuration()
        prep.add_argument("preprocess")
        prep.load_file(parsed.preprocess)
        prep.build_model()
        APP.config["preprocess"] = prep.model.preprocess

    served_experiment = Experiment(config_path=parsed.configuration)
    served_experiment.build_model()
    APP.config["experiment"] = served_experiment
    APP.run(port=parsed.port, host=parsed.host)
Example #8
0
def _reverse_gradient(x: tf.Tensor) -> tf.Tensor:
    """Return an identity of ``x`` whose gradient has the opposite sign.

    The forward value is ``tf.identity(x)``. The gradient function
    registered for that identity op returns ``tf.negative`` of the incoming
    gradient.
    """
    override_name = "gradient_reversal_{}".format(x.name)

    # pylint: disable=unused-variable,invalid-name,unused-argument
    @ops.RegisterGradient(override_name)
    def _negated_gradient(op, grad):
        return [tf.negative(grad)]
    # pylint: enable=unused-variable,invalid-name,unused-argument

    # Imported locally, presumably to avoid a circular import.
    from neuralmonkey.experiment import Experiment
    current_graph = Experiment.get_current().graph

    # Identity ops created inside this context use the gradient above.
    mapping = {"Identity": override_name}
    with current_graph.gradient_override_map(mapping):
        return tf.identity(x)
def _reverse_gradient(x: tf.Tensor) -> tf.Tensor:
    """Pass ``x`` through unchanged while negating its gradient.

    A gradient function that negates the incoming gradient is registered
    under a name derived from ``x.name``. It is then used for the
    ``Identity`` op created for ``x`` in the current experiment's graph.
    """
    registered_as = "gradient_reversal_{}".format(x.name)

    # pylint: disable=unused-variable,invalid-name,unused-argument
    @ops.RegisterGradient(registered_as)
    def _flipped(op, grad):
        return [tf.negative(grad)]

    # pylint: enable=unused-variable,invalid-name,unused-argument

    from neuralmonkey.experiment import Experiment
    experiment = Experiment.get_current()
    with experiment.graph.gradient_override_map({"Identity": registered_as}):
        result = tf.identity(x)

    return result
Example #10
0
    def fetches(self) -> Dict[str, tf.Tensor]:
        """Collect the tensors to fetch, keyed by tensor name.

        Tensors given by name are looked up in the graph of the current
        experiment; a name that is missing only triggers a warning. Tensors
        given as attributes of model parts are keyed by their own ``name``.
        The batch dimension of every collected tensor is recorded in
        ``self.batch_ids`` under the same key.

        Raises:
            ValueError: If a model part lacks the requested attribute.
        """
        collected = {}  # type: Dict[str, tf.Tensor]

        for tensor_name, batch_dim in zip(self._names, self._batch_dims_name):
            try:
                graph = Experiment.get_current().graph
                collected[tensor_name] = graph.get_tensor_by_name(tensor_name)
                self.batch_ids[tensor_name] = batch_dim
            except KeyError:
                warn("The tensor of name '{}' is not present in the graph."
                     .format(tensor_name))

        by_attribute = zip(self._modelparts, self._tensors, self.batch_dims)
        for part, attr_name, batch_dim in by_attribute:
            if not hasattr(part, attr_name):
                raise ValueError(
                    "Model part {} does not have a tensor called {}."
                    .format(part, attr_name))

            tensor = getattr(part, attr_name)
            collected[tensor.name] = tensor
            self.batch_ids[tensor.name] = batch_dim

        return collected
Example #11
0
    def fetches(self) -> Dict[str, tf.Tensor]:
        """Return a mapping from tensor names to the tensors to be fetched.

        As a side effect, ``self.batch_ids`` receives the batch dimension of
        each tensor placed in the result. By-name tensors absent from the
        current experiment's graph are skipped with a warning. A missing
        attribute on a model part raises ``ValueError``.
        """
        result = {}  # type: Dict[str, tf.Tensor]

        # First the tensors referenced by their name in the graph.
        named = zip(self._names, self._batch_dims_name)
        for graph_name, dim in named:
            try:
                result[graph_name] = (
                    Experiment.get_current().graph.get_tensor_by_name(
                        graph_name))
                self.batch_ids[graph_name] = dim
            except KeyError:
                message = ("The tensor of name '{}' is not present in the "
                           "graph.")
                warn(message.format(graph_name))

        # Then the tensors referenced as attributes of model parts.
        referenced = zip(self._modelparts, self._tensors, self.batch_dims)
        for model_part, attribute, dim in referenced:
            if hasattr(model_part, attribute):
                value = getattr(model_part, attribute)
                result[value.name] = value
                self.batch_ids[value.name] = dim
                continue

            raise ValueError("Model part {} does not have a tensor called "
                             "{}.".format(model_part, attribute))

        return result
Example #12
0
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         batching: BatchingScheme = None,
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a buffer,
    which pre-fetches a given number of the data series lazily. In case the
    dataset is not lazy (buffer size is `None`), the iterators are built on top
    of in-memory arrays. Otherwise, the iterators operate on the data sources
    directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        batching: The batching scheme of the dataset. When `None`, a default
            scheme is created from the `main.batch_size` argument of the
            current experiment.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be loaded
            lazily into the buffer (useful for large datasets). The buffer size
            specifies the number of sequences to pre-load. This is useful for
            pseudo-shuffling of large data on-the-fly. Ideally, this should be
            (much) larger than the batch size. Note that the buffer gets
            refilled each time its size is less than half the `buffer_size`.
            When refilling, the buffer gets refilled to the specified size.
        shuffled: Passed on unchanged to the `Dataset` constructor.

    Returns:
        The newly created `Dataset`.

    Raises:
        ValueError: If the default batching scheme is needed but
            `main.batch_size` is not set, if `series` is empty, if no series
            is read from a file, if `series` and `data` differ in length, if
            a series or an output series is duplicated, or if a series-level
            preprocessor refers to a source series that has no file reader.
        FileNotFoundError: If a file of a file-based series does not exist.
    """
    check_argument_types()

    if batching is None:
        from neuralmonkey.experiment import Experiment
        log("Using default batching scheme for dataset {}.".format(name))
        # pylint: disable=no-member
        batch_size = Experiment.get_current().config.args.batch_size
        # pylint: enable=no-member
        if batch_size is None:
            raise ValueError("Argument main.batch_size is not specified, "
                             "cannot use default batching scheme.")
        batching = BatchingScheme(batch_size=batch_size)

    if not series:
        raise ValueError("No dataset series specified.")

    if not any(match_type(s, ReaderDef) for s in data):  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    # The factories below bind their arguments per call, so every returned
    # closure keeps its own reader/source/function.
    def _make_iterator(reader, files):
        def itergen():
            return reader(files)
        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())
        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)
        return itergen

    # First, prepare iterators for series using file readers
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}"
                        .format(s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            # The placeholders must be filled in, otherwise the message
            # shows literal '{}' instead of the offending series names.
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'"
                .format(s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {s_name: (path, writer)
                       for s_name, path, writer
                       in [_normalize_outputspec(out) for out in outputs]}

    if buffer_size is not None:
        return Dataset(name, iterators, batching, output_dict,
                       (buffer_size // 2, buffer_size), shuffled)

    return Dataset(name, iterators, batching, output_dict, None, shuffled)
Example #13
0
    def __init__(self,
                 output_series: str,
                 toplevel_modelpart: ModelPart,
                 toplevel_tensors: List[tf.Tensor],
                 tensors_by_name: List[str],
                 tensors_by_ref: List[tf.Tensor],
                 batch_dims_by_name: List[int],
                 batch_dims_by_ref: List[int],
                 select_session: int = None,
                 single_tensor: bool = False) -> None:
        """Construct a new ``TensorRunner`` object.

        Note that at this time, one must specify the toplevel objects so that
        it is ensured that the graph is built. The reason for this behavior is
        that the graph is constructed lazily and therefore if the tensors to
        store are provided by indirect reference (name), the system does not
        know early enough that it needs to create them.

        Args:
            output_series: The name of the generated output data series.
            toplevel_modelpart: A ``ModelPart`` object that is used as the
                top-level component of the model. This object should depend on
                values of all the wanted tensors.
            toplevel_tensors: A list of tensors that should be constructed. Use
                this when the toplevel model part does not depend on this
                tensor. The tensors are constructed during running this
                constructor method which prints them out.
            tensors_by_name: A list of tensor names to fetch. If a tensor
                is not in the graph, a warning is generated and the tensor is
                ignored.
            tensors_by_ref: A list of tensor objects to fetch.
            batch_dims_by_name: A list of integers that correspond to the
                batch dimension in each wanted tensor specified by name.
            batch_dims_by_ref: A list of integers that correspond to the
                batch dimension in each wanted tensor specified by reference.
            select_session: An optional integer specifying the session to use
                in case of ensembling. When not used, tensors from all sessions
                are stored. In case of a single session, this option has no
                effect.
            single_tensor: If `True`, it is assumed that only one tensor is to
                be fetched, and the execution result will consist of this
                tensor only. If `False`, the result will be a dict mapping
                tensor names to NumPy arrays.

        Raises:
            ValueError: If ``single_tensor`` is set but more than one tensor
                (by name and by reference together) was given.
        """
        check_argument_types()
        BaseRunner[ModelPart].__init__(self, output_series, toplevel_modelpart)

        # ``single_tensor`` allows at most one tensor in total.
        total_tensors = len(tensors_by_name) + len(tensors_by_ref)
        if single_tensor and total_tensors > 1:
            raise ValueError(
                "single_tensor is True, but {} tensors were given".format(
                    total_tensors))

        # Keep the tensor specifications and the runner options.
        self._names = tensors_by_name
        self._tensors = tensors_by_ref
        self._batch_dims_name = batch_dims_by_name
        self._batch_dims_ref = batch_dims_by_ref
        self._select_session = select_session
        self._single_tensor = single_tensor

        # The toplevel tensors are only logged here.
        log("Blessing toplevel tensors for tensor runner:")
        for tensor in toplevel_tensors:
            log("Toplevel tensor: {}".format(tensor))

        self._fetches = {}  # type: Dict[str, tf.Tensor]
        self._batch_ids = {}  # type: Dict[str, int]

        # Resolve the by-name tensors in the graph of the current experiment.
        # ``zip`` stops at the shorter list, so surplus names or batch
        # dimensions are silently ignored.
        # NOTE(review): the by-reference tensors stored above are not added
        # to ``_fetches`` in the visible part of this method -- confirm that
        # they are handled elsewhere.
        for name, bid in zip(self._names, self._batch_dims_name):
            try:
                self._fetches[name] = (
                    Experiment.get_current().graph.get_tensor_by_name(name))
                self._batch_ids[name] = bid
            except KeyError:
                warn(("The tensor of name '{}' is not present in the "
                      "graph.").format(name))
def main() -> None:
    """Decode the first test dataset with CTC beam search and a language model.

    The experiment is built and its variables are loaded. The runners of
    the model are replaced by a single ``RepresentationRunner`` that returns
    the ``logits`` of the CTC decoder found among the original runners.
    Every sentence of the first test dataset is then run through the model
    and decoded with ``decode_beam``. The hypotheses are written to the
    ``--out`` file, and per-sentence statistics (hypothesis length, model
    time, decoding time) to ``<out>.stats``.

    Raises:
        ValueError: If no ``PlainRunner`` with a ``CTCDecoder`` is found in
            the configuration.
    """
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    parser.add_argument("datasets",
                        metavar="INI-TEST-DATASETS",
                        help="the configuration of the test datasets")
    parser.add_argument("--beam",
                        metavar="BEAM_SIZE",
                        type=int,
                        default=10,
                        help="Beam size.")
    parser.add_argument("--kenlm",
                        type=str,
                        default=None,
                        help="Path to a KenLM model arpa file.")
    parser.add_argument("--lm-weight",
                        type=float,
                        help="Weight of the language model.")
    parser.add_argument("--null-trail-weight",
                        type=float,
                        help="Weight of the null-trailing feature.")
    parser.add_argument("--nt-ratio-weight",
                        type=float,
                        help="Weight of the null-token ratio feature.")
    parser.add_argument("--out", type=str, help="Path to the output file.")
    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("batch_size", cond=lambda x: x > 0)
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    # The sessions are closed in ``finally`` so that they are released even
    # when decoding fails.
    try:
        ctc_decoder = None
        for runner in exp.model.runners:
            if (isinstance(runner, PlainRunner)
                    and isinstance(runner.decoder, CTCDecoder)):
                ctc_decoder = runner.decoder
                break

        if ctc_decoder is None:
            raise ValueError(
                "Was not able to detect CTC decoder in the configuration.")

        # Only the logits of the CTC decoder are needed from the model.
        logits_runner = RepresentationRunner(output_series="logits",
                                             encoder=ctc_decoder,
                                             attribute="logits")
        exp.model.runners = [logits_runner]

        dataset = datasets_model.test_datasets[0]
        singleton_batches = dataset.batches(BatchingScheme(1))
        print("Loading language model")
        lm = NGramModel(args.kenlm)
        print("LM loaded")

        # Compare with ``None`` so that an explicit weight of 0.0 is kept
        # instead of being treated as "not specified".
        weights = {}

        if args.lm_weight is not None:
            weights['lm_score'] = args.lm_weight

        if args.null_trail_weight is not None:
            weights['null_trailing'] = args.null_trail_weight

        if args.nt_ratio_weight is not None:
            weights['null_token_ratio'] = args.nt_ratio_weight

        print("Weights:", weights)

        stats = []

        with open(args.out, 'w') as out_file:
            for i, sent_dataset in enumerate(singleton_batches):

                t1 = timeit.default_timer()
                ctc_model_result = exp.run_model(sent_dataset,
                                                 write_out=False,
                                                 batch_size=1)
                t2 = timeit.default_timer()

                logits = np.squeeze(ctc_model_result[1]['logits'], axis=1)

                t3 = timeit.default_timer()
                best_hyp = decode_beam(logits,
                                       args.beam,
                                       ctc_decoder.vocabulary,
                                       lm=lm,
                                       weights=weights)
                t4 = timeit.default_timer()

                stats.append([len(best_hyp.tokens), t2 - t1, t4 - t3])

                output = " ".join(best_hyp.tokens)
                out_file.write(output + "\n")

                # Print every tenth hypothesis as a progress indicator.
                if i % 10 == 0:
                    print("[{}] {}".format(i, output))

        with open(args.out + ".stats", 'w') as stats_file:
            for line in stats:
                stats_file.write("{} {:.3f} {:.3f}\n".format(*line))
    finally:
        for session in exp.config.model.tf_manager.sessions:
            session.close()
Example #15
0
def _get_current_experiment():
    """Return the result of ``Experiment.get_current()``."""
    # The import happens at call time because a module-level import would
    # be circular.
    from neuralmonkey.experiment import Experiment
    current = Experiment.get_current()
    return current
Example #16
0
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         batching: BatchingScheme = None,
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a buffer,
    which pre-fetches a given number of the data series lazily. In case the
    dataset is not lazy (buffer size is `None`), the iterators are built on top
    of in-memory arrays. Otherwise, the iterators operate on the data sources
    directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        batching: The batching scheme of the dataset. If `None`, a default
            scheme is built from the `main.batch_size` argument of the
            current experiment.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be loaded
            lazily into the buffer (useful for large datasets). The buffer size
            specifies the number of sequences to pre-load. This is useful for
            pseudo-shuffling of large data on-the-fly. Ideally, this should be
            (much) larger than the batch size. Note that the buffer gets
            refilled each time its size is less than half the `buffer_size`.
            When refilling, the buffer gets refilled to the specified size.
        shuffled: Handed over unchanged to the `Dataset` constructor.

    Returns:
        The newly created `Dataset`.

    Raises:
        ValueError: If the specification is inconsistent: the default
            batching scheme is needed but `main.batch_size` is unset,
            `series` is empty, no series comes from a file, `series` and
            `data` have different lengths, a series or an output series is
            duplicated, or the source of a series-level preprocessor is not
            a file-based series.
        FileNotFoundError: If a file of a file-based series does not exist.
    """
    check_argument_types()

    if batching is None:
        from neuralmonkey.experiment import Experiment
        log("Using default batching scheme for dataset {}.".format(name))
        # pylint: disable=no-member
        batch_size = Experiment.get_current().config.args.batch_size
        # pylint: enable=no-member
        if batch_size is None:
            raise ValueError("Argument main.batch_size is not specified, "
                             "cannot use default batching scheme.")
        batching = BatchingScheme(batch_size=batch_size)

    if not series:
        raise ValueError("No dataset series specified.")

    if not any(match_type(s, ReaderDef) for s in data):  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    # Each factory returns a zero-argument callable that creates a fresh
    # iterator; the arguments are bound per call of the factory.
    def _make_iterator(reader, files):
        def itergen():
            return reader(files)

        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())

        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)

        return itergen

    # First, prepare iterators for series using file readers
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}".format(
                            s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            # Fill in the placeholders; without ``format`` the message
            # would contain literal '{}' instead of the series names.
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'".format(
                    s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {
            s_name: (path, writer)
            for s_name, path, writer in
            [_normalize_outputspec(out) for out in outputs]
        }

    # A lazy dataset gets the pair (half the buffer size, buffer size);
    # otherwise no buffer specification is passed.
    buffer_spec = None
    if buffer_size is not None:
        buffer_spec = (buffer_size // 2, buffer_size)

    return Dataset(name, iterators, batching, output_dict, buffer_spec,
                   shuffled)
def main() -> None:
    """Train default weights of the CTC beam-search features.

    Parses the command line, builds the experiment and loads its variables,
    then replaces the experiment runners with a single
    ``RepresentationRunner`` exposing the ``logits`` of the CTC decoder.
    ``train_weights`` is called for every sentence of the first test dataset;
    the current weights are printed after each sentence and periodically
    written to ``<prefix>.<index>`` files as ``KEY=value`` lines.

    Raises:
        ValueError: If none of the weight options was given, or if no
            ``PlainRunner`` with a ``CTCDecoder`` is among the runners.
    """
    # pylint: disable=no-member,broad-except
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("config",
                        metavar="INI-FILE",
                        help="the configuration file of the experiment")
    # The help text used to be a copy of the one for "config".
    parser.add_argument("datasets",
                        metavar="INI-TEST-DATASETS",
                        help="the configuration of the test datasets")
    parser.add_argument("--beam",
                        metavar="BEAM_SIZE",
                        type=int,
                        default=10,
                        help="Beam size.")
    # Both the language model and the checkpoint prefix are used
    # unconditionally below, so they must be supplied.
    parser.add_argument("--kenlm",
                        type=str,
                        required=True,
                        help="Path to a KenLM model arpa file.")
    parser.add_argument("--prefix",
                        type=str,
                        required=True,
                        help="Path used as a prefix of stored checkpoints.")
    parser.add_argument("--lm-weight",
                        type=float,
                        help="Default weight of the language model.")
    parser.add_argument("--null-trail-weight",
                        type=float,
                        help="Default weight of the null-trailing feature.")
    parser.add_argument("--nt-ratio-weight",
                        type=float,
                        help="Default weight of the null-token ratio feature.")

    args = parser.parse_args()

    test_datasets = Configuration()
    test_datasets.add_argument("test_datasets")
    test_datasets.add_argument("batch_size", cond=lambda x: x > 0)
    test_datasets.add_argument("variables", cond=lambda x: isinstance(x, list))

    test_datasets.load_file(args.datasets)
    test_datasets.build_model()
    datasets_model = test_datasets.model

    exp = Experiment(config_path=args.config)
    exp.build_model()
    exp.load_variables(datasets_model.variables)

    # Only the features whose initial weight was given on the command line
    # take part in the training.
    weights = {}

    if args.lm_weight is not None:
        weights['lm_score'] = args.lm_weight

    if args.null_trail_weight is not None:
        weights['null_trailing'] = args.null_trail_weight

    if args.nt_ratio_weight is not None:
        weights['null_token_ratio'] = args.nt_ratio_weight

    if not weights:
        raise ValueError("No default weights specified, nothing to train.")

    # Use the decoder of the first plain runner that wraps a CTC decoder.
    ctc_decoder = None
    for runner in exp.model.runners:
        if (isinstance(runner, PlainRunner)
                and isinstance(runner.decoder, CTCDecoder)):
            ctc_decoder = runner.decoder
            break

    if ctc_decoder is None:
        raise ValueError(
            "Was not able to detect CTC decoder in the configuration.")

    print("Loading language model")
    lm = NGramModel(args.kenlm)
    print("LM loaded")

    # From now on the experiment only produces the decoder logits.
    logits_runner = RepresentationRunner(output_series="logits",
                                         encoder=ctc_decoder,
                                         attribute="logits")
    exp.model.runners = [logits_runner]

    dataset = datasets_model.test_datasets[0]
    singleton_batches = dataset.batches(BatchingScheme(1))

    dataset_size = dataset.length
    checkpoints = 5
    # Clamp to 1: for a dataset shorter than the number of checkpoints the
    # integer division gives 0 and the modulo in the loop would raise
    # ZeroDivisionError.
    checkpoint_iters = max(1, dataset_size // checkpoints)

    print(
        "{} sentences in the dataset, checkpoint every {} sentences "
        "({} checkpoints in total)."
        .format(dataset_size, checkpoint_iters, checkpoints))

    try:
        for i, sent_dataset in enumerate(singleton_batches):
            ctc_model_result = exp.run_model(sent_dataset,
                                             write_out=False,
                                             batch_size=1)

            # NOTE(review): element 1 presumably holds the runner outputs and
            # element 2 the input series of the batch -- confirm against
            # Experiment.run_model.
            logits = np.squeeze(ctc_model_result[1]['logits'], axis=1)
            target = ctc_model_result[2]['target'][0]

            # `weights` is passed in and printed afterwards; presumably it is
            # updated in place by train_weights -- confirm.
            train_weights(logits, args.beam, ctc_decoder.vocabulary, target,
                          weights, lm)

            print(
                "[{}] Weights:".format(i + 1), ", ".join([
                    "{}: {:.3f}".format(key, value)
                    for key, value in weights.items()
                ]))

            if i != 0 and (i + 1) % checkpoint_iters == 0:
                checkpoint_path = "{}.{}".format(args.prefix,
                                                 i // checkpoint_iters)
                with open(checkpoint_path, "w") as f:
                    for key, value in weights.items():
                        f.write("{}={:.3f}\n".format(key.upper(), value))

                print("\nCheckpoint saved.\n")
    finally:
        # Release the TensorFlow sessions even when training is interrupted.
        for session in exp.config.model.tf_manager.sessions:
            session.close()
Example #18
0
def _get_current_experiment():
    """Return the result of ``Experiment.get_current()``.

    ``Experiment`` is imported inside the function body, at call time,
    because a module-level import would be circular.
    """
    from neuralmonkey.experiment import Experiment

    current = Experiment.get_current()
    return current
Example #19
0
def main() -> None:
    """Run an experiment on the test datasets, optionally evaluating it.

    The experiment is built from the ``config`` file (with the ``--set`` and
    ``--var`` overrides applied) and its variables are loaded from the
    runtime configuration given by ``datasets``. Every test dataset is then
    either just run through the model or, when the experiment configures an
    evaluation, evaluated. With ``--grid`` only a slice of the (single)
    dataset selected by the SGE task variables is processed. Evaluation
    results are written as JSON when ``--json`` is given.

    Raises:
        ValueError: If ``--grid`` is used with more than one test dataset.
        EnvironmentError: If ``--grid`` is used and an SGE variable is unset.
    """
    # pylint: disable=no-member
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("config", metavar="INI-FILE",
                     help="the configuration file of the experiment")
    cli.add_argument("datasets", metavar="INI-TEST-DATASETS",
                     help="the configuration of the test datasets")
    cli.add_argument("-s", "--set", dest="config_changes", type=str,
                     metavar="SETTING", action="append", default=[],
                     help="override an option in the configuration; the "
                     "syntax is [section.]option=value")
    cli.add_argument("-v", "--var", dest="config_vars", type=str,
                     metavar="VAR", action="append", default=[],
                     help="set a variable in the configuration; the syntax "
                     "is var=value (shorthand for -s vars.var=value)")
    cli.add_argument("--json", type=str, help="write the evaluation "
                     "results to this file in JSON format")
    cli.add_argument("-g", "--grid", dest="grid", action="store_true",
                     help="look at the SGE variables for slicing the data")
    opts = cli.parse_args()

    runtime_model = load_runtime_config(opts.datasets)

    # Every "-v name=value" becomes the override "vars.name=value".
    for assignment in opts.config_vars:
        opts.config_changes.append("vars.{}".format(assignment))

    experiment = Experiment(config_path=opts.config,
                            config_changes=opts.config_changes)
    experiment.build_model()
    experiment.load_variables(runtime_model.variables)

    if opts.grid and len(runtime_model.test_datasets) > 1:
        raise ValueError("Only one test dataset supported when using --grid")

    sge_variables = ("SGE_TASK_FIRST", "SGE_TASK_LAST",
                     "SGE_TASK_STEPSIZE", "SGE_TASK_ID")

    outcomes = []
    for test_set in runtime_model.test_datasets:
        if opts.grid:
            if not all(name in os.environ for name in sge_variables):
                raise EnvironmentError(
                    "Some SGE environment variables are missing")

            step = int(os.environ["SGE_TASK_STEPSIZE"])
            first = int(os.environ["SGE_TASK_ID"]) - 1
            last = int(os.environ["SGE_TASK_LAST"]) - 1

            # Shorten the slice when a full step would pass the last task.
            step = min(step, last - first + 1)

            log("Running grid task {} starting at {} with step {}"
                .format(first // step, first, step))

            test_set = test_set.subset(first, step)

        if experiment.config.args.evaluation is not None:
            outcomes.append(experiment.evaluate(test_set, write_out=True))
        else:
            experiment.run_model(test_set, write_out=True)

    if opts.json:
        with open(opts.json, "w") as json_file:
            json.dump(outcomes, json_file)
            json_file.write("\n")

    for tf_session in experiment.config.model.tf_manager.sessions:
        tf_session.close()
Example #20
0
def main() -> None:
    """Run an experiment on the test datasets with a configured batch size.

    Builds the experiment from ``config``, loads the variables named in the
    runtime configuration ``datasets`` and processes each test dataset using
    the batch size from that runtime configuration. A dataset is evaluated
    when the experiment configures an evaluation and is only run otherwise.
    ``--grid`` restricts processing to the slice of the (single) dataset
    selected by the SGE task variables; ``--json`` stores the evaluation
    results in a JSON file.

    Raises:
        ValueError: If ``--grid`` is used with more than one test dataset.
        EnvironmentError: If ``--grid`` is used and an SGE variable is unset.
    """
    # pylint: disable=no-member
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument(
        "config", metavar="INI-FILE",
        help="the configuration file of the experiment")
    arg_parser.add_argument(
        "datasets", metavar="INI-TEST-DATASETS",
        help="the configuration of the test datasets")
    arg_parser.add_argument(
        "--json", type=str,
        help="write the evaluation " "results to this file in JSON format")
    arg_parser.add_argument(
        "-g", "--grid", dest="grid", action="store_true",
        help="look at the SGE variables for slicing the data")
    cli_args = arg_parser.parse_args()

    runtime = load_runtime_config(cli_args.datasets)

    experiment = Experiment(config_path=cli_args.config)
    experiment.build_model()
    experiment.load_variables(runtime.variables)

    if cli_args.grid and len(runtime.test_datasets) > 1:
        raise ValueError("Only one test dataset supported when using --grid")

    collected = []
    for current in runtime.test_datasets:
        if cli_args.grid:
            env = os.environ
            if ("SGE_TASK_FIRST" not in env or "SGE_TASK_LAST" not in env
                    or "SGE_TASK_STEPSIZE" not in env
                    or "SGE_TASK_ID" not in env):
                raise EnvironmentError(
                    "Some SGE environment variables are missing")

            size = int(env["SGE_TASK_STEPSIZE"])
            offset = int(env["SGE_TASK_ID"]) - 1
            final = int(env["SGE_TASK_LAST"]) - 1

            # Shorten the slice when a full step would pass the last task.
            remaining = final - offset + 1
            if size >= remaining:
                size = remaining

            message = "Running grid task {} starting at {} with step {}"
            log(message.format(offset // size, offset, size))

            current = current.subset(offset, size)

        if experiment.config.args.evaluation is None:
            experiment.run_model(
                current, write_out=True, batch_size=runtime.batch_size)
            continue

        collected.append(
            experiment.evaluate(
                current, write_out=True, batch_size=runtime.batch_size))

    if cli_args.json:
        with open(cli_args.json, "w") as handle:
            json.dump(collected, handle)
            handle.write("\n")

    for tf_session in experiment.config.model.tf_manager.sessions:
        tf_session.close()
class NeuralMonkeyModelWrapper(ModelWrapper):
    """Model wrapper that runs a Neural Monkey experiment on array inputs.

    The experiment is built and its variables are loaded once in the
    constructor; ``run`` then feeds the inputs through the model and collects
    the configured output series.
    """

    def __init__(self,
                 runs_on_features,
                 config_path,
                 vars_path,
                 data_series="",
                 src_caption_series="",
                 caption_series="",
                 alignments_series="",
                 bs_graph_series="bs_target",
                 name=None):
        """Build the experiment and load its variables.

        Args:
            runs_on_features: Passed on to the ``ModelWrapper`` constructor.
            config_path: Path to the experiment configuration file.
            vars_path: Path to the variables loaded into the experiment.
            data_series: Name of the series the inputs are fed under.
            src_caption_series: Name of the source caption series; a
                non-empty value marks the wrapper as multimodal.
            caption_series: GreedyRunner output_series.
            alignments_series: WordAlignmentRunner output_series.
            bs_graph_series: BeamSearchRunner output_series.
            name: Passed on to the ``ModelWrapper`` constructor.

        Raises:
            ValueError: If ``config_path`` is not an existing file.
        """

        super(NeuralMonkeyModelWrapper, self).__init__(name, runs_on_features)

        if not os.path.isfile(config_path):
            raise ValueError("File {} does not exist.".format(config_path))

        self._config_path = config_path
        self._vars_path = vars_path
        self._data_series = data_series
        self._src_caption_series = src_caption_series
        self._caption_series = caption_series
        self._alignments_series = alignments_series
        self._bs_graph_series = bs_graph_series

        self._exp = Experiment(config_path=config_path)
        self._exp.build_model()
        self._exp.load_variables([vars_path])

        self.multimodal = bool(self._src_caption_series)

    def run(self, inputs, src_captions=None):
        """Run the model on the inputs and collect the configured outputs.

        Args:
            inputs: A Numpy Array of inputs (feature, or image arrays).
            src_captions: A list of string source captions. Currently unused.

        Returns:
            A list of dictionaries. A dictionary has a ``greedy`` entry (a
            dictionary with the keys ``caption`` and/or ``alignments``) when
            greedy outputs are available, and a ``beam_search`` entry (a
            dictionary with the keys ``graph``, ``captions`` and
            ``alignments``) when beam search outputs are available.

        Raises:
            NotImplementedError: If a source caption series is configured.
        """

        n_elems = inputs.shape[0]

        if self._src_caption_series:
            # TODO: handle multimodal translation case
            # Previously the dataset was left undefined here and the call
            # failed later with an UnboundLocalError.
            raise NotImplementedError(
                "Running with a source caption series is not supported yet.")

        # The same dataset is built for models running on images and for
        # models running on feature maps.
        ds = Dataset("macaque_data", {self._data_series: lambda: inputs}, {})

        _, output_series = self._exp.run_model(dataset=ds, write_out=False)

        no_output = [None] * n_elems

        if self._caption_series and self._caption_series in output_series:
            captions = output_series[self._caption_series]
        else:
            captions = no_output

        if (self._alignments_series
                and self._alignments_series in output_series
                and not self._bs_graph_series):
            alignments = output_series[self._alignments_series]
        else:
            # WordAlignmentRunner is incompatible with beam search decoding,
            # so its output is dropped when a beam search series is set. A
            # list of placeholders (not a bare None) keeps zip() below
            # working.
            alignments = no_output

        if self._bs_graph_series and self._bs_graph_series in output_series:
            graphs = []
            for beam in output_series[self._bs_graph_series]:
                attns = [_transform_alignments(a) for a in beam['alignments']]
                graphs.append(
                    BeamSearchOutputGraph(scores=beam['scores'],
                                          tokens=beam['tokens'],
                                          parent_ids=beam['parent_ids'],
                                          alignments=attns))

            hyps = [g.collect_hypotheses() for g in graphs]
            bs_caps = [h['tokens'] for h in hyps]
            bs_attns = [h['alignments'] for h in hyps]
        else:
            graphs = no_output
            bs_caps = no_output
            bs_attns = no_output

        results = []
        for cap, align, graph, bs_cap, bs_attn in zip(
                captions, alignments, graphs, bs_caps, bs_attns):
            greedy = {
                key: value
                for key, value in (('caption', cap), ('alignments', align))
                if value is not None
            }
            beam_search = {
                key: value
                for key, value in (('graph', graph), ('captions', bs_cap),
                                   ('alignments', bs_attn))
                if value is not None
            }

            # A group is only reported when it has at least one value.
            result = {}
            if greedy:
                result['greedy'] = greedy
            if beam_search:
                result['beam_search'] = beam_search
            results.append(result)
        return results