Ejemplo n.º 1
0
def _reverse_gradient(x: tf.Tensor) -> tf.Tensor:
    """Wrap ``x`` in an identity op whose gradient is negated.

    A gradient function named after ``x`` is registered and mapped onto the
    ``Identity`` op type in the graph of the current experiment. The identity
    op created under that mapping is returned.

    Arguments:
        x: The tensor to wrap.

    Returns:
        The result of ``tf.identity(x)`` created under the gradient override.
    """
    override_name = "gradient_reversal_{}".format(x.name)

    # pylint: disable=unused-argument
    def _negate(op, grad):
        # The registered gradient function returns the negated gradient.
        return [tf.negative(grad)]
    # pylint: enable=unused-argument

    # Same effect as decorating ``_negate`` with ``@ops.RegisterGradient``.
    ops.RegisterGradient(override_name)(_negate)

    from neuralmonkey.experiment import Experiment
    current_graph = Experiment.get_current().graph
    with current_graph.gradient_override_map({"Identity": override_name}):
        return tf.identity(x)
Ejemplo n.º 2
0
def _reverse_gradient(x: tf.Tensor) -> tf.Tensor:
    """Return an identity copy of ``x`` with a sign-flipping gradient.

    The function registers a gradient function under a name derived from
    ``x.name`` and creates ``tf.identity(x)`` while the ``Identity`` op type
    is overridden with that gradient in the current experiment's graph.

    Arguments:
        x: The tensor to pass through.

    Returns:
        The identity tensor built inside the gradient override context.
    """
    reversal_name = "gradient_reversal_{}".format(x.name)

    # pylint: disable=unused-argument
    def _negated_gradient(op, grad):
        # Only the incoming gradient is used; it is returned negated.
        return [tf.negative(grad)]
    # pylint: enable=unused-argument

    # Explicit call form of the ``@ops.RegisterGradient(...)`` decorator.
    ops.RegisterGradient(reversal_name)(_negated_gradient)

    from neuralmonkey.experiment import Experiment
    experiment_graph = Experiment.get_current().graph
    with experiment_graph.gradient_override_map({"Identity": reversal_name}):
        return tf.identity(x)
Ejemplo n.º 3
0
    def fetches(self) -> Dict[str, tf.Tensor]:
        """Collect the tensors to fetch, keyed by tensor name.

        Tensors listed in ``self._names`` are looked up in the graph of the
        current experiment; a name that raises ``KeyError`` is reported with
        a warning and left out. Tensors listed in ``self._tensors`` are read
        as attributes of the matching entry of ``self._modelparts``. For each
        collected tensor, its batch dimension is stored in ``self.batch_ids``
        under the same key.

        Returns:
            A dictionary mapping tensor names to tensors.

        Raises:
            ValueError: If a model part lacks the requested attribute.
        """
        collected = {}  # type: Dict[str, tf.Tensor]

        # Tensors requested by their graph name.
        by_name = zip(self._names, self._batch_dims_name)
        for tensor_name, batch_dim in by_name:
            try:
                graph = Experiment.get_current().graph
                collected[tensor_name] = graph.get_tensor_by_name(tensor_name)
                self.batch_ids[tensor_name] = batch_dim
            except KeyError:
                message = ("The tensor of name '{}' is not present in the "
                           "graph.")
                warn(message.format(tensor_name))

        # Tensors requested as attributes of model parts.
        by_attribute = zip(self._modelparts, self._tensors, self.batch_dims)
        for part, attr_name, batch_dim in by_attribute:
            if not hasattr(part, attr_name):
                raise ValueError("Model part {} does not have a tensor called "
                                 "{}.".format(part, attr_name))

            tensor = getattr(part, attr_name)
            key = tensor.name
            collected[key] = tensor
            self.batch_ids[key] = batch_dim

        return collected
Ejemplo n.º 4
0
    def fetches(self) -> Dict[str, tf.Tensor]:
        """Build the mapping from tensor names to the tensors to be fetched.

        Two sources are combined: names from ``self._names``, resolved in the
        current experiment's graph (a ``KeyError`` during lookup produces a
        warning and the name is skipped), and attribute names from
        ``self._tensors``, resolved on the corresponding objects in
        ``self._modelparts``. The batch dimension of every collected tensor
        is recorded in ``self.batch_ids``.

        Returns:
            A dictionary mapping tensor names to tensors.

        Raises:
            ValueError: If a model part has no attribute of the given name.
        """
        result = {}  # type: Dict[str, tf.Tensor]

        # First, the tensors addressed by name in the graph.
        name_pairs = zip(self._names, self._batch_dims_name)
        for graph_name, dim in name_pairs:
            try:
                current_graph = Experiment.get_current().graph
                result[graph_name] = current_graph.get_tensor_by_name(
                    graph_name)
                self.batch_ids[graph_name] = dim
            except KeyError:
                template = ("The tensor of name '{}' is not present in the "
                            "graph.")
                warn(template.format(graph_name))

        # Second, the tensors addressed as model part attributes.
        part_triples = zip(self._modelparts, self._tensors, self.batch_dims)
        for model_part, attribute, dim in part_triples:
            if not hasattr(model_part, attribute):
                raise ValueError("Model part {} does not have a tensor called "
                                 "{}.".format(model_part, attribute))

            found = getattr(model_part, attribute)
            found_name = found.name
            result[found_name] = found
            self.batch_ids[found_name] = dim

        return result
Ejemplo n.º 5
0
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         batching: BatchingScheme = None,
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a buffer,
    which pre-fetches a given number of the data series lazily. In case the
    dataset is not lazy (buffer size is `None`), the iterators are built on top
    of in-memory arrays. Otherwise, the iterators operate on the data sources
    directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        batching: The batching scheme. When `None`, a `BatchingScheme` is
            created from the `batch_size` argument of the current experiment
            configuration.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be loaded
            lazily into the buffer (useful for large datasets). The buffer size
            specifies the number of sequences to pre-load. This is useful for
            pseudo-shuffling of large data on-the-fly. Ideally, this should be
            (much) larger than the batch size. Note that the buffer gets
            refilled each time its size is less than half the `buffer_size`.
            When refilling, the buffer gets refilled to the specified size.
        shuffled: A flag passed on to the `Dataset` constructor.

    Returns:
        The newly constructed `Dataset` object.

    Raises:
        ValueError: If no batching scheme is given and the experiment has no
            batch size, if `series` is empty, if no data source is a reader
            definition, if `series` and `data` differ in length, if `series`
            or the output series contain duplicates, or if a series-level
            preprocessor refers to a source series that has no iterator.
        FileNotFoundError: If a file of a reader-based series does not exist.
    """
    check_argument_types()

    if batching is None:
        from neuralmonkey.experiment import Experiment
        log("Using default batching scheme for dataset {}.".format(name))
        # pylint: disable=no-member
        batch_size = Experiment.get_current().config.args.batch_size
        # pylint: enable=no-member
        if batch_size is None:
            raise ValueError("Argument main.batch_size is not specified, "
                             "cannot use default batching scheme.")
        batching = BatchingScheme(batch_size=batch_size)

    if not series:
        raise ValueError("No dataset series specified.")

    if not any(match_type(s, ReaderDef) for s in data):  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    # The factories below bind their arguments per call, so that each
    # generated closure keeps its own reader, source or function instead of
    # sharing the loop variables of the loops further down.
    def _make_iterator(reader, files):
        def itergen():
            return reader(files)
        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())
        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)
        return itergen

    # First, prepare iterators for series using file readers
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}"
                        .format(s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            # Fill in the placeholders so the message names both series.
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'"
                .format(s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {s_name: (path, writer)
                       for s_name, path, writer
                       in [_normalize_outputspec(out) for out in outputs]}

    if buffer_size is not None:
        # The buffer is described by the pair (half the size, full size).
        return Dataset(name, iterators, batching, output_dict,
                       (buffer_size // 2, buffer_size), shuffled)

    return Dataset(name, iterators, batching, output_dict, None, shuffled)
Ejemplo n.º 6
0
    def __init__(self,
                 output_series: str,
                 toplevel_modelpart: ModelPart,
                 toplevel_tensors: List[tf.Tensor],
                 tensors_by_name: List[str],
                 tensors_by_ref: List[tf.Tensor],
                 batch_dims_by_name: List[int],
                 batch_dims_by_ref: List[int],
                 select_session: int = None,
                 single_tensor: bool = False) -> None:
        """Construct a new ``TensorRunner`` object.

        The toplevel objects have to be passed explicitly so that the graph
        is guaranteed to be built. The graph is constructed lazily, so when
        the tensors to store are given only by indirect reference (name), the
        system does not know early enough that it needs to create them.

        Args:
            output_series: The name of the generated output data series.
            toplevel_modelpart: A ``ModelPart`` object used as the top-level
                component of the model. It should depend on the values of all
                the wanted tensors.
            toplevel_tensors: A list of tensors that should be constructed.
                Use it for tensors the toplevel model part does not depend
                on. Each of them is written to the log by this constructor.
            tensors_by_name: A list of tensor names to fetch. A name that is
                not found in the graph produces a warning and is ignored.
            tensors_by_ref: A list of tensor objects to fetch.
            batch_dims_by_name: A list of integers giving the batch dimension
                of each wanted tensor specified by name.
            batch_dims_by_ref: A list of integers giving the batch dimension
                of each wanted tensor specified by reference.
            select_session: An optional integer specifying the session to use
                in case of ensembling. When not used, tensors from all
                sessions are stored. With a single session, this option has
                no effect.
            single_tensor: If `True`, it is assumed that only one tensor is
                to be fetched, and the execution result will consist of this
                tensor only. If `False`, the result will be a dict mapping
                tensor names to NumPy arrays.

        Raises:
            ValueError: If ``single_tensor`` is set and more than one tensor
                was given in total.
        """
        check_argument_types()
        BaseRunner[ModelPart].__init__(self, output_series, toplevel_modelpart)

        requested = len(tensors_by_name) + len(tensors_by_ref)
        if requested > 1 and single_tensor:
            raise ValueError(
                "single_tensor is True, but {} tensors were given".format(
                    requested))

        # Keep the constructor arguments for later use.
        self._single_tensor = single_tensor
        self._select_session = select_session
        self._names = tensors_by_name
        self._batch_dims_name = batch_dims_by_name
        self._tensors = tensors_by_ref
        self._batch_dims_ref = batch_dims_by_ref

        log("Blessing toplevel tensors for tensor runner:")
        for toplevel in toplevel_tensors:
            log("Toplevel tensor: {}".format(toplevel))

        self._fetches = {}  # type: Dict[str, tf.Tensor]
        self._batch_ids = {}  # type: Dict[str, int]

        # Resolve the tensors given by name in the current experiment graph.
        for tensor_name, batch_dim in zip(self._names, self._batch_dims_name):
            try:
                graph = Experiment.get_current().graph
                self._fetches[tensor_name] = graph.get_tensor_by_name(
                    tensor_name)
                self._batch_ids[tensor_name] = batch_dim
            except KeyError:
                message = ("The tensor of name '{}' is not present in the "
                           "graph.")
                warn(message.format(tensor_name))
Ejemplo n.º 7
0
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         batching: BatchingScheme = None,
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a buffer,
    which pre-fetches a given number of the data series lazily. In case the
    dataset is not lazy (buffer size is `None`), the iterators are built on top
    of in-memory arrays. Otherwise, the iterators operate on the data sources
    directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        batching: The batching scheme. When `None`, a `BatchingScheme` is
            created from the `batch_size` argument of the current experiment
            configuration.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be loaded
            lazily into the buffer (useful for large datasets). The buffer size
            specifies the number of sequences to pre-load. This is useful for
            pseudo-shuffling of large data on-the-fly. Ideally, this should be
            (much) larger than the batch size. Note that the buffer gets
            refilled each time its size is less than half the `buffer_size`.
            When refilling, the buffer gets refilled to the specified size.
        shuffled: A flag passed on to the `Dataset` constructor.

    Returns:
        The newly constructed `Dataset` object.

    Raises:
        ValueError: If no batching scheme is given and the experiment has no
            batch size, if `series` is empty, if no data source is a reader
            definition, if `series` and `data` differ in length, if `series`
            or the output series contain duplicates, or if a series-level
            preprocessor refers to a source series that has no iterator.
        FileNotFoundError: If a file of a reader-based series does not exist.
    """
    check_argument_types()

    if batching is None:
        from neuralmonkey.experiment import Experiment
        log("Using default batching scheme for dataset {}.".format(name))
        # pylint: disable=no-member
        batch_size = Experiment.get_current().config.args.batch_size
        # pylint: enable=no-member
        if batch_size is None:
            raise ValueError("Argument main.batch_size is not specified, "
                             "cannot use default batching scheme.")
        batching = BatchingScheme(batch_size=batch_size)

    if not series:
        raise ValueError("No dataset series specified.")

    if not any(match_type(s, ReaderDef) for s in data):  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    # The factories below bind their arguments per call, so that each
    # generated closure keeps its own reader, source or function instead of
    # sharing the loop variables of the loops further down.
    def _make_iterator(reader, files):
        def itergen():
            return reader(files)

        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())

        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)

        return itergen

    # First, prepare iterators for series using file readers
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}".format(
                            s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            # Fill in the placeholders so the message names both series.
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'".format(
                    s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {
            s_name: (path, writer)
            for s_name, path, writer in
            [_normalize_outputspec(out) for out in outputs]
        }

    if buffer_size is not None:
        # The buffer is described by the pair (half the size, full size).
        return Dataset(name, iterators, batching, output_dict,
                       (buffer_size // 2, buffer_size), shuffled)

    return Dataset(name, iterators, batching, output_dict, None, shuffled)
Ejemplo n.º 8
0
def _get_current_experiment():
    """Return the result of ``Experiment.get_current()``.

    ``Experiment`` is imported inside the function body instead of at module
    level in order to avoid circular imports.
    """
    from neuralmonkey.experiment import Experiment

    current = Experiment.get_current()
    return current
Ejemplo n.º 9
0
def _get_current_experiment():
    """Fetch the value returned by ``Experiment.get_current()``.

    The import of ``Experiment`` is deferred to call time; doing it at module
    level would lead to circular imports.
    """
    from neuralmonkey.experiment import Experiment

    experiment = Experiment.get_current()
    return experiment