def _reverse_gradient(x: tf.Tensor) -> tf.Tensor:
    """Flips the sign of the incoming gradient during training."""
    grad_name = "gradient_reversal_{}".format(x.name)

    # pylint: disable=unused-variable,invalid-name,unused-argument
    @ops.RegisterGradient(grad_name)
    def _flip_gradients(op, grad):
        return [tf.negative(grad)]
    # pylint: enable=unused-variable,invalid-name,unused-argument

    from neuralmonkey.experiment import Experiment
    graph = Experiment.get_current().graph
    with graph.gradient_override_map({"Identity": grad_name}):
        y = tf.identity(x)

    return y
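# A minimal, self-contained sketch of the same gradient-override technique,
# assuming TensorFlow 1.x. It does not rely on Neural Monkey's Experiment;
# the op name "FlipGradientSketch" and the helper below are illustrative only.
import tensorflow as tf
from tensorflow.python.framework import ops


@ops.RegisterGradient("FlipGradientSketch")
def _flip_gradient_sketch(op, grad):  # pylint: disable=unused-argument
    # Negate the gradient flowing through the overridden Identity op.
    return [tf.negative(grad)]


def reverse_gradient_sketch(x: tf.Tensor) -> tf.Tensor:
    """Identity in the forward pass, negated gradient in the backward pass."""
    graph = tf.get_default_graph()
    with graph.gradient_override_map({"Identity": "FlipGradientSketch"}):
        return tf.identity(x)


# The forward value is unchanged, but dy/dx is -1 instead of +1.
x = tf.constant(3.0)
y = reverse_gradient_sketch(x)
grad = tf.gradients(y, x)[0]
with tf.Session() as sess:
    print(sess.run([y, grad]))  # [3.0, -1.0]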
def fetches(self) -> Dict[str, tf.Tensor]:
    fetches = {}  # type: Dict[str, tf.Tensor]

    for name, bid in zip(self._names, self._batch_dims_name):
        try:
            fetches[name] = (
                Experiment.get_current().graph.get_tensor_by_name(name))
            self.batch_ids[name] = bid
        except KeyError:
            warn(("The tensor of name '{}' is not present in the "
                  "graph.").format(name))

    for mpart, tname, bid in zip(self._modelparts, self._tensors,
                                 self.batch_dims):
        if not hasattr(mpart, tname):
            raise ValueError("Model part {} does not have a tensor called "
                             "{}.".format(mpart, tname))

        tensorval = getattr(mpart, tname)
        fetches[tensorval.name] = tensorval
        self.batch_ids[tensorval.name] = bid

    return fetches
def load(name: str,
         series: List[str],
         data: List[SourceSpec],
         batching: BatchingScheme = None,
         outputs: List[OutputSpec] = None,
         buffer_size: int = None,
         shuffled: bool = False) -> "Dataset":
    """Create a dataset using specification from the configuration.

    The dataset provides iterators over data series. The dataset has a
    buffer, which lazily pre-fetches a given number of items from the data
    series. In case the dataset is not lazy (buffer size is `None`), the
    iterators are built on top of in-memory arrays. Otherwise, the iterators
    operate on the data sources directly.

    Arguments:
        name: The name of the dataset.
        series: A list of names of data series the dataset contains.
        data: The specification of the data sources for each series.
        batching: The batching scheme to use. If `None`, a default scheme
            built from `main.batch_size` in the configuration is used.
        outputs: A list of output specifications.
        buffer_size: The size of the buffer. If set, the dataset will be
            loaded lazily into the buffer (useful for large datasets). The
            buffer size specifies the number of sequences to pre-load. This
            is useful for pseudo-shuffling of large data on-the-fly.
            Ideally, this should be (much) larger than the batch size. Note
            that the buffer gets refilled each time its size drops below
            half of `buffer_size`, and it is then refilled back to the
            specified size.
        shuffled: Whether to shuffle the data. When loading lazily, the
            shuffling only happens within the buffer (pseudo-shuffling).
    """
    check_argument_types()

    if batching is None:
        from neuralmonkey.experiment import Experiment
        log("Using default batching scheme for dataset {}.".format(name))
        # pylint: disable=no-member
        batch_size = Experiment.get_current().config.args.batch_size
        # pylint: enable=no-member
        if batch_size is None:
            raise ValueError("Argument main.batch_size is not specified, "
                             "cannot use default batching scheme.")
        batching = BatchingScheme(batch_size=batch_size)

    if not series:
        raise ValueError("No dataset series specified.")

    if not [s for s in data if match_type(s, ReaderDef)]:  # type: ignore
        raise ValueError("At least one data series should be from a file")

    if len(series) != len(data):
        raise ValueError(
            "The 'series' and 'data' lists should have the same number"
            " of elements: {} vs {}.".format(len(series), len(data)))

    if len(series) != len(set(series)):
        raise ValueError("There are duplicate series.")

    if outputs is not None:
        output_sources = [o[0] for o in outputs]
        if len(output_sources) != len(set(output_sources)):
            raise ValueError("Multiple outputs for a single series")

    log("Initializing dataset {}.".format(name))

    iterators = {}  # type: Dict[str, Callable[[], DataSeries]]

    prep_sl = {}  # type: Dict[str, Tuple[Callable, str]]
    prep_dl = {}  # type: Dict[str, DatasetPreprocess]

    def _make_iterator(reader, files):
        def itergen():
            return reader(files)
        return itergen

    def _make_sl_iterator(src, prep):
        def itergen():
            return (prep(item) for item in iterators[src]())
        return itergen

    def _make_dl_iterator(func):
        def itergen():
            return func(iterators)
        return itergen

    # First, prepare iterators for series using file readers.
    for s_name, source_spec in zip(series, data):
        if match_type(source_spec, ReaderDef):  # type: ignore
            files, reader = _normalize_readerdef(cast(ReaderDef, source_spec))
            for path in files:
                if not os.path.isfile(path):
                    raise FileNotFoundError(
                        "File not found. Series: {}, Path: {}"
                        .format(s_name, path))

            iterators[s_name] = _make_iterator(reader, files)

        elif match_type(source_spec, Tuple[Callable, str]):
            prep_sl[s_name] = cast(Tuple[Callable, str], source_spec)

        else:
            assert match_type(source_spec, DatasetPreprocess)  # type: ignore
            prep_dl[s_name] = cast(DatasetPreprocess, source_spec)

    # Second, prepare series-level preprocessors.
    # Note that series-level preprocessors cannot be stacked on the dataset
    # specification level.
    for s_name, (preprocessor, source) in prep_sl.items():
        if source not in iterators:
            raise ValueError(
                "Source series for series-level preprocessor nonexistent: "
                "Preprocessed series '{}', source series '{}'"
                .format(s_name, source))
        iterators[s_name] = _make_sl_iterator(source, preprocessor)

    # Finally, prepare dataset-level preprocessors.
    for s_name, func in prep_dl.items():
        iterators[s_name] = _make_dl_iterator(func)

    output_dict = None
    if outputs is not None:
        output_dict = {s_name: (path, writer)
                       for s_name, path, writer
                       in [_normalize_outputspec(out) for out in outputs]}

    if buffer_size is not None:
        return Dataset(name, iterators, batching, output_dict,
                       (buffer_size // 2, buffer_size), shuffled)

    return Dataset(name, iterators, batching, output_dict, None, shuffled)
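# A usage sketch for `load`, not taken from the repository: the file paths
# are placeholders, and it assumes that a plain path string is a valid
# ReaderDef (resolved by `_normalize_readerdef` to the default plain-text
# reader) and that `load` and `BatchingScheme` live in neuralmonkey.dataset.
from neuralmonkey.dataset import BatchingScheme, load

train_data = load(
    name="train",
    series=["source", "target", "source_chars"],
    data=[
        "data/train.src",   # ReaderDef given as a single file path
        "data/train.tgt",
        # Series-level preprocessor: (callable, name of the source series).
        (lambda sentence: list("".join(sentence)), "source"),
    ],
    batching=BatchingScheme(batch_size=64),
    buffer_size=10000,   # load lazily, pseudo-shuffle within the buffer
    shuffled=True)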
def __init__(self,
             output_series: str,
             toplevel_modelpart: ModelPart,
             toplevel_tensors: List[tf.Tensor],
             tensors_by_name: List[str],
             tensors_by_ref: List[tf.Tensor],
             batch_dims_by_name: List[int],
             batch_dims_by_ref: List[int],
             select_session: int = None,
             single_tensor: bool = False) -> None:
    """Construct a new ``TensorRunner`` object.

    Note that one currently needs to specify the toplevel objects to make
    sure that the graph is built. The reason for this behavior is that the
    graph is constructed lazily, so if the tensors to fetch are specified
    only indirectly (by name), the system does not know early enough that it
    needs to create them.

    Args:
        output_series: The name of the generated output data series.
        toplevel_modelpart: A ``ModelPart`` object that is used as the
            top-level component of the model. This object should depend on
            the values of all the tensors to fetch.
        toplevel_tensors: A list of tensors that should be constructed. Use
            this when the top-level model part does not depend on these
            tensors. The tensors are constructed (and logged) while this
            constructor runs.
        tensors_by_name: A list of tensor names to fetch. If a tensor is not
            in the graph, a warning is generated and the tensor is ignored.
        tensors_by_ref: A list of tensor objects to fetch.
        batch_dims_by_name: A list of integers that correspond to the batch
            dimension in each tensor specified by name.
        batch_dims_by_ref: A list of integers that correspond to the batch
            dimension in each tensor specified by reference.
        select_session: An optional integer specifying the session to use in
            case of ensembling. When not used, tensors from all sessions are
            stored. In case of a single session, this option has no effect.
        single_tensor: If `True`, it is assumed that only one tensor is to
            be fetched, and the execution result will consist of this tensor
            only. If `False`, the result will be a dict mapping tensor names
            to NumPy arrays.
    """
    check_argument_types()
    BaseRunner[ModelPart].__init__(self, output_series, toplevel_modelpart)

    total_tensors = len(tensors_by_name) + len(tensors_by_ref)
    if single_tensor and total_tensors > 1:
        raise ValueError(
            "single_tensor is True, but {} tensors were given".format(
                total_tensors))

    self._names = tensors_by_name
    self._tensors = tensors_by_ref
    self._batch_dims_name = batch_dims_by_name
    self._batch_dims_ref = batch_dims_by_ref
    self._select_session = select_session
    self._single_tensor = single_tensor

    log("Blessing toplevel tensors for tensor runner:")
    for tensor in toplevel_tensors:
        log("Toplevel tensor: {}".format(tensor))

    self._fetches = {}  # type: Dict[str, tf.Tensor]
    self._batch_ids = {}  # type: Dict[str, int]

    for name, bid in zip(self._names, self._batch_dims_name):
        try:
            self._fetches[name] = (
                Experiment.get_current().graph.get_tensor_by_name(name))
            self._batch_ids[name] = bid
        except KeyError:
            warn(("The tensor of name '{}' is not present in the "
                  "graph.").format(name))
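# A construction sketch based only on the signature above; the import path,
# the `decoder`/`encoder` objects, and the `temporal_states` attribute are
# assumptions, not code from the repository.
from neuralmonkey.runners.tensor_runner import TensorRunner

states_runner = TensorRunner(
    output_series="encoder_states",
    toplevel_modelpart=decoder,       # depends on the encoder, so building
                                      # it also builds the fetched tensor
    toplevel_tensors=[],
    tensors_by_name=[],
    tensors_by_ref=[encoder.temporal_states],
    batch_dims_by_name=[],
    batch_dims_by_ref=[0],            # batch is the first dimension
    single_tensor=True)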
def _get_current_experiment():
    # This is needed to avoid circular imports.
    from neuralmonkey.experiment import Experiment
    return Experiment.get_current()