Example #1
def init_theano_devices():
    """
  Only for Theano.

  :rtype: list[Device.Device]|None
  """
    if not BackendEngine.is_theano_selected():
        return None
    from returnn.util.basic import TheanoFlags
    from returnn.config import get_devices_init_args
    from returnn.theano.device import Device
    old_device_config = ",".join(config.list('device', ['default']))
    if config.value("task", "train") == "nop":
        return []
    if "device" in TheanoFlags:
        # This is important because Theano likely already has initialized that device.
        config.set("device", TheanoFlags["device"])
        print("Devices: Use %s via THEANO_FLAGS instead of %s." %
              (TheanoFlags["device"], old_device_config),
              file=log.v4)
    dev_args = get_devices_init_args(config)
    assert len(dev_args) > 0
    devices = [Device(**kwargs) for kwargs in dev_args]
    for device in devices:
        while not device.initialized:
            time.sleep(0.25)
    if devices[0].blocking:
        print("Devices: Used in blocking / single proc mode.", file=log.v4)
    else:
        print("Devices: Used in multiprocessing mode.", file=log.v4)
    return devices
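
The loop at the end polls each device until its subprocess reports readiness. The same wait-until-initialized pattern, extracted as a minimal self-contained sketch with an added timeout guard (the device objects here are stand-ins, not RETURNN's Device class):

import time

def wait_until_initialized(devices, poll_interval=0.25, timeout=60.0):
    """Poll each device until it reports initialized, or fail after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    for device in devices:
        while not device.initialized:
            if time.monotonic() > deadline:
                raise RuntimeError("device %r did not initialize in time" % (device,))
            time.sleep(poll_interval)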
Example #2
  def _layer_params(self, info, sources, mask, reverse=False):
    """
    :param dict[str] info: self.hidden_info[i]
    :param list[str] sources: the 'from' entry
    :param None|str mask: mask
    :param bool reverse: whether this is the backward direction of a bidirectional layer
    :rtype: dict[str]
    """
    from returnn.util.basic import BackendEngine, getargspec
    if BackendEngine.is_theano_selected():
      from returnn.theano.layers.basic import get_layer_class
    elif BackendEngine.is_tensorflow_selected():
      from returnn.tf.layers.basic import get_layer_class
    else:
      raise NotImplementedError
    params = dict(self.default_layer_info)
    params.update(info)
    params["from"] = sources
    if mask:
      params["mask"] = mask
    layer_class = get_layer_class(params["layer_class"])
    if layer_class.recurrent:
      params['truncation'] = self.truncation
      if self.bidirectional:
        if not reverse:
          params['name'] += "_fw"
        else:
          params['name'] += "_bw"
          params['reverse'] = True
      # Pass sharpgates only if the layer class constructor accepts it.
      if 'sharpgates' in getargspec(layer_class.__init__).args[1:]:
        params['sharpgates'] = self.sharpgates
    return params
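
The getargspec(...).args[1:] check above is capability detection: the sharpgates argument is passed only if the layer class constructor accepts it. The same check can be written with the standard library's inspect.signature; a small sketch against two made-up layer classes:

import inspect

class PlainLayer:
    def __init__(self, name):
        self.name = name

class GatedLayer:
    def __init__(self, name, sharpgates=None):
        self.name, self.sharpgates = name, sharpgates

def accepts_kwarg(cls, kwarg):
    """True if cls.__init__ takes `kwarg` (ignoring self)."""
    return kwarg in list(inspect.signature(cls.__init__).parameters)[1:]

assert not accepts_kwarg(PlainLayer, "sharpgates")
assert accepts_kwarg(GatedLayer, "sharpgates")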
Example #3
def init_backend_engine():
    """
  Initializes ``engine``, which is either :class:`TFEngine.Engine` or Theano :class:`Engine.Engine`.
  """
    BackendEngine.select_engine(config=config)
    if BackendEngine.is_theano_selected():
        print("Theano:", describe_theano_version(), file=log.v3)
        import returnn.theano.util
        returnn.theano.util.monkey_patches()
    elif BackendEngine.is_tensorflow_selected():
        print("TensorFlow:", describe_tensorflow_version(), file=log.v3)
        if get_tensorflow_version_tuple()[0] == 0:
            print("Warning: TF <1.0 is not supported and likely broken.",
                  file=log.v2)
        if os.environ.get("TF_DEVICE"):
            print("Devices: Use %s via TF_DEVICE instead of %s." %
                  (os.environ.get("TF_DEVICE"),
                   config.opt_typed_value("device")),
                  file=log.v4)
            config.set("device", os.environ.get("TF_DEVICE"))
        if config.is_true("use_horovod"):
            import returnn.tf.horovod
            hvd = returnn.tf.horovod.get_ctx(config=config)
            import socket
            if "gpu" in config.value("device", "") or os.environ.get(
                    "CUDA_VISIBLE_DEVICES", ""):
                # We assume that we want to use a GPU.
                gpu_opts = config.typed_dict.setdefault("tf_session_opts",
                                                        {}).setdefault(
                                                            "gpu_options", {})
                assert "visible_device_list" not in gpu_opts
                gpu_opts["visible_device_list"] = str(hvd.local_rank())
                print("Horovod: Hostname %s, pid %i, using GPU %s." %
                      (socket.gethostname(), os.getpid(),
                       gpu_opts["visible_device_list"]),
                      file=log.v3)
            else:
                if hvd.rank() == 0:  # Don't spam in all ranks.
                    print("Horovod: Not using GPU.", file=log.v3)
            if hvd.rank() == 0:  # Don't spam in all ranks.
                print("Horovod: Reduce type:",
                      hvd.get_reduce_type(),
                      file=log.v3)
        from returnn.tf.util.basic import debug_register_better_repr, setup_tf_thread_pools, print_available_devices
        tf_session_opts = config.typed_value("tf_session_opts", {})
        assert isinstance(tf_session_opts, dict)
        # This must be done after the Horovod logic, such that we only touch the devices we are supposed to touch.
        setup_tf_thread_pools(log_file=log.v3, tf_session_opts=tf_session_opts)
        # Print available devices. Also make sure that get_tf_list_local_devices uses the correct TF session opts.
        print_available_devices(tf_session_opts=tf_session_opts, file=log.v2)
        from returnn.tf.native_op import OpMaker
        OpMaker.log_stream = log.v3
        debug_register_better_repr()
        if config.is_true("distributed_tf"):
            import returnn.tf.distributed
            returnn.tf.distributed.init_distributed_tf(config)
    else:
        raise NotImplementedError
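
Nearly every function in these examples branches on BackendEngine.is_theano_selected() / is_tensorflow_selected() and raises NotImplementedError for anything else. A stripped-down sketch of that selector pattern (illustrative names, not RETURNN's actual implementation):

import enum

class Backend(enum.Enum):
    THEANO = "theano"
    TENSORFLOW = "tensorflow"

_selected_backend = None

def select_backend(name):
    global _selected_backend
    _selected_backend = Backend(name)

def init_backend():
    if _selected_backend is Backend.THEANO:
        print("initializing Theano-specific state")
    elif _selected_backend is Backend.TENSORFLOW:
        print("initializing TensorFlow-specific state")
    else:
        raise NotImplementedError("no backend selected")

select_backend("tensorflow")
init_backend()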
Example #4
def init_engine(devices):
    """
  Initializes global engine.

  :type devices: list[Device.Device]|None
  """
    global engine
    if BackendEngine.is_theano_selected():
        from returnn.theano.engine import Engine
        engine = Engine(devices)
    elif BackendEngine.is_tensorflow_selected():
        from returnn.tf.engine import Engine
        engine = Engine(config=config)
    else:
        raise NotImplementedError
Example #5
def _prepare_forwarding():
    """
    Initializes the network for forwarding. Expects that the target mode was already set
    (i.e. that the config extracts posteriors).
    """
    assert engine
    assert config
    # Should already be set via setTargetMode().
    assert config.list('extract') == [
        "posteriors"
    ], ("You need to have extract = posteriors in your RETURNN config. You have: %s"
        % config.list('extract'))

    # Load network.
    engine.init_network_from_config(config)

    # Copy over net params.
    if BackendEngine.is_theano_selected():
        engine.devices[0].prepare(engine.network)
Example #6
def finalize(error_occurred=False):
    """
  Cleanup at the end.

  :param bool error_occurred:
  """
    print("Quitting", file=getattr(log, "v4", sys.stderr))
    global quit_returnn
    quit_returnn = True
    sys.exited = True  # non-standard marker attribute, checked elsewhere in RETURNN during shutdown
    if engine:
        if BackendEngine.is_theano_selected():
            for device in engine.devices:
                device.terminate()
        elif BackendEngine.is_tensorflow_selected():
            engine.finalize(error_occurred=error_occurred)
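
finalize() sets a module-global quit flag first, so that other threads can notice the shutdown even if the per-backend cleanup below fails. A minimal sketch of that shutdown-flag idea, here extended with an idempotence guard the original does not have (hypothetical names):

import sys

_quitting = False

def finalize_once():
    global _quitting
    if _quitting:  # make repeated calls harmless
        return
    _quitting = True
    print("Quitting", file=sys.stderr)

finalize_once()
finalize_once()  # second call is a no-op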
Example #7
def dump_flags():
    """
  Dump some relevant env flags.
  """
    print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
    print("CUDA_LAUNCH_BLOCKING:", os.environ.get("CUDA_LAUNCH_BLOCKING"))

    if BackendEngine.is_theano_selected():
        print("available GPUs:", get_gpu_names())

        # noinspection PyUnresolvedReferences,PyPackageRequirements
        from theano.sandbox import cuda as theano_cuda
        print("CUDA via", theano_cuda.__file__)
        print("CUDA available:", theano_cuda.cuda_available)

        from returnn.util.basic import TheanoFlags
        print("THEANO_FLAGS:", TheanoFlags)
Example #8
def init(config_filename=None,
         command_line_options=(),
         config_updates=None,
         extra_greeting=None):
    """
  :param str|None config_filename:
  :param tuple[str]|list[str]|None command_line_options: e.g. sys.argv[1:]
  :param dict[str]|None config_updates: see :func:`init_config`
  :param str|None extra_greeting:
  """
    init_better_exchook()
    init_thread_join_hack()
    init_config(config_filename=config_filename,
                command_line_options=command_line_options,
                extra_updates=config_updates)
    if config.bool("patch_atfork", False):
        from returnn.util.basic import maybe_restart_returnn_with_atfork_patch
        maybe_restart_returnn_with_atfork_patch()
    init_log()
    if extra_greeting:
        print(extra_greeting, file=log.v1)
    returnn_greeting(config_filename=config_filename,
                     command_line_options=command_line_options)
    init_faulthandler()
    init_backend_engine()
    if BackendEngine.is_theano_selected():
        if config.value('task', 'train') == "theano_graph":
            config.set("multiprocessing", False)
        if config.bool('multiprocessing', True):
            init_cuda_not_in_main_proc_check()
    if config.bool('ipython', False):
        init_ipython_kernel()
    init_config_json_network()
    devices = init_theano_devices()
    if need_data():
        init_data()
    print_task_properties(devices)
    if config.value('task', 'train') == 'server':
        from returnn.theano.server import Server
        global server
        server = Server(config)
    else:
        init_engine(devices)
Example #9
def get_global_config(raise_exception=True, auto_create=False):
    """
  :param bool raise_exception: if no global config is found, raise an exception, otherwise return None
  :param bool auto_create: if no global config is found, it creates one and returns it
  :rtype: Config|None
  """
    config = _get_or_set_config_via_tf_default_graph()
    if config:
        return config
    if _global_config:
        return _global_config
    import returnn.util.task_system
    from returnn.util.basic import BackendEngine
    if not returnn.util.task_system.isMainProcess:
        try:
            if BackendEngine.is_theano_selected():
                import returnn.theano.device
                # We expect that we are a Device subprocess.
                assert returnn.theano.device.asyncChildGlobalDevice is not None
                return returnn.theano.device.asyncChildGlobalDevice.config
        except BackendEngine.CannotSelectEngine:
            pass  # ignore
    # We are the main process.
    import sys
    main_mod = sys.modules["__main__"]  # should be rnn.py
    if hasattr(main_mod, "config") and isinstance(main_mod.config, Config):
        return main_mod.config
    # Maybe __main__ is not rnn.py, or config not yet loaded.
    # Anyway, try directly. (E.g. for SprintInterface.)
    import returnn.__main__ as rnn
    if isinstance(rnn.config, Config):
        return rnn.config
    if auto_create:
        config = Config()
        set_global_config(config)
        return config
    if raise_exception:
        raise Exception("No global config found.")
    return None
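
get_global_config walks a fixed priority chain: the TF-graph-attached config, the module-global config, the Device subprocess config, then __main__ and returnn.__main__, before optionally creating a fresh one. The chain boils down to a first-non-None lookup; a tiny generic sketch:

def first_not_none(*getters):
    """Return the result of the first zero-arg callable that yields non-None."""
    for getter in getters:
        value = getter()
        if value is not None:
            return value
    return None

assert first_not_none(lambda: None, lambda: "fallback", lambda: "unused") == "fallback"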
Example #10
  def get_batch_loss_and_error_signal(self, log_posteriors, seq_lengths, tags=None):
    """
    :param numpy.ndarray log_posteriors: 3d (time,batch,label)
    :param numpy.ndarray seq_lengths: 1d (batch)
    :param list[str] tags: seq names, length = batch
    :rtype: (numpy.ndarray, numpy.ndarray)
    :returns: (loss, error_signal). error_signal has the same shape as log_posteriors.
      loss is a 1d-array (batch).

    Note that this accesses some global references, like global current seg info,
    via the current Device instance.
    Thus this is expected to be run from the Device host proc,
      from inside SprintErrorSigOp.perform.
    This also expects that we don't have chunked seqs.
    """
    assert seq_lengths.ndim == 1
    assert log_posteriors.ndim == 3
    n_batch = seq_lengths.shape[0]
    assert n_batch == log_posteriors.shape[1]

    if tags is None:
      import returnn.theano.device as theano_device
      assert theano_device.is_device_host_proc()
      tags = theano_device.get_current_seq_tags()
    assert len(tags) == n_batch

    batch_loss = numpy.zeros((n_batch,), dtype="float32")
    batch_error_signal = numpy.zeros_like(log_posteriors, dtype="float32")

    # Greedy solution to the scheduling problem: longest sequences first,
    # each assigned to the currently least-loaded instance.
    sorted_length = sorted(enumerate(seq_lengths), key=lambda x: x[1], reverse=True)
    jobs = [[] for _ in range(self.max_num_instances)]
    joblen = [0] * self.max_num_instances
    for i, l in sorted_length:
      j = min(enumerate(joblen), key=lambda x: x[1])[0]  # noqa
      jobs[j].append(i)
      joblen[j] += l

    if not BackendEngine.is_theano_selected() and self.max_num_instances > 1:
      threads = [
        ReaderThread(
          self._get_instance(i), i, jobs[i], tags, seq_lengths, log_posteriors, batch_loss, batch_error_signal)
        for i in range(self.max_num_instances)]
      for i, thread in enumerate(threads):
        thread.join()
        if thread.exception:
          raise thread.exception
    else:
      # Very simple parallelism. We must avoid any form of multi-threading
      # because this can be problematic with Theano.
      # See: https://groups.google.com/forum/#!msg/theano-users/Pu4YKlZKwm4/eNcAegzaNeYJ
      # We also try to keep it simple here.
      for bb in range(0, n_batch, self.max_num_instances):
        for i in range(self.max_num_instances):
          b = bb + i
          if b >= n_batch:
            break
          instance = self._get_instance(i)
          instance.get_loss_and_error_signal__send(
            seg_name=tags[b], seg_len=seq_lengths[b], log_posteriors=log_posteriors[:seq_lengths[b], b])
        for i in range(self.max_num_instances):
          b = bb + i
          if b >= n_batch:
            break
          instance = self._get_instance(i)
          seg_name, loss, error_signal = instance.get_loss_and_error_signal__read()
          assert seg_name == tags[b]
          batch_loss[b] = loss
          batch_error_signal[:seq_lengths[b], b] = error_signal
          numpy_set_unused(error_signal)
    return batch_loss, batch_error_signal
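
The greedy scheduling above is longest-first list scheduling: sort sequences by length descending, then always hand the next one to the currently least-loaded instance. The same algorithm in isolation, using a heap instead of the linear min scan:

import heapq

def greedy_longest_first(lengths, num_workers):
    """Assign item indices to workers: longest items first, each to the least-loaded worker."""
    jobs = [[] for _ in range(num_workers)]
    heap = [(0, j) for j in range(num_workers)]  # (total assigned length, worker index)
    for i, length in sorted(enumerate(lengths), key=lambda x: x[1], reverse=True):
        load, j = heapq.heappop(heap)
        jobs[j].append(i)
        heapq.heappush(heap, (load + length, j))
    return jobs

print(greedy_longest_first([7, 3, 5, 2, 6], num_workers=2))  # -> [[0, 1, 3], [4, 2]]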
Example #11
  def have_seqs_loss_data(self, start_seq, end_seq):
    """
    :param int start_seq:
    :param int end_seq:
    :rtype: bool
    """
    # Enclosing signature reconstructed from the body; the original snippet begins mid-method.
    assert start_seq <= end_seq
    if start_seq == end_seq:
      return True
    first_seq, last_seq = start_seq, end_seq - 1
    have_first, have_last = False, False
    for loss_data in self.loss_data_queue:
      if loss_data.seq_idx == first_seq:
        have_first = True
      if loss_data.seq_idx == last_seq:
        have_last = True
    if have_last:
      assert have_first  # otherwise, we removed the cache already although we still need it
    return have_first and have_last


if BackendEngine.is_theano_selected():
  # noinspection PyPackageRequirements,PyUnresolvedReferences
  import theano
  # noinspection PyPackageRequirements,PyUnresolvedReferences,PyPep8Naming
  import theano.tensor as T

  # noinspection PyAbstractClass
  class SprintErrorSigOp(theano.Op):
    """
    Op: log_posteriors, seq_lengths -> loss, error_signal (grad w.r.t. z, i.e. before softmax is applied)
    """

    __props__ = ("sprint_opts",)

    def __init__(self, sprint_opts):
      super(SprintErrorSigOp, self).__init__()
Example #12
  @classmethod
  def num_inputs_outputs_from_config(cls, config):
    """
    :type config: Config.Config
    :returns: (num_inputs, num_outputs),
       where num_inputs is like num_outputs["data"][0],
       and num_outputs is a dict of data_key -> (dim, ndim),
         where data_key is e.g. "classes" or "data",
         dim is the feature dimension or the number of classes,
         and ndim is the ndim counted without batch-dim,
         i.e. ndim=1 usually means sparse data and ndim=2 means dense data.
    :rtype: (int,dict[str,(int,int)])
    """
    from returnn.util.basic import BackendEngine
    num_inputs = config.int('num_inputs', 0)
    target = config.value('target', 'classes')
    if config.is_typed('num_outputs'):
      num_outputs = config.typed_value('num_outputs')
      if not isinstance(num_outputs, dict):
        num_outputs = {target: num_outputs}
      num_outputs = num_outputs.copy()
      from returnn.datasets.basic import convert_data_dims
      num_outputs = convert_data_dims(num_outputs, leave_dict_as_is=BackendEngine.is_tensorflow_selected())
      if "data" in num_outputs:
        num_inputs = num_outputs["data"]
        if isinstance(num_inputs, (list, tuple)):
          num_inputs = num_inputs[0]
        elif isinstance(num_inputs, dict):
          if "dim" in num_inputs:
            num_inputs = num_inputs["dim"]
          else:
            num_inputs = num_inputs["shape"][-1]
        else:
          raise TypeError("data key %r" % num_inputs)
    elif config.has('num_outputs'):
      num_outputs = {target: [config.int('num_outputs', 0), 1]}
    else:
      num_outputs = None
    dataset = None
    if config.list('train') and ":" not in config.value('train', ''):
      dataset = config.list('train')[0]
    if not config.is_typed('num_outputs') and dataset:
      # noinspection PyBroadException
      try:
        _num_inputs = hdf5_dimension(dataset, 'inputCodeSize') * config.int('window', 1)
      except Exception:
        _num_inputs = hdf5_dimension(dataset, 'inputPattSize') * config.int('window', 1)
      # noinspection PyBroadException
      try:
        _num_outputs = {target: [hdf5_dimension(dataset, 'numLabels'), 1]}
      except Exception:
        _num_outputs = hdf5_group(dataset, 'targets/size')
        for k in _num_outputs:
          _num_outputs[k] = [_num_outputs[k], len(hdf5_shape(dataset, 'targets/data/' + k))]
      if num_inputs:
        assert num_inputs == _num_inputs
      if num_outputs:
        assert num_outputs == _num_outputs
      num_inputs = _num_inputs
      num_outputs = _num_outputs
    if not num_inputs and not num_outputs and config.has("load") and BackendEngine.is_theano_selected():
      from returnn.theano.network import LayerNetwork
      import h5py
      model = h5py.File(config.value("load", ""), "r")
      # noinspection PyProtectedMember
      num_inputs, num_outputs = LayerNetwork._n_in_out_from_hdf_model(model)
    assert num_inputs and num_outputs, "provide num_inputs/num_outputs directly or via train"
    return num_inputs, num_outputs
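
The num_outputs["data"] handling above accepts three shapes of entry: a (dim, ndim) list/tuple, a dict with "dim", or a dict with "shape". That normalization in isolation (a sketch, not RETURNN's code):

def extract_input_dim(entry):
    """Normalize a num_outputs['data'] entry to its feature dimension."""
    if isinstance(entry, (list, tuple)):
        return entry[0]
    if isinstance(entry, dict):
        return entry["dim"] if "dim" in entry else entry["shape"][-1]
    raise TypeError("unsupported data entry %r" % (entry,))

assert extract_input_dim([40, 2]) == 40
assert extract_input_dim({"dim": 40}) == 40
assert extract_input_dim({"shape": (None, 40)}) == 40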
Example #13
def _forward(segment_name, features):
    """
  :param numpy.ndarray features: format (input-feature,time) (via Sprint)
  :return: format (output-dim,time)
  :rtype: numpy.ndarray
  """
    print("Sprint forward", segment_name, features.shape)
    start_time = time.time()
    assert engine is not None, "not initialized"
    assert sprintDataset

    # Features are in Sprint format (feature,time).
    num_time = features.shape[1]
    assert features.shape == (InputDim, num_time)
    dataset, seq_idx = features_to_dataset(features=features,
                                           segment_name=segment_name)

    if BackendEngine.is_theano_selected():
        # Prepare data for device.
        device = engine.devices[0]
        from returnn.theano.engine_util import assign_dev_data_single_seq
        success = assign_dev_data_single_seq(device,
                                             dataset=dataset,
                                             seq=seq_idx)
        assert success, "failed to allocate & assign data for seq %i, %s" % (
            seq_idx, segment_name)

        # Do the actual forwarding and collect result.
        device.run("extract")
        result, _ = device.result()
        assert result is not None, "Device crashed."
        assert len(result) == 1
        posteriors = result[0]

    elif BackendEngine.is_tensorflow_selected():
        posteriors = engine.forward_single(dataset=dataset, seq_idx=seq_idx)

    else:
        raise NotImplementedError("unknown backend engine")
    # If we have a sequence training criterion, posteriors might be in format (time,seq|batch,emission).
    if posteriors.ndim == 3:
        assert posteriors.shape == (num_time, 1, OutputDim * MaxSegmentLength)
        posteriors = posteriors[:, 0]
    # Posteriors are in format (time,emission).
    assert posteriors.shape == (num_time, OutputDim * MaxSegmentLength)
    # Reformat to Sprint expected format (emission,time).
    posteriors = posteriors.transpose()
    assert posteriors.shape == (OutputDim * MaxSegmentLength, num_time)
    stats = (numpy.min(posteriors), numpy.max(posteriors),
             numpy.mean(posteriors), numpy.std(posteriors))
    print("posteriors min/max/mean/std:", stats, "time:",
          time.time() - start_time)
    if numpy.isinf(posteriors).any() or numpy.isnan(posteriors).any():
        print("posteriors:", posteriors)
        debug_feat_fn = "/tmp/returnn.pid%i.sprintinterface.debug.features.txt" % os.getpid(
        )
        debug_post_fn = "/tmp/returnn.pid%i.sprintinterface.debug.posteriors.txt" % os.getpid(
        )
        print("Wrote to files %s, %s" % (debug_feat_fn, debug_post_fn))
        numpy.savetxt(debug_feat_fn, features)
        numpy.savetxt(debug_post_fn, posteriors)
        assert False, "Error, posteriors contain invalid numbers."

    return posteriors
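
The post-processing in _forward drops the dummy batch axis, checks for inf/nan, and transposes RETURNN's (time,emission) layout into Sprint's (emission,time). Those steps as a small standalone numpy sketch:

import numpy

def to_sprint_layout(posteriors, num_time, output_dim):
    """(time,emission) or (time,1,emission) -> (emission,time), with a finiteness check."""
    if posteriors.ndim == 3:  # e.g. sequence-training criteria keep a batch axis of size 1
        posteriors = posteriors[:, 0]
    assert posteriors.shape == (num_time, output_dim)
    assert numpy.isfinite(posteriors).all(), "posteriors contain inf/nan"
    return posteriors.transpose()

out = to_sprint_layout(numpy.zeros((10, 1, 5)), num_time=10, output_dim=5)
assert out.shape == (5, 10)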
Example #14
def _init_base(configfile=None,
               target_mode=None,
               epoch=None,
               sprint_opts=None):
    """
  :param str|None configfile: filename, via init(), this is set
  :param str|None target_mode: "forward" or so. via init(), this is set
  :param int epoch: via init(), this is set
  :param dict[str,str]|None sprint_opts: optional parameters to override values in configfile
  """
    global isInitialized
    isInitialized = True
    # Run through in any case. Maybe just to set targetMode.

    if not getattr(sys, "argv", None):
        # Set some dummy. Some code might want this (e.g. TensorFlow).
        sys.argv = [__file__]

    global Engine
    global config
    if not config:
        # Some subset of what we do in rnn.init().

        rnn.init_better_exchook()
        rnn.init_thread_join_hack()

        if configfile is None:
            configfile = DefaultSprintCrnnConfig
        assert os.path.exists(configfile)
        rnn.init_config(config_filename=configfile,
                        extra_updates={"task": target_mode})
        assert rnn.config
        config = rnn.config
        if sprint_opts is not None:
            config.update(sprint_opts)

        rnn.init_log()
        rnn.returnn_greeting(config_filename=configfile)
        rnn.init_backend_engine()
        rnn.init_faulthandler(sigusr1_chain=True)
        rnn.init_config_json_network()

        if BackendEngine.is_tensorflow_selected():
            # Use TFEngine.Engine class instead of Engine.Engine.
            from returnn.tf.engine import Engine
        elif BackendEngine.is_theano_selected():
            from returnn.theano.engine import Engine

        import atexit
        atexit.register(_at_exit_handler)

    if target_mode:
        set_target_mode(target_mode)

    _init_dataset()

    if target_mode and target_mode == "forward" and epoch:
        model_filename = config.value('model', '')
        fns = [
            EngineBase.epoch_model_filename(model_filename, epoch, is_pretrain)
            for is_pretrain in [False, True]
        ]
        fn_postfix = ""
        if BackendEngine.is_tensorflow_selected():
            fn_postfix += ".meta"
        fns_existing = [fn for fn in fns if os.path.exists(fn + fn_postfix)]
        assert len(fns_existing) == 1, "%s not found" % fns
        model_epoch_filename = fns_existing[0]
        config.set('load', model_epoch_filename)
        assert EngineBase.get_epoch_model(config)[1] == model_epoch_filename, (
            "%r != %r" %
            (EngineBase.get_epoch_model(config), model_epoch_filename))

    global engine
    if not engine:
        devices = rnn.init_theano_devices()
        rnn.print_task_properties(devices)
        rnn.init_engine(devices)
        engine = rnn.engine
        assert isinstance(engine, Engine)
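
The epoch-model lookup in _init_base builds both the normal and the pretrain candidate filenames and requires that exactly one exists on disk (with ".meta" appended for TensorFlow checkpoints). That selection step in isolation (file names are hypothetical stand-ins for EngineBase.epoch_model_filename output):

import os

def pick_existing_model(candidates, postfix=""):
    """Return the single candidate whose file (plus postfix) exists; fail on zero or multiple matches."""
    existing = [fn for fn in candidates if os.path.exists(fn + postfix)]
    assert len(existing) == 1, "expected exactly one of %r to exist, found %r" % (candidates, existing)
    return existing[0]

# e.g. pick_existing_model(["model.042", "model.pretrain.042"], postfix=".meta")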