Example #1
File: Server.py Project: rwth-i6/returnn
  def classify_in_background(self):
    while True:
      requests = []
      # fetch first request
      r = yield self.classification_queue.get()
      requests.append(r)
      # grab all other waiting requests
      try:
        while True:
          requests.append(self.classification_queue.get_nowait())
      except QueueEmpty:
        pass

      output_dim = {}
      # Do dataset creation and classification.
      dataset = StaticDataset(data=[r.data for r in requests], output_dim=output_dim)
      dataset.init_seq_order()
      batches = dataset.generate_batches(recurrent_net=self.engine.network.recurrent,
                                         batch_size=self.batch_size, max_seqs=self.max_seqs)

      with (yield self.lock.acquire()):
        ctt = ForwardTaskThread(self.engine.network, self.devices, dataset, batches)
        yield ctt.join()

      try:
        for i in range(dataset.num_seqs):
          requests[i].future.set_result(ctt.result[i])
          self.classification_queue.task_done()
      except Exception as e:
        print('exception', e)
        raise
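
The pattern above drains the queue so that one StaticDataset covers a whole burst of requests and they share a single forward pass. A minimal sketch of just the dataset-building step, assuming RETURNN's GeneratingDataset module is importable (the collected list stands in for the drained queue):

import numpy
from GeneratingDataset import StaticDataset

# Each entry mimics one request payload: a dict of numpy arrays.
collected = [{"data": numpy.random.rand(5, 3).astype("float32")} for _ in range(2)]
dataset = StaticDataset(data=collected, output_dim={})  # dims inferred from the arrays
dataset.init_seq_order()
assert dataset.num_seqs == len(collected)
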
Example #2
def test_hdf_target_float_dtype():
  from GeneratingDataset import StaticDataset
  dataset = StaticDataset([
    {"data": numpy.array([1, 2, 3], dtype="float32"), "classes": numpy.array([-1, 5], dtype="float32")}],
    output_dim={"data": (1, 1), "classes": (1, 1)})
  orig_data_dtype = dataset.get_data_dtype("data")
  orig_classes_dtype = dataset.get_data_dtype("classes")
  assert orig_data_dtype == "float32" and orig_classes_dtype == "float32"

  hdf_fn = _get_tmp_file(suffix=".hdf")
  hdf_writer = HDFDatasetWriter(filename=hdf_fn)
  hdf_writer.dump_from_dataset(dataset, use_progress_bar=False)
  hdf_writer.close()

  hdf_dataset = HDFDataset(files=[hdf_fn])
  hdf_dataset.initialize()
  hdf_dataset.init_seq_order(epoch=1)
  hdf_data_dtype = hdf_dataset.get_data_dtype("data")
  hdf_classes_dtype = hdf_dataset.get_data_dtype("classes")
  assert hdf_data_dtype == orig_data_dtype and hdf_classes_dtype == orig_classes_dtype
  hdf_data_dim = hdf_dataset.get_data_dim("data")
  hdf_classes_dim = hdf_dataset.get_data_dim("classes")
  assert hdf_data_dim == 1 and hdf_classes_dim == 1
  hdf_data_shape = hdf_dataset.get_data_shape("data")
  hdf_classes_shape = hdf_dataset.get_data_shape("classes")
  assert hdf_data_shape == [] and hdf_classes_shape == []
  hdf_dataset.load_seqs(0, 1)
  hdf_data_data = hdf_dataset.get_data(0, "data")
  hdf_data_classes = hdf_dataset.get_data(0, "classes")
  assert hdf_data_data.dtype == orig_data_dtype and hdf_data_classes.dtype == orig_classes_dtype
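
The HDF round-trip tests in this listing call a helper _get_tmp_file that is not shown. A minimal stand-in, assuming only the standard library (the real helper in the RETURNN test suite may differ, e.g. by registering cleanup):

import os
import tempfile

def _get_tmp_file(suffix):
  """Create a temporary file, close the OS-level handle, and return its path."""
  fd, fn = tempfile.mkstemp(suffix=suffix)
  os.close(fd)
  return fn
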
Example #3
def test_hdf_data_short_int_dtype():
  from GeneratingDataset import StaticDataset
  dataset = StaticDataset([
    {"data": numpy.array([1, 2, 3], dtype="uint8"), "classes": numpy.array([-1, 5], dtype="int16")}],
    output_dim={"data": (255, 1), "classes": (10, 1)})
  orig_data_dtype = dataset.get_data_dtype("data")
  orig_classes_dtype = dataset.get_data_dtype("classes")
  assert orig_data_dtype == "uint8" and orig_classes_dtype == "int16"

  hdf_fn = _get_tmp_file(suffix=".hdf")
  hdf_writer = HDFDatasetWriter(filename=hdf_fn)
  hdf_writer.dump_from_dataset(dataset, use_progress_bar=False)
  hdf_writer.close()

  hdf_dataset = HDFDataset(files=[hdf_fn])
  hdf_dataset.initialize()
  hdf_dataset.init_seq_order(epoch=1)
  hdf_data_dtype = hdf_dataset.get_data_dtype("data")
  hdf_classes_dtype = hdf_dataset.get_data_dtype("classes")
  assert hdf_data_dtype == orig_data_dtype and hdf_classes_dtype == orig_classes_dtype
  hdf_data_dim = hdf_dataset.get_data_dim("data")
  hdf_classes_dim = hdf_dataset.get_data_dim("classes")
  assert hdf_data_dim == 255 and hdf_classes_dim == 10
  hdf_data_shape = hdf_dataset.get_data_shape("data")
  hdf_classes_shape = hdf_dataset.get_data_shape("classes")
  assert hdf_data_shape == [] and hdf_classes_shape == []
  hdf_dataset.load_seqs(0, 1)
  hdf_data_data = hdf_dataset.get_data(0, "data")
  hdf_data_classes = hdf_dataset.get_data(0, "classes")
  assert hdf_data_data.dtype == orig_data_dtype and hdf_data_classes.dtype == orig_classes_dtype
Example #4
def test_hdf_target_float_dense():
  from GeneratingDataset import StaticDataset
  dataset = StaticDataset([
    {"data": numpy.array([[1, 2, 3], [2, 3, 4]], dtype="float32"),
     "classes": numpy.array([[-1, 5], [-2, 4], [-3, 2]], dtype="float32")}])
  orig_data_dtype = dataset.get_data_dtype("data")
  orig_classes_dtype = dataset.get_data_dtype("classes")
  assert orig_data_dtype == "float32" and orig_classes_dtype == "float32"
  orig_data_shape = dataset.get_data_shape("data")
  orig_classes_shape = dataset.get_data_shape("classes")
  assert orig_data_shape == [3] and orig_classes_shape == [2]

  hdf_fn = _get_tmp_file(suffix=".hdf")
  hdf_writer = HDFDatasetWriter(filename=hdf_fn)
  hdf_writer.dump_from_dataset(dataset, use_progress_bar=False)
  hdf_writer.close()

  hdf_dataset = HDFDataset(files=[hdf_fn])
  hdf_dataset.initialize()
  hdf_dataset.init_seq_order(epoch=1)
  hdf_data_dtype = hdf_dataset.get_data_dtype("data")
  hdf_classes_dtype = hdf_dataset.get_data_dtype("classes")
  assert hdf_data_dtype == orig_data_dtype and hdf_classes_dtype == orig_classes_dtype
  hdf_data_dim = hdf_dataset.get_data_dim("data")
  hdf_classes_dim = hdf_dataset.get_data_dim("classes")
  assert hdf_data_dim == orig_data_shape[-1] and hdf_classes_dim == orig_classes_shape[-1]
  hdf_data_shape = hdf_dataset.get_data_shape("data")
  hdf_classes_shape = hdf_dataset.get_data_shape("classes")
  assert hdf_data_shape == orig_data_shape and hdf_classes_shape == orig_classes_shape
  hdf_dataset.load_seqs(0, 1)
  hdf_data_data = hdf_dataset.get_data(0, "data")
  hdf_data_classes = hdf_dataset.get_data(0, "classes")
  assert hdf_data_data.dtype == orig_data_dtype and hdf_data_classes.dtype == orig_classes_dtype
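
Unlike the sparse-target tests in Examples #2 and #3, both streams here are dense 2-D arrays, so no output_dim is passed: the dims are taken from the trailing axes, and get_data_shape() reports [3] and [2] rather than [].
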
Example #5
    def post(self, *args, **kwargs):
        # TODO: Make this batch over a specific time period
    
        params = json.loads(self.request.body)
        output_dim = {}
        ret = {}
        
        # First get metadata
        engine_hash = params['engine_hash']

        print('Received engine hash: ', engine_hash, file=log.v4)

        # Delete unnecessary entries so that the rest works
        del params['engine_hash']
        
        # Load in engine and devices
        engine = _engines[engine_hash]
        network = engine.network
        devices = _devices[engine_hash]
        
        hash_engine = hashlib.new('ripemd160')
        hash_engine.update((json.dumps(params) + engine_hash).encode('utf8'))
        hash_temp = hash_engine.hexdigest()
        
        # Process the data
        for k in params:
            try:
                params[k] = numpy.asarray(params[k], dtype='float32')
                if k != 'data':
                    output_dim[k] = network.n_out[k]  # = [network.n_in,2] if k == 'data' else network.n_out[k]
            except Exception:
                if k != 'data' and k not in network.n_out:
                    ret['error'] = 'unknown target: %s' % k
                else:
                    ret['error'] = 'unable to convert %s to an array from value %s' % (k, str(params[k]))
                break
        if 'error' not in ret:
            try:
                data = StaticDataset(data=[params], output_dim=output_dim)
                data.init_seq_order()
            except Exception:
                ret['error'] = 'Dataset server error'
                self.write(ret)
                return
            else:
                batches = data.generate_batches(recurrent_net=network.recurrent,
                                                batch_size=sys.maxsize, max_seqs=1)
                if hash_temp not in _classify_cache:
                    print('Starting classification', file=log.v3)
                    # If we haven't yet processed this exact request and saved it in the cache
                    _classify_cache[hash_temp] = yield self.classification_task(network=network,
                                                                                devices=devices,
                                                                                data=data, batches=batches)

                ret = {'result':
                     {k: _classify_cache[hash_temp].result[k].tolist() for k in _classify_cache[hash_temp].result}}
        
        print("Finished processing classification with ID: ", hash_temp, file=log.v4)
        
        self.write(ret)
Example #6
def test_hdf_data_target_int32():
  from GeneratingDataset import StaticDataset
  dataset = StaticDataset([
    {"data": numpy.array([1, 2, 3], dtype="uint8"),
     "classes": numpy.array([2147483647, 2147483646, 2147483645], dtype="int32")}],
    output_dim={"data": (255, 1), "classes": (10, 1)})
  dataset.initialize()
  dataset.init_seq_order(epoch=0)
  dataset.load_seqs(0, 1)
  orig_classes_dtype = dataset.get_data_dtype("classes")
  orig_classes_seq = dataset.get_data(0, "classes")
  assert orig_classes_seq.shape == (3,) and orig_classes_seq[0] == 2147483647
  assert orig_classes_seq.dtype == orig_classes_dtype == "int32"

  hdf_fn = _get_tmp_file(suffix=".hdf")
  hdf_writer = HDFDatasetWriter(filename=hdf_fn)
  hdf_writer.dump_from_dataset(dataset, use_progress_bar=False)
  hdf_writer.close()

  hdf_dataset = HDFDataset(files=[hdf_fn])
  hdf_dataset.initialize()
  hdf_dataset.init_seq_order(epoch=1)
  hdf_classes_dtype = hdf_dataset.get_data_dtype("classes")
  assert hdf_classes_dtype == orig_classes_dtype
  hdf_classes_shape = hdf_dataset.get_data_shape("classes")
  assert hdf_classes_shape == []
  hdf_dataset.load_seqs(0, 1)
  hdf_data_classes = hdf_dataset.get_data(0, "classes")
  assert hdf_data_classes.dtype == orig_classes_dtype
  assert all(hdf_data_classes == orig_classes_seq)
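
The class values in this test sit at the very top of the int32 range (2147483647 == 2**31 - 1), so any silent narrowing, or widening to a default int64, during the HDF round-trip would trip the dtype and value asserts.
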
Example #7
 def _classify(params):
   ret = { }
   output_dim = {}
   hash = hashlib.new('ripemd160')
   hash.update(json.dumps(params).encode('utf8'))
   hash = hash.hexdigest()
   for k in params:
     try:
       params[k] = numpy.asarray(params[k], dtype='float32')
       if k != 'data':
         output_dim[k] = network.n_out[k] # = [network.n_in,2] if k == 'data' else network.n_out[k]
     except Exception:
       if k != 'data' and k not in network.n_out:
         ret['error'] = 'unknown target: %s' % k
       else:
         ret['error'] = 'unable to convert %s to an array from value %s' % (k,str(params[k]))
       break
   if 'error' not in ret:
     try:
       data = StaticDataset(data=[params], output_dim=output_dim)
       data.init_seq_order()
     except Exception:
       ret['error'] = "invalid data: %s" % params
     else:
       batches = data.generate_batches(recurrent_net=network.recurrent,
                                       batch_size=sys.maxsize, max_seqs=1)
       if hash not in workers:
         workers[hash] = ClassificationTaskThread(network, devices, data, batches)
         workers[hash].json_params = params
         print("worker started:", hash, file=log.v3)
       ret['result'] = { 'hash' : hash }
   return ret
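
Note the asynchronous design: the ClassificationTaskThread is cached in workers under a ripemd160 hash of the request JSON, and only that hash is returned, so a client can poll for the finished result and identical repeated requests reuse the same worker.
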
Example #8
 def __init__(self, config, train_data):
   """
   :param Config.Config config:
   :param Dataset train_data:
   """
   self.config = config
   self.opts = CollectionReadCheckCovered(config.get_of_type("hyper_param_tuning", dict, {}))
   self.log = log.v1
   train_data.init_seq_order(epoch=1)
   self.train_data = StaticDataset.copy_from_dataset(
     train_data, max_seqs=self.opts.get("num_train_steps", 100))
   self.hyper_params = []  # type: list[HyperParam]
   self._find_hyper_params()
   if not self.hyper_params:
     raise Exception("No hyper params found.")
   self.hyper_params.sort(key=lambda p: p.unique_idx)
   print("We have found these hyper params:")
   for p in self.hyper_params:
     print(" %s" % p.description())
   self.dry_run_first_individual = self.opts.get("dry_run_first_individual", True)
   self.num_iterations = self.opts["num_tune_iterations"]
   self.num_individuals = self.opts["num_individuals"]
   self.num_kill_individuals = self.opts.get(
     "num_kill_individuals", self.num_individuals // 2)
   self.num_best = self.opts.get("num_best", 10)
   self.num_threads = self.opts.get("num_threads", guess_requested_max_num_threads())
   self.opts.assert_all_read()
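
The constructor reads its options from the "hyper_param_tuning" dict in the config; "num_tune_iterations" and "num_individuals" are indexed directly and therefore required, while the rest fall back to defaults via .get(). A hypothetical config entry that would satisfy it:

hyper_param_tuning = {
  "num_train_steps": 100,        # seqs copied into the StaticDataset
  "num_tune_iterations": 10,     # required
  "num_individuals": 20,         # required
  "num_kill_individuals": 10,    # default: num_individuals // 2
  "num_best": 10,
  "num_threads": 4,              # default: guess_requested_max_num_threads()
  "dry_run_first_individual": True,
}
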
Example #9
File: Engine.py Project: chagge/returnn
 def _classify(params):
   ret = { }
   output_dim = {}
   hash = hashlib.new('ripemd160')
   hash.update(json.dumps(params))
   hash = hash.hexdigest()
   for k in params:
     try:
       params[k] = numpy.asarray(params[k], dtype='float32')
       if k != 'data':
         output_dim[k] = network.n_out[k] # = [network.n_in,2] if k == 'data' else network.n_out[k]
     except Exception:
       if k != 'data' and k not in network.n_out:
         ret['error'] = 'unknown target: %s' % k
       else:
         ret['error'] = 'unable to convert %s to an array from value %s' % (k,str(params[k]))
       break
   if 'error' not in ret:
     try:
       data = StaticDataset(data=[params], output_dim=output_dim)
       data.init_seq_order()
     except Exception:
       ret['error'] = "invalid data: %s" % params
     else:
       batches = data.generate_batches(recurrent_net=network.recurrent,
                                       batch_size=sys.maxint, max_seqs=1)
       if hash not in classifiers:
         classifiers[hash] = ClassificationTaskThread(network, devices, data, batches)
         classifiers[hash].json_params = params
         print >> log.v3, "classifier started:", hash
       ret['result'] = { 'hash' : hash }
   return ret
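
This is the Python 2 era variant of the function in Example #7: it uses sys.maxint and the print >> statement where the newer version uses sys.maxsize and print(..., file=...).
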
Example #10
 def run(self):
   if self.individual.cost is not None:
     return self.individual.cost
   start_time = time.time()
   hyper_param_mapping = self.individual.hyper_param_mapping
   print("Training %r using hyper params:" % self.individual.name, file=log.v2)
   for p in self.optim.hyper_params:
     print(" %s -> %s" % (p.description(), hyper_param_mapping[p]), file=log.v2)
   config = self.optim.create_config_instance(hyper_param_mapping, gpu_ids=self.gpu_ids)
   engine = Engine(config=config)
   train_data = StaticDataset.copy_from_dataset(self.optim.train_data)
   engine.init_train_from_config(config=config, train_data=train_data)
   # Not directly calling train() as we want to have full control.
   engine.epoch = 1
   train_data.init_seq_order(epoch=engine.epoch)
   batches = train_data.generate_batches(
     recurrent_net=engine.network.recurrent,
     batch_size=engine.batch_size,
     max_seqs=engine.max_seqs,
     max_seq_length=int(engine.max_seq_length),
     seq_drop=engine.seq_drop,
     shuffle_batches=engine.shuffle_batches,
     used_data_keys=engine.network.used_data_keys)
   engine.updater.set_learning_rate(engine.learning_rate)
   trainer = Runner(engine=engine, dataset=train_data, batches=batches, train=True)
   self.runner = trainer
   if self.cancel_flag:
     raise CancelTrainingException("Trainer cancel flag is set")
   trainer.run(report_prefix="hyper param tune train %r" % self.individual.name)
   if not trainer.finalized:
     print("Trainer exception:", trainer.run_exception, file=log.v1)
     raise trainer.run_exception
   cost = trainer.score["cost:output"]
   print(
     "Individual %s:" % self.individual.name,
     "Train cost:", cost,
     "elapsed time:", hms_fraction(time.time() - start_time),
     file=self.optim.log)
   self.individual.cost = cost
Example #11
 def run(self):
   if self.individual.cost is not None:
     return self.individual.cost
   start_time = time.time()
   hyper_param_mapping = self.individual.hyper_param_mapping
   print("Training %r using hyper params:" % self.individual.name, file=log.v2)
   for p in self.optim.hyper_params:
     print(" %s -> %s" % (p.description(), hyper_param_mapping[p]), file=log.v2)
   config = self.optim.create_config_instance(hyper_param_mapping, gpu_ids=self.gpu_ids)
   engine = Engine(config=config)
   train_data = StaticDataset.copy_from_dataset(self.optim.train_data)
   engine.init_train_from_config(config=config, train_data=train_data)
   # Not directly calling train() as we want to have full control.
   engine.epoch = 1
   train_data.init_seq_order(epoch=engine.epoch)
   batches = train_data.generate_batches(
     recurrent_net=engine.network.recurrent,
     batch_size=engine.batch_size,
     max_seqs=engine.max_seqs,
     max_seq_length=int(engine.max_seq_length),
     seq_drop=engine.seq_drop,
     shuffle_batches=engine.shuffle_batches,
     used_data_keys=engine.network.used_data_keys)
   engine.updater.set_learning_rate(engine.learning_rate, session=engine.tf_session)
   trainer = Runner(engine=engine, dataset=train_data, batches=batches, train=True)
   self.runner = trainer
   if self.cancel_flag:
     raise CancelTrainingException("Trainer cancel flag is set")
   trainer.run(report_prefix="hyper param tune train %r" % self.individual.name)
   if not trainer.finalized:
     print("Trainer exception:", trainer.run_exception, file=log.v1)
     raise trainer.run_exception
   cost = trainer.score["cost:output"]
   print(
     "Individual %s:" % self.individual.name,
     "Train cost:", cost,
     "elapsed time:", hms_fraction(time.time() - start_time),
     file=self.optim.log)
   self.individual.cost = cost
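
Aside from passing session=engine.tf_session to set_learning_rate, this is identical to Example #10; it is the TensorFlow-backend variant of the same per-individual training routine.
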
Example #12
def test_multi_target_init():
    config = Config()
    config.update({
        "multiprocessing": False,
        "blocking": True,
        "device": "cpu",
        "num_epochs": 1,
        "num_inputs": 3,
        "num_outputs": {
            "t1": 4,
            "t2": 5
        },
        "learning_rate": 1.0,
    })
    config.network_topology_json = """
  {
  "fw0": {"class": "hidden", "activation": "identity", "n_out": 3},
  "out1": {"class": "softmax", "loss": "ce", "target": "t1", "from": ["fw0"]},
  "out2": {"class": "softmax", "loss": "ce", "target": "t2", "from": ["fw0"]}
  }
  """

    device = Device("cpu", config=config, blocking=True)
    assert_true(device.trainnet, "train network initialized")
    assert_true(device.testnet, "test network initialized")
    param_vars = device.trainnet.get_all_params_vars()
    print "params:", param_vars
    assert_equal(len(param_vars), 6, "W, b vars for each out, and fw")
    num_params = get_num_params(param_vars)
    assert_equal(num_params, (3 * 3 + 3) + (3 * 4 + 4) + (3 * 5 + 5),
                 "W, b for each out, and fw")
    assert_in("fw0", device.testnet.hidden)
    assert_in("out1", device.testnet.output)
    assert_in("out2", device.testnet.output)
    assert_is(device.testnet.j["t1"], device.testnet.output["out1"].index)
    assert_true(device.updater)
    update_list = device.updater.getUpdateList()
    print "update list:"
    pprint(update_list)
    update_dict = dict(update_list)
    assert_equal(len(update_dict), len(update_list),
                 "all params in update list only once")
    assert_in("fw0", device.trainnet.hidden)
    assert_equal(len(device.trainnet.hidden), 1)
    assert_in("W_in_data_fw0", device.trainnet.hidden["fw0"].params)
    assert_in("b_fw0", device.trainnet.hidden["fw0"].params)
    assert_equal(len(device.trainnet.hidden["fw0"].params), 2)
    assert_in("out1", device.trainnet.output)
    assert_equal(len(device.trainnet.output), 2)
    assert_in("W_in_fw0_out1", device.trainnet.output["out1"].params)
    assert_in("b_out1", device.trainnet.output["out1"].params)
    assert_equal(len(device.trainnet.output["out1"].params), 2)
    assert_in(device.trainnet.hidden["fw0"].params["W_in_data_fw0"],
              update_dict)
    assert_in(device.trainnet.hidden["fw0"].params["b_fw0"], update_dict)
    assert_in(device.trainnet.output["out1"].params["W_in_fw0_out1"],
              update_dict)
    assert_in(device.trainnet.output["out1"].params["b_out1"], update_dict)
    assert_in(device.trainnet.output["out2"].params["W_in_fw0_out2"],
              update_dict)
    assert_in(device.trainnet.output["out2"].params["b_out2"], update_dict)
    assert_equal(len(update_dict), 6)

    # Set net params.
    net_params = {
        "fw0": {
            "W_in_data_fw0": numpy.identity(3, dtype="float32"),
            "b_fw0": numpy.zeros((3, ), dtype="float32")
        },
        "out1": {
            "W_in_fw0_out1":
            numpy.arange(0.0, 1.2, 0.1, dtype="float32").reshape((3, 4)),
            "b_out1":
            numpy.arange(0.0, 4, dtype="float32")
        },
        "out2": {
            "W_in_fw0_out2":
            numpy.arange(0.0, 1.5, 0.1, dtype="float32").reshape((3, 5)),
            "b_out2":
            numpy.arange(0.0, 5, dtype="float32")
        }
    }
    device.trainnet.set_params_by_dict(net_params)
    device.testnet.set_params_by_dict(net_params)

    # Show params.
    for p in param_vars:
        print "init %s:" % p
        pprint(p.get_value())

    # Init dataset.
    dataset = StaticDataset(data=[{
        "data":
        numpy.array([[0.1, 0.2, -0.3]], dtype="float32"),
        "t1":
        numpy.array([2]),
        "t2":
        numpy.array([4])
    }],
                            output_dim=config.typed_value("num_outputs"))
    dataset.init_seq_order()
    assert_equal(dataset.is_data_sparse("data"), False)
    assert_equal(dataset.is_data_sparse("t1"), True)
    assert_equal(dataset.is_data_sparse("t2"), True)

    # Copy to device allocation.
    success = assign_dev_data_single_seq(device, dataset, 0)
    assert_true(success, "failed to allocate & assign data")

    # Check allocated data.
    assert_equal(device.targets["data"].shape,
                 (1, 1, 3))  # input shape. (time,batch,dim)
    assert_in("t1", device.targets)
    assert_in("t2", device.targets)
    assert_equal(device.targets["t1"].shape, (1, 1))
    assert_equal(device.targets["t2"].shape, (1, 1))
    assert_equal(device.output_index["data"].shape, (1, 1))
    numpy.testing.assert_equal(device.output_index["data"], numpy.array([[1]]))
    assert_equal(device.output_index["t1"].shape, (1, 1))
    numpy.testing.assert_equal(device.output_index["t1"], numpy.array([[1]]))

    # Forward test.
    device.update_data()
    device.testnet.costs["out1"].name = "out1_cost"  # nice in the func graph
    out_i1 = device.testnet.output["out1"].index
    out_i1_nonzero = device.testnet.output["out1"].i
    nll1, pcx1 = T.nnet.crossentropy_softmax_1hot(
        x=device.testnet.output["out1"].y_m[out_i1_nonzero],
        y_idx=device.testnet.output["out1"].y_data_flat[out_i1_nonzero])
    forward_func = theano.function(
        inputs=[device.block_start, device.block_end],
        outputs=[
            device.testnet.j["t1"], out_i1, out_i1_nonzero[0], nll1, pcx1,
            device.testnet.costs["out1"],
            device.testnet.output["out1"].p_y_given_x,
            device.testnet.costs["out2"],
            device.testnet.output["out2"].p_y_given_x
        ],
        givens=device.make_givens(device.testnet),
        no_default_updates=True,
        on_unused_input='warn',
        name="forward")
    #print "forward func:"
    #theano.printing.debugprint(forward_func)
    net_j1, out_i1_val, out_i1_nz_val, nll1_val, pcx1_val, t1_cost, t1_y, t2_cost, t2_y = forward_func(
        0, 1)
    print "forward results:"
    pprint(net_j1)
    pprint(out_i1_val)
    pprint(out_i1_nz_val)
    pprint(nll1_val)
    pprint(pcx1_val)
    pprint(t1_cost)
    pprint(t1_y)
    pprint(t2_cost)
    pprint(t2_y)
    assert_equal(net_j1, numpy.array([[1]]))
    assert_equal(out_i1_val, numpy.array([[1]]))
    assert_equal(out_i1_nz_val, numpy.array([0]))
    assert_almost_equal(nll1_val, numpy.array([t1_cost]))
    numpy.testing.assert_almost_equal(t1_y, pcx1_val)
    assert_almost_equal(t1_cost, 1.440189698561195, places=6)
    assert_almost_equal(t2_cost, 0.45191439593759336, places=6)
    numpy.testing.assert_almost_equal(
        t1_y,
        numpy.array([[0.0320586, 0.08714432, 0.23688282, 0.64391426]]),
        decimal=6)
    numpy.testing.assert_almost_equal(t2_y,
                                      numpy.array([[
                                          0.01165623, 0.03168492, 0.08612854,
                                          0.23412166, 0.63640865
                                      ]]),
                                      decimal=6)

    # One train step.
    device.set_learning_rate(config.typed_value("learning_rate"))
    device.run("train")
    output_list, outputs_format = device.result()
    assert_is_instance(output_list, list)
    assert_true(outputs_format, "for train, we should always get the format")
    outputs = Device.make_result_dict(output_list, outputs_format)
    pprint(outputs)
    assert_in("cost:out1", outputs)
    assert_greater(outputs["cost:out1"], 0)
    assert_almost_equal(outputs["cost:out1"], t1_cost)

    # Get net params.
    params = device.get_net_train_params(device.trainnet)
    references_params = {
        "W_in_data_fw0":
        numpy.array([[1.00055406e+00, 5.54056978e-04, 5.54056978e-04],
                     [1.10811396e-03, 1.00110811e+00, 1.10811396e-03],
                     [-1.66217093e-03, -1.66217093e-03, 9.98337829e-01]]),
        "b_fw0":
        numpy.array([0.00554057, 0.00554057, 0.00554057]),
        "W_in_fw0_out1":
        numpy.array([[-0.00320586, 0.09128557, 0.27631172, 0.23560857],
                     [0.39358828, 0.48257114, 0.75262344, 0.57121715],
                     [0.80961758, 0.9261433, 0.77106485, 1.29317428]]),
        "b_out1":
        numpy.array([-0.0320586, 0.91285568, 2.76311718, 2.35608574]),
        "W_in_fw0_out2":
        numpy.array([[
            -1.16562310e-03, 9.68315079e-02, 1.91387146e-01, 2.76587834e-01,
            4.36359135e-01
        ],
                     [
                         4.97668754e-01, 5.93663016e-01, 6.82774291e-01,
                         7.53175669e-01, 9.72718271e-01
                     ],
                     [
                         1.00349687e+00, 1.10950548e+00, 1.22583856e+00,
                         1.37023650e+00, 1.29092259e+00
                     ]]),
        "b_out2":
        numpy.array(
            [-0.01165623, 0.96831508, 1.91387146, 2.76587834, 4.36359135])
    }
    assert_equal(len(param_vars), len(params))
    for p, v in zip(param_vars, params):
        print "%s:" % p
        pprint(v)
        assert_true(p.name)
        numpy.testing.assert_almost_equal(references_params[p.name],
                                          v,
                                          decimal=6)
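
Because fw0 is initialized to the identity with zero bias, the expected out1 distribution and cross-entropy asserted above can be reproduced with plain numpy. A quick cross-check (float64 here, hence the 6-decimal tolerance):

import numpy

x = numpy.array([0.1, 0.2, -0.3])  # fw0 output == input
W = numpy.arange(0.0, 1.2, 0.1).reshape((3, 4))
b = numpy.arange(0.0, 4.0)
z = x.dot(W) + b                   # [-0.16, 0.84, 1.84, 2.84]
p = numpy.exp(z - z.max())
p /= p.sum()
numpy.testing.assert_almost_equal(
    p, [0.0320586, 0.08714432, 0.23688282, 0.64391426], decimal=6)
numpy.testing.assert_almost_equal(  # ce loss for target t1 == 2
    -numpy.log(p[2]), 1.440189698561195, decimal=6)
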
Example #13
def test_combi_auto_enc():
    config = Config()
    config.update({
        "multiprocessing": False,
        "blocking": True,
        "device": "cpu",
        "num_epochs": 1,
        "num_inputs": 3,
        "num_outputs": {
            "classes": 2
        },
        "learning_rate": 1.0,
        "network": {
            "output": {
                "class": "softmax",
                "loss": "ce",
                "target": "classes"
            },
            "auto-enc": {
                "class": "softmax",
                "loss": "sse",
                "dtype": "float32",
                "target": "data"
            }
        }
    })

    device = Device("cpu", config=config, blocking=True)

    # Set net params.
    def get_net_params(with_auto_enc=True):
        d = {
            "output": {
                "W_in_data_output":
                numpy.arange(0.1, 0.7, 0.1, dtype="float32").reshape((3, 2)),
                "b_output":
                numpy.arange(0.0, 2, dtype="float32")
            }
        }
        if with_auto_enc:
            d["auto-enc"] = {
                "W_in_data_auto-enc":
                numpy.arange(0.1, 1.0, 0.1, dtype="float32").reshape((3, 3)),
                "b_auto-enc":
                numpy.arange(0.0, 3, dtype="float32")
            }
        return d

    device.trainnet.set_params_by_dict(get_net_params())
    device.testnet.set_params_by_dict(get_net_params())

    # Show params.
    for p in device.trainnet.get_all_params_vars():
        print "init %s:" % p
        pprint(p.get_value())

    # Init dataset.
    dataset = StaticDataset(data=[{
        "data":
        numpy.array([[0.1, 0.2, -0.3]], dtype="float32"),
        "classes":
        numpy.array([1]),
    }],
                            output_dim=config.typed_value("num_outputs"))
    dataset.init_seq_order()

    # Copy to device allocation.
    success = assign_dev_data_single_seq(device, dataset, 0)
    assert_true(success, "failed to allocate & assign data")

    # One train step.
    device.set_learning_rate(config.typed_value("learning_rate"))
    device.run("train")
    output_list, outputs_format = device.result()
    assert_is_instance(output_list, list)
    assert_true(outputs_format, "for train, we should always get the format")
    outputs = Device.make_result_dict(output_list, outputs_format)
    pprint(outputs)
    assert_in("cost:output", outputs)
    assert_in("cost:auto-enc", outputs)
    expected_cost_output = 0.3132616877555847
    assert_almost_equal(outputs["cost:output"], expected_cost_output, places=6)
    exact_cost_output = outputs["cost:output"]
    assert_almost_equal(outputs["cost:auto-enc"], 5.263200283050537, places=6)

    # Now, drop the auto-enc from the network, and redo the same thing.
    del config.typed_value("network")["auto-enc"]
    device = Device("cpu", config=config, blocking=True)
    device.trainnet.set_params_by_dict(get_net_params(with_auto_enc=False))
    device.testnet.set_params_by_dict(get_net_params(with_auto_enc=False))
    for p in device.trainnet.get_all_params_vars():
        print "second run, init %s:" % p
        pprint(p.get_value())
    dataset.init_seq_order()  # reset. probably not needed
    success = assign_dev_data_single_seq(device, dataset, 0)
    assert_true(success, "failed to allocate & assign data")
    device.set_learning_rate(config.typed_value("learning_rate"))
    device.run("train")
    output_list, outputs_format = device.result()
    assert_is_instance(output_list, list)
    assert_true(outputs_format, "for train, we should always get the format")
    outputs = Device.make_result_dict(output_list, outputs_format)
    pprint(outputs)
    assert_in("cost:output", outputs)
    assert_not_in("cost:auto-enc", outputs)
    assert_almost_equal(outputs["cost:output"], expected_cost_output, places=6)
    assert_equal(outputs["cost:output"], exact_cost_output)
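
The second half of this test rebuilds the device without the auto-enc layer and repeats the identical train step; asserting that cost:output is exactly equal in both runs pins down that the auxiliary sse loss adds its own cost entry without perturbing the ce output loss.
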
Example #14
File: Server.py Project: panky8070/returnn
    def post(self, *args, **kwargs):
        # TODO: Write formal documentation
        """
    Method for handling classification via HTTP Post request. The following must
    be defined in the URL paramaters: engine_hash (engine hash which points to which
    engine to use), and the data itself in the body. If using binary data, the following
    URL paramaters must also be supplied: data_format='binary', data_shape=(<dim1,dim2>).
    If using a specific data type, you can supply it as the url parameter data_type.
    :param args:
    :param kwargs:
    :return: Either JSON with error or JSON list of generated outputs.
    """
        url_params = self.request.arguments
        output_dim = {}
        ret = {}
        data = {}
        data_format = ''
        data_type = ''
        engine_hash = ''
        data_shape = ''
        # First get meta data from URL parameters
        engine_hash = str(url_params['engine_hash']).replace("['", '').replace(
            "']", '')
        if 'data_format' in url_params:
            data_format = str(url_params['data_format']).replace("['",
                                                                 '').replace(
                                                                     "']", '')
        if 'data_type' in url_params:
            # Possible options: https://docs.scipy.org/doc/numpy-1.10.1/user/basics.types.html
            data_type = str(url_params['data_type']).replace("['", '').replace(
                "']", '')
        if 'data_shape' in url_params:
            data_shape = str(url_params['data_shape']).replace(
                "['", '').replace("']", '')  # either '' or 'dim1,dim2'
        # Apply defaults, in case we didn't get them through the header.
        if data_format == '':
            data_format = 'json'
        if data_type == '':
            data_type = 'float32'

        print(
            'Received engine hash: %s data formatted: %s, data type %s data shape: %s'
            % (engine_hash, data_format, data_type, data_shape),
            file=log.v5)
        # Load in engine and hash
        engine = _engines[engine_hash]
        network = engine.network
        devices = _devices[engine_hash]
        hash_engine = hashlib.new('ripemd160')
        hash_engine.update((str(self.request.body) + engine_hash).encode('utf8'))
        hash_temp = hash_engine.hexdigest()

        # Pre-process the data
        if data_format == 'json':
            data = json.loads(self.request.body)
            for k in data:
                try:
                    data[k] = np.asarray(data[k], dtype=data_type)
                    if k != 'data':
                        output_dim[k] = network.n_out[
                            k]  # = [network.n_in,2] if k == 'data' else network.n_out[k]
                except Exception:
                    if k != 'data' and k not in network.n_out:
                        ret['error'] = 'unknown target: %s' % k
                    else:
                        ret['error'] = 'unable to convert %s to an array from value %s' % (
                            k, str(data[k]))
                    break

        if data_format == 'binary':
            float_array = array(self._get_type_code(data_type))
            try:
                float_array.frombytes(self.request.body)
            except Exception as e:
                print('Binary data error: %s' % str(e), file=log.v4)
                ret['error'] = 'Error during binary data conversion: ' + str(e)
            data['data'] = np.asarray(float_array.tolist(), dtype=data_type)
            data_shape_arr = data_shape.split(",")
            shape = (int(data_shape_arr[0]), int(data_shape_arr[1]))
            data['data'] = np.reshape(data['data'], shape)

        # Do dataset creation and classification.
        if 'error' not in ret:
            data = StaticDataset(data=[data], output_dim=output_dim)
            data.init_seq_order()
            batches = data.generate_batches(recurrent_net=network.recurrent,
                                            batch_size=sys.maxsize,
                                            max_seqs=1)
            if hash_temp not in _classify_cache:
                print('Starting classification', file=log.v3)
                # If we haven't yet processed this exact request and saved it in the cache
                _classify_cache[hash_temp] = yield self._classification_task(
                    network=network,
                    devices=devices,
                    data=data,
                    batches=batches)
            ret = {
                'result': {
                    k: _classify_cache[hash_temp].result[k].tolist()
                    for k in _classify_cache[hash_temp].result
                }
            }

        # Update engine usage for performance optimization
        _engine_usage[engine_hash] = datetime.datetime.now()
        print("Finished processing classification with ID: ",
              hash_temp,
              file=log.v3)
        self.write(ret)
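
For the binary branch above, the request body is the raw array bytes, with dtype and shape carried as URL parameters. A hypothetical client-side sketch (host, port, and route are placeholders; only the parameter names mirror what post() parses):

from array import array
import urllib.request

payload = array('f', [0.1, 0.2, -0.3, 0.4, 0.5, -0.6])  # flat 2x3 float32
url = ("http://localhost:8080/classify?engine_hash=<engine_hash>"
       "&data_format=binary&data_type=float32&data_shape=2,3")
req = urllib.request.Request(url, data=payload.tobytes(), method='POST')
print(urllib.request.urlopen(req).read())
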
Example #15
    def post(self, *args, **kwargs):
        # TODO: Make this batch over a specific time period
        # TODO: Write formal documentation

        url_params = self.request.arguments
        output_dim = {}
        ret = {}
        data = {}
        data_format = ''
        data_type = ''
        engine_hash = ''
        data_shape = ''
        # First get meta data from URL parameters
        try:
            engine_hash = str(url_params['engine_hash']).replace("['",
                                                                 '').replace(
                                                                     "']", '')
            if 'data_format' in url_params:
                data_format = str(url_params['data_format']).replace(
                    "['", '').replace("']", '')
            if 'data_type' in url_params:
                # Possible options: https://docs.scipy.org/doc/numpy-1.10.1/user/basics.types.html
                data_type = str(url_params['data_type']).replace("['",
                                                                 '').replace(
                                                                     "']", '')
            if 'data_shape' in url_params:
                data_shape = str(url_params['data_shape']).replace(
                    "['", '').replace("']", '')  # either '' or 'dim1,dim2'
        except Exception as e:
            print('Parameter formatting exception: ' + str(e),
                  file=log.v4)
        # Apply defaults, in case we didn't get them through the header.
        if data_format == '':
            data_format = 'json'
        if data_type == '':
            data_type = 'float32'

        print('Received engine hash: ' + engine_hash + ', data formatted: ' +
              data_format + ', data type ' + data_type + ' data shape: ' +
              data_shape,
              file=log.v5)
        # Load in engine and hash
        engine = _engines[engine_hash]
        network = engine.network
        devices = _devices[engine_hash]
        hash_engine = hashlib.new('ripemd160')
        hash_engine.update((str(self.request.body) + engine_hash).encode('utf8'))
        hash_temp = hash_engine.hexdigest()

        # Pre-process the data
        if data_format == 'json':
            data = json.loads(self.request.body)
            for k in data:
                try:
                    data[k] = np.asarray(data[k], dtype=data_type)
                    if k != 'data':
                        output_dim[k] = network.n_out[
                            k]  # = [network.n_in,2] if k == 'data' else network.n_out[k]
                except Exception:
                    if k != 'data' and k not in network.n_out:
                        ret['error'] = 'unknown target: %s' % k
                    else:
                        ret['error'] = 'unable to convert %s to an array from value %s' % (
                            k, str(data[k]))
                    break

        if data_format == 'binary':
            try:
                float_array = array(self._get_type_code(data_type))
                float_array.frombytes(self.request.body)
                data['data'] = np.asarray(float_array.tolist(),
                                          dtype=data_type)
                data_shape_arr = data_shape.split(",")
                shape = (int(data_shape_arr[0]), int(data_shape_arr[1]))
                data['data'] = np.reshape(data['data'], shape)
            except Exception as e:
                print('Binary data error: ' + str(e), file=log.v4)
                ret['error'] = 'Error during binary data conversion: ' + str(e)

        # Do dataset creation and classification.
        if 'error' not in ret:
            try:
                data = StaticDataset(data=[data], output_dim=output_dim)
                data.init_seq_order()
            except Exception:
                ret['error'] = 'Dataset server error'
                self.write(ret)
                return
            else:
                batches = data.generate_batches(
                    recurrent_net=network.recurrent,
                    batch_size=sys.maxsize,
                    max_seqs=1)
                if hash_temp not in _classify_cache:
                    print('Starting classification', file=log.v3)
                    # If we haven't yet processed this exact request and saved it in the cache
                    _classify_cache[
                        hash_temp] = yield self.classification_task(
                            network=network,
                            devices=devices,
                            data=data,
                            batches=batches)
                ret = {
                    'result': {
                        k: _classify_cache[hash_temp].result[k].tolist()
                        for k in _classify_cache[hash_temp].result
                    }
                }

        # Update engine usage for performance optimization
        _engine_usage[engine_hash] = datetime.datetime.now()
        print("Finished processing classification with ID: ",
              hash_temp,
              file=log.v3)
        self.write(ret)
Example #16
def main():
  rnn.init(
    command_line_options=sys.argv[1:],
    config_updates={
      "task": "nop", "log": None, "device": "cpu",
      "allow_random_model_init": True,
      "debug_add_check_numerics_on_output": False},
    extra_greeting="Import Blocks MT model.")
  assert Util.BackendEngine.is_tensorflow_selected()
  config = rnn.config

  # Load Blocks MT model params.
  if not config.has("blocks_mt_model"):
    print("Please provide the option blocks_mt_model.")
    sys.exit(1)
  blocks_mt_model_fn = config.value("blocks_mt_model", "")
  assert blocks_mt_model_fn
  assert os.path.exists(blocks_mt_model_fn)
  if os.path.isdir(blocks_mt_model_fn):
    blocks_mt_model_fn += "/params.npz"
    assert os.path.exists(blocks_mt_model_fn)

  dry_run = config.bool("dry_run", False)
  if dry_run:
    our_model_fn = None
    print("Dry-run, will not save model.")
  else:
    our_model_fn = config.value('model', "returnn-model") + ".imported"
    print("Will save Returnn model as %s." % our_model_fn)
    assert os.path.exists(os.path.dirname(our_model_fn) or "."), "model-dir does not exist"
    assert not os.path.exists(our_model_fn + Util.get_model_filename_postfix()), "model-file already exists"

  blocks_mt_model = numpy.load(blocks_mt_model_fn)
  assert isinstance(blocks_mt_model, numpy.lib.npyio.NpzFile), "did not expect type %r in file %r" % (
    type(blocks_mt_model), blocks_mt_model_fn)
  print("Params found in Blocks model:")
  blocks_params = {}  # type: dict[str,numpy.ndarray]
  blocks_params_hierarchy = {}  # type: dict[str,dict[str]]
  blocks_total_num_params = 0
  for key in sorted(blocks_mt_model.keys()):
    value = blocks_mt_model[key]
    key = key.replace("-", "/")
    assert key[0] == "/"
    key = key[1:]
    blocks_params[key] = value
    print("  %s: %s, %s" % (key, value.shape, value.dtype))
    blocks_total_num_params += numpy.prod(value.shape)
    d = blocks_params_hierarchy
    for part in key.split("/"):
      d = d.setdefault(part, {})
  print("Blocks total num params: %i" % blocks_total_num_params)

  # Init our network structure.
  from TFNetworkRecLayer import _SubnetworkRecCell
  _SubnetworkRecCell._debug_out = []  # enable for debugging intermediate values below
  ChoiceLayer._debug_out = []  # also for debug outputs of search
  rnn.engine.use_search_flag = True  # construct the net as in search
  rnn.engine.init_network_from_config()
  print("Our network model params:")
  our_params = {}  # type: dict[str,tf.Variable]
  our_total_num_params = 0
  for v in rnn.engine.network.get_params_list():
    key = v.name[:-2]
    our_params[key] = v
    print("  %s: %s, %s" % (key, v.shape, v.dtype.base_dtype.name))
    our_total_num_params += numpy.prod(v.shape.as_list())
  print("Our total num params: %i" % our_total_num_params)

  # Now matching...
  blocks_used_params = set()  # type: set[str]
  our_loaded_params = set()  # type: set[str]

  def import_var(our_var, blocks_param):
    """
    :param tf.Variable our_var:
    :param str|numpy.ndarray blocks_param:
    """
    assert isinstance(our_var, tf.Variable)
    if isinstance(blocks_param, str):
      blocks_param = load_blocks_var(blocks_param)
    assert isinstance(blocks_param, numpy.ndarray)
    assert_equal(tuple(our_var.shape.as_list()), blocks_param.shape)
    our_loaded_params.add(our_var.name[:-2])
    our_var.load(blocks_param, session=rnn.engine.tf_session)

  def load_blocks_var(blocks_param_name):
    """
    :param str blocks_param_name:
    :rtype: numpy.ndarray
    """
    assert isinstance(blocks_param_name, str)
    assert blocks_param_name in blocks_params
    blocks_used_params.add(blocks_param_name)
    return blocks_params[blocks_param_name]

  enc_name = "bidirectionalencoder"
  enc_embed_name = "EncoderLookUp0.W"
  assert enc_name in blocks_params_hierarchy
  assert enc_embed_name in blocks_params_hierarchy[enc_name]  # input embedding
  num_encoder_layers = max([
    int(re.match(".*([0-9]+)", s).group(1))
    for s in blocks_params_hierarchy[enc_name]
    if s.startswith("EncoderBidirectionalLSTM")])
  blocks_input_dim, blocks_input_embed_dim = blocks_params["%s/%s" % (enc_name, enc_embed_name)].shape
  print("Blocks input dim: %i, embed dim: %i" % (blocks_input_dim, blocks_input_embed_dim))
  print("Blocks num encoder layers: %i" % num_encoder_layers)
  expected_enc_entries = (
    ["EncoderLookUp0.W"] +
    ["EncoderBidirectionalLSTM%i" % i for i in range(1, num_encoder_layers + 1)])
  assert_equal(set(expected_enc_entries), set(blocks_params_hierarchy[enc_name].keys()))

  our_input_layer = find_our_input_embed_layer()
  assert our_input_layer.input_data.dim == blocks_input_dim
  assert our_input_layer.output.dim == blocks_input_embed_dim
  assert not our_input_layer.with_bias
  import_var(our_input_layer.params["W"], "%s/%s" % (enc_name, enc_embed_name))

  dec_name = "decoder/sequencegenerator"
  dec_hierarchy_base = get_in_hierarchy(dec_name, blocks_params_hierarchy)
  assert_equal(set(dec_hierarchy_base.keys()), {"att_trans", "readout"})
  dec_embed_name = "readout/lookupfeedbackwmt15/lookuptable.W"
  get_in_hierarchy(dec_embed_name, dec_hierarchy_base)  # check

  for i in range(num_encoder_layers):
    # Assume standard LSTMCell.
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    # lstm_matrix = self._linear1([inputs, m_prev])
    # i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
    # bias (4*in), kernel (in+out,4*out), w_(f|i|o)_diag (out)
    # prefix: rec/rnn/lstm_cell
    # Blocks: gate-in, gate-forget, next-in, gate-out
    for direction in ("fwd", "bwd"):
      our_layer = get_network().layers["lstm%i_%s" % (i, direction[:2])]
      blocks_prefix = "bidirectionalencoder/EncoderBidirectionalLSTM%i" % (i + 1,)
      # (in,out*4), (out*4,)
      W_in, b = [load_blocks_var(
        "%s/%s_fork/fork_inputs.%s" % (blocks_prefix, {"bwd": "back", "fwd": "fwd"}[direction], p))
        for p in ("W", "b")]
      W_re = load_blocks_var(
        "%s/bidirectionalseparateparameters/%s.W_state" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction]))
      W = numpy.concatenate([W_in, W_re], axis=0)
      b = lstm_vec_blocks_to_tf(b)
      W = lstm_vec_blocks_to_tf(W)
      import_var(our_layer.params["rnn/lstm_cell/bias"], b)
      import_var(our_layer.params["rnn/lstm_cell/kernel"], W)
      import_var(our_layer.params["initial_c"], "%s/bidirectionalseparateparameters/%s.initial_cells" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction]))
      import_var(our_layer.params["initial_h"], "%s/bidirectionalseparateparameters/%s.initial_state" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction]))
      for s1, s2 in [("W_cell_to_in", "w_i_diag"), ("W_cell_to_forget", "w_f_diag"), ("W_cell_to_out", "w_o_diag")]:
        import_var(our_layer.params["rnn/lstm_cell/%s" % s2], "%s/bidirectionalseparateparameters/%s.%s" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction], s1))
  import_var(get_network().layers["enc_ctx"].params["W"], "decoder/sequencegenerator/att_trans/attention/encoder_state_transformer.W")
  import_var(get_network().layers["enc_ctx"].params["b"], "decoder/sequencegenerator/att_trans/attention/encoder_state_transformer.b")
  import_var(our_params["output/rec/s/initial_c"], "decoder/sequencegenerator/att_trans/lstm_decoder.initial_cells")
  import_var(our_params["output/rec/s/initial_h"], "decoder/sequencegenerator/att_trans/lstm_decoder.initial_state")
  import_var(our_params["output/rec/weight_feedback/W"], "decoder/sequencegenerator/att_trans/attention/sum_alignment_transformer.W")
  import_var(our_params["output/rec/target_embed/W"], "decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W")
  import_var(our_params["fertility/W"], "decoder/sequencegenerator/att_trans/attention/fertility_transformer.W")
  import_var(our_params["output/rec/energy/W"], "decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W")
  prev_s_trans_W_states = load_blocks_var("decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W")
  prev_s_trans_W_cells = load_blocks_var("decoder/sequencegenerator/att_trans/attention/state_trans/transform_cells.W")
  prev_s_trans_W = numpy.concatenate([prev_s_trans_W_cells, prev_s_trans_W_states], axis=0)
  import_var(our_params["output/rec/prev_s_transformed/W"], prev_s_trans_W)
  import_var(our_params["output/rec/s/rec/lstm_cell/bias"], numpy.zeros(our_params["output/rec/s/rec/lstm_cell/bias"].shape))
  dec_lstm_kernel_in_feedback = load_blocks_var("decoder/sequencegenerator/att_trans/feedback_to_decoder/fork_inputs.W")
  dec_lstm_kernel_in_ctx = load_blocks_var("decoder/sequencegenerator/att_trans/context_to_decoder/fork_inputs.W")
  dec_lstm_kernel_re = load_blocks_var("decoder/sequencegenerator/att_trans/lstm_decoder.W_state")
  dec_lstm_kernel = numpy.concatenate([dec_lstm_kernel_in_feedback, dec_lstm_kernel_in_ctx, dec_lstm_kernel_re], axis=0)
  dec_lstm_kernel = lstm_vec_blocks_to_tf(dec_lstm_kernel)
  import_var(our_params["output/rec/s/rec/lstm_cell/kernel"], dec_lstm_kernel)
  for s1, s2 in [("W_cell_to_in", "w_i_diag"), ("W_cell_to_forget", "w_f_diag"), ("W_cell_to_out", "w_o_diag")]:
    import_var(our_params["output/rec/s/rec/lstm_cell/%s" % s2], "decoder/sequencegenerator/att_trans/lstm_decoder.%s" % s1)
  readout_in_W_states = load_blocks_var("decoder/sequencegenerator/readout/merge/transform_states.W")
  readout_in_W_feedback = load_blocks_var("decoder/sequencegenerator/readout/merge/transform_feedback.W")
  readout_in_W_att = load_blocks_var("decoder/sequencegenerator/readout/merge/transform_weighted_averages.W")
  readout_in_W = numpy.concatenate([readout_in_W_states, readout_in_W_feedback, readout_in_W_att], axis=0)
  import_var(our_params["output/rec/readout_in/W"], readout_in_W)
  import_var(our_params["output/rec/readout_in/b"], "decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b")
  import_var(our_params["output/rec/output_prob/W"], "decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W")
  import_var(our_params["output/rec/output_prob/b"], "decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b")

  print("Not initialized own params:")
  count = 0
  for key, v in sorted(our_params.items()):
    if key in our_loaded_params:
      continue
    print("  %s: %s, %s" % (key, v.shape, v.dtype.base_dtype.name))
    count += 1
  if not count:
    print("  None.")
  print("Not used Blocks params:")
  count = 0
  for key, value in sorted(blocks_params.items()):
    if key in blocks_used_params:
      continue
    print("  %s: %s, %s" % (key, value.shape, value.dtype))
    count += 1
  if not count:
    print("  None.")
  print("Done.")

  blocks_debug_dump_output = config.value("blocks_debug_dump_output", None)
  if blocks_debug_dump_output:
    print("Will read Blocks debug dump output from %r and compare with Returnn outputs." % blocks_debug_dump_output)
    blocks_initial_outputs = numpy.load("%s/initial_states_data.0.npz" % blocks_debug_dump_output)
    blocks_search_log = pickle.load(open("%s/search.log.pkl" % blocks_debug_dump_output, "rb"), encoding="bytes")
    blocks_search_log = {d[b"step"]: d for d in blocks_search_log}
    input_seq = blocks_initial_outputs["input"]
    beam_size, seq_len = input_seq.shape
    input_seq = input_seq[0]  # all the same, select beam 0
    assert isinstance(input_seq, numpy.ndarray)
    print("Debug input seq: %s" % input_seq.tolist())
    from GeneratingDataset import StaticDataset
    dataset = StaticDataset(
      data=[{"data": input_seq}],
      output_dim={"data": get_network().extern_data.get_default_input_data().get_kwargs()})
    dataset.init_seq_order(epoch=0)
    extract_output_dict = {
      "enc_src_emb": get_network().layers["source_embed"].output.get_placeholder_as_batch_major(),
      "encoder": get_network().layers["encoder"].output.get_placeholder_as_batch_major(),
      "enc_ctx": get_network().layers["enc_ctx"].output.get_placeholder_as_batch_major(),
      "output": get_network().layers["output"].output.get_placeholder_as_batch_major()
    }
    from TFNetworkLayer import concat_sources
    for i in range(num_encoder_layers):
      extract_output_dict["enc_layer_%i" % i] = concat_sources(
        [get_network().layers["lstm%i_fw" % i], get_network().layers["lstm%i_bw" % i]]
      ).get_placeholder_as_batch_major()
    extract_output_dict["enc_layer_0_fwd"] = get_network().layers["lstm0_fw"].output.get_placeholder_as_batch_major()
    our_output = rnn.engine.run_single(
      dataset=dataset, seq_idx=0, output_dict=extract_output_dict)
    blocks_out = blocks_initial_outputs["bidirectionalencoder_EncoderLookUp0__EncoderLookUp0_apply_output"]
    our_out = our_output["enc_src_emb"]
    print("our enc emb shape:", our_out.shape)
    print("Blocks enc emb shape:", blocks_out.shape)
    assert our_out.shape[:2] == (1, seq_len)
    assert blocks_out.shape[:2] == (seq_len, beam_size)
    assert our_out.shape[2] == blocks_out.shape[2]
    assert_almost_equal(our_out[0], blocks_out[:, 0], decimal=5)
    blocks_lstm0_out_ref = calc_lstm(blocks_out[:, 0], blocks_params)
    blocks_lstm0_out = blocks_initial_outputs["bidirectionalencoder_EncoderBidirectionalLSTM1_bidirectionalseparateparameters_forward__forward_apply_states"]
    our_lstm0_out = our_output["enc_layer_0_fwd"]
    assert blocks_lstm0_out.shape == (seq_len, beam_size) + blocks_lstm0_out_ref.shape
    assert our_lstm0_out.shape == (1, seq_len) + blocks_lstm0_out_ref.shape
    assert_almost_equal(blocks_lstm0_out[0, 0], blocks_lstm0_out_ref, decimal=6)
    print("Blocks LSTM0 frame 0 matched to ref calc.")
    assert_almost_equal(our_lstm0_out[0, 0], blocks_lstm0_out_ref, decimal=6)
    print("Our LSTM0 frame 0 matched to ref calc.")
    for i in range(num_encoder_layers):
      blocks_out = blocks_initial_outputs[
        "bidirectionalencoder_EncoderBidirectionalLSTM%i_bidirectionalseparateparameters__bidirectionalseparateparameters_apply_output_0" % (i + 1,)]
      our_out = our_output["enc_layer_%i" % i]
      print("our enc layer %i shape:" % i, our_out.shape)
      print("Blocks enc layer %i shape:" % i, blocks_out.shape)
      assert our_out.shape[:2] == (1, seq_len)
      assert blocks_out.shape[:2] == (seq_len, beam_size)
      assert our_out.shape[2] == blocks_out.shape[2]
      assert_almost_equal(our_out[0], blocks_out[:, 0], decimal=6)
    print("our encoder shape:", our_output["encoder"].shape)
    blocks_encoder_out = blocks_initial_outputs["bidirectionalencoder__bidirectionalencoder_apply_representation"]
    print("Blocks encoder shape:", blocks_encoder_out.shape)
    assert our_output["encoder"].shape[:2] == (1, seq_len)
    assert blocks_encoder_out.shape[:2] == (seq_len, beam_size)
    assert our_output["encoder"].shape[2] == blocks_encoder_out.shape[2]
    assert_almost_equal(our_output["encoder"][0], blocks_encoder_out[:, 0], decimal=6)
    blocks_first_frame_outputs = numpy.load("%s/next_states.0.npz" % blocks_debug_dump_output)
    blocks_enc_ctx_out = blocks_first_frame_outputs["decoder_sequencegenerator_att_trans_attention__attention_preprocess_preprocessed_attended"]
    our_enc_ctx_out = our_output["enc_ctx"]
    print("Blocks enc ctx shape:", blocks_enc_ctx_out.shape)
    assert blocks_enc_ctx_out.shape[:2] == (seq_len, beam_size)
    assert our_enc_ctx_out.shape[:2] == (1, seq_len)
    assert blocks_enc_ctx_out.shape[2:] == our_enc_ctx_out.shape[2:]
    assert_almost_equal(blocks_enc_ctx_out[:, 0], our_enc_ctx_out[0], decimal=5)
    fertility = numpy.dot(blocks_encoder_out[:, 0], blocks_params["decoder/sequencegenerator/att_trans/attention/fertility_transformer.W"])
    fertility = sigmoid(fertility)
    assert fertility.shape == (seq_len, 1)
    fertility = fertility[:, 0]
    assert fertility.shape == (seq_len,)
    our_dec_outputs = {v["step"]: v for v in _SubnetworkRecCell._debug_out}
    assert our_dec_outputs
    print("our dec frame keys:", sorted(our_dec_outputs[0].keys()))
    our_dec_search_outputs = {v["step"]: v for v in ChoiceLayer._debug_out}
    assert our_dec_search_outputs
    print("our dec search frame keys:", sorted(our_dec_search_outputs[0].keys()))
    print("Blocks search frame keys:", sorted(blocks_search_log[0].keys()))
    dec_lookup = blocks_params["decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W"]
    last_lstm_state = blocks_params["decoder/sequencegenerator/att_trans/lstm_decoder.initial_state"]
    last_lstm_cells = blocks_params["decoder/sequencegenerator/att_trans/lstm_decoder.initial_cells"]
    last_accumulated_weights = numpy.zeros((seq_len,), dtype="float32")
    last_output = 0
    dec_seq_len = 0
    for dec_step in range(100):
      blocks_frame_state_outputs_fn = "%s/next_states.%i.npz" % (blocks_debug_dump_output, dec_step)
      blocks_frame_probs_outputs_fn = "%s/logprobs.%i.npz" % (blocks_debug_dump_output, dec_step)
      if dec_step > 3:
        if not os.path.exists(blocks_frame_state_outputs_fn) or not os.path.exists(blocks_frame_probs_outputs_fn):
          print("Seq not ended yet but frame not found for step %i." % dec_step)
          break
      blocks_frame_state_outputs = numpy.load(blocks_frame_state_outputs_fn)
      blocks_frame_probs_outputs = numpy.load(blocks_frame_probs_outputs_fn)
      blocks_search_frame = blocks_search_log[dec_step]
      our_dec_frame_outputs = our_dec_outputs[dec_step]
      assert our_dec_frame_outputs["step"] == dec_step
      assert our_dec_frame_outputs[":i.output"].tolist() == [dec_step]
      our_dec_search_frame_outputs = our_dec_search_outputs[dec_step]

      blocks_last_lstm_state = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_states"]
      blocks_last_lstm_cells = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_cells"]
      assert blocks_last_lstm_state.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(blocks_last_lstm_state[0], last_lstm_state, decimal=5)
      assert_almost_equal(blocks_last_lstm_cells[0], last_lstm_cells, decimal=5)
      our_last_lstm_cells = our_dec_frame_outputs["prev:s.extra.state"][0]
      our_last_lstm_state = our_dec_frame_outputs["prev:s.extra.state"][1]
      assert our_last_lstm_state.shape == our_last_lstm_cells.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(our_last_lstm_state[0], last_lstm_state, decimal=5)
      assert_almost_equal(our_last_lstm_cells[0], last_lstm_cells, decimal=5)
      our_last_s = our_dec_frame_outputs["prev:s.output"]
      assert our_last_s.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(our_last_s[0], last_lstm_state, decimal=5)

      blocks_last_accum_weights = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_accumulated_weights"]
      assert blocks_last_accum_weights.shape == (beam_size, seq_len)
      assert_almost_equal(blocks_last_accum_weights[0], last_accumulated_weights, decimal=5)
      our_last_accum_weights = our_dec_frame_outputs["prev:accum_att_weights.output"]
      assert our_last_accum_weights.shape == (beam_size, seq_len if dec_step > 0 else 1, 1)
      if dec_step > 0:
        assert_almost_equal(our_last_accum_weights[0, :, 0], last_accumulated_weights, decimal=4)
      else:
        assert_almost_equal(our_last_accum_weights[0, 0, 0], last_accumulated_weights.sum(), decimal=4)

      energy_sum = numpy.copy(blocks_enc_ctx_out[:, 0])  # (T,enc-ctx-dim)
      weight_feedback = numpy.dot(last_accumulated_weights[:, None], blocks_params["decoder/sequencegenerator/att_trans/attention/sum_alignment_transformer.W"])
      energy_sum += weight_feedback
      transformed_states = numpy.dot(last_lstm_state[None, :], blocks_params["decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W"])
      transformed_cells = numpy.dot(last_lstm_cells[None, :], blocks_params["decoder/sequencegenerator/att_trans/attention/state_trans/transform_cells.W"])
      energy_sum += transformed_states + transformed_cells
      assert energy_sum.shape == (seq_len, blocks_enc_ctx_out.shape[-1])
      blocks_energy_sum_tanh = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention_energy_comp_tanh__tanh_apply_output"]
      assert blocks_energy_sum_tanh.shape == (seq_len, beam_size, energy_sum.shape[-1])
      assert_almost_equal(blocks_energy_sum_tanh[:, 0], numpy.tanh(energy_sum), decimal=5)
      assert_equal(our_dec_frame_outputs["weight_feedback.output"].shape, (beam_size, seq_len if dec_step > 0 else 1, blocks_enc_ctx_out.shape[-1]))
      assert_equal(our_dec_frame_outputs["prev_s_transformed.output"].shape, (beam_size, blocks_enc_ctx_out.shape[-1]))
      our_energy_sum = our_dec_frame_outputs["energy_in.output"]
      assert our_energy_sum.shape == (beam_size, seq_len, blocks_enc_ctx_out.shape[-1])
      assert_almost_equal(our_energy_sum[0], energy_sum, decimal=4)
      blocks_energy = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention_energy_comp__energy_comp_apply_output"]
      assert blocks_energy.shape == (seq_len, beam_size, 1)
      energy = numpy.dot(numpy.tanh(energy_sum), blocks_params["decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W"])
      assert energy.shape == (seq_len, 1)
      assert_almost_equal(blocks_energy[:, 0], energy, decimal=4)
      our_energy = our_dec_frame_outputs["energy.output"]
      assert our_energy.shape == (beam_size, seq_len, 1)
      assert_almost_equal(our_energy[0], energy, decimal=4)
      weights = softmax(energy[:, 0])
      assert weights.shape == (seq_len,)
      our_weights = our_dec_frame_outputs["att_weights.output"]
      assert our_weights.shape == (beam_size, seq_len, 1)
      assert_almost_equal(our_weights[0, :, 0], weights, decimal=4)
      accumulated_weights = last_accumulated_weights + weights / (2.0 * fertility)
      assert accumulated_weights.shape == (seq_len,)
      #blocks_accumulated_weights = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention__attention_take_glimpses_accumulated_weights"]
      #assert blocks_accumulated_weights.shape == (beam_size, seq_len)
      #assert_almost_equal(blocks_accumulated_weights[0], accumulated_weights, decimal=5)
      blocks_weights = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention__attention_compute_weights_output_0"]
      assert blocks_weights.shape == (seq_len, beam_size)
      assert_almost_equal(weights, blocks_weights[:, 0], decimal=4)
      our_accum_weights = our_dec_frame_outputs["accum_att_weights.output"]
      assert our_accum_weights.shape == (beam_size, seq_len, 1)
      weighted_avg = (weights[:, None] * blocks_encoder_out[:, 0]).sum(axis=0)  # att in our
      assert weighted_avg.shape == (blocks_encoder_out.shape[-1],)
      blocks_weighted_avg = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention__attention_compute_weighted_averages_output_0"]
      assert blocks_weighted_avg.shape == (beam_size, blocks_encoder_out.shape[-1])
      assert_almost_equal(blocks_weighted_avg[0], weighted_avg, decimal=4)
      our_att = our_dec_frame_outputs["att.output"]
      assert our_att.shape == (beam_size, blocks_encoder_out.shape[-1])
      assert_almost_equal(our_att[0], weighted_avg, decimal=4)

      blocks_last_output = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_outputs"]
      assert blocks_last_output.shape == (beam_size,)
      assert max(blocks_last_output[0], 0) == last_output
      last_target_embed = dec_lookup[last_output]
      if dec_step == 0:
        last_target_embed = numpy.zeros_like(last_target_embed)
      our_last_target_embed = our_dec_frame_outputs["prev:target_embed.output"]
      assert our_last_target_embed.shape == (beam_size, dec_lookup.shape[-1])
      assert_almost_equal(our_last_target_embed[0], last_target_embed, decimal=4)

      readout_in_state = numpy.dot(last_lstm_state, blocks_params["decoder/sequencegenerator/readout/merge/transform_states.W"])
      blocks_trans_state = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_states"]
      assert blocks_trans_state.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(blocks_trans_state[0], readout_in_state, decimal=4)
      readout_in_feedback = numpy.dot(last_target_embed, blocks_params["decoder/sequencegenerator/readout/merge/transform_feedback.W"])
      blocks_trans_feedback = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_feedback"]
      assert blocks_trans_feedback.shape == (beam_size, readout_in_feedback.shape[0])
      assert_almost_equal(blocks_trans_feedback[0], readout_in_feedback, decimal=4)
      readout_in_weighted_avg = numpy.dot(weighted_avg, blocks_params["decoder/sequencegenerator/readout/merge/transform_weighted_averages.W"])
      blocks_trans_weighted_avg = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_weighted_averages"]
      assert blocks_trans_weighted_avg.shape == (beam_size, readout_in_weighted_avg.shape[0])
      assert_almost_equal(blocks_trans_weighted_avg[0], readout_in_weighted_avg, decimal=4)
      readout_in = readout_in_state + readout_in_feedback + readout_in_weighted_avg
      blocks_readout_in = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_output"]
      assert blocks_readout_in.shape == (beam_size, readout_in.shape[0])
      assert_almost_equal(blocks_readout_in[0], readout_in, decimal=4)
      readout_in += blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b"]
      assert readout_in.shape == (blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b"].shape[0],)
      our_readout_in = our_dec_frame_outputs["readout_in.output"]
      assert our_readout_in.shape == (beam_size, readout_in.shape[0])
      assert_almost_equal(our_readout_in[0], readout_in, decimal=4)
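      # Maxout with group size 2: reshape to (dim/2, 2) and take the max of each pair.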
      readout = readout_in.reshape((readout_in.shape[0] // 2, 2)).max(axis=1)
      our_readout = our_dec_frame_outputs["readout.output"]
      assert our_readout.shape == (beam_size, readout.shape[0])
      assert_almost_equal(our_readout[0], readout, decimal=4)
      prob_logits = numpy.dot(readout, blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W"]) + \
        blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b"]
      assert prob_logits.ndim == 1
      blocks_prob_logits = blocks_frame_probs_outputs["decoder_sequencegenerator_readout__readout_readout_output_0"]
      assert blocks_prob_logits.shape == (beam_size, prob_logits.shape[0])
      assert_almost_equal(blocks_prob_logits[0], prob_logits, decimal=4)
      output_prob = softmax(prob_logits)
      log_output_prob = log_softmax(prob_logits)
      assert_almost_equal(numpy.log(output_prob), log_output_prob, decimal=4)
      our_output_prob = our_dec_frame_outputs["output_prob.output"]
      assert our_output_prob.shape == (beam_size, output_prob.shape[0])
      assert_almost_equal(our_output_prob[0], output_prob, decimal=4)
      blocks_nlog_prob = blocks_frame_probs_outputs["logprobs"]
      assert blocks_nlog_prob.shape == (beam_size, output_prob.shape[0])
      assert_almost_equal(blocks_nlog_prob[0], -log_output_prob, decimal=4)
      assert_almost_equal(our_dec_search_frame_outputs["scores_in_orig"][0], output_prob, decimal=4)
      assert_almost_equal(blocks_search_frame[b'logprobs'][0], -log_output_prob, decimal=4)
      #for b in range(beam_size):
      #  assert_almost_equal(-numpy.log(our_output_prob[b]), blocks_frame_probs_outputs["logprobs"][b], decimal=4)
      ref_output = numpy.argmax(output_prob)
      # Note: Don't take the readout.emit outputs. They are randomly sampled.
      blocks_dec_output = blocks_search_frame[b'outputs']
      assert blocks_dec_output.shape == (beam_size,)
      our_dec_output = our_dec_frame_outputs["output.output"]
      assert our_dec_output.shape == (beam_size,)
      print("Frame %i: Ref best greedy output symbol: %i" % (dec_step, int(ref_output)))
      print("Blocks labels:", blocks_dec_output.tolist())
      print("Our labels:", our_dec_output.tolist())
      # The following two checks might not hold if all the other beams have much better scores,
      # but this is unlikely.
      assert ref_output in blocks_dec_output
      assert ref_output in our_dec_output
      if dec_step == 0:
        # This assumes that the results are ordered by score which might not be true (see tf.nn.top_k).
        assert blocks_dec_output[0] == our_dec_output[0] == ref_output
      # We assume that the best hypothesis is the same. This might not hold if two scores are exactly equal,
      # and it again assumes that the results are ordered by score, which tf.nn.top_k does not guarantee.
      # For the same reason, the remaining entries might also not match perfectly.
      assert our_dec_output[0] == blocks_dec_output[0]
      # Just follow the first beam.
      ref_output = blocks_dec_output[0]
      assert our_dec_search_frame_outputs["src_beam_idxs"].shape == (1, beam_size)
      assert our_dec_search_frame_outputs["scores"].shape == (1, beam_size)
      print("Blocks src_beam_idxs:", blocks_search_frame[b'indexes'].tolist())
      print("Our src_beam_idxs:", our_dec_search_frame_outputs["src_beam_idxs"][0].tolist())
      print("Blocks scores:", blocks_search_frame[b'chosen_costs'].tolist())
      print("Our scores:", our_dec_search_frame_outputs["scores"][0].tolist())
      if list(our_dec_search_frame_outputs["src_beam_idxs"][0]) != list(blocks_search_frame[b'indexes']):
        print("Warning, beams do not match.")
        print("Blocks scores base:", blocks_search_frame[b'scores_base'].flatten().tolist())
        print("Our scores base:", our_dec_search_frame_outputs["scores_base"].flatten().tolist())
        #print("Blocks score in orig top k:", sorted(blocks_search_frame[b'logprobs'].flatten())[:beam_size])
        #print("Our score in orig top k:", sorted(-numpy.log(our_dec_search_frame_outputs["scores_in_orig"].flatten()))[:beam_size])
        print("Blocks score in top k:", sorted((blocks_search_frame[b'logprobs'] * blocks_search_log[dec_step - 1][b'mask'][:, None]).flatten())[:beam_size])
        print("Our score in top k:", sorted(-our_dec_search_frame_outputs["scores_in"].flatten())[:beam_size])
        blocks_scores_combined = blocks_search_frame[b'next_costs']
        our_scores_combined = our_dec_search_frame_outputs["scores_combined"]
        print("Blocks scores combined top k:", sorted(blocks_scores_combined.flatten())[:beam_size])
        print("Our neg scores combined top k:", sorted(-our_scores_combined.flatten())[:beam_size])
        #raise Exception("beams mismatch")
      assert our_dec_search_frame_outputs["src_beam_idxs"][0][0] == blocks_search_frame[b'indexes'][0]
      beam_idx = our_dec_search_frame_outputs["src_beam_idxs"][0][0]
      if beam_idx != 0:
        print("Selecting different beam: %i." % beam_idx)
        # Just overwrite the needed states by Blocks outputs.
        accumulated_weights = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_attention__attention_take_glimpses_accumulated_weights"][0]
        weighted_avg = blocks_frame_state_outputs["decoder_sequencegenerator__sequencegenerator_generate_weighted_averages"][0]
        last_lstm_state = blocks_frame_state_outputs["decoder_sequencegenerator__sequencegenerator_generate_states"][0]
        last_lstm_cells = blocks_frame_state_outputs["decoder_sequencegenerator__sequencegenerator_generate_cells"][0]

      # From now on, use blocks_frame_state_outputs instead of blocks_frame_probs_outputs because
      # it will have the beam reordered.
      blocks_target_emb = blocks_frame_state_outputs["decoder_sequencegenerator_fork__fork_apply_feedback_decoder_input"]
      assert blocks_target_emb.shape == (beam_size, dec_lookup.shape[1])
      target_embed = dec_lookup[ref_output]
      assert target_embed.shape == (dec_lookup.shape[1],)
      assert_almost_equal(blocks_target_emb[0], target_embed)

      feedback_to_decoder = numpy.dot(target_embed, blocks_params["decoder/sequencegenerator/att_trans/feedback_to_decoder/fork_inputs.W"])
      context_to_decoder = numpy.dot(weighted_avg, blocks_params["decoder/sequencegenerator/att_trans/context_to_decoder/fork_inputs.W"])
      lstm_z = feedback_to_decoder + context_to_decoder
      assert lstm_z.shape == feedback_to_decoder.shape == context_to_decoder.shape == (last_lstm_state.shape[-1] * 4,)
      blocks_feedback_to_decoder = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_feedback_to_decoder__feedback_to_decoder_apply_inputs"]
      blocks_context_to_decoder = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_context_to_decoder__context_to_decoder_apply_inputs"]
      assert blocks_feedback_to_decoder.shape == blocks_context_to_decoder.shape == (beam_size, last_lstm_state.shape[-1] * 4)
      assert_almost_equal(blocks_feedback_to_decoder[0], feedback_to_decoder, decimal=4)
      assert_almost_equal(blocks_context_to_decoder[0], context_to_decoder, decimal=4)
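      # calc_raw_lstm (defined elsewhere in this script) is assumed to compute one LSTM step
      # (with the peephole weights W_cell_to_* found under the given prefix) from the summed
      # input lstm_z and the previous state/cell, returning the new (state, cells).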
      lstm_state, lstm_cells = calc_raw_lstm(
        lstm_z, blocks_params=blocks_params,
        prefix="decoder/sequencegenerator/att_trans/lstm_decoder.",
        last_state=last_lstm_state, last_cell=last_lstm_cells)
      assert lstm_state.shape == last_lstm_state.shape == lstm_cells.shape == last_lstm_cells.shape
      blocks_lstm_state = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_lstm_decoder__lstm_decoder_apply_states"]
      blocks_lstm_cells = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_lstm_decoder__lstm_decoder_apply_cells"]
      assert blocks_lstm_state.shape == blocks_lstm_cells.shape == (beam_size, last_lstm_state.shape[-1])
      assert_almost_equal(blocks_lstm_state[0], lstm_state, decimal=4)
      assert_almost_equal(blocks_lstm_cells[0], lstm_cells, decimal=4)
      our_lstm_cells = our_dec_frame_outputs["s.extra.state"][0]
      our_lstm_state = our_dec_frame_outputs["s.extra.state"][1]
      assert our_lstm_state.shape == our_lstm_cells.shape == (beam_size, lstm_state.shape[0])
      assert_almost_equal(our_lstm_state[0], lstm_state, decimal=4)
      assert_almost_equal(our_lstm_cells[0], lstm_cells, decimal=4)
      our_s = our_dec_frame_outputs["s.output"]
      assert our_s.shape == (beam_size, lstm_state.shape[0])
      assert_almost_equal(our_s[0], lstm_state, decimal=4)

      last_accumulated_weights = accumulated_weights
      last_lstm_state = lstm_state
      last_lstm_cells = lstm_cells
      last_output = ref_output
      if last_output == 0:
        print("Sequence finished, seq len %i." % dec_step)
        dec_seq_len = dec_step
        break
    assert dec_seq_len > 0
    print("All outputs seem to match.")
  else:
    print("blocks_debug_dump_output not specified. It will not compare the model outputs." % blocks_debug_dump_output)

  if dry_run:
    print("Dry-run, not saving model.")
  else:
    rnn.engine.save_model(our_model_fn)
  print("Finished importing.")
Example #20
0
def test_multi_target_init():
  config = Config()
  config.update({
    "multiprocessing": False,
    "blocking": True,
    "device": "cpu",
    "num_epochs": 1,
    "num_inputs": 3,
    "num_outputs": {"t1": 4, "t2": 5},
    "learning_rate": 1.0,
  })
  config.network_topology_json = """
  {
  "fw0": {"class": "hidden", "activation": "identity", "n_out": 3},
  "out1": {"class": "softmax", "loss": "ce", "target": "t1", "from": ["fw0"]},
  "out2": {"class": "softmax", "loss": "ce", "target": "t2", "from": ["fw0"]}
  }
  """

  device = Device("cpu", config=config, blocking=True)
  assert_true(device.trainnet, "train network initialized")
  assert_true(device.testnet, "test network initialized")
  param_vars = device.trainnet.get_all_params_vars()
  print "params:", param_vars
  assert_equal(len(param_vars), 6, "W, b vars for each out, and fw")
  num_params = get_num_params(param_vars)
  assert_equal(num_params, (3 * 3 + 3) + (3 * 4 + 4) + (3 * 5 + 5), "W, b for each out, and fw")
  assert_in("fw0", device.testnet.hidden)
  assert_in("out1", device.testnet.output)
  assert_in("out2", device.testnet.output)
  assert_is(device.testnet.j["t1"], device.testnet.output["out1"].index)
  assert_true(device.updater)
  update_list = device.updater.getUpdateList()
  print "update list:"
  pprint(update_list)
  update_dict = dict(update_list)
  assert_equal(len(update_dict), len(update_list), "all params in update list only once")
  assert_in("fw0", device.trainnet.hidden)
  assert_equal(len(device.trainnet.hidden), 1)
  assert_in("W_in_data_fw0", device.trainnet.hidden["fw0"].params)
  assert_in("b_fw0", device.trainnet.hidden["fw0"].params)
  assert_equal(len(device.trainnet.hidden["fw0"].params), 2)
  assert_in("out1", device.trainnet.output)
  assert_equal(len(device.trainnet.output), 2)
  assert_in("W_in_fw0_out1", device.trainnet.output["out1"].params)
  assert_in("b_out1", device.trainnet.output["out1"].params)
  assert_equal(len(device.trainnet.output["out1"].params), 2)
  assert_in(device.trainnet.hidden["fw0"].params["W_in_data_fw0"], update_dict)
  assert_in(device.trainnet.hidden["fw0"].params["b_fw0"], update_dict)
  assert_in(device.trainnet.output["out1"].params["W_in_fw0_out1"], update_dict)
  assert_in(device.trainnet.output["out1"].params["b_out1"], update_dict)
  assert_in(device.trainnet.output["out2"].params["W_in_fw0_out2"], update_dict)
  assert_in(device.trainnet.output["out2"].params["b_out2"], update_dict)
  assert_equal(len(update_dict), 6)

  # Set net params.
  net_params = {
    "fw0": {"W_in_data_fw0": numpy.identity(3, dtype="float32"),
            "b_fw0": numpy.zeros((3,), dtype="float32")},
    "out1": {"W_in_fw0_out1": numpy.arange(0.0, 1.2, 0.1, dtype="float32").reshape((3, 4)),
             "b_out1": numpy.arange(0.0, 4, dtype="float32")},
    "out2": {"W_in_fw0_out2": numpy.arange(0.0, 1.5, 0.1, dtype="float32").reshape((3, 5)),
             "b_out2": numpy.arange(0.0, 5, dtype="float32")}
  }
  device.trainnet.set_params_by_dict(net_params)
  device.testnet.set_params_by_dict(net_params)

  # Show params.
  for p in param_vars:
    print "init %s:" % p
    pprint(p.get_value())

  # Init dataset.
  dataset = StaticDataset(data=[{
    "data": numpy.array([[0.1, 0.2, -0.3]], dtype="float32"),
    "t1": numpy.array([2]),
    "t2": numpy.array([4])
  }], output_dim=config.typed_value("num_outputs"))
  dataset.init_seq_order()
  assert_equal(dataset.is_data_sparse("data"), False)
  assert_equal(dataset.is_data_sparse("t1"), True)
  assert_equal(dataset.is_data_sparse("t2"), True)

  # Copy to device allocation.
  success = assign_dev_data_single_seq(device, dataset, 0)
  assert_true(success, "failed to allocate & assign data")

  # Check allocated data.
  assert_equal(device.targets["data"].shape, (1, 1, 3))  # input shape. (time,batch,dim)
  assert_in("t1", device.targets)
  assert_in("t2", device.targets)
  assert_equal(device.targets["t1"].shape, (1, 1))
  assert_equal(device.targets["t2"].shape, (1, 1))
  assert_equal(device.output_index["data"].shape, (1, 1))
  numpy.testing.assert_equal(device.output_index["data"], numpy.array([[1]]))
  assert_equal(device.output_index["t1"].shape, (1, 1))
  numpy.testing.assert_equal(device.output_index["t1"], numpy.array([[1]]))

  # Forward test.
  device.update_data()
  device.testnet.costs["out1"].name = "out1_cost"  # nice in the func graph
  out_i1 = device.testnet.output["out1"].index
  out_i1_nonzero = device.testnet.output["out1"].i
  nll1, pcx1 = T.nnet.crossentropy_softmax_1hot(x=device.testnet.output["out1"].y_m[out_i1_nonzero],
                                                y_idx=device.testnet.output["out1"].y_data_flat[out_i1_nonzero])
  forward_func = theano.function(
    inputs=[device.block_start, device.block_end],
    outputs=[
      device.testnet.j["t1"], out_i1, out_i1_nonzero[0], nll1, pcx1,
      device.testnet.costs["out1"],
      device.testnet.output["out1"].p_y_given_x,
      device.testnet.costs["out2"],
      device.testnet.output["out2"].p_y_given_x],
    givens=device.make_givens(device.testnet),
    no_default_updates=True,
    on_unused_input='warn',
    name="forward")
  #print "forward func:"
  #theano.printing.debugprint(forward_func)
  net_j1, out_i1_val, out_i1_nz_val, nll1_val, pcx1_val, t1_cost, t1_y, t2_cost, t2_y = forward_func(0, 1)
  print "forward results:"
  pprint(net_j1)
  pprint(out_i1_val)
  pprint(out_i1_nz_val)
  pprint(nll1_val)
  pprint(pcx1_val)
  pprint(t1_cost)
  pprint(t1_y)
  pprint(t2_cost)
  pprint(t2_y)
  assert_equal(net_j1, numpy.array([[1]]))
  assert_equal(out_i1_val, numpy.array([[1]]))
  assert_equal(out_i1_nz_val, numpy.array([0]))
  assert_almost_equal(nll1_val, numpy.array([t1_cost]))
  numpy.testing.assert_almost_equal(t1_y, pcx1_val)
  assert_almost_equal(t1_cost, 1.440189698561195, places=6)
  assert_almost_equal(t2_cost, 0.45191439593759336, places=6)
  numpy.testing.assert_almost_equal(t1_y, numpy.array([[ 0.0320586 ,  0.08714432,  0.23688282,  0.64391426]]), decimal=6)
  numpy.testing.assert_almost_equal(t2_y, numpy.array([[ 0.01165623,  0.03168492,  0.08612854,  0.23412166,  0.63640865]]), decimal=6)

  # One train step.
  device.set_learning_rate(config.typed_value("learning_rate"))
  device.run("train")
  output_list, outputs_format = device.result()
  assert_is_instance(output_list, list)
  assert_true(outputs_format, "for train, we should always get the format")
  outputs = Device.make_result_dict(output_list, outputs_format)
  pprint(outputs)
  assert_in("cost:out1", outputs)
  assert_greater(outputs["cost:out1"], 0)
  assert_almost_equal(outputs["cost:out1"], t1_cost)

  # Get net params.
  params = device.get_net_train_params(device.trainnet)
  references_params = {
    "W_in_data_fw0":
      numpy.array([[  1.00055406e+00,   5.54056978e-04,   5.54056978e-04],
                   [  1.10811396e-03,   1.00110811e+00,   1.10811396e-03],
                   [ -1.66217093e-03,  -1.66217093e-03,   9.98337829e-01]]),
    "b_fw0":
      numpy.array([ 0.00554057,  0.00554057,  0.00554057]),
    "W_in_fw0_out1":
      numpy.array([[-0.00320586,  0.09128557,  0.27631172,  0.23560857],
                   [ 0.39358828,  0.48257114,  0.75262344,  0.57121715],
                   [ 0.80961758,  0.9261433 ,  0.77106485,  1.29317428]]),
    "b_out1":
      numpy.array([-0.0320586 ,  0.91285568,  2.76311718,  2.35608574]),
    "W_in_fw0_out2":
      numpy.array([[ -1.16562310e-03,   9.68315079e-02,   1.91387146e-01,
                      2.76587834e-01,   4.36359135e-01],
                   [  4.97668754e-01,   5.93663016e-01,   6.82774291e-01,
                      7.53175669e-01,   9.72718271e-01],
                   [  1.00349687e+00,   1.10950548e+00,   1.22583856e+00,
                      1.37023650e+00,   1.29092259e+00]]),
    "b_out2":
      numpy.array([-0.01165623,  0.96831508,  1.91387146,  2.76587834,  4.36359135])
  }
  assert_equal(len(param_vars), len(params))
  for p, v in zip(param_vars, params):
    print "%s:" % p
    pprint(v)
    assert_true(p.name)
    numpy.testing.assert_almost_equal(references_params[p.name], v, decimal=6)
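
The forward numbers checked above follow directly from the parameters that the test sets: fw0 is an identity layer with zero bias, so the softmax input for out1 is just x.dot(W_in_fw0_out1) + b_out1. A standalone numpy check (illustrative, not part of the test) reproduces t1_y and t1_cost:

import numpy

x = numpy.array([0.1, 0.2, -0.3], dtype="float32")
W = numpy.arange(0.0, 1.2, 0.1, dtype="float32").reshape((3, 4))  # W_in_fw0_out1
b = numpy.arange(0.0, 4, dtype="float32")                         # b_out1
logits = x.dot(W) + b                                             # fw0 acts as identity
probs = numpy.exp(logits) / numpy.exp(logits).sum()               # softmax
print(probs)                 # ~[0.0320586, 0.08714432, 0.23688282, 0.64391426]
print(-numpy.log(probs[2]))  # CE for target t1=2 -> ~1.4401897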
Example #21
0
def test_combi_auto_enc():
  config = Config()
  config.update({
    "multiprocessing": False,
    "blocking": True,
    "device": "cpu",
    "num_epochs": 1,
    "num_inputs": 3,
    "num_outputs": {"classes": 2},
    "learning_rate": 1.0,
    "network": {
      "output": {"class": "softmax", "loss": "ce", "target": "classes"},
      "auto-enc": {"class": "softmax", "loss": "sse", "dtype": "float32", "target": "data"}
    }
  })

  device = Device("cpu", config=config, blocking=True)

  # Set net params.
  def get_net_params(with_auto_enc=True):
    d = {
      "output": {"W_in_data_output": numpy.arange(0.1, 0.7, 0.1, dtype="float32").reshape((3, 2)),
                 "b_output": numpy.arange(0.0, 2, dtype="float32")}
    }
    if with_auto_enc:
      d["auto-enc"] = {"W_in_data_auto-enc": numpy.arange(0.1, 1.0, 0.1, dtype="float32").reshape((3, 3)),
                       "b_auto-enc": numpy.arange(0.0, 3, dtype="float32")}
    return d
  device.trainnet.set_params_by_dict(get_net_params())
  device.testnet.set_params_by_dict(get_net_params())

  # Show params.
  for p in device.trainnet.get_all_params_vars():
    print "init %s:" % p
    pprint(p.get_value())

  # Init dataset.
  dataset = StaticDataset(data=[{
    "data": numpy.array([[0.1, 0.2, -0.3]], dtype="float32"),
    "classes": numpy.array([1]),
  }], output_dim=config.typed_value("num_outputs"))
  dataset.init_seq_order()

  # Copy to device allocation.
  success = assign_dev_data_single_seq(device, dataset, 0)
  assert_true(success, "failed to allocate & assign data")

  # One train step.
  device.set_learning_rate(config.typed_value("learning_rate"))
  device.run("train")
  output_list, outputs_format = device.result()
  assert_is_instance(output_list, list)
  assert_true(outputs_format, "for train, we should always get the format")
  outputs = Device.make_result_dict(output_list, outputs_format)
  pprint(outputs)
  assert_in("cost:output", outputs)
  assert_in("cost:auto-enc", outputs)
  expected_cost_output = 0.3132616877555847
  assert_almost_equal(outputs["cost:output"], expected_cost_output, places=6)
  exact_cost_output = outputs["cost:output"]
  assert_almost_equal(outputs["cost:auto-enc"], 5.263200283050537, places=6)

  # Now, drop the auto-enc from the network, and redo the same thing.
  del config.typed_value("network")["auto-enc"]
  device = Device("cpu", config=config, blocking=True)
  device.trainnet.set_params_by_dict(get_net_params(with_auto_enc=False))
  device.testnet.set_params_by_dict(get_net_params(with_auto_enc=False))
  for p in device.trainnet.get_all_params_vars():
    print "second run, init %s:" % p
    pprint(p.get_value())
  dataset.init_seq_order()  # reset. probably not needed
  success = assign_dev_data_single_seq(device, dataset, 0)
  assert_true(success, "failed to allocate & assign data")
  device.set_learning_rate(config.typed_value("learning_rate"))
  device.run("train")
  output_list, outputs_format = device.result()
  assert_is_instance(output_list, list)
  assert_true(outputs_format, "for train, we should always get the format")
  outputs = Device.make_result_dict(output_list, outputs_format)
  pprint(outputs)
  assert_in("cost:output", outputs)
  assert_not_in("cost:auto-enc", outputs)
  assert_almost_equal(outputs["cost:output"], expected_cost_output, places=6)
  assert_equal(outputs["cost:output"], exact_cost_output)
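
expected_cost_output above can likewise be reproduced by hand: the output layer has no explicit "from", so it reads the input data directly, and the CE loss for target class 1 follows from its softmax. A small illustrative numpy check (not part of the test):

import numpy

x = numpy.array([0.1, 0.2, -0.3], dtype="float32")
W = numpy.arange(0.1, 0.7, 0.1, dtype="float32").reshape((3, 2))  # W_in_data_output
b = numpy.arange(0.0, 2, dtype="float32")                         # b_output
logits = x.dot(W) + b        # -> [-0.08, 0.92]
probs = numpy.exp(logits) / numpy.exp(logits).sum()
print(-numpy.log(probs[1]))  # ~0.3132617, matches expected_cost_output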
Example #22
0
def main():
  rnn.init(
    commandLineOptions=sys.argv[1:],
    config_updates={
      "task": "nop", "log": None, "device": "cpu",
      "allow_random_model_init": True,
      "debug_add_check_numerics_on_output": False},
    extra_greeting="Import Blocks MT model.")
  assert Util.BackendEngine.is_tensorflow_selected()
  config = rnn.config

  # Load Blocks MT model params.
  if not config.has("blocks_mt_model"):
    print("Please provide the option blocks_mt_model.")
    sys.exit(1)
  blocks_mt_model_fn = config.value("blocks_mt_model", "")
  assert blocks_mt_model_fn
  assert os.path.exists(blocks_mt_model_fn)
  if os.path.isdir(blocks_mt_model_fn):
    blocks_mt_model_fn += "/params.npz"
    assert os.path.exists(blocks_mt_model_fn)

  dry_run = config.bool("dry_run", False)
  if dry_run:
    our_model_fn = None
    print("Dry-run, will not save model.")
  else:
    our_model_fn = config.value('model', "returnn-model") + ".imported"
    print("Will save Returnn model as %s." % our_model_fn)
    assert os.path.exists(os.path.dirname(our_model_fn) or "."), "model-dir does not exist"
    assert not os.path.exists(our_model_fn + Util.get_model_filename_postfix()), "model-file already exists"

  blocks_mt_model = numpy.load(blocks_mt_model_fn)
  assert isinstance(blocks_mt_model, numpy.lib.npyio.NpzFile), "did not expect type %r in file %r" % (
    type(blocks_mt_model), blocks_mt_model_fn)
  print("Params found in Blocks model:")
  blocks_params = {}  # type: dict[str,numpy.ndarray]
  blocks_params_hierarchy = {}  # type: dict[str,dict[str]]
  blocks_total_num_params = 0
  for key in sorted(blocks_mt_model.keys()):
    value = blocks_mt_model[key]
    key = key.replace("-", "/")
    assert key[0] == "/"
    key = key[1:]
    blocks_params[key] = value
    print("  %s: %s, %s" % (key, value.shape, value.dtype))
    blocks_total_num_params += numpy.prod(value.shape)
    d = blocks_params_hierarchy
    for part in key.split("/"):
      d = d.setdefault(part, {})
  print("Blocks total num params: %i" % blocks_total_num_params)

  # Init our network structure.
  from TFNetworkRecLayer import _SubnetworkRecCell
  _SubnetworkRecCell._debug_out = []  # enable for debugging intermediate values below
  ChoiceLayer._debug_out = []  # also for debug outputs of search
  rnn.engine.use_search_flag = True  # construct the net as in search
  rnn.engine.init_network_from_config()
  print("Our network model params:")
  our_params = {}  # type: dict[str,tf.Variable]
  our_total_num_params = 0
  for v in rnn.engine.network.get_params_list():
    key = v.name[:-2]
    our_params[key] = v
    print("  %s: %s, %s" % (key, v.shape, v.dtype.base_dtype.name))
    our_total_num_params += numpy.prod(v.shape.as_list())
  print("Our total num params: %i" % our_total_num_params)

  # Now matching...
  blocks_used_params = set()  # type: set[str]
  our_loaded_params = set()  # type: set[str]

  def import_var(our_var, blocks_param):
    """
    :param tf.Variable our_var:
    :param str|numpy.ndarray blocks_param:
    """
    assert isinstance(our_var, tf.Variable)
    if isinstance(blocks_param, str):
      blocks_param = load_blocks_var(blocks_param)
    assert isinstance(blocks_param, numpy.ndarray)
    assert_equal(tuple(our_var.shape.as_list()), blocks_param.shape)
    our_loaded_params.add(our_var.name[:-2])
    our_var.load(blocks_param, session=rnn.engine.tf_session)

  def load_blocks_var(blocks_param_name):
    """
    :param str blocks_param_name:
    :rtype: numpy.ndarray
    """
    assert isinstance(blocks_param_name, str)
    assert blocks_param_name in blocks_params
    blocks_used_params.add(blocks_param_name)
    return blocks_params[blocks_param_name]
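
  # get_in_hierarchy (used below) is another helper defined elsewhere in this script.
  # A hypothetical equivalent walks the "/"-separated path through the nested
  # blocks_params_hierarchy dict and returns the sub-dict:
  #   def get_in_hierarchy(path, hierarchy):
  #     d = hierarchy
  #     for part in path.split("/"):
  #       d = d[part]
  #     return d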

  enc_name = "bidirectionalencoder"
  enc_embed_name = "EncoderLookUp0.W"
  assert enc_name in blocks_params_hierarchy
  assert enc_embed_name in blocks_params_hierarchy[enc_name]  # input embedding
  num_encoder_layers = max([
    int(re.search("([0-9]+)$", s).group(1))
    for s in blocks_params_hierarchy[enc_name]
    if s.startswith("EncoderBidirectionalLSTM")])
  blocks_input_dim, blocks_input_embed_dim = blocks_params["%s/%s" % (enc_name, enc_embed_name)].shape
  print("Blocks input dim: %i, embed dim: %i" % (blocks_input_dim, blocks_input_embed_dim))
  print("Blocks num encoder layers: %i" % num_encoder_layers)
  expected_enc_entries = (
    ["EncoderLookUp0.W"] +
    ["EncoderBidirectionalLSTM%i" % i for i in range(1, num_encoder_layers + 1)])
  assert_equal(set(expected_enc_entries), set(blocks_params_hierarchy[enc_name].keys()))

  our_input_layer = find_our_input_embed_layer()
  assert our_input_layer.input_data.dim == blocks_input_dim
  assert our_input_layer.output.dim == blocks_input_embed_dim
  assert not our_input_layer.with_bias
  import_var(our_input_layer.params["W"], "%s/%s" % (enc_name, enc_embed_name))

  dec_name = "decoder/sequencegenerator"
  dec_hierarchy_base = get_in_hierarchy(dec_name, blocks_params_hierarchy)
  assert_equal(set(dec_hierarchy_base.keys()), {"att_trans", "readout"})
  dec_embed_name = "readout/lookupfeedbackwmt15/lookuptable.W"
  get_in_hierarchy(dec_embed_name, dec_hierarchy_base)  # check

  for i in range(num_encoder_layers):
    # Assume standard LSTMCell.
    # i = input_gate, j = new_input, f = forget_gate, o = output_gate
    # lstm_matrix = self._linear1([inputs, m_prev])
    # i, j, f, o = array_ops.split(value=lstm_matrix, num_or_size_splits=4, axis=1)
    # bias (4*in), kernel (in+out,4*out), w_(f|i|o)_diag (out)
    # prefix: rec/rnn/lstm_cell
    # Blocks: gate-in, gate-forget, next-in, gate-out
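    # lstm_vec_blocks_to_tf (defined elsewhere in this script) presumably permutes
    # the last axis from the Blocks gate order (in, forget, next-in, out) to the
    # TF LSTMCell order (i, j, f, o); an assumed numpy equivalent, roughly:
    #   i, f, j, o = numpy.split(v, 4, axis=-1)
    #   return numpy.concatenate([i, j, f, o], axis=-1)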
    for direction in ("fwd", "bwd"):
      our_layer = get_network().layers["lstm%i_%s" % (i, direction[:2])]
      blocks_prefix = "bidirectionalencoder/EncoderBidirectionalLSTM%i" % (i + 1,)
      # (in,out*4), (out*4,)
      W_in, b = [load_blocks_var(
        "%s/%s_fork/fork_inputs.%s" % (blocks_prefix, {"bwd": "back", "fwd": "fwd"}[direction], p))
        for p in ("W", "b")]
      W_re = load_blocks_var(
        "%s/bidirectionalseparateparameters/%s.W_state" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction]))
      W = numpy.concatenate([W_in, W_re], axis=0)
      b = lstm_vec_blocks_to_tf(b)
      W = lstm_vec_blocks_to_tf(W)
      import_var(our_layer.params["rnn/lstm_cell/bias"], b)
      import_var(our_layer.params["rnn/lstm_cell/kernel"], W)
      import_var(our_layer.params["initial_c"], "%s/bidirectionalseparateparameters/%s.initial_cells" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction]))
      import_var(our_layer.params["initial_h"], "%s/bidirectionalseparateparameters/%s.initial_state" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction]))
      for s1, s2 in [("W_cell_to_in", "w_i_diag"), ("W_cell_to_forget", "w_f_diag"), ("W_cell_to_out", "w_o_diag")]:
        import_var(our_layer.params["rnn/lstm_cell/%s" % s2], "%s/bidirectionalseparateparameters/%s.%s" % (blocks_prefix, {"fwd": "forward", "bwd": "backward"}[direction], s1))
  import_var(get_network().layers["enc_ctx"].params["W"], "decoder/sequencegenerator/att_trans/attention/encoder_state_transformer.W")
  import_var(get_network().layers["enc_ctx"].params["b"], "decoder/sequencegenerator/att_trans/attention/encoder_state_transformer.b")
  import_var(our_params["output/rec/s/initial_c"], "decoder/sequencegenerator/att_trans/lstm_decoder.initial_cells")
  import_var(our_params["output/rec/s/initial_h"], "decoder/sequencegenerator/att_trans/lstm_decoder.initial_state")
  import_var(our_params["output/rec/weight_feedback/W"], "decoder/sequencegenerator/att_trans/attention/sum_alignment_transformer.W")
  import_var(our_params["output/rec/target_embed/W"], "decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W")
  import_var(our_params["fertility/W"], "decoder/sequencegenerator/att_trans/attention/fertility_transformer.W")
  import_var(our_params["output/rec/energy/W"], "decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W")
  prev_s_trans_W_states = load_blocks_var("decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W")
  prev_s_trans_W_cells = load_blocks_var("decoder/sequencegenerator/att_trans/attention/state_trans/transform_cells.W")
  prev_s_trans_W = numpy.concatenate([prev_s_trans_W_cells, prev_s_trans_W_states], axis=0)
  import_var(our_params["output/rec/prev_s_transformed/W"], prev_s_trans_W)
  import_var(our_params["output/rec/s/rec/lstm_cell/bias"], numpy.zeros(our_params["output/rec/s/rec/lstm_cell/bias"].shape))
  dec_lstm_kernel_in_feedback = load_blocks_var("decoder/sequencegenerator/att_trans/feedback_to_decoder/fork_inputs.W")
  dec_lstm_kernel_in_ctx = load_blocks_var("decoder/sequencegenerator/att_trans/context_to_decoder/fork_inputs.W")
  dec_lstm_kernel_re = load_blocks_var("decoder/sequencegenerator/att_trans/lstm_decoder.W_state")
  dec_lstm_kernel = numpy.concatenate([dec_lstm_kernel_in_feedback, dec_lstm_kernel_in_ctx, dec_lstm_kernel_re], axis=0)
  dec_lstm_kernel = lstm_vec_blocks_to_tf(dec_lstm_kernel)
  import_var(our_params["output/rec/s/rec/lstm_cell/kernel"], dec_lstm_kernel)
  for s1, s2 in [("W_cell_to_in", "w_i_diag"), ("W_cell_to_forget", "w_f_diag"), ("W_cell_to_out", "w_o_diag")]:
    import_var(our_params["output/rec/s/rec/lstm_cell/%s" % s2], "decoder/sequencegenerator/att_trans/lstm_decoder.%s" % s1)
  readout_in_W_states = load_blocks_var("decoder/sequencegenerator/readout/merge/transform_states.W")
  readout_in_W_feedback = load_blocks_var("decoder/sequencegenerator/readout/merge/transform_feedback.W")
  readout_in_W_att = load_blocks_var("decoder/sequencegenerator/readout/merge/transform_weighted_averages.W")
  readout_in_W = numpy.concatenate([readout_in_W_states, readout_in_W_feedback, readout_in_W_att], axis=0)
  import_var(our_params["output/rec/readout_in/W"], readout_in_W)
  import_var(our_params["output/rec/readout_in/b"], "decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b")
  import_var(our_params["output/rec/output_prob/W"], "decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W")
  import_var(our_params["output/rec/output_prob/b"], "decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b")

  print("Not initialized own params:")
  count = 0
  for key, v in sorted(our_params.items()):
    if key in our_loaded_params:
      continue
    print("  %s: %s, %s" % (key, v.shape, v.dtype.base_dtype.name))
    count += 1
  if not count:
    print("  None.")
  print("Not used Blocks params:")
  count = 0
  for key, value in sorted(blocks_params.items()):
    if key in blocks_used_params:
      continue
    print("  %s: %s, %s" % (key, value.shape, value.dtype))
    count += 1
  if not count:
    print("  None.")
  print("Done.")

  blocks_debug_dump_output = config.value("blocks_debug_dump_output", None)
  if blocks_debug_dump_output:
    print("Will read Blocks debug dump output from %r and compare with Returnn outputs." % blocks_debug_dump_output)
    blocks_initial_outputs = numpy.load("%s/initial_states_data.0.npz" % blocks_debug_dump_output)
    blocks_search_log = pickle.load(open("%s/search.log.pkl" % blocks_debug_dump_output, "rb"), encoding="bytes")
    blocks_search_log = {d[b"step"]: d for d in blocks_search_log}
    input_seq = blocks_initial_outputs["input"]
    beam_size, seq_len = input_seq.shape
    input_seq = input_seq[0]  # all the same, select beam 0
    assert isinstance(input_seq, numpy.ndarray)
    print("Debug input seq: %s" % input_seq.tolist())
    from GeneratingDataset import StaticDataset
    dataset = StaticDataset(
      data=[{"data": input_seq}],
      output_dim={"data": get_network().extern_data.get_default_input_data().get_kwargs()})
    dataset.init_seq_order(epoch=0)
    extract_output_dict = {
      "enc_src_emb": get_network().layers["source_embed"].output.get_placeholder_as_batch_major(),
      "encoder": get_network().layers["encoder"].output.get_placeholder_as_batch_major(),
      "enc_ctx": get_network().layers["enc_ctx"].output.get_placeholder_as_batch_major(),
      "output": get_network().layers["output"].output.get_placeholder_as_batch_major()
    }
    from TFNetworkLayer import concat_sources
    for i in range(num_encoder_layers):
      extract_output_dict["enc_layer_%i" % i] = concat_sources(
        [get_network().layers["lstm%i_fw" % i], get_network().layers["lstm%i_bw" % i]]
      ).get_placeholder_as_batch_major()
    extract_output_dict["enc_layer_0_fwd"] = get_network().layers["lstm0_fw"].output.get_placeholder_as_batch_major()
    our_output = rnn.engine.run_single(
      dataset=dataset, seq_idx=0, output_dict=extract_output_dict)
    blocks_out = blocks_initial_outputs["bidirectionalencoder_EncoderLookUp0__EncoderLookUp0_apply_output"]
    our_out = our_output["enc_src_emb"]
    print("our enc emb shape:", our_out.shape)
    print("Blocks enc emb shape:", blocks_out.shape)
    assert our_out.shape[:2] == (1, seq_len)
    assert blocks_out.shape[:2] == (seq_len, beam_size)
    assert our_out.shape[2] == blocks_out.shape[2]
    assert_almost_equal(our_out[0], blocks_out[:, 0], decimal=5)
    blocks_lstm0_out_ref = calc_lstm(blocks_out[:, 0], blocks_params)
    blocks_lstm0_out = blocks_initial_outputs["bidirectionalencoder_EncoderBidirectionalLSTM1_bidirectionalseparateparameters_forward__forward_apply_states"]
    our_lstm0_out = our_output["enc_layer_0_fwd"]
    assert blocks_lstm0_out.shape == (seq_len, beam_size) + blocks_lstm0_out_ref.shape
    assert our_lstm0_out.shape == (1, seq_len) + blocks_lstm0_out_ref.shape
    assert_almost_equal(blocks_lstm0_out[0, 0], blocks_lstm0_out_ref, decimal=6)
    print("Blocks LSTM0 frame 0 matched to ref calc.")
    assert_almost_equal(our_lstm0_out[0, 0], blocks_lstm0_out_ref, decimal=6)
    print("Our LSTM0 frame 0 matched to ref calc.")
    for i in range(num_encoder_layers):
      blocks_out = blocks_initial_outputs[
        "bidirectionalencoder_EncoderBidirectionalLSTM%i_bidirectionalseparateparameters__bidirectionalseparateparameters_apply_output_0" % (i + 1,)]
      our_out = our_output["enc_layer_%i" % i]
      print("our enc layer %i shape:" % i, our_out.shape)
      print("Blocks enc layer %i shape:" % i, blocks_out.shape)
      assert our_out.shape[:2] == (1, seq_len)
      assert blocks_out.shape[:2] == (seq_len, beam_size)
      assert our_out.shape[2] == blocks_out.shape[2]
      assert_almost_equal(our_out[0], blocks_out[:, 0], decimal=6)
    print("our encoder shape:", our_output["encoder"].shape)
    blocks_encoder_out = blocks_initial_outputs["bidirectionalencoder__bidirectionalencoder_apply_representation"]
    print("Blocks encoder shape:", blocks_encoder_out.shape)
    assert our_output["encoder"].shape[:2] == (1, seq_len)
    assert blocks_encoder_out.shape[:2] == (seq_len, beam_size)
    assert our_output["encoder"].shape[2] == blocks_encoder_out.shape[2]
    assert_almost_equal(our_output["encoder"][0], blocks_encoder_out[:, 0], decimal=6)
    blocks_first_frame_outputs = numpy.load("%s/next_states.0.npz" % blocks_debug_dump_output)
    blocks_enc_ctx_out = blocks_first_frame_outputs["decoder_sequencegenerator_att_trans_attention__attention_preprocess_preprocessed_attended"]
    our_enc_ctx_out = our_output["enc_ctx"]
    print("Blocks enc ctx shape:", blocks_enc_ctx_out.shape)
    assert blocks_enc_ctx_out.shape[:2] == (seq_len, beam_size)
    assert our_enc_ctx_out.shape[:2] == (1, seq_len)
    assert blocks_enc_ctx_out.shape[2:] == our_enc_ctx_out.shape[2:]
    assert_almost_equal(blocks_enc_ctx_out[:, 0], our_enc_ctx_out[0], decimal=5)
    fertility = numpy.dot(blocks_encoder_out[:, 0], blocks_params["decoder/sequencegenerator/att_trans/attention/fertility_transformer.W"])
    fertility = sigmoid(fertility)
    assert fertility.shape == (seq_len, 1)
    fertility = fertility[:, 0]
    assert fertility.shape == (seq_len,)
    our_dec_outputs = {v["step"]: v for v in _SubnetworkRecCell._debug_out}
    assert our_dec_outputs
    print("our dec frame keys:", sorted(our_dec_outputs[0].keys()))
    our_dec_search_outputs = {v["step"]: v for v in ChoiceLayer._debug_out}
    assert our_dec_search_outputs
    print("our dec search frame keys:", sorted(our_dec_search_outputs[0].keys()))
    print("Blocks search frame keys:", sorted(blocks_search_log[0].keys()))
    dec_lookup = blocks_params["decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W"]
    last_lstm_state = blocks_params["decoder/sequencegenerator/att_trans/lstm_decoder.initial_state"]
    last_lstm_cells = blocks_params["decoder/sequencegenerator/att_trans/lstm_decoder.initial_cells"]
    last_accumulated_weights = numpy.zeros((seq_len,), dtype="float32")
    last_output = 0
    dec_seq_len = 0
    for dec_step in range(100):
      blocks_frame_state_outputs_fn = "%s/next_states.%i.npz" % (blocks_debug_dump_output, dec_step)
      blocks_frame_probs_outputs_fn = "%s/logprobs.%i.npz" % (blocks_debug_dump_output, dec_step)
      if dec_step > 3:
        if not os.path.exists(blocks_frame_state_outputs_fn) or not os.path.exists(blocks_frame_probs_outputs_fn):
          print("Seq not ended yet but frame not found for step %i." % dec_step)
          break
      blocks_frame_state_outputs = numpy.load(blocks_frame_state_outputs_fn)
      blocks_frame_probs_outputs = numpy.load(blocks_frame_probs_outputs_fn)
      blocks_search_frame = blocks_search_log[dec_step]
      our_dec_frame_outputs = our_dec_outputs[dec_step]
      assert our_dec_frame_outputs["step"] == dec_step
      assert our_dec_frame_outputs[":i.output"].tolist() == [dec_step]
      our_dec_search_frame_outputs = our_dec_search_outputs[dec_step]

      blocks_last_lstm_state = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_states"]
      blocks_last_lstm_cells = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_cells"]
      assert blocks_last_lstm_state.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(blocks_last_lstm_state[0], last_lstm_state, decimal=5)
      assert_almost_equal(blocks_last_lstm_cells[0], last_lstm_cells, decimal=5)
      our_last_lstm_cells = our_dec_frame_outputs["prev:s.extra.state"][0]
      our_last_lstm_state = our_dec_frame_outputs["prev:s.extra.state"][1]
      assert our_last_lstm_state.shape == our_last_lstm_cells.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(our_last_lstm_state[0], last_lstm_state, decimal=5)
      assert_almost_equal(our_last_lstm_cells[0], last_lstm_cells, decimal=5)
      our_last_s = our_dec_frame_outputs["prev:s.output"]
      assert our_last_s.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(our_last_s[0], last_lstm_state, decimal=5)

      blocks_last_accum_weights = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_accumulated_weights"]
      assert blocks_last_accum_weights.shape == (beam_size, seq_len)
      assert_almost_equal(blocks_last_accum_weights[0], last_accumulated_weights, decimal=5)
      our_last_accum_weights = our_dec_frame_outputs["prev:accum_att_weights.output"]
      assert our_last_accum_weights.shape == (beam_size, seq_len if dec_step > 0 else 1, 1)
      if dec_step > 0:
        assert_almost_equal(our_last_accum_weights[0, :, 0], last_accumulated_weights, decimal=4)
      else:
        assert_almost_equal(our_last_accum_weights[0, 0, 0], last_accumulated_weights.sum(), decimal=4)

      energy_sum = numpy.copy(blocks_enc_ctx_out[:, 0])  # (T,enc-ctx-dim)
      weight_feedback = numpy.dot(last_accumulated_weights[:, None], blocks_params["decoder/sequencegenerator/att_trans/attention/sum_alignment_transformer.W"])
      energy_sum += weight_feedback
      transformed_states = numpy.dot(last_lstm_state[None, :], blocks_params["decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W"])
      transformed_cells = numpy.dot(last_lstm_cells[None, :], blocks_params["decoder/sequencegenerator/att_trans/attention/state_trans/transform_cells.W"])
      energy_sum += transformed_states + transformed_cells
      assert energy_sum.shape == (seq_len, blocks_enc_ctx_out.shape[-1])
      blocks_energy_sum_tanh = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention_energy_comp_tanh__tanh_apply_output"]
      assert blocks_energy_sum_tanh.shape == (seq_len, beam_size, energy_sum.shape[-1])
      assert_almost_equal(blocks_energy_sum_tanh[:, 0], numpy.tanh(energy_sum), decimal=5)
      assert_equal(our_dec_frame_outputs["weight_feedback.output"].shape, (beam_size, seq_len if dec_step > 0 else 1, blocks_enc_ctx_out.shape[-1]))
      assert_equal(our_dec_frame_outputs["prev_s_transformed.output"].shape, (beam_size, blocks_enc_ctx_out.shape[-1]))
      our_energy_sum = our_dec_frame_outputs["energy_in.output"]
      assert our_energy_sum.shape == (beam_size, seq_len, blocks_enc_ctx_out.shape[-1])
      assert_almost_equal(our_energy_sum[0], energy_sum, decimal=4)
      blocks_energy = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention_energy_comp__energy_comp_apply_output"]
      assert blocks_energy.shape == (seq_len, beam_size, 1)
      energy = numpy.dot(numpy.tanh(energy_sum), blocks_params["decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W"])
      assert energy.shape == (seq_len, 1)
      assert_almost_equal(blocks_energy[:, 0], energy, decimal=4)
      our_energy = our_dec_frame_outputs["energy.output"]
      assert our_energy.shape == (beam_size, seq_len, 1)
      assert_almost_equal(our_energy[0], energy, decimal=4)
      weights = softmax(energy[:, 0])
      assert weights.shape == (seq_len,)
      our_weights = our_dec_frame_outputs["att_weights.output"]
      assert our_weights.shape == (beam_size, seq_len, 1)
      assert_almost_equal(our_weights[0, :, 0], weights, decimal=4)
      accumulated_weights = last_accumulated_weights + weights / (2.0 * fertility)
      assert accumulated_weights.shape == (seq_len,)
      #blocks_accumulated_weights = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention__attention_take_glimpses_accumulated_weights"]
      #assert blocks_accumulated_weights.shape == (beam_size, seq_len)
      #assert_almost_equal(blocks_accumulated_weights[0], accumulated_weights, decimal=5)
      blocks_weights = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention__attention_compute_weights_output_0"]
      assert blocks_weights.shape == (seq_len, beam_size)
      assert_almost_equal(weights, blocks_weights[:, 0], decimal=4)
      our_accum_weights = our_dec_frame_outputs["accum_att_weights.output"]
      assert our_accum_weights.shape == (beam_size, seq_len, 1)
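      # The context vector is the attention-weighted average over the encoder outputs.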
      weighted_avg = (weights[:, None] * blocks_encoder_out[:, 0]).sum(axis=0)  # "att" in our network
      assert weighted_avg.shape == (blocks_encoder_out.shape[-1],)
      blocks_weighted_avg = blocks_frame_probs_outputs["decoder_sequencegenerator_att_trans_attention__attention_compute_weighted_averages_output_0"]
      assert blocks_weighted_avg.shape == (beam_size, blocks_encoder_out.shape[-1])
      assert_almost_equal(blocks_weighted_avg[0], weighted_avg, decimal=4)
      our_att = our_dec_frame_outputs["att.output"]
      assert our_att.shape == (beam_size, blocks_encoder_out.shape[-1])
      assert_almost_equal(our_att[0], weighted_avg, decimal=4)

      blocks_last_output = blocks_frame_probs_outputs["decoder_sequencegenerator__sequencegenerator_generate_outputs"]
      assert blocks_last_output.shape == (beam_size,)
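      # Blocks presumably reports -1 as the output of the initial frame; clamp it
      # to 0 before comparing (assumption, cf. the dec_step == 0 handling below).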
      assert max(blocks_last_output[0], 0) == last_output
      last_target_embed = dec_lookup[last_output]
      if dec_step == 0:
        last_target_embed = numpy.zeros_like(last_target_embed)
      our_last_target_embed = our_dec_frame_outputs["prev:target_embed.output"]
      assert our_last_target_embed.shape == (beam_size, dec_lookup.shape[-1])
      assert_almost_equal(our_last_target_embed[0], last_target_embed, decimal=4)

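      # The readout input merges three linear projections: the decoder LSTM state,
      # the feedback (previous target embedding) and the attention context.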
      readout_in_state = numpy.dot(last_lstm_state, blocks_params["decoder/sequencegenerator/readout/merge/transform_states.W"])
      blocks_trans_state = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_states"]
      assert blocks_trans_state.shape == (beam_size, last_lstm_state.shape[0])
      assert_almost_equal(blocks_trans_state[0], readout_in_state, decimal=4)
      readout_in_feedback = numpy.dot(last_target_embed, blocks_params["decoder/sequencegenerator/readout/merge/transform_feedback.W"])
      blocks_trans_feedback = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_feedback"]
      assert blocks_trans_feedback.shape == (beam_size, readout_in_feedback.shape[0])
      assert_almost_equal(blocks_trans_feedback[0], readout_in_feedback, decimal=4)
      readout_in_weighted_avg = numpy.dot(weighted_avg, blocks_params["decoder/sequencegenerator/readout/merge/transform_weighted_averages.W"])
      blocks_trans_weighted_avg = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_weighted_averages"]
      assert blocks_trans_weighted_avg.shape == (beam_size, readout_in_weighted_avg.shape[0])
      assert_almost_equal(blocks_trans_weighted_avg[0], readout_in_weighted_avg, decimal=4)
      readout_in = readout_in_state + readout_in_feedback + readout_in_weighted_avg
      blocks_readout_in = blocks_frame_probs_outputs["decoder_sequencegenerator_readout_merge__merge_apply_output"]
      assert blocks_readout_in.shape == (beam_size, readout_in.shape[0])
      assert_almost_equal(blocks_readout_in[0], readout_in, decimal=4)
      readout_in += blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b"]
      assert readout_in.shape == (blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b"].shape[0],)
      our_readout_in = our_dec_frame_outputs["readout_in.output"]
      assert our_readout_in.shape == (beam_size, readout_in.shape[0])
      assert_almost_equal(our_readout_in[0], readout_in, decimal=4)
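      # Maxout with pool size 2: pair up adjacent units and take the maximum,
      # e.g. [a, b, c, d] -> [max(a, b), max(c, d)], halving the dimension.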
      readout = readout_in.reshape((readout_in.shape[0] // 2, 2)).max(axis=1)
      our_readout = our_dec_frame_outputs["readout.output"]
      assert our_readout.shape == (beam_size, readout.shape[0])
      assert_almost_equal(our_readout[0], readout, decimal=4)
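      # The output distribution is an affine projection of the maxout readout
      # to the target vocabulary, followed by a (log-)softmax.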
      prob_logits = numpy.dot(readout, blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W"]) + \
        blocks_params["decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b"]
      assert prob_logits.ndim == 1
      blocks_prob_logits = blocks_frame_probs_outputs["decoder_sequencegenerator_readout__readout_readout_output_0"]
      assert blocks_prob_logits.shape == (beam_size, prob_logits.shape[0])
      assert_almost_equal(blocks_prob_logits[0], prob_logits, decimal=4)
      output_prob = softmax(prob_logits)
      log_output_prob = log_softmax(prob_logits)
      assert_almost_equal(numpy.log(output_prob), log_output_prob, decimal=4)
      our_output_prob = our_dec_frame_outputs["output_prob.output"]
      assert our_output_prob.shape == (beam_size, output_prob.shape[0])
      assert_almost_equal(our_output_prob[0], output_prob, decimal=4)
      blocks_nlog_prob = blocks_frame_probs_outputs["logprobs"]
      assert blocks_nlog_prob.shape == (beam_size, output_prob.shape[0])
      assert_almost_equal(blocks_nlog_prob[0], -log_output_prob, decimal=4)
      assert_almost_equal(our_dec_search_frame_outputs["scores_in_orig"][0], output_prob, decimal=4)
      assert_almost_equal(blocks_search_frame[b'logprobs'][0], -log_output_prob, decimal=4)
      #for b in range(beam_size):
      #  assert_almost_equal(-numpy.log(our_output_prob[b]), blocks_frame_probs_outputs["logprobs"][b], decimal=4)
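      # Greedy reference: the argmax of the first beam's output distribution.
      # It should show up among the expanded hypotheses of both implementations.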
      ref_output = numpy.argmax(output_prob)
      # Note: Don't take the readout.emit outputs. They are randomly sampled.
      blocks_dec_output = blocks_search_frame[b'outputs']
      assert blocks_dec_output.shape == (beam_size,)
      our_dec_output = our_dec_frame_outputs["output.output"]
      assert our_dec_output.shape == (beam_size,)
      print("Frame %i: Ref best greedy output symbol: %i" % (dec_step, int(ref_output)))
      print("Blocks labels:", blocks_dec_output.tolist())
      print("Our labels:", our_dec_output.tolist())
      # Well, the following two assertions might not hold if all the other beams
      # have much better scores, but that is unlikely.
      assert ref_output in blocks_dec_output
      assert ref_output in our_dec_output
      if dec_step == 0:
        # This assumes that the results are ordered by score, which might not be true (see tf.nn.top_k).
        assert blocks_dec_output[0] == our_dec_output[0] == ref_output
      # We assume that the best hypothesis is the same. Note that this might not hold
      # if two hypotheses have exactly the same score. It also assumes that the results
      # are ordered by score, which again might not be true (see tf.nn.top_k).
      # For the same reason, the remaining entries might not match perfectly either.
      assert our_dec_output[0] == blocks_dec_output[0]
      # Just follow the first beam.
      ref_output = blocks_dec_output[0]
      assert our_dec_search_frame_outputs["src_beam_idxs"].shape == (1, beam_size)
      assert our_dec_search_frame_outputs["scores"].shape == (1, beam_size)
      print("Blocks src_beam_idxs:", blocks_search_frame[b'indexes'].tolist())
      print("Our src_beam_idxs:", our_dec_search_frame_outputs["src_beam_idxs"][0].tolist())
      print("Blocks scores:", blocks_search_frame[b'chosen_costs'].tolist())
      print("Our scores:", our_dec_search_frame_outputs["scores"][0].tolist())
      if list(our_dec_search_frame_outputs["src_beam_idxs"][0]) != list(blocks_search_frame[b'indexes']):
        print("Warning, beams do not match.")
        print("Blocks scores base:", blocks_search_frame[b'scores_base'].flatten().tolist())
        print("Our scores base:", our_dec_search_frame_outputs["scores_base"].flatten().tolist())
        #print("Blocks score in orig top k:", sorted(blocks_search_frame[b'logprobs'].flatten())[:beam_size])
        #print("Our score in orig top k:", sorted(-numpy.log(our_dec_search_frame_outputs["scores_in_orig"].flatten()))[:beam_size])
        print("Blocks score in top k:", sorted((blocks_search_frame[b'logprobs'] * blocks_search_log[dec_step - 1][b'mask'][:, None]).flatten())[:beam_size])
        print("Our score in top k:", sorted(-our_dec_search_frame_outputs["scores_in"].flatten())[:beam_size])
        blocks_scores_combined = blocks_search_frame[b'next_costs']
        our_scores_combined = our_dec_search_frame_outputs["scores_combined"]
        print("Blocks scores combined top k:", sorted(blocks_scores_combined.flatten())[:beam_size])
        print("Our neg scores combined top k:", sorted(-our_scores_combined.flatten())[:beam_size])
        #raise Exception("beams mismatch")
      assert our_dec_search_frame_outputs["src_beam_idxs"][0][0] == blocks_search_frame[b'indexes'][0]
      beam_idx = our_dec_search_frame_outputs["src_beam_idxs"][0][0]
      if beam_idx != 0:
        print("Selecting different beam: %i." % beam_idx)
        # Just overwrite the needed states with the Blocks outputs.
        accumulated_weights = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_attention__attention_take_glimpses_accumulated_weights"][0]
        weighted_avg = blocks_frame_state_outputs["decoder_sequencegenerator__sequencegenerator_generate_weighted_averages"][0]
        last_lstm_state = blocks_frame_state_outputs["decoder_sequencegenerator__sequencegenerator_generate_states"][0]
        last_lstm_cells = blocks_frame_state_outputs["decoder_sequencegenerator__sequencegenerator_generate_cells"][0]

      # From now on, use blocks_frame_state_outputs instead of blocks_frame_probs_outputs because
      # it will have the beam reordered.
      blocks_target_emb = blocks_frame_state_outputs["decoder_sequencegenerator_fork__fork_apply_feedback_decoder_input"]
      assert blocks_target_emb.shape == (beam_size, dec_lookup.shape[1])
      target_embed = dec_lookup[ref_output]
      assert target_embed.shape == (dec_lookup.shape[1],)
      assert_almost_equal(blocks_target_emb[0], target_embed)

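      # The decoder LSTM input z is the sum of two forks: a projection of the chosen
      # target embedding and a projection of the attention context, each of size
      # 4 * state_dim (presumably one slice per LSTM gate).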
      feedback_to_decoder = numpy.dot(target_embed, blocks_params["decoder/sequencegenerator/att_trans/feedback_to_decoder/fork_inputs.W"])
      context_to_decoder = numpy.dot(weighted_avg, blocks_params["decoder/sequencegenerator/att_trans/context_to_decoder/fork_inputs.W"])
      lstm_z = feedback_to_decoder + context_to_decoder
      assert lstm_z.shape == feedback_to_decoder.shape == context_to_decoder.shape == (last_lstm_state.shape[-1] * 4,)
      blocks_feedback_to_decoder = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_feedback_to_decoder__feedback_to_decoder_apply_inputs"]
      blocks_context_to_decoder = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_context_to_decoder__context_to_decoder_apply_inputs"]
      assert blocks_feedback_to_decoder.shape == blocks_context_to_decoder.shape == (beam_size, last_lstm_state.shape[-1] * 4)
      assert_almost_equal(blocks_feedback_to_decoder[0], feedback_to_decoder, decimal=4)
      assert_almost_equal(blocks_context_to_decoder[0], context_to_decoder, decimal=4)
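      # calc_raw_lstm (defined earlier in this file) performs one step of a plain
      # LSTM cell, roughly: i, f, o, g = split(z + W_rec . s_prev + b);
      # c = sigmoid(f) * c_prev + sigmoid(i) * tanh(g); s = sigmoid(o) * tanh(c).
      # (A sketch only; the exact gate order follows the Blocks parameters.)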
      lstm_state, lstm_cells = calc_raw_lstm(
        lstm_z, blocks_params=blocks_params,
        prefix="decoder/sequencegenerator/att_trans/lstm_decoder.",
        last_state=last_lstm_state, last_cell=last_lstm_cells)
      assert lstm_state.shape == last_lstm_state.shape == lstm_cells.shape == last_lstm_cells.shape
      blocks_lstm_state = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_lstm_decoder__lstm_decoder_apply_states"]
      blocks_lstm_cells = blocks_frame_state_outputs["decoder_sequencegenerator_att_trans_lstm_decoder__lstm_decoder_apply_cells"]
      assert blocks_lstm_state.shape == blocks_lstm_cells.shape == (beam_size, last_lstm_state.shape[-1])
      assert_almost_equal(blocks_lstm_state[0], lstm_state, decimal=4)
      assert_almost_equal(blocks_lstm_cells[0], lstm_cells, decimal=4)
      our_lstm_cells = our_dec_frame_outputs["s.extra.state"][0]
      our_lstm_state = our_dec_frame_outputs["s.extra.state"][1]
      assert our_lstm_state.shape == our_lstm_cells.shape == (beam_size, lstm_state.shape[0])
      assert_almost_equal(our_lstm_state[0], lstm_state, decimal=4)
      assert_almost_equal(our_lstm_cells[0], lstm_cells, decimal=4)
      our_s = our_dec_frame_outputs["s.output"]
      assert our_s.shape == (beam_size, lstm_state.shape[0])
      assert_almost_equal(our_s[0], lstm_state, decimal=4)

      last_accumulated_weights = accumulated_weights
      last_lstm_state = lstm_state
      last_lstm_cells = lstm_cells
      last_output = ref_output
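      # Label 0 presumably denotes the end-of-sequence symbol here.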
      if last_output == 0:
        print("Sequence finished, seq len %i." % dec_step)
        dec_seq_len = dec_step
        break
    assert dec_seq_len > 0
    print("All outputs seem to match.")
  else:
    print("blocks_debug_dump_output not specified. It will not compare the model outputs.")

  if dry_run:
    print("Dry-run, not saving model.")
  else:
    rnn.engine.save_model(our_model_fn)
  print("Finished importing.")