Example 1
def __init__(self, config, datadir, actspace, writer):
    self._c = config
    self._actspace = actspace
    # Discrete action spaces expose `n`; continuous spaces expose `shape`.
    self._actdim = actspace.n if hasattr(actspace, 'n') else actspace.shape[0]
    self._writer = writer
    self._random = np.random.RandomState(config.seed)
    # Step counter, initialized from the data already stored in datadir,
    # kept on the CPU.
    with tf.device('cpu:0'):
        self._step = tf.Variable(count_steps(datadir, config), dtype=tf.int64)
    self._should_pretrain = tools.Once()
    self._should_train = tools.Every(config.train_every)
    self._should_log = tools.Every(config.log_every)
    self._last_log = None
    self._last_time = time.time()
    self._metrics = collections.defaultdict(tf.metrics.Mean)
    self._metrics['expl_amount']  # Create variable for checkpoint.
    self._float = prec.global_policy().compute_dtype
    # Mirror the model across all visible GPUs.
    self._strategy = tf.distribute.MirroredStrategy()
    with self._strategy.scope():
        self._dataset = iter(
            self._strategy.experimental_distribute_dataset(
                load_dataset(datadir, self._c)))
        self._build_model()
    # Print the key hyperparameters for this run.
    print(f'model_lr:{self._c.model_lr}')
    print(f'actor_lr:{self._c.actor_lr}')
    print(f'value_lr:{self._c.value_lr}')
    print(f'grad_clip:{self._c.grad_clip}')
    print(f'batch_size:{self._c.batch_size}')
    print(f'deter_size:{self._c.deter_size}')
    print(f'stoch_size:{self._c.stoch_size}')
    print(f'kl_scale:{self._c.kl_scale}')
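The example above gates pretraining, training, and logging with `tools.Once()` and `tools.Every(...)`. For readers without the Dreamer `tools` module at hand, the following is a minimal sketch of how such helpers can behave; the class names match the calls above, but the bodies are an assumption, not the repository's exact implementation.

class Once:
    """Return True on the first call and False afterwards."""

    def __init__(self):
        self._once = True

    def __call__(self):
        if self._once:
            self._once = False
            return True
        return False


class Every:
    """Return True whenever `step` has advanced by at least `every`
    since the last time this returned True."""

    def __init__(self, every):
        self._every = every
        self._last = None

    def __call__(self, step):
        if not self._every:
            return False
        if self._last is None:
            self._last = step
            return True
        if step >= self._last + self._every:
            self._last += self._every
            return True
        return False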
Example 2
def __init__(self, config, datadir, actspace, writer):
    self._c = config
    self._actspace = actspace
    # Discrete action spaces expose `n`; continuous spaces expose `shape`.
    self._actdim = actspace.n if hasattr(actspace, 'n') else actspace.shape[0]
    self._writer = writer
    self._random = np.random.RandomState(config.seed)
    with tf.device('cpu:0'):
        self._step = tf.Variable(count_steps(datadir, config), dtype=tf.int64)
    self._should_pretrain = tools.Once()
    self._should_train = tools.Every(config.train_every)
    self._should_log = tools.Every(config.log_every)
    self._last_log = None
    self._last_time = time.time()
    self._metrics = collections.defaultdict(tf.metrics.Mean)
    self._metrics['expl_amount']  # Create variable for checkpoint.
    self._float = prec.global_policy().compute_dtype
    # Unlike Example 1, pin the strategy to a single GPU.
    self._strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0'])
    print('device_num:', self._strategy.num_replicas_in_sync)

    with self._strategy.scope():
        self._dataset = iter(
            self._strategy.experimental_distribute_dataset(
                load_dataset(datadir, self._c)))
        self._build_model()
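The only difference from Example 1 is that the MirroredStrategy is restricted to a single GPU. The pattern of distributing a dataset under a strategy scope and pulling batches from the resulting iterator can be reproduced in isolation; the snippet below is a self-contained sketch where the toy dataset and the mean computation are stand-ins for `load_dataset` and the real training step, and a machine with at least one visible GPU is assumed.

import tensorflow as tf

# Restrict the mirrored strategy to one device (assumes /gpu:0 exists).
strategy = tf.distribute.MirroredStrategy(devices=['/gpu:0'])
print('device_num:', strategy.num_replicas_in_sync)

# Stand-in for load_dataset(datadir, config): random batches of features.
toy_dataset = tf.data.Dataset.from_tensor_slices(tf.random.normal([64, 8])).batch(16)

with strategy.scope():
    dist_dataset = iter(strategy.experimental_distribute_dataset(toy_dataset))

@tf.function
def train_step(batch):
    # Per-replica computation, then reduce the result across replicas.
    per_replica = strategy.run(lambda x: tf.reduce_mean(x), args=(batch,))
    return strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica, axis=None)

print(train_step(next(dist_dataset)).numpy())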
Example 3
def __init__(self, config, logger, dataset):
    self._config = config
    self._logger = logger
    self._float = prec.global_policy().compute_dtype
    self._should_log = tools.Every(config.log_every)
    self._should_train = tools.Every(config.train_every)
    self._should_pretrain = tools.Once()
    self._should_reset = tools.Every(config.reset_every)
    # Explore until `expl_until` environment steps, converted to agent
    # steps via the action repeat.
    self._should_expl = tools.Until(
        int(config.expl_until / config.action_repeat))
    self._metrics = collections.defaultdict(tf.metrics.Mean)
    with tf.device('cpu:0'):
        self._step = tf.Variable(count_steps(config.traindir), dtype=tf.int64)
    # Schedules: wrap the raw config values so each access evaluates the
    # schedule at the current step.
    config.actor_entropy = (
        lambda x=config.actor_entropy: tools.schedule(x, self._step))
    config.actor_state_entropy = (
        lambda x=config.actor_state_entropy: tools.schedule(x, self._step))
    config.imag_gradient_mix = (
        lambda x=config.imag_gradient_mix: tools.schedule(x, self._step))
    self._dataset = iter(dataset)
    self._wm = models.WorldModel(self._step, config)
    self._task_behavior = models.ImagBehavior(config, self._wm,
                                              config.behavior_stop_grad)
    reward = lambda f, s, a: self._wm.heads['reward'](f).mode()
    # Select the exploration behavior named in the config.
    self._expl_behavior = dict(
        greedy=lambda: self._task_behavior,
        random=lambda: expl.Random(config),
        plan2explore=lambda: expl.Plan2Explore(config, self._wm, reward),
    )[config.expl_behavior]()
    # Train step to initialize variables including optimizer statistics.
    self._train(next(self._dataset))
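Example 3 rebinds `config.actor_entropy` and the other schedule entries to lambdas so that every read re-evaluates `tools.schedule(...)` at the current training step. The helper below is a simplified, guessed version of such a schedule parser; the `linear(...)` spec format and the function body are assumptions for illustration, not the repository's exact code.

import re
import tensorflow as tf

def schedule(string, step):
    """Interpret a spec such as '0.3' (constant) or 'linear(0.3,0.1,1e6)'
    (anneal from 0.3 to 0.1 over 1e6 steps)."""
    try:
        return float(string)  # Plain constant.
    except ValueError:
        match = re.match(r'linear\((.+),(.+),(.+)\)', string)
        if match:
            initial, final, duration = (float(g) for g in match.groups())
            mix = tf.clip_by_value(tf.cast(step, tf.float32) / duration, 0, 1)
            return (1 - mix) * initial + mix * final
        raise NotImplementedError(string)

# Mirroring the lambda pattern from Example 3: the schedule is read at the
# current value of the step variable each time the lambda is called.
step = tf.Variable(500000, dtype=tf.int64)
actor_entropy = lambda x='linear(3e-3,3e-4,1e6)': schedule(x, step)
print(float(actor_entropy()))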