Example #1
def make_gym_env(name,
                 seed=None,
                 frame_op=None,
                 frame_op_len=None,
                 reward_scale=None,
                 normalize_state=False):
    '''General method to create any Gym env; auto wraps Atari'''
    env = gym.make(name)
    if seed is not None:
        env.seed(seed)
    if 'NoFrameskip' in env.spec.id:  # Atari
        env = wrap_atari(env)
        # no reward clipping to allow monitoring; Atari memory clips it
        episode_life = not util.in_eval_lab_modes()
        env = wrap_deepmind(env, episode_life, frame_op_len)
    elif len(env.observation_space.shape) == 3:  # image-state env
        env = PreprocessImage(env)
        if normalize_state:
            env = NormalizeStateEnv(env)
        if frame_op_len is not None:  # use concat for image (1, 84, 84)
            env = FrameStack(env, 'concat', frame_op_len)
    else:  # vector-state env
        if normalize_state:
            env = NormalizeStateEnv(env)
        if frame_op is not None:
            env = FrameStack(env, frame_op, frame_op_len)
    if reward_scale is not None:
        env = ScaleRewardEnv(env, reward_scale)
    return env
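A minimal usage sketch of the same entry point under the older Gym API (pre-0.26), where env.seed() still exists; the wrappers referenced above (wrap_atari, FrameStack, NormalizeStateEnv, etc.) are SLM-Lab internals and are not reproduced here:

import gym

env = gym.make('CartPole-v0')   # a 'NoFrameskip' Atari id would instead take the wrap_atari branch above
env.seed(42)                    # seeding as done in make_gym_env
state = env.reset()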
Example #2
 def train(self):
     '''Train actor critic by computing the loss in batch efficiently'''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         pdparams, v_preds = self.calc_pdparam_v(batch)
         advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
         policy_loss = self.calc_policy_loss(batch, pdparams, advs)  # from actor
         val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
         if self.shared:  # shared network
             loss = policy_loss + val_loss
             self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
         else:
             self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
             self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
             loss = policy_loss + val_loss
         # reset
         self.to_train = 0
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
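The shared-network branch above simply sums the actor and critic losses into one backward pass. A minimal PyTorch sketch of that idea, using a made-up stand-in network and placeholder losses rather than SLM-Lab's net classes:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)                        # stand-in shared network
optim = torch.optim.Adam(net.parameters(), lr=1e-3)
out = net(torch.randn(8, 4))
policy_loss = out.mean()                     # placeholder actor loss
val_loss = (out ** 2).mean()                 # placeholder critic loss
loss = policy_loss + val_loss                # shared network: one combined optimizer step
optim.zero_grad()
loss.backward()
optim.step()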
Example #3
 def __init__(self, spec):
     super().__init__(spec)
     try_register_env(spec)  # register if it's a custom gym env
     seed = ps.get(spec, 'meta.random_seed')
     episode_life = not util.in_eval_lab_modes()
     if self.is_venv:  # make vector environment
         self.u_env = make_gym_venv(name=self.name,
                                    num_envs=self.num_envs,
                                    seed=seed,
                                    frame_op=self.frame_op,
                                    frame_op_len=self.frame_op_len,
                                    image_downsize=self.image_downsize,
                                    reward_scale=self.reward_scale,
                                    normalize_state=self.normalize_state,
                                    episode_life=episode_life)
     else:
         self.u_env = make_gym_env(name=self.name,
                                   seed=seed,
                                   frame_op=self.frame_op,
                                   frame_op_len=self.frame_op_len,
                                   image_downsize=self.image_downsize,
                                   reward_scale=self.reward_scale,
                                   normalize_state=self.normalize_state,
                                   episode_life=episode_life)
     if self.name.startswith('Unity'):
         # Unity is always initialized as singleton gym env, but the Unity runtime can be vec_env
         self.num_envs = self.u_env.num_envs
         # update variables dependent on num_envs
         self._infer_venv_attr()
         self._set_clock()
     self._set_attr_from_u_env(self.u_env)
     self.max_t = self.max_t or self.u_env.spec.max_episode_steps
     assert self.max_t is not None
     logger.info(util.self_desc(self))
Example #4
 def train(self):
     '''
     Completes one training step for the agent if it is time to train,
     i.e. when the environment timestep is greater than the minimum training timestep and is a multiple of training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each batch, the target Q values (q_targets) are computed and a single training step is taken k times.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_iter):
             batch = self.sample()
             clock.set_batch_size(len(batch))
             for _ in range(self.training_batch_iter):
                 loss = self.calc_q_loss(batch)
                 self.net.train_step(loss,
                                     self.optim,
                                     self.lr_scheduler,
                                     clock=clock,
                                     global_net=self.global_net)
                 total_loss += loss
         loss = total_loss / (self.training_iter * self.training_batch_iter)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #5
 def update(self, algorithm, clock):
     '''Get an updated value for var'''
     if (util.in_eval_lab_modes()) or self._updater_name == 'no_decay':
         return self.end_val
     step = clock.get()
     val = self._updater(self.start_val, self.end_val, self.start_step, self.end_step, step)
     return val
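The _updater above is called with the signature (start_val, end_val, start_step, end_step, step). A self-contained sketch of one plausible updater of that shape, a linear decay; this is an illustrative stand-in, not SLM-Lab's actual implementation:

import numpy as np

def linear_decay(start_val, end_val, start_step, end_step, step):
    '''Linearly interpolate from start_val to end_val between start_step and end_step, then clamp'''
    if step < start_step:
        return start_val
    frac = min(1.0, (step - start_step) / (end_step - start_step))
    return start_val + frac * (end_val - start_val)

assert np.isclose(linear_decay(1.0, 0.1, 0, 100, 50), 0.55)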
Example #6
 def train(self):
     '''
     Completes one training step for the agent if it is time to train,
     i.e. when the environment timestep is greater than the minimum training timestep and is a multiple of training_frequency.
     Each training step consists of sampling n batches from the agent's memory.
     For each batch, the target Q values (q_targets) are computed and a single training step is taken k times.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         self.body.flush()
         return np.nan
     clock = self.body.env.clock
     tick = clock.get(clock.max_tick_unit)
     self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0)
     if self.to_train == 1:
         total_loss = torch.tensor(0.0, device=self.net.device)
         for _ in range(self.training_epoch):
             batch = self.sample()
             for _ in range(self.training_batch_epoch):
                 loss = self.calc_q_loss(batch)
                 self.net.training_step(loss=loss, lr_clock=clock)
                 total_loss += loss
         loss = total_loss / (self.training_epoch * self.training_batch_epoch)
         # reset
         self.to_train = 0
         self.body.flush()
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
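This variant differs from the other train() methods mainly by computing to_train inline from the clock tick. A tiny standalone check of the same schedule condition, with made-up numbers:

def should_train(tick, training_start_step, training_frequency):
    '''Train only after the warmup period and on every training_frequency-th tick'''
    return tick > training_start_step and tick % training_frequency == 0

assert should_train(tick=200, training_start_step=100, training_frequency=50)
assert not should_train(tick=120, training_start_step=100, training_frequency=50)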
Example #7
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         loss = self.calc_q_loss(batch)
         self.net.train_step(loss,
                             self.optim,
                             self.lr_scheduler,
                             clock=clock,
                             global_net=self.global_net)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #8
def random(state, algorithm, body):
    '''Random action using gym.action_space.sample(), with the same format as default()'''
    if body.env.is_venv and not util.in_eval_lab_modes():
        _action = [body.action_space.sample() for _ in range(body.env.num_envs)]
    else:
        _action = [body.action_space.sample()]
    action = torch.tensor(_action)
    return action
Example #9
 def act(self, state):
     '''Random action'''
     body = self.body
     if body.env.is_venv and not util.in_eval_lab_modes():
         action = np.array([body.action_space.sample() for _ in range(body.env.num_envs)])
     else:
         action = body.action_space.sample()
     return action
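Both random-action helpers (Examples #8 and #9) lean on gym's action_space.sample(); in a vector env they draw one sample per sub-environment. A standalone illustration, using CartPole purely as a convenient example env:

import gym
import torch

env = gym.make('CartPole-v0')
_action = [env.action_space.sample() for _ in range(4)]  # as if num_envs = 4
action = torch.tensor(_action)
print(action.shape)  # torch.Size([4])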
Example #10
 def train(self):
     '''Trains the algorithm'''
     if util.in_eval_lab_modes():
         self.body.flush()
         return np.nan
     if self.shared:
         return self.train_shared()
     else:
         return self.train_separate()
Example #11
def guard_tensor(state, body):
    '''Guard-cast tensor before being input to network'''
    if isinstance(state, LazyFrames):
        state = state.__array__()  # realize data
    state = torch.from_numpy(state.astype(np.float32))
    if not body.env.is_venv or util.in_eval_lab_modes():
        # singleton state, unsqueeze as minibatch for net input
        state = state.unsqueeze(dim=0)
    return state
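The key step is promoting a single state to a minibatch of one before it reaches the network. A self-contained sketch of the cast-and-unsqueeze pattern on a plain numpy vector:

import numpy as np
import torch

state = np.zeros(4)                                  # singleton vector state
state_t = torch.from_numpy(state.astype(np.float32))
state_t = state_t.unsqueeze(dim=0)                   # shape (1, 4): minibatch of one for the net
print(state_t.shape)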
Example #12
def try_scale_reward(cls, reward):
    '''Env class to scale reward'''
    if util.in_eval_lab_modes():  # scale only during training
        return reward
    if cls.reward_scale is not None:
        if cls.sign_reward:
            reward = np.sign(reward)
        else:
            reward *= cls.reward_scale
    return reward
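A quick numeric illustration of the two branches above (sign clipping vs multiplicative scaling); the numbers are arbitrary:

import numpy as np

reward = np.array([-3.0, 0.0, 7.5])
print(np.sign(reward))   # sign_reward branch: clips to -1, 0, 1
print(reward * 0.1)      # reward_scale branch with reward_scale = 0.1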
Example #13
 def space_train(self):
     if util.in_eval_lab_modes():
         return np.nan
     losses = []
     for body in self.agent.nanflat_body_a:
         self.body = body
         losses.append(self.train())
     # set body reference back to default
     self.body = self.agent.nanflat_body_a[0]
     loss_a = self.nanflat_to_data_a('loss', losses)
     return loss_a
Example #14
 def update(self, state, action, reward, next_state, done):
     '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
     self.body.update(state, action, reward, next_state, done)
     if util.in_eval_lab_modes():  # eval does not update agent for training
         return
     self.body.memory.update(state, action, reward, next_state, done)
     loss = self.algorithm.train()
     if not np.isnan(loss):  # set for log_summary()
         self.body.loss = loss
     explore_var = self.algorithm.update()
     return loss, explore_var
Example #15
 def to_ckpt(self, env, mode='eval'):
     '''Check with clock whether to run log/eval ckpt: at the start, save_freq, and the end'''
     if mode == 'eval' and util.in_eval_lab_modes():  # avoid double-eval: eval-ckpt in eval mode
         return False
     clock = env.clock
     frame = clock.get()
     frequency = env.eval_frequency if mode == 'eval' else env.log_frequency
     to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame
     return to_ckpt
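util.frame_mod is an SLM-Lab helper; conceptually it asks whether a multiple of the checkpoint frequency was crossed within the last clock tick, which advances by num_envs frames in a vector env. A hedged sketch of that idea (my guess at the logic, not the actual implementation):

def frame_mod_guess(frame, frequency, num_envs):
    '''True if a multiple of frequency falls inside the last tick of size num_envs'''
    return frame % frequency < num_envs

assert frame_mod_guess(frame=10000, frequency=10000, num_envs=1)
assert frame_mod_guess(frame=10004, frequency=10000, num_envs=8)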
Example #16
 def post_init_nets(self):
     '''
     Method to conditionally load models.
     Call at the end of init_nets() after setting self.net_names
     '''
     assert hasattr(self, 'net_names')
     if util.in_eval_lab_modes():
         logger.info(f'Loaded algorithm models for lab_mode: {util.get_lab_mode()}')
         self.load()
     else:
         logger.info(f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}')
Example #17
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         net_util.copy(self.net, self.old_net)  # update old net
         batch = self.sample()
         clock.set_batch_size(len(batch))
         with torch.no_grad():
             states = batch['states']
             if self.body.env.is_venv:
                 states = math_util.venv_unpack(states)
             # NOTE states is massive with batch_size = time_horizon * num_envs. Chunk up so forward pass can fit into device esp. GPU
             num_chunks = int(len(states) / self.minibatch_size)
             v_preds_chunks = [self.calc_v(states_chunk, use_cache=False) for states_chunk in torch.chunk(states, num_chunks)]
             v_preds = torch.cat(v_preds_chunks)
             advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
         # piggy back on batch, but remember to not pack or unpack
         batch['advs'], batch['v_targets'] = advs, v_targets
         if self.body.env.is_venv:  # unpack if venv for minibatch sampling
             for k, v in batch.items():
                 if k not in ('advs', 'v_targets'):
                     batch[k] = math_util.venv_unpack(v)
         total_loss = torch.tensor(0.0)
         for _ in range(self.training_epoch):
             minibatches = util.split_minibatch(batch, self.minibatch_size)
             for minibatch in minibatches:
                 if self.body.env.is_venv:  # re-pack to restore proper shape
                     for k, v in minibatch.items():
                         if k not in ('advs', 'v_targets'):
                             minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs)
                 advs, v_targets = minibatch['advs'], minibatch['v_targets']
                 pdparams, v_preds = self.calc_pdparam_v(minibatch)
                 policy_loss = self.calc_policy_loss(minibatch, pdparams, advs)  # from actor
                 val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
                 if self.shared:  # shared network
                     loss = policy_loss + val_loss
                     self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                 else:
                     self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                     self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                     loss = policy_loss + val_loss
                 total_loss += loss
         loss = total_loss / self.training_epoch / len(minibatches)
         # reset
         self.to_train = 0
         logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
         return loss.item()
     else:
         return np.nan
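The no_grad block above splits a large batch of states into chunks so the value forward pass fits into device memory, then concatenates the predictions. A standalone sketch of that chunk-and-concat pattern with a toy network:

import torch

states = torch.randn(1024, 8)
net = torch.nn.Linear(8, 1)                      # stand-in value network
num_chunks = max(1, len(states) // 64)           # roughly 64 states per forward pass
with torch.no_grad():
    preds = torch.cat([net(chunk) for chunk in torch.chunk(states, num_chunks)])
print(preds.shape)  # torch.Size([1024, 1])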
Example #18
 def to_ckpt(self, env, mode='eval'):
     '''Check with clock whether to run log/eval ckpt: at the start, save_freq, and the end'''
     if mode == 'eval' and util.in_eval_lab_modes():  # avoid double-eval: eval-ckpt in eval mode
         return False
     clock = env.clock
     frame = clock.get()
     frequency = env.eval_frequency if mode == 'eval' else env.log_frequency
     if frequency is None:  # default episodic
         to_ckpt = env.done
     else:  # normal ckpt condition by mod remainder (general for venv)
         to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame
     return to_ckpt
Example #19
def gather_aeb_rewards_df(aeb, session_datas, max_tick_unit):
    '''Gather rewards from each session for a body into a df'''
    aeb_session_rewards = {}
    for s, session_data in session_datas.items():
        aeb_df = session_data[aeb]
        aeb_reward_sr = aeb_df['reward_ma']
        aeb_reward_sr.index = aeb_df[max_tick_unit]
        # guard for duplicate eval result
        aeb_reward_sr = aeb_reward_sr[~aeb_reward_sr.index.duplicated()]
        if util.in_eval_lab_modes():
            # guard for eval appending possibly not ordered
            aeb_reward_sr.sort_index(inplace=True)
        aeb_session_rewards[s] = aeb_reward_sr
    aeb_rewards_df = pd.DataFrame(aeb_session_rewards)
    return aeb_rewards_df
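The per-session guard drops duplicated index entries and, in eval mode, sorts the index before the series are assembled into a DataFrame. A small pandas illustration of those two operations on a toy series:

import pandas as pd

sr = pd.Series([1.0, 2.0, 3.0], index=[100, 300, 100], name='reward_ma')
sr = sr[~sr.index.duplicated()]  # guard for duplicate eval results (keeps first occurrence)
sr = sr.sort_index()             # eval appends may arrive out of order
print(sr)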
Example #20
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.in_eval_lab_modes():
        # load specific model in eval mode
        prepath = agent.info_space.eval_model_prepath
    else:
        prepath = util.get_prepath(agent.spec, agent.info_space, unit='session')
    logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names}')
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{prepath}_{net_name}_model.pth'
        load(net, model_path)
        optim_path = f'{prepath}_{net_name}_optim.pth'
        load(net.optim, optim_path)
Example #21
 def __init__(self, spec):
     self.done = False
     self.env_spec = spec['env'][0]  # idx 0 for single-env
     # set default
     util.set_attr(
         self,
         dict(
             log_frequency=None,  # default to log at epi done
             frame_op=None,
             frame_op_len=None,
             normalize_state=False,
             reward_scale=None,
             num_envs=None,
         ))
     util.set_attr(self, spec['meta'], [
         'log_frequency',
         'eval_frequency',
     ])
     util.set_attr(self, self.env_spec, [
         'name',
         'frame_op',
         'frame_op_len',
         'normalize_state',
         'reward_scale',
         'num_envs',
         'max_t',
         'max_frame',
     ])
     seq_len = ps.get(spec, 'agent.0.net.seq_len')
     if seq_len is not None:  # infer if using RNN
         self.frame_op = 'stack'
         self.frame_op_len = seq_len
     if util.in_eval_lab_modes():  # use singleton for eval
         self.num_envs = 1
         self.log_frequency = None
     if spec['meta']['distributed'] != False:  # divide max_frame for distributed
         self.max_frame = int(self.max_frame / spec['meta']['max_session'])
     self.is_venv = (self.num_envs is not None and self.num_envs > 1)
     if self.is_venv:
         assert self.log_frequency is not None, 'Specify log_frequency when using venv'
     self.clock_speed = 1 * (self.num_envs or 1)  # tick with a multiple of num_envs to properly count frames
     self.clock = Clock(self.max_frame, self.clock_speed)
     self.to_render = util.to_render()
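clock_speed ties frame counting to the number of parallel envs: each vector-env step advances the frame count by num_envs. A toy illustration of that bookkeeping (Clock itself is an SLM-Lab internal and is not reproduced here):

num_envs = 4
clock_speed = 1 * (num_envs or 1)   # as in the constructor above
frame = 0
for _ in range(10):                 # 10 vector-env steps
    frame += clock_speed
print(frame)                        # 40 frames of experience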
Example #22
    def try_ckpt(self, agent, env):
        '''Try to checkpoint agent at the start, save_freq, and the end'''
        tick = env.clock.get(env.max_tick_unit)
        to_ckpt = False
        if not util.in_eval_lab_modes() and tick <= env.max_tick:
            to_ckpt = (tick % env.eval_frequency == 0) or tick == env.max_tick
        if env.max_tick_unit == 'epi':  # extra condition for epi
            to_ckpt = to_ckpt and env.done

        if to_ckpt:
            if self.spec['meta'].get('parallel_eval'):
                retro_analysis.run_parallel_eval(self, agent, env)
            else:
                self.run_eval_episode()
            if analysis.new_best(agent):
                agent.save(ckpt='best')
            if tick > 0:  # nothing to analyze at start
                analysis.analyze_session(self, eager_analyze_trial=True)
Example #23
 def __init__(self, spec):
     super().__init__(spec)
     try_register_env(spec)  # register if it's a custom gym env
     seed = ps.get(spec, 'meta.random_seed')
     episode_life = not util.in_eval_lab_modes()
     if self.is_venv:  # make vector environment
         self.u_env = make_gym_venv(self.name, self.num_envs, seed,
                                    self.frame_op, self.frame_op_len,
                                    self.reward_scale, self.normalize_state,
                                    episode_life)
     else:
         self.u_env = make_gym_env(self.name, seed, self.frame_op,
                                   self.frame_op_len, self.reward_scale,
                                   self.normalize_state, episode_life)
     self._set_attr_from_u_env(self.u_env)
     self.max_t = self.max_t or self.u_env.spec.max_episode_steps
     assert self.max_t is not None
     logger.info(util.self_desc(self))
Example #24
 def train(self):
     if util.in_eval_lab_modes():
         self.body.flush()
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         loss = self.calc_policy_loss(batch)
         self.net.training_step(loss=loss, lr_clock=clock)
         # reset
         self.to_train = 0
         self.body.flush()
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #25
    def train(self):
        '''Train actor critic by computing the loss in batch efficiently'''
        if util.in_eval_lab_modes():
            return np.nan
        clock = self.body.env.clock
        if self.to_train == 1:
            for _ in range(self.training_iter):
                batch = self.sample()
                clock.set_batch_size(len(batch))

                states = batch['states']
                actions = self.guard_q_actions(batch['actions'])
                q_targets = self.calc_q_targets(batch)
                # Q-value loss for both Q nets
                q1_preds = self.calc_q(states, actions, self.q1_net)
                q1_loss = self.calc_reg_loss(q1_preds, q_targets)
                self.q1_net.train_step(q1_loss, self.q1_optim, self.q1_lr_scheduler, clock=clock, global_net=self.global_q1_net)

                q2_preds = self.calc_q(states, actions, self.q2_net)
                q2_loss = self.calc_reg_loss(q2_preds, q_targets)
                self.q2_net.train_step(q2_loss, self.q2_optim, self.q2_lr_scheduler, clock=clock, global_net=self.global_q2_net)

                # policy loss
                action_pd = policy_util.init_action_pd(self.body.ActionPD, self.calc_pdparam(states))
                log_probs, reparam_actions = self.calc_log_prob_action(action_pd, reparam=True)
                policy_loss = self.calc_policy_loss(batch, log_probs, reparam_actions)
                self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)

                # alpha loss
                alpha_loss = self.calc_alpha_loss(log_probs)
                self.train_alpha(alpha_loss)

                loss = q1_loss + q2_loss + policy_loss + alpha_loss
                # update target networks
                self.update_nets()
                # update PER priorities if available
                self.try_update_per(torch.min(q1_preds, q2_preds), q_targets)

            # reset
            self.to_train = 0
            logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
            return loss.item()
        else:
            return np.nan
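The PER priority update above uses the elementwise minimum of the two critics' predictions, the usual double-Q trick to curb overestimation. A tiny torch illustration of that minimum:

import torch

q1_preds = torch.tensor([1.0, 3.0, 2.0])
q2_preds = torch.tensor([1.5, 2.5, 2.5])
print(torch.min(q1_preds, q2_preds))  # elementwise min: 1.0, 2.5, 2.0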
Example #26
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.in_eval_lab_modes():
        # load specific model in eval mode
        model_prepath = agent.spec['meta']['eval_model_prepath']
    else:
        model_prepath = agent.spec['meta']['model_prepath']
    logger.info(
        f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names} from {model_prepath}_*.pt'
    )
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{model_prepath}_{net_name}_model.pt'
        load(net, model_path)
        optim_name = net_name.replace('net', 'optim')
        optim = getattr(algorithm, optim_name, None)
        if optim is not None:  # only trainable net has optim
            optim_path = f'{model_prepath}_{net_name}_optim.pt'
            load(optim, optim_path)
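The loader expects one '{model_prepath}_{net_name}_model.pt' file per net plus a matching '_optim.pt' file for trainable nets; load() presumably wraps torch.load plus load_state_dict. A hedged sketch of that underlying save/load pattern with a throwaway model:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
optim = torch.optim.Adam(net.parameters())
torch.save(net.state_dict(), '/tmp/demo_net_model.pt')
torch.save(optim.state_dict(), '/tmp/demo_net_optim.pt')
net.load_state_dict(torch.load('/tmp/demo_net_model.pt'))
optim.load_state_dict(torch.load('/tmp/demo_net_optim.pt'))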
Example #27
def save_session_df(session_data, filepath, info_space):
    '''Save session_df; if in eval mode, modify it and save in append mode'''
    if util.in_eval_lab_modes():
        ckpt = util.find_ckpt(info_space.eval_model_prepath)
        epi = int(re.search(r'epi(\d+)', ckpt)[1])
        totalt = int(re.search(r'totalt(\d+)', ckpt)[1])
        session_df = pd.concat(session_data, axis=1)
        mean_sr = session_df.mean()
        mean_sr.name = totalt  # set index to prevent all being the same
        eval_session_df = pd.DataFrame(data=[mean_sr])
        # set sr name too, to total_t
        for aeb in util.get_df_aeb_list(eval_session_df):
            eval_session_df.loc[:, aeb + ('epi',)] = epi
            eval_session_df.loc[:, aeb + ('total_t',)] = totalt
        # if eval, save with append mode
        header = not os.path.exists(filepath)
        with open(filepath, 'a') as f:
            eval_session_df.to_csv(f, header=header)
    else:
        session_df = pd.concat(session_data, axis=1)
        util.write(session_df, filepath)
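The eval branch appends one aggregated row per checkpoint to the same CSV and writes the header only if the file does not exist yet. A minimal pandas illustration of that append-with-header pattern (the column names here are made up):

import os
import pandas as pd

filepath = '/tmp/demo_eval_session.csv'
row = pd.DataFrame([{'epi': 3, 'total_t': 30000, 'reward_ma': 1.5}])
header = not os.path.exists(filepath)
with open(filepath, 'a') as f:
    row.to_csv(f, header=header, index=False)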
Example #28
 def train(self):
     '''
     Completes one training step for the agent if it is time to train.
     Otherwise this function does nothing.
     '''
     if util.in_eval_lab_modes():
         self.body.flush()
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         loss = self.calc_q_loss(batch)
         self.net.training_step(loss=loss, lr_clock=clock)
         # reset
         self.to_train = 0
         self.body.flush()
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
Example #29
 def __init__(self, spec):
     self.env_spec = spec['env'][0]  # idx 0 for single-env
     # set default
     util.set_attr(
         self,
         dict(
             eval_frequency=10000,
             log_frequency=10000,
             frame_op=None,
             frame_op_len=None,
             image_downsize=(84, 84),
             normalize_state=False,
             reward_scale=None,
             num_envs=1,
         ))
     util.set_attr(self, spec['meta'], [
         'eval_frequency',
         'log_frequency',
     ])
     util.set_attr(self, self.env_spec, [
         'name',
         'frame_op',
         'frame_op_len',
         'image_downsize',
         'normalize_state',
         'reward_scale',
         'num_envs',
         'max_t',
         'max_frame',
     ])
     # override if env is for eval
     if util.in_eval_lab_modes():
         self.num_envs = ps.get(spec, 'meta.rigorous_eval')
     self.to_render = util.to_render()
     self._infer_frame_attr(spec)
     self._infer_venv_attr()
     self._set_clock()
     self.done = False
     self.total_reward = np.nan
Example #30
 def train(self):
     if util.in_eval_lab_modes():
         return np.nan
     clock = self.body.env.clock
     if self.to_train == 1:
         batch = self.sample()
         clock.set_batch_size(len(batch))
         pdparams = self.calc_pdparam_batch(batch)
         advs = self.calc_ret_advs(batch)
         loss = self.calc_policy_loss(batch, pdparams, advs)
         self.net.train_step(loss,
                             self.optim,
                             self.lr_scheduler,
                             clock=clock,
                             global_net=self.global_net)
         # reset
         self.to_train = 0
         logger.debug(
             f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}'
         )
         return loss.item()
     else:
         return np.nan
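calc_ret_advs for this policy-gradient variant uses returns as the advantages. A self-contained sketch of the core quantity, the discounted return, computed backwards over an episode (gamma and rewards here are made up):

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    '''G_t = r_t + gamma * G_{t+1}, computed backwards over an episode'''
    rets = np.zeros(len(rewards))
    future = 0.0
    for t in reversed(range(len(rewards))):
        future = rewards[t] + gamma * future
        rets[t] = future
    return rets

print(discounted_returns([1.0, 1.0, 1.0], gamma=0.5))  # G = [1.75, 1.5, 1.0]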