def make_gym_env(name, seed=None, frame_op=None, frame_op_len=None, reward_scale=None, normalize_state=False):
    '''General method to create any Gym env; auto wraps Atari'''
    env = gym.make(name)
    if seed is not None:
        env.seed(seed)
    if 'NoFrameskip' in env.spec.id:  # Atari
        env = wrap_atari(env)
        # no reward clipping to allow monitoring; Atari memory clips it
        episode_life = not util.in_eval_lab_modes()
        env = wrap_deepmind(env, episode_life, frame_op_len)
    elif len(env.observation_space.shape) == 3:  # image-state env
        env = PreprocessImage(env)
        if normalize_state:
            env = NormalizeStateEnv(env)
        if frame_op_len is not None:  # use concat for image (1, 84, 84)
            env = FrameStack(env, 'concat', frame_op_len)
    else:  # vector-state env
        if normalize_state:
            env = NormalizeStateEnv(env)
        if frame_op is not None:
            env = FrameStack(env, frame_op, frame_op_len)
    if reward_scale is not None:
        env = ScaleRewardEnv(env, reward_scale)
    return env
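# Hedged usage sketch for make_gym_env above (illustrative only, not part of the library).
# The env ids are standard Gym names; which wrappers apply follows the branches in the function:
# 'PongNoFrameskip-v4' takes the Atari path (wrap_atari/wrap_deepmind), while 'CartPole-v0'
# takes the vector-state path with optional frame stacking and reward scaling.
atari_env = make_gym_env('PongNoFrameskip-v4', seed=0, frame_op_len=4)
vector_env = make_gym_env('CartPole-v0', seed=0, frame_op='concat', frame_op_len=4, reward_scale=0.1)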
def train(self):
    '''Train actor critic by computing the loss in batch efficiently'''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        clock.set_batch_size(len(batch))
        pdparams, v_preds = self.calc_pdparam_v(batch)
        advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
        policy_loss = self.calc_policy_loss(batch, pdparams, advs)  # from actor
        val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
        if self.shared:  # shared network
            loss = policy_loss + val_loss
            self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
        else:
            self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
            self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
            loss = policy_loss + val_loss
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def __init__(self, spec):
    super().__init__(spec)
    try_register_env(spec)  # register if it's a custom gym env
    seed = ps.get(spec, 'meta.random_seed')
    episode_life = not util.in_eval_lab_modes()
    if self.is_venv:  # make vector environment
        self.u_env = make_gym_venv(name=self.name, num_envs=self.num_envs, seed=seed, frame_op=self.frame_op, frame_op_len=self.frame_op_len, image_downsize=self.image_downsize, reward_scale=self.reward_scale, normalize_state=self.normalize_state, episode_life=episode_life)
    else:
        self.u_env = make_gym_env(name=self.name, seed=seed, frame_op=self.frame_op, frame_op_len=self.frame_op_len, image_downsize=self.image_downsize, reward_scale=self.reward_scale, normalize_state=self.normalize_state, episode_life=episode_life)
    if self.name.startswith('Unity'):
        # Unity is always initialized as singleton gym env, but the Unity runtime can be vec_env
        self.num_envs = self.u_env.num_envs
        # update variables dependent on num_envs
        self._infer_venv_attr()
        self._set_clock()
    self._set_attr_from_u_env(self.u_env)
    self.max_t = self.max_t or self.u_env.spec.max_episode_steps
    assert self.max_t is not None
    logger.info(util.self_desc(self))
def train(self):
    '''
    Completes one training step for the agent if it is time to train.
    i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_iter):
            batch = self.sample()
            clock.set_batch_size(len(batch))
            for _ in range(self.training_batch_iter):
                loss = self.calc_q_loss(batch)
                self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                total_loss += loss
        loss = total_loss / (self.training_iter * self.training_batch_iter)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def update(self, algorithm, clock):
    '''Get an updated value for var'''
    if util.in_eval_lab_modes() or self._updater_name == 'no_decay':
        return self.end_val
    step = clock.get()
    val = self._updater(self.start_val, self.end_val, self.start_step, self.end_step, step)
    return val
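# The concrete decay functions behind self._updater are not shown in this snippet.
# A linear-decay updater compatible with the call signature above could look like the
# sketch below (an assumption for illustration, not the library's implementation).
def linear_decay(start_val, end_val, start_step, end_step, step):
    '''Linearly anneal from start_val to end_val between start_step and end_step, then hold end_val.'''
    if step < start_step:
        return start_val
    if step >= end_step:
        return end_val
    slope = (end_val - start_val) / (end_step - start_step)
    return start_val + slope * (step - start_step)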
def train(self):
    '''
    Completes one training step for the agent if it is time to train.
    i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each of the batches, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        self.body.flush()
        return np.nan
    clock = self.body.env.clock
    tick = clock.get(clock.max_tick_unit)
    self.to_train = (tick > self.training_start_step and tick % self.training_frequency == 0)
    if self.to_train == 1:
        total_loss = torch.tensor(0.0, device=self.net.device)
        for _ in range(self.training_epoch):
            batch = self.sample()
            for _ in range(self.training_batch_epoch):
                loss = self.calc_q_loss(batch)
                self.net.training_step(loss=loss, lr_clock=clock)
                total_loss += loss
        loss = total_loss / (self.training_epoch * self.training_batch_epoch)
        # reset
        self.to_train = 0
        self.body.flush()
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def train(self):
    '''
    Completes one training step for the agent if it is time to train.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        clock.set_batch_size(len(batch))
        loss = self.calc_q_loss(batch)
        self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def random(state, algorithm, body):
    '''Random action using gym.action_space.sample(), with the same format as default()'''
    if body.env.is_venv and not util.in_eval_lab_modes():
        _action = [body.action_space.sample() for _ in range(body.env.num_envs)]
    else:
        _action = [body.action_space.sample()]
    action = torch.tensor(_action)
    return action
def act(self, state):
    '''Random action'''
    body = self.body
    if body.env.is_venv and not util.in_eval_lab_modes():
        action = np.array([body.action_space.sample() for _ in range(body.env.num_envs)])
    else:
        action = body.action_space.sample()
    return action
def train(self):
    '''Trains the algorithm'''
    if util.in_eval_lab_modes():
        self.body.flush()
        return np.nan
    if self.shared:
        return self.train_shared()
    else:
        return self.train_separate()
def guard_tensor(state, body):
    '''Guard-cast tensor before being input to network'''
    if isinstance(state, LazyFrames):
        state = state.__array__()  # realize data
    state = torch.from_numpy(state.astype(np.float32))
    if not body.env.is_venv or util.in_eval_lab_modes():
        # singleton state, unsqueeze as minibatch for net input
        state = state.unsqueeze(dim=0)
    return state
def try_scale_reward(cls, reward):
    '''Env class to scale reward'''
    if util.in_eval_lab_modes():  # only trigger on training
        return reward
    if cls.reward_scale is not None:
        if cls.sign_reward:
            reward = np.sign(reward)
        else:
            reward *= cls.reward_scale
    return reward
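# Illustration of the two scaling branches above with made-up values; reward_scale and
# sign_reward are class attributes configured elsewhere in the env wrappers, and in eval
# lab modes the raw reward is returned unchanged.
import numpy as np

raw_reward = -3.5
signed_reward = np.sign(raw_reward)  # sign_reward branch     -> -1.0
scaled_reward = raw_reward * 0.1     # reward_scale=0.1 branch -> -0.35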
def space_train(self):
    if util.in_eval_lab_modes():
        return np.nan
    losses = []
    for body in self.agent.nanflat_body_a:
        self.body = body
        losses.append(self.train())
    # set body reference back to default
    self.body = self.agent.nanflat_body_a[0]
    loss_a = self.nanflat_to_data_a('loss', losses)
    return loss_a
def update(self, state, action, reward, next_state, done):
    '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
    self.body.update(state, action, reward, next_state, done)
    if util.in_eval_lab_modes():  # eval does not update agent for training
        return
    self.body.memory.update(state, action, reward, next_state, done)
    loss = self.algorithm.train()
    if not np.isnan(loss):  # set for log_summary()
        self.body.loss = loss
    explore_var = self.algorithm.update()
    return loss, explore_var
def to_ckpt(self, env, mode='eval'):
    '''Check with clock whether to run log/eval ckpt: at the start, save_freq, and the end'''
    if mode == 'eval' and util.in_eval_lab_modes():  # avoid double-eval: eval-ckpt in eval mode
        return False
    clock = env.clock
    frame = clock.get()
    frequency = env.eval_frequency if mode == 'eval' else env.log_frequency
    to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame
    return to_ckpt
def post_init_nets(self):
    '''
    Method to conditionally load models.
    Call at the end of init_nets() after setting self.net_names
    '''
    assert hasattr(self, 'net_names')
    if util.in_eval_lab_modes():
        logger.info(f'Loaded algorithm models for lab_mode: {util.get_lab_mode()}')
        self.load()
    else:
        logger.info(f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}')
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        net_util.copy(self.net, self.old_net)  # update old net
        batch = self.sample()
        clock.set_batch_size(len(batch))
        with torch.no_grad():
            states = batch['states']
            if self.body.env.is_venv:
                states = math_util.venv_unpack(states)
            # NOTE states is massive with batch_size = time_horizon * num_envs. Chunk up so forward pass can fit into device esp. GPU
            num_chunks = int(len(states) / self.minibatch_size)
            v_preds_chunks = [self.calc_v(states_chunk, use_cache=False) for states_chunk in torch.chunk(states, num_chunks)]
            v_preds = torch.cat(v_preds_chunks)
            advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
            batch['advs'], batch['v_targets'] = advs, v_targets  # piggy back on batch, but remember to not pack or unpack
        if self.body.env.is_venv:  # unpack if venv for minibatch sampling
            for k, v in batch.items():
                if k not in ('advs', 'v_targets'):
                    batch[k] = math_util.venv_unpack(v)
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            minibatches = util.split_minibatch(batch, self.minibatch_size)
            for minibatch in minibatches:
                if self.body.env.is_venv:  # re-pack to restore proper shape
                    for k, v in minibatch.items():
                        if k not in ('advs', 'v_targets'):
                            minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs)
                advs, v_targets = minibatch['advs'], minibatch['v_targets']
                pdparams, v_preds = self.calc_pdparam_v(minibatch)
                policy_loss = self.calc_policy_loss(minibatch, pdparams, advs)  # from actor
                val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
                if self.shared:  # shared network
                    loss = policy_loss + val_loss
                    self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                else:
                    self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                    self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                    loss = policy_loss + val_loss
                total_loss += loss
        loss = total_loss / self.training_epoch / len(minibatches)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
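# Worked sizes for the chunking and minibatching above, using illustrative numbers that
# are not taken from any particular spec:
num_envs, rollout_len, minibatch_size = 16, 128, 256  # hypothetical values
n_states = num_envs * rollout_len        # len(states) after venv_unpack -> 2048
num_chunks = n_states // minibatch_size  # -> 8 forward-pass chunks (and 8 minibatches per training epoch)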
def to_ckpt(self, env, mode='eval'):
    '''Check with clock whether to run log/eval ckpt: at the start, save_freq, and the end'''
    if mode == 'eval' and util.in_eval_lab_modes():  # avoid double-eval: eval-ckpt in eval mode
        return False
    clock = env.clock
    frame = clock.get()
    frequency = env.eval_frequency if mode == 'eval' else env.log_frequency
    if frequency is None:  # default episodic
        to_ckpt = env.done
    else:  # normal ckpt condition by mod remainder (general for venv)
        to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame
    return to_ckpt
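# util.frame_mod is not defined in this snippet. A sketch consistent with how it is
# called above (an assumption, not necessarily the library's implementation): with a
# vectorized env the frame counter advances by num_envs per step, so an exact
# frame % frequency == 0 check can be skipped; comparing the remainder against num_envs
# catches the first frame at or past each multiple of frequency.
def frame_mod_sketch(frame, frequency, num_envs):
    multiple = num_envs or 1
    return frame % frequency < multiple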
def gather_aeb_rewards_df(aeb, session_datas, max_tick_unit):
    '''Gather rewards from each session for a body into a df'''
    aeb_session_rewards = {}
    for s, session_data in session_datas.items():
        aeb_df = session_data[aeb]
        aeb_reward_sr = aeb_df['reward_ma']
        aeb_reward_sr.index = aeb_df[max_tick_unit]
        # guard for duplicate eval result
        aeb_reward_sr = aeb_reward_sr[~aeb_reward_sr.index.duplicated()]
        if util.in_eval_lab_modes():
            # guard for eval appending possibly not ordered
            aeb_reward_sr.sort_index(inplace=True)
        aeb_session_rewards[s] = aeb_reward_sr
    aeb_rewards_df = pd.DataFrame(aeb_session_rewards)
    return aeb_rewards_df
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.in_eval_lab_modes():
        # load specific model in eval mode
        prepath = agent.info_space.eval_model_prepath
    else:
        prepath = util.get_prepath(agent.spec, agent.info_space, unit='session')
    logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names}')
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{prepath}_{net_name}_model.pth'
        load(net, model_path)
        optim_path = f'{prepath}_{net_name}_optim.pth'
        load(net.optim, optim_path)
def __init__(self, spec):
    self.done = False
    self.env_spec = spec['env'][0]  # idx 0 for single-env
    # set default
    util.set_attr(self, dict(
        log_frequency=None,  # default to log at epi done
        frame_op=None,
        frame_op_len=None,
        normalize_state=False,
        reward_scale=None,
        num_envs=None,
    ))
    util.set_attr(self, spec['meta'], [
        'log_frequency',
        'eval_frequency',
    ])
    util.set_attr(self, self.env_spec, [
        'name',
        'frame_op',
        'frame_op_len',
        'normalize_state',
        'reward_scale',
        'num_envs',
        'max_t',
        'max_frame',
    ])
    seq_len = ps.get(spec, 'agent.0.net.seq_len')
    if seq_len is not None:  # infer if using RNN
        self.frame_op = 'stack'
        self.frame_op_len = seq_len
    if util.in_eval_lab_modes():  # use singleton for eval
        self.num_envs = 1
        self.log_frequency = None
    if spec['meta']['distributed'] != False:  # divide max_frame for distributed
        self.max_frame = int(self.max_frame / spec['meta']['max_session'])
    self.is_venv = (self.num_envs is not None and self.num_envs > 1)
    if self.is_venv:
        assert self.log_frequency is not None, 'Specify log_frequency when using venv'
    self.clock_speed = 1 * (self.num_envs or 1)  # tick with a multiple of num_envs to properly count frames
    self.clock = Clock(self.max_frame, self.clock_speed)
    self.to_render = util.to_render()
def try_ckpt(self, agent, env):
    '''Try to checkpoint agent at the start, save_freq, and the end'''
    tick = env.clock.get(env.max_tick_unit)
    to_ckpt = False
    if not util.in_eval_lab_modes() and tick <= env.max_tick:
        to_ckpt = (tick % env.eval_frequency == 0) or tick == env.max_tick
    if env.max_tick_unit == 'epi':  # extra condition for epi
        to_ckpt = to_ckpt and env.done
    if to_ckpt:
        if self.spec['meta'].get('parallel_eval'):
            retro_analysis.run_parallel_eval(self, agent, env)
        else:
            self.run_eval_episode()
        if analysis.new_best(agent):
            agent.save(ckpt='best')
        if tick > 0:  # nothing to analyze at start
            analysis.analyze_session(self, eager_analyze_trial=True)
def __init__(self, spec):
    super().__init__(spec)
    try_register_env(spec)  # register if it's a custom gym env
    seed = ps.get(spec, 'meta.random_seed')
    episode_life = not util.in_eval_lab_modes()
    if self.is_venv:  # make vector environment
        self.u_env = make_gym_venv(self.name, self.num_envs, seed, self.frame_op, self.frame_op_len, self.reward_scale, self.normalize_state, episode_life)
    else:
        self.u_env = make_gym_env(self.name, seed, self.frame_op, self.frame_op_len, self.reward_scale, self.normalize_state, episode_life)
    self._set_attr_from_u_env(self.u_env)
    self.max_t = self.max_t or self.u_env.spec.max_episode_steps
    assert self.max_t is not None
    logger.info(util.self_desc(self))
def train(self):
    if util.in_eval_lab_modes():
        self.body.flush()
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        loss = self.calc_policy_loss(batch)
        self.net.training_step(loss=loss, lr_clock=clock)
        # reset
        self.to_train = 0
        self.body.flush()
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def train(self):
    '''Train actor critic by computing the loss in batch efficiently'''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        for _ in range(self.training_iter):
            batch = self.sample()
            clock.set_batch_size(len(batch))
            states = batch['states']
            actions = self.guard_q_actions(batch['actions'])
            q_targets = self.calc_q_targets(batch)
            # Q-value loss for both Q nets
            q1_preds = self.calc_q(states, actions, self.q1_net)
            q1_loss = self.calc_reg_loss(q1_preds, q_targets)
            self.q1_net.train_step(q1_loss, self.q1_optim, self.q1_lr_scheduler, clock=clock, global_net=self.global_q1_net)
            q2_preds = self.calc_q(states, actions, self.q2_net)
            q2_loss = self.calc_reg_loss(q2_preds, q_targets)
            self.q2_net.train_step(q2_loss, self.q2_optim, self.q2_lr_scheduler, clock=clock, global_net=self.global_q2_net)
            # policy loss
            action_pd = policy_util.init_action_pd(self.body.ActionPD, self.calc_pdparam(states))
            log_probs, reparam_actions = self.calc_log_prob_action(action_pd, reparam=True)
            policy_loss = self.calc_policy_loss(batch, log_probs, reparam_actions)
            self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
            # alpha loss
            alpha_loss = self.calc_alpha_loss(log_probs)
            self.train_alpha(alpha_loss)
            loss = q1_loss + q2_loss + policy_loss + alpha_loss
            # update target networks
            self.update_nets()
            # update PER priorities if available
            self.try_update_per(torch.min(q1_preds, q2_preds), q_targets)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.in_eval_lab_modes():
        # load specific model in eval mode
        model_prepath = agent.spec['meta']['eval_model_prepath']
    else:
        model_prepath = agent.spec['meta']['model_prepath']
    logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names} from {model_prepath}_*.pt')
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{model_prepath}_{net_name}_model.pt'
        load(net, model_path)
        optim_name = net_name.replace('net', 'optim')
        optim = getattr(algorithm, optim_name, None)
        if optim is not None:  # only trainable net has optim
            optim_path = f'{model_prepath}_{net_name}_optim.pt'
            load(optim, optim_path)
def save_session_df(session_data, filepath, info_space):
    '''Save session_df, and if is in eval mode, modify it and save with append'''
    if util.in_eval_lab_modes():
        ckpt = util.find_ckpt(info_space.eval_model_prepath)
        epi = int(re.search(r'epi(\d+)', ckpt)[1])
        totalt = int(re.search(r'totalt(\d+)', ckpt)[1])
        session_df = pd.concat(session_data, axis=1)
        mean_sr = session_df.mean()
        mean_sr.name = totalt  # set index to prevent all being the same
        eval_session_df = pd.DataFrame(data=[mean_sr])  # set sr name too, to total_t
        for aeb in util.get_df_aeb_list(eval_session_df):
            eval_session_df.loc[:, aeb + ('epi',)] = epi
            eval_session_df.loc[:, aeb + ('total_t',)] = totalt
        # if eval, save with append mode
        header = not os.path.exists(filepath)
        with open(filepath, 'a') as f:
            eval_session_df.to_csv(f, header=header)
    else:
        session_df = pd.concat(session_data, axis=1)
        util.write(session_df, filepath)
def train(self):
    '''
    Completes one training step for the agent if it is time to train.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        self.body.flush()
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        loss = self.calc_q_loss(batch)
        self.net.training_step(loss=loss, lr_clock=clock)
        # reset
        self.to_train = 0
        self.body.flush()
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, total_t: {clock.total_t}, t: {clock.t}, total_reward so far: {self.body.memory.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def __init__(self, spec):
    self.env_spec = spec['env'][0]  # idx 0 for single-env
    # set default
    util.set_attr(self, dict(
        eval_frequency=10000,
        log_frequency=10000,
        frame_op=None,
        frame_op_len=None,
        image_downsize=(84, 84),
        normalize_state=False,
        reward_scale=None,
        num_envs=1,
    ))
    util.set_attr(self, spec['meta'], [
        'eval_frequency',
        'log_frequency',
    ])
    util.set_attr(self, self.env_spec, [
        'name',
        'frame_op',
        'frame_op_len',
        'image_downsize',
        'normalize_state',
        'reward_scale',
        'num_envs',
        'max_t',
        'max_frame',
    ])
    # override if env is for eval
    if util.in_eval_lab_modes():
        self.num_envs = ps.get(spec, 'meta.rigorous_eval')
    self.to_render = util.to_render()
    self._infer_frame_attr(spec)
    self._infer_venv_attr()
    self._set_clock()
    self.done = False
    self.total_reward = np.nan
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        clock.set_batch_size(len(batch))
        pdparams = self.calc_pdparam_batch(batch)
        advs = self.calc_ret_advs(batch)
        loss = self.calc_policy_loss(batch, pdparams, advs)
        self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.env.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan