def train(self):
    '''
    Completes one training step for the agent if it is time to train, i.e. the environment timestep is greater than the minimum training timestep and a multiple of the training_frequency.
    Each training step consists of sampling n batches from the agent's memory.
    For each batch, the target Q values (q_targets) are computed and a single training step is taken k times.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_iter):
            batch = self.sample()
            clock.set_batch_size(len(batch))
            for _ in range(self.training_batch_iter):
                loss = self.calc_q_loss(batch)
                self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                total_loss += loss
        loss = total_loss / (self.training_iter * self.training_batch_iter)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_iter):
            batches = []
            if self.body.warmup_memory.size >= self.body.warmup_memory.batch_size:
                batches.append(self.warmup_sample())
            if self.body.memory.size >= self.body.memory.batch_size:
                batches.append(self.sample())
            clock.set_batch_size(sum(len(batch) for batch in batches))
            for batch in batches:
                for _ in range(self.training_batch_iter):
                    loss = self.calc_q_loss(batch)
                    self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                    total_loss += loss
        loss = total_loss / (self.training_iter * self.training_batch_iter)
        # reset
        self.to_train = 0
        logger.info(f'Trained {self.name} at epi: {clock.epi}, warmup_size: {self.body.warmup_memory.size}, memory_size: {self.body.memory.size}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def update(self, obs, action, reward, next_obs, done):
    '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
    # update state
    input_act, next_state, encoded_state = self.state_update(next_obs, action)
    # update body
    self.body.update(self.body.state, action, reward, next_state, done)
    # update memory
    if util.in_eval_lab_modes() or self.algorithm.__class__.__name__ == 'ExternalPolicy':
        # eval does not update agent for training
        self.body.state, self.body.encoded_state = next_state, encoded_state
        return
    if not hasattr(self.body, 'warmup_memory') or self.body.env.clock.epi > self.warmup_epi:
        self.body.memory.update(self.body.encoded_state, self.body.action, reward, encoded_state, done)
    else:
        self.body.warmup_memory.update(self.body.encoded_state, self.body.action, reward, encoded_state, done)
    # update body
    self.body.state, self.body.encoded_state = next_state, encoded_state
    # train algorithm
    loss = self.algorithm.train()
    if not np.isnan(loss):  # set for log_summary()
        self.body.loss = loss
    explore_var = self.algorithm.update()
    return loss, explore_var
def update(self, algorithm, clock):
    '''Get an updated value for var'''
    if util.in_eval_lab_modes() or self._updater_name == 'no_decay':
        return self.end_val
    step = clock.get()
    val = self._updater(self.start_val, self.end_val, self.start_step, self.end_step, step)
    return val
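# The updater call above takes (start_val, end_val, start_step, end_step, step).
# Below is a hypothetical sketch of a linear-decay updater with that signature, for
# illustration only; the actual updater functions referenced by self._updater may differ.
def linear_decay_sketch(start_val, end_val, start_step, end_step, step):
    '''Linearly anneal from start_val to end_val over [start_step, end_step], then hold end_val.'''
    if step < start_step:
        return start_val
    frac = min((step - start_step) / max(end_step - start_step, 1), 1.0)
    return start_val + frac * (end_val - start_val)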
def train(self):
    '''
    Completes one training step for the agent if it is time to train.
    Otherwise this function does nothing.
    '''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        clock.set_batch_size(len(batch))
        loss = self.calc_q_loss(batch)
        self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def save(self, ckpt=None):
    '''Save agent'''
    if self.algorithm.__class__.__name__ == 'ExternalPolicy':
        return
    if util.in_eval_lab_modes():  # eval does not save new models
        return
    self.algorithm.save(ckpt=ckpt)
def random(state, algorithm, body):
    '''Random action using gym.action_space.sample(), with the same format as default()'''
    if body.env.is_venv and not util.in_eval_lab_modes():
        _action = [body.action_space.sample() for _ in range(body.env.num_envs)]
    else:
        _action = body.action_space.sample()
    action = torch.tensor([_action])
    return action
def act(self, state):
    '''Random action'''
    body = self.body
    if body.env.is_venv and not util.in_eval_lab_modes():
        action = np.array([body.action_space.sample() for _ in range(body.env.num_envs)])
    else:
        action = body.action_space.sample()
    return action
def guard_tensor(state, body):
    '''Guard-cast tensor before being input to network'''
    # if isinstance(state, LazyFrames):
    #     state = state.__array__()  # realize data
    state = torch.from_numpy(state.astype(np.float32))
    if not body.env.is_venv or util.in_eval_lab_modes():
        # singleton state, unsqueeze as minibatch for net input
        state = state.unsqueeze(dim=0)
    return state
def run(self):
    if util.in_eval_lab_modes():
        self.run_eval()
        metrics = None
    else:
        self.run_rl()
        metrics = analysis.analyze_session(self.spec, self.agent.body.eval_df, 'eval')
    self.close()
    return metrics
def update(self, state, action, reward, next_state, done):
    '''Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net'''
    self.body.update(state, action, reward, next_state, done)
    if util.in_eval_lab_modes():  # eval does not update agent for training
        return
    self.body.memory.update(state, action, reward, next_state, done)
    loss = self.algorithm.train()
    if not np.isnan(loss):  # set for log_summary()
        self.body.loss = loss
    explore_var = self.algorithm.update()
    return loss, explore_var
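# For context, a minimal sketch of the per-timestep loop that would drive update() above.
# The gym-style env/agent method names here are assumptions for illustration, not taken
# from the source; the actual session loop may differ.
def run_episode_sketch(agent, env):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        agent.update(state, action, reward, next_state, done)
        state = next_state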
def post_init_nets(self):
    '''
    Method to conditionally load models.
    Call at the end of init_nets() after setting self.net_names
    '''
    assert hasattr(self, 'net_names')
    if util.in_eval_lab_modes():
        logger.info(f'Loaded algorithm models for lab_mode: {util.get_lab_mode()}')
        self.load()
    else:
        logger.info(f'Initialized algorithm models for lab_mode: {util.get_lab_mode()}')
def space_train(self):
    if util.in_eval_lab_modes():
        return np.nan
    losses = []
    for body in self.agent.nanflat_body_a:
        self.body = body
        losses.append(self.train())
    # set body reference back to default
    self.body = self.agent.nanflat_body_a[0]
    loss_a = self.nanflat_to_data_a('loss', losses)
    return loss_a
def warmup_default(state, algorithm, body):
    action = default(state, algorithm, body)
    if util.in_eval_lab_modes():
        return action
    if body.env.clock.epi < algorithm.warmup_epi:
        if hasattr(body, 'state'):
            action = rule_guide(body.state, algorithm, body)
        else:
            action = rule_guide(state, algorithm, body)
    return action
def to_ckpt(self, env, mode='eval'):
    '''Check with clock whether to run log/eval ckpt: at the start, save_freq, and the end'''
    if mode == 'eval' and util.in_eval_lab_modes():  # avoid double-eval: eval-ckpt in eval mode
        return False
    clock = env.clock
    frame = clock.get()
    frequency = env.eval_frequency if mode == 'eval' else env.log_frequency
    if frame == 0 or clock.get('opt_step') == 0:  # avoid ckpt at init
        to_ckpt = False
    elif frequency is None:  # default episodic
        to_ckpt = env.done
    else:  # normal ckpt condition by mod remainder (general for venv)
        to_ckpt = util.frame_mod(frame, frequency, env.num_envs) or frame == clock.max_frame
    return to_ckpt
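# util.frame_mod is not shown in this section. A plausible sketch, assuming the clock
# advances by num_envs per tick so a checkpoint fires whenever a multiple of frequency
# falls inside the last tick; the actual implementation may differ.
def frame_mod_sketch(frame, frequency, num_envs):
    '''Check if a multiple of frequency was crossed during the last tick of size num_envs.'''
    return (frame % frequency) < num_envs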
def warmup_epsilon_greedy(state, algorithm, body):
    action = default(state, algorithm, body)
    if util.in_eval_lab_modes():
        return action
    epsilon = body.explore_var
    if epsilon > np.random.rand():
        action = random(state, algorithm, body)
    if body.env.clock.epi < algorithm.warmup_epi:
        if hasattr(body, 'state'):
            action = rule_guide(body.state, algorithm, body)
        else:
            action = rule_guide(state, algorithm, body)
    return action
def train(self):
    '''Train actor critic by computing the loss in batch efficiently'''
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        # replace the batch rewards before computing targets
        batch = self.replace_reward_batch(batch)
        # batch = self.fetch_disc_reward(batch)
        clock.set_batch_size(len(batch))
        pdparams, v_preds = self.calc_pdparam_v(batch)
        # critic targets and advantages
        advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
        policy_loss = self.calc_policy_loss(batch, pdparams, advs)  # from actor
        val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
        if self.shared:  # shared network
            loss = policy_loss + val_loss
            self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
        else:  # separate actor and critic networks
            self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
            self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
            loss = policy_loss + val_loss
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        self.reward_agent.eval()
        total_loss = torch.tensor(0.0)
        self.reward_count = 0
        self.batch_count = 0
        for _ in range(self.training_iter):
            batches = []
            if self.body.warmup_memory.size >= self.body.warmup_memory.batch_size:
                batches.append(self.warmup_sample())
            if self.body.memory.size >= self.body.memory.batch_size:
                batches.append(self.sample())
            clock.set_batch_size(sum(len(batch) for batch in batches))
            for batch in batches:
                for _ in range(self.training_batch_iter):
                    loss = self.calc_q_loss(batch, False)
                    self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                    total_loss += loss
        loss = total_loss / (self.training_iter * self.training_batch_iter)
        reward_irl = self.reward_count / self.batch_count
        # reset
        self.to_train = 0
        logger.info(f'Trained {self.name} at epi: {clock.epi}, warmup_size: {self.body.warmup_memory.size}, memory_size: {self.body.memory.size}, loss: {loss:g}, irl_reward: {reward_irl}')
        return loss.item()
    else:
        return np.nan
def __init__(self, spec, e=None):
    self.e = e or 0  # for multi-env
    self.done = False
    self.env_spec = spec['env'][self.e]
    # set default
    util.set_attr(self, dict(
        log_frequency=None,  # default to log at epi done
        frame_op=None,
        frame_op_len=None,
        normalize_state=False,
        reward_scale=None,
        num_envs=None,
    ))
    util.set_attr(self, spec['meta'], [
        'log_frequency',
        'eval_frequency',
    ])
    util.set_attr(self, self.env_spec, [
        'name',
        'frame_op',
        'frame_op_len',
        'normalize_state',
        'reward_scale',
        'num_envs',
        'max_t',
        'max_frame',
    ])
    seq_len = ps.get(spec, 'agent.0.net.seq_len')
    if seq_len is not None:  # infer if using RNN
        self.frame_op = 'stack'
        self.frame_op_len = seq_len
    if util.in_eval_lab_modes():  # use singleton for eval
        self.num_envs = 1
        self.log_frequency = None
    if spec['meta']['distributed'] != False:  # divide max_frame for distributed
        self.max_frame = int(self.max_frame / spec['meta']['max_session'])
    self.is_venv = (self.num_envs is not None and self.num_envs > 1)
    if self.is_venv:
        assert self.log_frequency is not None, f'Specify log_frequency when using venv'
    self.clock_speed = 1 * (self.num_envs or 1)  # tick with a multiple of num_envs to properly count frames
    self.clock = Clock(self.max_frame, self.clock_speed)
    self.to_render = util.to_render()
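# A hypothetical minimal spec fragment consistent with the attributes read above; the keys
# mirror the set_attr calls, but the environment name and all values are illustrative only.
example_spec = {
    'meta': {
        'log_frequency': 10000,
        'eval_frequency': 10000,
        'distributed': False,
        'max_session': 4,
    },
    'env': [{
        'name': 'CartPole-v0',
        'frame_op': None,
        'frame_op_len': None,
        'normalize_state': False,
        'reward_scale': None,
        'num_envs': 4,
        'max_t': None,
        'max_frame': 100000,
    }],
}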
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        batch = self.sample()
        clock.set_batch_size(len(batch))
        pdparams = self.calc_pdparam_batch(batch)
        advs = self.calc_ret_advs(batch)
        loss = self.calc_policy_loss(batch, pdparams, advs)
        self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
        # reset
        self.to_train = 0
        logger.info(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def load_algorithm(algorithm):
    '''Load all the nets for an algorithm'''
    agent = algorithm.agent
    net_names = algorithm.net_names
    if util.in_eval_lab_modes():
        # load specific model in eval mode
        model_prepath = agent.spec['meta']['eval_model_prepath']
    else:
        model_prepath = agent.spec['meta']['model_prepath']
    logger.info(f'Loading algorithm {util.get_class_name(algorithm)} nets {net_names} from {model_prepath}_*.pt')
    for net_name in net_names:
        net = getattr(algorithm, net_name)
        model_path = f'{model_prepath}_{net_name}_model.pt'
        load(net, model_path)
        optim_name = net_name.replace('net', 'optim')
        optim = getattr(algorithm, optim_name, None)
        if optim is not None:  # only trainable net has optim
            optim_path = f'{model_prepath}_{net_name}_optim.pt'
            load(optim, optim_path)
def save(self, ckpt=None):
    '''Save agent'''
    if util.in_eval_lab_modes():  # eval does not save new models
        return
    self.algorithm.save(ckpt=ckpt)
def train(self):
    '''Implement algorithm train, or throw NotImplementedError'''
    if util.in_eval_lab_modes():
        return np.nan
    raise NotImplementedError
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.body.env.clock.epi > 700:
        self.pretrain_finished = True
    if self.to_train == 1:
        net_util.copy(self.net, self.old_net)  # update old net
        batch = self.sample()
        if self.reward_type == 'OFFGAN':
            batch = self.replace_reward_batch(batch)
        # if self.reward_type == 'DISC':
        #     batch = self.fetch_disc_reward(batch)
        # if self.reward_type == 'AIRL':
        #     batch = self.fetch_airl_reward(batch)
        # if self.reward_type == 'OFFGAN_update':
        #     batch = self.fetch_offgan_reward(batch)
        clock.set_batch_size(len(batch))
        _pdparams, v_preds = self.calc_pdparam_v(batch)
        advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
        # piggy back on batch, but remember to not pack or unpack
        batch['advs'], batch['v_targets'] = advs, v_targets
        if self.body.env.is_venv:  # unpack if venv for minibatch sampling
            for k, v in batch.items():
                if k not in ('advs', 'v_targets'):
                    batch[k] = math_util.venv_unpack(v)
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            minibatches = util.split_minibatch(batch, self.minibatch_size)
            for minibatch in minibatches:
                if self.body.env.is_venv:  # re-pack to restore proper shape
                    for k, v in minibatch.items():
                        if k not in ('advs', 'v_targets'):
                            minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs)
                advs, v_targets = minibatch['advs'], minibatch['v_targets']
                pdparams, v_preds = self.calc_pdparam_v(minibatch)
                policy_loss = self.calc_policy_loss(minibatch, pdparams, advs)  # from actor
                val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
                if self.shared:  # shared network
                    loss = policy_loss + val_loss
                    self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                else:
                    # while pretrain is unfinished, keep the policy fixed and update only the value net
                    if not self.pretrain_finished:
                        self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                        loss = val_loss
                    if self.pretrain_finished and self.policy_training_flag:
                        self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                        self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                        loss = policy_loss + val_loss
                total_loss += loss
        loss = total_loss / self.training_epoch / len(minibatches)
        if not self.pretrain_finished:
            logger.info(f'warmup Value net, epi: {clock.epi}, frame: {clock.frame}, loss: {loss}')
        # reset
        self.to_train = 0
        self.policy_training_flag = False
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
def get_env(self):
    return self.body.eval_env if util.in_eval_lab_modes() else self.body.env
def train(self):
    if util.in_eval_lab_modes():
        return np.nan
    clock = self.body.env.clock
    if self.to_train == 1:
        net_util.copy(self.net, self.old_net)  # update old net
        batch = self.sample()
        clock.set_batch_size(len(batch))
        _pdparams, v_preds = self.calc_pdparam_v(batch)
        advs, v_targets = self.calc_advs_v_targets(batch, v_preds)
        # piggy back on batch, but remember to not pack or unpack
        batch['advs'], batch['v_targets'] = advs, v_targets
        if self.body.env.is_venv:  # unpack if venv for minibatch sampling
            for k, v in batch.items():
                if k not in ('advs', 'v_targets'):
                    batch[k] = math_util.venv_unpack(v)
        total_loss = torch.tensor(0.0)
        for _ in range(self.training_epoch):
            minibatches = util.split_minibatch(batch, self.minibatch_size)
            for minibatch in minibatches:
                if self.body.env.is_venv:  # re-pack to restore proper shape
                    for k, v in minibatch.items():
                        if k not in ('advs', 'v_targets'):
                            minibatch[k] = math_util.venv_pack(v, self.body.env.num_envs)
                advs, v_targets = minibatch['advs'], minibatch['v_targets']
                pdparams, v_preds = self.calc_pdparam_v(minibatch)
                policy_loss = self.calc_policy_loss(minibatch, pdparams, advs)  # from actor
                val_loss = self.calc_val_loss(v_preds, v_targets)  # from critic
                if self.shared:  # shared network
                    loss = policy_loss + val_loss
                    self.net.train_step(loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                else:
                    self.net.train_step(policy_loss, self.optim, self.lr_scheduler, clock=clock, global_net=self.global_net)
                    self.critic_net.train_step(val_loss, self.critic_optim, self.critic_lr_scheduler, clock=clock, global_net=self.global_critic_net)
                    loss = policy_loss + val_loss
                total_loss += loss
        loss = total_loss / self.training_epoch / len(minibatches)
        # reset
        self.to_train = 0
        logger.debug(f'Trained {self.name} at epi: {clock.epi}, frame: {clock.frame}, t: {clock.t}, total_reward so far: {self.body.total_reward}, loss: {loss:g}')
        return loss.item()
    else:
        return np.nan
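# util.split_minibatch, used by the minibatch training loops above, is not shown in this
# section. A plausible sketch, assuming every batch entry is sliceable along the first
# dimension; the actual implementation may shuffle or differ otherwise.
def split_minibatch_sketch(batch, minibatch_size):
    '''Split a dict batch into consecutive minibatch dicts of at most minibatch_size.'''
    size = len(next(iter(batch.values())))
    minibatches = []
    for start in range(0, size, minibatch_size):
        minibatches.append({k: v[start:start + minibatch_size] for k, v in batch.items()})
    return minibatches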