def _compute_loss_actor(self, imag_beliefs, imag_states, imag_ac_logps=None):
    # Reward and value prediction of imagined trajectories
    imag_rewards = bottle(self.reward_model, (imag_beliefs, imag_states))
    imag_values = bottle(self.value_model, (imag_beliefs, imag_states))

    with torch.no_grad():
        if self.args.pcont:
            pcont = bottle(self.pcont_model, (imag_beliefs, imag_states))
        else:
            pcont = self.args.discount * torch.ones_like(imag_rewards)
    pcont = pcont.detach()

    if imag_ac_logps is not None:
        imag_values[1:] -= self.args.temp * imag_ac_logps  # add entropy here

    returns = cal_returns(imag_rewards[:-1],
                          imag_values[:-1],
                          imag_values[-1],
                          pcont[:-1],
                          lambda_=self.args.disclam)

    discount = torch.cumprod(
        torch.cat([torch.ones_like(pcont[:1]), pcont[:-2]], 0), 0).detach()

    actor_loss = -torch.mean(discount * returns)
    return actor_loss
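# NOTE: `cal_returns` is used above but not defined in this excerpt. Below is a minimal
# sketch of a Dreamer-style lambda-return computed backwards over a time-first imagined
# rollout, assuming rewards, values and pcont have shape (horizon, batch), `bootstrap`
# is the value estimate for the step after the last reward, and torch is already
# imported as in the surrounding code. This is an illustration, not the repository's
# actual implementation.
def cal_returns(rewards, values, bootstrap, pcont, lambda_):
    next_values = torch.cat([values[1:], bootstrap.unsqueeze(0)], dim=0)
    # Backward recursion: R_t = r_t + pcont_t * ((1 - lambda) * V_{t+1} + lambda * R_{t+1})
    inputs = rewards + pcont * next_values * (1 - lambda_)
    last = bootstrap
    outputs = []
    for t in reversed(range(rewards.shape[0])):
        last = inputs[t] + pcont[t] * lambda_ * last
        outputs.append(last)
    return torch.stack(list(reversed(outputs)), dim=0)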
def _compute_loss_critic(self, imag_beliefs, imag_states, imag_ac_logps=None):
    with torch.no_grad():
        # Calculate the target with the target network
        target_imag_values = bottle(self.target_value_model,
                                    (imag_beliefs, imag_states))
        imag_rewards = bottle(self.reward_model, (imag_beliefs, imag_states))

        if self.args.pcont:
            pcont = bottle(self.pcont_model, (imag_beliefs, imag_states))
        else:
            pcont = self.args.discount * torch.ones_like(imag_rewards)

        if imag_ac_logps is not None:
            target_imag_values[1:] -= self.args.temp * imag_ac_logps

        returns = cal_returns(imag_rewards[:-1],
                              target_imag_values[:-1],
                              target_imag_values[-1],
                              pcont[:-1],
                              lambda_=self.args.disclam)
        target_return = returns.detach()

    value_pred = bottle(self.value_model, (imag_beliefs, imag_states))[:-1]
    value_loss = F.mse_loss(value_pred, target_return,
                            reduction="none").mean(dim=(0, 1))
    return value_loss
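# NOTE: `bottle` is used throughout these snippets but not defined in this excerpt.
# A minimal sketch consistent with how it is called here: it applies a module to tensors
# whose first two dimensions are (time, batch) by flattening those dimensions, running
# the module once, and restoring the (time, batch) leading shape on the output.
def bottle(f, x_tuple):
    # Sizes of each input tensor: (time, batch, *feature_dims)
    x_sizes = tuple(map(lambda x: x.size(), x_tuple))
    # Flatten time and batch into one dimension before applying f
    y = f(*map(lambda x: x[0].view(x[1][0] * x[1][1], *x[1][2:]),
               zip(x_tuple, x_sizes)))
    y_size = y.size()
    # Restore the (time, batch) leading dimensions
    return y.view(x_sizes[0][0], x_sizes[0][1], *y_size[1:])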
def _compute_loss_world(self, state, data):
    # Unpack data
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, \
        posterior_means, posterior_std_devs = state
    observations, rewards, nonterminals = data

    observation_loss = F.mse_loss(
        bottle(self.observation_model, (beliefs, posterior_states)),
        observations,
        reduction='none').sum(
            dim=2 if self.args.symbolic else (2, 3, 4)).mean(dim=(0, 1))

    reward_loss = F.mse_loss(
        bottle(self.reward_model, (beliefs, posterior_states)),
        rewards,
        reduction='none').mean(dim=(0, 1))

    # Transition (KL) loss
    kl_loss = torch.max(
        kl_divergence(
            Independent(Normal(posterior_means, posterior_std_devs), 1),
            Independent(Normal(prior_means, prior_std_devs), 1)),
        self.free_nats).mean(dim=(0, 1))

    if self.args.pcont:
        pcont_loss = F.binary_cross_entropy(
            bottle(self.pcont_model, (beliefs, posterior_states)), nonterminals)

    return observation_loss, self.args.reward_scale * reward_loss, kl_loss, (
        self.args.pcont_scale * pcont_loss if self.args.pcont else 0)
def fit_buffer(self, episode):
    # Fit the model on data drawn from the replay buffer
    losses = []
    tqdm.write("Fitting buffer")
    for s in tqdm(range(self.parms.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = self.D.sample(
            self.parms.batch_size, self.parms.chunk_size)  # Transitions start at time t = 0
        # Create initial belief and state for time t = 0
        init_belief = torch.zeros(self.parms.batch_size, self.parms.belief_size, device=self.parms.device)
        init_state = torch.zeros(self.parms.batch_size, self.parms.state_size, device=self.parms.device)
        encoded_obs = bottle(self.encoder, (observations[1:], ))
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(
            init_state, actions[:-1], init_belief, encoded_obs, nonterminals[:-1])
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
        # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        observation_loss = F.mse_loss(
            bottle(self.observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum((2, 3, 4)).mean(dim=(0, 1))
        kl_loss = torch.max(
            kl_divergence(Normal(posterior_means, posterior_std_devs),
                          Normal(prior_means, prior_std_devs)).sum(dim=2),
            self.free_nats).mean(dim=(0, 1))
        reward_loss = F.mse_loss(
            bottle(self.reward_model, (beliefs, posterior_states)),
            rewards[:-1],
            reduction='none').mean(dim=(0, 1))
        # Update model parameters
        self.optimiser.zero_grad()
        (observation_loss + reward_loss + kl_loss).backward()
        nn.utils.clip_grad_norm_(self.param_list, self.parms.grad_clip_norm, norm_type=2)
        self.optimiser.step()
        # Store (0) observation loss (1) reward loss (2) KL loss
        losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()])

    # Save statistics and plot them
    losses = tuple(zip(*losses))
    self.metrics['observation_loss'].append(losses[0])
    self.metrics['reward_loss'].append(losses[1])
    self.metrics['kl_loss'].append(losses[2])
    lineplot(self.metrics['episodes'][-len(self.metrics['observation_loss']):],
             self.metrics['observation_loss'], 'observation_loss', self.statistics_path)
    lineplot(self.metrics['episodes'][-len(self.metrics['reward_loss']):],
             self.metrics['reward_loss'], 'reward_loss', self.statistics_path)
    lineplot(self.metrics['episodes'][-len(self.metrics['kl_loss']):],
             self.metrics['kl_loss'], 'kl_loss', self.statistics_path)
def update_belief_and_act(args, env, actor_model, transition_model, encoder,
                          belief, posterior_state, action, observation,
                          deterministic=False):
    # Infer belief over current state q(s_t|o≤t, a<t) from the history
    belief, _, _, _, posterior_state, _, _ = transition_model(
        posterior_state, action.unsqueeze(dim=0), belief,
        encoder(observation).unsqueeze(dim=0))  # Action and observation need an extra time dimension
    belief, posterior_state = belief.squeeze(dim=0), posterior_state.squeeze(dim=0)  # Remove time dimension from belief/state

    action, _ = actor_model(
        belief, posterior_state, deterministic=deterministic, with_logprob=False
    )  # With SAC there is no need to add exploration noise; maximum entropy maintains exploration
    if args.temp == 0 and not deterministic:
        action = Normal(action, args.expl_amount).rsample()

    action[:, 1] = 0.3  # TODO: fix the speed

    next_observation, reward, done = env.step(
        action.cpu() if isinstance(env, EnvBatcher) else action[0].cpu()
    )  # Perform environment step (action repeats handled internally)
    return belief, posterior_state, action, next_observation, reward, done
def compute_curious_action_values(beliefs, states, means, std_devs, actions,
                                  onestep_models, curious_actor_model,
                                  curious_value_model, discount, disclam):
    intrinsic_reward = compute_intrinsic_reward(beliefs, actions, onestep_models)
    reward = intrinsic_reward
    # reward -= compute_action_divergence(beliefs, states, curious_actor_model)
    # reward -= compute_state_divergence(means, std_devs)
    pcont = torch.ones_like(reward) * discount
    value = Normal(bottle(curious_value_model, (beliefs, states)), 1).mean

    reward = reward[:, :-1]
    value = value[:, :-1]
    pcont = pcont[:, :-1]
    bootstrap = value[:, -1]
    return_ = lambda_return(reward, value, pcont, bootstrap, lambda_=disclam, axis=1)
    # Discount the returns along the imagined horizon (no gradient through the discount factors)
    return_ *= torch.cumprod(
        torch.cat([torch.ones_like(pcont[:, :1]), pcont[:, :-1]], dim=1), dim=1).detach()
    return return_
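# NOTE: `compute_intrinsic_reward` is not shown in this excerpt. A minimal sketch of a
# Plan2Explore-style disagreement reward, assuming `onestep_models` is an iterable of
# one-step prediction heads that map (belief, action) features to a predicted next
# latent feature; the ensemble interface is an assumption, not the repository's API.
def compute_intrinsic_reward(beliefs, actions, onestep_models):
    # Each ensemble member predicts the next latent feature from (belief, action)
    preds = torch.stack(
        [model(torch.cat([beliefs, actions], dim=-1)) for model in onestep_models],
        dim=0)
    # Disagreement: variance across ensemble members, averaged over the feature dimension
    return preds.var(dim=0).mean(dim=-1)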
for s in tqdm(range(args.collect_interval)):
    # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
    observations, actions, rewards, nonterminals = D.sample(
        args.batch_size, args.chunk_size)  # Transitions start at time t = 0
    # Create initial belief and state for time t = 0
    init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), \
        torch.zeros(args.batch_size, args.state_size, device=args.device)
    # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
        init_state, actions[:-1], init_belief,
        bottle(encoder, (observations[1:], )), nonterminals[:-1])
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
    # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
    if args.worldmodel_LogProbLoss:
        observation_dist = Normal(
            bottle(observation_model, (beliefs, posterior_states)), 1)
        observation_loss = -observation_dist.log_prob(observations[1:]).sum(
            dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    else:
        observation_loss = F.mse_loss(
            bottle(observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum(
                dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    if args.worldmodel_LogProbLoss:
        reward_dist = Normal(
            bottle(reward_model, (beliefs, posterior_states)), 1)
for s in tqdm(range(args.collect_interval)):
    # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
    observations, actions, rewards, nonterminals = D.sample(
        args.batch_size, args.chunk_size)  # Transitions start at time t = 0
    # Create initial belief and state for time t = 0
    init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), \
        torch.zeros(args.batch_size, args.state_size, device=args.device)
    # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
        init_state, actions[:-1], init_belief,
        bottle(encoder, (observations[1:], )), nonterminals[:-1])
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
    # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
    observation_loss = F.mse_loss(
        bottle(observation_model, (beliefs, posterior_states)),
        observations[1:],
        reduction='none').sum(
            dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    reward_loss = F.mse_loss(
        bottle(reward_model, (beliefs, posterior_states)),
def to_image(obs):
    return torch.nn.functional.interpolate(
        obs.view(args.test_episodes, 1, 20, 10), scale_factor=5)


# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1),
                    total=args.episodes, initial=metrics['episodes'][-1] + 1):
    # Model fitting
    losses = []
    model_modules = transition_model.modules + encoder.modules + \
        observation_model.modules + reward_model.modules
    print("training loop")
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = D.sample(args.batch_size, args.chunk_size)  # Transitions start at time t = 0
        # Create initial belief and state for time t = 0
        init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), \
            torch.zeros(args.batch_size, args.state_size, device=args.device)
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
            init_state, actions[:-1], init_belief, bottle(encoder, (observations[1:], )), nonterminals[:-1])
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
        # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        if args.worldmodel_MSEloss:
            observation_loss = F.mse_loss(
                bottle(observation_model, (beliefs, posterior_states)),
                observations[1:],
                reduction='none').sum(dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        else:
            observation_dist = Normal(bottle(observation_model, (beliefs, posterior_states)), 1)
            observation_loss = -observation_dist.log_prob(observations[1:]).sum(
                dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        if args.algo == "p2e":
            if args.zero_shot:
                reward_dist = Normal(bottle(reward_model, (beliefs.detach(), posterior_states)), 1)
            else:
                if metrics['steps'][-1] * args.action_repeat > args.adaptation_step:
                    reward_dist = Normal(bottle(reward_model, (beliefs, posterior_states)), 1)
                else:
                    reward_dist = Normal(bottle(reward_model, (beliefs.detach(), posterior_states)), 1)
            reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
def train_algorithm(self, actor_states, actor_beliefs):
    # Parent pipe sends data through the i'th pipe
    [self.actor_pipes[i][0].send(1) for i, w in enumerate(self.workers_actor)]
    # Wait for the children to finish
    [self.actor_pipes[i][0].recv() for i, _ in enumerate(self.actor_pool)]

    with FreezeParameters(self.model_modules):
        imagination_traj = self.imagine_merge_ahead(
            prev_state=actor_states,
            prev_belief=actor_beliefs,
            policy_pool=self.actor_pool,
            transition_model=self.transition_model,
            merge_model=self.merge_actor_model)
    imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

    with FreezeParameters(self.model_modules + self.merge_value_model_modules):
        imged_reward = bottle(self.reward_model, (imged_beliefs, imged_prior_states))
        value_pred = bottle(self.merge_value_model, (imged_beliefs, imged_prior_states))

    with FreezeParameters(self.actor_pool_modules):
        returns = lambda_return(imged_reward, value_pred,
                                bootstrap=value_pred[-1],
                                discount=args.discount,
                                lambda_=args.disclam)
        merge_actor_loss = -torch.mean(returns)

    # Update actor parameters
    self.merge_actor_optimizer.zero_grad()
    merge_actor_loss.backward()
    nn.utils.clip_grad_norm_(self.merge_actor_model.parameters(), args.grad_clip_norm, norm_type=2)
    self.merge_actor_optimizer.step()

    # Dreamer implementation: value loss calculation and optimization
    with torch.no_grad():
        value_beliefs = imged_beliefs.detach()
        value_prior_states = imged_prior_states.detach()
        target_return = returns.detach()
    value_dist = Normal(
        bottle(self.merge_value_model, (value_beliefs, value_prior_states)),
        1)  # detach the input tensor from the transition network
    merge_value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))

    # Update value parameters
    self.merge_value_optimizer.zero_grad()
    merge_value_loss.backward()
    nn.utils.clip_grad_norm_(self.merge_value_model.parameters(), args.grad_clip_norm, norm_type=2)
    self.merge_value_optimizer.step()

    self.merge_losses.append([merge_actor_loss.item(), merge_value_loss.item()])
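# NOTE: `FreezeParameters` is used above but not defined in this excerpt. A minimal
# sketch of a context manager that temporarily disables gradients for a list of modules
# (so imagined rollouts through the frozen world model do not accumulate gradients into
# it) and restores the original requires_grad flags on exit; the exact helper in the
# repository may differ.
class FreezeParameters:
    def __init__(self, modules):
        self.params = [p for m in modules for p in m.parameters()]
        self.saved_flags = [p.requires_grad for p in self.params]

    def __enter__(self):
        for p in self.params:
            p.requires_grad = False

    def __exit__(self, exc_type, exc_value, traceback):
        for p, flag in zip(self.params, self.saved_flags):
            p.requires_grad = flag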
def train(self):
    # Model fitting
    losses = []
    print("training loop")
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = self.D.sample(
            args.batch_size, args.chunk_size)  # Transitions start at time t = 0
        # Create initial belief and state for time t = 0
        init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device)
        init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
        obs = bottle(self.encoder, (observations[1:], ))
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.upper_transition_model(
            prev_state=init_state,
            actions=actions[:-1],
            prev_belief=init_belief,
            obs=obs,
            nonterminals=nonterminals[:-1])
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
        # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        observation_loss, reward_loss, kl_loss = self.train_env_model(
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states,
            posterior_means, posterior_std_devs, observations, actions, rewards, nonterminals)

        # Dreamer implementation: actor loss calculation and optimization
        with torch.no_grad():
            actor_states = posterior_states.detach().to(device=args.device).share_memory_()
            actor_beliefs = beliefs.detach().to(device=args.device).share_memory_()

        torch.save(actor_states, os.path.join(os.getcwd(), args.results_dir + '/actor_states.pt'))
        torch.save(actor_beliefs, os.path.join(os.getcwd(), args.results_dir + '/actor_beliefs.pt'))

        self.algorithms.train_algorithm(actor_states, actor_beliefs)
        losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()])

    return losses
env.close()
quit()


print('Training Data')
# Training (and testing)
for episode in tqdm(range(metrics['episodes'][-1] + 1, args.episodes + 1),
                    total=args.episodes, initial=metrics['episodes'][-1] + 1):
    # Model fitting
    losses = []
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = D.sample(args.batch_size, args.chunk_size)  # Transitions start at time t = 0
        # Create initial belief and state for time t = 0
        init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), \
            torch.zeros(args.batch_size, args.state_size, device=args.device)
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
            init_state, actions[:-1], init_belief, bottle(encoder, (observations[1:], )), nonterminals[:-1])
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
        # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        observation_loss = F.mse_loss(
            bottle(observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum(dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
        reward_loss = F.mse_loss(
            bottle(reward_model, (beliefs, posterior_states)),
            rewards[:-1],
            reduction='none').mean(dim=(0, 1))
        kl_loss = torch.max(
            kl_divergence(Normal(posterior_means, posterior_std_devs),
                          Normal(prior_means, prior_std_devs)).sum(dim=2),
            free_nats).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
        if args.global_kl_beta != 0:
            kl_loss += args.global_kl_beta * kl_divergence(
                Normal(posterior_means, posterior_std_devs),
                global_prior).sum(dim=2).mean(dim=(0, 1))
        # Calculate latent overshooting objective for t > 0
        if args.overshooting_kl_beta != 0:
            overshooting_vars = []  # Collect variables for overshooting to process in batch
            for t in range(1, args.chunk_size - 1):
                d = min(t + args.overshooting_distance, args.chunk_size - 1)  # Overshooting distance
                t_, d_ = t - 1, d - 1  # Use t_ and d_ to deal with different time indexing for latent states
def update_parameters(self, data, gradient_steps):
    loss_info = []  # used to record losses
    for s in tqdm(range(gradient_steps)):
        # Get the states and beliefs of the samples
        observations, actions, rewards, nonterminals = data
        init_belief = torch.zeros(self.args.batch_size, self.args.belief_size, device=self.args.device)
        init_state = torch.zeros(self.args.batch_size, self.args.state_size, device=self.args.device)
        # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
        beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(
            init_state, actions, init_belief,
            bottle(self.encoder, (observations, )), nonterminals)

        # Update the parameters of the world model
        world_model_loss = self._compute_loss_world(
            state=(beliefs, prior_states, prior_means, prior_std_devs,
                   posterior_states, posterior_means, posterior_std_devs),
            data=(observations, rewards, nonterminals))
        observation_loss, reward_loss, kl_loss, pcont_loss = world_model_loss
        self.world_optimizer.zero_grad()
        (observation_loss + reward_loss + kl_loss + pcont_loss).backward()
        nn.utils.clip_grad_norm_(self.world_param, self.args.grad_clip_norm, norm_type=2)
        self.world_optimizer.step()

        # Freeze world-model and value parameters (saves memory, avoids unwanted gradients)
        for p in self.world_param:
            p.requires_grad = False
        for p in self.value_model.parameters():
            p.requires_grad = False

        # Latent imagination
        imag_beliefs, imag_states, imag_ac_logps = self._latent_imagination(
            beliefs, posterior_states, with_logprob=self.args.with_logprob)

        # Update the actor
        actor_loss = self._compute_loss_actor(imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        nn.utils.clip_grad_norm_(self.actor_model.parameters(), self.args.grad_clip_norm, norm_type=2)
        self.actor_optimizer.step()

        for p in self.world_param:
            p.requires_grad = True
        for p in self.value_model.parameters():
            p.requires_grad = True

        # Update the critic
        imag_beliefs = imag_beliefs.detach()
        imag_states = imag_states.detach()
        critic_loss = self._compute_loss_critic(imag_beliefs, imag_states, imag_ac_logps=imag_ac_logps)
        self.value_optimizer.zero_grad()
        critic_loss.backward()
        nn.utils.clip_grad_norm_(self.value_model.parameters(), self.args.grad_clip_norm, norm_type=2)
        self.value_optimizer.step()

        loss_info.append([
            observation_loss.item(), reward_loss.item(), kl_loss.item(),
            pcont_loss.item() if self.args.pcont else 0,
            actor_loss.item(), critic_loss.item()
        ])

    # Finally, update the target value function once per call (i.e. every `gradient_steps` updates)
    with torch.no_grad():
        self.target_value_model.load_state_dict(self.value_model.state_dict())

    return loss_info
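# NOTE: the hard copy above syncs the target critic once per call to update_parameters.
# A common alternative (not what this snippet does) is a Polyak/soft update applied
# after every gradient step; a minimal sketch, with the mixing coefficient tau as an
# assumed hyperparameter:
def soft_update(target_model, source_model, tau=0.005):
    with torch.no_grad():
        for target_param, param in zip(target_model.parameters(), source_model.parameters()):
            # target <- (1 - tau) * target + tau * source
            target_param.data.mul_(1.0 - tau).add_(tau * param.data)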
def train_env_model(self, beliefs, prior_states, prior_means, prior_std_devs,
                    posterior_states, posterior_means, posterior_std_devs,
                    observations, actions, rewards, nonterminals):
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
    # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
    if args.worldmodel_LogProbLoss:
        observation_dist = Normal(
            bottle(self.observation_model, (beliefs, posterior_states)), 1)
        observation_loss = -observation_dist.log_prob(observations[1:]).sum(
            dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    else:
        observation_loss = F.mse_loss(
            bottle(self.observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum(
                dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))

    if args.worldmodel_LogProbLoss:
        reward_dist = Normal(
            bottle(self.reward_model, (beliefs, posterior_states)), 1)
        reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
    else:
        reward_loss = F.mse_loss(
            bottle(self.reward_model, (beliefs, posterior_states)),
            rewards[:-1],
            reduction='none').mean(dim=(0, 1))

    # Transition loss
    div = kl_divergence(Normal(posterior_means, posterior_std_devs),
                        Normal(prior_means, prior_std_devs)).sum(dim=2)
    kl_loss = torch.max(div, self.free_nats).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
    if args.global_kl_beta != 0:
        kl_loss += args.global_kl_beta * kl_divergence(
            Normal(posterior_means, posterior_std_devs),
            self.global_prior).sum(dim=2).mean(dim=(0, 1))

    # Calculate latent overshooting objective for t > 0
    if args.overshooting_kl_beta != 0:
        overshooting_vars = []  # Collect variables for overshooting to process in batch
        for t in range(1, args.chunk_size - 1):
            d = min(t + args.overshooting_distance, args.chunk_size - 1)  # Overshooting distance
            t_, d_ = t - 1, d - 1  # Use t_ and d_ to deal with different time indexing for latent states
            seq_pad = (0, 0, 0, 0, 0, t - d + args.overshooting_distance)  # Calculate sequence padding so overshooting terms can be calculated in one batch
            # Store (0) actions, (1) nonterminals, (2) rewards, (3) beliefs, (4) prior states,
            # (5) posterior means, (6) posterior standard deviations and (7) sequence masks
            overshooting_vars.append(
                (F.pad(actions[t:d], seq_pad),
                 F.pad(nonterminals[t:d], seq_pad),
                 F.pad(rewards[t:d], seq_pad[2:]),
                 beliefs[t_],
                 prior_states[t_],
                 F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad),
                 F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(), seq_pad, value=1),
                 F.pad(torch.ones(d - t, args.batch_size, args.state_size, device=args.device), seq_pad)))  # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences
        overshooting_vars = tuple(zip(*overshooting_vars))
        # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once)
        beliefs, prior_states, prior_means, prior_std_devs = self.upper_transition_model(
            torch.cat(overshooting_vars[4], dim=0),
            torch.cat(overshooting_vars[0], dim=1),
            torch.cat(overshooting_vars[3], dim=0),
            None,
            torch.cat(overshooting_vars[1], dim=1))
        seq_mask = torch.cat(overshooting_vars[7], dim=1)
        # Calculate overshooting KL loss with sequence mask
        kl_loss += (1 / args.overshooting_distance) * args.overshooting_kl_beta * torch.max(
            (kl_divergence(
                Normal(torch.cat(overshooting_vars[5], dim=1), torch.cat(overshooting_vars[6], dim=1)),
                Normal(prior_means, prior_std_devs)) * seq_mask).sum(dim=2),
            self.free_nats).mean(dim=(0, 1)) * (args.chunk_size - 1)  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)
        # Calculate overshooting reward prediction loss with sequence mask
        if args.overshooting_reward_scale != 0:
            reward_loss += (1 / args.overshooting_distance) * args.overshooting_reward_scale * F.mse_loss(
                bottle(self.reward_model, (beliefs, prior_states)) * seq_mask[:, :, 0],
                torch.cat(overshooting_vars[2], dim=1),
                reduction='none').mean(dim=(0, 1)) * (args.chunk_size - 1)  # Update reward loss (compensating for extra average over each overshooting/open loop sequence)

    # Apply linearly ramping learning rate schedule
    if args.learning_rate_schedule != 0:
        for group in self.model_optimizer.param_groups:
            group['lr'] = min(
                group['lr'] + args.model_learning_rate / args.model_learning_rate_schedule,
                args.model_learning_rate)

    model_loss = observation_loss + reward_loss + kl_loss
    # Update model parameters
    self.model_optimizer.zero_grad()
    model_loss.backward()
    nn.utils.clip_grad_norm_(self.param_list, args.grad_clip_norm, norm_type=2)
    self.model_optimizer.step()
    return observation_loss, reward_loss, kl_loss
def train(args: argparse.Namespace, env: Env, D: ExperienceReplay,
          models: Tuple[nn.Module, nn.Module, nn.Module, nn.Module],
          optimizer: Tuple[optim.Optimizer, optim.Optimizer],
          param_list: List[nn.parameter.Parameter],
          planner: nn.Module):
    # Auxiliary tensors
    # Global prior N(0, I)
    global_prior = Normal(
        torch.zeros(args.batch_size, args.state_size, device=args.device),
        torch.ones(args.batch_size, args.state_size, device=args.device))
    # Allowed deviation in KL divergence
    free_nats = torch.full((1, ), args.free_nats, dtype=torch.float32, device=args.device)
    summary_writter = SummaryWriter(args.tensorboard_dir)

    # Unpack models
    transition_model, observation_model, reward_model, encoder = models
    transition_optimizer, reward_optimizer = optimizer

    for idx_episode in trange(args.episodes, leave=False, desc="Episode"):
        for idx_train in trange(args.collect_interval, leave=False, desc="Training"):
            # Draw sequence chunks {(o[t], a[t], r[t+1], z[t+1])} ~ D uniformly at random from the dataset.
            # The first two dimensions of the tensors are L (chunk size) and n (batch size).
            # We want to use o[t+1] to correct the error of the transition model,
            # so we convert the sequence to {(o[t+1], a[t], r[t+1], z[t+1])}.
            observations, actions, rewards_dist, rewards_coll, nonterminals = D.sample(args.batch_size, args.chunk_size)

            # Create initial belief and state for time t = 0
            init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device)
            init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)

            # Transition model forward:
            # deterministic: h[t+1] = f(h[t], a[t])
            # prior:         s[t+1] ~ Prob(s|h[t+1])
            # posterior:     s[t+1] ~ Prob(s|h[t+1], o[t+1])
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
                init_state, actions[:-1], init_belief,
                bottle(encoder, (observations[1:], )), nonterminals[:-1])

            # Observation loss
            predictions = bottle(observation_model, (beliefs, posterior_states))
            visual_loss = F.mse_loss(
                predictions[:, :, :3*64*64], observations[1:, :, :3*64*64]).mean()
            symbol_loss = F.mse_loss(
                predictions[:, :, 3*64*64:], observations[1:, :, 3*64*64:]).mean()
            observation_loss = visual_loss + symbol_loss

            # KL divergence loss: minimise the difference between posterior and prior
            kl_loss = torch.max(
                kl_divergence(
                    Normal(posterior_means, posterior_std_devs),
                    Normal(prior_means, prior_std_devs)).sum(dim=2),
                free_nats).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
            if args.global_kl_beta != 0:
                kl_loss += args.global_kl_beta * kl_divergence(
                    Normal(posterior_means, posterior_std_devs),
                    global_prior).sum(dim=2).mean(dim=(0, 1))

            # Overshooting loss
            if args.overshooting_kl_beta != 0:
                overshooting_vars = []  # Collect variables for overshooting to process in batch
                for t in range(1, args.chunk_size - 1):
                    d = min(t + args.overshooting_distance, args.chunk_size - 1)  # Overshooting distance
                    # Use t_ and d_ to deal with different time indexing for latent states
                    t_, d_ = t - 1, d - 1
                    # Calculate sequence padding so overshooting terms can be calculated in one batch
                    seq_pad = (0, 0, 0, 0, 0, t - d + args.overshooting_distance)
                    # Store:
                    # * a[t:d]
                    # * z[t+1:d+1]
                    # * r[t+1:d+1]
                    # * h[t]
                    # * s[t] prior
                    # * E[s[t:d]] posterior
                    # * Var[s[t:d]] posterior
                    # * mask: the last few sequences do not have enough length, so we pad them
                    #   with 0 to the same length as the previous sequence for batch operation,
                    #   and use the mask to indicate invalid variables.
                    overshooting_vars.append(
                        (F.pad(actions[t:d], seq_pad),
                         F.pad(nonterminals[t:d], seq_pad),
                         F.pad(rewards_dist[t:d], seq_pad[2:]),
                         beliefs[t_],
                         prior_states[t_],
                         F.pad(posterior_means[t_ + 1:d_ + 1].detach(), seq_pad),
                         F.pad(posterior_std_devs[t_ + 1:d_ + 1].detach(), seq_pad, value=1),
                         F.pad(torch.ones(d - t, args.batch_size, args.state_size, device=args.device), seq_pad)))  # Posterior standard deviations must be padded with > 0 to prevent infinite KL divergences
                overshooting_vars = tuple(zip(*overshooting_vars))
                # Update belief/state using prior from previous belief/state and previous action (over entire sequence at once)
                beliefs, prior_states, prior_means, prior_std_devs = transition_model(
                    torch.cat(overshooting_vars[4], dim=0),
                    torch.cat(overshooting_vars[0], dim=1),
                    torch.cat(overshooting_vars[3], dim=0),
                    None,
                    torch.cat(overshooting_vars[1], dim=1))
                seq_mask = torch.cat(overshooting_vars[7], dim=1)
                # Calculate overshooting KL loss with sequence mask
                kl_loss += (1 / args.overshooting_distance) * args.overshooting_kl_beta * torch.max(
                    (kl_divergence(
                        Normal(torch.cat(overshooting_vars[5], dim=1), torch.cat(overshooting_vars[6], dim=1)),
                        Normal(prior_means, prior_std_devs)) * seq_mask).sum(dim=2),
                    free_nats).mean(dim=(0, 1)) * (args.chunk_size - 1)  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)

            # TODO: add learning rate schedule
            # Update model parameters
            transition_optimizer.zero_grad()
            loss = observation_loss * 200 + kl_loss
            loss.backward()
            nn.utils.clip_grad_norm_(param_list, args.grad_clip_norm, norm_type=2)
            transition_optimizer.step()

            # Reward loss
            rewards_dist_predict, rewards_coll_predict = bottle(
                reward_model.raw, (beliefs.detach(), posterior_states.detach()))
            reward_loss = F.mse_loss(
                rewards_dist_predict, rewards_dist[:-1], reduction='mean'
            ) + F.binary_cross_entropy(
                rewards_coll_predict, rewards_coll[:-1], reduction='mean')
            reward_optimizer.zero_grad()
            reward_loss.backward()
            reward_optimizer.step()

            # Add tensorboard logs
            global_step = idx_train + idx_episode * args.collect_interval
            summary_writter.add_scalar("observation_loss", observation_loss, global_step)
            summary_writter.add_scalar("reward_loss", reward_loss, global_step)
            summary_writter.add_scalar("kl_loss", kl_loss, global_step)

        for idx_collect in trange(1, leave=False, desc="Collecting"):
            experience = collect_experience(args, env, models, planner, True,
                                            desc="Collecting experience {}".format(idx_collect))
            T = len(experience["observation"])
            for idx_step in range(T):
                D.append(experience["observation"][idx_step],
                         experience["action"][idx_step],
                         experience["reward_dist"][idx_step],
                         experience["reward_coll"][idx_step],
                         experience["done"][idx_step])

        # Checkpoint models
        if (idx_episode + 1) % args.checkpoint_interval == 0:
            record_path = os.path.join(args.checkpoint_dir, "checkpoint")
            checkpoint_path = os.path.join(args.checkpoint_dir, 'models_%d.pth' % (idx_episode + 1))
            torch.save(
                {
                    'transition_model': transition_model.state_dict(),
                    'observation_model': observation_model.state_dict(),
                    'reward_model': reward_model.state_dict(),
                    'encoder': encoder.state_dict(),
                    'transition_optimizer': transition_optimizer.state_dict(),
                    'reward_optimizer': reward_optimizer.state_dict()
                }, checkpoint_path)
            with open(record_path, "w") as f:
                f.write('models_%d.pth' % (idx_episode + 1))
            planner.save(os.path.join(args.torchscript_dir, "mpc_planner.pth"))
            transition_model.save(os.path.join(args.torchscript_dir, "transition_model.pth"))
            reward_model.save(os.path.join(args.torchscript_dir, "reward_model.pth"))
            observation_model.save(os.path.join(args.torchscript_dir, "observation_decoder.pth"))
            encoder.save(os.path.join(args.torchscript_dir, "observation_encoder.pth"))

    summary_writter.close()
observations, actions, rewards, nonterminals = D.sample(
    args.batch_size, args.chunk_size)  # Transitions start at time t = 0

"""world model update"""
init_belief = torch.zeros(args.batch_size, args.belief_size, device=args.device)
init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)
# Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
    init_state, actions[:-1], init_belief,
    bottle(encoder, (observations[1:], )), nonterminals[:-1])

observation_loss = F.mse_loss(
    bottle(observation_model, (beliefs, posterior_states)),
    observations[1:],
    reduction='none').sum(dim=2 if args.symbolic else (2, 3, 4)).mean(dim=(0, 1))
reward_loss = F.mse_loss(
    bottle(reward_model, (beliefs, posterior_states)),
    rewards[1:],
    reduction='none').mean(dim=(0, 1))
# Transition loss
kl_loss = torch.max(
    kl_divergence(
for s in tqdm(range(args.collect_interval)):
    # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
    observations, actions, rewards, nonterminals = D.sample(
        args.batch_size, args.chunk_size)  # Transitions start at time t = 0
    # Create initial belief and state for time t = 0
    init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=args.device), \
        torch.zeros(args.batch_size, args.state_size, device=args.device)
    # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
        init_state, actions[:-1], init_belief,
        bottle(encoder, (observations[1:], )), nonterminals[:-1])
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
    # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
    if args.worldmodel_MSEloss:
        observation_loss = F.mse_loss(
            bottle(observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum(
                dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    else:
        observation_dist = Normal(
            bottle(observation_model, (beliefs, posterior_states)), 1)
        observation_loss = -observation_dist.log_prob(observations[1:]).sum(
            dim=2 if args.symbolic_env else (2, 3, 4)).mean(dim=(0, 1))
    if args.algo == "p2e":
        if args.zero_shot:
def run(self) -> None:
    actor_loss, value_loss = None, None
    while self.child_conn.recv() == 1:
        for _ in range(args.sub_traintime):
            with FreezeParameters(self.env_model_modules):
                actor_states = torch.load(
                    os.path.join(os.getcwd(), self.results_dir + '/actor_states.pt'))
                actor_beliefs = torch.load(
                    os.path.join(os.getcwd(), self.results_dir + '/actor_beliefs.pt'))
                actor_states = actor_states.cuda() if torch.cuda.is_available() and not args.disable_cuda else actor_states.cpu()
                actor_beliefs = actor_beliefs.cuda() if torch.cuda.is_available() and not args.disable_cuda else actor_beliefs.cpu()
                imagination_traj = imagine_ahead(
                    actor_states, actor_beliefs, self.actor_l, self.transition_model,
                    args.planning_horizon, action_scale=self.process_id)
            imged_beliefs, imged_prior_states, imged_prior_means, imged_prior_std_devs = imagination_traj

            # Update model parameters
            with FreezeParameters(self.env_model_modules + self.value_model_l_modules):
                imged_reward = bottle(self.reward_model, (imged_beliefs, imged_prior_states))
                value_pred = bottle(self.value_l, (imged_beliefs, imged_prior_states))
            returns = lambda_return(imged_reward, value_pred,
                                    bootstrap=value_pred[-1],
                                    discount=args.discount,
                                    lambda_=args.disclam)
            actor_loss = -torch.mean(returns)

            # Calculate local gradients and push local parameters to global
            self.actor_optimizer_l.zero_grad()
            actor_loss.backward()
            nn.utils.clip_grad_norm_(self.actor_l.parameters(), args.grad_clip_norm, norm_type=2)
            # for la, ga in zip(self.actor_l.parameters(), self.actor_g.parameters()):
            #     ga._grad = la.grad
            self.actor_optimizer_l.step()
            # Push global parameters
            # self.actor_l.load_state_dict(self.actor_g.state_dict())

            # Dreamer implementation: value loss calculation and optimization
            with torch.no_grad():
                value_beliefs = imged_beliefs.detach()
                value_prior_states = imged_prior_states.detach()
                target_return = returns.detach()
            value_dist = Normal(
                bottle(self.value_l, (value_beliefs, value_prior_states)),
                1)  # detach the input tensor from the transition network
            value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))

            # Update value parameters
            self.value_optimizer_l.zero_grad()
            value_loss.backward()
            nn.utils.clip_grad_norm_(self.value_l.parameters(), args.grad_clip_norm, norm_type=2)
            self.value_optimizer_l.step()

            # Save the loss data
            self.losses.append([actor_loss.item(), value_loss.item()])

        if self.count == args.collect_interval - 1:
            losses = tuple(zip(*self.losses))
            self.metrics['actor_loss'].append(losses[0])
            self.metrics['value_loss'].append(losses[1])
            Save_Txt(self.metrics['episodes'][-1], self.metrics['actor_loss'][-1],
                     'actor_loss' + str(self.process_id), self.results_dir)
            Save_Txt(self.metrics['episodes'][-1], self.metrics['value_loss'][-1],
                     'value_loss' + str(self.process_id), self.results_dir)
            self.count = 0
            self.losses = []
            self.metrics['episodes'].append(self.metrics['episodes'][-1] + 1)

        self.count += 1
        self.child_conn.send(1)
                    total=args.episodes, initial=metrics['episodes'][-1] + 1):
    # Model fitting
    losses = []
    for s in tqdm(range(args.collect_interval)):
        # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
        observations, actions, rewards, nonterminals = D.sample(
            args.batch_size, args.chunk_size)  # Transitions start at time t = 0
        # Create initial state for time t = 0
        init_state = torch.zeros(args.batch_size, args.state_size, device=args.device)
        # Update state using posterior from previous state, previous action and current observation (over entire sequence at once)
        prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
            init_state, actions[:-1], bottle(encoder, (observations[1:], )))
        # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
        # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
        observation_loss = F.mse_loss(
            bottle(observation_model, (posterior_states, )),
            observations[1:],
            reduction='none').sum(dim=(2, 3, 4)).mean(dim=(0, 1))
        kl_loss = torch.max(
            kl_divergence(Normal(posterior_means, posterior_std_devs),
                          Normal(prior_means, prior_std_devs)).sum(dim=2),
            free_nats).mean(dim=(0, 1))  # Note that normalisation by overshooting distance and weighting by overshooting distance cancel out
        if args.global_kl_beta != 0:
            kl_loss += args.global_kl_beta * kl_divergence(
                Normal(posterior_means, posterior_std_devs),
for s in tqdm(range(args.collect_interval)):
    # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
    observations, actions, rewards, nonterminals = D.sample(
        args.batch_size, args.chunk_size)  # Transitions start at time t = 0
    # Create initial belief and state for time t = 0
    init_belief, init_state = torch.zeros(args.batch_size, args.belief_size, device=device), \
        torch.zeros(args.batch_size, args.state_size, device=device)
    # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
    beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
        init_state, actions[:-1], init_belief,
        bottle(encoder, (observations[1:], )), nonterminals[:-1])
    # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting);
    # sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
    if args.worldmodel_LogProbLoss:
        observation_dist = Normal(
            bottle(observation_model, (beliefs, posterior_states)), 1)
        observation_loss = -observation_dist.log_prob(
            observations[1:]).sum(dim=(2, 3, 4)).mean(dim=(0, 1))
    else:
        observation_loss = F.mse_loss(
            bottle(observation_model, (beliefs, posterior_states)),
            observations[1:],
            reduction='none').sum(dim=(2, 3, 4)).mean(dim=(0, 1))
    if args.worldmodel_LogProbLoss:
        reward_dist = Normal(
            bottle(reward_model, (beliefs, posterior_states)), 1)
        reward_loss = -reward_dist.log_prob(rewards[:-1]).mean(dim=(0, 1))
# We also sample augmented observations
observations, actions, rewards, nonterminals, observations_aug0, observations_aug1 = D.sample(
    args.batch_size, args.chunk_size)  # Transitions start at time t = 0
# Combine the two augmented observation batches
obs_aug_both = torch.cat((observations_aug0, observations_aug1), dim=1)
# Perhaps repeat is enough
obs_gt = torch.cat((observations, observations), dim=1)
rewards_gt = torch.cat((rewards, rewards), dim=1)
# Create initial belief and state for time t = 0
init_belief, init_state = torch.zeros(args.batch_size * 2, args.belief_size, device=device), \
    torch.zeros(args.batch_size * 2, args.state_size, device=device)
nonterminals_both = torch.cat((nonterminals, nonterminals), dim=1)
# Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
# The augmented observations are used for state estimation
beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = transition_model(
    init_state, actions[:-1], init_belief,
    bottle(encoder, (obs_aug_both[1:], )), nonterminals_both[:-1])
# The original observations are used for reconstruction
if args.worldmodel_LogProbLoss:
    observation_dist = Normal(
        bottle(observation_model, (beliefs, posterior_states)), 1)
    observation_loss = -observation_dist.log_prob(
        obs_gt[1:]).sum(dim=(2, 3, 4)).mean(dim=(0, 1))
else:
    observation_loss = F.mse_loss(
        bottle(observation_model, (beliefs, posterior_states)),
        obs_gt[1:],
        reduction='none').sum(dim=(2, 3, 4)).mean(dim=(0, 1))
if args.worldmodel_LogProbLoss:
    reward_dist = Normal(
        bottle(reward_model, (beliefs, posterior_states)), 1)
    reward_loss = -reward_dist.log_prob(rewards_gt[:-1]).mean(dim=(0, 1))
else:
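# NOTE: the replay buffer above returns two independently augmented copies of each
# observation batch, but the augmentation itself is not shown in this excerpt. Below is
# a minimal sketch of a DrQ-style random-shift augmentation (pad and random crop), a
# common choice for this kind of consistency training; the function name and pad size
# are assumptions, not the repository's actual implementation.
def random_shift(images, pad=4):
    # images: (batch, channels, height, width)
    n, c, h, w = images.shape
    padded = F.pad(images, (pad, pad, pad, pad), mode='replicate')
    top = torch.randint(0, 2 * pad + 1, (n,), device=images.device)
    left = torch.randint(0, 2 * pad + 1, (n,), device=images.device)
    out = torch.empty_like(images)
    for i in range(n):
        t, l = int(top[i]), int(left[i])
        out[i] = padded[i, :, t:t + h, l:l + w]
    return out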