def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
    """Build the actor/critic pair and their Adam optimizers.

    Hyper-parameters (device, log_std, learning_rate) are read from
    ``args``; the PPO epoch count and minibatch size are fixed here.
    """
    self.device = args.device
    self.config = args
    # Choose the network family matching the observation/action space.
    if is_atari:
        # Pixel observations: convolutional actor and critic.
        self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
        self.critic = CNNCritic(state_dim).to(self.device)
    elif is_dict_action:
        # Discrete action space: categorical policy.
        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device)
        self.critic = Value(state_dim).to(self.device)
    else:
        # Continuous action space: Gaussian policy with configurable log-std.
        self.actor = Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
        self.critic = Value(state_dim).to(self.device)
    # initialize optimizer for actor and critic
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.learning_rate)
    # optimization epoch number and batch size for PPO
    self.optim_epochs = 10
    self.optim_batch_size = 64
def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
    """Store configuration and build the actor, its optimizer, and loss."""
    self.device = args.device
    self.config = args
    self.is_dict_action = is_dict_action
    self.is_atari = is_atari
    self.state_dim = state_dim
    # Categorical policy for discrete action spaces, Gaussian otherwise.
    if is_dict_action:
        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device)
    else:
        self.actor = Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
    # Cross-entropy fits class-index targets; MSE fits continuous actions.
    if self.is_dict_action:
        self.actor_loss = nn.CrossEntropyLoss()
    else:
        self.actor_loss = nn.MSELoss()
return env np.random.seed(args.seed) torch.manual_seed(args.seed) if use_gpu: torch.cuda.manual_seed_all(args.seed) env_dummy = env_factory(0) state_dim = env_dummy.observation_space.shape[0] is_disc_action = len(env_dummy.action_space.shape) == 0 action_dim = (1 if is_disc_action else env_dummy.action_space.shape[0]) ActionTensor = LongTensor if is_disc_action else DoubleTensor """define actor, critic and discrimiator""" if is_disc_action: policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n) else: policy_net = Policy(state_dim, env_dummy.action_space.shape[0]) value_net = Value(state_dim) discrim_net = Discriminator(state_dim + action_dim) discrim_criterion = nn.BCELoss() if use_gpu: policy_net = policy_net.cuda() value_net = value_net.cuda() discrim_net = discrim_net.cuda() discrim_criterion = discrim_criterion.cuda() optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
# Dict-style observation (gym GoalEnv): the full state is the concatenation
# of 'observation' and 'desired_goal'; 'achieved_goal' fixes the subgoal size.
state = env.reset()
state_dim = state['observation'].shape[0] + state['desired_goal'].shape[0]
subgoal_dim = state['achieved_goal'].shape[0]
is_disc_action = len(env.action_space.shape) == 0
#running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        # Hierarchical setup: the manager emits subgoals from the full state;
        # the worker acts on the state minus the goal component.
        policy_mgr = Policy(state_dim, subgoal_dim, log_std=args.log_std, activation_factor=5)
        policy_wrk = Policy(state_dim - subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim - subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
# NOTE(review): only the policy networks are moved to `device` here;
# value_mgr / value_wrk are not — confirm they are moved elsewhere.
policy_mgr.to(device)
policy_wrk.to(device)
def create_networks():
    """Build policy/value/discriminator networks with optimizers and LR
    schedulers, construct the per-objective `learned_reward` provider, and
    return an `update_params` closure that runs one adversarial-imitation
    + PPO update.

    Relies on enclosing-scope names (env, args, state_dim, action_dim,
    expert_traj, dtype, device, cuda, discrim_criterion, ...) — assumed
    defined by the caller.
    """
    """define actor and critic"""
    if is_disc_action:
        # Categorical policy for discrete action spaces.
        policy_net = DiscretePolicy(state_dim, env.action_space.n, hidden_size=(64, 32), activation='relu')
    else:
        # Gaussian policy for continuous action spaces.
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=(64, 32), activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    # The discriminator family depends on the chosen imitation objective flag.
    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=64, activation='relu', slope=0.1, dropout=False, dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        #discrim_net = KernelNet(state_dim + action_dim,state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=noise_dim, activation='relu', slope=0.1, dropout=False, dprob=0.2)
        # Learned kernel network for the implicit-kernel MMD objective;
        # it trains at half the base learning rate.
        kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim, activation='relu', slope=0.1, dropout=False, dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones, gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
    scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_decay)
    # Per-objective surrogate reward handed to the RL agent.
    if args.WGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                # Negative critic score: higher when closer to expert data.
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                # return -discrim_net(state_action).sum().item()
        learned_reward = ExpertReward()
    elif args.EBGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                # Energy-based reward: negative reconstruction error + margin.
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(recon_out, state_action).item() + args.r_margin
        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:
        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                # MMD rewards are computed in batch inside update_params;
                # per-step this only returns a constant bias.
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                # Cached squared norms of the expert/generator encodings.
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))
        learned_reward = ExpertReward()
    else:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                # GAIL-style reward: -log D(s, a).
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())
        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env, policy_net, device, custom_reward=learned_reward, running_state=None, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter):
        """One discriminator + PPO update from a sampled batch; returns the
        per-sample reward tensor used for the generator update."""
        # Cap the batch at args.min_batch_size samples.
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            #dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            # Sample an expert minibatch of the same size as the policy batch.
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(expert_traj[exp_idx, :]).to(dtype).to(device)
            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                # Temporarily add a trailing dim so 1-D actions concatenate.
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)
            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # tbd, no discriminaotr learning
                pass
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)
            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()
            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                # Hinge: only penalize generator reconstructions below margin.
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                #mmd2_D,K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake, args.sigma_list)
                #tbd
                #rewards = K[0]+K[1]-2*K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD
                # prep for generator
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                # NOTE(review): e_o_enc / g_o_enc are never assigned in this
                # function (the EBGAN/GMMIL/GEOMGAN case above is a no-op),
                # so this path would raise NameError if executed — confirm.
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  #+ args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD
                # prep for generator
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                    discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))
            # NOTE(review): there is no discrim_loss.backward() /
            # optimizer_discrim.step() here; only the kernel optimizer ever
            # steps — confirm the discriminator is intentionally frozen.
            if args.GEOMGAN:
                optimizer_kernel.step()
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            # Re-shuffle each epoch, then sweep sequential minibatches.
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * args.ppo_batch_size, min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b,
                         returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)
        return rewards

    # GEOMGAN additionally returns the kernel net and its optimizer/scheduler.
    if args.GEOMGAN:
        return policy_net,value_net,discrim_net,kernel_net,optimizer_policy,optimizer_value,optimizer_discrim,optimizer_kernel,agent,update_params \
            ,scheduler_policy,scheduler_value,scheduler_discrim,scheduler_kernel
    else:
        return policy_net,value_net,discrim_net,optimizer_policy,optimizer_value,optimizer_discrim,agent,update_params \
            ,scheduler_policy,scheduler_value,scheduler_discrim
def train(**kwargs):
    """Train a PPO agent on ``args.env_name`` and return its evaluation score.

    Keyword Args:
        lr (float): Adam learning rate shared by the policy and value nets.
        gamma (float): discount factor used in advantage estimation.

    Returns:
        Whatever ``agent.evaluate()`` returns after ``args.max_iter_num``
        sample/update iterations.
    """
    config = {"lr": kwargs['lr'], "gamma": kwargs['gamma']}
    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        # Context manager so the checkpoint file handle is always closed.
        with open(args.model_path, "rb") as f:
            policy_net, value_net, running_state = pickle.load(f)
    policy_net.to(device)
    value_net.to(device)

    # BUG FIX: the optimizers were previously created inside main_loop(), so
    # update_params() could not see them (NameError at the first PPO step).
    # Create them here, in the scope shared by both closures.
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state,
                  render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter, config):
        """One PPO update (optim_epochs sweeps of shuffled minibatches)."""
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        # Old values/log-probs are frozen targets for the clipped objective.
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            # Fresh shuffle each epoch; .clone() detaches from old storage.
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b,
                         returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        """Sample/update loop; returns the final evaluation result."""
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter + 1) % args.save_model_interval == 0:
                # Move nets to CPU before pickling so the checkpoint is portable.
                to_device(torch.device('cpu'), policy_net, value_net)
                with open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb') as f:
                    pickle.dump((policy_net, value_net, running_state), f)
                to_device(device, policy_net, value_net)

            """clean up gpu memory"""
            torch.cuda.empty_cache()
        return agent.evaluate()

    print(config)
    print(args)
    return main_loop(config)
class PPO(object):
    """Proximal Policy Optimization agent with the clipped surrogate
    objective; holds the actor/critic pair and their optimizers."""

    def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
        # Hyper-parameters (device, log_std, learning_rate, gamma, tau,
        # clip_epsilon, l2_reg) are read from `args`.
        self.device = args.device
        self.config = args
        if is_atari:
            # Pixel observations: convolutional actor/critic.
            self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
            self.critic = CNNCritic(state_dim).to(self.device)
        else:
            # Categorical policy for discrete actions, Gaussian otherwise.
            self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
                Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
            self.critic = Value(state_dim).to(self.device)
        # initialize optimizer for actor and critic
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.learning_rate)
        # optimization epoch number and batch size for PPO
        self.optim_epochs = 10
        self.optim_batch_size = 64

    def train(self, batch):
        """
        Train the policy using the given batch.
        :param batch: object with .state/.action/.reward/.mask array sequences
        :return: None
        """
        states = torch.DoubleTensor(np.stack(batch.state)).to(self.device)
        actions = torch.DoubleTensor(np.stack(batch.action)).to(self.device)
        rewards = torch.DoubleTensor(np.stack(batch.reward)).to(self.device)
        masks = torch.DoubleTensor(np.stack(batch.mask)).to(self.device)
        # Old values/log-probs are frozen targets for the clipped objective.
        with torch.no_grad():
            values = self.critic(states)
            fixed_log_probs = self.actor.get_log_prob(states, actions)
        # get advantage estimation from the trajectories
        advantages, returns = estimate_advantages(rewards, masks, values, self.config.gamma, self.config.tau, self.device)
        # compute minibatch size
        optim_iter_num = int(math.ceil(states.shape[0] / self.optim_batch_size))
        # PPO updates
        for _ in range(self.optim_epochs):
            # Re-shuffle every epoch; .clone() detaches from the old storage.
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = torch.LongTensor(perm).to(self.device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * self.optim_batch_size, min((i + 1) * self.optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                self.ppo_step(states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b)

    def ppo_step(self, states, actions, returns, advantages, fixed_log_probs):
        """
        A PPO policy gradient update step.
        :param states: minibatch of states
        :param actions: minibatch of actions
        :param returns: empirical returns — regression targets for the critic
        :param advantages: advantage estimates weighting the policy gradient
        :param fixed_log_probs: log-probs under the pre-update policy
        :return: None
        """
        # update critic, for now assume one epoch
        values_pred = self.critic(states)
        value_loss = (values_pred - returns).pow(2).mean()
        # weight decay
        for param in self.critic.parameters():
            value_loss += param.pow(2).sum() * self.config.l2_reg
        self.critic_optimizer.zero_grad()
        value_loss.backward()
        self.critic_optimizer.step()
        # update actor
        log_probs = self.actor.get_log_prob(states, actions)
        # Importance ratio pi_new / pi_old, clipped per the PPO objective.
        ratio = torch.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.config.clip_epsilon, 1.0 + self.config.clip_epsilon) * advantages
        policy_surr = -torch.min(surr1, surr2).mean()
        self.actor.zero_grad()
        policy_surr.backward()
        # Gradient norm clipping for stability.
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 40)
        self.actor_optimizer.step()
np.random.seed(args.seed) torch.manual_seed(args.seed) if use_gpu: torch.cuda.manual_seed_all(args.seed) env_dummy = env_factory(0) state_dim = env_dummy.observation_space.shape[0] is_disc_action = len(env_dummy.action_space.shape) == 0 ActionTensor = LongTensor if is_disc_action else DoubleTensor running_state = ZFilter((state_dim, ), clip=5) # running_reward = ZFilter((1,), demean=False, clip=10) """define actor and critic""" if args.model_path is None: if is_disc_action: policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n) else: policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std) value_net = Value(state_dim) else: policy_net, value_net, running_state = pickle.load( open(args.model_path, "rb")) if use_gpu: policy_net = policy_net.cuda() value_net = value_net.cuda() del env_dummy """create agent""" agent = Agent(env_factory, policy_net,
# Dict-style observation (gym GoalEnv): the full state is observation +
# desired goal; the subgoal is a fixed 3-dimensional target here.
state = env.reset()
state_dim = state['observation'].shape[0] + state['desired_goal'].shape[0]
subgoal_dim = 3
is_disc_action = len(env.action_space.shape) == 0
#running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        # Manager picks one of 7 discrete subgoal options; the worker acts
        # on the state augmented with the chosen subgoal.
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)
else: running_state = None # if args.reward_running_state == 1: # running_reward = ZFilter((1,), demean=False, clip=10) # else: # running_reward = None """seeding""" np.random.seed(args.seed) torch.manual_seed(args.seed) """define actor and critic""" if args.model_path is None: if is_disc_action: policy_net = DiscretePolicy(state_dim, env.action_space.n) else: if args.sac_policy: policy_net = Policy_Tanh_Gaussian(state_dim, env.action_space.shape[0], hidden_size=(64, 64), log_std=args.log_std) else: policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std) value_net = Value(state_dim) else: policy_net, value_net, running_state, = pickle.load(
def create_networks():
    """Build policy/value/discriminator networks with optimizers and LR
    schedulers for the AL/EBGAN/GMMIL/VAKLIL imitation-learning variants,
    construct the per-objective `learned_reward` provider, and return an
    `update_params` closure that runs one discriminator + PPO update.

    Relies on enclosing-scope names (env, args, state_dim, action_dim,
    expert_traj, dtype, device, cuda, discrim_criterion, ...) — assumed
    defined by the caller.
    """
    """define actor and critic"""
    if is_disc_action:
        # Categorical policy for discrete action spaces.
        policy_net = DiscretePolicy(state_dim, env.action_space.n, hidden_size=(64, 32), activation='relu')
    else:
        # Gaussian policy for continuous action spaces.
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=(64, 32), activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')
    # The discriminator family depends on the chosen imitation objective flag.
    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=64, activation='leakyrelu', slope=0.1, dropout=True, dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        # Variational discriminator; forward returns (code, mu, sigma).
        discrim_net = VAEDiscriminator(state_dim + action_dim, num_outputs=noise_dim, sigmoid_out=False, sn=True, test=False, w_init=False, hidden_size_enc=(), hidden_size_dec=(), encode_size=mid_dim, activation='relu', dropout=False)
        # Learned kernel network for the implicit-kernel MMD objective.
        kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim, activation='relu', dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones, gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
    scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay)
    # NOTE(review): the discriminator scheduler uses lr_kernel_decay while
    # policy/value use lr_decay — confirm this asymmetry is intended.
    scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_kernel_decay)
    # Per-objective surrogate reward handed to the RL agent.
    if args.AL:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                # Negative critic score: higher when closer to expert data.
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
        learned_reward = ExpertReward()
    elif args.EBGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                # Energy-based reward: negative reconstruction error + margin.
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(recon_out, state_action).item() + args.r_margin
        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:
        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                # MMD rewards are computed in batch inside update_params;
                # per-step this only returns a constant bias.
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                # Cached squared norms of the expert/generator encodings.
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))
        learned_reward = ExpertReward()
    else:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                # GAIL-style reward: -log D(s, a).
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())
        learned_reward = ExpertReward()
    """create agent"""
    agent = Agent(env, policy_net, device, custom_reward=learned_reward, running_state=None, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter):
        """One discriminator + PPO update from a sampled batch; returns the
        per-sample reward tensor used for the generator update."""
        # Cap the batch at args.min_batch_size samples.
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)
        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            # Sample an expert minibatch of the same size as the policy batch.
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(expert_traj[exp_idx, :]).to(dtype).to(device)
            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                # Temporarily add a trailing dim so 1-D actions concatenate.
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)
            if args.EBGAN or args.GMMIL or args.VAKLIL:
                # Sampled (non-mean) encodings plus posterior parameters.
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake, mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real, mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)
            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()
            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                # NOTE(review): under EBGAN the branch above assigns
                # g_o_enc/e_o_enc, not g_o/e_o, so e_o here is undefined —
                # this path would raise NameError if executed; confirm.
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                # Hinge: only penalize generator reconstructions below margin.
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                # Mixture of the learned implicit kernel and fixed RBF kernels.
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda, args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                # 1e-8: small number for numerical stability
                i_c = 0.2
                # KL-style information bottleneck on the encoder posterior,
                # allowed up to capacity i_c.
                bottleneck_loss = torch.mean((0.5 * torch.sum((torch.cat(
                    (e_mu, g_mu), dim=0)**2) + (torch.cat(
                        (e_sigma, g_sigma), dim=0)**2) - torch.log((torch.cat(
                            (e_sigma, g_sigma), dim=0)**2) + 1e-8) - 1, dim=1))) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (args.lambda_h * penalty)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                    discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))
            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()
        if args.VAKLIL:
            # Recompute rewards with the updated discriminator/kernel, using
            # mean-mode encodings, averaged over both kernel families.
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda, args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards  #.detach()
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            # Re-shuffle each epoch, then sweep sequential minibatches.
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * args.ppo_batch_size, min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b, actions_b,
                         returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)
        return rewards

    # VAKLIL additionally returns the kernel net and its optimizer/scheduler.
    if args.VAKLIL:
        return policy_net,value_net,discrim_net,kernel_net,optimizer_policy,optimizer_value,optimizer_discrim,optimizer_kernel,agent,update_params \
            ,scheduler_policy,scheduler_value,scheduler_discrim,scheduler_kernel
    else:
        return policy_net,value_net,discrim_net,optimizer_policy,optimizer_value,optimizer_discrim,agent,update_params \
            ,scheduler_policy,scheduler_value,scheduler_discrim
force=True, mode='training') return env env_dummy = env_factory(0) state_dim = env_dummy.observation_space.shape[0] is_disc_action = len(env_dummy.action_space.shape) == 0 ActionTensor = LongTensor if is_disc_action else DoubleTensor running_state = ZFilter((state_dim, ), clip=5) # running_reward = ZFilter((1,), demean=False, clip=10) """define actor and critic""" if args.model_path is None: if is_disc_action: policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n) else: policy_net = Policy(state_dim, env_dummy.action_space.shape[0], hidden_size=(500, 500), activation='relu', log_std=args.log_std) value_net = Value(state_dim) else: policy_net, value_net, running_state = pickle.load( open(args.model_path, "rb")) print('loaded pre_trained model!') if args.based_model is True: policy_net.load_state_dict( torch.load(assets_dir() +
# is_disc_action = len(env_dummy.action_space[0]) == 0 is_disc_action = True # ActionTensor = LongTensor if is_disc_action else DoubleTensor ActionTensor = LongTensor if is_disc_action else FloatTensor running_state = ZFilter((state_dim,), clip=5) # running_reward = ZFilter((1,), demean=False, clip=10) """define actor and critic""" policy_net = [] value_net = [] if args.model_path is None: if is_disc_action: for i in range(env_dummy.n): policy_net.append(DiscretePolicy(obs_shape_n[i], act_shape_n[i])) # print(policy_net[i]) else: policy_net = Policy(obs_shape_n[i], env_dummy.action_space.shape[0], log_std=args.log_std) # value_net = Value(state_dim) for i in range(env_dummy.n): value_net.append(Value(obs_shape_n[i]*env_dummy.n)) # print(value_net[i]) else: # TODO policy_net, value_net = pickle.load(open(args.model_path, "rb")) # policy_net = [env_dummy.observation_space[i].shape[0] for i in range(env_dummy.n)] if use_gpu: # policy_net = policy_net.cuda() # value_net = value_net.cuda() for i in range(env_dummy.n):
# Multi-agent setup: one policy/value net per agent, or a single shared pair
# when args.dec_agents is False (team-unified networks).
is_disc_action = len(env.action_space[0].shape) == 0
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
p_nets = []
v_nets = []
p_opts = []
v_opts = []
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        for i in range(env.n_agents):
            p_nets.append(
                DiscretePolicy(args.dec_agents, env.n_agents, state_dim,
                               env.action_space[0].n))
            v_nets.append(Value(env.n_agents, state_dim))
            # add only one policy and value networks if using team unified network settings.
            if args.dec_agents is False:
                break
    else:
        # NOTE(review): the continuous branch builds a single policy_net and
        # passes the discrete action count — confirm this path is ever taken.
        policy_net = Policy(state_dim, env.action_space[0].n, log_std=args.log_std)
else:
    p_nets, v_nets, running_state = pickle.load(open(args.model_path, "rb"))
dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device('cpu')
# Seed torch (and CUDA when available) for reproducibility.
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)
# A throwaway env instance just to read the space dimensions.
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor
# Online state normalizer, clipped at +/-5 standard deviations.
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20,20))
    else:
        # NOTE(review): very small hidden layers (3,3) for the continuous
        # policy vs (20,20) for the discrete one — confirm intentional.
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3,3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
# optimization epoch number and batch size for PPO
optim_epochs = 5
class BC(object):
    """
    A vanilla Behavior Cloning model.

    Trains ``self.actor`` by supervised learning on expert state-action
    pairs: cross-entropy for discrete action spaces, MSE for continuous.
    """

    def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
        """
        :param args: config namespace; must provide ``device``, ``log_std``
            and ``learning_rate``.
        :param state_dim: size of the flat observation vector.
        :param action_dim: action dimensionality (continuous) or count (discrete).
        :param is_dict_action: True for discrete action spaces.
        :param is_atari: kept for interface compatibility; unused here.
        """
        self.device = args.device
        self.config = args
        self.is_dict_action = is_dict_action
        self.is_atari = is_atari
        self.state_dim = state_dim
        # No expert data until set_expert2() is called; train()/train2()
        # are no-ops (return None) until then.
        self.expert_traj = None
        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
        # Cross-entropy for discrete actions, MSE for continuous ones.
        self.actor_loss = nn.CrossEntropyLoss() if self.is_dict_action else nn.MSELoss()

    def set_expert(self, expert_traj, num_trajs):
        """
        Set the expert trajectories from a pool (currently disabled).

        NOTE(review): the original body was commented out
        (``self.expert_traj = np.vstack(expert_traj[:num_trajs])``); kept
        as a deliberate no-op to preserve behavior. Use :meth:`set_expert2`.

        :param expert_traj: pool of expert trajectories (unused).
        :param num_trajs: number of trajectories to take (unused).
        """

    def set_expert2(self, expert_traj):
        """
        Store the expert batch used by :meth:`train` / :meth:`train2`.

        :param expert_traj: either a flat (state || action) array-like
            (consumed by :meth:`train`) or an object with ``.state`` and
            ``.action`` fields (consumed by :meth:`train2`).
        """
        self.expert_traj = expert_traj

    def _optimize(self, expert_states, expert_actions):
        """Run one supervised gradient step; return the loss as a numpy scalar."""
        predicted_actions = self.actor(expert_states)[0]
        self.actor_optimizer.zero_grad()
        loss = self.actor_loss(predicted_actions, expert_actions)
        loss.backward()
        self.actor_optimizer.step()
        return loss.detach().to('cpu').numpy()

    def train(self):
        """
        One BC step on a flat ``(state || action)`` expert array.

        :return: the step's loss as a numpy scalar, or None if no expert
            data has been set.
        """
        if self.expert_traj is None:
            return None
        expert_state_actions = torch.DoubleTensor(self.expert_traj)
        expert_states = expert_state_actions[:, :self.state_dim].to(self.device)
        expert_actions = expert_state_actions[:, self.state_dim:].to(self.device)
        return self._optimize(expert_states, expert_actions)

    def train2(self):
        """
        One BC step on an expert batch with separate ``state``/``action`` fields.

        :return: the step's loss as a numpy scalar, or None if no expert
            data has been set.
        """
        if self.expert_traj is None:
            return None
        # Move to the configured device, consistent with train().
        expert_states = torch.DoubleTensor(self.expert_traj.state).to(self.device)
        expert_actions = torch.DoubleTensor(self.expert_traj.action).to(self.device)
        return self._optimize(expert_states, expert_actions)
# --- Hierarchical (manager/worker) actor-critic setup ---
state, _ = env.reset()
state_dim = state.shape[0]
subgoal_dim = 3  # manager's subgoal vector, appended to the worker's input
# Discrete action spaces have an empty shape tuple.
is_disc_action = len(env.action_space.shape) == 0
#running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None  # observation normalization disabled
"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        # Manager picks one of 4 discrete subgoals; worker acts on
        # state + subgoal.
        policy_mgr = DiscretePolicy(state_dim, 4)
        policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
# NOTE(review): the four names below are only bound in the continuous branch
# above; the discrete and checkpoint paths would raise NameError here —
# confirm this script is only used with continuous action spaces.
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)
# if args.reward_running_state == 1: # running_reward = ZFilter((1,), demean=False, clip=10) # else: # running_reward = None """seeding""" np.random.seed(args.seed) torch.manual_seed(args.seed) """define actor and critic""" if args.model_path is None: if is_disc_action: policy_net = DiscretePolicy(state_dim, env.action_space.n) else: policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std) value_net = Value(state_dim) else: policy_net, value_net, running_state, = pickle.load(open(args.model_path, "rb")) policy_net.to(device) value_net.to(device) optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate) # optimization epoch number and batch size for PPO
# --- Device selection, seeding, and actor/critic construction ---
device = torch.device(
    'cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu_index)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env_dummy = env_factory(0)  # throwaway env, used only to read space dims
state_dim = env_dummy.observation_space.shape[0]
# Discrete action spaces have an empty shape tuple.
is_disc_action = len(env_dummy.action_space.shape) == 0
# Observation normalizer: running mean/std, clipped to +-5.
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    # Resume from a checkpoint (networks + observation normalizer).
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy
"""create agent"""
# NOTE(review): the Agent(...) call is truncated in this chunk.
agent = Agent(env_factory, policy_net, device,
# --- Discrete-only actor/critic setup driven by an experiment config dict ---
torch.cuda.set_device(exp_args["config"]["gpu-index"])
""" environment """
env = gym.make(exp_args["config"]["env-name"])
state_dim = env.observation_space.shape[0]
is_discrete_action_space = len(
    env.action_space.shape) == 0  # shape is empty () for discrete environments
# Observation normalizer: running mean/std, clipped to +-5.
running_state = ZFilter((state_dim, ), clip=5)
""" Seeding """
np.random.seed(exp_args["config"]["seed"])
torch.manual_seed(exp_args["config"]["seed"])
env.seed(exp_args["config"]["seed"])
""" define policy(actor) and critic(value function predictor) """
if is_discrete_action_space:
    policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                exp_args["model"]["hidden"],
                                exp_args["model"]["activation"])
else:
    raise ValueError(
        "Policy for Continous Action Space is not implemented yet")
value_net = Value(state_dim, exp_args["model"]["hidden"],
                  exp_args["model"]["activation"])
policy_net.to(device)
value_net.to(device)
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=exp_args["config"]["lr"])
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=exp_args["config"]["lr"])
# --- Double-precision actor/critic setup ---
dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device(
    'cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env_dummy = env_factory(0)  # throwaway env, used only to read space dims
state_dim = env_dummy.observation_space.shape[0]
# Discrete action spaces have an empty shape tuple.
is_disc_action = len(env_dummy.action_space.shape) == 0
# Observation normalizer: running mean/std, clipped to +-5.
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    # Resume from a checkpoint (networks + observation normalizer).
    policy_net, value_net, running_state = pickle.load(
        open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy
# NOTE(review): learning rate is hard-coded to 0.01 here instead of
# args.learning_rate, unlike the sibling scripts — confirm intended.
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)
"""create agent"""
# --- Policy construction with optional CNN head and auxiliary inputs ---
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)
"""define actor and critic"""
# CarRacing-v0 gets one auxiliary (non-pixel) input; others get none.
if args.env_name == 'CarRacing-v0':
    num_aux = 1
else:
    num_aux = 0
if num_aux > 0:
    # Separate running normalizer for the auxiliary inputs.
    aux_running_state = ZFilter(num_aux, clip=5)
else:
    aux_running_state = None
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, env.action_space.n)
elif is_img_state:
    # Image observations: CNN policy configured from cnn_options.
    policy_net = CNNPolicy(state_dim, env.action_space.shape[0],
                           cnn_options['channels'],
                           cnn_options['kernel_sizes'],
                           cnn_options['strides'],
                           head_hidden_size=cnn_options['head_hidden_sizes'],
                           num_aux=num_aux,
                           log_std=args.log_std,
                           resnet_first_layer=args.cnn_resnet_first_layer)
else:
    # Flat continuous observations: plain MLP Gaussian policy.
    policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)