def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
    self.device = args.device
    self.config = args
    if is_atari:
        self.actor = CNNPolicy(state_dim, action_dim).to(self.device)
        self.critic = CNNCritic(state_dim).to(self.device)
    else:
        self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
            Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
        self.critic = Value(state_dim).to(self.device)

    # initialize optimizers for actor and critic
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.config.learning_rate)

    # optimization epoch number and batch size for PPO
    self.optim_epochs = 10
    self.optim_batch_size = 64
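# Hedged sketch (assumption, not taken from this codebase): the clipped-surrogate
# update that a ppo_step-style helper typically performs for an agent like the one
# above, driven by optim_epochs mini-batch passes of size optim_batch_size.
# The name ppo_step_sketch and the exact regularisation are illustrative;
# policy_net.get_log_prob is assumed to return per-sample log-probabilities.
import torch


def ppo_step_sketch(policy_net, value_net, optimizer_policy, optimizer_value,
                    states, actions, returns, advantages, fixed_log_probs,
                    clip_epsilon, l2_reg):
    # critic: regress V(s) onto the estimated returns, with L2 weight penalty
    values_pred = value_net(states)
    value_loss = (values_pred - returns).pow(2).mean()
    for param in value_net.parameters():
        value_loss += param.pow(2).sum() * l2_reg
    optimizer_value.zero_grad()
    value_loss.backward()
    optimizer_value.step()

    # actor: PPO clipped surrogate objective on the importance ratio
    log_probs = policy_net.get_log_prob(states, actions)
    ratio = torch.exp(log_probs - fixed_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    optimizer_policy.step()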
def __init__(self, input_dim_single_agent, action_dim_single_agent, num_agents,
             activation_name='tanh', hidden_dims=[16], log_std=0):
    super(MultiChannelPolicy, self).__init__(
        input_dim_single_agent, action_dim_single_agent, num_agents
    )
    self.model = Policy(
        state_dim=input_dim_single_agent,
        action_dim=action_dim_single_agent,
        hidden_size=hidden_dims,
        activation=activation_name,
        log_std=log_std
    )
def __init__(self, state_dim, action_dim, channels, kernel_sizes, strides, paddings=None,
             head_hidden_size=(128, 128), num_aux=0, activation='relu', use_maxpool=False,
             log_std=0, resnet_first_layer=False):
    super().__init__(state_dim, action_dim, channels, kernel_sizes, strides, paddings,
                     activation, use_maxpool, num_aux, resnet_first_layer)
    self.head = Policy(self.conv_out_size_for_fc + num_aux, action_dim,
                       head_hidden_size, activation, log_std)
def __init__(self, args, state_dim, action_dim, is_dict_action=False, is_atari=False):
    self.device = args.device
    self.config = args
    self.is_dict_action = is_dict_action
    self.is_atari = is_atari
    self.state_dim = state_dim
    self.actor = DiscretePolicy(state_dim, action_dim).to(self.device) if is_dict_action else \
        Policy(state_dim, action_dim, log_std=self.config.log_std).to(self.device)
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.config.learning_rate)
    self.actor_loss = nn.CrossEntropyLoss() if self.is_dict_action else nn.MSELoss()
torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n, hidden_size=(20, 20))
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std, hidden_size=(3, 3))
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)

# optimization epoch number and batch size for PPO
optim_epochs = 5
optim_batch_size = 64
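# Hedged sketch (assumption): ZFilter is used throughout these snippets as a running
# observation normaliser in the style of Schulman's modular_rl. The class below is an
# illustrative stand-in (ZFilterSketch is a hypothetical name), not the project's own
# implementation: it keeps Welford running statistics and clips the normalised output.
import numpy as np


class ZFilterSketch:
    """y = clip((x - running_mean) / (running_std + 1e-8), -clip, clip)"""

    def __init__(self, shape, clip=10.0):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)   # sum of squared deviations (Welford)
        self.clip = clip
        self.fix = False            # freeze statistics, e.g. at evaluation time

    def __call__(self, x):
        x = np.asarray(x, dtype=np.float64)
        if not self.fix:
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)
        std = np.sqrt(self.m2 / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)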
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.linear:
    hidden_size = ()
else:
    hidden_size = (64,)
if args.model_path is None:
    policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=hidden_size)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)

"""create agent"""
agent = Agent(env, policy_net, device, running_state=running_state, render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
    states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
    actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
    rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
    masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
is_disc_action = len(env.action_space.shape) == 0
# running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = Policy(state_dim, subgoal_dim, log_std=args.log_std, activation_factor=5)
        policy_wrk = Policy(state_dim - subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim - subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
action_dim = (1 if is_disc_action else env_dummy.action_space.shape[0])
ActionTensor = LongTensor if is_disc_action else DoubleTensor

"""define actor, critic and discriminator"""
if is_disc_action:
    policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
else:
    policy_net = Policy(state_dim, env_dummy.action_space.shape[0])
value_net = Value(state_dim)
discrim_net = Discriminator(state_dim + action_dim)
discrim_criterion = nn.BCELoss()
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
    discrim_net = discrim_net.cuda()
    discrim_criterion = discrim_criterion.cuda()

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
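# Hedged sketch (assumption): one GAIL-style discriminator update consistent with the
# BCE setup above, written as a standalone helper for illustration (gail_discrim_step
# is a hypothetical name). Policy samples are labelled 1 and expert samples 0, matching
# the convention used in the update code later in this file, so the policy's surrogate
# reward becomes -log(D(s, a)).
import torch
import torch.nn as nn


def gail_discrim_step(discrim_net, optimizer_discrim, expert_sa, policy_sa, discrim_criterion=None):
    discrim_criterion = discrim_criterion if discrim_criterion is not None else nn.BCELoss()
    g_o = discrim_net(policy_sa)   # generator (policy) state-action pairs
    e_o = discrim_net(expert_sa)   # expert state-action pairs
    optimizer_discrim.zero_grad()
    discrim_loss = discrim_criterion(g_o, torch.ones_like(g_o)) + \
        discrim_criterion(e_o, torch.zeros_like(e_o))
    discrim_loss.backward()
    optimizer_discrim.step()
    return discrim_loss.item()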
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n,
                                    hidden_size=(64, 32), activation='relu')
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std,
                            hidden_size=(64, 32), activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')

    if args.WGAN:
        discrim_net = SNDiscriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=64,
                                        activation='relu', slope=0.1, dropout=False, dprob=0.2)
    elif args.GEOMGAN:
        # new kernel
        # discrim_net = KernelNet(state_dim + action_dim, state_dim + action_dim)
        noise_dim = 64
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=noise_dim,
                                        activation='relu', slope=0.1, dropout=False, dprob=0.2)
        kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim,
                              activation='relu', slope=0.1, dropout=False, dprob=0.2)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate / 2)
        scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones, gamma=args.lr_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
    scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_decay)

    if args.WGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()
                    # return -discrim_net(state_action).sum().item()

        learned_reward = ExpertReward()
    elif args.EBGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.GEOMGAN:
        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()

    """create agent"""
    agent = Agent(env, policy_net, device, custom_reward=learned_reward,
                  running_state=None, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            # dataSize = states.size()[0]
            # expert_state_actions = torch.from_numpy(expert_traj).to(dtype).to(device)
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(expert_traj[exp_idx, :]).to(dtype).to(device)
            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.GEOMGAN:
                # tbd, no discriminator learning
                pass
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.GEOMGAN:
                optimizer_kernel.zero_grad()

            if args.WGAN:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                # mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                mmd2_D, K = mix_rbf_mmd2(dis_input_real, dis_input_fake, args.sigma_list)  # tbd
                # rewards = K[0] + K[1] - 2 * K[2]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD
                # prep for generator
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
            elif args.GEOMGAN:
                # larger, better, but slower
                noise_num = 100
                mmd2_D, K = mix_imp_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim, kernel_net, cuda)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()
                errD = mmd2_D  # + args.lambda_rg * one_side_errD
                discrim_loss = -errD  # maximize errD
                # prep for generator
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                    discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            if args.GEOMGAN:
                optimizer_kernel.step()

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * args.ppo_batch_size, min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                         states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b,
                         args.clip_epsilon, args.l2_reg)
        return rewards

    if args.GEOMGAN:
        return policy_net, value_net, discrim_net, kernel_net, optimizer_policy, optimizer_value, \
            optimizer_discrim, optimizer_kernel, agent, update_params, \
            scheduler_policy, scheduler_value, scheduler_discrim, scheduler_kernel
    else:
        return policy_net, value_net, discrim_net, optimizer_policy, optimizer_value, optimizer_discrim, \
            agent, update_params, scheduler_policy, scheduler_value, scheduler_discrim
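# Hedged sketch (assumption): the GMMIL branch above replaces the discriminator loss
# with a maximum mean discrepancy between expert and policy features. The mix_rbf_mmd2
# in the snippet also returns per-sample kernel terms (K) that are reused as rewards; the
# illustrative helper below (mix_rbf_mmd2_sketch, a hypothetical name) only shows the
# biased MMD^2 estimate under a mixture of RBF kernels over sigma_list.
import torch


def mix_rbf_mmd2_sketch(X, Y, sigma_list):
    """X: (n, d) expert features, Y: (m, d) policy features."""
    Z = torch.cat([X, Y], dim=0)
    ZZ = Z @ Z.t()
    sq_norms = ZZ.diag().unsqueeze(0)
    dist2 = sq_norms + sq_norms.t() - 2.0 * ZZ          # pairwise squared distances
    K = sum(torch.exp(-dist2 / (2.0 * sigma ** 2)) for sigma in sigma_list)
    n = X.size(0)
    K_XX, K_YY, K_XY = K[:n, :n], K[n:, n:], K[:n, n:]
    return K_XX.mean() + K_YY.mean() - 2.0 * K_XY.mean()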
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

reirl_weights = Weights(state_dim)
optim_epochs = 3  # 10
optim_batch_size = 64
state_only = True

# load trajectory
expert_traj = pickle.load(open(args.expert_traj_path, "rb"))
running_state = lambda x: x

"""create agent"""
policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
opponent_net = OpponentPolicy(state_dim, env.action_space.shape[0], log_std=args.log_std)
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
optimizer_opponent = torch.optim.Adam(opponent_net.parameters(), lr=args.learning_rate)


def expert_reward(state, next, reward_type):
    weights = torch.from_numpy(reirl_weights.read())
    state = torch.from_numpy(state)
    return torch.matmul(weights, state).detach().numpy()
env_dummy = env_factory(0)
state_dim = 6  # env_dummy.observation_space.shape[0]
action_dim = 13  # env_dummy.action_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, action_dim, hidden_size=(64, 128, 64), log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
if use_gpu:
    policy_net = policy_net.cuda()
    value_net = value_net.cuda()
del env_dummy

# for param in policy_net.parameters():
#     nn.init.normal(param, mean=0, std=1e-2)
optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
def train(**kwargs):
    print('here')
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma']
    }

    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    # optimizers are created here (rather than inside main_loop) so that
    # update_params can resolve them through the enclosing scope
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state, render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter, config):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                         states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b,
                         args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter + 1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(),
                                              'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

            """clean up gpu memory"""
            torch.cuda.empty_cache()

        return agent.evaluate()

    print('a')
    print(config)
    print(args)
    return main_loop(config)
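# Hedged sketch (assumption): a standard GAE(gamma, tau) estimator matching the
# estimate_advantages(rewards, masks, values, gamma, tau, device) call signature used
# in the PPO code above; masks are 0 at episode boundaries and returns are computed as
# values + advantages. The name estimate_advantages_sketch is illustrative only.
import torch


def estimate_advantages_sketch(rewards, masks, values, gamma, tau, device):
    T = rewards.size(0)
    advantages = torch.zeros(T, 1, dtype=values.dtype, device=values.device)
    prev_value, prev_advantage = 0.0, 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * prev_value * masks[t] - values[t]
        advantages[t] = delta + gamma * tau * prev_advantage * masks[t]
        prev_value = values[t].item()
        prev_advantage = advantages[t].item()
    returns = values + advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages.to(device), returns.to(device)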
is_disc_action = len(env.action_space.shape) == 0
# running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 4)
        policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)
p_opts = []
v_opts = []

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        for i in range(env.n_agents):
            p_nets.append(DiscretePolicy(args.dec_agents, env.n_agents, state_dim, env.action_space[0].n))
            v_nets.append(Value(env.n_agents, state_dim))
            # add only one policy and one value network if using the team-unified network setting.
            if args.dec_agents is False:
                break
    else:
        policy_net = Policy(state_dim, env.action_space[0].n, log_std=args.log_std)
else:
    p_nets, v_nets, running_state = pickle.load(open(args.model_path, "rb"))

dtype = torch.float64
torch.set_default_dtype(dtype)
device = torch.device('cpu')
for i in range(env.n_agents):
    p_nets[i].to(device)
    v_nets[i].to(device)
    if args.dec_agents is False:
        break

state = env.reset()
def create_networks():
    """define actor and critic"""
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n, hidden_size=(64, 32), activation='relu')
    else:
        policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std,
                            hidden_size=(64, 32), activation='relu')
    value_net = Value(state_dim, hidden_size=(32, 16), activation='relu')

    if args.AL:
        discrim_net = SNDiscriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')
    elif args.EBGAN or args.GMMIL:
        discrim_net = AESNDiscriminator(state_dim + action_dim, hidden_size=(32, ), encode_size=64,
                                        activation='leakyrelu', slope=0.1, dropout=True, dprob=0.2)
    elif args.VAKLIL:
        noise_dim = 64
        mid_dim = 32
        discrim_net = VAEDiscriminator(state_dim + action_dim, num_outputs=noise_dim, sigmoid_out=False,
                                       sn=True, test=False, w_init=False, hidden_size_enc=(),
                                       hidden_size_dec=(), encode_size=mid_dim, activation='relu',
                                       dropout=False)
        kernel_net = NoiseNet(noise_dim, hidden_size=(32, ), encode_size=noise_dim,
                              activation='relu', dropout=False)
        optimizer_kernel = torch.optim.Adam(kernel_net.parameters(), lr=args.learning_rate)
        scheduler_kernel = MultiStepLR(optimizer_kernel, milestones=args.milestones, gamma=args.lr_kernel_decay)
    else:
        discrim_net = Discriminator(state_dim + action_dim, hidden_size=(32, 16), activation='relu')

    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate)
    optimizer_discrim = torch.optim.Adam(discrim_net.parameters(), lr=args.learning_rate)
    scheduler_policy = MultiStepLR(optimizer_policy, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_value = MultiStepLR(optimizer_value, milestones=args.milestones, gamma=args.lr_decay)
    scheduler_discrim = MultiStepLR(optimizer_discrim, milestones=args.milestones, gamma=args.lr_kernel_decay)

    if args.AL:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -discrim_net(state_action)[0].item()

        learned_reward = ExpertReward()
    elif args.EBGAN:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    _, recon_out = discrim_net(state_action)
                    return -elementwise_loss(recon_out, state_action).item() + args.r_margin

        learned_reward = ExpertReward()
    elif args.GMMIL or args.VAKLIL:
        class ExpertReward():
            def __init__(self):
                self.r_bias = 0

            def expert_reward(self, state, action):
                with torch.no_grad():
                    return self.r_bias

            def update_XX_YY(self):
                self.XX = torch.diag(torch.mm(self.e_o_enc, self.e_o_enc.t()))
                self.YY = torch.diag(torch.mm(self.g_o_enc, self.g_o_enc.t()))

        learned_reward = ExpertReward()
    else:
        class ExpertReward():
            def __init__(self):
                self.a = 0

            def expert_reward(self, state, action):
                state_action = tensor(np.hstack([state, action]), dtype=dtype)
                with torch.no_grad():
                    return -math.log(discrim_net(state_action)[0].item())

        learned_reward = ExpertReward()

    """create agent"""
    agent = Agent(env, policy_net, device, custom_reward=learned_reward,
                  running_state=None, render=args.render, num_threads=args.num_threads)

    def update_params(batch, i_iter):
        dataSize = min(args.min_batch_size, len(batch.state))
        states = torch.from_numpy(np.stack(batch.state)[:dataSize, ]).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)[:dataSize, ]).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)[:dataSize, ]).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)[:dataSize, ]).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """estimate reward"""
        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

        """update discriminator"""
        for _ in range(args.discriminator_epochs):
            exp_idx = random.sample(range(expert_traj.shape[0]), dataSize)
            expert_state_actions = torch.from_numpy(expert_traj[exp_idx, :]).to(dtype).to(device)
            dis_input_real = expert_state_actions
            if len(actions.shape) == 1:
                actions.unsqueeze_(-1)
                dis_input_fake = torch.cat([states, actions], 1)
                actions.squeeze_(-1)
            else:
                dis_input_fake = torch.cat([states, actions], 1)

            if args.EBGAN or args.GMMIL or args.VAKLIL:
                g_o_enc, g_mu, g_sigma = discrim_net(dis_input_fake, mean_mode=False)
                e_o_enc, e_mu, e_sigma = discrim_net(dis_input_real, mean_mode=False)
            else:
                g_o = discrim_net(dis_input_fake)
                e_o = discrim_net(dis_input_real)

            optimizer_discrim.zero_grad()
            if args.VAKLIL:
                optimizer_kernel.zero_grad()

            if args.AL:
                if args.LSGAN:
                    pdist = l1dist(dis_input_real, dis_input_fake).mul(args.lamb)
                    discrim_loss = LeakyReLU(e_o - g_o + pdist).mean()
                else:
                    discrim_loss = torch.mean(e_o) - torch.mean(g_o)
            elif args.EBGAN:
                e_recon = elementwise_loss(e_o, dis_input_real)
                g_recon = elementwise_loss(g_o, dis_input_fake)
                discrim_loss = e_recon
                if (args.margin - g_recon).item() > 0:
                    discrim_loss += (args.margin - g_recon)
            elif args.GMMIL:
                mmd2_D, K = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards.detach()  # exp - gen, maximize (gen label negative)
                errD = mmd2_D
                discrim_loss = -errD  # maximize errD
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)
            elif args.VAKLIL:
                noise_num = 20000
                mmd2_D_net, _, penalty = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim,
                                                              kernel_net, cuda, args.sigma_list)
                mmd2_D_rbf, _ = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                errD = (mmd2_D_net + mmd2_D_rbf) / 2
                # 1e-8: small number for numerical stability
                i_c = 0.2
                bottleneck_loss = torch.mean(
                    (0.5 * torch.sum((torch.cat((e_mu, g_mu), dim=0)**2) +
                                     (torch.cat((e_sigma, g_sigma), dim=0)**2) -
                                     torch.log((torch.cat((e_sigma, g_sigma), dim=0)**2) + 1e-8) - 1,
                                     dim=1))) - i_c
                discrim_loss = -errD + (args.beta * bottleneck_loss) + (args.lambda_h * penalty)
            else:
                discrim_loss = discrim_criterion(g_o, ones((states.shape[0], 1), device=device)) + \
                    discrim_criterion(e_o, zeros((e_o.shape[0], 1), device=device))

            discrim_loss.backward()
            optimizer_discrim.step()
            if args.VAKLIL:
                optimizer_kernel.step()

        if args.VAKLIL:
            with torch.no_grad():
                noise_num = 20000
                g_o_enc, _, _ = discrim_net(dis_input_fake)
                e_o_enc, _, _ = discrim_net(dis_input_real)
                _, K_net, _ = mix_imp_with_bw_mmd2(e_o_enc, g_o_enc, noise_num, noise_dim,
                                                   kernel_net, cuda, args.sigma_list)
                _, K_rbf = mix_rbf_mmd2(e_o_enc, g_o_enc, args.sigma_list)
                K = [sum(x) / 2 for x in zip(K_net, K_rbf)]
                rewards = K[1] - K[2]  # -(exp - gen): -(kxy-kyy)=kyy-kxy
                rewards = -rewards  # .detach()
                advantages, returns = estimate_advantages(rewards, masks, values, args.gamma, args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / args.ppo_batch_size))
        for _ in range(args.generator_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)
            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), advantages[perm].clone(), \
                fixed_log_probs[perm].clone()
            for i in range(optim_iter_num):
                ind = slice(i * args.ppo_batch_size, min((i + 1) * args.ppo_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]
                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1,
                         states_b, actions_b, returns_b, advantages_b, fixed_log_probs_b,
                         args.clip_epsilon, args.l2_reg)
        return rewards

    if args.VAKLIL:
        return policy_net, value_net, discrim_net, kernel_net, optimizer_policy, optimizer_value, \
            optimizer_discrim, optimizer_kernel, agent, update_params, \
            scheduler_policy, scheduler_value, scheduler_discrim, scheduler_kernel
    else:
        return policy_net, value_net, discrim_net, optimizer_policy, optimizer_value, optimizer_discrim, \
            agent, update_params, scheduler_policy, scheduler_value, scheduler_discrim
def learn_model(args):
    print("RL result will be saved at %s" % args.rl_filename)
    print("RL model will be saved at %s" % args.rl_model_filename)
    if use_gpu:
        print("Using CUDA.")

    torch.manual_seed(args.rl_seed)
    if use_gpu:
        torch.cuda.manual_seed_all(args.rl_seed)
        torch.backends.cudnn.deterministic = True
    np.random.seed(args.rl_seed)
    random.seed(args.rl_seed)

    env = gym.make(args.env_name)
    env.seed(args.rl_seed)
    env_test = gym.make(args.env_name)
    env_test.seed(args.rl_seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    a_bound = np.asscalar(env.action_space.high[0])
    a_low = np.asscalar(env.action_space.low[0])
    assert a_bound == -a_low
    ## Binary flag for manually clipping actions in the step function after adding Gaussian noise.
    clip = (args.env_name == "LunarLanderContinuous-v2" or args.env_name == "BipedalWalker-v2")

    print(env.observation_space)
    print(env.action_space)

    """define actor and critic"""
    policy_net = Policy(state_dim, action_dim, log_std=args.log_std, a_bound=a_bound,
                        hidden_size=args.hidden_size, activation=args.activation).to(device)
    value_net = Value(state_dim, hidden_size=args.hidden_size, activation=args.activation).to(device)
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate_v)

    decayed_lambda_td = args.lambda_td

    def update_params_c(batch, i_iter):
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(np.float32)).to(device)

        """get advantage estimation from the trajectories"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(rewards, masks, values,
                                                                     args.gamma, args.tau)
        if args.lamret:
            returns = lambda_returns
        else:
            returns = mc_returns

        """perform critic update"""
        # gae_step(value_net, optimizer_value, states, lambda_returns, args.l2_reg)  # full-batch GD
        gae_step_epoch(value_net, optimizer_value, states, returns, args.l2_reg)  # stochastic GD

    """ Function to update the parameters of value and policy networks"""
    def update_params_p(batch, i_iter):
        nonlocal decayed_lambda_td
        states = torch.from_numpy(np.stack(batch.state)).float().to(device)
        actions = torch.from_numpy(np.stack(batch.action)).float().to(device)
        next_states = torch.from_numpy(np.stack(batch.next_state)).float().to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).float().to(device)
        masks = torch.from_numpy(np.stack(batch.mask).astype(np.float32)).to(device)

        """get advantage estimation from the trajectories, this is done after the gae_step update"""
        values = value_net(states).data
        advantages, lambda_returns, mc_returns = estimate_advantages(rewards, masks, values,
                                                                     gamma=args.gamma, tau=args.tau)
        if args.method_name == "TRPO-RET-MC":
            returns = mc_returns.detach()  # detach() does not matter since we backprop the policy network only.
        elif args.method_name == "TRPO-RET-GAE":
            returns = lambda_returns.detach()  # detach() does not matter actually.
        else:
            returns = 0  # returns is not used for TRPO and TRPO-TD.

        # standardize or not?
        if args.mgae:
            advantages = (advantages - advantages.mean()) / advantages.std()  # mean-subtracted std version
        else:
            advantages = advantages / advantages.std()  # std-only version

        trpo_step_td(policy_net=policy_net, value_net=value_net, states=states, actions=actions,
                     next_states=next_states, rewards=rewards, masks=masks, gamma=args.gamma,
                     advantages=advantages, max_kl=args.max_kl, damping=args.damping,
                     lambda_td=decayed_lambda_td, method_name=args.method_name, returns=returns, mtd=args.mtd)

        """ decay the td_reg parameter after update """
        decayed_lambda_td = decayed_lambda_td * args.decay_td

    """create agent"""
    agent = Agent(env, policy_net, render=False)
    agent_test = Agent(env_test, policy_net, mean_action=True, render=args.render)

    """ The actual learning loop"""
    for i_iter in range(args.rl_max_iter_num):
        """ Save the learned policy model """
        if ((i_iter) % args.rl_save_model_interval == 0 and args.rl_save_model_interval > 0) \
                or (i_iter == args.rl_max_iter_num + 1) or i_iter == 0:
            policy_net = policy_net.to(device_cpu)
            value_net = value_net.to(device_cpu)
            pickle.dump((policy_net, value_net),
                        open(args.rl_model_filename + ("_I%d.p" % (i_iter)), 'wb'))
            policy_net = policy_net.to(device)
            value_net = value_net.to(device)

        """ Test the policy before update """
        if i_iter % args.log_interval == 0 or i_iter + 1 == args.rl_max_iter_num:
            _, log_test = agent_test.collect_samples_test(max_num_episodes=20, render=args.render, clip=clip)

        """generate multiple trajectories that reach the minimum batch_size"""
        t0 = time.time()
        batch, log = agent.collect_samples_train(args.min_batch_size, render=False, clip=clip)  # on-policy samples
        t1 = time.time()

        """ update parameters """
        t0_d = time.time()
        update_params_c(batch, i_iter)  # critic update
        update_params_p(batch, i_iter)  # actor update
        t1_d = time.time()

        """ Print out the result to stdout and save it to a text file for later usage"""
        if i_iter % args.log_interval == 0:
            result_text = t_format("Iter %6d (%2.2fs)+(%2.2fs)" % (i_iter, t1 - t0, t1_d - t0_d))
            result_text += " | [R] " + t_format("Avg: %.2f (%.2f)" % (log['avg_reward'], log['std_reward']), 2)
            result_text += " | [R_test] " + t_format("min: %.2f" % log_test['min_reward'], 1) \
                + t_format("max: %.2f" % log_test['max_reward'], 1) \
                + t_format("Avg: %.2f (%.2f)" % (log_test['avg_reward'], log_test['std_reward']), 2)
            print(result_text)
            with open(args.rl_filename, 'a') as f:
                print(result_text, file=f)
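# Hedged sketch (assumption): the Monte-Carlo returns selected by TRPO-RET-MC above are
# plain discounted reward sums, reset across episode boundaries via the masks tensor.
# discounted_mc_returns is an illustrative helper, not the estimate_advantages used here.
import torch


def discounted_mc_returns(rewards, masks, gamma):
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in reversed(range(rewards.size(0))):
        running = rewards[t] + gamma * running * masks[t]
        returns[t] = running
    return returns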
env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
ActionTensor = LongTensor if is_disc_action else DoubleTensor

running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], hidden_size=(500, 500),
                            activation='relu', log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    print('loaded pre_trained model!')
    if args.based_model is True:
        policy_net.load_state_dict(torch.load(assets_dir() + '/MB_model/net_params2_bestA01-2.pkl'))  # work
        print('loaded net params from model-training.')
if use_gpu:
# load trajectory
subsampled_expert_traj, running_state = pickle.load(open(args.expert_traj_path, "rb"))
running_state.fix = True
print(running_state.clip)
print(subsampled_expert_traj.shape)

expert_traj = []
for t in subsampled_expert_traj:
    for t_i in t:
        expert_traj.append(t_i)
expert_traj = np.asarray(expert_traj)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

policy_net = Policy(state_dim, env.action_space.shape[0])
to_device(device, policy_net)
policy_optimiser = torch.optim.Adam(policy_net.parameters(), lr=0.0001, betas=(0.0, 0.999))

agent = Agent(
    env,
    policy_net,
    device,
    mean_action=False,
    running_state=running_state,
    render=args.render,
    num_threads=args.num_threads,
)
# ActionTensor = LongTensor if is_disc_action else DoubleTensor
ActionTensor = LongTensor if is_disc_action else FloatTensor

running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
policy_net = []
value_net = []
if args.model_path is None:
    if is_disc_action:
        for i in range(env_dummy.n):
            policy_net.append(DiscretePolicy(obs_shape_n[i], act_shape_n[i]))
            # print(policy_net[i])
    else:
        policy_net = Policy(obs_shape_n[i], env_dummy.action_space.shape[0], log_std=args.log_std)
    # value_net = Value(state_dim)
    for i in range(env_dummy.n):
        value_net.append(Value(obs_shape_n[i] * env_dummy.n))
        # print(value_net[i])
else:
    # TODO
    policy_net, value_net = pickle.load(open(args.model_path, "rb"))
    # policy_net = [env_dummy.observation_space[i].shape[0] for i in range(env_dummy.n)]
if use_gpu:
    # policy_net = policy_net.cuda()
    # value_net = value_net.cuda()
    for i in range(env_dummy.n):
        policy_net[i].cuda()
        value_net[i].cuda()
is_disc_action = len(env.action_space.shape) == 0
# running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)
running_state = None

"""seeding"""
np.random.seed(args.seed)
torch.manual_seed(args.seed)
env.seed(args.seed)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env.action_space.n)
    else:
        policy_mgr = DiscretePolicy(state_dim, 7)
        policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
        value_mgr = Value(state_dim)
        value_wrk = Value(state_dim + subgoal_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)
                    default=5,
                    metavar='N',
                    help="pretrain discriminator iterations (default: 5)")
args = parser.parse_args()
use_gpu = True

np.random.seed(args.seed)
torch.manual_seed(args.seed)
if use_gpu:
    torch.cuda.manual_seed_all(args.seed)

is_disc_action = False
action_dim = 10
ActionTensor = DoubleTensor

"""define actor, critic and discriminator"""
policy_net = Policy(10, 256, 10, num_layers=2)
value_net = Value(10, 256, num_layers=3)
discrim_net = Discriminator(10, 256, 10, num_layers=3)
discrim_criterion = nn.BCELoss()

#####################################################
### Load Models
load_models = True
if load_models:
    print("Loading Models")
    policy_net, value_net, discrim_net = pickle.load(open('learned_models/nextaction_pretrain_sigpolicy.p', 'rb'))
    # _, _, discrim_net = pickle.load(open('learned_models/nextaction_trained_sigpolicy.p', 'rb'))
    print("Loading Models Finished")
#####################################################
device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')

np.random.seed(args.seed)
torch.manual_seed(args.seed)

env_dummy = env_factory(0)
state_dim = env_dummy.observation_space.shape[0]
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim, ), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env_factory, policy_net, device,
"""environment""" env = gym.make(args.env_name) state_dim = env.observation_space.shape[0] running_state = ZFilter((state_dim, ), clip=5) """seeding""" np.random.seed(args.seed) torch.manual_seed(args.seed) env.seed(args.seed) """define actor and critic""" if args.linear: hidden_size = () else: hidden_size = (64, ) if args.model_path is None: policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std, hidden_size=hidden_size) value_net = Value(state_dim) else: policy_net, value_net, running_state = pickle.load( open(args.model_path, "rb")) policy_net.to(device) value_net.to(device) optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=args.learning_rate) optimizer_value = torch.optim.Adam(value_net.parameters(), lr=args.learning_rate) # optimization epoch number and batch size for PPO optim_epochs = 10