initialise_weights(critic)

# Define optimisers
opt_gen = optim.Adam(gen.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))
opt_critic = optim.Adam(critic.parameters(), lr=LEARNING_RATE, betas=(0.0, 0.9))

# Set up TensorBoard logging
fixed_noise = torch.randn(32, Z_DIM, 1, 1).to(device)
writer_real = SummaryWriter('logs/real')
writer_fake = SummaryWriter('logs/fake')
step = 0

gen.train()
critic.train()

# Network training
for epoch in range(NUM_EPOCHS):
    # Clear CPU and GPU cache each epoch
    gc.collect()
    torch.cuda.empty_cache()
    print('Epoch {}/{}'.format(epoch + 1, NUM_EPOCHS))

    # Save the generator every 50 epochs
    if (epoch + 1) % 50 == 0:
        torch.save(gen, 'gen_epoch{}'.format(epoch + 1))

    # For each batch...
    for batch_idx, (real, _) in enumerate(loader):
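# --- Sketch: the `initialise_weights` helper used above is not shown in this
# excerpt. A minimal version, assuming the standard DCGAN/WGAN-GP scheme
# (zero-mean Gaussian with std 0.02 for conv and norm layers); the real
# helper may differ.
import torch.nn as nn

def initialise_weights(model):
    """Initialise conv/norm layers as in the DCGAN paper (N(0, 0.02))."""
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d, nn.BatchNorm2d)):
            nn.init.normal_(m.weight.data, 0.0, 0.02)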
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        self.train_actions = args.train_actions
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'use_bn_affine': args.bn_affine,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)    # Make sure targets start with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None   # Most recent state
        self.a_t = None   # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        # HWC -> CHW
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare the target Q batch (no gradients through the targets)
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = self.cnn(to_tensor(state_batch))   # keep grad for the CNN update below
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            with torch.no_grad():
                next_state_features = self.cnn(to_tensor(next_state_batch))
                next_q_values = self.critic_target([
                    next_state_features,
                    self.actor_target(next_state_features)
                ])
        else:
            with torch.no_grad():
                next_q_values = self.critic_target([
                    to_tensor(next_state_batch),
                    self.actor_target(to_tensor(next_state_batch)),
                ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = nn.MSELoss()(q_batch, target_q_batch)   # `criterion` was undefined in the original
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update
        self.actor.zero_grad()
        if self.pic:
            # detach the CNN features: the first backward already consumed their graph
            policy_loss = -self.critic([state_batch.detach(),
                                        self.actor(state_batch.detach())])
        else:
            policy_loss = -self.critic([to_tensor(state_batch),
                                        self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), float(self.clip_actor_grad))
            if self.writer is not None:
                mean_policy_grad = np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ])
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
        self.eval()
        if self.pic:
            action = to_numpy(self.actor(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        if self.use_cuda:
            tmp = nn.Parameter(torch.FloatTensor(np.array([action])).cuda())
        else:
            tmp = nn.Parameter(torch.FloatTensor(np.array([action])))
        tmp_optim = Adam([tmp], lr=1e-3)   # renamed from `optim` to avoid shadowing the module

        if self.train_actions:
            # Locally refine the action by gradient ascent on the critic's Q value
            for i in range(10):
                tmp_optim.zero_grad()
                loss = -self.critic([to_tensor(np.array([s_t])), tmp]).mean()
                loss.backward()
                tmp_optim.step()
                tmp.data.clamp_(-1., 1.)   # clamp in place; reassigning would detach the Parameter

            with torch.no_grad():
                Q1 = self.critic([to_tensor(np.array([s_t])), to_tensor(np.array([action]))])
                Q2 = self.critic([to_tensor(np.array([s_t])), tmp])
            if self.writer is not None:
                self.writer.add_scalar('train/d_Q', (Q2 - Q1).data.cpu().numpy(), self.select_time)
                self.select_time += 1
            for j in range(self.nb_actions):
                action[j] = float(tmp[0][j])
            action = np.clip(action, -1., 1.)

        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
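# --- Sketch: a hypothetical driver loop for the agent above, showing how
# select_action / observe / update_policy fit together. `args`, the warm-up
# length and the environment name are placeholders, not part of the original.
import gym

env = gym.make('Pendulum-v1')
agent = DDPG(nb_status=env.observation_space.shape[0],
             nb_actions=env.action_space.shape[0],
             args=args, writer=None)
agent.reset(env.reset())
for step in range(100000):
    if step < 1000:
        action = agent.random_action()          # warm-up: fill the replay buffer
    else:
        action = agent.select_action(agent.s_t)
    next_obs, reward, done, _ = env.step(action)
    agent.observe(reward, next_obs, done)       # store the transition, advance s_t
    if step >= 1000:
        agent.update_policy()
    if done:
        agent.reset(env.reset())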
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print('state size:', state_size)
    print('action size:', action_size)

    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, args)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    writer = SummaryWriter(args.logdir)

    recent_rewards = deque(maxlen=100)
    episodes = 0

    for iter in range(args.max_iter_num):
        trajectories = deque()
        steps = 0

        while steps < args.total_sample_size:
            done = False
            score = 0
            episodes += 1

            state = env.reset()
            state = np.reshape(state, [1, state_size])

            while not done:
                if args.render:
                    env.render()

                steps += 1
                mu, std = actor(torch.Tensor(state))
                action = get_action(mu, std)

                next_state, reward, done, _ = env.step(action)
                mask = 0 if done else 1

                trajectories.append((state, action, reward, mask))

                next_state = np.reshape(next_state, [1, state_size])
                state = next_state
                score += reward

                if done:
                    recent_rewards.append(score)

        actor.train(), critic.train()
        train_model(actor, critic, critic_optimizer, trajectories, state_size, action_size)

        writer.add_scalar('log/score', float(score), episodes)

        if iter % args.log_interval == 0:
            print('{} iter | {} episode | score_avg: {:.2f}'.format(
                iter, episodes, np.mean(recent_rewards)))

            if np.mean(recent_rewards) > args.goal_score:
                if not os.path.isdir(args.save_path):
                    os.makedirs(args.save_path)

                ckpt_path = args.save_path + 'model.pth.tar'
                torch.save(actor.state_dict(), ckpt_path)
                print('Average of recent rewards exceeds the goal score; stopping training.')
                break
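# --- Sketch: `get_action` is referenced above but not defined in this excerpt.
# A common choice for a Gaussian policy head (an assumption, not necessarily
# the original helper):
import torch

def get_action(mu, std):
    """Sample an action from N(mu, std) and return it as a numpy array."""
    action = torch.normal(mu, std)
    return action.data.numpy()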
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.writer = writer
        self.select_time = 0

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_method': args.init_method
        }
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)    # Make sure targets start with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None   # Most recent state
        self.a_t = None   # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare the target Q batch (no gradients through the targets)
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), float(self.clip_actor_grad))
            if self.writer is not None:
                mean_policy_grad = np.mean([
                    np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                    for p in self.actor.parameters()
                ])
                self.writer.add_scalar('train/mean_policy_grad', mean_policy_grad, self.select_time)

        if train_actor:
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()

        noise_level = noise_level * max(self.epsilon, 0)
        action = action * (1 - noise_level) + (self.random_process.sample() * noise_level)
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None: return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.cuda()
            self.critic.cuda()
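# --- Sketch: the `hard_update` / `soft_update` helpers used by these agents
# are defined elsewhere; the standard implementations they name (Polyak
# averaging for the soft variant):
def hard_update(target, source):
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # target <- (1 - tau) * target + tau * source
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)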
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    vdb = VDB(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    vdb_optim = optim.Adam(vdb.parameters(), lr=args.learning_rate)

    # Load expert demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        vdb.load_state_dict(ckpt['vdb'])

        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']

        print("Loaded checkpoint; ZFilter n = {}".format(running_state.rs.n))

    episodes = 0

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)

            for _ in range(10000):
                if args.render:
                    env.render()

                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                irl_reward = get_reward(vdb, state, action)

                mask = 0 if done else 1

                memory.append([state, action, irl_reward, mask])

                next_state = running_state(next_state)
                state = next_state
                score += reward

                if done:
                    break

            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{} episode score is {:.2f}'.format(episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), vdb.train()
        train_vdb(vdb, memory, vdb_optim, demonstrations, 0, args)
        train_ppo(actor, critic, memory, actor_optim, critic_optim, args)
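# --- Sketch: `ZFilter` (the running-state normaliser) is imported from
# elsewhere; a minimal version of the usual Welford-style implementation it
# refers to (an assumption; the original additionally exposes its statistics
# under `running_state.rs`, which this sketch omits):
import numpy as np

class ZFilter:
    """Normalise observations with a running mean/std, then clip."""
    def __init__(self, shape, clip=10.0):
        self.clip = clip
        self.n = 0
        self.mean = np.zeros(shape)
        self._sumsq = np.zeros(shape)   # sum of squared deviations (Welford's M2)

    def __call__(self, x):
        self.n += 1
        old_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self._sumsq += (x - old_mean) * (x - self.mean)
        std = np.sqrt(self._sumsq / max(self.n - 1, 1)) + 1e-8
        return np.clip((x - self.mean) / std, -self.clip, self.clip)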
class DDPG:
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps

        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future

        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0

        self.actor = Actor(ob_dim=ob_dim, act_dim=ac_dim,
                           hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim, init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim, act_dim=ac_dim,
                             hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim, init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)

        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim, act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim, init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim, act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim, hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim, init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor, lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic, lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)
            self.actor_target.eval()
            self.critic_target.eval()

            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu, sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)
            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim),),
                                 observation_shape=(int(ob_dim),))
            self.critic_loss = nn.MSELoss()

        self.ob_norm = args.ob_norm
        if self.ob_norm:
            self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
        else:
            self.obs_oms = None
        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        dist, succ_rate = self.rollout(render=render, record=record, slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0

        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {'obs': [], 'act': [], 'reward': [],
                             'new_obs': [], 'ach_goals': [], 'done': []}

            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                # Act randomly during warm-up (unless pre-trained) or with
                # probability random_prob (parenthesised for clarity; `and`
                # binds tighter than `or`)
                if (self.pretrain_dir is None and epoch < self.warmup_iter) or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()

                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale,
                                   new_obs, ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs

            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)

            if self.use_her:
                # Hindsight experience replay: relabel past transitions with
                # goals actually achieved later in the episode
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(t + 1, episode_step),
                                                 self.k_future - 1, replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate((ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate((new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal, act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act, her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)

            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean([rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean([l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()

            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and \
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)
                # self.log_model_weights()
                logger.dumpkvs()

                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = obs0_t.float().cuda()
        obs1 = obs1_t.float().cuda()

        rewards = batch_data['rewards'].float().cuda()
        actions = batch_data['actions'].float().cuda()
        terminals = batch_data['terminals1'].float().cuda()

        cri_q_val = self.critic(obs0, actions)
        # Target Q values carry no gradient (the original used `volatile`
        # Variables, which torch.no_grad() replaces)
        with torch.no_grad():
            target_net_act = self.actor_target(obs1)
            target_net_q_val = self.critic_target(obs1, target_net_act)
            target_q_label = rewards + self.gamma * target_net_q_val * (1 - terminals)

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
        self.critic_optim.step()

        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        model_dir = self.model_dir
        if pretrain_dir is not None:
            ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir, 'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            # Load only the weights whose names match the current networks
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            actor_pretrained_dict = {
                k: v for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
            if step is None:
                mutils.print_yellow('Checkpoint step: {}'.format(checkpoint['ckpt_step']))
            self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir, 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }
        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save, os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False, slow_t=0):
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                if slow_t > 0:
                    time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)
            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}".format(
                np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name, param.clone().cpu().data.numpy())

    def random_action(self):
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        self.actor.eval()
        ob = torch.from_numpy(obs).float().cuda().view(1, -1)
        act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train()
        return act

    def cuda(self):
        self.critic.cuda()
        self.actor.cuda()
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
            self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net], weight_decay=weight_decay)
        optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        # Polyak averaging: target <- (1 - tau) * target + tau * source
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
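# --- Sketch: `OrnsteinUhlenbeckActionNoise` is imported from elsewhere; a
# standard form of the temporally correlated noise it names, matching the
# call style `act = self.action_noise(act)` used in policy() above (an
# assumption about the original implementation):
import numpy as np

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.reset()

    def __call__(self, action):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = self.theta * (self.mu - self.x) * self.dt + \
            self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        self.x = self.x + dx
        return action + self.x

    def reset(self):
        self.x = np.zeros_like(self.mu)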
# (sampling-loop body; the enclosing loop header is outside this excerpt)
episodes += 1

state = env.reset()
state = running_state(state)
score = 0

for _ in range(10000):
    if episodes % 50 == 0:
        env.render()

    steps += 1
    mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
    action = get_action(mu, std)[0]
    next_state, reward, done, _ = env.step(action)
    next_state = running_state(next_state)

    mask = 0 if done else 1
    memory.append([state, action, reward, mask])

    score += reward
    state = next_state
    if done:
        break

scores.append(score)

# (after the sampling loop)
score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episodes, score_avg))

actor.train(), critic.train()
train_model(actor, critic, memory, actor_optim, critic_optim)
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actors = [Actor(self.nb_status, self.nb_actions)
                       for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions)
                              for _ in range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate)
                             for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])   # Make sure targets start with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None   # Most recent state
        self.a_t = None   # Most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        # HWC -> CHW
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self, train_actor=True):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare the target Q batch; pick one target actor of the ensemble at random
        index = np.random.randint(low=0, high=self.num_actor)
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = self.cnn(to_tensor(state_batch))   # keep grad for the CNN update below
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            with torch.no_grad():
                next_state_features = self.cnn(to_tensor(next_state_batch))
                # the original referenced a non-existent `self.actor_target` here;
                # use a randomly chosen target actor, as in the else branch
                next_q_values = self.critic_target([
                    next_state_features,
                    self.actor_targets[index](next_state_features)
                ])
        else:
            with torch.no_grad():
                next_q_values = self.critic_target([
                    to_tensor(next_state_batch),
                    self.actor_targets[index](to_tensor(next_state_batch)),
                ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(1 - terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = nn.MSELoss()(q_batch, target_q_batch)   # `criterion` was undefined in the original
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor updates (one step per ensemble member)
        sum_policy_loss = 0
        for i in range(self.num_actor):
            self.actors[i].zero_grad()
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actors[i](to_tensor(state_batch))
            ])
            policy_loss = policy_loss.mean()
            policy_loss.backward()
            if train_actor:
                self.actor_optims[i].step()
            sum_policy_loss += policy_loss

            # Target update
            soft_update(self.actor_targets[i], self.actors[i], self.tau)

        soft_update(self.critic_target, self.critic, self.tau)

        return -sum_policy_loss / self.num_actor, value_loss

    def eval(self):
        # the original toggled non-existent self.actor / self.actor_target attributes
        for i in range(self.num_actor):
            self.actors[i].eval()
            self.actor_targets[i].eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        for i in range(self.num_actor):
            self.actors[i].train()
            self.actor_targets[i].train()
        self.critic.train()
        self.critic_target.train()

    def cuda(self):
        for i in range(self.num_actor):
            self.actors[i].cuda()
            self.actor_targets[i].cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        actions = []
        status = []
        tot_score = []
        noise_level = noise_level * max(self.epsilon, 0)   # scale once, not once per actor
        for i in range(self.num_actor):
            with torch.no_grad():
                action = to_numpy(self.actors[i](to_tensor(np.array([s_t])))).squeeze(0)
            action = action + self.random_process.sample() * noise_level
            status.append(s_t)
            actions.append(action)
            tot_score.append(0.)

        # Score each candidate action with the critic and keep the best
        with torch.no_grad():
            scores = self.critic([to_tensor(np.array(status)),
                                  to_tensor(np.array(actions))])
        for j in range(self.num_actor):
            tot_score[j] += scores.data[j][0]
        best = np.array(tot_score).argmax()

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = actions[best]
        return actions[best]

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=0):
        if output is None: return
        for i in range(self.num_actor):
            actor = self.actors[i]
            actor_target = self.actor_targets[i]
            actor.load_state_dict(torch.load('{}/actor{}_{}.pkl'.format(output, num, i)))
            actor_target.load_state_dict(torch.load('{}/actor{}_{}.pkl'.format(output, num, i)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cpu()
            self.critic.cpu()
        for i in range(self.num_actor):
            torch.save(self.actors[i].state_dict(), '{}/actor{}_{}.pkl'.format(output, num, i))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            for i in range(self.num_actor):
                self.actors[i].cuda()
            self.critic.cuda()
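# --- Sketch: the `rpm` replay memory used by these agents only needs
# `append(transition)` and `sample_batch(batch_size)`; a minimal ring buffer
# satisfying that interface (an assumption about the original implementation):
import random
import numpy as np

class rpm:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.pos = 0

    def append(self, transition):
        # transition = [state, action, reward, next_state, done]
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.pos] = transition
        self.pos = (self.pos + 1) % self.capacity

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        return state, action, reward, next_state, done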
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions

        actor_net_cfg = {
            'hidden1': 32,
            'hidden2': 32,
            'hidden3': 32,
            'init_w': args.init_w
        }
        critic_net_cfg = {
            'hidden1': 64,
            'hidden2': 64,
            'hidden3': 64,
            'init_w': args.init_w
        }

        self.actor = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **actor_net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **critic_net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)    # Make sure targets start with the same weights
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = OrnsteinUhlenbeckProcess(size=nb_actions,
                                                       theta=args.ou_theta,
                                                       mu=args.ou_mu,
                                                       sigma=args.ou_sigma)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon
        self.epsilon = 1.0

        self.s_t = None   # Most recent state
        self.a_t = None   # Most recent action
        self.is_training = True
        self.best_reward = -10

    def update_policy(self, shared_model, args):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_and_split(
                self.batch_size, shared=args.use_more_states, num_states=args.num_states)

        # Prepare the target Q batch (no gradients through the targets).
        # Note: `terminal_batch` is assumed to already be a continuation mask
        # (1 while the episode continues), as delivered by sample_and_split.
        with torch.no_grad():
            next_q_values = self.critic_target([
                to_tensor(next_state_batch),
                self.actor_target(to_tensor(next_state_batch)),
            ])

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor(terminal_batch.astype(np.float32)) * next_q_values

        # Critic update
        self.critic_optim.zero_grad()
        q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])
        value_loss = nn.MSELoss()(q_batch, target_q_batch)   # `criterion` was undefined in the original
        value_loss.backward()
        if args.shared:
            ensure_shared_grads(self.critic, shared_model.critic)
        self.critic_optim.step()

        # Actor update
        self.actor_optim.zero_grad()
        policy_loss = -self.critic([to_tensor(state_batch),
                                    self.actor(to_tensor(state_batch))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        if args.shared:
            ensure_shared_grads(self.actor, shared_model.actor)
        self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def share_memory(self):
        self.critic.share_memory()
        self.actor.share_memory()

    def add_optim(self, actor_optim, critic_optim):
        self.actor_optim = actor_optim
        self.critic_optim = critic_optim

    def observe(self, r_t, s_t1, done):
        if self.is_training:
            self.memory.append(self.s_t, self.a_t, r_t, done)
            self.s_t = s_t1

    def update_models(self, agent):
        self.actor = deepcopy(agent.actor)
        self.actor_target = deepcopy(agent.actor_target)
        self.critic = deepcopy(agent.critic)
        self.critic_target = deepcopy(agent.critic_target)
        self.actor_optim = deepcopy(agent.actor_optim)
        self.critic_optim = deepcopy(agent.critic_optim)

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        return action

    def train(self):
        self.critic.train()
        self.actor.train()

    def state_dict(self):
        return [
            self.actor.state_dict(),
            self.actor_target.state_dict(),
            self.critic.state_dict(),
            self.critic_target.state_dict()
        ]

    def load_state_dict(self, list_of_dicts):
        self.actor.load_state_dict(list_of_dicts[0])
        self.actor_target.load_state_dict(list_of_dicts[1])
        self.critic.load_state_dict(list_of_dicts[2])
        self.critic_target.load_state_dict(list_of_dicts[3])

    def select_action(self, s_t, decay_epsilon=True):
        action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        action += self.is_training * max(self.epsilon, 0) * self.random_process.sample()
        action = np.clip(action, -1., 1.)

        if decay_epsilon:
            self.epsilon -= self.depsilon

        self.a_t = action
        return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_states()

    def load_weights(self, output):
        if output is None: return
        self.actor.load_state_dict(torch.load('{}/actor.pkl'.format(output)))
        self.critic.load_state_dict(torch.load('{}/critic.pkl'.format(output)))

    def save_model(self, output):
        torch.save(self.actor.state_dict(), '{}/actor.pkl'.format(output))
        torch.save(self.critic.state_dict(), '{}/critic.pkl'.format(output))

    def seed(self, s):
        torch.manual_seed(s)
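# --- Sketch: `ensure_shared_grads` (used above when args.shared is set) is
# not defined in this excerpt; the conventional A3C-style implementation it
# names points the shared model's gradients at the worker's gradients once:
def ensure_shared_grads(model, shared_model):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad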
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory (prioritized)
        self.memory = Memory(capacity=BUFFER_SIZE, e=E, a=A, beta=BETA,
                             beta_increment_per_sampling=BETA_INCREMENT_PER_SAMPLING,
                             batch_size=BATCH_SIZE)

    def append_sample(self, state, action, reward, next_state, done):
        """Compute the TD error for a transition and add it to memory as its priority."""
        state = torch.from_numpy(state).float().to(device)
        action = torch.from_numpy(action).float().to(device)
        next_state = torch.from_numpy(next_state).float().to(device)

        self.actor_target.eval()
        with torch.no_grad():
            actions_next = self.actor_target(next_state)
        self.actor_target.train()

        self.critic_target.eval()
        with torch.no_grad():
            Q_targets_next = self.critic_target(next_state, actions_next)
        self.critic_target.train()

        # Compute Q targets for current states (y_i)
        Q_targets = reward + (GAMMA * Q_targets_next * (1 - done))

        # Compute the TD error used as the initial priority
        self.critic_local.eval()
        with torch.no_grad():
            Q_expected = self.critic_local(state, action)
        self.critic_local.train()

        error = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
        # clip the priority
        error = min(1.0, float(error))

        # add to Memory
        self.memory.add(error, (state, action, reward, next_state, done))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.append_sample(state, action, reward, next_state, done)
        self.update()

    def update(self):
        """Train the agent: learn if enough samples are available in memory."""
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, idxs, is_weights = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss, weighted by the importance-sampling weights
        Q_expected = self.critic_local(states, actions)
        # critic_loss = F.mse_loss(Q_expected, Q_targets)
        critic_loss = weighted_mse_loss(Q_expected, Q_targets, is_weights)

        # Update priorities in memory with the clipped TD errors
        errors = torch.abs(Q_expected - Q_targets).detach().cpu().numpy()
        for i in range(BATCH_SIZE):
            idx = idxs[i]
            self.memory.update(idx, min(1.0, float(errors[i])))

        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
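# --- Sketch: `weighted_mse_loss` is referenced in learn() but not defined in
# this excerpt. For prioritized replay it is usually the squared TD error
# weighted by the importance-sampling weights (an assumption about the
# original helper):
import torch

def weighted_mse_loss(input, target, weights):
    return (weights * (input - target) ** 2).mean()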
class DDPG(object):
    def __init__(self, env, mem_size=7 * int(1e3), lr_critic=1e-3, lr_actor=1e-4,
                 epsilon=1., max_epi=1500, epsilon_decay=1. / (1e5), gamma=.99,
                 target_update_frequency=200, batch_size=64, random_process=True,
                 max_step=None):
        self.CUDA = torch.cuda.is_available()

        self.orig_env = env   # for recording
        if max_step is not None:
            self.orig_env._max_episode_steps = max_step
        self.env = self.orig_env
        self.N_S = self.env.observation_space.shape[0]
        self.N_A = self.env.action_space.shape[0]
        self.MAX_EPI = max_epi
        self.LOW = self.env.action_space.low
        self.HIGH = self.env.action_space.high

        self.actor = Actor(self.N_S, self.N_A)
        self.critic = Critic(self.N_S, self.N_A)
        self.target_actor = Actor(self.N_S, self.N_A)
        self.target_critic = Critic(self.N_S, self.N_A)
        self.target_actor.eval()
        self.target_critic.eval()
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())
        if self.CUDA:
            self.actor.cuda()
            self.critic.cuda()
            self.target_actor.cuda()
            self.target_critic.cuda()

        self.exp = Experience(mem_size)
        self.optim_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        # NB: the original passed a negative learning rate (lr=-lr_actor) and then
        # minimised Q; modern optimisers reject negative rates, so use a positive
        # rate here and negate the actor objective in update_actor_critic() instead.
        self.optim_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.random_process = OrnsteinUhlenbeckProcess(size=self.N_A, theta=.15,
                                                       mu=0, sigma=.2)
        self.EPSILON = epsilon
        self.EPSILON_DECAY = epsilon_decay
        self.GAMMA = gamma
        self.TARGET_UPDATE_FREQUENCY = target_update_frequency
        self.BATCH_SIZE = batch_size

        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        self.data = pd.DataFrame(title)
        self.RAND_PROC = random_process

    def train(self, dir=None, interval=1000):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env, '{}/train_record'.format(dir), force=True)
            os.mkdir(os.path.join(dir, 'models'))

        update_counter = 0
        epsilon = self.EPSILON
        for epi in trange(self.MAX_EPI, desc='train epi', leave=True):
            self.random_process.reset_states()
            o = self.env.reset()

            counter = 0
            acc_r = 0
            while True:
                counter += 1
                # if dir is not None:
                #     self.env.render()

                a = self.choose_action(o)
                if self.RAND_PROC:
                    a += max(epsilon, 0) * self.random_process.sample()
                    a = np.clip(a, -1., 1.)
                    epsilon -= self.EPSILON_DECAY

                o_, r, done, info = self.env.step(self.map_to_action(a))
                self.exp.push(o, a, r, o_, done)

                if epi > 0:
                    self.update_actor_critic()
                    update_counter += 1
                    if update_counter % self.TARGET_UPDATE_FREQUENCY == 0:
                        self.update_target()

                acc_r += r
                o = o_
                if done:
                    break

            if dir is not None:
                if (epi + 1) % interval == 0:
                    self.save(os.path.join(dir, 'models'), str(epi + 1), save_data=False)

            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            # DataFrame.append was removed in pandas 2.0
            self.data = pd.concat([self.data, s.to_frame().T], ignore_index=True)

    def choose_action(self, state):
        self.actor.eval()
        s = torch.Tensor(state).unsqueeze(0)
        if self.CUDA:
            s = s.cuda()
        with torch.no_grad():
            a = self.actor(s).data.cpu().numpy()[0].astype('float64')
        self.actor.train()
        return a

    def map_to_action(self, a):
        # map the normalised action in [-1, 1] onto the env's action range
        return (self.LOW + self.HIGH) / 2 + a * (self.HIGH - self.LOW) / 2

    def update_target(self):
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

    def update_actor_critic(self):
        # sample minibatch
        minibatch = common.Transition(*zip(*self.exp.sample(self.BATCH_SIZE)))
        bat_o = torch.Tensor(minibatch.state)
        bat_a = torch.Tensor(minibatch.action)
        bat_r = torch.Tensor(minibatch.reward).unsqueeze(1)
        bat_o_ = torch.Tensor(minibatch.next_state)
        bat_not_done_mask = list(map(lambda done: 0 if done else 1, minibatch.done))
        # boolean mask (indexing with ByteTensor is deprecated)
        bat_not_done_mask = torch.tensor(bat_not_done_mask, dtype=torch.bool).unsqueeze(1)
        if self.CUDA:
            bat_o = bat_o.cuda()
            bat_a = bat_a.cuda()
            bat_r = bat_r.cuda()
            bat_o_ = bat_o_.cuda()
            bat_not_done_mask = bat_not_done_mask.cuda()

        # update critic
        with torch.no_grad():
            bat_a_o_ = self.target_actor(bat_o_)
            Gt = bat_r.clone()
            Gt[bat_not_done_mask] += self.GAMMA * \
                self.target_critic(bat_o_, bat_a_o_)[bat_not_done_mask]
        eval_o = self.critic(bat_o, bat_a)
        criterion = nn.MSELoss()
        if self.CUDA:
            criterion.cuda()
        loss = criterion(eval_o, Gt)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

        # update actor: maximise Q by minimising its negation
        # (the original relied on the negative learning rate instead)
        self.critic.eval()
        bat_a_o = self.actor(bat_o)
        obj = -torch.mean(self.critic(bat_o, bat_a_o))
        self.optim_actor.zero_grad()
        obj.backward()
        self.optim_actor.step()
        self.critic.train()

    def test(self, dir=None, n=1):
        if dir is not None:
            self.env = wrappers.Monitor(self.orig_env, '{}/test_record'.format(dir),
                                        force=True,
                                        video_callable=lambda episode_id: True)
        title = {common.S_EPI: [], common.S_TOTAL_R: []}
        df = pd.DataFrame(title)
        for epi in trange(n, desc='test epi', leave=True):
            o = self.env.reset()
            acc_r = 0
            while True:
                # if dir is not None:
                #     self.env.render()
                a = self.choose_action(o)
                o_, r, done, info = self.env.step(self.map_to_action(a))
                acc_r += r
                o = o_
                if done:
                    break
            s = pd.Series([epi, acc_r], index=[common.S_EPI, common.S_TOTAL_R])
            df = pd.concat([df, s.to_frame().T], ignore_index=True)
        if dir is not None:
            df.to_csv('{}/test_data.csv'.format(dir))
        else:
            print(df)   # was Python 2 `print df` in the original

    def save(self, dir, suffix='', save_data=True):
        torch.save(self.actor.state_dict(), '{}/actor{}.pt'.format(dir, suffix))
        torch.save(self.critic.state_dict(), '{}/critic{}.pt'.format(dir, suffix))
        if save_data:
            self.data.to_csv('{}/train_data{}.csv'.format(dir, suffix))

    def load_actor(self, dir):
        self.actor.load_state_dict(torch.load(dir))

    def load_critic(self, dir):
        self.critic.load_state_dict(torch.load(dir))

    def get_data(self):
        return self.data
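# --- Sketch: hypothetical usage of the agent above (the environment name and
# episode budget are placeholders, not from the original source):
import gym

env = gym.make('Pendulum-v1')
agent = DDPG(env, max_epi=300)
agent.train(dir='./runs')   # records videos and periodic checkpoints under ./runs
agent.test(n=5)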
class Policy:
    def __init__(self, agent_index, state_size, action_size, hidden_dims,
                 device, random_seed=7, buffer_size=1000000, batch_size=100,
                 actor_learning_rate=1e-3, gamma=0.99, tau=1e-3,
                 critic_learning_rate=1e-4):
        super(Policy, self).__init__()
        self.agent_index = agent_index
        self.tau = tau
        self.gamma = gamma
        self.seed = random_seed
        self.device = device
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.action_size = action_size
        self.single_agent_state_size = state_size // 2
        self.single_agent_action_size = action_size // 2
        # The halved state/action sizes imply a two-agent setting; step()
        # relies on this attribute, which the original code never defined.
        self.num_agents = 2

        # actor networks - work as single agents
        self.actor = Actor(state_size=self.single_agent_state_size,
                           action_size=self.single_agent_action_size,
                           seed=self.seed, hidden_dims=hidden_dims).to(device)
        self.target_actor = Actor(state_size=self.single_agent_state_size,
                                  action_size=self.single_agent_action_size,
                                  seed=self.seed, hidden_dims=hidden_dims).to(device)
        # set actor and target_actor with same weights & biases
        for local_param, target_param in zip(self.actor.parameters(),
                                             self.target_actor.parameters()):
            target_param.data.copy_(local_param.data)

        # critic networks - combine both agents
        self.critic = Critic(state_size=state_size, action_size=action_size,
                             seed=self.seed, hidden_dims=hidden_dims).to(device)
        self.target_critic = Critic(state_size=state_size, action_size=action_size,
                                    seed=self.seed, hidden_dims=hidden_dims).to(device)
        # set critic and target_critic with same weights & biases
        for local_param, target_param in zip(self.critic.parameters(),
                                             self.target_critic.parameters()):
            target_param.data.copy_(local_param.data)

        # optimizers
        self.actor_optimizer = Adam(self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=critic_learning_rate,
                                     weight_decay=0)

        # Replay memory
        self.memory = ReplayBuffer(action_size=action_size, buffer_size=self.buffer_size,
                                   batch_size=self.batch_size, seed=self.seed,
                                   device=self.device)
        self.t_update = 0

    def get_weights(self):
        """Get the weights of the actor and critic models."""
        return self.actor.state_dict(), self.target_actor.state_dict(), \
            self.critic.state_dict(), self.target_critic.state_dict()

    def load_weights(self, values):
        """Load the weights of the actor and critic models."""
        w1, w2, w3, w4 = values
        self.actor.load_state_dict(w1)
        self.target_actor.load_state_dict(w2)
        self.critic.load_state_dict(w3)
        self.target_critic.load_state_dict(w4)

    def step(self, states, actions, rewards, next_states, dones, other_agent):
        """Save experience in replay memory, then learn from a random sample.

        learn() needs the other agent, so it is passed through here; the
        original call passed self.gamma, which learn() does not accept."""
        # Save experience / reward
        if self.num_agents > 1:
            for agent in range(self.num_agents):
                self.memory.add(states[agent, :], actions[agent, :], rewards[agent],
                                next_states[agent, :], dones[agent])
        else:
            self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, other_agent)

    def act(self, state, use_target=False, add_noise=True, noise_value=None):
        """Returns actions for a given state as per the current policy.

        Arguments:
            state (Tensor): input state
            use_target (bool): if True, use the target actor network;
                otherwise use the local one
            add_noise (bool): if True, add noise to the actions obtained
            noise_value (float): noise value to add (if adding noise)

        Returns:
            action (Tensor): action of shape (action_size,)
        """
        state = ensure_is_tensor(state, self.device)
        if use_target:
            actor_net = self.target_actor
        else:
            actor_net = self.actor
        actor_net.eval()
        with torch.no_grad():
            action = actor_net(state)
        if add_noise:
            action = action.cpu().data.numpy()
            action = np.clip(action + noise_value, -1, 1)
            action = ensure_is_tensor(action, self.device)
        actor_net.train()
        return action

    def learn(self, experiences, other_agent):
        """Update policy and value parameters using a batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Arguments:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            other_agent (Policy): the other agent
        """
        states, actions, rewards, next_states, dones = experiences
        self.t_update += 1
        if self.agent_index == 0:
            # states, actions, next_states of this agent
            states_self = ensure_is_tensor(
                states[:, :self.single_agent_state_size], self.device)
            action_self = ensure_is_tensor(
                actions[:, :self.single_agent_action_size], self.device)
            next_states_self = ensure_is_tensor(
                next_states[:, :self.single_agent_state_size], self.device)
            # states, actions, next_states of the other agent
            states_other = ensure_is_tensor(
                states[:, self.single_agent_state_size:], self.device)
            action_other = ensure_is_tensor(
                actions[:, self.single_agent_action_size:], self.device)
            next_states_other = ensure_is_tensor(
                next_states[:, self.single_agent_state_size:], self.device)
            # rewards and dones
            rewards = ensure_is_tensor(rewards[:, 0].reshape((-1, 1)), self.device)
            dones = ensure_is_tensor(dones[:, 0].reshape((-1, 1)), self.device)
        elif self.agent_index == 1:
            # states, actions, next_states of this agent
            states_self = ensure_is_tensor(
                states[:, self.single_agent_state_size:], self.device)
            action_self = ensure_is_tensor(
                actions[:, self.single_agent_action_size:], self.device)
            next_states_self = ensure_is_tensor(
                next_states[:, self.single_agent_state_size:], self.device)
            # states, actions, next_states of the other agent
            states_other = ensure_is_tensor(
                states[:, :self.single_agent_state_size], self.device)
            action_other = ensure_is_tensor(
                actions[:, :self.single_agent_action_size], self.device)
            next_states_other = ensure_is_tensor(
                next_states[:, :self.single_agent_state_size], self.device)
            # rewards and dones
            rewards = ensure_is_tensor(rewards[:, 1].reshape((-1, 1)), self.device)
            dones = ensure_is_tensor(dones[:, 1].reshape((-1, 1)), self.device)

        # s, a, s' for both agents
        states = ensure_is_tensor(states, self.device)
        actions = ensure_is_tensor(actions, self.device)
        next_states = ensure_is_tensor(next_states, self.device)

        # ---------------------------- update critic ---------------------------- #
        next_actions_self = self.act(next_states_self, use_target=True, add_noise=False)
        next_actions_other = other_agent.act(next_states_other, use_target=True,
                                             add_noise=False)
        # combine the next actions from both agents, in agent order
        if self.agent_index == 0:
            actions_next = torch.cat([next_actions_self, next_actions_other],
                                     dim=1).float().detach().to(self.device)
        elif self.agent_index == 1:
            actions_next = torch.cat([next_actions_other, next_actions_self],
                                     dim=1).float().detach().to(self.device)
        # Get predicted next-state actions and Q values from the target models
        self.target_critic.eval()
        with torch.no_grad():
            Q_targets_next = self.target_critic(next_states,
                                                actions_next).detach().to(self.device)
        self.target_critic.train()
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        # Get the current action-value for the states and actions
        Q_expected = self.critic(states, actions)
        # Compute the critic loss and minimize it
        self.critic_optimizer.zero_grad()
        critic_loss = F.smooth_l1_loss(Q_expected, Q_targets.detach())
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        if self.agent_index == 0:
            actions_pred = torch.cat([
                self.actor(states_self),
                other_agent.act(states_other, use_target=False, add_noise=False)
            ], dim=1)
        elif self.agent_index == 1:
            actions_pred = torch.cat([
                other_agent.act(states_other, use_target=False, add_noise=False),
                self.actor(states_self)
            ], dim=1)
        # Compute the actor loss and minimize it
        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic(states, actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic, self.target_critic, self.tau)
        self.soft_update(self.actor, self.target_actor, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Arguments:
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
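A sketch of how two such policies might be wired together; the sizes and hidden_dims here are illustrative assumptions, not values from this repo:

# Hypothetical two-agent wiring for the Policy class above
agent0 = Policy(agent_index=0, state_size=48, action_size=4,
                hidden_dims=(256, 128), device=torch.device('cpu'))
agent1 = Policy(agent_index=1, state_size=48, action_size=4,
                hidden_dims=(256, 128), device=torch.device('cpu'))
# Each agent learns from its own sampled batch, seeing the other's actor:
# experiences = agent0.memory.sample()
# agent0.learn(experiences, other_agent=agent1)
# agent1.learn(agent1.memory.sample(), other_agent=agent0)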
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) print(1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, time_step, agent_list): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward if time_step % 2: for idx in range(state.shape[0]): agent_list[idx].memory.add(state[idx], action[idx], reward[idx], next_state[idx], done[idx]) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: if time_step % 2 == 0: for agent in agent_list: experiences = agent.memory.sample() agent.learn(experiences, GAMMA) #import ipdb; ipdb.set_trace() #experiences = self.memory.sample() #self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" actions = [] for idx in range(state.shape[0]): state = state.float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local( state[idx, ...].unsqueeze(0)).cpu().data.numpy() if add_noise: action += self.noise.sample() actions.append(np.clip(action, -1, 1)) return np.asarray(actions) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models self.critic_local.train() with torch.no_grad(): actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) #rewards = rewards - rewards.mean() #rewards = rewards / rewards.std() Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() self.critic_local.eval() # ---------------------------- update actor ---------------------------- # # Compute actor loss self.actor_local.train() actions_pred = self.actor_local(states) torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) actor_loss = -self.critic_local(states, actions_pred).mean() # include q value normalization for actor to learn faster #actor_actions = -self.critic_local(states, actions_pred) #actor_no_mean = actor_actions - actor_actions.mean() #actor_loss = (actor_no_mean/actor_no_mean.std()).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() torch.nn.utils.clip_grad_norm_(self.actor_local.parameters(), 1) self.actor_optimizer.step() self.actor_local.eval() # ----------------------- update target networks ----------------------- # with torch.no_grad(): self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
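Gradient clipping only has an effect between backward(), which populates the gradients, and step(), which applies them; that ordering is why the stray clip call before the actor's backward pass was dropped above. A self-contained reference of the pattern (the Linear model here is a stand-in):

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss = model(torch.randn(8, 4)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)  # after backward, before step
optimizer.step()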
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)

    writer = SummaryWriter(comment="-ppo_iter-" + str(args.max_iter_num))
    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']
        print("Loaded model; ZFilter n = {}".format(running_state.rs.n))

    episodes = 0
    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()
        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)
            for _ in range(10000):
                if args.render:
                    env.render()
                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                mask = 0 if done else 1
                memory.append([state, action, reward, mask])
                next_state = running_state(next_state)
                state = next_state
                score += reward
                if done:
                    break
            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train()
        train_model(actor, critic, memory, actor_optim, critic_optim, args)

        # Save a checkpoint every 100 iterations; the original `if iter % 100:`
        # had the condition inverted and saved on every other iteration instead.
        if iter % 100 == 0:
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
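train_model() is defined elsewhere in this repo; for reference, the clipped surrogate that a PPO update like this typically minimises looks like the sketch below. The function name and clip_eps default are illustrative, not taken from this file:

import torch

def ppo_actor_loss(new_log_prob, old_log_prob, advantage, clip_eps=0.2):
    # Probability ratio r_t = pi_new(a|s) / pi_old(a|s), computed in log space
    ratio = torch.exp(new_log_prob - old_log_prob.detach())
    surr1 = ratio * advantage
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage
    # Pessimistic (clipped) bound, negated so an optimizer can minimise it
    return -torch.min(surr1, surr2).mean()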
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    print('state size:', state_size)
    print('action size:', action_size)

    actor = Actor(state_size, action_size, args)
    critic = Critic(state_size, action_size, args)
    target_critic = Critic(state_size, action_size, args)

    actor_optimizer = optim.Adam(actor.parameters(), lr=args.actor_lr)
    critic_optimizer = optim.Adam(critic.parameters(), lr=args.critic_lr)

    hard_target_update(critic, target_critic)

    # Initialize automatic entropy tuning. The target entropy is -dim(A);
    # the original `torch.Tensor(action_size)` built an uninitialized tensor
    # of length action_size instead of wrapping the action dimension.
    target_entropy = -torch.prod(torch.Tensor(env.action_space.shape)).item()
    log_alpha = torch.zeros(1, requires_grad=True)
    alpha = torch.exp(log_alpha)
    alpha_optimizer = optim.Adam([log_alpha], lr=args.alpha_lr)

    writer = SummaryWriter(args.logdir)

    replay_buffer = deque(maxlen=10000)
    recent_rewards = deque(maxlen=100)
    steps = 0

    for episode in range(args.max_iter_num):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if args.render:
                env.render()
            steps += 1
            mu, std = actor(torch.Tensor(state))
            action = get_action(mu, std)
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            mask = 0 if done else 1
            replay_buffer.append((state, action, reward, next_state, mask))
            state = next_state
            score += reward

            if steps > args.batch_size:
                mini_batch = random.sample(replay_buffer, args.batch_size)
                actor.train(), critic.train(), target_critic.train()
                alpha = train_model(actor, critic, target_critic, mini_batch,
                                    actor_optimizer, critic_optimizer,
                                    alpha_optimizer, target_entropy,
                                    log_alpha, alpha)
                soft_target_update(critic, target_critic, args.tau)

            if done:
                recent_rewards.append(score)

        if episode % args.log_interval == 0:
            print('{} episode | score_avg: {:.2f}'.format(
                episode, np.mean(recent_rewards)))
            writer.add_scalar('log/score', float(score), episode)

        if np.mean(recent_rewards) > args.goal_score:
            if not os.path.isdir(args.save_path):
                os.makedirs(args.save_path)
            ckpt_path = os.path.join(args.save_path, 'model.pth.tar')
            torch.save(actor.state_dict(), ckpt_path)
            print('Recent average reward exceeds goal score {}; stopping.'.format(
                args.goal_score))
            break
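The automatic entropy tuning initialised above is usually trained inside train_model() with a temperature loss along these lines; treat this as a sketch of the standard SAC rule, not this repo's exact implementation:

import torch

def temperature_loss(log_alpha, log_prob, target_entropy):
    # J(alpha) = E[-alpha * (log pi(a|s) + target_entropy)]: minimising it
    # raises alpha when the policy's entropy drops below the target, and
    # lowers alpha when the policy is more random than needed.
    return -(log_alpha * (log_prob + target_entropy).detach()).mean()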
def main():
    env = gym.make(args.env_name)
    env.seed(args.seed)
    torch.manual_seed(args.seed)

    num_inputs = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    running_state = ZFilter((num_inputs,), clip=5)

    print('state size:', num_inputs)
    print('action size:', num_actions)

    actor = Actor(num_inputs, num_actions, args)
    critic = Critic(num_inputs, args)
    discrim = Discriminator(num_inputs + num_actions, args)

    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate,
                              weight_decay=args.l2_rate)
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # Load demonstrations
    expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb"))
    demonstrations = np.array(expert_demo)
    print("demonstrations.shape", demonstrations.shape)

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None:
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)
        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])
        running_state.rs.n = ckpt['z_filter_n']
        running_state.rs.mean = ckpt['z_filter_m']
        running_state.rs.sum_square = ckpt['z_filter_s']
        print("Loaded model; ZFilter n = {}".format(running_state.rs.n))

    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()
        steps = 0
        scores = []

        while steps < args.total_sample_size:
            state = env.reset()
            score = 0
            state = running_state(state)
            for _ in range(10000):
                if args.render:
                    env.render()
                steps += 1
                mu, std = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                # Replace the environment reward with the discriminator's IRL reward
                irl_reward = get_reward(discrim, state, action)
                mask = 0 if done else 1
                memory.append([state, action, irl_reward, mask])
                next_state = running_state(next_state)
                state = next_state
                score += reward
                if done:
                    break
            episodes += 1
            scores.append(score)

        score_avg = np.mean(scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
        writer.add_scalar('log/score', float(score_avg), iter)

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim,
                                                    demonstrations, args)
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100,
                                                        learner_acc * 100))
            # Stop training the discriminator once it separates the data well enough
            if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                train_discrim_flag = False
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)

        # Save a checkpoint every 100 iterations (as in the PPO script above,
        # the original `if iter % 100:` had the condition inverted)
        if iter % 100 == 0:
            score_avg = int(score_avg)
            model_path = os.path.join(os.getcwd(), 'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            ckpt_path = os.path.join(model_path, 'ckpt_' + str(score_avg) + '.pth.tar')
            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'z_filter_n': running_state.rs.n,
                'z_filter_m': running_state.rs.mean,
                'z_filter_s': running_state.rs.sum_square,
                'args': args,
                'score': score_avg
            }, filename=ckpt_path)
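get_reward() is defined elsewhere; a common choice of GAIL learner reward is sketched below. Whether this repo uses -log D(s, a) or -log(1 - D(s, a)) depends on how its discriminator labels expert versus learner data, so this is an assumption, not the file's actual implementation:

import torch

def gail_reward(discrim, state, action):
    # Concatenate (s, a) to match Discriminator(num_inputs + num_actions, args)
    sa = torch.cat([torch.Tensor(state), torch.Tensor(action)], dim=-1)
    with torch.no_grad():
        d = discrim(sa.unsqueeze(0))
    # Clamp to avoid log(0); higher reward when D thinks the pair is expert-like
    return -torch.log(torch.clamp(d, min=1e-8)).item()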
class DDPG(object):
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create the Actor and Critic networks
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor)  # make sure the targets start with the same weights
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        # Create the replay buffer
        self.memory = rpm(args.rmsize)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        self.epsilon = 1.0
        self.s_t = None  # most recent state
        self.a_t = None  # most recent action
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def normalize(self, pic):
        # HWC -> CHW
        pic = pic.swapaxes(0, 2).swapaxes(1, 2)
        return pic

    def update_policy(self):
        # Sample batch
        state_batch, action_batch, reward_batch, \
            next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)

        # Prepare the target q batch
        if self.pic:
            state_batch = np.array([self.normalize(x) for x in state_batch])
            state_batch = to_tensor(state_batch, volatile=True)
            state_batch = self.cnn(state_batch)
            next_state_batch = np.array([self.normalize(x) for x in next_state_batch])
            next_state_batch = to_tensor(next_state_batch, volatile=True)
            next_state_batch = self.cnn_target(next_state_batch)
            next_q_values = self.critic_target([
                next_state_batch,
                self.actor_target(next_state_batch)
            ])
        else:
            next_q_values = self.critic_target([
                to_tensor(next_state_batch, volatile=True),
                self.actor_target(to_tensor(next_state_batch, volatile=True)),
            ])
        next_q_values.volatile = False

        target_q_batch = to_tensor(reward_batch) + \
            self.discount * to_tensor((1 - terminal_batch.astype(np.float))) * next_q_values

        # Critic update
        self.critic.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            state_batch.volatile = False
            q_batch = self.critic([state_batch, to_tensor(action_batch)])
        else:
            q_batch = self.critic([to_tensor(state_batch), to_tensor(action_batch)])

        value_loss = criterion(q_batch, target_q_batch)
        value_loss.backward()
        self.critic_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Actor update
        self.actor.zero_grad()
        if self.pic:
            self.cnn.zero_grad()
            state_batch.volatile = False
            policy_loss = -self.critic([state_batch, self.actor(state_batch)])
        else:
            policy_loss = -self.critic([
                to_tensor(state_batch),
                self.actor(to_tensor(state_batch))
            ])
        policy_loss = policy_loss.mean()
        policy_loss.backward()

        if self.clip_actor_grad is not None:
            torch.nn.utils.clip_grad_norm(self.actor.parameters(),
                                          float(self.clip_actor_grad))
            if self.writer is not None:
                mean_policy_grad = np.mean(
                    [np.linalg.norm(p.grad.data.cpu().numpy().ravel())
                     for p in self.actor.parameters()])
                self.writer.add_scalar('train/mean_policy_grad',
                                       mean_policy_grad, self.select_time)

        self.actor_optim.step()
        if self.pic:
            self.cnn_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)
        if self.pic:
            soft_update(self.cnn_target, self.cnn, self.tau)

        return -policy_loss, value_loss

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()
        if self.pic:
            self.cnn.eval()
            self.cnn_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()
        if self.pic:
            self.cnn.train()
            self.cnn_target.train()

    def cuda(self):
        # Only move the CNNs when they exist (pic mode)
        if self.pic:
            self.cnn.cuda()
            self.cnn_target.cuda()
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

    def observe(self, r_t, s_t1, done):
        self.memory.append([self.s_t, self.a_t, r_t, s_t1, done])
        self.s_t = s_t1

    def random_action(self, fix=False):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.a_t = action
        if self.discrete and fix == False:
            action = action.argmax()
        return action

    def select_action(self, s_t, decay_epsilon=True, return_fix=False, noise_level=0):
        self.eval()
        if self.pic:
            s_t = self.normalize(s_t)
            s_t = self.cnn(to_tensor(np.array([s_t])))
            action = to_numpy(self.actor_target(s_t)).squeeze(0)
        else:
            action = to_numpy(self.actor(to_tensor(np.array([s_t])))).squeeze(0)
        self.train()
        noise_level = noise_level * max(self.epsilon, 0)
        if np.random.uniform(0, 1) < noise_level:
            action = self.random_action(fix=True)  # epsilon greedy
        if decay_epsilon:
            self.epsilon -= self.depsilon
        self.a_t = action
        if return_fix:
            return action
        if self.discrete:
            return action.argmax()
        else:
            return action

    def reset(self, obs):
        self.s_t = obs
        self.random_process.reset_status()

    def load_weights(self, output, num=1):
        if output is None:
            return
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))

    def save_model(self, output, num):
        if self.use_cuda:
            if self.pic:
                self.cnn.cpu()
            self.actor.cpu()
            self.critic.cpu()
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            if self.pic:
                self.cnn.cuda()
            self.actor.cuda()
            self.critic.cuda()
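hard_update and soft_update are imported from elsewhere in this repo; they are conventionally implemented along these lines, matching the θ_target = τ*θ_local + (1 - τ)*θ_target rule documented in the agents above (a sketch, not the repo's actual helper file):

def hard_update(target, source):
    # Copy source weights into target exactly
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # Polyak averaging: theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)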
class Agent:
    def __init__(self, env, gamma, batch_size, buffer_size, lr_rate, tau):
        self.env = env
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.actor = Actor(self.state_dim, self.action_dim, self.action_bound,
                           lr_rate[0], tau)
        self.critic = Critic(self.state_dim, self.action_dim, lr_rate[1], tau)
        self.buffer = ReplayBuffer(self.buffer_size)
        self.save_epi_reward = []

    def ou_noise(self, x, rho=0.15, mu=0., dt=1e-1, sigma=0.2, dim=1):
        # One Euler step of an Ornstein-Uhlenbeck process
        rho = torch.FloatTensor([rho])
        mu = torch.FloatTensor([mu])
        dt = torch.FloatTensor([dt])
        return x + rho * (mu - x) * dt + torch.sqrt(dt) * torch.normal(
            0., sigma, size=(dim, ))

    def td_target(self, rewards, q_values, dones):
        y_k = torch.zeros(q_values.shape)
        for i in range(q_values.shape[0]):
            if dones[i]:
                y_k[i] = rewards[i]
            else:
                y_k[i] = rewards[i] + self.gamma * q_values[i]
        return y_k

    def train(self, max_episode_num, save_path, save_names):
        self.actor.update_target_network()
        self.critic.update_target_network()

        for episode in range(max_episode_num):
            time, episode_reward, done = 0, 0, False
            state = self.env.reset()
            state = torch.from_numpy(state).type(torch.FloatTensor)
            pre_noise = torch.zeros(self.action_dim)

            while not done:
                action = self.actor.predict(state)[0]
                noise = self.ou_noise(pre_noise, dim=self.action_dim)
                # Apply the exploration noise to the action (the original
                # computed the noise but never added it)
                action = np.array([action.item()]) + noise.numpy()
                action = np.clip(action, -self.action_bound, self.action_bound)
                next_state, reward, done, _ = self.env.step(action)

                next_state = torch.from_numpy(next_state).type(torch.FloatTensor)
                action = torch.from_numpy(action).type(torch.FloatTensor)
                reward = torch.FloatTensor([reward])
                # Shift/scale the reward for training (Pendulum-style)
                train_reward = (reward + 8) / 8

                state = state.view(1, self.state_dim)
                next_state = next_state.view(1, self.state_dim)
                action = action.view(1, self.action_dim)
                reward = reward.view(1, 1)
                # Keep the scaled reward; the original reassigned it from the
                # raw reward here, discarding the scaling computed above
                train_reward = train_reward.view(1, 1)

                self.buffer.add_buffer(state, action, train_reward, next_state, done)

                if self.buffer.buffer_size > 1000:
                    states, actions, rewards, next_states, dones = \
                        self.buffer.sample_batch(self.batch_size)
                    actions_ = self.actor.target_predict(next_states)
                    actions_ = actions_.view(next_states.shape[0], self.action_dim)
                    target_qs = self.critic.target_predict(next_states, actions_)
                    y_i = self.td_target(rewards, target_qs, dones)
                    self.critic.train(states, actions, y_i)
                    s_actions = self.actor.predict(states)
                    policy_loss = self.critic.predict(states, s_actions)
                    self.actor.train(policy_loss)
                    self.actor.update_target_network()
                    self.critic.update_target_network()

                pre_noise = noise
                state = next_state[0]
                episode_reward += reward[0]
                time += 1

            self.save_epi_reward.append(episode_reward.item())
            if len(self.save_epi_reward) < 20:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward (avg of recent 20):', np.mean(self.save_epi_reward))
            else:
                print('Episode:', episode + 1, 'Time:', time,
                      'Reward (avg of recent 20):', np.mean(self.save_epi_reward[-20:]))

            if episode % 10 == 0:
                self.actor.save(save_path, save_names[0])
                self.critic.save(save_path, save_names[1])
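The ou_noise() method above is the Euler discretisation of the Ornstein-Uhlenbeck process dx = ρ(μ - x)dt + σ dW, which yields temporally correlated exploration noise. A stand-alone sketch of the same update (function name is illustrative):

import torch

def ou_step(x, rho=0.15, mu=0.0, dt=1e-1, sigma=0.2, dim=1):
    # Mean reversion toward mu plus Gaussian perturbation scaled by sqrt(dt)
    return x + rho * (mu - x) * dt + torch.sqrt(torch.tensor(dt)) * torch.normal(
        0., sigma, size=(dim,))

x = torch.zeros(1)
for _ in range(5):
    x = ou_step(x)  # successive samples are correlated, unlike i.i.d. Gaussian noise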