def run():
    env = gym.make('CorridorSmall-v10')
    action_space = list(range(env.action_space.n))
    q = Approximator_ResidualBoosting(action_space)
    initial_learning_rate = 0.15
    learning_rate = initial_learning_rate
    initial_epsilon = 0.15
    epsilon = initial_epsilon
    batch_size = 10

    for learning_iteration in range(1000):
        policy = Policy_EpsilonGreedy(q, epsilon)
        episodes = [rollout(policy, env) for _ in range(batch_size)]
        targets = TD0_targets(episodes, q)
        X, Y_target = zip(*targets)
        Y_target = np.reshape(Y_target, (-1, 1))
        learning_rate = decay(initial_learning_rate, learning_iteration)
        epsilon = decay(initial_epsilon, learning_iteration)
        q.learn(learning_rate, X, Y_target)

        # Evaluate the greedy policy (currently after every learning iteration).
        if learning_iteration % 1 == 0:
            greedy_policy = Policy_Greedy(q)
            reward_sum = avg(test_policy(greedy_policy, env) for _ in range(10))
            print(f"Episode {learning_iteration * batch_size} "
                  f"Reward {reward_sum} lr {learning_rate} epsilon {epsilon}")
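# decay() and avg() are helpers from the surrounding module and are not shown
# in this snippet. Minimal sketches consistent with how they are called above
# (decay(initial_value, iteration), avg(iterable of returns)); the exact
# definitions below are assumptions, not the original code:
def decay(initial_value, iteration, rate=0.99):
    # Exponential schedule: shrink the initial value a little every iteration.
    return initial_value * rate ** iteration


def avg(values):
    # Mean of an iterable of per-episode returns.
    values = list(values)
    return sum(values) / len(values)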
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    Z = gamma * Z * ((1 - sigma) * pis[(k + 1) % n] + sigma)
                    p = p * (1 - sigma + sigma * ratios[k % n])
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * p * (G - Q[s, a])
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
            t += 1
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy


if __name__ == '__main__':
    n = 4
    alpha = 0.0001
    gamma = 1
    sigma = 0.5
    epsilon = 1
    n_episodes = 50000
    n_tests = 10
    env = GridWorld()
    policy = n_step_Q_sigma(env, n, alpha, gamma, sigma, epsilon, n_episodes)
    test_policy(env, policy, n_tests)
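# decay() and print_episode() are helpers from the surrounding module and are
# not shown here. Minimal sketches matching how they are called above
# (single-argument epsilon decay, simple progress printout); assumptions, not
# the original code:
def decay(epsilon, rate=0.999, min_epsilon=0.01):
    # Multiplicative epsilon decay with a floor, applied once per episode.
    return max(min_epsilon, epsilon * rate)


def print_episode(episode, n_episodes):
    # Lightweight progress report.
    print(f"Episode {episode}/{n_episodes}")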
# coding: utf-8 import time from utils import test_policy if __name__ == '__main__': test_policy("test-app", [(10, 0.5), (16, 0.5), (12, 0.5), (7, 0.5), (23, 0.4), (25, 0.5), (14, 0.5), (10, 1)])
import random

import metaworld
from metaworld.policies.sawyer_door_lock_v1_policy import SawyerDoorLockV1Policy

from utils import test_policy

ml45 = metaworld.ML45()
name = "door-lock-v1"
env_cls = ml45.test_classes[name]
policy = SawyerDoorLockV1Policy()

all_tasks = [task for task in ml45.test_tasks if task.env_name == name]
env = env_cls()
query_task = random.choice(all_tasks[25:])
env.set_task(query_task)
env.max_path_length = 200

test_policy(env, policy, render=True, stop=False)
# coding: utf-8
import time

from utils import test_policy, send_retrain

if __name__ == '__main__':
    test_policy("test-app", [(6, 3), (10, 0.2), (5, 1)])
    send_retrain("test-app")
    time.sleep(2)
    send_retrain("test-app")
    test_policy("test-app", [(4, 1), (2, 0.2), (25, 2)])
    send_retrain("test-app")
"-ld", "--log_dir", type=str, required=False, help="directory to store log file in", ) parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true") args = parser.parse_args() env = utils.make_env(args.env_name) observation_size = env.observation_space.shape[0] action_size = env.action_space.n policy = MlpPolicy(observation_size, action_size, args.policy_hidden_dim) checkpoint = torch.load(args.policy_path) policy.load_state_dict(checkpoint["policy_state_dict"]) utils.test_policy( policy, env, args.num_episodes, args.deterministic, args.max_episode_len, args.log_dir, args.verbose, ) env.close()
import gym
import torch

from env_wrappers import ActionNormalizedEnv
from models import DDPG_Actor
from utils import test_policy

model_name = 'ddpg_01'
env_id = "Pendulum-v0"
identity = model_name + '_' + env_id

env = ActionNormalizedEnv(gym.make(env_id))
obs_size = env.observation_space.shape[0]
act_size = env.action_space.shape[0]

act_net = DDPG_Actor(obs_size, act_size)
act_net.load_state_dict(torch.load(identity + '_act.pth'))

mean_return = test_policy(act_net, env, True)
print('mean_return: %.3f' % mean_return)
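# test_policy() above is imported from a local utils module that is not shown.
# A minimal sketch of an evaluation loop with the same (actor, env, render)
# call shape, assuming the old gym reset/step API and an actor that maps an
# observation tensor to an action tensor; an assumption, not the original:
import numpy as np
import torch


def test_policy_sketch(actor, env, render=False, n_episodes=10):
    returns = []
    for _ in range(n_episodes):
        obs, done, ep_return = env.reset(), False, 0.0
        while not done:
            if render:
                env.render()
            with torch.no_grad():
                action = actor(torch.as_tensor(obs, dtype=torch.float32)).numpy()
            obs, reward, done, _ = env.step(action)
            ep_return += reward
        returns.append(ep_return)
    return float(np.mean(returns))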
# coding: utf-8 import time from utils import test_policy if __name__ == '__main__': test_policy("test-app", [(6, 8), (10, 0.2), (5, 8), (4, 8), (2, 0.2), (25, 8)])
            tau = t - n + 1
            if tau > -1:
                Z = 1
                G = Qs[tau % n]
                for k in range(tau, min(tau + n - 1, T - 1)):
                    G += Z * deltas[k % n]
                    # Tree-backup weight: product of gamma * pi(a_{k+1} | s_{k+1}).
                    Z = gamma * Z * pis[(k + 1) % n]
                s = states[tau % n]
                a = actions[tau % n]
                # Update state-action value function.
                Q[s, a] += alpha * (G - Q[s, a])
                # Make policy greedy w.r.t. Q.
                action_values = [Q[s, i] for i in range(4)]
                policy[s] = np.argmax(action_values)
        epsilon = decay(epsilon)
        if episode % 100 == 0:
            print_episode(episode, n_episodes)
    print_episode(n_episodes, n_episodes)
    return policy


if __name__ == '__main__':
    n = 4
    alpha = 0.01
    gamma = 1
    epsilon = 1
    n_episodes = 1000
    env = GridWorld()
    policy = n_step_tree_backup(env, n, alpha, gamma, epsilon, n_episodes)
    test_policy(env, policy, 10)
def _learn(self):
    try:
        update_count = 0
        if self.log_path is not None:
            writer = SummaryWriter(self.log_path)
            writer.add_text("hyperparameters", f"{self.hp}")

        while update_count < self.hp.max_updates:
            if self.hp.verbose >= 2:
                print(f"[learner_{self.id}] Beginning Update_{update_count + 1}")

            # set up tracking variables
            traj_count = 0
            value_fn_loss = 0.0
            policy_loss = 0.0
            policy_entropy = 0.0
            loss = torch.zeros(1, device=device, dtype=dtype, requires_grad=True)
            reward = 0.0

            # process batch of trajectories
            while traj_count < self.hp.batch_size:
                try:
                    traj = self.q.get(timeout=self.timeout)
                except queue.Empty as e:
                    print(f"[learner_{self.id}] No trajectory received for "
                          f"{self.timeout} seconds. Exiting!")
                    if self.log_path is not None:
                        writer.close()
                    self.completion.set()
                    raise e

                if self.hp.verbose >= 2:
                    print(f"[learner_{self.id}] Processing traj_{traj.id}")

                traj_len = len(traj.r)
                obs = torch.stack(traj.obs)
                actions = torch.stack(traj.a)
                r = torch.stack(traj.r)
                reward += torch.sum(r).item() / self.hp.batch_size
                disc = self.hp.gamma * (~torch.stack(traj.d))

                # compute value estimates and logits for observed states
                v = self.value_fn(obs).squeeze(1)
                curr_logits = self.policy(obs[:-1])

                # compute log probs for current and old policies
                curr_log_probs = action_log_probs(curr_logits, actions)
                traj_log_probs = action_log_probs(torch.stack(traj.logits), actions)

                # compute v-trace targets recursively
                with torch.no_grad():
                    imp_sampling = torch.exp(curr_log_probs - traj_log_probs).squeeze(1)
                    rho = torch.clamp(imp_sampling, max=self.hp.rho_bar)
                    c = torch.clamp(imp_sampling, max=self.hp.c_bar)
                    # delta_t = rho_t * (r_t + gamma * V(x_{t+1}) - V(x_t))
                    delta = rho * (r + self.hp.gamma * v[1:] - v[:-1])
                    vt = torch.zeros(traj_len + 1, device=device, dtype=dtype)
                    for i in range(traj_len - 1, -1, -1):
                        vt[i] = delta[i] + disc[i] * c[i] * (vt[i + 1] - v[i + 1])
                    vt = torch.add(vt, v)
                    pg_adv = rho * (r + disc * vt[1:] - v[:-1])

                # compute loss as sum of value loss, policy loss and entropy
                traj_value_fn_loss = compute_baseline_loss(v - vt)
                traj_policy_loss = compute_policy_gradient_loss(
                    curr_logits, actions, pg_adv)
                traj_policy_entropy = -1 * compute_entropy_loss(curr_logits)
                traj_loss = (self.hp.v_loss_c * traj_value_fn_loss +
                             self.hp.policy_loss_c * traj_policy_loss -
                             self.hp.entropy_c * traj_policy_entropy)

                loss = torch.add(loss, traj_loss / self.hp.batch_size)
                value_fn_loss += traj_value_fn_loss.item() / self.hp.batch_size
                policy_loss += traj_policy_loss.item() / self.hp.batch_size
                policy_entropy += traj_policy_entropy.item() / self.hp.batch_size
                traj_count += 1

            if self.hp.verbose >= 2:
                print(f"[learner_{self.id}] Updating model weights "
                      f"for Update {update_count + 1}")

            # backpropagate loss and update weights
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.policy.parameters(), self.hp.max_norm)
            torch.nn.utils.clip_grad_norm_(self.value_fn.parameters(), self.hp.max_norm)
            self.optimizer.step()
            self.scheduler.step()

            # log to console
            if self.hp.verbose >= 1:
                print(f"[learner_{self.id}] Update {update_count + 1} | "
                      f"Batch Mean Reward: {reward:.2f} | Loss: {loss.item():.2f}")

            # evaluate current policy
            if self.hp.eval_every is not None:
                if (update_count + 1) % self.hp.eval_every == 0:
                    eval_r, eval_std = utils.test_policy(
                        self.policy,
                        self.hp.env_name,
                        self.hp.eval_eps,
                        True,
                        self.hp.max_timesteps,
                    )
                    if self.hp.verbose >= 1:
                        print(f"[learner_{self.id}] Update {update_count + 1} | "
                              f"Evaluation Reward: {eval_r:.2f}, "
                              f"Std Dev: {eval_std:.2f}")
                    if self.log_path is not None:
                        writer.add_scalar(
                            f"learner_{self.id}/rewards/evaluation_reward",
                            eval_r,
                            update_count + 1,
                        )

            # log to tensorboard
            if self.log_path is not None:
                writer.add_scalar(
                    f"learner_{self.id}/rewards/batch_mean_reward",
                    reward,
                    update_count + 1,
                )
                writer.add_scalar(
                    f"learner_{self.id}/loss/policy_loss",
                    policy_loss,
                    update_count + 1,
                )
                writer.add_scalar(
                    f"learner_{self.id}/loss/value_fn_loss",
                    value_fn_loss,
                    update_count + 1,
                )
                writer.add_scalar(
                    f"learner_{self.id}/loss/policy_entropy",
                    policy_entropy,
                    update_count + 1,
                )
                writer.add_scalar(f"learner_{self.id}/loss/total_loss",
                                  loss, update_count + 1)

            # save model weights every given interval
            if (update_count + 1) % self.hp.save_every == 0:
                path = self.log_path / Path(
                    f"IMPALA_{self.hp.env_name}_l{self.id}_{update_count+1}.pt")
                self.save(path)
                print(f"[learner_{self.id}] Saved model weights at "
                      f"update {update_count+1} to {path}")

            # increment update counter
            self.update_counter.increment()
            update_count = self.update_counter.value

        if self.log_path is not None:
            writer.close()
        print(f"[learner_{self.id}] Finished learning")
        self.completion.set()
        return
    except KeyboardInterrupt:
        print(f"[learner_{self.id}] Interrupted")
        if self.log_path is not None:
            writer.close()
        self.completion.set()
        return
    except Exception as e:
        if self.log_path is not None:
            writer.close()
        print(f"[learner_{self.id}] Encountered exception")
        raise e
def train_dqn(args):
    """Runs DQN training procedure."""
    # setup TensorBoard logging
    writer = SummaryWriter(log_dir=args.logdir)

    # make environment
    env = gym.make(args.env_ID)
    env.seed(args.random_seed)

    # instantiate reward model + buffers and optimizers for training DQN
    q_net, q_target, replay_buffer, optimizer_agent, epsilon_schedule = init_dqn(args)

    # begin training
    ep_return = 0
    i_episode = 0
    state = env.reset()
    for step in range(args.n_agent_steps):
        # agent interacts with env
        epsilon = epsilon_schedule.value(step)
        action = q_net.act(state, epsilon)
        next_state, rew, done, _ = env.step(action)

        # record step info
        replay_buffer.push(state, action, rew, next_state, done)
        ep_return += rew

        # prepare for next step
        state = next_state
        if done:
            state = env.reset()
            writer.add_scalar('1.ep_return', ep_return, step)
            ep_return = 0
            i_episode += 1

            # q_net gradient step at end of each episode
            if step >= args.agent_learning_starts and len(
                    replay_buffer) >= 3 * args.batch_size_agent:
                # we now make learning updates at the end of every episode
                loss = q_learning_loss(q_net, q_target, replay_buffer, args)
                optimizer_agent.zero_grad()
                loss.backward()
                optimizer_agent.step()
                writer.add_scalar('3.loss', loss, step)
                writer.add_scalar('4.epsilon', epsilon, step)
                if args.epsilon_annealing_scheme == 'exp':
                    epsilon_schedule.step()

        # update q_target
        if step % args.target_update_period == 0:
            # Polyak-average target parameters towards the online network
            for target_param, local_param in zip(q_target.parameters(),
                                                 q_net.parameters()):
                target_param.data.copy_(q_net.tau * local_param.data +
                                        (1.0 - q_net.tau) * target_param.data)

        # evaluate agent performance
        if (step > 0 and step % args.agent_test_period == 0) or step == args.n_agent_steps - 1:
            logging.info(
                "Agent has taken {} steps. Testing performance for 100 episodes"
                .format(step))
            mean_ep_return = test_policy(q_net, args, writer)
            writer.add_scalar('2.mean_ep_return_test', mean_ep_return, step)
            # save current policy
            save_policy(q_net, optimizer_agent, step, args)
            # possibly end training if mean_ep_return is above the threshold
            if env.spec.reward_threshold is not None and mean_ep_return >= env.spec.reward_threshold:
                raise SystemExit(
                    "Environment solved after {} episodes!".format(i_episode))

    writer.close()
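# q_learning_loss() is defined elsewhere in this project. The sketch below is
# a standard one-step DQN TD loss with the same call shape; the
# replay_buffer.sample() return format and the args fields used here
# (batch_size_agent, gamma) are assumptions, not the project's actual API:
import torch
import torch.nn.functional as F


def q_learning_loss_sketch(q_net, q_target, replay_buffer, args):
    # Sample a minibatch of transitions (assumed sample() format).
    states, actions, rewards, next_states, dones = replay_buffer.sample(
        args.batch_size_agent)
    states = torch.as_tensor(states, dtype=torch.float32)
    actions = torch.as_tensor(actions, dtype=torch.int64).unsqueeze(1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32)
    next_states = torch.as_tensor(next_states, dtype=torch.float32)
    dones = torch.as_tensor(dones, dtype=torch.float32)

    # Q(s, a) for the actions that were actually taken.
    q_values = q_net(states).gather(1, actions).squeeze(1)

    # Bootstrapped one-step target: r + gamma * max_a' Q_target(s', a').
    with torch.no_grad():
        next_q = q_target(next_states).max(dim=1)[0]
        targets = rewards + args.gamma * (1.0 - dones) * next_q

    return F.smooth_l1_loss(q_values, targets)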