def evaluate(net, save_domains=False, baseline=None):
    test_env = SubprocVecEnv(
        [lambda: gym.make('SysAdmin-v0', save_domain=save_domains) for i in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus), context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_finished = 0.
        rewards = []
        steps = 0

        s = test_env.reset()
        while problems_finished < config.eval_problems:
            steps += 1

            if not baseline:
                a, v, pi, pi_full = net(s)
            else:
                a = random_action(s, baseline, config.multi)

            s, r, d, i = test_env.step(a)

            r_tot += np.sum(r)
            problems_finished += np.sum(d)
            rewards += [x['reward_total'] for x in itertools.compress(i, d)]

            tqdm_val.update(np.sum(d))

        r_avg_ps = r_tot / (steps * config.eval_batch)  # average reward per step
        r_avg_pp = r_tot / problems_finished            # average reward per problem

        net.train()

    if args.print_raw:
        rew_mean = np.mean(rewards)
        rew_ci95 = 1.96 * scipy.stats.sem(rewards)
        print(f"{rew_mean:.2f} ± {rew_ci95:.2f}")

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'rewards': rewards,
        'problems_finished': problems_finished,
    }

    return eval_log
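# The `print_raw` branch above reports the mean per-problem reward together with a
# normal-approximation 95% confidence interval (mean ± 1.96 * standard error of the mean).
# The snippet below is a self-contained illustration of that same computation on dummy
# data; it is not part of the original source.
import numpy as np
import scipy.stats

dummy_rewards = np.random.normal(loc=10.0, scale=2.0, size=500)  # dummy per-problem rewards
rew_mean = np.mean(dummy_rewards)
rew_ci95 = 1.96 * scipy.stats.sem(dummy_rewards)  # half-width of the 95% CI
print(f"{rew_mean:.2f} ± {rew_ci95:.2f}")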
def evaluate(net, split='valid', subset=None):
    test_env = SubprocVecEnv(
        [lambda: gym.make('Sokograph-v0', split=split, subset=subset) for i in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus), context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' steps')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_solved = 0
        problems_finished = 0
        steps = 0

        s = test_env.reset()
        while problems_finished < config.eval_problems:
            steps += 1

            a, n, v, pi = net(s)
            actions = to_action(a, n, s, size=config.soko_size)
            s, r, d, i = test_env.step(actions)
            # print(r)

            r_tot += np.sum(r)
            problems_solved += sum('all_boxes_on_target' in x and x['all_boxes_on_target'] == True for x in i)
            problems_finished += np.sum(d)

            tqdm_val.update()

        r_avg = r_tot / (steps * config.eval_batch)  # average reward per step
        problems_solved_ps = problems_solved / (steps * config.eval_batch)
        problems_solved_avg = problems_solved / problems_finished

        net.train()

    tqdm_val.close()
    test_env.close()

    return r_avg, problems_solved_ps, problems_solved_avg, problems_finished
def evaluate(net, planner):
    test_env = SubprocVecEnv(
        [lambda: gym.make('Boxworld-v0', plan=planner) for i in range(config.eval_batch)],
        in_series=(config.eval_batch // config.cpus), context='fork')
    tqdm_val = tqdm(desc='Validating', total=config.eval_problems, unit=' problems')

    with torch.no_grad():
        net.eval()

        r_tot = 0.
        problems_solved = 0.
        problems_finished = 0.
        problems_timeout = 0.
        steps = 0

        opt_all = []
        opt_solved = []

        s = test_env.reset()
        while problems_finished < config.eval_problems:
            steps += 1
            # for step in range(1e9):

            a, v, pi = net(s)
            s, r, d, i = test_env.step(a)
            # print(r)

            r_tot += np.sum(r)
            problems_solved += np.array(sum(x['d_true'] for x in i))  # conversion to numpy for easier ZeroDivision handling (-> nan)
            problems_finished += np.sum(d)

            if planner is not None:
                # print([x['path_len'] / x['steps'] if x['d_true'] else 0. for x in i if x['done']])
                opt_all += [x['path_len'] / x['steps'] if x['d_true'] else 0. for x in i if x['done']]
                opt_solved += [x['path_len'] / x['steps'] for x in i if x['d_true']]

            tqdm_val.update(np.sum(d))

        problems_solved_ps = problems_solved / (steps * config.eval_batch)
        problems_solved_avg = problems_solved / problems_finished

        r_avg_ps = r_tot / (steps * config.eval_batch)  # average reward per step
        r_avg_pp = r_tot / problems_finished            # average reward per problem

        opt_all_avg = np.mean(opt_all)
        opt_all_sem = scipy.stats.sem(opt_all)
        opt_solved_avg = np.mean(opt_solved)
        opt_solved_sem = scipy.stats.sem(opt_solved)

        avg_steps_to_solve = (steps * config.eval_batch) / problems_finished

        net.train()

    tqdm_val.close()
    test_env.close()

    eval_log = {
        'reward_per_step': r_avg_ps,
        'reward_per_problem': r_avg_pp,
        'problems_solved': problems_solved_avg,
        'problems_finished': problems_finished,
        'solved_per_step': problems_solved_ps,
        'steps_per_problem': avg_steps_to_solve,
        'optimality_all': opt_all_avg,
        'optimality_all_sem': opt_all_sem,
        'optimality_solved': opt_solved_avg,
        'optimality_solved_sem': opt_solved_sem,
    }

    return eval_log
                         in_series=(config.batch // config.cpus), context='fork')

# job_name = f"{config.soko_size[0]}x{config.soko_size[1]}-{config.soko_boxes} mp-{config.mp_iterations} nn-{config.emb_size} b-{config.batch}"
job_name = None
wandb.init(project="rrl-boxworld", name=job_name, config=config)
wandb.save("*.pt")
wandb.watch(net, log='all')
# print(net)

tot_env_steps = 0
tot_el_env_steps = 0
tqdm_main = tqdm(desc='Training', unit=' steps')

s = env.reset()
for step in itertools.count(start=1):
    a, v, pi = net(s)
    s, r, d, i = env.step(a)
    # print(r, d)
    # print(s)

    s_true = [x['s_true'] for x in i]
    d_true = [x['d_true'] for x in i]
    n_stacks = list(len(x['raw_state']) for x in i)  # for the entropy regularization

    # update network
    loss, loss_pi, loss_v, loss_h, entropy, norm = net.update(
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    actor_critic = Policy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observe reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            elif current_obs.dim() == 3:
                current_obs *= masks.unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[:-1].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model, os.path.join(save_path, args.env_name + ".pt"))

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.2f}/{:.2f}, min/max reward {:.2f}/{:.2f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                  .format(j, total_num_steps,
                          int(total_num_steps / (end - start)),
                          final_rewards.mean(),
                          final_rewards.median(),
                          final_rewards.min(),
                          final_rewards.max(),
                          dist_entropy.data[0],
                          value_loss.data[0],
                          action_loss.data[0]))

        if args.vis and j % args.vis_interval == 0:
            win = visdom_plot(total_num_steps, final_rewards.mean())
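# `make_env` is used by the training scripts above and below but is defined elsewhere in
# the codebase (and some variants take extra arguments such as `start_container`). The
# sketch below is an illustrative assumption of the usual thunk-factory pattern expected
# by SubprocVecEnv/DummyVecEnv, not the project's actual helper, which may add monitoring
# or preprocessing wrappers.
import gym


def make_env(env_id, seed, rank, log_dir=None):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)  # distinct seed per worker process
        return env
    # a callable is returned (not an env instance) so each worker builds its own env
    return _thunk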
def main():
    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # Maxime: commented this out because it very much changes the behavior
    # of the code for seemingly arbitrary reasons
    #if len(envs.observation_space.shape) == 1:
    #    envs = VecNormalize(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    elif args.recurrent_policy:
        actor_critic = RecMLPPolicy(obs_numel, envs.action_space)
    else:
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    # Maxime: log some info about the model and its size
    # call function PPO.modelsize() for this to happen
    '''
    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    '''

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.cuda:
        actor_critic.cuda()

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)

    if args.algo == 'a2c':
        Agent = A2C(actor_critic, rollouts, args.lr, args.eps, args.num_processes,
                    obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy,
                    args.num_mini_batch, args.cuda, args.log_interval, args.vis,
                    args.env_name, args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm,
                    args.alpha, args.save_dir, args.vis_interval, args.save_interval,
                    num_updates, action_shape, args.value_loss_coef)
    elif args.algo == 'ppo':
        Agent = PPO(actor_critic, rollouts, args.lr, args.eps, args.num_processes,
                    obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy,
                    args.num_mini_batch, args.cuda, args.log_interval, args.vis,
                    args.env_name, args.log_dir, args.entropy_coef, args.num_stack,
                    args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm,
                    args.save_dir, args.vis_interval, args.save_interval,
                    num_updates, action_shape, args.value_loss_coef)
    elif args.algo == 'acktr':
        Agent = ACKTR(actor_critic, rollouts, args.lr, args.eps, args.num_processes,
                      obs_shape, args.use_gae, args.gamma, args.tau, args.recurrent_policy,
                      args.num_mini_batch, args.cuda, args.log_interval, args.vis,
                      args.env_name, args.log_dir, args.entropy_coef, args.num_stack,
                      args.num_steps, args.ppo_epoch, args.clip_param, args.max_grad_norm,
                      args.alpha, args.save_dir, args.vis_interval, args.save_interval,
                      num_updates, action_shape, args.value_loss_coef)

    print(str(actor_critic))
    print('Total model size: %d' % Agent.modelsize())

    obs = envs.reset()
    Agent.update_current_obs(obs, envs)
    Agent.rollouts.observations[0].copy_(Agent.current_obs)

    # These variables are used to compute average rewards for all processes.
    Agent.train(envs)
def train(params, model_name, save_interval=1000, eval_interval=200,
          record_episodes=True, restart=False):
    try:
        # Create test env
        print("[INFO] Creating test environment")
        test_env = gym.make(env_name)

        # Training parameters
        initial_lr = params["initial_lr"]
        discount_factor = params["discount_factor"]
        gae_lambda = params["gae_lambda"]
        ppo_epsilon = params["ppo_epsilon"]
        value_scale = params["value_scale"]
        entropy_scale = params["entropy_scale"]
        horizon = params["horizon"]
        num_epochs = params["num_epochs"]
        batch_size = params["batch_size"]
        num_envs = params["num_envs"]

        # Learning rate schedule
        def lr_scheduler(step_idx):
            return initial_lr * 0.85 ** (step_idx // 10000)

        # Environment constants
        frame_stack_size = 4
        input_shape = (84, 84, frame_stack_size)
        num_actions = test_env.action_space.shape[0]
        action_min = test_env.action_space.low
        action_max = test_env.action_space.high

        # Create model
        print("[INFO] Creating model")
        model = PPO(input_shape, num_actions, action_min, action_max,
                    epsilon=ppo_epsilon, value_scale=value_scale,
                    entropy_scale=entropy_scale, model_name=model_name)

        print("[INFO] Creating environments")
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])

        initial_frames = envs.reset()
        envs.get_images()
        frame_stacks = [FrameStack(initial_frames[i], stack_size=frame_stack_size,
                                   preprocess_fn=preprocess_frame)
                        for i in range(num_envs)]

        print("[INFO] Training loop")
        while True:
            # While there are running environments
            states, taken_actions, values, rewards, dones = [], [], [], [], []

            # Simulate game for some number of steps
            for _ in range(horizon):
                # Predict and value action given state
                # π(a_t | s_t; θ_old)
                states_t = [frame_stacks[i].get_state() for i in range(num_envs)]
                actions_t, values_t = model.predict(states_t)

                # Sample action from a Gaussian distribution
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, _ = envs.step_wait()
                envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)                        # [T, N, 84, 84, 4]
                taken_actions.append(actions_t)                # [T, N, 3]
                values.append(np.squeeze(values_t, axis=-1))   # [T, N]
                rewards.append(rewards_t)                      # [T, N]
                dones.append(dones_t)                          # [T, N]

                # Get new state
                for i in range(num_envs):
                    # Reset environment's frame stack if done
                    if dones_t[i]:
                        for _ in range(frame_stack_size):
                            frame_stacks[i].add_frame(frames[i])
                    else:
                        frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [frame_stacks[i].get_state() for i in range(num_envs)]
            last_values = np.squeeze(model.predict(states_last)[1], axis=-1)  # [N]

            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)  # Move down one line?
            returns = advantages + values

            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))                # [T x N, 84, 84, 4]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))   # [T x N, 3]
            returns = returns.flatten()                                          # [T x N]
            advantages = advantages.flatten()                                    # [T X N]

            T = len(rewards)
            N = num_envs
            assert states.shape == (T * N, input_shape[0], input_shape[1], frame_stack_size)
            assert taken_actions.shape == (T * N, num_actions)
            assert returns.shape == (T * N,)
            assert advantages.shape == (T * N,)

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                num_samples = len(states)
                indices = np.arange(num_samples)
                np.random.shuffle(indices)
                for i in range(int(np.ceil(num_samples / batch_size))):
                    # Evaluate model
                    if model.step_idx % eval_interval == 0:
                        print("[INFO] Running evaluation...")
                        avg_reward, value_error = evaluate(model, test_env, discount_factor,
                                                           frame_stack_size, make_video=True)
                        model.write_to_summary("eval_avg_reward", avg_reward)
                        model.write_to_summary("eval_value_error", value_error)

                    # Save model
                    if model.step_idx % save_interval == 0:
                        model.save()

                    # Sample mini-batch randomly
                    begin = i * batch_size
                    end = begin + batch_size
                    if end > num_samples:
                        end = None
                    mb_idx = indices[begin:end]

                    # Optimize network
                    model.train(states[mb_idx], taken_actions[mb_idx],
                                returns[mb_idx], advantages[mb_idx])
    except KeyboardInterrupt:
        model.save()
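# `compute_gae` is called by the PPO training loops in this listing but is not defined
# here. Below is a minimal, illustrative implementation of Generalized Advantage
# Estimation matching the call signature used above; it assumes `rewards`, `values` and
# `dones` are length-T lists of per-environment arrays of shape [N] and `last_values` is
# the bootstrap value estimate of shape [N]. The project's own helper may differ in detail.
import numpy as np


def compute_gae(rewards, values, last_values, dones, discount_factor, gae_lambda):
    rewards = np.asarray(rewards, dtype=np.float32)   # [T, N]
    values = np.asarray(values, dtype=np.float32)     # [T, N]
    dones = np.asarray(dones, dtype=np.float32)       # [T, N]
    T = rewards.shape[0]

    advantages = np.zeros_like(rewards)               # [T, N]
    last_advantage = np.zeros_like(last_values, dtype=np.float32)
    next_values = np.asarray(last_values, dtype=np.float32)

    # Backward recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1},
    # where delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
    for t in reversed(range(T)):
        non_terminal = 1.0 - dones[t]
        delta = rewards[t] + discount_factor * next_values * non_terminal - values[t]
        last_advantage = delta + discount_factor * gae_lambda * non_terminal * last_advantage
        advantages[t] = last_advantage
        next_values = values[t]

    return advantages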
def main():
    # Create test env
    print("Creating test environment")
    test_env = gym.make(env_name)

    # Training parameters
    lr_scheduler = Scheduler(initial_value=3e-4, interval=1000, decay_factor=1)  # 0.75)
    std_scheduler = Scheduler(initial_value=2.0, interval=1000, decay_factor=0.75)
    discount_factor = 0.99
    gae_lambda = 0.95
    ppo_epsilon = 0.2
    t_max = 10  # 180
    num_epochs = 10
    batch_size = 40  # 64
    save_interval = 500
    eval_interval = 100
    training = True

    # Environment constants
    frame_stack_size = 4
    input_shape = (84, 84, frame_stack_size)
    num_actions = 1                # envs.action_space.shape[0]
    action_min = np.array([-1.0])  # np.array([-1.0, 0.0, 0.0])
    action_max = np.array([1.0])   # np.array([ 1.0, 1.0, 1.0])

    # Create model
    print("Creating model")
    model_checkpoint = None  # "./models/CarRacing-v0/run2/episode0_step455000.ckpt"
    model = PPO(num_actions, input_shape, action_min, action_max, ppo_epsilon,
                value_scale=0.5, entropy_scale=0.0001,
                model_checkpoint=model_checkpoint, model_name="CarRacing-v0")

    if training:
        print("Creating environments")
        num_envs = 4
        envs = SubprocVecEnv([make_env for _ in range(num_envs)])

        initial_frames = envs.reset()
        initial_frames = envs.get_images()
        frame_stacks = [FrameStack(initial_frames[i], preprocess_fn=preprocess_frame)
                        for i in range(num_envs)]

        print("Main loop")
        step = 0
        while training:
            # While there are running environments
            print("Training...")
            states, taken_actions, values, rewards, dones = [], [], [], [], []
            learning_rate = np.maximum(lr_scheduler.get_value(), 1e-6)
            std = np.maximum(std_scheduler.get_value(), 0.2)

            # Simulate game for some number of steps
            for _ in range(t_max):
                # Predict and value action given state
                # π(a_t | s_t; θ_old)
                states_t = [frame_stacks[i].get_state() for i in range(num_envs)]
                actions_t, values_t = model.predict(states_t, use_old_policy=True, std=std)
                for i in range(num_envs):
                    actions_t[i] = 0 if actions_t[i] < 0 else 1
                actions_t = np.squeeze(actions_t.astype(np.int32), axis=-1)

                # Sample action from a Gaussian distribution
                envs.step_async(actions_t)
                frames, rewards_t, dones_t, infos = envs.step_wait()
                frames = envs.get_images()  # render

                # Store state, action and reward
                states.append(states_t)                        # [T, N, 84, 84, 1]
                taken_actions.append(actions_t)                # [T, N, 3]
                values.append(np.squeeze(values_t, axis=-1))   # [T, N]
                rewards.append(rewards_t)                      # [T, N]
                dones.append(dones_t)                          # [T, N]

                # Get new state
                for i in range(num_envs):
                    frame_stacks[i].add_frame(frames[i])

            # Calculate last values (bootstrap values)
            states_last = [frame_stacks[i].get_state() for i in range(num_envs)]
            last_values = np.squeeze(model.predict(states_last)[-1], axis=-1)  # [N]

            # Compute returns
            returns = compute_returns(rewards, last_values, dones, discount_factor)

            # Compute advantages
            advantages = compute_gae(rewards, values, last_values, dones,
                                     discount_factor, gae_lambda)

            # Normalize advantages
            advantages = (advantages - np.mean(advantages)) / np.std(advantages)

            # Flatten arrays
            states = np.array(states).reshape((-1, *input_shape))                # [T x N, 84, 84, 1]
            taken_actions = np.array(taken_actions).reshape((-1, num_actions))   # [T x N, 3]
            returns = returns.flatten()                                          # [T x N]
            advantages = advantages.flatten()                                    # [T X N]

            # Train for some number of epochs
            model.update_old_policy()  # θ_old <- θ
            for _ in range(num_epochs):
                # Sample mini-batch randomly and train
                mb_idx = np.random.choice(len(states), batch_size, replace=False)

                # Optimize network
                model.train(states[mb_idx], taken_actions[mb_idx], returns[mb_idx],
                            advantages[mb_idx], learning_rate=learning_rate, std=std)

            # Reset environment's frame stack if done
            for i, done in enumerate(dones_t):
                if done:
                    frame_stacks[i].add_frame(frames[i])

            # Save model
            step += 1
            if step % save_interval == 0:
                model.save()
            if step % eval_interval == 0:
                avg_reward = evaluate(model, test_env, 10)
                model.write_to_summary("eval_avg_reward", avg_reward)

    # Training complete, evaluate model
    avg_reward = evaluate(model, test_env, 10)
    print("Model achieved a final reward of:", avg_reward)
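# `compute_returns` is likewise used above without a definition in this listing. The
# sketch below is an illustrative bootstrapped discounted return, under the same
# array-shape assumptions as the GAE sketch earlier; the source project's actual helper
# may differ.
import numpy as np


def compute_returns(rewards, last_values, dones, discount_factor):
    rewards = np.asarray(rewards, dtype=np.float32)   # [T, N]
    dones = np.asarray(dones, dtype=np.float32)       # [T, N]
    T = rewards.shape[0]

    returns = np.zeros_like(rewards)                  # [T, N]
    running = np.asarray(last_values, dtype=np.float32)

    # Backward recursion: R_t = r_t + gamma * (1 - done_t) * R_{t+1},
    # bootstrapped from the value estimate of the final observed state
    for t in reversed(range(T)):
        running = rewards[t] + discount_factor * (1.0 - dones[t]) * running
        returns[t] = running

    return returns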
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.start_container)
            for i in range(args.num_processes)]
    test_envs = [make_env(args.env_name, args.seed, i, args.log_dir, args.start_container)
                 for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
        test_envs = SubprocVecEnv(test_envs)
    else:
        envs = DummyVecEnv(envs)
        test_envs = DummyVecEnv(test_envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])
    if args.saved_encoder_model:
        obs_shape = (args.num_stack, args.latent_space_size)
    obs_numel = reduce(operator.mul, obs_shape, 1)

    if len(obs_shape) == 3 and obs_numel > 1024:
        actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
    else:
        assert not args.recurrent_policy, \
            "Recurrent policy is not implemented for the MLP controller"
        actor_critic = MLPPolicy(obs_numel, envs.action_space)

    modelSize = 0
    for p in actor_critic.parameters():
        pSize = reduce(operator.mul, p.size(), 1)
        modelSize += pSize
    print(str(actor_critic))
    print('Total model size: %d' % modelSize)

    if envs.action_space.__class__.__name__ == "Discrete":
        action_shape = 1
    else:
        action_shape = envs.action_space.shape[0]

    if args.resume_experiment:
        print("\n############## Loading saved model ##############\n")
        actor_critic, ob_rms = torch.load(
            os.path.join(save_path, args.env_name + args.save_tag + ".pt"))
        tr.load(os.path.join(log_path, args.env_name + args.save_tag + ".p"))

    if args.cuda:
        actor_critic.cuda()

    if args.algo == 'a2c':
        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr,
                                  eps=args.eps, alpha=args.alpha)
    elif args.algo == 'ppo':
        optimizer = optim.Adam(actor_critic.parameters(), args.lr, eps=args.eps)
    elif args.algo == 'acktr':
        optimizer = KFACOptimizer(actor_critic)

    print(obs_shape)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                              envs.action_space, actor_critic.state_size)
    rollouts_test = RolloutStorage(args.num_steps_test, args.num_processes, obs_shape,
                                   envs.action_space, actor_critic.state_size)
    current_obs = torch.zeros(args.num_processes, *obs_shape)
    current_obs_test = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs, test=False):
        shape_dim0 = envs.observation_space.shape[0]
        if args.saved_encoder_model:
            shape_dim0 = 1
            obs, _ = vae.encode(Variable(torch.cuda.FloatTensor(obs)))
            obs = obs.data.cpu().numpy()
        obs = torch.from_numpy(obs).float()
        if not test:
            if args.num_stack > 1:
                current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
            current_obs[:, -shape_dim0:] = obs
        else:
            if args.num_stack > 1:
                current_obs_test[:, :-shape_dim0] = current_obs_test[:, shape_dim0:]
            current_obs_test[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])
    reward_avg = 0

    if args.cuda:
        current_obs = current_obs.cuda()
        current_obs_test = current_obs_test.cuda()
        rollouts.cuda()
        rollouts_test.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(
                Variable(rollouts.observations[step], volatile=True),
                Variable(rollouts.states[step], volatile=True),
                Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Observation, reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)

            # Maxime: clip the reward within [0,1] for more reliable training
            # This code deals poorly with large reward values
            reward = np.clip(reward, a_min=0, a_max=None) / 400

            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks
            tr.episodes_done += args.num_processes - masks.sum()

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data,
                            action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(
            Variable(rollouts.observations[-1], volatile=True),
            Variable(rollouts.states[-1], volatile=True),
            Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)
        tr.iterations_done += 1

        if args.algo in ['a2c', 'acktr']:
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            advantages = Variable(rollouts.returns[:-1]) - values
            value_loss = advantages.pow(2).mean()
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            if args.algo == 'acktr' and optimizer.steps % optimizer.Ts == 0:
                # Sampled fisher, see Martens 2014
                actor_critic.zero_grad()
                pg_fisher_loss = -action_log_probs.mean()

                value_noise = Variable(torch.randn(values.size()))
                if args.cuda:
                    value_noise = value_noise.cuda()

                sample_values = values + value_noise
                vf_fisher_loss = -(values - Variable(sample_values.data)).pow(2).mean()

                fisher_loss = pg_fisher_loss + vf_fisher_loss
                optimizer.acc_stats = True
                fisher_loss.backward(retain_graph=True)
                optimizer.acc_stats = False

            optimizer.zero_grad()
            (value_loss * args.value_loss_coef + action_loss -
             dist_entropy * args.entropy_coef).backward()

            if args.algo == 'a2c':
                nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

        elif args.algo == 'ppo':
            advantages = rollouts.returns[:-1] - rollouts.value_preds[:-1]
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)

            for e in range(args.ppo_epoch):
                if args.recurrent_policy:
                    data_generator = rollouts.recurrent_generator(
                        advantages, args.num_mini_batch)
                else:
                    data_generator = rollouts.feed_forward_generator(
                        advantages, args.num_mini_batch)

                for sample in data_generator:
                    observations_batch, states_batch, actions_batch, \
                        return_batch, masks_batch, old_action_log_probs_batch, \
                        adv_targ = sample

                    # Reshape to do in a single forward pass for all steps
                    values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                        Variable(observations_batch),
                        Variable(states_batch),
                        Variable(masks_batch),
                        Variable(actions_batch))

                    adv_targ = Variable(adv_targ)
                    ratio = torch.exp(action_log_probs - Variable(old_action_log_probs_batch))
                    surr1 = ratio * adv_targ
                    surr2 = torch.clamp(ratio, 1.0 - args.clip_param,
                                        1.0 + args.clip_param) * adv_targ
                    action_loss = -torch.min(surr1, surr2).mean()  # PPO's pessimistic surrogate (L^CLIP)

                    value_loss = (Variable(return_batch) - values).pow(2).mean()

                    optimizer.zero_grad()
                    (value_loss + action_loss - dist_entropy * args.entropy_coef).backward()
                    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)
                    optimizer.step()

        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_dir != "":
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()

            save_model = [save_model,
                          hasattr(envs, 'ob_rms') and envs.ob_rms or None]

            torch.save(save_model,
                       os.path.join(save_path, args.env_name + args.save_tag + ".pt"))

            total_test_reward_list = []
            step_test_list = []
            for _ in range(args.num_tests):
                test_obs = test_envs.reset()
                update_current_obs(test_obs, test=True)
                rollouts_test.observations[0].copy_(current_obs_test)
                step_test = 0
                total_test_reward = 0
                while step_test < args.num_steps_test:
                    value_test, action_test, action_log_prob_test, states_test = actor_critic.act(
                        Variable(rollouts_test.observations[step_test], volatile=True),
                        Variable(rollouts_test.states[step_test], volatile=True),
                        Variable(rollouts_test.masks[step_test], volatile=True))
                    cpu_actions_test = action_test.data.squeeze(1).cpu().numpy()

                    # Observation, reward and next obs
                    obs_test, reward_test, done_test, info_test = test_envs.step(cpu_actions_test)

                    # masks here doesn't really matter, but still
                    masks_test = torch.FloatTensor(
                        [[0.0] if done_test_ else [1.0] for done_test_ in done_test])

                    # Maxime: clip the reward within [0,1] for more reliable training
                    # This code deals poorly with large reward values
                    reward_test = np.clip(reward_test, a_min=0, a_max=None) / 400
                    total_test_reward += reward_test[0]
                    reward_test = torch.from_numpy(np.expand_dims(np.stack(reward_test), 1)).float()

                    update_current_obs(obs_test, test=True)
                    rollouts_test.insert(step_test, current_obs_test, states_test.data,
                                         action_test.data, action_log_prob_test.data,
                                         value_test.data, reward_test, masks_test)
                    step_test += 1

                    if done_test:
                        break

                #rollouts_test.reset() # Need to reinitialise with .cuda(); don't forget
                total_test_reward_list.append(total_test_reward)
                step_test_list.append(step_test)

            append_to(tr.test_reward, tr, sum(total_test_reward_list) / args.num_tests)
            append_to(tr.test_episode_len, tr, sum(step_test_list) / args.num_tests)
            logger.log_scalar_rl(
                "test_reward", tr.test_reward[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "test_episode_len", tr.test_episode_len[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])

            # Saving all the MyContainer variables
            tr.save(os.path.join(log_path, args.env_name + args.save_tag + ".p"))

        if j % args.log_interval == 0:
            reward_avg = 0.99 * reward_avg + 0.01 * final_rewards.mean()
            end = time.time()
            tr.global_steps_done = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, running avg reward {:.3f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                  .format(j, tr.global_steps_done,
                          int(tr.global_steps_done / (end - start)),
                          reward_avg, dist_entropy.data[0],
                          value_loss.data[0], action_loss.data[0]))

            append_to(tr.pg_loss, tr, action_loss.data[0])
            append_to(tr.val_loss, tr, value_loss.data[0])
            append_to(tr.entropy_loss, tr, dist_entropy.data[0])
            append_to(tr.train_reward_avg, tr, reward_avg)
            logger.log_scalar_rl(
                "train_pg_loss", tr.pg_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_val_loss", tr.val_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_entropy_loss", tr.entropy_loss[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            logger.log_scalar_rl(
                "train_reward_avg", tr.train_reward_avg[0], args.sliding_wsize,
                [tr.episodes_done, tr.global_steps_done, tr.iterations_done])
            """
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                  format(j, total_num_steps,
                         int(total_num_steps / (end - start)),
                         final_rewards.mean(), final_rewards.median(),
                         final_rewards.min(), final_rewards.max(),
                         dist_entropy.data[0], value_loss.data[0], action_loss.data[0]))
            """

        if args.vis and j % args.vis_interval == 0:
            try:
                # Sometimes monitor doesn't properly flush the outputs
                win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
            except IOError:
                pass