def main():
    # env = PybulletPhonebotSubprocEnv()
    action_size = 8
    num_env = 4

    def get_env(index: int):
        # env = PybulletPhonebotEnv(sim_settings=PybulletSimulatorSettings(
        #     render=False, random_orientation=True))
        env = PybulletPhonebotSubprocEnv(
            PybulletSimulatorSettings(render=False, random_orientation=True))
        env.set_seed(index)
        env.reset()
        return env

    env = MultiEnv(get_env, num_env)
    while True:
        print(env.sense())
        res = env.step([np.zeros(action_size) for _ in range(num_env)])
        print(res[0], res[1], res[2], res[3])
        time.sleep(0.1)
        break
def evaluate_saved_model():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(
        args.num_frames) // args.num_steps // args.num_environments

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    train_envs = MultiEnv(args.simulator, args.num_environments, args,
                          is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator, args.num_environments, args2,
                                 is_train=False)

    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator, args.num_environments,
                                      args2, is_train=False)
    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator, args.num_environments,
                                      args2, is_train=False)

    obs_shape = train_envs.obs_shape

    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    obs = little_combs_test_envs.reset()

    num_checkpoints = 355
    for j in range(num_checkpoints):
        # Evaluate every 8th saved checkpoint on the three test sets.
        if j % 8 == 0:
            checkpoint_filename = '/home/adam/Bureau/Transfer Learning/FINAL/checkpoint_{}.pth.tar'.format(
                str(j + 1))
            agent.load_model(checkpoint_filename)

            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(
                classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(
                little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(
                medium_combs_test_envs, j, total_num_steps)

            writer.add_scalar("Reward classic levels", mean_rewards_classic,
                              (j + 1) * 100)
            writer.add_scalar("Reward little combs levels",
                              mean_rewards_little, (j + 1) * 100)
            writer.add_scalar("Reward medium combs levels",
                              mean_rewards_medium, (j + 1) * 100)
        print(j)
    v_weight = 0.5
    max_grad_norm = 0.5
    lr = 3e-4
    lr_decay = 0.99
    eps = 1e-5
    n_iter = 5000
    disp_step = 10
    save_step = 100
    is_render = args.render
    env_id = args.env
    save_dir = "./save_" + env_id

    #Create multiple environments
    #----------------------------
    env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
    a_dim = env.ac_space.shape[0]
    s_dim = env.ob_space.shape[0]
    a_low = env.ac_space.low[0]
    a_high = env.ac_space.high[0]
    runner = MultiEnvRunner(env, s_dim, a_dim, n_step, gamma, lamb)

    #Placeholders
    #----------------------------
    #action_ph:          (mb_size, a_dim)
    #old_neg_logprob_ph: (mb_size)
    #old_v_pred_ph:      (mb_size)
    #adv_ph:             (mb_size)
    #return_ph:          (mb_size)
    action_ph = tf.placeholder(tf.float32, [None, a_dim], name="action")
    ent_weight = 0.01
    v_weight = 0.5
    max_grad_norm = 0.5
    lr = 2e-4
    lr_decay = 0.99
    eps = 1e-5
    n_iter = 30000
    disp_step = 10
    save_step = 100
    is_render = args.render
    env_id = args.env
    save_dir = "./save_" + env_id

    #Create multiple environments
    #----------------------------
    env = MultiEnv(
        [make_env(i, env_id=env_id, unwrap=args.unwrap) for i in range(n_env)])
    img_height, img_width, c_dim = env.ob_space.shape
    a_dim = env.ac_space.n
    runner = MultiEnvRunner(env, img_height, img_width, c_dim, n_step, n_stack,
                            gamma, lamb)

    #Create the model
    #----------------------------
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=n_env,
                            inter_op_parallelism_threads=n_env)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    policy = PolicyModel(sess, img_height, img_width, c_dim * n_stack, a_dim,
                         "policy")
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    beta = 0.1
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"
    expert_path = "../save/{}_traj.pkl".format(args.env)

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Load expert trajectories
    #----------------------------
    if os.path.exists(expert_path):
        s_real, a_real = pkl.load(open(expert_path, "rb"))
        sa_real = []

        if args.conti:
            for i in range(len(s_real)):
                sa_real.append(np.concatenate([s_real[i], a_real[i]], 1))
        else:
            for i in range(len(s_real)):
                a_real_onehot = np.zeros((len(a_real[i]), a_dim),
                                         dtype=np.float32)
                for j in range(len(a_real[i])):
                    a_real_onehot[j, a_real[i][j]] = 1
                sa_real.append(np.concatenate([s_real[i], a_real_onehot], 1))

        sa_real = np.concatenate(sa_real, 0)
    else:
        print("ERROR: No expert trajectory file found")
        sys.exit(1)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    dis_net = DiscriminatorNet(s_dim + a_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                dis_net,
                a_dim,
                beta,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device,
                conti=args.conti)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        dis_net.load_state_dict(checkpoint["DiscriminatorNet"])
        agent.beta = checkpoint["beta"]
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net, dis_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent, dis_loss, dis_real, dis_fake, avg_kl = agent.train(
            policy_net, value_net, dis_net, mb_obs, mb_actions, mb_values,
            mb_advs, mb_returns, mb_old_a_logps, sa_real)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_true_return, std_true_return, mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("dis loss = {:.6f}".format(dis_loss))
            print("entropy = {:.6f}".format(ent))
            print("avg_kl = {:.6f}".format(avg_kl))
            print("beta = {:.6f}".format(agent.beta))
            print("mean true return = {:.6f}".format(mean_true_return))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print("dis_real = {:.3f}".format(dis_real))
            print("dis_fake = {:.3f}".format(dis_fake))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "beta": agent.beta,
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict(),
                    "DiscriminatorNet": dis_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
def main():
    #Parse arguments
    #----------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v0")
    parser.add_argument("--conti", action="store_true")
    parser.add_argument("--unwrap", action="store_true")
    args = parser.parse_args()

    #Parameters
    #----------------------------
    n_env = 8
    n_step = 128
    mb_size = n_env * n_step
    sample_mb_size = 64
    sample_n_epoch = 4
    clip_val = 0.2
    lamb = 0.95
    gamma = 0.99
    ent_weight = 0.0
    max_grad_norm = 0.5
    lr = 1e-4
    n_iter = 30000
    disp_step = 30
    save_step = 300
    save_dir = "./save"
    device = "cuda:0"

    #Create multiple environments
    #----------------------------
    env = MultiEnv([
        make_env(i,
                 env_id=args.env,
                 unwrap=args.unwrap,
                 rand_seed=int(time.time())) for i in range(n_env)
    ])

    if args.conti:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.shape[0]
    else:
        s_dim = env.ob_space.shape[0]
        a_dim = env.ac_space.n

    runner = EnvRunner(env,
                       s_dim,
                       a_dim,
                       n_step,
                       gamma,
                       lamb,
                       device=device,
                       conti=args.conti)

    #Create model
    #----------------------------
    policy_net = PolicyNet(s_dim, a_dim, conti=args.conti).to(device)
    value_net = ValueNet(s_dim).to(device)
    agent = PPO(policy_net,
                value_net,
                lr,
                max_grad_norm,
                ent_weight,
                clip_val,
                sample_n_epoch,
                sample_mb_size,
                mb_size,
                device=device)

    #Load model
    #----------------------------
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    if os.path.exists(os.path.join(save_dir, "{}.pt".format(args.env))):
        print("Loading the model ... ", end="")
        checkpoint = torch.load(
            os.path.join(save_dir, "{}.pt".format(args.env)))
        policy_net.load_state_dict(checkpoint["PolicyNet"])
        value_net.load_state_dict(checkpoint["ValueNet"])
        start_it = checkpoint["it"]
        print("Done.")
    else:
        start_it = 0

    #Start training
    #----------------------------
    t_start = time.time()
    policy_net.train()
    value_net.train()

    for it in range(start_it, n_iter):
        #Run the environment
        with torch.no_grad():
            mb_obs, mb_actions, mb_old_a_logps, mb_values, mb_returns = runner.run(
                policy_net, value_net)
            mb_advs = mb_returns - mb_values
            mb_advs = (mb_advs - mb_advs.mean()) / (mb_advs.std() + 1e-6)

        #Train
        pg_loss, v_loss, ent = agent.train(policy_net, value_net, mb_obs,
                                           mb_actions, mb_values, mb_advs,
                                           mb_returns, mb_old_a_logps)

        #Print the result
        if it % disp_step == 0:
            agent.lr_decay(it, n_iter)
            policy_net.eval()
            value_net.eval()
            n_sec = time.time() - t_start
            fps = int((it - start_it) * n_env * n_step / n_sec)
            mean_return, std_return, mean_len = runner.get_performance()
            policy_net.train()
            value_net.train()

            print("[{:5d} / {:5d}]".format(it, n_iter))
            print("----------------------------------")
            print("Timesteps = {:d}".format((it - start_it) * mb_size))
            print("Elapsed time = {:.2f} sec".format(n_sec))
            print("FPS = {:d}".format(fps))
            print("actor loss = {:.6f}".format(pg_loss))
            print("critic loss = {:.6f}".format(v_loss))
            print("entropy = {:.6f}".format(ent))
            print("mean return = {:.6f}".format(mean_return))
            print("mean length = {:.2f}".format(mean_len))
            print()

        #Save model
        if it % save_step == 0:
            print("Saving the model ... ", end="")
            torch.save(
                {
                    "it": it,
                    "PolicyNet": policy_net.state_dict(),
                    "ValueNet": value_net.state_dict()
                }, os.path.join(save_dir, "{}.pt".format(args.env)))
            print("Done.")
            print()

    env.close()
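# Several of the snippets above (the TensorFlow fragments and the PPO/GAIL mains)
# build their MultiEnv from a `make_env(i, env_id=..., unwrap=..., rand_seed=...)`
# factory that is not shown in this collection. The sketch below is a minimal
# guess at such a helper, assuming the classic Gym API (pre-0.26, with
# `env.seed()`) and that the list-style MultiEnv constructor expects already-built
# environments; the original projects may differ.
import gym


def make_env(rank, env_id="CartPole-v0", unwrap=False, rand_seed=0):
    """Hypothetical helper: build and seed one environment for worker `rank`."""
    env = gym.make(env_id)
    if unwrap:
        # Drop default wrappers such as TimeLimit.
        env = env.unwrapped
    env.seed(rand_seed + rank)
    return env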
def train():
    args = parse_a2c_args()
    args2 = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(
        args.num_frames) // args.num_steps // args.num_environments

    # Create the train and test environments with multiple processes
    train_envs = MultiEnv(args.simulator, args.num_environments, args,
                          is_train=True)

    # Create the test environments for the classic levels
    args2.scenario_dir = "scenarios_transfer_learning/mazes_classic_test/"
    args2.scenario = "custom_scenario_test{:003}.cfg"
    classic_test_envs = MultiEnv(args.simulator, args.num_environments, args2,
                                 is_train=False)

    # Create the test environments for the comb levels
    args2.scenario_dir = "scenarios_transfer_learning/little_combs_test/"
    little_combs_test_envs = MultiEnv(args.simulator, args.num_environments,
                                      args2, is_train=False)
    args2.scenario_dir = "scenarios_transfer_learning/medium_combs_test/"
    medium_combs_test_envs = MultiEnv(args.simulator, args.num_environments,
                                      args2, is_train=False)

    test_envs = MultiEnv(args.simulator, args.num_environments, args,
                         is_train=False)

    # Writer will output to ./runs/ directory by default
    writer = torch.utils.tensorboard.SummaryWriter()

    obs_shape = train_envs.obs_shape

    # The agent's policy network and the A2C training algorithm
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/base_line.pth.tar'.format(output_dir)
        agent.load_model(checkpoint_filename)
        start_j = 0  #(int(checkpoint_idx) // args.num_steps // args.num_environments) + 1

    obs = train_envs.reset()
    start = time.time()
    nb_of_saves = 0

    for j in range(start_j, num_updates):
        print("------", j / num_updates * 100, "-------")

        # Evaluate the model's performance
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards_classic, game_times_classic = agent.evaluate(
                classic_test_envs, j, total_num_steps)
            mean_rewards_little, game_times_little = agent.evaluate(
                little_combs_test_envs, j, total_num_steps)
            mean_rewards_medium, game_times_medium = agent.evaluate(
                medium_combs_test_envs, j, total_num_steps)
            # succes_classic = sum([1 if i!=525 else 0 for i in game_times_classic])/16
            # succes_little = sum([1 if i!=525 else 0 for i in game_times_little])/16
            # succes_medium = sum([1 if i!=525 else 0 for i in game_times_medium])/16

            writer.add_scalar("Reward classic levels", mean_rewards_classic, j)
            writer.add_scalar("Reward little combs levels",
                              mean_rewards_little, j)
            writer.add_scalar("Reward medium combs levels",
                              mean_rewards_medium, j)
            # writer.add_scalar("Success rate classic levels", succes_classic, j)
            # writer.add_scalar("Success rate little combs levels", succes_little, j)
            # writer.add_scalar("Success rate medium combs levels", succes_medium, j)

        for step in range(args.num_steps):
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)

        report = agent.update(obs)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = start_j * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))
            logging.info(report.format(j, total_num_steps, FPS))

        if j % args.model_save_rate == 0:
            nb_of_saves += 1
            agent.save_policy2(nb_of_saves, args, output_dir)

    # cancel the env processes
    train_envs.cancel()
    test_envs.cancel()
def train(model,
          optim,
          env_fn,
          num_envs,
          num_stack,
          num_steps,
          num_updates,
          gamma,
          value_loss_coef,
          entropy_coef,
          max_grad_norm,
          log_freq=10):
    envs = MultiEnv(env_fn, num_envs)

    model.cuda()

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * num_stack, obs_shape[1], obs_shape[2])
    states = torch.zeros(num_steps + 1, num_envs, *obs_shape)
    current_state = torch.zeros(num_envs, *obs_shape)

    def update_current_state(state):
        state = torch.from_numpy(np.stack(state)).float()
        current_state[:, :-1] = current_state[:, 1:]
        current_state[:, -1] = state

    state = envs.reset()
    update_current_state(state)

    rewards = torch.zeros(num_steps, num_envs, 1)
    value_preds = torch.zeros(num_steps + 1, num_envs, 1)
    old_log_probs = torch.zeros(num_steps, num_envs, envs.action_space.n)
    returns = torch.zeros(num_steps + 1, num_envs, 1)
    actions = torch.LongTensor(num_steps, num_envs)
    masks = torch.zeros(num_steps, num_envs, 1)

    # These variables are used to compute reward stats for all processes.
    episode_rewards = torch.zeros([num_envs, 1])
    final_rewards = torch.zeros([num_envs, 1])

    states = states.cuda()
    current_state = current_state.cuda()
    rewards = rewards.cuda()
    value_preds = value_preds.cuda()
    old_log_probs = old_log_probs.cuda()
    returns = returns.cuda()
    actions = actions.cuda()
    masks = masks.cuda()

    for j in range(num_updates):
        for step in range(num_steps):
            # Sample actions
            value, logits = model(Variable(states[step], volatile=True))
            probs = F.softmax(logits)
            log_probs = F.log_softmax(logits).data
            actions[step] = probs.multinomial().data
            cpu_actions = actions[step].cpu()
            cpu_actions = cpu_actions.numpy()

            # Observe reward and next state
            state, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward),
                                                     1)).float()
            episode_rewards += reward
            np_masks = np.array([0.0 if done_ else 1.0 for done_ in done])

            # If done then clean the history of observations.
            pt_masks = torch.from_numpy(
                np_masks.reshape(np_masks.shape[0], 1, 1, 1)).float()
            pt_masks = pt_masks.cuda()
            current_state *= pt_masks
            update_current_state(state)

            states[step + 1].copy_(current_state)
            value_preds[step].copy_(value.data)
            old_log_probs[step].copy_(log_probs)
            rewards[step].copy_(reward)
            masks[step].copy_(torch.from_numpy(np_masks).unsqueeze(1))

            final_rewards *= masks[step].cpu()
            final_rewards += (1 - masks[step].cpu()) * episode_rewards
            episode_rewards *= masks[step].cpu()

        returns[-1] = model(Variable(states[-1], volatile=True))[0].data
        for step in reversed(range(num_steps)):
            returns[step] = returns[step + 1] * \
                gamma * masks[step] + rewards[step]

        # Reshape to do in a single forward pass for all steps
        values, logits = model(
            Variable(states[:-1].view(-1, *states.size()[-3:])))
        log_probs = F.log_softmax(logits)

        # Unreshape
        logits_size = (num_steps, num_envs, logits.size(-1))
        log_probs = F.log_softmax(logits).view(logits_size)
        probs = F.softmax(logits).view(logits_size)
        values = values.view(num_steps, num_envs, 1)
        logits = logits.view(logits_size)

        action_log_probs = log_probs.gather(2, Variable(actions.unsqueeze(2)))
        dist_entropy = -(log_probs * probs).sum(-1).mean()

        advantages = Variable(returns[:-1]) - values
        value_loss = advantages.pow(2).mean()
        action_loss = -(Variable(advantages.data) * action_log_probs).mean()
        loss = value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef

        optim.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), max_grad_norm)
        optim.step()

        states[0].copy_(states[-1])

        if j % log_freq == 0:
            print(
                "Updates {}, num frames {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}"
                .format(j, j * num_envs * num_steps, final_rewards.mean(),
                        final_rewards.median(), final_rewards.min(),
                        final_rewards.max(), -dist_entropy.data[0],
                        value_loss.data[0], action_loss.data[0]))
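# A hypothetical wiring of the A2C `train` above, included only to show the call
# signature it expects and the (value, logits) interface of `model`. The tiny
# actor-critic below, the `make_84x84_env` factory name, and every hyperparameter
# value are illustrative assumptions, not part of the original project; the
# environment factory is assumed to yield preprocessed 84x84 grayscale frames.
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class TinyActorCritic(nn.Module):
    """Minimal model returning (value, logits), as `train` expects."""

    def __init__(self, in_channels, num_actions):
        super(TinyActorCritic, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, 8, stride=4)
        self.conv2 = nn.Conv2d(16, 32, 4, stride=2)
        self.fc = nn.Linear(32 * 9 * 9, 256)  # 84x84 input -> 9x9 feature map
        self.value_head = nn.Linear(256, 1)
        self.policy_head = nn.Linear(256, num_actions)

    def forward(self, x):
        h = F.relu(self.conv1(x))
        h = F.relu(self.conv2(h))
        h = F.relu(self.fc(h.view(h.size(0), -1)))
        return self.value_head(h), self.policy_head(h)


model = TinyActorCritic(in_channels=4, num_actions=6)  # 4 = 1 channel x num_stack
optimizer = optim.RMSprop(model.parameters(), lr=7e-4, alpha=0.99, eps=1e-5)
train(model, optimizer, make_84x84_env, num_envs=16, num_stack=4, num_steps=5,
      num_updates=100000, gamma=0.99, value_loss_coef=0.5, entropy_coef=0.01,
      max_grad_norm=0.5)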
def train(model, create_env, num_envs, optimizer, gamma, num_updates,
          max_episode_length, steps_per_update):
    # torch.manual_seed(args.seed)
    # env.seed(args.seed)

    model.train()

    env = MultiEnv(create_env, num_envs)
    state = env.reset()  # list of states for each concurrent env
    state = torch.from_numpy(state)
    episode_done = True

    episode_length = 0
    update = 0
    while update < num_updates:
        episode_length += 1

        values = []
        log_action_probs = []
        rewards = []
        entropies = []

        for step in range(steps_per_update):
            # list of values and action logits for each concurrent env
            value, action_logit = model(Variable(state))

            action_prob = F.softmax(action_logit)
            log_action_prob = F.log_softmax(action_logit)
            entropy = -(log_action_prob * action_prob).sum(1)
            entropies.append(entropy)

            action = action_prob.multinomial().data
            log_action_prob = log_action_prob.gather(1, Variable(action))

            state, reward, episode_done, _ = env.step(action.numpy())

            if episode_length >= max_episode_length:
                episode_done = True

            reward = max(min(reward, 1), -1)
            state = torch.from_numpy(state)

            values.append(value)
            log_action_probs.append(log_action_prob)
            rewards.append(reward)

            if episode_done:
                episode_length = 0
                state = env.reset()
                break

        R = torch.zeros(1, 1)
        if not episode_done:
            value, _ = model(Variable(state))
            R = value.data
        values.append(Variable(R))

        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        advantage = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            advantage = Variable(advantage * gamma + rewards[i] +
                                 gamma * values[i + 1].data - values[i].data)
            policy_loss = policy_loss - log_action_probs[
                i] * advantage - 0.01 * entropies[i]

        loss = policy_loss + 0.5 * value_loss

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm(model.parameters(), 40)
        optimizer.step()

        update += 1
def train(opts: Settings):
    # === INSTANTIATE ENVIRONMENT ===
    # gym.make() but with imports configured as specified in arg.
    _gym_make = partial(gym_make, opts.imports)
    subproc_gym_make = subproc(_gym_make)

    # If `opts.subproc==True`, invoke gym.make() in a subprocess,
    # and treat the resultant instance as a `gym.Env`.
    make_env = subproc_gym_make if opts.subproc else _gym_make

    def get_env(index: int):
        env = make_env(opts.env_id)
        env.seed(index)
        env.reset()
        return env

    env = MultiEnv(get_env, opts.num_envs)

    entry_type = [
        ('state', env.observation_space.dtype, env.observation_space.shape),
        ('action', env.action_space.dtype, env.action_space.shape),
        ('reward', np.float32, (1, )),
        # ('state1', env.observation_space.dtype, env.observation_space.shape),
        ('done', np.bool, (1, )),
        ('value', np.float32, (1, )),
        ('log_prob', np.float32, env.action_space.shape)
    ]

    # === NORMALIZERS FOR INPUTS ===
    reward_normalizer = ExponentialMovingGaussian(
        alpha=opts.reward_normalizer_alpha)
    state_normalizer = ExponentialMovingGaussian(
        alpha=opts.state_normalizer_alpha)

    # === INSTANTIATE MEMORY ===
    memory = ContiguousRingBuffer(capacity=opts.update_steps,
                                  dims=(opts.num_envs, ),
                                  dtype=entry_type)

    # === INSTANTIATE POLICY ===
    # FIXME(ycho): Instead of assuming 1D box spaces,
    # explicitly wrap envs with flatten()...
    device = th.device(opts.device)
    policy = AC(env.observation_space.shape[0], env.action_space.shape[0],
                opts.ac).to(device)

    # === INSTANTIATE AGENT ===
    ppo = PPO(policy, memory, device, opts.ppo)

    # === TRAIN ===
    states = env.reset()
    dones = np.full((opts.num_envs, 1), False, dtype=np.bool)
    returns = np.zeros(opts.num_envs, dtype=np.float32)

    # === LOGGER ===
    # TODO(ycho): Configure logger
    writer = SummaryWriter()
    writer.add_graph(policy, th.as_tensor(states).float().to(device))

    # === CALLBACKS ===
    save_cb = SaveCallback(
        opts.save_steps, opts.ckpt_path, lambda: {
            'settings': opts,
            'state_dict': policy.state_dict(),
            'reward_normalizer': reward_normalizer.params(),
            'state_normalizer': state_normalizer.params()
        })

    # === VARIABLES FOR DEBUGGING / LOG TRACKING ===
    reset_count = 0
    start_time = time.time()

    # === START TRAINING ===
    step = 0
    while step < opts.max_steps:
        # Reset any env that has reached termination.
        # FIXME(ycho): assumes isinstance(env, MultiEnv), of course.
        for i in range(opts.num_envs):
            if not dones[i]:
                continue
            states[i][:] = env.envs[i].reset()
            returns[i] = 0.0
            reset_count += 1

        # NOTE(ycho): Workaround for the current limitation of `MultiEnv`.
        # action = [env.action_space.sample() for _ in range(opts.num_envs)]

        # sanitize `states` arg.
        states = np.asarray(states).astype(np.float32)

        # Add states to stats for normalization.
        for s in states:
            state_normalizer.add(s)

        # Normalize states in-place.
        states = state_normalizer.normalize(states)
        states = np.clip(states, -10.0, 10.0)  # clip to +-10 stddev

        with th.no_grad():
            action, value, log_prob = ppo.act(states, True)

        # NOTE(ycho): Clip action within valid domain...
        clipped_action = np.clip(action, env.action_space.low,
                                 env.action_space.high)

        # Step according to above action.
        out = env.step(clipped_action)

        # Format entry.
        nxt_states, rewards, dones, _ = out

        # Add rewards to stats for normalization.
        # returns[np.asarray(dones).reshape(-1).astype(np.bool)] = 0.0
        returns = returns * opts.gae.gamma + np.reshape(rewards, -1)

        # NOTE(ycho): collect stats on `returns` instead of `rewards`.
        # for r in rewards:
        #     reward_normalizer.add(r)
        for r in returns:
            reward_normalizer.add(r)

        # Train if buffer full ...
        if memory.is_full:
            writer.add_scalar('reward_mean',
                              reward_normalizer.mean,
                              global_step=step)
            writer.add_scalar('reward_var',
                              reward_normalizer.var,
                              global_step=step)
            writer.add_scalar('log_std',
                              policy.log_std.detach().cpu().numpy()[0],
                              global_step=step)
            writer.add_scalar('fps',
                              step / (time.time() - start_time),
                              global_step=step)

            # NOTE(ycho): Don't rely on printed reward stats for tracking
            # training progress ... use tensorboard instead.
            print('== step {} =='.format(step))

            # Log reward before overwriting with normalized values.
            print('rew = mean {} min {} max {} std {}'.format(
                memory['reward'].mean(), memory['reward'].min(),
                memory['reward'].max(), memory['reward'].std()))
            # print('rm {} rv {}'.format(reward_normalizer.mean,
            #                            reward_normalizer.var))

            # NOTE(ycho): States have already been normalized,
            # since those states were utilized as input for PPO action.
            # After that, the normalized states were inserted in memory.
            # memory['state'] = state_normalizer.normalize(memory['state'])

            # NOTE(ycho): I think it's fine to delay reward normalization to this point.
            # memory['reward'] = reward_normalizer.normalize(memory['reward'])
            # NOTE(ycho): maybe the proper thing to do is:
            # memory['reward'] = (memory['reward'] - reward_normalizer.mean) / np.sqrt(return_normalizer.var)
            memory['reward'] /= np.sqrt(reward_normalizer.var)
            memory['reward'] = np.clip(memory['reward'], -10.0, 10.0)

            # Create training data slices from memory ...
            dones = np.asarray(dones).reshape(opts.num_envs, 1)
            advs, rets = gae(memory, value, dones, opts.gae)
            # print('std = {}'.format(ppo.policy.log_std.exp()))

            ucount = 0
            info = None
            for _ in range(opts.num_epochs):
                for i in range(0, len(memory), opts.batch_size):
                    # Prepare current minibatch dataset ...
                    exp = memory[i:i + opts.batch_size]
                    act = exp['action']
                    obs = exp['state']
                    old_lp = exp['log_prob']
                    # old_v = exp['value']  # NOTE(ycho): unused
                    adv = advs[i:i + opts.batch_size]
                    ret = rets[i:i + opts.batch_size]

                    # Evaluate what had been done ...
                    # NOTE(ycho): wouldn't new_v == old_v
                    # and new_lp == old_lp for the very first one in the batch??
                    # hmm ....
                    new_v, new_lp, entropy = ppo.evaluate(
                        obs.copy(), act.copy())

                    info_i = {}
                    loss = ppo.compute_loss(obs.copy(), act.copy(),
                                            old_lp.copy(), new_v, new_lp,
                                            entropy, adv, ret, info_i)

                    # NOTE(ycho): Below, only required for logging
                    if True:
                        with th.no_grad():
                            if info is None:
                                info = info_i
                            else:
                                for k in info.keys():
                                    info[k] += info_i[k]
                            ucount += 1

                    # Optimization step
                    ppo.optimizer.zero_grad()
                    loss.backward()

                    # Clip grad norm
                    th.nn.utils.clip_grad_norm_(ppo.policy.parameters(),
                                                opts.ppo.max_grad_norm)
                    ppo.optimizer.step()

            for k, v in info.items():
                writer.add_scalar(k,
                                  v.detach().cpu().numpy() / ucount,
                                  global_step=step)

            # Empty the memory !
            memory.reset()

        # Append to memory.
        entry = list(
            zip(*(
                states,
                action,
                rewards,
                # nxt_states,
                dones,
                value,
                log_prob)))
        memory.append(entry)

        # Cache `states`, update steps and continue.
        states = nxt_states
        step += opts.num_envs
        save_cb.on_step(step)

    writer.close()

    # Save ...
    th.save(
        {
            'settings': opts,
            'state_dict': policy.state_dict(),
            'reward_normalizer': reward_normalizer.params(),
            'state_normalizer': state_normalizer.params()
        }, opts.model_path)
def train():
    args = parse_a2c_args()
    output_dir = initialize_logging(args)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_updates = int(
        args.num_frames) // args.num_steps // args.num_environments

    # Create the train and test environments with multiple processes
    train_envs = MultiEnv(args.simulator, args.num_environments, args,
                          is_train=True)
    test_envs = MultiEnv(args.simulator, args.num_environments, args,
                         is_train=False)

    obs_shape = train_envs.obs_shape

    # The agent's policy network and the A2C training algorithm
    policy = CNNPolicy(obs_shape, args).to(device)
    agent = A2CAgent(policy,
                     args.hidden_size,
                     value_weight=args.value_loss_coef,
                     entropy_weight=args.entropy_coef,
                     num_steps=args.num_steps,
                     num_parallel=args.num_environments,
                     gamma=args.gamma,
                     lr=args.learning_rate,
                     opt_alpha=args.alpha,
                     opt_momentum=args.momentum,
                     max_grad_norm=args.max_grad_norm)

    start_j = 0
    if args.reload_model:
        checkpoint_idx = args.reload_model.split(',')[1]
        checkpoint_filename = '{}models/checkpoint_{}.pth.tar'.format(
            output_dir, checkpoint_idx)
        agent.load_model(checkpoint_filename)
        start_j = (int(checkpoint_idx) // args.num_steps //
                   args.num_environments) + 1

    obs = train_envs.reset()
    start = time.time()

    for j in range(start_j, num_updates):
        # Periodically evaluate the current policy on the test environments.
        if not args.skip_eval and j % args.eval_freq == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            mean_rewards, game_times = agent.evaluate(test_envs, j,
                                                      total_num_steps)
            logging.info(mean_rewards)
            logging.info(game_times)

        # Collect one rollout of experience from the training environments.
        for step in range(args.num_steps):
            action = agent.get_action(obs, step)
            obs, reward, done, info = train_envs.step(action)
            agent.add_rewards_masks(reward, done, step)

        report = agent.update(obs)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            save_num_steps = start_j * args.num_environments * args.num_steps
            FPS = int((total_num_steps - save_num_steps) / (end - start))
            logging.info(report.format(j, total_num_steps, FPS))

        if j % args.model_save_rate == 0:
            total_num_steps = (j + 1) * args.num_environments * args.num_steps
            agent.save_policy(total_num_steps, args, output_dir)

    # cancel the env processes
    train_envs.cancel()
    test_envs.cancel()