def parallel_train(args):
    logger = SummaryWriter(log_dir='results/{}_{}_{}'.format(
        args.env, args.seed, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # A throwaway environment is only needed to read the environment specs.
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    workers, parent_conns, children_conns = workers_initialize(args)

    obs = np.zeros(shape=[args.num_worker, 4, 84, 84], dtype=np.float32)

    # Warm up the observation normalizer with transitions from a random policy.
    print('Start initializing obs normalizer...')
    next_obs_batch = []
    for step in range(args.initialize_episode * args.max_episode_step):
        actions = np.random.randint(0, env_params['a_dim'], size=args.num_worker)
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            obs_, r, done, info = parent_conn.recv()
            next_obs_batch.append(obs_)
        if len(next_obs_batch) % (10 * args.num_worker) == 0:
            next_obs_batch = np.stack(next_obs_batch)
            agent.normalizer_obs.update(next_obs_batch)
            next_obs_batch = []
    print('Finished initializing obs normalizer...')

    log_reward_ex = 0
    log_reward_in = 0
    log_step = 0
    log_episode = 0

    for i_epoch in range(args.max_epoch):
        epoch_obs, epoch_action, epoch_ri, epoch_re, epoch_mask, epoch_next_obs, epoch_logprob = \
            [], [], [], [], [], [], []

        # Collect a rollout of rollout_len steps from every worker.
        for i_step in range(args.rollout_len):
            actions, log_probs = agent.choose_action(obs)
            for action, parent_conn in zip(actions, parent_conns):
                parent_conn.send(action)

            batch_re, batch_mask, batch_next_obs = [], [], []
            for parent_conn in parent_conns:
                obs_, r_e, done, info = parent_conn.recv()
                batch_next_obs.append(obs_)
                batch_re.append(r_e)
                batch_mask.append(0 if done else 1)

            batch_next_obs = np.stack(batch_next_obs)
            batch_re = np.stack(batch_re)
            batch_mask = np.stack(batch_mask)
            batch_ri = agent.compute_intrinsic_reward(batch_next_obs.copy())

            # Log per-episode extrinsic/intrinsic returns for one chosen worker.
            log_reward_ex += batch_re[args.log_env_idx]
            log_reward_in += batch_ri[args.log_env_idx]
            log_step += 1
            if batch_mask[args.log_env_idx] == 0:
                log_episode += 1
                logger.add_scalar('Indicator/Reward_ex', log_reward_ex, log_episode)
                logger.add_scalar('Indicator/Reward_in', log_reward_in, log_episode)
                log_reward_ex = 0
                log_reward_in = 0

            epoch_obs.append(obs)
            epoch_action.append(actions)
            epoch_next_obs.append(batch_next_obs)
            epoch_ri.append(batch_ri)
            epoch_re.append(batch_re)
            epoch_mask.append(batch_mask)
            epoch_logprob.append(log_probs)

            obs = batch_next_obs[:, :, :, :]

        # Stack to (rollout_len, num_worker, ...) and transpose to (num_worker, rollout_len, ...).
        epoch_obs = np.stack(epoch_obs)
        epoch_action = np.stack(epoch_action)
        epoch_ri = np.stack(epoch_ri)
        epoch_re = np.stack(epoch_re)
        epoch_mask = np.stack(epoch_mask)
        epoch_next_obs = np.stack(epoch_next_obs)
        epoch_logprob = np.stack(epoch_logprob)

        epoch_obs = np.transpose(epoch_obs, axes=[1, 0, 2, 3, 4])
        epoch_action = np.transpose(epoch_action, axes=[1, 0])
        epoch_ri = np.transpose(epoch_ri, axes=[1, 0])
        epoch_re = np.transpose(epoch_re, axes=[1, 0])
        epoch_mask = np.transpose(epoch_mask, axes=[1, 0])
        epoch_next_obs = np.transpose(epoch_next_obs, axes=[1, 0, 2, 3, 4])
        epoch_logprob = np.transpose(epoch_logprob, axes=[1, 0])

        # One PPO + RND update over the collected rollout.
        loss_rnd, loss_a, loss_c = agent.update(epoch_obs, epoch_action, epoch_ri, epoch_re,
                                                epoch_mask, epoch_next_obs, epoch_logprob)

        used_sample_num = args.rollout_len * args.num_worker * i_epoch
        logger.add_scalar('Loss/loss_RND', loss_rnd, used_sample_num)
        logger.add_scalar('Loss/loss_a', loss_a, used_sample_num)
        logger.add_scalar('Loss/loss_c', loss_c, used_sample_num)

        if i_epoch % args.save_model_interval == 0:
            agent.save_model(remark='{}'.format(i_epoch))
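
# ---------------------------------------------------------------------------
# Hypothetical sketch of the worker side of the pipe protocol that
# parallel_train() drives: each worker blocks on recv() for an action, steps
# its own environment, auto-resets on termination, and sends back
# (next_obs, reward, done, info). The real workers_initialize(args) presumably
# builds its own preprocessed (4, 84, 84) frame-stacked environments; the
# env_fn parameter and all names below are illustrative assumptions, not the
# project's actual implementation.
# ---------------------------------------------------------------------------
import multiprocessing as mp


def _example_worker_loop(child_conn, env_fn, max_episode_step):
    env = env_fn()                      # each worker owns one environment
    env.reset()
    step_count = 0
    while True:
        action = child_conn.recv()      # block until the trainer sends an action
        obs_, reward, done, info = env.step(action)
        step_count += 1
        if done or step_count >= max_episode_step:
            obs_ = env.reset()          # auto-reset so the trainer never stalls
            done, step_count = True, 0
        child_conn.send((obs_, reward, done, info))


def example_workers_initialize(args, env_fn):
    workers, parent_conns, children_conns = [], [], []
    for _ in range(args.num_worker):
        parent_conn, child_conn = mp.Pipe()
        worker = mp.Process(target=_example_worker_loop,
                            args=(child_conn, env_fn, args.max_episode_step),
                            daemon=True)
        worker.start()
        workers.append(worker)
        parent_conns.append(parent_conn)
        children_conns.append(child_conn)
    return workers, parent_conns, children_conns
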
def train(args):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    envs = MultiprocessEnvironment.create_mario_env(num_envs=args.jobs,
                                                    world=args.world,
                                                    stage=args.stage)
    actor_critic = RecurrentPolicy(state_frame_channels=envs.observation_shape[0],
                                   action_space_size=envs.action_space_size,
                                   hidden_layer_size=args.hidden_size,
                                   prev_actions_out_size=args.prev_actions_hidden_size,
                                   recurrent_hidden_size=args.recurrent_hidden_size,
                                   device=device)
    experience = ExperienceStorage(num_steps=args.steps_per_update,
                                   num_envs=args.jobs,
                                   observation_shape=envs.observation_shape,
                                   recurrent_hidden_size=args.recurrent_hidden_size,
                                   device=device)

    initial_observations = envs.reset()
    experience.insert_initial_observations(initial_observations)

    tb_writer = SummaryWriter()

    num_updates = args.steps // (args.jobs * args.steps_per_update)
    agent = PPOAgent(actor_critic,
                     lr=args.lr,
                     lr_lambda=lambda step: 1 - (step / float(num_updates)),
                     policy_loss_coef=args.policy_loss_coef,
                     value_loss_coef=args.value_loss_coef,
                     entropy_loss_coef=args.entropy_loss_coef,
                     max_grad_norm=args.max_grad_norm,
                     clip_threshold=args.ppo_clip_threshold,
                     epochs=args.ppo_epochs,
                     minibatches=args.ppo_minibatches)

    for update_step in tqdm(range(num_updates)):
        episode_rewards = []

        # Collect one rollout of steps_per_update transitions from every environment.
        for step in range(args.steps_per_update):
            with torch.no_grad():
                actor_input = experience.get_actor_input(step)
                (values,
                 actions,
                 action_log_probs,
                 _,  # Action distribution entropy is not needed here.
                 recurrent_hidden_states) = actor_critic.act(*actor_input)

            observations, rewards, done_values, info_dicts = envs.step(actions)
            masks = 1 - done_values
            experience.insert(observations, actions, action_log_probs,
                              rewards, values, masks, recurrent_hidden_states)

            for done, info in zip(done_values, info_dicts):
                if done:
                    level_completed_percentage = info['x_pos'] / MAX_X
                    episode_rewards.append(level_completed_percentage)

        # Bootstrap with the critic's value of the last observation, then compute GAE returns.
        with torch.no_grad():
            critic_input = experience.get_critic_input()
            next_value = actor_critic.value(*critic_input)
        experience.compute_gae_returns(next_value,
                                       gamma=args.discount,
                                       gae_lambda=args.gae_lambda)

        losses = agent.update(experience)

        if episode_rewards:
            with torch.no_grad():
                cumulative_reward = experience.rewards.sum((0, 2))
                mean_reward = cumulative_reward.mean()
                std_reward = cumulative_reward.std()

            tb_writer.add_scalar('mario/lr', agent.current_lr(), update_step)
            tb_writer.add_scalars('mario/level_progress', {
                'min': np.min(episode_rewards),
                'max': np.max(episode_rewards),
                'mean': np.mean(episode_rewards),
                'median': np.median(episode_rewards),
            }, update_step)
            tb_writer.add_scalars('mario/reward',
                                  {'mean': mean_reward, 'std': std_reward},
                                  update_step)
            tb_writer.add_scalars('mario/loss', {
                'policy': losses['policy_loss'],
                'value': losses['value_loss'],
            }, update_step)
            tb_writer.add_scalar('mario/action_dist_entropy',
                                 losses['action_dist_entropy'],
                                 update_step)

            # Level progress saturates at 1.0, i.e. every finished episode cleared the level.
            if np.min(episode_rewards) == 1.0:
                model_path = 'models/super_model_{}.bin'.format(update_step + 1)
                torch.save(actor_critic.state_dict(), model_path)

        save_model = (update_step % args.save_interval) == (args.save_interval - 1)
        if save_model:
            model_path = 'models/model_{}.bin'.format(update_step + 1)
            torch.save(actor_critic.state_dict(), model_path)

    tb_writer.close()
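
# ---------------------------------------------------------------------------
# Hypothetical command-line entry point for train(). The flag names mirror the
# args.* fields read above (argparse maps --steps-per-update to
# args.steps_per_update, and so on), but every default value here is an
# illustrative assumption, not the project's tuned configuration.
# ---------------------------------------------------------------------------
import argparse


def parse_train_args():
    parser = argparse.ArgumentParser(description='PPO training for Super Mario Bros.')
    parser.add_argument('--jobs', type=int, default=8)
    parser.add_argument('--world', type=int, default=1)
    parser.add_argument('--stage', type=int, default=1)
    parser.add_argument('--steps', type=int, default=10_000_000)
    parser.add_argument('--steps-per-update', type=int, default=128)
    parser.add_argument('--hidden-size', type=int, default=512)
    parser.add_argument('--prev-actions-hidden-size', type=int, default=128)
    parser.add_argument('--recurrent-hidden-size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=2.5e-4)
    parser.add_argument('--policy-loss-coef', type=float, default=1.0)
    parser.add_argument('--value-loss-coef', type=float, default=0.5)
    parser.add_argument('--entropy-loss-coef', type=float, default=0.01)
    parser.add_argument('--max-grad-norm', type=float, default=0.5)
    parser.add_argument('--ppo-clip-threshold', type=float, default=0.1)
    parser.add_argument('--ppo-epochs', type=int, default=4)
    parser.add_argument('--ppo-minibatches', type=int, default=4)
    parser.add_argument('--discount', type=float, default=0.99)
    parser.add_argument('--gae-lambda', type=float, default=0.95)
    parser.add_argument('--save-interval', type=int, default=100)
    return parser.parse_args()


if __name__ == '__main__':
    train(parse_train_args())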