def main():
    """Train the actor-critic model and the RND predictor from parallel workers.

    All settings come from ``get_args()``. One Obstacle Tower environment is
    opened only to read the observation shape and the size of the flattened
    action space, then closed; rollouts come from ``args.num_worker``
    ``AtariEnvironment`` workers that are driven over pipes.

    Each iteration:

    1. collects ``args.num_step`` transitions per worker and scores the last
       frame of every next state with ``compute_intrinsic_reward``;
    2. divides the intrinsic rewards by the running std tracked in
       ``reward_rms`` (fed from ``discounted_reward.update``);
    3. builds extrinsic and intrinsic targets/advantages with
       ``make_train_data`` and mixes the advantages with ``args.int_coef`` /
       ``args.ext_coef``;
    4. updates the observation statistics and calls ``train_model``.

    Weights are written to ``args.save_dir`` every ``args.save_interval``
    updates. The loop ends only when ``args.terminate`` is set and
    ``global_step`` exceeds ``args.terminate_steps``; the running statistics
    are then pickled to ``reward_rms.pkl`` and ``obs_rms.pkl`` in the current
    working directory.
    """

    def dump_rms(reward_stats, obs_stats):
        # Persist both running statistics so a later run can use load_rms.
        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_stats, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_stats, f)

    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Probe environment: used only for the observation/action sizes.
    seed = np.random.randint(0, 100)
    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed,
                           retro=True, config={'total-floors': 12},
                           greyscale=True, timeout_wait=300)
    try:
        env._flattener = ActionFlattener([2, 3, 2, 1])
        env._action_space = env._flattener.action_space
        input_size = env.observation_space.shape
        output_size = env.action_space.n
    finally:
        # Always release the probe environment, even if the setup above fails.
        env.close()

    is_render = False
    os.makedirs(args.save_dir, exist_ok=True)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()

    # This filter is only ever fed intrinsic rewards, so it must use the
    # intrinsic discount (the same one passed to make_train_data below).
    discounted_reward = RewardForwardFilter(args.int_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(
        list(model.parameters()) + list(rnd.predictor.parameters()),
        lr=args.lr)

    if args.load_model:
        print('Loading model...')
        # NOTE(review): only the actor-critic weights are restored; the RND
        # predictor/target saved next to them are not reloaded -- confirm
        # that this is intended.
        map_location = None if args.cuda else 'cpu'
        model.load_state_dict(torch.load(model_path, map_location=map_location))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # index of the worker whose episodes are logged
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        # WARNING: dill.load can execute arbitrary code; only load statistics
        # files that were written by this program.
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # Warm up the observation statistics with uniformly random actions.
        print('Initializing observation normalization...')
        next_obs = []
        for _ in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            for parent_conn in parent_conns:
                next_state, _, _, _, _ = parent_conn.recv()
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            # Flush one rollout-sized batch at a time into the statistics.
            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []

        dump_rms(reward_rms, obs_rms)

    print('Training...')
    try:
        while True:
            total_state, total_reward, total_done = [], [], []
            total_action, total_action_probs = [], []
            total_int_reward, total_next_obs = [], []
            total_ext_values, total_int_values = [], []
            global_step += (args.num_worker * args.num_step)
            global_update += 1

            # Step 1. n-step rollout
            for _ in range(args.num_step):
                actions, value_ext, value_int, action_probs = get_action(
                    model, device, np.float32(states) / 255.)

                for parent_conn, action in zip(parent_conns, actions):
                    parent_conn.send(action)

                next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                    [], [], [], [], [], []
                for parent_conn in parent_conns:
                    next_state, reward, done, real_done, log_reward = parent_conn.recv()
                    next_states.append(next_state)
                    rewards.append(reward)
                    dones.append(done)
                    real_dones.append(real_done)
                    log_rewards.append(log_reward)
                    # Only the last frame of the stack is scored by the RND model.
                    next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

                next_states = np.stack(next_states)
                rewards = np.hstack(rewards)
                dones = np.hstack(dones)
                real_dones = np.hstack(real_dones)
                next_obs = np.stack(next_obs)

                # Intrinsic reward on the normalized, clipped observation.
                intrinsic_reward = compute_intrinsic_reward(
                    rnd, device,
                    ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
                intrinsic_reward = np.hstack(intrinsic_reward)

                total_next_obs.append(next_obs)
                total_int_reward.append(intrinsic_reward)
                total_state.append(states)
                total_reward.append(rewards)
                total_done.append(dones)
                total_action.append(actions)
                total_ext_values.append(value_ext)
                total_int_values.append(value_int)
                total_action_probs.append(action_probs)

                states = next_states[:, :, :, :]

                sample_rall += log_rewards[sample_env_index]
                sample_step += 1
                if real_dones[sample_env_index]:
                    sample_episode += 1
                    writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                    writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                    writer.add_scalar('data/step', sample_step, sample_episode)
                    sample_rall = 0
                    sample_step = 0

            # Value of the state after the last step, used for bootstrapping.
            _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)

            # Reorder the buffers from (step, worker, ...) to (worker, step, ...).
            total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
            total_reward = np.stack(total_reward).transpose().clip(-1, 1)
            total_action = np.stack(total_action).transpose().reshape([-1])
            total_done = np.stack(total_done).transpose()
            total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
            total_ext_values = np.stack(total_ext_values).transpose()
            total_int_values = np.stack(total_int_values).transpose()
            total_logging_action_probs = np.vstack(total_action_probs)

            # Step 2. normalize the intrinsic reward by its running std
            total_int_reward = np.stack(total_int_reward).transpose()
            total_reward_per_env = np.array(
                [discounted_reward.update(reward_per_step)
                 for reward_per_step in total_int_reward.T])
            mean, std, count = (np.mean(total_reward_per_env),
                                np.std(total_reward_per_env),
                                len(total_reward_per_env))
            reward_rms.update_from_moments(mean, std ** 2, count)

            total_int_reward /= np.sqrt(reward_rms.var)
            writer.add_scalar('data/int_reward_per_epi',
                              np.sum(total_int_reward) / args.num_worker, sample_episode)
            writer.add_scalar('data/int_reward_per_rollout',
                              np.sum(total_int_reward) / args.num_worker, global_update)

            # Mean over the rollout of the largest action probability.
            writer.add_scalar('data/max_prob',
                              total_logging_action_probs.max(1).mean(), sample_episode)

            # Step 3. targets and advantages
            ext_target, ext_adv = make_train_data(
                total_reward, total_done, total_ext_values, args.ext_gamma,
                args.gae_lambda, args.num_step, args.num_worker, args.use_gae)

            # The intrinsic stream is non-episodic: its done mask is all zeros.
            int_target, int_adv = make_train_data(
                total_int_reward, np.zeros_like(total_int_reward), total_int_values,
                args.int_gamma, args.gae_lambda, args.num_step, args.num_worker,
                args.use_gae)

            total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef

            # Step 4. update the observation normalization statistics
            obs_rms.update(total_next_obs)

            # Step 5. train
            train_model(args, device, output_size, model, rnd, optimizer,
                        np.float32(total_state) / 255., ext_target, int_target,
                        total_action, total_adv,
                        ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                        total_action_probs)

            if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
                print('Now Global Step :{}'.format(global_step))
                torch.save(model.state_dict(), model_path)
                torch.save(rnd.predictor.state_dict(), predictor_path)
                torch.save(rnd.target.state_dict(), target_path)

            if args.terminate and (global_step > args.terminate_steps):
                dump_rms(reward_rms, obs_rms)
                break
    finally:
        # Flush pending TensorBoard events on normal exit, error or Ctrl-C.
        writer.close()
class Runner:
    """Drive training and evaluation of an agent on a single environment.

    The runner owns a training environment and a deep-copied evaluation
    environment, running statistics for observations and intrinsic rewards,
    and the bookkeeping needed to log evaluation returns.

    ``agent`` must provide ``get_action``, ``calc_intrinsic_reward``,
    ``train_model`` and ``exploit``; ``env`` must follow the classic gym API
    (``seed``, ``reset``, ``step`` returning a 4-tuple, ``action_space``).
    """

    def __init__(self,
                 env: Any,
                 agent: Any,
                 save_interval: int = 1000,
                 train_episode: int = 10**9,
                 num_eval_episode: int = 3,
                 episode_len: int = 3000,
                 pre_step: int = 10000,
                 gamma: float = 0.995,
                 int_gamma: float = 0.995,
                 lam: float = 0.97,
                 device=torch.device('cpu'),
                 int_coef: float = 1,
                 ext_coef: float = 0.3,
                 eval_interval: int = 10**4,
                 seed: int = 0):
        """Store the configuration and prepare environments and statistics.

        Args:
            env: training environment; a deep copy is used for evaluation.
            agent: the learner (see the class docstring for its interface).
            save_interval: stored on the instance; not used inside this class.
            train_episode: number of rollouts performed by ``start``.
            num_eval_episode: episodes averaged by ``evaluate``.
            episode_len: number of environment steps per rollout.
            pre_step: random steps used to initialise the observation stats.
            gamma: discount for extrinsic rewards.
            int_gamma: discount for intrinsic rewards.
            lam: GAE lambda.
            device: stored on the instance; not used inside this class.
            int_coef: weight of the intrinsic advantage.
            ext_coef: weight of the extrinsic advantage.
            eval_interval: evaluate every this many rollouts.
            seed: seed for ``env``; the test env is seeded with ``2**31 - seed``.
        """
        self.save_interval = save_interval
        self.eval_interval = eval_interval

        # Environments: the test copy gets a different seed.
        self.env = env
        self.env.seed(seed)
        self.env_test = deepcopy(env)
        self.env_test.seed(2**31 - seed)
        self.agent = agent

        # Step and episode counters.
        self.global_step = 0
        self.step_in_episode = 0
        self.episode_so_far = 0
        self.episode_len = episode_len
        self.num_eval_episode = num_eval_episode
        self.train_episode = train_episode
        self.pre_step = pre_step

        # Running statistics; the observation shape is taken from one reset.
        self.reward_rms = RunningMeanStd()
        obs_sampled = self.env.reset()
        self.obs_rms = RunningMeanStd(shape=[1] + list(obs_sampled.shape))

        self.device = device
        self.lam = lam
        self.gamma = gamma
        self.int_gamma = int_gamma

        # Mixing weights of the intrinsic and extrinsic advantages.
        self.int_coef = int_coef
        self.ext_coef = ext_coef
        self.reward_in_episode = 0.0

        self.returns = {'step': [], 'return': []}

    def _normalize_obs(self, obs):
        """Return ``obs`` standardized with the running stats, clipped to [-5, 5]."""
        return ((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var)).clip(-5, 5)

    def run_episode(self):
        """Collect ``episode_len`` transitions and run one agent update.

        The environment is reset at the start and whenever it reports
        ``done``, so the rollout always has exactly ``episode_len`` steps.
        """
        total_state, total_reward, total_done = [], [], []
        total_action, total_policy = [], []
        total_int_reward, total_next_obs = [], []
        total_ext_values, total_int_values = [], []

        self.step_in_episode = 0
        self.reward_in_episode = 0
        obs = self.env.reset()

        for _ in range(self.episode_len):
            action, policy, value_ext, value_int = self.agent.get_action(obs)
            # NOTE(review): the factor 2 presumably rescales the agent's
            # action to the environment's range -- confirm.
            obs_next, reward, done, info = self.env.step(2 * action)
            self.reward_in_episode += reward
            self.global_step += 1
            self.step_in_episode += 1

            # Clip the normalized observation (not the std) before scoring it.
            int_reward = self.agent.calc_intrinsic_reward(
                self._normalize_obs(obs_next))

            total_next_obs.append(obs_next)
            total_int_reward.append(int_reward)
            total_state.append(obs)
            total_reward.append(reward)
            total_done.append(done)
            total_action.append(action)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)

            # Stepping a finished environment is undefined; start a new
            # episode and let the done mask cut the return in ``gae``.
            obs = self.env.reset() if done else obs_next

        # Bootstrap values for the state after the last step.
        _, _, value_ext, value_int = self.agent.get_action(obs)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)

        total_state = np.stack(total_state)
        total_action = np.stack(total_action)
        total_done = np.stack(total_done)
        total_next_obs = np.stack(total_next_obs)
        total_int_reward = np.stack(total_int_reward)

        # Scale intrinsic rewards by the running std of the intrinsic rewards.
        mean = np.mean(total_int_reward)
        std = np.std(total_int_reward)
        count = len(total_int_reward)
        self.reward_rms.update_from_moments(mean, std**2, count)
        total_int_reward = total_int_reward / np.sqrt(self.reward_rms.var)

        ext_target, ext_adv = self.gae(reward=total_reward,
                                       done=total_done,
                                       value=total_ext_values,
                                       gamma=self.gamma,
                                       num_step=self.episode_len)
        # The intrinsic stream is non-episodic: its done mask is all zeros.
        int_target, int_adv = self.gae(reward=total_int_reward,
                                       done=[0] * self.episode_len,
                                       value=total_int_values,
                                       gamma=self.int_gamma,
                                       num_step=self.episode_len)
        total_adv = int_adv * self.int_coef + ext_adv * self.ext_coef

        self.obs_rms.update(total_next_obs)
        self.agent.train_model(
            states=np.float32(total_state),
            target_ext=ext_target,
            target_int=int_target,
            actions=total_action,
            advs=total_adv,
            next_states=self._normalize_obs(total_next_obs),
            log_pi_old=total_policy,  # TODO: fix this
            num_step=self.episode_len)

    def evaluate(self, steps):
        """Run several evaluation episodes and record their mean return.

        Args:
            steps: global step count stored alongside the mean return.
        """
        returns = []

        for _ in range(self.num_eval_episode):
            state = self.env_test.reset()
            done = False
            episode_return = 0.0

            while not done:
                action = self.agent.exploit(state)
                state, reward, done, _ = self.env_test.step(2 * action)
                episode_return += reward

            returns.append(episode_return)

        mean_return = np.mean(returns)
        self.returns['step'].append(steps)
        self.returns['return'].append(mean_return)

        print(f'Num steps: {steps:<6} '
              f'Num episode: {self.episode_so_far} '
              f'Return: {mean_return:<5.1f} '
              f'Time: {self.time}')

    def plot(self):
        """Draw the mean-return curve and save it to ``figure.png``."""
        fig = plt.figure(figsize=(8, 6))
        plt.plot(self.returns['step'], self.returns['return'])
        plt.xlabel('Steps', fontsize=24)
        plt.ylabel('Return', fontsize=24)
        plt.tick_params(labelsize=18)
        plt.title(f'{self.env.unwrapped.spec.id}', fontsize=24)
        plt.tight_layout()
        plt.savefig('figure.png')
        # Release the figure; otherwise every call keeps one more alive.
        plt.close(fig)

    def start(self):
        """Run the training loop with periodic evaluation and plotting.

        Evaluation runs every ``eval_interval`` rollouts and plotting every
        ``10 * eval_interval`` rollouts, both including rollout 0.
        """
        self.start_time = time()
        self.prepare_normalization_coeff()
        print('Start Training')
        for episode in range(self.train_episode):
            self.episode_so_far = episode
            self.run_episode()
            if episode % self.eval_interval == 0:
                self.evaluate(steps=self.global_step)
            if episode % (self.eval_interval * 10) == 0:
                self.plot()
        print('Finished')

    @property
    def time(self):
        """Elapsed wall-clock time since ``start`` as ``H:MM:SS`` text."""
        return str(timedelta(seconds=int(time() - self.start_time)))

    def prepare_normalization_coeff(self):
        """Initialise the observation statistics from ``pre_step`` random steps."""
        states = []
        for _ in range(self.pre_step):
            action = self.env.action_space.sample()
            state, reward, done, info = self.env.step(action)
            states.append(state)
            if done:
                # Do not keep stepping a finished environment.
                self.env.reset()
        states = np.array(states)
        self.obs_rms.update(states)

    def gae(self, reward: Sequence, done: Sequence, value: Sequence,
            gamma: float, num_step: int):
        """Compute GAE targets and advantages for one rollout.

        Args:
            reward: per-step rewards, length ``num_step``.
            done: per-step termination flags (truthy cuts the bootstrap).
            value: value estimates, length ``num_step + 1`` (last entry is
                the bootstrap value).
            gamma: discount factor.
            num_step: number of steps in the rollout.

        Returns:
            ``(discounted_return, advantage)`` as float32 arrays.
        """
        adv_tmp = 0
        discounted_return = [None] * num_step
        # Walk backwards so each step can reuse the advantage of the next one.
        for t in range(num_step - 1, -1, -1):
            not_done = 1 - done[t]
            delta = reward[t] + gamma * value[t + 1] * not_done - value[t]
            adv_tmp = delta + gamma * self.lam * not_done * adv_tmp
            discounted_return[t] = adv_tmp + value[t]
        discounted_return = np.array(discounted_return, dtype='float32')
        adv = discounted_return - np.array(value[:-1], dtype='float32')
        return discounted_return, adv
def main():
    """Train an ``RNDAgent`` on an Atari environment with parallel workers.

    Settings come from ``parse_arguments()``. Only ``env_type == 'atari'``
    and ``train_method == 'RND'`` are supported; anything else raises
    ``NotImplementedError``.

    Each iteration collects ``num_step`` transitions from every worker,
    computes intrinsic rewards on the last frame of each next state, scales
    them by the running std tracked in ``reward_rms``, builds extrinsic and
    intrinsic targets/advantages with ``make_train_data`` and calls
    ``agent.train_model``. Training stops once ``global_step`` reaches
    ``args.total_frames``. Checkpoints go to ``models/`` and logs to
    ``outputs/<env_id>``.

    Raises:
        NotImplementedError: for an unsupported environment type or
            training method.
    """
    args = parse_arguments()

    env_id = args.env_id
    env_type = args.env_type

    if env_type == 'atari':
        # Probe environment: used only for the observation/action sizes.
        env = gym.make(env_id)
        input_size = env.observation_space.shape
        output_size = env.action_space.n
        env.close()
    else:
        raise NotImplementedError

    is_render = False

    os.makedirs('models', exist_ok=True)
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    results_dir = os.path.join('outputs', args.env_id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.env_id))

    use_cuda = args.use_gpu
    use_gae = args.use_gae
    use_noisy_net = args.use_noisynet
    lam = args.lam
    num_worker = args.num_worker
    num_step = args.num_step
    ppo_eps = args.ppo_eps
    epoch = args.epoch
    mini_batch = args.minibatch
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = args.learning_rate
    entropy_coef = args.entropy
    gamma = args.gamma
    int_gamma = args.int_gamma
    clip_grad_norm = args.clip_grad_norm
    ext_coef = args.ext_coef
    int_coef = args.int_coef
    sticky_action = args.sticky_action
    action_prob = args.action_prob
    life_done = args.life_done
    pre_obs_norm_step = args.obs_norm_step

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(int_gamma)

    # Resolve the agent and worker classes under their own names instead of
    # rebinding ``agent`` / ``env_type``.
    if args.train_method == 'RND':
        agent_cls = RNDAgent
    else:
        raise NotImplementedError

    if args.env_type == 'atari':
        worker_cls = AtariEnvironment
    else:
        raise NotImplementedError

    agent = agent_cls(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net
    )

    logger.info('Start to initialize workers')
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = worker_cls(env_id, is_render, idx, child_conn,
                          sticky_action=sticky_action, p=action_prob,
                          life_done=life_done,
                          max_step_per_episode=args.max_step_per_episode)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0  # index of the worker whose episodes are logged
    global_update = 0
    global_step = 0

    # Warm up the observation statistics with uniformly random actions.
    logger.info('Start to initailize observation normalization parameter.....')
    next_obs = []
    for _ in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, _, _, _, _ = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        # Flush one rollout-sized batch at a time into the statistics.
        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    logger.info('End to initalize...')

    pbar = tqdm.tqdm(total=args.total_frames)
    try:
        while True:
            logger.info('Iteration: {}'.format(global_update))
            total_state, total_reward, total_done = [], [], []
            total_action, total_policy, total_policy_np = [], [], []
            total_int_reward, total_next_obs = [], []
            total_ext_values, total_int_values = [], []
            global_step += (num_worker * num_step)
            global_update += 1

            # Step 1. n-step rollout
            for _ in range(num_step):
                actions, value_ext, value_int, policy = agent.get_action(
                    np.float32(states) / 255.)

                for parent_conn, action in zip(parent_conns, actions):
                    parent_conn.send(action)

                next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                    [], [], [], [], [], []
                for parent_conn in parent_conns:
                    s, r, d, rd, lr = parent_conn.recv()
                    next_states.append(s)
                    rewards.append(r)
                    dones.append(d)
                    real_dones.append(rd)
                    log_rewards.append(lr)
                    # Only the last frame of the stack is scored.
                    next_obs.append(s[3, :, :].reshape([1, 84, 84]))

                next_states = np.stack(next_states)
                rewards = np.hstack(rewards)
                dones = np.hstack(dones)
                real_dones = np.hstack(real_dones)
                next_obs = np.stack(next_obs)

                # Intrinsic reward on the normalized, clipped observation.
                intrinsic_reward = agent.compute_intrinsic_reward(
                    ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
                intrinsic_reward = np.hstack(intrinsic_reward)

                total_next_obs.append(next_obs)
                total_int_reward.append(intrinsic_reward)
                total_state.append(states)
                total_reward.append(rewards)
                total_done.append(dones)
                total_action.append(actions)
                total_ext_values.append(value_ext)
                total_int_values.append(value_int)
                total_policy.append(policy)
                total_policy_np.append(policy.cpu().numpy())

                states = next_states[:, :, :, :]

                sample_rall += log_rewards[sample_env_idx]
                sample_step += 1
                if real_dones[sample_env_idx]:
                    sample_episode += 1
                    writer.add_scalar('data/returns_vs_frames', sample_rall, global_step)
                    writer.add_scalar('data/lengths_vs_frames', sample_step, global_step)
                    writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                    writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                    writer.add_scalar('data/step', sample_step, sample_episode)
                    sample_rall = 0
                    sample_step = 0

            # Value of the state after the last step, used for bootstrapping.
            _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)

            # Reorder the buffers from (step, worker, ...) to (worker, step, ...).
            total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
            total_reward = np.stack(total_reward).transpose().clip(-1, 1)
            total_action = np.stack(total_action).transpose().reshape([-1])
            total_done = np.stack(total_done).transpose()
            total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
            total_ext_values = np.stack(total_ext_values).transpose()
            total_int_values = np.stack(total_int_values).transpose()
            total_logging_policy = np.vstack(total_policy_np)

            # Step 2. normalize the intrinsic reward by its running std
            total_int_reward = np.stack(total_int_reward).transpose()
            total_reward_per_env = np.array(
                [discounted_reward.update(reward_per_step)
                 for reward_per_step in total_int_reward.T])
            mean, std, count = (np.mean(total_reward_per_env),
                                np.std(total_reward_per_env),
                                len(total_reward_per_env))
            reward_rms.update_from_moments(mean, std ** 2, count)

            total_int_reward /= np.sqrt(reward_rms.var)
            writer.add_scalar('data/int_reward_per_epi',
                              np.sum(total_int_reward) / num_worker, sample_episode)
            writer.add_scalar('data/int_reward_per_rollout',
                              np.sum(total_int_reward) / num_worker, global_update)

            # Mean over the rollout of the largest action probability.
            writer.add_scalar('data/max_prob',
                              softmax(total_logging_policy).max(1).mean(), sample_episode)

            # Step 3. targets and advantages
            ext_target, ext_adv = make_train_data(
                total_reward, total_done, total_ext_values, gamma, num_step, num_worker)

            # The intrinsic stream is non-episodic: its done mask is all zeros.
            int_target, int_adv = make_train_data(
                total_int_reward, np.zeros_like(total_int_reward), total_int_values,
                int_gamma, num_step, num_worker)

            total_adv = int_adv * int_coef + ext_adv * ext_coef

            # Step 4. update the observation normalization statistics
            obs_rms.update(total_next_obs)

            # Step 5. train
            agent.train_model(
                np.float32(total_state) / 255., ext_target, int_target,
                total_action, total_adv,
                ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                total_policy)

            if args.save_models and global_update % 1000 == 0:
                # Numbered snapshot plus the rolling "latest" checkpoint files.
                torch.save(agent.model.state_dict(),
                           'models/{}-{}.model'.format(env_id, global_update))
                logger.info('Now Global Step :{}'.format(global_step))
                torch.save(agent.model.state_dict(), model_path)
                torch.save(agent.rnd.predictor.state_dict(), predictor_path)
                torch.save(agent.rnd.target.state_dict(), target_path)

            pbar.update(num_worker * num_step)
            if global_step >= args.total_frames:
                break
    finally:
        # Release the progress bar and flush TensorBoard events on every
        # exit path (normal completion, error or Ctrl-C).
        pbar.close()
        writer.close()
def main():
    """Train the actor-critic model and the RND predictor on a gym environment.

    All settings come from ``get_args()``. One ``gym`` environment is opened
    only to read the observation shape and the action count (reduced by one
    when the environment name contains ``'Breakout'``), then closed; rollouts
    come from ``args.num_worker`` ``AtariEnvironment`` workers driven over
    pipes.

    Each iteration collects ``args.num_step`` transitions per worker, scores
    the last frame of every next state with ``compute_intrinsic_reward``,
    divides the intrinsic rewards by the running std tracked in
    ``reward_rms``, builds extrinsic and intrinsic targets/advantages with
    ``make_train_data`` and calls ``train_model``. Weights are written to
    ``args.save_dir`` every ``args.save_interval`` updates. The loop has no
    exit condition; it runs until the process is interrupted.
    """
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Probe environment: used only for the observation/action sizes.
    env = gym.make(args.env_name)
    try:
        input_size = env.observation_space.shape
        output_size = env.action_space.n
    finally:
        env.close()

    if 'Breakout' in args.env_name:
        output_size -= 1

    is_render = False
    os.makedirs(args.save_dir, exist_ok=True)
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    predictor_path = os.path.join(args.save_dir, args.env_name + '.pred')
    target_path = os.path.join(args.save_dir, args.env_name + '.target')

    writer = SummaryWriter(log_dir=args.log_dir)

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    # This filter is only ever fed intrinsic rewards, so it must use the
    # intrinsic discount (the same one passed to make_train_data below).
    discounted_reward = RewardForwardFilter(args.int_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(
        list(model.parameters()) + list(rnd.predictor.parameters()),
        lr=args.lr)

    if args.load_model:
        # NOTE(review): only the actor-critic weights are restored; the RND
        # predictor/target saved next to them are not reloaded -- confirm
        # that this is intended.
        map_location = None if args.cuda else 'cpu'
        model.load_state_dict(torch.load(model_path, map_location=map_location))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(args.env_name,
                                is_render,
                                idx,
                                child_conn,
                                sticky_action=args.sticky_action,
                                p=args.sticky_action_prob,
                                max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # index of the worker whose episodes are logged
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    global_update = 0
    global_step = 0

    # Warm up the observation statistics with uniformly random actions.
    print('Initializes observation normalization...')
    next_obs = []
    for _ in range(args.num_step * args.pre_obs_norm_steps):
        actions = np.random.randint(0, output_size, size=(args.num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            next_state, _, _, _, _ = parent_conn.recv()
            next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

        # Flush one rollout-sized batch at a time into the statistics.
        if len(next_obs) % (args.num_step * args.num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []

    print('Training...')
    try:
        while True:
            total_state, total_reward, total_done = [], [], []
            total_action, total_action_probs = [], []
            total_int_reward, total_next_obs = [], []
            total_ext_values, total_int_values = [], []
            global_step += (args.num_worker * args.num_step)
            global_update += 1

            # Step 1. n-step rollout
            for _ in range(args.num_step):
                actions, value_ext, value_int, action_probs = get_action(
                    model, device, np.float32(states) / 255.)

                for parent_conn, action in zip(parent_conns, actions):
                    parent_conn.send(action)

                next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                    [], [], [], [], [], []
                for parent_conn in parent_conns:
                    next_state, reward, done, real_done, log_reward = parent_conn.recv()
                    next_states.append(next_state)
                    rewards.append(reward)
                    dones.append(done)
                    real_dones.append(real_done)
                    log_rewards.append(log_reward)
                    # Only the last frame of the stack is scored by the RND model.
                    next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

                next_states = np.stack(next_states)
                rewards = np.hstack(rewards)
                dones = np.hstack(dones)
                real_dones = np.hstack(real_dones)
                next_obs = np.stack(next_obs)

                # Intrinsic reward on the normalized, clipped observation.
                intrinsic_reward = compute_intrinsic_reward(
                    rnd, device,
                    ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
                intrinsic_reward = np.hstack(intrinsic_reward)

                total_next_obs.append(next_obs)
                total_int_reward.append(intrinsic_reward)
                total_state.append(states)
                total_reward.append(rewards)
                total_done.append(dones)
                total_action.append(actions)
                total_ext_values.append(value_ext)
                total_int_values.append(value_int)
                total_action_probs.append(action_probs)

                states = next_states[:, :, :, :]

                sample_rall += log_rewards[sample_env_index]
                sample_step += 1
                if real_dones[sample_env_index]:
                    sample_episode += 1
                    writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                    writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                    writer.add_scalar('data/step', sample_step, sample_episode)
                    sample_rall = 0
                    sample_step = 0

            # Value of the state after the last step, used for bootstrapping.
            _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)

            # Reorder the buffers from (step, worker, ...) to (worker, step, ...).
            total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
            total_reward = np.stack(total_reward).transpose().clip(-1, 1)
            total_action = np.stack(total_action).transpose().reshape([-1])
            total_done = np.stack(total_done).transpose()
            total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
            total_ext_values = np.stack(total_ext_values).transpose()
            total_int_values = np.stack(total_int_values).transpose()
            total_logging_action_probs = np.vstack(total_action_probs)

            # Step 2. normalize the intrinsic reward by its running std
            total_int_reward = np.stack(total_int_reward).transpose()
            total_reward_per_env = np.array(
                [discounted_reward.update(reward_per_step)
                 for reward_per_step in total_int_reward.T])
            mean, std, count = (np.mean(total_reward_per_env),
                                np.std(total_reward_per_env),
                                len(total_reward_per_env))
            reward_rms.update_from_moments(mean, std**2, count)

            total_int_reward /= np.sqrt(reward_rms.var)
            writer.add_scalar('data/int_reward_per_epi',
                              np.sum(total_int_reward) / args.num_worker, sample_episode)
            writer.add_scalar('data/int_reward_per_rollout',
                              np.sum(total_int_reward) / args.num_worker, global_update)

            # Mean over the rollout of the largest action probability.
            writer.add_scalar('data/max_prob',
                              total_logging_action_probs.max(1).mean(), sample_episode)

            # Step 3. targets and advantages
            ext_target, ext_adv = make_train_data(
                total_reward, total_done, total_ext_values, args.ext_gamma,
                args.gae_lambda, args.num_step, args.num_worker, args.use_gae)

            # The intrinsic stream is non-episodic: its done mask is all zeros.
            int_target, int_adv = make_train_data(
                total_int_reward, np.zeros_like(total_int_reward), total_int_values,
                args.int_gamma, args.gae_lambda, args.num_step, args.num_worker,
                args.use_gae)

            total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef

            # Step 4. update the observation normalization statistics
            obs_rms.update(total_next_obs)

            # Step 5. train
            train_model(args, device, output_size, model, rnd, optimizer,
                        np.float32(total_state) / 255., ext_target, int_target,
                        total_action, total_adv,
                        ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                        total_action_probs)

            if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
                print('Now Global Step :{}'.format(global_step))
                torch.save(model.state_dict(), model_path)
                torch.save(rnd.predictor.state_dict(), predictor_path)
                torch.save(rnd.target.state_dict(), target_path)
    finally:
        # The loop only ends through an error or Ctrl-C; flush pending
        # TensorBoard events in that case.
        writer.close()
class RolloutStorage(object):
    """Fixed-length buffer for one on-policy rollout across parallel environments.

    Tensors are laid out as ``(time, process, ...)``. ``obs``,
    ``recurrent_hidden_states``, ``value_preds``, ``returns`` and ``masks``
    carry ``num_steps + 1`` time slots (one extra slot for the bootstrap
    entry); ``rewards``, ``actions`` and ``action_log_probs`` carry
    ``num_steps``.
    """

    def __init__(self, num_steps, num_processes, obs_shape, action_space,
                 recurrent_hidden_state_size, norm_rew=False):
        """Allocate zero-filled storage (masks are filled with ones).

        Args:
            num_steps: number of transitions stored per process.
            num_processes: number of parallel environments.
            obs_shape: per-observation shape, unpacked into the ``obs`` tensor.
            action_space: an object whose class is named ``'Discrete'``
                (exposing ``n``) or anything else exposing ``shape``.
            recurrent_hidden_state_size: width of the stored hidden state.
            norm_rew: when true, ``compute_returns`` rescales rewards by the
                running standard deviation of discounted reward sums.
        """
        self.obs = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.recurrent_hidden_states = torch.zeros(
            num_steps + 1, num_processes, recurrent_hidden_state_size)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)

        self.norm_rew = norm_rew
        if self.norm_rew:
            self.ret_running_mean_std = RunningMeanStd()

        # The space is recognised by class name, not by isinstance.
        is_discrete = action_space.__class__.__name__ == 'Discrete'
        if is_discrete:
            action_shape = 1
            self.n_actions = action_space.n
        else:
            action_shape = action_space.shape[0]
            self.n_actions = None
        self.actions = torch.zeros(num_steps, num_processes, action_shape)
        if is_discrete:
            # Discrete actions are stored as integer indices.
            self.actions = self.actions.long()

        self.masks = torch.ones(num_steps + 1, num_processes, 1)

        self.num_steps = num_steps
        self.step = 0

    def to(self, device):
        """Move every storage tensor to ``device`` (rebinding the attributes)."""
        self.obs = self.obs.to(device)
        self.recurrent_hidden_states = self.recurrent_hidden_states.to(device)
        self.rewards = self.rewards.to(device)
        self.value_preds = self.value_preds.to(device)
        self.returns = self.returns.to(device)
        self.action_log_probs = self.action_log_probs.to(device)
        self.actions = self.actions.to(device)
        self.masks = self.masks.to(device)

    def insert(self, obs, recurrent_hidden_states, actions, action_log_probs,
               value_preds, rewards, masks):
        """Store one transition at the current step and advance the cursor.

        ``obs``, ``recurrent_hidden_states`` and ``masks`` are written to slot
        ``step + 1``; the remaining arguments are written to slot ``step``.
        The cursor wraps around after ``num_steps`` insertions.
        """
        self.obs[self.step + 1].copy_(obs)
        self.recurrent_hidden_states[self.step + 1].copy_(recurrent_hidden_states)
        self.actions[self.step].copy_(actions)
        self.action_log_probs[self.step].copy_(action_log_probs)
        self.value_preds[self.step].copy_(value_preds)
        self.rewards[self.step].copy_(rewards)
        self.masks[self.step + 1].copy_(masks)

        self.step = (self.step + 1) % self.num_steps

    def after_update(self):
        """Copy the last time slot into slot 0 so the next rollout continues from it."""
        self.obs[0].copy_(self.obs[-1])
        self.recurrent_hidden_states[0].copy_(self.recurrent_hidden_states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        """Fill ``self.returns`` from the stored rewards, values and masks.

        Args:
            next_value: value estimate written into the final (bootstrap) slot.
            use_gae: if true, returns are ``GAE advantage + value_pred``;
                otherwise plain discounted returns bootstrapped from
                ``next_value``.
            gamma: discount factor.
            tau: multiplier applied together with ``gamma`` in the GAE
                recursion.

        When ``norm_rew`` was enabled, ``self.rewards`` is divided in place by
        the running standard deviation before the returns are computed.
        """
        num_rewards = self.rewards.size(0)

        if self.norm_rew:
            # NOTE: Not adding the estimated value after last time step here
            r_gamma_sum = torch.zeros_like(self.returns)
            for step in reversed(range(num_rewards)):
                r_gamma_sum[step] = r_gamma_sum[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]

            # The statistics are taken over every slot of r_gamma_sum,
            # including the final slot, which is never written and stays zero.
            r_gamma_sum_flat = r_gamma_sum.view(-1)
            ret_mean = torch.mean(r_gamma_sum_flat).detach()
            ret_std = torch.std(r_gamma_sum_flat).detach()
            ret_count = r_gamma_sum_flat.shape[0]

            self.ret_running_mean_std.update_from_moments(
                ret_mean, ret_std**2, ret_count)
            self.rewards /= torch.sqrt(self.ret_running_mean_std.var)

        if use_gae:
            self.value_preds[-1] = next_value
            gae = 0
            for step in reversed(range(num_rewards)):
                delta = self.rewards[step] + gamma * self.value_preds[
                    step + 1] * self.masks[step + 1] - self.value_preds[step]
                gae = delta + gamma * tau * self.masks[step + 1] * gae
                self.returns[step] = gae + self.value_preds[step]
        else:
            self.returns[-1] = next_value
            for step in reversed(range(num_rewards)):
                self.returns[step] = self.returns[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]

    def feed_forward_generator(self, advantages, num_mini_batch):
        """Yield randomly sampled mini-batches of flattened transitions.

        The ``num_steps * num_processes`` transitions are shuffled and split
        into chunks of ``batch_size // num_mini_batch`` indices; the last
        chunk may be smaller (``drop_last=False``).

        Yields:
            A 9-tuple ``(obs, recurrent_hidden_states, actions, returns,
            masks, old_action_log_probs, advantages, None, None)``.
        """
        num_steps, num_processes = self.rewards.size()[0:2]
        batch_size = num_processes * num_steps
        assert batch_size >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "* number of steps ({}) = {} "
            "to be greater than or equal to the number of PPO mini batches ({})."
            "".format(num_processes, num_steps, num_processes * num_steps,
                      num_mini_batch))
        mini_batch_size = batch_size // num_mini_batch
        sampler = BatchSampler(SubsetRandomSampler(range(batch_size)),
                               mini_batch_size, drop_last=False)
        for indices in sampler:
            obs_batch = self.obs[:-1].view(-1, *self.obs.size()[2:])[indices]
            recurrent_hidden_states_batch = self.recurrent_hidden_states[:-1].view(
                -1, self.recurrent_hidden_states.size(-1))[indices]
            actions_batch = self.actions.view(-1, self.actions.size(-1))[indices]
            return_batch = self.returns[:-1].view(-1, 1)[indices]
            masks_batch = self.masks[:-1].view(-1, 1)[indices]
            old_action_log_probs_batch = self.action_log_probs.view(-1, 1)[indices]
            adv_targ = advantages.view(-1, 1)[indices]

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ, None, None

    def recurrent_generator(self, advantages, num_mini_batch):
        """Yield mini-batches made of whole per-environment trajectories.

        Environments are shuffled and grouped ``num_processes //
        num_mini_batch`` at a time. When ``num_processes`` is not a multiple
        of ``num_mini_batch`` the leftover environments form one extra,
        smaller batch (previously this case indexed past the end of the
        permutation and raised ``IndexError``). The yielded ``N`` always
        matches the number of environments actually in the batch.

        Yields:
            A 9-tuple ``(obs, recurrent_hidden_states, actions, returns,
            masks, old_action_log_probs, advantages, T, N)`` where the
            per-step tensors have been passed through ``_flatten_helper``.
        """
        num_processes = self.rewards.size(1)
        assert num_processes >= num_mini_batch, (
            "PPO requires the number of processes ({}) "
            "to be greater than or equal to the number of "
            "PPO mini batches ({}).".format(num_processes, num_mini_batch))
        num_envs_per_batch = num_processes // num_mini_batch
        perm = torch.randperm(num_processes)
        T = self.num_steps
        for start_ind in range(0, num_processes, num_envs_per_batch):
            # Slicing clamps at the end of the permutation, so the final
            # chunk simply holds fewer environments.
            env_ids = perm[start_ind:start_ind + num_envs_per_batch]
            N = env_ids.numel()

            # These are all tensors of size (T, N, -1)
            obs_batch = torch.stack(
                [self.obs[:-1, ind] for ind in env_ids], 1)
            actions_batch = torch.stack(
                [self.actions[:, ind] for ind in env_ids], 1)
            return_batch = torch.stack(
                [self.returns[:-1, ind] for ind in env_ids], 1)
            masks_batch = torch.stack(
                [self.masks[:-1, ind] for ind in env_ids], 1)
            old_action_log_probs_batch = torch.stack(
                [self.action_log_probs[:, ind] for ind in env_ids], 1)
            adv_targ = torch.stack(
                [advantages[:, ind] for ind in env_ids], 1)

            # States is just a (N, -1) tensor: only the hidden state at the
            # first time slot of each environment is used.
            recurrent_hidden_states_batch = torch.stack(
                [self.recurrent_hidden_states[0:1, ind] for ind in env_ids],
                1).view(N, -1)

            # Flatten the (T, N, ...) tensors to (T * N, ...)
            obs_batch = _flatten_helper(T, N, obs_batch)
            actions_batch = _flatten_helper(T, N, actions_batch)
            return_batch = _flatten_helper(T, N, return_batch)
            masks_batch = _flatten_helper(T, N, masks_batch)
            old_action_log_probs_batch = _flatten_helper(
                T, N, old_action_log_probs_batch)
            adv_targ = _flatten_helper(T, N, adv_targ)

            yield obs_batch, recurrent_hidden_states_batch, actions_batch, \
                return_batch, masks_batch, old_action_log_probs_batch, adv_targ, T, N
def main():
    """Train an RND + PPO agent on the CarIntersect environment.

    Reads the run name and torch device from the ``NAME`` and ``DEVICE``
    environment variables and all hyper-parameters from ``default_config``,
    spawns ``NumEnv`` worker processes that communicate over pipes, warms up
    the observation normaliser with random actions, and then loops forever:
    collect an ``num_step`` rollout, normalise the intrinsic reward, build
    targets/advantages, update the agent, and evaluate every 100 updates.

    Raises:
        ValueError: if ``NAME`` or ``DEVICE`` is not set in the environment.
    """
    NAME = os.environ.get('NAME')
    if NAME is None:
        raise ValueError('set NAME via env variable')
    # Fail fast (before wandb/env start-up) instead of hitting a bare KeyError
    # later, mirroring the NAME check above.
    device = os.environ.get('DEVICE')
    if device is None:
        raise ValueError('set DEVICE via env variable')

    # The environment config is tried as JSON first, then as YAML. The file
    # is closed deterministically; only decode errors trigger the fallback.
    config_path = default_config['CarIntersectConfigPath']
    with open(config_path, 'r') as config_file:
        try:
            env_settings = json.load(config_file)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            config_file.seek(0)
            # yaml.load() without an explicit Loader is rejected by
            # PyYAML >= 6; safe_load works on every PyYAML version.
            env_settings = yaml.safe_load(config_file)

    if 'home-test' not in NAME:
        wandb.init(
            project='CarRacing_RND',
            reinit=True,
            name=f'rnd_{NAME}',
            config={'env_config': env_settings, 'agent_config': default_config},
        )

    env_id = default_config['EnvID']

    # NOTE(review): the seed is only printed below; nothing in this function
    # applies it to numpy, torch or the environments.
    seed = np.random.randint(0, 2 ** 16 - 1)

    print(f'use name : {NAME}')
    print(f'use env config : {config_path}')
    print(f'use seed : {seed}')
    print(f'use device : {device}')

    # NOTE(review): the environments are created from the parent directory
    # and the process moves back into 'rnd_continues' afterwards; this
    # presumably matches relative paths inside the env settings - confirm.
    os.chdir('..')
    env = makeCarIntersect(env_settings)
    eval_env = create_eval_env(makeCarIntersect(env_settings))

    input_size = env.observation_space.shape
    assert isinstance(env.action_space, gym.spaces.Box)
    action_size = env.action_space.shape[0]

    # The probe environment is only needed for its space descriptions.
    env.close()

    is_render = False

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent(
        input_size,
        action_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        device=device,
    )

    # NOTE: checkpoint loading and saving are not performed by this entry
    # point (the corresponding code was disabled in the original).

    # One worker process per environment, each talking over its own pipe.
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(env_id, is_render, idx, child_conn,
                                sticky_action=sticky_action, p=action_prob,
                                life_done=life_done, settings=env_settings)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    os.chdir('rnd_continues')

    states = np.zeros([num_worker, 4, 84, 84])

    # Episode statistics are tracked for a single worker (index 0).
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # NOTE(review): the logger is created with use_wandb=True even when
    # wandb.init was skipped for 'home-test' runs - confirm Logger copes.
    logger = Logger(None, use_console=True, use_wandb=True, log_interval=1)

    def eval_action(eval_state):
        """Return the agent's action for one evaluation observation."""
        # The observation is tiled 4x along axis 1 and scaled to [0, 1]
        # (presumably to mimic the 4-channel training input - confirm).
        tiled = np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
        return agent.get_action(tiled)[0][0].cpu().numpy()

    # Smoke-test the evaluation path before spending time on training.
    print('Test evaluater:')
    evaluate_and_log(
        eval_env=eval_env,
        action_get_method=eval_action,
        logger=logger,
        log_animation=False,
        exp_class='RND',
        exp_name=NAME,
        debug=True,
    )
    print('end evaluater test.')

    # Warm up the observation normaliser with uniformly random actions.
    print('Start to initailize observation normalization parameter.....')
    next_obs = []
    for _ in range(num_step * pre_obs_norm_step):
        actions = np.random.uniform(-1, 1, size=(num_worker, action_size))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, _, _, _, _ = parent_conn.recv()
            # Only channel 3 of each received state feeds the normaliser.
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        # Flush into the running statistics once a rollout's worth is buffered.
        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End to initalize...')

    while True:
        total_state, total_reward, total_done, total_action = [], [], [], []
        total_int_reward, total_next_obs = [], []
        total_ext_values, total_int_values = [], []
        total_policy_log_prob = []

        # Step 1. n-step rollout
        for _ in range(num_step):
            global_step += num_worker
            actions, value_ext, value_int, policy_log_prob = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action.cpu().numpy())

            next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # Intrinsic reward is computed on normalised, clipped observations.
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions.cpu().numpy())
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy_log_prob.extend(policy_log_prob.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                logger.log_it({
                    'reward_per_episode': sample_rall,
                    'intrinsic_reward': sample_i_rall,
                    'episode_steps': sample_step,
                    'global_step_cnt': global_step,
                    'updates_cnt': global_update,
                })
                logger.publish_logs(step=global_step)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # Value estimates for the state after the last step (bootstrap).
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        # States / next observations are reordered to worker-major
        # (worker, step, ...) before flattening.
        total_state = np.stack(total_state).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        # NOTE(review): actions (and total_policy_log_prob, extended once per
        # step) are flattened WITHOUT the worker/step transpose applied to
        # total_state, so for num_worker > 1 their row order is step-major
        # while the states are worker-major. Verify against
        # agent.train_model that this pairing is intended.
        total_action = np.array(total_action).reshape((-1, action_size))
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array(
            [discounted_reward.update(reward_per_step)
             for reward_per_step in total_int_reward.T])
        mean = np.mean(total_reward_per_env)
        std = np.std(total_reward_per_env)
        count = len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        # -------------------------------------------------------------------

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculate
        # None Episodic: the done flags are replaced by zeros.
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        global_update += 1
        # Step 5. Training!
        agent.train_model(
            np.float32(total_state) / 255.,
            ext_target,
            int_target,
            total_action,
            total_adv,
            ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
            total_policy_log_prob,
        )

        # Periodic evaluation with animation logging.
        if global_update % 100 == 0:
            evaluate_and_log(
                eval_env=eval_env,
                action_get_method=eval_action,
                logger=logger,
                log_animation=True,
                exp_class='RND',
                exp_name=NAME,
            )
            logger.publish_logs(step=global_step)