class SC2Environment(environment.Environment):
    def __init__(self, env_args):
        super(SC2Environment, self).__init__()
        env = partial(make_sc2env, **env_args)
        self.conn, child_conn = Pipe()
        self.proc = Process(target=worker, args=(child_conn, CloudpickleWrapper(env)))
        self.proc.start()
        self.reset()

    @staticmethod
    def get_action_size():
        return len(FUNCTIONS)

    def reset(self):
        self.conn.send([COMMAND_RESET, None])
        return [self.conn.recv()]

    def close(self):
        self.conn.send([COMMAND_TERMINATE, None])
        self.conn.close()
        self.proc.join()
        print("SC2 environment closed")

    def step(self, actions):
        self.conn.send([COMMAND_STEP, actions])
        obs = self.conn.recv()
        return [obs], obs.reward, obs.last()
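# --- Sketch (not from the original source): the `worker` target and the COMMAND_*
# constants used by SC2Environment above are not shown. A minimal child-process loop
# consistent with that protocol might look like the following; the constant values,
# the CloudpickleWrapper `.x` attribute, and sending back a single pysc2 timestep
# are assumptions.
COMMAND_RESET = 0
COMMAND_STEP = 1
COMMAND_TERMINATE = 2

def worker(conn, env_fn_wrapper):
    env = env_fn_wrapper.x()  # unwrap the pickled partial(make_sc2env, ...)
    try:
        while True:
            command, arg = conn.recv()
            if command == COMMAND_RESET:
                conn.send(env.reset()[0])      # first timestep back to the parent
            elif command == COMMAND_STEP:
                conn.send(env.step(arg)[0])    # timestep exposes .reward and .last()
            elif command == COMMAND_TERMINATE:
                break
    finally:
        env.close()
        conn.close()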
def evaluate(args):
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    agent.load_model(load_model_remark=args.load_model_remark)

    parent_conn, child_conn = Pipe()
    worker = AtariEnvironment(args.env, 1, child_conn, is_render=True,
                              max_episode_step=args.max_episode_step)
    worker.start()

    for i_episode in range(100):
        obs = worker.reset()
        while True:
            obs = np.expand_dims(obs, axis=0)
            action = agent.choose_action(obs / 255)
            parent_conn.send(action[0])
            obs_, r, done, info = parent_conn.recv()
            obs = obs_
            if done:
                break
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = gym.make(args.env_name)
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2
    if 'Breakout' in args.env_name:
        output_size -= 1
    env.close()

    is_render = True
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    if not os.path.exists(model_path):
        print("Model file not found")
        return

    num_worker = 1
    sticky_action = False

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    model = model.to(device)
    if args.cuda:
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

    parent_conn, child_conn = Pipe()
    work = AtariEnvironment(
        args.env_name,
        is_render,
        0,
        child_conn,
        sticky_action=sticky_action,
        p=args.sticky_action_prob,
        max_episode_steps=args.max_episode_steps)
    work.start()

    # states = np.zeros([num_worker, 4, 84, 84])
    states = torch.zeros(num_worker, 4, 84, 84)

    while True:
        actions = get_action(model, device, torch.div(states, 255.))
        parent_conn.send(actions)

        next_states = []
        next_state, reward, done, real_done, log_reward = parent_conn.recv()
        next_states.append(next_state)

        states = torch.from_numpy(np.stack(next_states))
        states = states.type(torch.FloatTensor)
class DummyServer(INeuralNetworkAPI, IFlightControl):
    def __init__(self, **kwargs):
        self.handler_conn, server_conn = Pipe()
        self.handler = HandlerProcess(server_conn=server_conn, **kwargs)
        self.handler.start()

    def forward(self, batch: TikTensor) -> None:
        pass
        # self.handler_conn.send(
        #     (
        #         "forward",
        #         {"keys": [a.id for a in batch], "data": torch.stack([torch.from_numpy(a.as_numpy()) for a in batch])},
        #     )
        # )

    def active_children(self):
        self.handler_conn.send(("active_children", {}))

    def listen(self, timeout: float = 10) -> Union[None, Tuple[str, dict]]:
        if self.handler_conn.poll(timeout=timeout):
            answer = self.handler_conn.recv()
            logger.debug("got answer: %s", answer)
            return answer
        else:
            return None

    def shutdown(self):
        self.handler_conn.send(SHUTDOWN)
        got_shutdown_answer = False
        while self.handler.is_alive():
            if self.handler_conn.poll(timeout=2):
                answer = self.handler_conn.recv()
                if answer == SHUTDOWN_ANSWER:
                    got_shutdown_answer = True
        assert got_shutdown_answer
def play(self):
    parent, child = Pipe()
    if flag.ENV == "MR":
        env = montezuma_revenge_env.MontezumaRevenge(0, child, 1, 0, 18000)
    env.start()

    self.current_observation = np.zeros((4, 84, 84))
    while True:
        observation_tensor = torch.from_numpy(
            np.expand_dims(self.current_observation, 0)).float().to(self.device)
        predicted_action, value1, value2 = self.model.step(observation_tensor / 255)
        parent.send(predicted_action[0])
        self.current_observation, rew, done = parent.recv()
class OnlineVaeAlgorithmSegmented(TorchBatchRLAlgorithm): def __init__(self, vae_original, vae_segmented, vae_trainer_original, vae_trainer_segmented, *base_args, vae_save_period=1, vae_training_schedule=vae_schedules.never_train, oracle_data=False, parallel_vae_train=True, vae_min_num_steps_before_training=0, uniform_dataset=None, keep_train_segmentation_vae=False, **base_kwargs): super().__init__(*base_args, **base_kwargs) assert isinstance(self.replay_buffer, OnlineVaeRelabelingBufferSegmented) self.vae_original = vae_original self.vae_segmented = vae_segmented self.vae_trainer_original = vae_trainer_original self.vae_trainer_segmented = vae_trainer_segmented self.vae_trainer_original.model = self.vae_original self.vae_trainer_segmented.model = self.vae_segmented self.vae_save_period = vae_save_period self.vae_training_schedule = vae_training_schedule self.oracle_data = oracle_data self.parallel_vae_train = parallel_vae_train self.vae_min_num_steps_before_training = vae_min_num_steps_before_training self.uniform_dataset = uniform_dataset self._vae_training_process = None self._update_subprocess_vae_thread = None self._vae_conn_pipe = None self.keep_train_segmentation_vae = keep_train_segmentation_vae def _train(self): super()._train() self._cleanup() def _end_epoch(self, epoch): # self.check_replay_buffer() self._train_vae(epoch) gt.stamp('vae training') super()._end_epoch(epoch) def _log_stats(self, epoch): self._log_vae_stats() super()._log_stats(epoch) def to(self, device): self.vae_original.to(device) self.vae_segmented.to(device) super().to(device) def _get_snapshot(self): snapshot = super()._get_snapshot() assert 'vae' not in snapshot snapshot['vae_original'] = self.vae_original snapshot['vae_segmented'] = self.vae_segmented return snapshot """ debug code """ def check_replay_buffer(self): batch = self.replay_buffer.random_batch(self.batch_size) rewards = batch['rewards'] terminals = batch['terminals'] obs = batch['observations'] actions = batch['actions'] next_obs = batch['next_observations'] goals = batch['resampled_goals'] print("obs: ", type(obs)) print("obs shape: ", obs.shape) decoded_obs = self.eval_env._decode(obs, self.eval_env.vae_original) for idx in range(10): self.eval_env.show_obs(decoded_obs[idx], "sac policy obs") print("next_obs: ", type(next_obs)) print("next obs shape: ", next_obs.shape) decoded_next_obs = self.eval_env._decode(next_obs, self.eval_env.vae_original) for idx in range(10): self.eval_env.show_obs(decoded_next_obs[idx], "sac policy next_obs") decoded_goal = self.eval_env._decode(goals, self.eval_env.vae_segmented) for idx in range(10): self.eval_env.show_obs(decoded_goal[idx], "sac policy goal") """ VAE-specific Code """ def _train_vae(self, epoch): if self.parallel_vae_train and self._vae_training_process is None: self.init_vae_training_subprocess() should_train, amount_to_train = self.vae_training_schedule(epoch) rl_start_epoch = int(self.min_num_steps_before_training / (self.num_expl_steps_per_train_loop * self.num_train_loops_per_epoch)) print(" _train_vae called, should_train, amount_to_train", should_train, amount_to_train) if should_train or epoch <= (rl_start_epoch - 1): if self.parallel_vae_train: assert self._vae_training_process.is_alive() # Make sure the last vae update has finished before starting # another one if self._update_subprocess_vae_thread is not None: self._update_subprocess_vae_thread.join() self._update_subprocess_vae_thread = Thread( target=OnlineVaeAlgorithmSegmented. 
update_vae_in_training_subprocess, args=(self, epoch, ptu.device)) self._update_subprocess_vae_thread.start() self._vae_conn_pipe.send((amount_to_train, epoch)) else: _train_vae(self.vae_trainer_original, self.replay_buffer, epoch, amount_to_train, key='image_observation') # train segmentation vae using both oracle data and newly collected data # train using newly collected data if self.keep_train_segmentation_vae: _train_vae(self.vae_trainer_segmented, self.replay_buffer, epoch, amount_to_train // 3 * 2, key='image_observation_segmented') # train using pre-collected oracle data _train_vae(self.vae_trainer_segmented, self.replay_buffer, epoch, amount_to_train // 3, key='image_observation_segmented', oracle_data=True) self.replay_buffer.refresh_latents(epoch) _test_vae(self.vae_trainer_original, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, save_prefix='r_original_') _test_vae(self.vae_trainer_segmented, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, save_prefix='r_segmented_') def _log_vae_stats(self): logger.record_dict( self.vae_trainer_original.get_diagnostics(), prefix='vae_trainer_original/', ) logger.record_dict( self.vae_trainer_segmented.get_diagnostics(), prefix='vae_trainer_segmented/', ) def _cleanup(self): if self.parallel_vae_train: self._vae_conn_pipe.close() self._vae_training_process.terminate() def init_vae_training_subprocess(self): assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer) self._vae_conn_pipe, process_pipe = Pipe() self._vae_training_process = Process( target=subprocess_train_vae_loop, args=( process_pipe, self.vae, self.vae.state_dict(), self.replay_buffer, self.replay_buffer.get_mp_info(), ptu.device, )) self._vae_training_process.start() self._vae_conn_pipe.send(self.vae_trainer) def update_vae_in_training_subprocess(self, epoch, device): self.vae.__setstate__(self._vae_conn_pipe.recv()) self.vae.to(device) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, )
    for _ in range(num_step):
        if not is_training:
            time.sleep(0.05)
            agent.model.eval()
            agent.icm.eval()

        actions = agent.get_action(states)

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        next_states, rewards, dones, real_dones, log_rewards = [], [], [], [], []
        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_states.append(s)
            rewards.append(r)
            dones.append(d)
            real_dones.append(rd)
            log_rewards.append(lr)

        next_states = np.stack(next_states)
        rewards = np.hstack(rewards) * reward_scale
        dones = np.hstack(dones)
        real_dones = np.hstack(real_dones)

        # total reward = intrinsic reward + extrinsic reward
        intrinsic_reward = agent.compute_intrinsic_reward(states, next_states, actions)
        rewards += intrinsic_reward
def main(run_id=0, checkpoint=None, save_interval=1000): print({section: dict(config[section]) for section in config.sections()}) train_method = default_config['TrainMethod'] # Create environment env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'mario': print('Mario environment not fully implemented - thomaseh') raise NotImplementedError env = BinarySpaceToDiscreteSpaceEnv( gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) elif env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() # Load configuration parameters is_load_model = checkpoint is not None is_render = False model_path = 'models/{}_{}_run{}_model'.format(env_id, train_method, run_id) if train_method == 'RND': predictor_path = 'models/{}_{}_run{}_pred'.format(env_id, train_method, run_id) target_path = 'models/{}_{}_run{}_target'.format(env_id, train_method, run_id) elif train_method == 'generative': predictor_path = 'models/{}_{}_run{}_vae'.format(env_id, train_method, run_id) writer = SummaryWriter(comment='_{}_{}_run{}'.format(env_id, train_method, run_id)) use_cuda = default_config.getboolean('UseGPU') use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = int(default_config['NumEnv']) num_step = int(default_config['NumStep']) num_rollouts = int(default_config['NumRollouts']) num_pretrain_rollouts = int(default_config['NumPretrainRollouts']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) int_gamma = float(default_config['IntGamma']) clip_grad_norm = float(default_config['ClipGradNorm']) ext_coef = float(default_config['ExtCoef']) int_coef = float(default_config['IntCoef']) sticky_action = default_config.getboolean('StickyAction') action_prob = float(default_config['ActionProb']) life_done = default_config.getboolean('LifeDone') reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) pre_obs_norm_step = int(default_config['ObsNormStep']) discounted_reward = RewardForwardFilter(int_gamma) if train_method == 'RND': agent = RNDAgent elif train_method == 'generative': agent = GenerativeAgent else: raise NotImplementedError if default_config['EnvType'] == 'atari': env_type = AtariEnvironment elif default_config['EnvType'] == 'mario': env_type = MarioEnvironment else: raise NotImplementedError # Initialize agent agent = agent( input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net ) # Load pre-existing model if is_load_model: print('load model...') if use_cuda: agent.model.load_state_dict(torch.load(model_path)) if train_method == 'RND': agent.rnd.predictor.load_state_dict(torch.load(predictor_path)) agent.rnd.target.load_state_dict(torch.load(target_path)) elif train_method == 'generative': agent.vae.load_state_dict(torch.load(predictor_path)) else: agent.model.load_state_dict( torch.load(model_path, map_location='cpu')) if train_method == 'RND': 
agent.rnd.predictor.load_state_dict( torch.load(predictor_path, map_location='cpu')) agent.rnd.target.load_state_dict( torch.load(target_path, map_location='cpu')) elif train_method == 'generative': agent.vae.load_state_dict(torch.load(predictor_path, map_location='cpu')) print('load finished!') # Create workers to run in environments works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type( env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done, ) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84], dtype='float32') sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # Initialize observation normalizers print('Start to initialize observation normalization parameter...') next_obs = np.zeros([num_worker * num_step, 1, 84, 84]) for step in range(num_step * pre_obs_norm_step): actions = np.random.randint(0, output_size, size=(num_worker,)) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for idx, parent_conn in enumerate(parent_conns): s, r, d, rd, lr, _ = parent_conn.recv() next_obs[(step % num_step) * num_worker + idx, 0, :, :] = s[3, :, :] if (step % num_step) == num_step - 1: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = np.zeros([num_worker * num_step, 1, 84, 84]) print('End to initialize...') # Initialize stats dict stats = { 'total_reward': [], 'ep_length': [], 'num_updates': [], 'frames_seen': [], } # Main training loop while True: total_state = np.zeros([num_worker * num_step, 4, 84, 84], dtype='float32') total_next_obs = np.zeros([num_worker * num_step, 1, 84, 84]) total_reward, total_done, total_next_state, total_action, \ total_int_reward, total_ext_values, total_int_values, total_policy, \ total_policy_np = [], [], [], [], [], [], [], [], [] # Step 1. n-step rollout (collect data) for step in range(num_step): actions, value_ext, value_int, policy = agent.get_action(states/255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_obs = np.zeros([num_worker, 1, 84, 84]) next_states = np.zeros([num_worker, 4, 84, 84]) rewards, dones, real_dones, log_rewards = [], [], [], [] for idx, parent_conn in enumerate(parent_conns): s, r, d, rd, lr, stat = parent_conn.recv() next_states[idx] = s rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_obs[idx, 0] = s[3, :, :] total_next_obs[idx * num_step + step, 0] = s[3, :, :] if rd: stats['total_reward'].append(stat[0]) stats['ep_length'].append(stat[1]) stats['num_updates'].append(global_update) stats['frames_seen'].append(global_step) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) # Compute total reward = intrinsic reward + external reward next_obs -= obs_rms.mean next_obs /= np.sqrt(obs_rms.var) next_obs.clip(-5, 5, out=next_obs) intrinsic_reward = agent.compute_intrinsic_reward(next_obs) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] for idx, state in enumerate(states): total_state[idx * num_step + step] = state total_int_reward.append(intrinsic_reward) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_policy.append(policy) total_policy_np.append(policy.cpu().numpy()) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_policy = np.vstack(total_policy_np) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T]) mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std ** 2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, gamma, num_step, num_worker) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, int_gamma, num_step, num_worker) # add ext adv and int adv total_adv = int_adv * int_coef + ext_adv * ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! total_state /= 255. total_next_obs -= obs_rms.mean total_next_obs /= np.sqrt(obs_rms.var) total_next_obs.clip(-5, 5, out=total_next_obs) agent.train_model(total_state, ext_target, int_target, total_action, total_adv, total_next_obs, total_policy) global_step += (num_worker * num_step) global_update += 1 if global_update % save_interval == 0: print('Saving model at global step={}, num rollouts={}.'.format( global_step, global_update)) torch.save(agent.model.state_dict(), model_path + "_{}.pt".format(global_update)) if train_method == 'RND': torch.save(agent.rnd.predictor.state_dict(), predictor_path + '_{}.pt'.format(global_update)) torch.save(agent.rnd.target.state_dict(), target_path + '_{}.pt'.format(global_update)) elif train_method == 'generative': torch.save(agent.vae.state_dict(), predictor_path + '_{}.pt'.format(global_update)) # Save stats to pickle file with open('models/{}_{}_run{}_stats_{}.pkl'.format(env_id, train_method, run_id, global_update),'wb') as f: pickle.dump(stats, f) if global_update == num_rollouts + num_pretrain_rollouts: print('Finished Training.') break
    work.start()
    works.append(work)
    child_conns.append(child_conn)
    parent_conns.append(parent_conn)

steps = 0
next_obs = []
print('Start to initialize observation normalization ...')
while steps < pre_obs_norm_step:
    steps += num_worker
    actions = np.random.randint(0, output_size, size=(num_worker, ))

    for parent_conn, action in zip(parent_conns, actions):
        parent_conn.send(action)

    for parent_conn in parent_conns:
        s, r, d = parent_conn.recv()
        next_obs.append(s)

    print('initializing...:', steps, '/', pre_obs_norm_step)

next_obs = np.stack(next_obs)
obs_rms.update(next_obs)
print('End to initialize')

states = np.zeros([num_worker, 2])

global_update = 0
global_step = 0
sample_i_rall = 0
sample_episode = 0
sample_env_idx = 0
sample_rall = 0
def main(): if 'NAME' in os.environ.keys(): NAME = os.environ['NAME'] else: raise ValueError('set NAME via env variable') try: env_settings = json.load(open(default_config['CarIntersectConfigPath'], 'r')) except: env_settings = yaml.load(open(default_config['CarIntersectConfigPath'], 'r')) if 'home-test' not in NAME: wandb.init( project='CarRacing_RND', reinit=True, name=f'rnd_{NAME}', config={'env_config': env_settings, 'agent_config': default_config}, ) # print({section: dict(config[section]) for section in config.sections()}) train_method = default_config['TrainMethod'] env_id = default_config['EnvID'] # env_type = default_config['EnvType'] # if env_type == 'mario': # env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) # elif env_type == 'atari': # env = gym.make(env_id) # else: # raise NotImplementedError seed = np.random.randint(0, 2 ** 16 - 1) print(f'use name : {NAME}') print(f"use env config : {default_config['CarIntersectConfigPath']}") print(f'use seed : {seed}') print(f"use device : {os.environ['DEVICE']}") os.chdir('..') env = makeCarIntersect(env_settings) eval_env = create_eval_env(makeCarIntersect(env_settings)) # input_size = env.observation_space.shape # 4 input_size = env.observation_space.shape assert isinstance(env.action_space, gym.spaces.Box) action_size = env.action_space.shape[0] # 2 env.close() is_load_model = True is_render = False # model_path = 'models/{}.model'.format(NAME) # predictor_path = 'models/{}.pred'.format(NAME) # target_path = 'models/{}.target'.format(NAME) # writer = SummaryWriter() use_cuda = default_config.getboolean('UseGPU') use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = int(default_config['NumEnv']) num_step = int(default_config['NumStep']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) int_gamma = float(default_config['IntGamma']) clip_grad_norm = float(default_config['ClipGradNorm']) ext_coef = float(default_config['ExtCoef']) int_coef = float(default_config['IntCoef']) sticky_action = default_config.getboolean('StickyAction') action_prob = float(default_config['ActionProb']) life_done = default_config.getboolean('LifeDone') reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) pre_obs_norm_step = int(default_config['ObsNormStep']) discounted_reward = RewardForwardFilter(int_gamma) agent = RNDAgent( input_size, action_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net, device=os.environ['DEVICE'], ) # if is_load_model: # print('load model...') # if use_cuda: # agent.model.load_state_dict(torch.load(model_path)) # agent.rnd.predictor.load_state_dict(torch.load(predictor_path)) # agent.rnd.target.load_state_dict(torch.load(target_path)) # else: # agent.model.load_state_dict(torch.load(model_path, map_location='cpu')) # agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu')) # agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu')) # print('load finished!') works = [] 
parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = AtariEnvironment(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done, settings=env_settings) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) os.chdir('rnd_continues') states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 logger = Logger(None, use_console=True, use_wandb=True, log_interval=1) print('Test evaluater:') evaluate_and_log( eval_env=eval_env, action_get_method=lambda eval_state: agent.get_action( np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255. )[0][0].cpu().numpy(), logger=logger, log_animation=False, exp_class='RND', exp_name=NAME, debug=True, ) print('end evaluater test.') # normalize obs print('Start to initailize observation normalization parameter.....') # print('ALERT! pass section') # assert 'home-test' in NAME next_obs = [] for step in range(num_step * pre_obs_norm_step): actions = np.random.uniform(-1, 1, size=(num_worker, action_size)) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_obs.append(s[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (num_step * num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] print('End to initalize...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy_log_prob, total_policy_log_prob_np = \ [], [], [], [], [], [], [], [], [], [], [] # Step 1. n-step rollout for _ in range(num_step): global_step += num_worker # actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.) actions, value_ext, value_int, policy_log_prob = agent.get_action(np.float32(states) / 255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action.cpu().numpy()) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_obs.append(s[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = agent.compute_intrinsic_reward( ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions.cpu().numpy()) total_ext_values.append(value_ext) total_int_values.append(value_int) # total_policy.append(policy) # total_policy_np.append(policy.cpu().numpy()) total_policy_log_prob.extend(policy_log_prob.cpu().numpy()) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 # writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) # writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) # writer.add_scalar('data/step', sample_step, sample_episode) logger.log_it({ 'reward_per_episode': sample_rall, 'intrinsic_reward': sample_i_rall, 'episode_steps': sample_step, 'global_step_cnt': global_step, 'updates_cnt': global_update, }) logger.publish_logs(step=global_step) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) # total_action = np.stack(total_action).transpose().reshape([-1, action_size]) total_action = np.array(total_action).reshape((-1, action_size)) # total_log_prob_old = np.array(total_policy_log_prob).reshape((-1)) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() # total_logging_policy = np.vstack(total_policy_np) # Step 2. 
calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T]) mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std ** 2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) # writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) # writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability # writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, gamma, num_step, num_worker) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, int_gamma, num_step, num_worker) # add ext adv and int adv total_adv = int_adv * int_coef + ext_adv * ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- global_update += 1 # Step 5. Training! agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_policy_log_prob) # if global_step % (num_worker * num_step * 100) == 0: # print('Now Global Step :{}'.format(global_step)) # torch.save(agent.model.state_dict(), model_path) # torch.save(agent.rnd.predictor.state_dict(), predictor_path) # torch.save(agent.rnd.target.state_dict(), target_path) if global_update % 100 == 0: evaluate_and_log( eval_env=eval_env, action_get_method=lambda eval_state: agent.get_action( np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255. )[0][0].cpu().numpy(), logger=logger, log_animation=True, exp_class='RND', exp_name=NAME, ) logger.publish_logs(step=global_step)
def main(): args = parse_arguments() train_method = args.train_method env_id = args.env_id env_type = args.env_type if env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 env.close() is_load_model = False is_render = False os.makedirs('models', exist_ok=True) model_path = 'models/{}.model'.format(env_id) predictor_path = 'models/{}.pred'.format(env_id) target_path = 'models/{}.target'.format(env_id) results_dir = os.path.join('outputs', args.env_id) os.makedirs(results_dir, exist_ok=True) logger = Logger(results_dir) writer = SummaryWriter( os.path.join(results_dir, 'tensorboard', args.env_id)) use_cuda = args.use_gpu use_gae = args.use_gae use_noisy_net = args.use_noisynet lam = args.lam num_worker = args.num_env num_step = args.num_step ppo_eps = args.ppo_eps epoch = args.epoch mini_batch = args.minibatch batch_size = int(num_step * num_worker / mini_batch) learning_rate = args.learning_rate entropy_coef = args.entropy gamma = args.gamma int_gamma = args.int_gamma clip_grad_norm = args.clip_grad_norm ext_coef = args.ext_coef int_coef = args.int_coef sticky_action = args.sticky_action action_prob = args.action_prob life_done = args.life_done pre_obs_norm_step = args.obs_norm_step reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) discounted_reward = RewardForwardFilter(int_gamma) agent = RNDAgent if args.env_type == 'atari': env_type = AtariEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net) logger.info('Start to initialize workers') works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize obs logger.info('Start to initailize observation normalization parameter.....') next_obs = [] for step in range(num_step * pre_obs_norm_step): actions = np.random.randint(0, output_size, size=(num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr, nr = parent_conn.recv() next_obs.append(s[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (num_step * num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] logger.info('End to initalize...') while True: logger.info('Iteration: {}'.format(global_update)) ##################################################################################################### total_state, total_reward, total_done, total_next_state, \ total_action, total_int_reward, total_next_obs, total_ext_values, \ total_int_values, total_policy, total_policy_np, total_num_rooms = \ [], [], [], [], [], [], [], [], [], [], [], [] ##################################################################################################### global_step += (num_worker * num_step) global_update += 1 # Step 1. 
n-step rollout for _ in range(num_step): actions, value_ext, value_int, policy = agent.get_action( np.float32(states) / 255.) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) ################################################################################################# next_states, rewards, dones, real_dones, log_rewards, next_obs, num_rooms = \ [], [], [], [], [], [], [] ################################################################################################# for parent_conn in parent_conns: s, r, d, rd, lr, nr = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) ############################################################################################# num_rooms.append(nr) ############################################################################################# next_obs.append(s[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) ################################################################################################# num_rooms = np.hstack(num_rooms) ################################################################################################# # total reward = int reward + ext Reward intrinsic_reward = agent.compute_intrinsic_reward( ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_policy.append(policy) total_policy_np.append(policy.cpu().numpy()) ##################################################################################################### total_num_rooms.append(num_rooms) ##################################################################################################### states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/returns_vs_frames', sample_rall, global_step) writer.add_scalar('data/lengths_vs_frames', sample_step, global_step) writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = agent.get_action( np.float32(states) / 255.) 
total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose( [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_policy = np.vstack(total_policy_np) ##################################################################################################### total_num_rooms = np.stack(total_num_rooms).transpose().reshape(-1) total_done_cal = total_done.reshape(-1) if np.any(total_done_cal): avg_num_rooms = np.mean(total_num_rooms[total_done_cal]) else: avg_num_rooms = 0 ##################################################################################################### # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) ##################################################################################################### writer.add_scalar('data/avg_num_rooms_per_iteration', avg_num_rooms, global_update) writer.add_scalar('data/avg_num_rooms_per_step', avg_num_rooms, global_step) ##################################################################################################### # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, gamma, num_step, num_worker) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, int_gamma, num_step, num_worker) # add ext adv and int adv total_adv = int_adv * int_coef + ext_adv * ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! agent.train_model( np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip( -5, 5), total_policy) if global_update % 1000 == 0: torch.save(agent.model.state_dict(), 'models/{}-{}.model'.format(env_id, global_update)) logger.info('Now Global Step :{}'.format(global_step)) torch.save(agent.model.state_dict(), model_path) torch.save(agent.rnd.predictor.state_dict(), predictor_path) torch.save(agent.rnd.target.state_dict(), target_path)
global_step = 0
recent_prob = deque(maxlen=10)

while True:
    total_state, total_reward, total_done, total_next_state, total_action = [], [], [], [], []
    global_step += (num_worker * num_step)

    for _ in range(num_step):
        actions = agent.get_action(states)

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        next_states, rewards, dones, real_dones = [], [], [], []
        for parent_conn in parent_conns:
            s, r, d, rd = parent_conn.recv()
            next_states.append(s)
            rewards.append(r)
            dones.append(d)
            real_dones.append(rd)

        next_states = np.stack(next_states)
        rewards = np.hstack(rewards)
        dones = np.hstack(dones)
        real_dones = np.hstack(real_dones)

        total_state.append(states)
        total_next_state.append(next_states)
        total_reward.append(rewards)
        total_done.append(dones)
        total_action.append(actions)
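# --- Sketch (not from the original source): the rollout loops in this collection all
# assume an environment subprocess on the other end of each Pipe. Below is a minimal
# worker matching the (state, reward, done, real_done) tuple received above; the class
# name, constructor arguments, and the classic gym step API are assumptions, and other
# snippets here expect longer tuples (log reward, stats, ...).
from multiprocessing import Process
import gym

class EnvWorker(Process):
    def __init__(self, env_id, child_conn):
        super().__init__()
        self.env_id = env_id
        self.child_conn = child_conn

    def run(self):
        env = gym.make(self.env_id)
        state = env.reset()
        while True:
            action = self.child_conn.recv()            # block until the trainer sends an action
            state, reward, done, info = env.step(action)
            real_done = done                           # could differ, e.g. life loss vs. game over
            if done:
                state = env.reset()
            self.child_conn.send((state, reward, done, real_done))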
def sim_games(N_games, N_MCTS, model, number_of_processes, v_resign,
              model2=None, duel=False, batch_size=8, board_size=9):
    #### Function for generating games
    print("Starting sim games")
    process_workers = []
    torch.multiprocessing.set_start_method('spawn', force=True)

    # Make queues for sending data
    gpu_Q = Queue()
    if (duel == False):
        data_Q = Queue()
        # Also make pipe for receiving v_resign
        conn_rec, conn_send = Pipe(False)
        p_data = Process(target=data_handler, args=(data_Q, N_games, conn_send))
        process_workers.append(p_data)
    else:
        winner_Q = Queue()
        gpu_Q2 = Queue()
        process_workers.append(Process(target=gpu_worker, args=(gpu_Q2, batch_size, board_size, model2)))

    # Make counter and lock
    game_counter = Value('i', 0)
    lock = Lock()

    # Make process for gpu worker and data_loader
    process_workers.append(Process(target=gpu_worker, args=(gpu_Q, batch_size, board_size, model)))

    # Start gpu and data_loader worker
    print("GPU processes")
    for p in process_workers:
        p.start()

    # Construct tasks for workers
    procs = []
    torch.multiprocessing.set_start_method('fork', force=True)
    print("defining worker processes")
    for i in range(number_of_processes):
        seed = np.random.randint(int(2**31))
        if (duel == True):
            procs.append(Process(target=sim_duel_game_worker,
                                 args=(gpu_Q, gpu_Q2, N_MCTS, winner_Q, N_games, lock, game_counter, seed)))
        else:
            procs.append(Process(target=sim_game_worker,
                                 args=(gpu_Q, N_MCTS, data_Q, v_resign, N_games, lock, game_counter, seed)))

    print("Starting worker processes")
    # Begin running games
    for p in procs:
        p.start()

    # Join processes
    if (duel == False):
        # Receive new v_resign
        v_resign = conn_rec.recv()
    else:
        player1_wins = 0
        player2_wins = 0
        for i in range(N_games):
            player1_won = winner_Q.get(True)
            if (player1_won == 1):
                player1_wins += 1
            else:
                player2_wins += 1

    for p in procs:
        p.join()

    # Close processes
    for p in process_workers:
        p.terminate()

    # Returns v_resign if training else winrate when dueling
    if (duel == False):
        return v_resign
    else:
        return player1_wins, player2_wins
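# --- Sketch (not from the original source): the gpu_worker target used above, and
# queried from the MCTS snippet further below via gpu_Q.put([S, conn_send]) followed
# by P, v = conn_rec.recv(), is not shown. A minimal batching evaluator consistent
# with that request/reply pattern; the model's (policy, value) output signature and
# the batching strategy are assumptions.
import queue
import numpy as np
import torch

def gpu_worker(gpu_Q, batch_size, board_size, model):
    model.eval()
    while True:
        requests = [gpu_Q.get()]                       # block for the first request
        while len(requests) < batch_size:
            try:
                requests.append(gpu_Q.get_nowait())    # opportunistically fill the batch
            except queue.Empty:
                break
        states = torch.from_numpy(np.stack([s for s, _ in requests])).float()
        with torch.no_grad():
            P_batch, v_batch = model(states)           # assumed: move priors and a value per state
        for (_, conn), P, v in zip(requests, P_batch, v_batch):
            conn.send((P.cpu().numpy(), float(v)))     # reply on the requester's Pipe end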
class OnlineVaeOffpolicyAlgorithm(TorchBatchRLAlgorithm): def __init__(self, vae, vae_trainer, *base_args, vae_save_period=1, vae_training_schedule=vae_schedules.never_train, oracle_data=False, parallel_vae_train=True, vae_min_num_steps_before_training=0, uniform_dataset=None, dataset_path=None, rl_offpolicy_num_training_steps=0, **base_kwargs): super().__init__(*base_args, **base_kwargs) assert isinstance(self.replay_buffer, OnlineVaeRelabelingBuffer) self.vae = vae self.vae_trainer = vae_trainer self.vae_trainer.model = self.vae self.vae_save_period = vae_save_period self.vae_training_schedule = vae_training_schedule self.oracle_data = oracle_data self.parallel_vae_train = parallel_vae_train self.vae_min_num_steps_before_training = vae_min_num_steps_before_training self.uniform_dataset = uniform_dataset self._vae_training_process = None self._update_subprocess_vae_thread = None self._vae_conn_pipe = None self.dataset_path = dataset_path if self.dataset_path: self.load_dataset(dataset_path) # train Q and policy rl_offpolicy_num_training_steps times self.rl_offpolicy_num_training_steps = rl_offpolicy_num_training_steps def pretrain(self): for _ in range(self.rl_offpolicy_num_training_steps): train_data = self.replay_buffer.random_batch(self.batch_size) self.trainer.train(train_data) def load_dataset(self, dataset_path): dataset = load_local_or_remote_file(dataset_path) dataset = dataset.item() observations = dataset['observations'] actions = dataset['actions'] # dataset['observations'].shape # (2000, 50, 6912) # dataset['actions'].shape # (2000, 50, 2) # dataset['env'].shape # (2000, 6912) N, H, imlength = observations.shape self.vae.eval() for n in range(N): x0 = ptu.from_numpy(dataset['env'][n:n + 1, :] / 255.0) x = ptu.from_numpy(observations[n, :, :] / 255.0) latents = self.vae.encode(x, x0, distrib=False) r1, r2 = self.vae.latent_sizes conditioning = latents[0, r1:] goal = torch.cat( [ptu.randn(self.vae.latent_sizes[0]), conditioning]) goal = ptu.get_numpy(goal) # latents[-1, :] latents = ptu.get_numpy(latents) latent_delta = latents - goal distances = np.zeros((H - 1, 1)) for i in range(H - 1): distances[i, 0] = np.linalg.norm(latent_delta[i + 1, :]) terminals = np.zeros((H - 1, 1)) # terminals[-1, 0] = 1 path = dict( observations=[], actions=actions[n, :H - 1, :], next_observations=[], rewards=-distances, terminals=terminals, ) for t in range(H - 1): # reward = -np.linalg.norm(latent_delta[i, :]) obs = dict( latent_observation=latents[t, :], latent_achieved_goal=latents[t, :], latent_desired_goal=goal, ) next_obs = dict( latent_observation=latents[t + 1, :], latent_achieved_goal=latents[t + 1, :], latent_desired_goal=goal, ) path['observations'].append(obs) path['next_observations'].append(next_obs) # import ipdb; ipdb.set_trace() self.replay_buffer.add_path(path) def _end_epoch(self): timer.start_timer('vae training') self._train_vae(self.epoch) timer.stop_timer('vae training') super()._end_epoch() def _get_diagnostics(self): vae_log = self._get_vae_diagnostics().copy() vae_log.update(super()._get_diagnostics()) return vae_log def to(self, device): self.vae.to(device) super().to(device) """ VAE-specific Code """ def _train_vae(self, epoch): if self.parallel_vae_train and self._vae_training_process is None: self.init_vae_training_subprocess() should_train, amount_to_train = self.vae_training_schedule(epoch) rl_start_epoch = int(self.min_num_steps_before_training / (self.num_expl_steps_per_train_loop * self.num_train_loops_per_epoch)) if should_train: # or epoch <= (rl_start_epoch - 
1): if self.parallel_vae_train: assert self._vae_training_process.is_alive() # Make sure the last vae update has finished before starting # another one if self._update_subprocess_vae_thread is not None: self._update_subprocess_vae_thread.join() self._update_subprocess_vae_thread = Thread( target=OnlineVaeAlgorithm. update_vae_in_training_subprocess, args=(self, epoch, ptu.device)) self._update_subprocess_vae_thread.start() self._vae_conn_pipe.send((amount_to_train, epoch)) else: _train_vae(self.vae_trainer, epoch, self.replay_buffer, amount_to_train) self.replay_buffer.refresh_latents(epoch) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, ) def _get_vae_diagnostics(self): return add_prefix( self.vae_trainer.get_diagnostics(), prefix='vae_trainer/', ) def _cleanup(self): if self.parallel_vae_train: self._vae_conn_pipe.close() self._vae_training_process.terminate() def init_vae_training_subprocess(self): assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer) self._vae_conn_pipe, process_pipe = Pipe() self._vae_training_process = Process( target=subprocess_train_vae_loop, args=( process_pipe, self.vae, self.vae.state_dict(), self.replay_buffer, self.replay_buffer.get_mp_info(), ptu.device, )) self._vae_training_process.start() self._vae_conn_pipe.send(self.vae_trainer) def update_vae_in_training_subprocess(self, epoch, device): self.vae.__setstate__(self._vae_conn_pipe.recv()) self.vae.to(device) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, )
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    seed = np.random.randint(0, 100)

    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed, retro=True,
                           config={'total-floors': 12}, greyscale=True, timeout_wait=300)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()  # log_dir=args.log_dir)

    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr)

    if args.load_model:
        print("Loading model...")
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0   # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # normalize observation
        print('Initializing observation normalization...')
        next_obs = []
        for step in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            for parent_conn in parent_conns:
                next_state, reward, done, realdone, log_reward = parent_conn.recv()
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []

        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_rms, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_rms, f)

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_next_obs, total_ext_values, total_int_values, \
            total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(model, device, np.float32(states) / 255.)
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: next_state, reward, done, real_done, log_reward = parent_conn.recv() next_states.append(next_state) rewards.append(reward) dones.append(done) real_dones.append(real_done) log_rewards.append(log_reward) next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = compute_intrinsic_reward(rnd, device, ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_index] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_action_probs.append(action_probs) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_index] sample_step += 1 if real_dones[sample_env_index]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_action_probs = np.vstack(total_action_probs) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T]) mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std ** 2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, args.ext_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, args.int_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # add ext adv and int adv total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! train_model(args, device, output_size, model, rnd, optimizer, np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs) if global_step % (args.num_worker * args.num_step * args.save_interval) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(model.state_dict(), model_path) torch.save(rnd.predictor.state_dict(), predictor_path) torch.save(rnd.target.state_dict(), target_path) """ checkpoint_list = np.array([int(re.search(r"\d+(\.\d+)?", x)[0]) for x in glob.glob(os.path.join('trained_models', args.env_name+'*.model'))]) if len(checkpoint_list) == 0: last_checkpoint = -1 else: last_checkpoint = checkpoint_list.max() next_checkpoint = last_checkpoint + 1 print("Latest Checkpoint is #{}, saving checkpoint is #{}.".format(last_checkpoint, next_checkpoint)) incre_model_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.model') incre_predictor_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.pred') incre_target_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.target') with open(incre_model_path, 'wb') as f: torch.save(model.state_dict(), f) with open(incre_predictor_path, 'wb') as f: torch.save(rnd.predictor.state_dict(), f) with open(incre_target_path, 'wb') as f: torch.save(rnd.target.state_dict(), f) """ if args.terminate and (global_step > args.terminate_steps): with open('reward_rms.pkl', 'wb') as f: dill.dump(reward_rms, f) with open('obs_rms.pkl', 'wb') as f: dill.dump(obs_rms, f) break
    works.append(work)
    parent_conns.append(parent_conn)
    child_conns.append(child_conn)

states = np.zeros([num_worker * num_worker_per_env, input_size])

while True:
    total_state, total_reward, total_done, total_next_state, total_action = [], [], [], [], []
    for _ in range(num_step):
        actions = agent.get_action(states)

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        rewards, dones, next_states = [], [], []
        for parent_conn in parent_conns:
            s, r, d, _ = parent_conn.recv()
            next_states.append(s)
            rewards.append(r)
            dones.append(d)

        next_states = np.vstack(next_states)
        rewards = np.hstack(rewards)
        dones = np.hstack(dones)

        total_next_state.append(next_states)
        total_state.append(states)
        total_reward.append(rewards)
        total_done.append(dones)
        total_action.append(actions)

        states = next_states[:, :]
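The environment side of these Pipe connections is not shown in this fragment. A minimal worker process that would satisfy the send/recv protocol used above might look like the following; the class name and environment are illustrative placeholders, not the actual worker used here.

import gym
import numpy as np
from multiprocessing import Process, Pipe

class EnvWorker(Process):
    # Hypothetical worker: steps one gym env and answers action messages over a Pipe.
    def __init__(self, env_name, child_conn):
        super().__init__()
        self.daemon = True
        self.env_name = env_name
        self.child_conn = child_conn

    def run(self):
        env = gym.make(self.env_name)
        state = env.reset()
        while True:
            action = self.child_conn.recv()            # block until the trainer sends an action
            state, reward, done, info = env.step(action)
            if done:
                state = env.reset()
            self.child_conn.send((state, reward, done, info))

if __name__ == '__main__':
    parent_conn, child_conn = Pipe()
    worker = EnvWorker('CartPole-v1', child_conn)
    worker.start()
    parent_conn.send(np.random.randint(2))             # send a random action
    s, r, d, _ = parent_conn.recv()                    # matches the 4-tuple unpacked above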
def MCTS(root_node, gpu_Q ,N, go_board, color, number_passes): start_color = color turn_switcher = {"black": "white", "white": "black"} # Switch for adding calculated v relative to black relative_value = {"black": 1, "white": -1} # Define pipe for GPU process conn_rec, conn_send = Pipe(False) # Define variables to be used legal_board = np.empty((82), dtype=float) for i in range(N): #print(i) current_path = deque([]) current_node = root_node color = start_color while True: current_node.N_total += 1 # Choose action current_node.U = (5*np.sqrt(current_node.N_total))*current_node.P*current_node.N_inv # Depending on current color Q_values are multiplied by -1 if (color=="black"): a_chosen = np.argmax(current_node.U+current_node.illigal_board+current_node.Q) else: a_chosen = np.argmax(current_node.U+current_node.illigal_board-current_node.Q) # Add action and node to path and change color # Add current node to path current_path.append((current_node, a_chosen)) if (current_node.N[a_chosen]!=0): # Case where edge is already explored #print("going down explored edge") # Increment visit count and current_node.N[a_chosen] += 1 current_node.N_inv[a_chosen] = 1/(1+current_node.N[a_chosen]) # Left over code from virtual loss (not important) current_node.Q[a_chosen] = current_node.W[a_chosen]/current_node.N[a_chosen] # Case of already explored game end if (a_chosen==81): game_done = False if (number_passes==1) & (len(current_path)==1): # Case where last action in game was a pass game_done = True else: # Normal rule of each game done after two passes in a row try: game_done = (81==current_path[-1][1]==current_path[-2][1]) except: game_done = False # Count backwards if game has ended if game_done: v = current_node.W[81]/(current_node.N[81]-1) for node, action in current_path: node.W[action] += v node.Q[action] = node.W[action]/node.N[action] break # Update current node, color of turn, and repeat current_node = current_node.action_edges[a_chosen] # Switch collor color = turn_switcher[color] continue else: # Case where edge is not explored # Update visit count of action current_node.N[a_chosen] = 1 current_node.N_inv[a_chosen] = 1/(1+current_node.N[a_chosen]) new_go_state = current_node.go_state.copy_game() # First check if game is done # Simulate action if (a_chosen==81): # Calculate if game has ended game_done = False if (number_passes==1) & (len(current_path)==1): # Case where last action in game was a pass game_done = True else: # Normal rule of each game done after two passes in a row try: game_done = (81==current_path[-1][1]==current_path[-2][1]) except: game_done = False # Count backwards if game has ended if game_done: # Compute who won counted_points = new_go_state.count_points() v = (counted_points>0)-int(counted_points<0) for node, action in current_path: node.W[action] += v node.Q[action] = node.W[action]/node.N[action] break # Take pass move new_go_state.move('pass', color) else: new_go_state.move(np.unravel_index(a_chosen, (9,9)), color) # Get state color = turn_switcher[color] S = new_go_state.get_state(color) # Rotate and reflect state randomly S, rotation, reflection = rotate_S(S) # Get policy and value gpu_Q.put([S, conn_send]) # Construct legal and illigal board in the mean time legal_board[0:81] = np.ndarray.flatten(new_go_state.get_legal_board(color)) legal_board[81] = 1 illegal_board = (legal_board-1)*1000 # Receive P, v P, v = conn_rec.recv() v = relative_value[color]*v # Reverse rotation of P P = reverse_rotate(P, rotation, reflection) # Rescale P based on legal moves P = 
                P = np.multiply(P, legal_board)
                P = P / np.sum(P)
                # Generate new node
                new_node = state_node(new_go_state, P, color)
                # Large negative penalty to stop the search from choosing illegal moves
                new_node.illigal_board = illegal_board
                # Add new node to tree
                current_node.action_edges[a_chosen] = new_node
                # Now back up
                for node, action in current_path:
                    node.W[action] += v
                    node.Q[action] = node.W[action] / node.N[action]
                # Normally we would update the visit count N as well,
                # but since virtual loss is not used, we can instead do it
                # at the start of the visit
                break
    return root_node
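The MCTS above exploits the board's dihedral symmetry through rotate_S and reverse_rotate, which are defined elsewhere. A plausible sketch of that pair for a stack of 9x9 planes and an 82-dim policy (81 board moves plus pass) is shown below; it is an assumption for illustration, not the original implementation.

import numpy as np

def rotate_S(S):
    # Randomly rotate/reflect a (C, 9, 9) state stack; return the transform used.
    rotation = np.random.randint(4)        # number of 90-degree rotations
    reflection = np.random.randint(2)      # whether to mirror the columns first
    out = S
    if reflection:
        out = np.flip(out, axis=2)
    out = np.rot90(out, k=rotation, axes=(1, 2))
    return out.copy(), rotation, reflection

def reverse_rotate(P, rotation, reflection):
    # Undo the transform on an 82-dim policy (last entry is the pass move).
    board = P[:81].reshape(9, 9)
    board = np.rot90(board, k=-rotation)   # invert the rotation first
    if reflection:
        board = np.flip(board, axis=1)     # then invert the reflection
    out = np.empty(82, dtype=P.dtype)
    out[:81] = board.reshape(-1)
    out[81] = P[81]
    return out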
def train(args): torch.multiprocessing.set_start_method('forkserver') num_envs = args.num_envs num_workers = args.num_workers total_envs = num_workers * num_envs game_name = args.env_name max_train_steps = args.max_train_steps n_steps = args.n_steps init_lr = args.lr gamma = args.gamma clip_grad_norm = args.clip_grad_norm num_action = gym.make(game_name).action_space.n image_size = 84 n_stack = 4 model = paac_ff(min_act=num_action).cuda() x = Variable(torch.zeros(total_envs, n_stack, image_size, image_size), volatile=True).cuda() xs = [ Variable(torch.zeros(total_envs, n_stack, image_size, image_size)).cuda() for i in range(n_steps) ] share_reward = [ Variable(torch.zeros(total_envs)).cuda() for _ in range(n_steps) ] share_mask = [ Variable(torch.zeros(total_envs)).cuda() for _ in range(n_steps) ] constant_one = torch.ones(total_envs).cuda() optimizer = optim.Adam(model.parameters(), lr=init_lr) workers = [] parent_conns = [] child_conns = [] for i in range(num_workers): parent_conn, child_conn = Pipe() w = worker(i, num_envs, game_name, n_stack, child_conn, args) w.start() workers.append(w) parent_conns.append(parent_conn) child_conns.append(child_conn) new_s = np.zeros((total_envs, n_stack, image_size, image_size)) for global_step in range(1, max_train_steps + 1): cache_v_series = [] entropies = [] sampled_log_probs = [] for step in range(n_steps): xs[step].data.copy_(torch.from_numpy(new_s)) v, pi = model(xs[step]) cache_v_series.append(v) sampling_action = pi.data.multinomial(1) log_pi = (pi + 1e-12).log() entropy = -(log_pi * pi).sum(1) sampled_log_prob = log_pi.gather( 1, Variable(sampling_action)).squeeze() sampled_log_probs.append(sampled_log_prob) entropies.append(entropy) send_action = sampling_action.squeeze().cpu().numpy() send_action = np.split(send_action, num_workers) # send action and then get state for parent_conn, action in zip(parent_conns, send_action): parent_conn.send(action) batch_s, batch_r, batch_mask = [], [], [] for parent_conn in parent_conns: s, r, mask = parent_conn.recv() batch_s.append(s) batch_r.append(r) batch_mask.append(mask) new_s = np.vstack(batch_s) r = np.hstack(batch_r).clip(-1, 1) # clip reward mask = np.hstack(batch_mask) share_reward[step].data.copy_(torch.from_numpy(r)) share_mask[step].data.copy_(torch.from_numpy(mask)) x.data.copy_(torch.from_numpy(new_s)) v, _ = model(x) # v is volatile R = Variable(v.data.clone()) v_loss = 0.0 policy_loss = 0.0 entropy_loss = 0.0 for i in reversed(range(n_steps)): R = share_reward[i] + 0.99 * share_mask[i] * R advantage = R - cache_v_series[i] v_loss += advantage.pow(2).mul(0.5).mean() policy_loss -= sampled_log_probs[i].mul(advantage.detach()).mean() entropy_loss -= entropies[i].mean() total_loss = policy_loss + entropy_loss.mul(0.02) + v_loss * 0.5 total_loss = total_loss.mul(1 / (n_steps)) # adjust learning rate new_lr = init_lr - (global_step / max_train_steps) * init_lr for param_group in optimizer.param_groups: param_group['lr'] = new_lr optimizer.zero_grad() total_loss.backward() grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), clip_grad_norm) optimizer.step() if global_step % 10000 == 0: torch.save(model.state_dict(), './model/model_%s.pth' % game_name) for parent_conn in parent_conns: parent_conn.send(None) for w in workers: w.join()
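The backward pass over share_reward and share_mask above implements the usual n-step bootstrapped return R_t = r_t + gamma * mask_t * R_{t+1}. A standalone NumPy version of that recursion, illustrative only, is:

import numpy as np

def n_step_returns(rewards, masks, bootstrap_value, gamma=0.99):
    # rewards, masks: arrays of shape (n_steps, n_envs); bootstrap_value: (n_envs,).
    R = bootstrap_value.copy()
    returns = np.zeros_like(rewards)
    for t in reversed(range(rewards.shape[0])):
        R = rewards[t] + gamma * masks[t] * R   # mask zeroes the bootstrap at episode ends
        returns[t] = R
    return returns

# Example: 3 steps, 2 envs
rews = np.array([[1., 0.], [0., 1.], [1., 1.]])
masks = np.ones_like(rews)
print(n_step_returns(rews, masks, np.zeros(2)))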
valueloss_sample = []
POLICYLOSS = []
POLICYLOSS_MEAN = []
policyloss_sample = []
ENTROPY = []
ENTROPY_MEAN = []
entropy_sample = []
EPISODES = []
REWARDS = []
REWARDS_MEAN = []
episode = 0

while True:
    (cpu, is_nstep, value_loss, policy_loss, entropy, reward, complete) = receiver.recv()
    dones[cpu] = complete

    exit = True
    for d in dones:
        if d == False:
            exit = False
            break
    if exit:
        break

    if complete:
        continue

    if is_nstep:
class OnlineVaeAlgorithm(TorchBatchRLAlgorithm): def __init__(self, vae, vae_trainer, *base_args, vae_save_period=1, vae_training_schedule=vae_schedules.never_train, oracle_data=False, parallel_vae_train=True, vae_min_num_steps_before_training=0, uniform_dataset=None, **base_kwargs): super().__init__(*base_args, **base_kwargs) assert isinstance(self.replay_buffer, OnlineVaeRelabelingBuffer) self.vae = vae self.vae_trainer = vae_trainer self.vae_trainer.model = self.vae self.vae_save_period = vae_save_period self.vae_training_schedule = vae_training_schedule self.oracle_data = oracle_data self.parallel_vae_train = parallel_vae_train self.vae_min_num_steps_before_training = vae_min_num_steps_before_training self.uniform_dataset = uniform_dataset self._vae_training_process = None self._update_subprocess_vae_thread = None self._vae_conn_pipe = None def _train(self): super()._train() print("_train") self._cleanup() def _end_epoch(self, epoch): self._train_vae(epoch) gt.stamp('vae training') super()._end_epoch(epoch) def _log_stats(self, epoch): self._log_vae_stats() super()._log_stats(epoch) def to(self, device): self.vae.to(device) super().to(device) def _get_snapshot(self): snapshot = super()._get_snapshot() assert 'vae' not in snapshot snapshot['vae'] = self.vae return snapshot """ VAE-specific Code """ def _train_vae(self, epoch): if self.parallel_vae_train and self._vae_training_process is None: self.init_vae_training_subprocess() should_train, amount_to_train = self.vae_training_schedule(epoch) rl_start_epoch = int(self.min_num_steps_before_training / (self.num_expl_steps_per_train_loop * self.num_train_loops_per_epoch)) if should_train or epoch <= (rl_start_epoch - 1): if self.parallel_vae_train: assert self._vae_training_process.is_alive() # Make sure the last vae update has finished before starting # another one if self._update_subprocess_vae_thread is not None: self._update_subprocess_vae_thread.join() self._update_subprocess_vae_thread = Thread( target=OnlineVaeAlgorithm. update_vae_in_training_subprocess, args=(self, epoch, ptu.device)) self._update_subprocess_vae_thread.start() self._vae_conn_pipe.send((amount_to_train, epoch)) else: _train_vae(self.vae_trainer, self.replay_buffer, epoch, amount_to_train) self.replay_buffer.refresh_latents(epoch) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, ) def _log_vae_stats(self): logger.record_dict( self.vae_trainer.get_diagnostics(), prefix='vae_trainer/', ) def _cleanup(self): if self.parallel_vae_train: self._vae_conn_pipe.close() self._vae_training_process.terminate() def init_vae_training_subprocess(self): assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer) self._vae_conn_pipe, process_pipe = Pipe() self._vae_training_process = Process( target=subprocess_train_vae_loop, args=( process_pipe, self.vae, self.vae.state_dict(), self.replay_buffer, self.replay_buffer.get_mp_info(), ptu.device, )) self._vae_training_process.start() self._vae_conn_pipe.send(self.vae_trainer) def update_vae_in_training_subprocess(self, epoch, device): self.vae.__setstate__(self._vae_conn_pipe.recv()) self.vae.to(device) _test_vae( self.vae_trainer, epoch, self.replay_buffer, vae_save_period=self.vae_save_period, uniform_dataset=self.uniform_dataset, )
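The parallel branch above sends the trainer once over self._vae_conn_pipe and then (amount_to_train, epoch) messages, and later restores the VAE via __setstate__ on whatever comes back; subprocess_train_vae_loop itself is not shown. The sketch below is only an assumed body that matches that message order, using the _train_vae helper already referenced above; it is not the actual subprocess code.

def subprocess_train_vae_loop(conn_pipe, vae, vae_params, replay_buffer, mp_info, device):
    # Assumed subprocess body: receive the trainer once, then train on request and
    # send the updated VAE state back so the parent can __setstate__ it.
    vae_trainer = conn_pipe.recv()                   # first message: the trainer
    vae.load_state_dict(vae_params)
    vae.to(device)
    vae_trainer.model = vae
    while True:
        amount_to_train, epoch = conn_pipe.recv()    # one request per epoch
        _train_vae(vae_trainer, replay_buffer, epoch, amount_to_train)
        conn_pipe.send(vae.__getstate__())           # parent restores it via __setstate__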
def main(): print({section: dict(config[section]) for section in config.sections()}) env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'mario': env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) elif env_type == 'atari': env = gym.make(env_id) elif env_type == 'vizdoom': input_size = (image_size, image_size) if env_id == 'battle': output_size = 4 print('vizdoom battle init') elif env_id == 'my_way_home': output_size = 3 print('vizdoom my way home init') else: raise NotImplementedError if env_type == 'mario' or env_type == 'atari': input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() is_render = True model_path = 'models/{}.model'.format(env_id) predictor_path = 'models/{}.pred'.format(env_id) target_path = 'models/{}.target'.format(env_id) use_cuda = False use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = 1 num_step = int(default_config['NumStep']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) clip_grad_norm = float(default_config['ClipGradNorm']) sticky_action = False action_prob = float(default_config['ActionProb']) life_done = default_config.getboolean('LifeDone') agent = RNDAgent if default_config['EnvType'] == 'atari': env_type = AtariEnvironment elif default_config['EnvType'] == 'mario': env_type = MarioEnvironment elif default_config['EnvType'] == 'vizdoom': print('Doom Environment') env_type = DoomEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net) print('Loading Pre-trained model....') if use_cuda: print('using cuda') agent.model.load_state_dict(torch.load(model_path)) agent.rnd.predictor.load_state_dict(torch.load(predictor_path)) agent.rnd.target.load_state_dict(torch.load(target_path)) else: print('not using cuda') agent.model.load_state_dict(torch.load(model_path, map_location='cpu')) agent.rnd.predictor.load_state_dict( torch.load(predictor_path, map_location='cpu')) agent.rnd.target.load_state_dict( torch.load(target_path, map_location='cpu')) print('End load...') works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) print('start enjoy!') for i in range(1, 10): states = np.zeros([num_worker, 4, image_size, image_size]) steps = 0 rall = 0 rd = False intrinsic_reward_list = [] while not rd: if default_config['EnvType'] == 'vizdoom': time.sleep(0.05) steps += 1 actions, value_ext, value_int, policy = agent.get_action( np.float32(states) / 255.) 
            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                rall += r
                next_states = s.reshape([1, 4, image_size, image_size])
                next_obs = s[3, :, :].reshape([1, 1, image_size, image_size])

            # total reward = int reward + ext reward
            intrinsic_reward = agent.compute_intrinsic_reward(next_obs)
            intrinsic_reward_list.append(intrinsic_reward)
            states = next_states[:, :, :, :]

            if rd:
                intrinsic_reward_list = (intrinsic_reward_list - np.mean(intrinsic_reward_list)) / np.std(intrinsic_reward_list)
                '''
                with open('int_reward', 'wb') as f:
                    pickle.dump(intrinsic_reward_list, f)
                '''
                steps = 0
                rall = 0
def sim_game(gpu_Q, N, data_Q, v_resign): print("Starting game") no_resign = np.random.rand(1)[0]>0.95 # Hyperparameters temp_switch = 16 #Number of turns before other temperature measure is used eta_par = 0.03 epsilon = 0.25 # Switch for adding calculated v relative to black relative_value = {"black": 1, "white": -1} turn_switcher = {"black": "white", "white": "black"} # Define pipe for GPU process conn_rec, conn_send = Pipe(False) # List for storing resignation values resign_list_black = [] resign_list_white = [] # Start game n_rounds = 0 turn_color = "white" number_passes = 0 go_game = go_board() data = [] resign = False # Evalute first node S = go_game.get_state(turn_switcher[turn_color]) # Get policy and value gpu_Q.put([S, conn_send]) P, v = conn_rec.recv() # Generate start node root_node = state_node(go_game, P, turn_switcher[turn_color]) root_node.illigal_board = np.zeros(82) # Run next moves while True: n_rounds += 1 turn_color = turn_switcher[turn_color] # Case where early temperature is used if (n_rounds<=temp_switch): # Simulate MCTS root_node = MCTS(root_node, gpu_Q , N, go_game, turn_color, number_passes) # Compute legal policy pi_legal = root_node.N/root_node.N_total # Selecet action action = np.random.choice(82, size=1, p=pi_legal)[0] # Case where later temperature is used else: # Get noise eta = np.random.dirichlet(np.ones(82)*eta_par) root_node.P = (1-epsilon)*root_node.P+epsilon*eta # Simulate MCTS root_node = MCTS(root_node, gpu_Q , N, go_game, turn_color, number_passes) # Compute legal actions visit count (needed for storing) pi_legal = root_node.N/root_node.N_total # Pick move action = np.argmax(root_node.N) # Save Data S = go_game.get_state(turn_color) data.append([S.copy(), pi_legal.copy(), turn_color]) # Check for resignation if (turn_color=="black"): try: resign_req = max([np.max(root_node.action_edges[action].Q), root_node.Q[action]]) except: resign_req = relative_value[turn_color]*root_node.Q[action] else: try: # To account for flipped v resign_req = -1*min([np.min(root_node.action_edges[action].Q), root_node.Q[action]]) except: resign_req = relative_value[turn_color]*root_node.Q[action] # Add resign values for color if (turn_color=="black"): resign_list_black.append(resign_req) else: resign_list_white.append(resign_req) # Check if game ends if ((no_resign==False) & (resign_req<v_resign)): # resign resign = True break # Convert and take action #print("Move n. 
",n_rounds, "New move was: ", action, "color was: ", turn_color) if (action==81): go_game.move('pass', turn_color) number_passes += 1 else: go_game.move(np.unravel_index(action, (9,9)), turn_color) number_passes = 0 # Check if game is over or too long (9*9*2) if ((number_passes==2) | (n_rounds>162)): break # Pick move root_node = root_node.action_edges[action] # Game is over # Find winner if (resign==True): # Set winner depending on resigned color if (turn_color=="black"): z = {"black": -1, "white": 1} else: z = {"black": 1, "white": -1} else: # No resignation, points = go_game.count_points() # Black is winner if (points>0): z = {"black": 1, "white": -1} else: z = {"black": -1, "white": 1} # Define data arrays S_array = np.empty((n_rounds, 17, 9, 9), dtype=bool) P_array = np.empty((n_rounds, 82), dtype=float) z_array = np.empty((n_rounds), dtype=int) # Loop over each move and fill in arrays i = 0 for S, P , turn_color in data: S_array[i] = S P_array[i] = P z_array[i] = z[turn_color] i += 1 # Send data # In case game was used to check for false positives, compute lowest value if (no_resign==True): if (z["black"]==1): false_positive = min(resign_list_black) else: false_positive = min(resign_list_white) # Send data data_Q.put([S_array, P_array, z_array, false_positive]) else: data_Q.put([S_array, P_array, z_array, None])
def main(): print({section: dict(config[section]) for section in config.sections()}) train_method = default_config['TrainMethod'] assert train_method == 'RND' env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() is_load_model = False is_render = False model_path = 'models/{}.model'.format(env_id) predictor_path = 'models/{}.pred'.format(env_id) target_path = 'models/{}.target'.format(env_id) run_path = Path( f'runs/{env_id}_{datetime.now().strftime("%b%d_%H-%M-%S")}') log_path = run_path / 'logs' subgoals_path = run_path / 'subgoal_plots' data_path = run_path / 'json_data' run_path.mkdir(parents=True) log_path.mkdir() subgoals_path.mkdir() data_path.mkdir() writer = SummaryWriter(log_path) use_cuda = default_config.getboolean('UseGPU') use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') torch.set_default_tensor_type( 'torch.cuda.FloatTensor' if use_cuda else 'torch.FloatTensor') lam = float(default_config['Lambda']) num_worker = int(default_config['NumEnv']) num_step = int(default_config['NumStep']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) int_gamma = float(default_config['IntGamma']) clip_grad_norm = float(default_config['ClipGradNorm']) ext_coef = float(default_config['ExtCoef']) int_coef = float(default_config['IntCoef']) sticky_action = default_config.getboolean('StickyAction') action_prob = float(default_config['ActionProb']) life_done = default_config.getboolean('LifeDone') reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) pre_obs_norm_step = int(default_config['ObsNormStep']) discounted_reward = RewardForwardFilter(int_gamma) agent = RNDAgent if default_config['EnvType'] == 'atari': env_type = AtariEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net) drn_model = DeepRelNov(agent.rnd, input_size, output_size, use_cuda=use_cuda) if is_load_model: print('load model...') if use_cuda: agent.model.load_state_dict(torch.load(model_path)) agent.rnd.predictor.load_state_dict(torch.load(predictor_path)) agent.rnd.target.load_state_dict(torch.load(target_path)) else: agent.model.load_state_dict( torch.load(model_path, map_location='cpu')) agent.rnd.predictor.load_state_dict( torch.load(predictor_path, map_location='cpu')) agent.rnd.target.load_state_dict( torch.load(target_path, map_location='cpu')) print('load finished!') works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 
sample_i_rall = 0 global_update = 0 global_step = 0 # normalize obs print('Start to initailize observation normalization parameter.....') next_obs = [] for _ in range(num_step * pre_obs_norm_step): actions = np.random.randint(0, output_size, size=(num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr, i = parent_conn.recv() next_obs.append(s[-1, :, :].reshape([1, 84, 84])) if len(next_obs) % (num_step * num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] print('End to initalize...') #this is for all envs accumulated_worker_episode_reward = np.zeros((num_worker, )) #this is fora single env (env = 0) accumulated_worker_episode_info = { "images": [], "visited_rooms": [], "current_room": [], "player_pos": [] } episode_traj_buffer = [] episode_counter = 0 episode_rewards = [[] for _ in range(num_worker)] step_rewards = [[] for _ in range(num_worker)] global_ep = 0 while True: total_state, total_reward, total_done, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy, total_policy_np = \ [], [], [], [], [], [], [], [], [], [] global_step += (num_worker * num_step) global_update += 1 # Step 1. n-step rollout for cur_step in range(num_step): actions, value_ext, value_int, policy = agent.get_action( np.float32(states) / 255.) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs, info = [], [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr, i = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_obs.append(s[-1, :, :].reshape([1, 84, 84])) info.append(i) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) accumulated_worker_episode_reward += rewards for i in range(len(rewards)): step_rewards[i].append(rewards[i]) if real_dones[i]: episode_rewards[i].append( accumulated_worker_episode_reward[i]) accumulated_worker_episode_reward[i] = 0 accumulated_worker_episode_info["images"].append(next_obs[0]) accumulated_worker_episode_info["visited_rooms"].append( info[0].get('episode', {}).get('visited_rooms', {})) accumulated_worker_episode_info["current_room"].append(info[0].get( 'current_room', {})) accumulated_worker_episode_info["player_pos"].append(info[0].get( 'player_pos', {})) if real_dones[0]: episode_traj_buffer.append(accumulated_worker_episode_info) accumulated_worker_episode_info = { "images": [], "visited_rooms": [], "current_room": [], "player_pos": [] } # total reward = int reward + ext Reward intrinsic_reward = agent.compute_intrinsic_reward( ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_policy.append(policy) total_policy_np.append(policy.cpu().numpy()) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) 
writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 writer.add_scalar('data/avg_reward_per_step', np.mean(rewards), global_step + num_worker * (cur_step - num_step)) while all(episode_rewards): global_ep += 1 avg_ep_reward = np.mean( [env_ep_rewards.pop(0) for env_ep_rewards in episode_rewards]) writer.add_scalar('data/avg_reward_per_episode', avg_ep_reward, global_ep) _, value_ext, value_int, _ = agent.get_action( np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose( [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_policy = np.vstack(total_policy_np) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() #total_int_reward = np.stack(total_int_reward).swapaxes(0, 1) total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, gamma, num_step, num_worker) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, int_gamma, num_step, num_worker) # add ext adv and int adv total_adv = int_adv * int_coef + ext_adv * ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! 
agent.train_model( np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip( -5, 5), total_policy) if global_step % (num_worker * num_step * 100) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(agent.model.state_dict(), model_path) torch.save(agent.rnd.predictor.state_dict(), predictor_path) torch.save(agent.rnd.target.state_dict(), target_path) ############################# for traj_num, episode_dict in enumerate(episode_traj_buffer): traj = np.array(episode_dict["images"]) obs_traj = ((traj - obs_rms.mean) / np.sqrt(obs_rms.var)).clip( -5, 5) drn_model.train_rel_nov(obs_traj) episode_counter += 1 if episode_counter % 100 == 0: subgoals = drn_model.get_filtered_subgoals(obs_traj, 1) #TODO Make Option episode_traj_buffer = []
def main(): args = get_args() device = torch.device('cuda' if args.cuda else 'cpu') env = gym.make(args.env_name) input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in args.env_name: output_size -= 1 env.close() is_render = False if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) model_path = os.path.join(args.save_dir, args.env_name + '.model') predictor_path = os.path.join(args.save_dir, args.env_name + '.pred') target_path = os.path.join(args.save_dir, args.env_name + '.target') writer = SummaryWriter(log_dir=args.log_dir) reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) discounted_reward = RewardForwardFilter(args.ext_gamma) model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net) rnd = RNDModel(input_size, output_size) model = model.to(device) rnd = rnd.to(device) optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr) if args.load_model: if args.cuda: model.load_state_dict(torch.load(model_path)) else: model.load_state_dict(torch.load(model_path, map_location='cpu')) works = [] parent_conns = [] child_conns = [] for idx in range(args.num_worker): parent_conn, child_conn = Pipe() work = AtariEnvironment(args.env_name, is_render, idx, child_conn, sticky_action=args.sticky_action, p=args.sticky_action_prob, max_episode_steps=args.max_episode_steps) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([args.num_worker, 4, 84, 84]) sample_env_index = 0 # Sample Environment index to log sample_episode = 0 sample_rall = 0 sample_step = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize observation print('Initializes observation normalization...') next_obs = [] for step in range(args.num_step * args.pre_obs_norm_steps): actions = np.random.randint(0, output_size, size=(args.num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: next_state, reward, done, realdone, log_reward = parent_conn.recv() next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (args.num_step * args.num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] print('Training...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], [] global_step += (args.num_worker * args.num_step) global_update += 1 # Step 1. n-step rollout for _ in range(args.num_step): actions, value_ext, value_int, action_probs = get_action( model, device, np.float32(states) / 255.) 
for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: next_state, reward, done, real_done, log_reward = parent_conn.recv( ) next_states.append(next_state) rewards.append(reward) dones.append(done) real_dones.append(real_done) log_rewards.append(log_reward) next_obs.append(next_state[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = compute_intrinsic_reward( rnd, device, ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_index] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_action_probs.append(action_probs) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_index] sample_step += 1 if real_dones[sample_env_index]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose( [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_action_probs = np.vstack(total_action_probs) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode) # Step 3. 
make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, args.ext_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, args.int_gamma, args.gae_lambda, args.num_step, args.num_worker, args.use_gae) # add ext adv and int adv total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! train_model(args, device, output_size, model, rnd, optimizer, np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs) if global_step % (args.num_worker * args.num_step * args.save_interval) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(model.state_dict(), model_path) torch.save(rnd.predictor.state_dict(), predictor_path) torch.save(rnd.target.state_dict(), target_path)
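make_train_data above is called with a discount factor, a GAE lambda, and a use_gae flag, but is defined elsewhere. A minimal sketch of the standard GAE(lambda) target and advantage computation it is assumed to perform, with the same argument order as the call sites above:

import numpy as np

def make_train_data(reward, done, value, gamma, gae_lambda, num_step, num_worker, use_gae=True):
    # reward, done: (num_worker, num_step); value: (num_worker, num_step + 1), last column is the bootstrap.
    target = np.zeros((num_worker, num_step))
    adv = np.zeros((num_worker, num_step))
    if use_gae:
        gae = np.zeros(num_worker)
        for t in reversed(range(num_step)):
            delta = reward[:, t] + gamma * value[:, t + 1] * (1 - done[:, t]) - value[:, t]
            gae = delta + gamma * gae_lambda * (1 - done[:, t]) * gae
            adv[:, t] = gae
            target[:, t] = gae + value[:, t]
    else:
        running_return = value[:, -1]
        for t in reversed(range(num_step)):
            running_return = reward[:, t] + gamma * running_return * (1 - done[:, t])
            target[:, t] = running_return
        adv = target - value[:, :num_step]
    # flatten worker-major to match the flattened state/action batches
    return target.reshape(-1), adv.reshape(-1)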
for _ in range(REPEAT):
    total_state, total_reward, total_target_reward, total_done, total_action, total_moreward \
        = [], [], [], [], [], []

    while True:
        actions = agent.get_action(states, explore_w)

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        next_states, rewards, target_rewards, dones, real_dones, morewards, scores \
            = [], [], [], [], [], [], []
        for parent_conn in parent_conns:
            s, r, d, rd, mor, sc = parent_conn.recv()
            next_states.append(s)
            rewards.append(explore_w.dot(mor))
            target_rewards.append(UNKNOWN_PREFERENCE.dot(mor))
            dones.append(d)
            real_dones.append(rd)
            morewards.append(mor)
            scores.append(sc)
            # resample the exploration preference if done
            # if d:
            #     explore_w = renew_w(explore_w, cnt, pref_param)

        next_states = np.stack(next_states)
        rewards = np.hstack(rewards) * args.reward_scale
        target_rewards = np.hstack(target_rewards) * args.reward_scale
        dones = np.hstack(dones)
def main(): env = gym.make(args.env_name) env.seed(500) torch.manual_seed(500) img_shape = env.observation_space.shape num_actions = env.action_space.n - 1 print('image size:', img_shape) print('action size:', num_actions) net = FuN(num_actions, args, device) optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01) writer = SummaryWriter('logs') if not os.path.isdir(args.save_path): os.makedirs(args.save_path) workers = [] parent_conns = [] child_conns = [] for i in range(args.num_envs): parent_conn, child_conn = Pipe() worker = EnvWorker(args.env_name, args.render, child_conn) worker.start() workers.append(worker) parent_conns.append(parent_conn) child_conns.append(child_conn) net.to(device) net.train() global_steps = 0 score = np.zeros(args.num_envs) count = 0 grad_norm = 0 histories = torch.zeros([args.num_envs, 3, 84, 84]).to(device) m_hx = torch.zeros(args.num_envs, num_actions * 16).to(device) m_cx = torch.zeros(args.num_envs, num_actions * 16).to(device) m_lstm = (m_hx, m_cx) w_hx = torch.zeros(args.num_envs, num_actions * 16).to(device) w_cx = torch.zeros(args.num_envs, num_actions * 16).to(device) w_lstm = (w_hx, w_cx) goals_horizon = torch.zeros(args.num_envs, args.horizon + 1, num_actions * 16).to(device) while True: count += 1 memory = Memory() global_steps += (args.num_envs * args.num_step) # gather samples from the environment for i in range(args.num_step): # TODO: think about net output net_output = net(histories.to(device), m_lstm, w_lstm, goals_horizon) policies, goal, goals_horizon, m_lstm, w_lstm, m_value, w_value_ext, w_value_int, m_state = net_output actions = get_action(policies, num_actions) # send action to each worker environment and get state information next_histories, rewards, masks, dones = [], [], [], [] for i, (parent_conn, action) in enumerate(zip(parent_conns, actions)): parent_conn.send(action) next_history, reward, dead, done = parent_conn.recv() next_histories.append(next_history) rewards.append(reward) masks.append(1 - dead) dones.append(done) if dead: m_hx_mask = torch.ones(args.num_envs, num_actions * 16).to(device) m_hx_mask[i, :] = m_hx_mask[i, :] * 0 m_cx_mask = torch.ones(args.num_envs, num_actions * 16).to(device) m_cx_mask[i, :] = m_cx_mask[i, :] * 0 m_hx, m_cx = m_lstm m_hx = m_hx * m_hx_mask m_cx = m_cx * m_cx_mask m_lstm = (m_hx, m_cx) w_hx_mask = torch.ones(args.num_envs, num_actions * 16).to(device) w_hx_mask[i, :] = w_hx_mask[i, :] * 0 w_cx_mask = torch.ones(args.num_envs, num_actions * 16).to(device) w_cx_mask[i, :] = w_cx_mask[i, :] * 0 w_hx, w_cx = w_lstm w_hx = w_hx * w_hx_mask w_cx = w_cx * w_cx_mask w_lstm = (w_hx, w_cx) goal_init = torch.zeros(args.horizon + 1, num_actions * 16).to(device) goals_horizon[i] = goal_init score += rewards[0] # if agent in first environment dies, print and log score for i in range(args.num_envs): if dones[i]: entropy = -policies * torch.log(policies + 1e-5) entropy = entropy.mean().data.cpu() print( 'global steps {} | score: {} | entropy: {:.4f} | grad norm: {:.3f} ' .format(global_steps, score[i], entropy, grad_norm)) if i == 0: writer.add_scalar('log/score', score[i], global_steps) score[i] = 0 next_histories = torch.Tensor(next_histories).to(device) rewards = np.hstack(rewards) masks = np.hstack(masks) memory.push(histories, next_histories, actions, rewards, masks, goal, policies, m_lstm, w_lstm, m_value, w_value_ext, w_value_int, m_state) histories = next_histories # Train every args.num_step if (global_steps % args.num_step) == 0: # Need to fix logic transitions = memory.sample() loss, 
            grad_norm = train_model(net, optimizer, transitions, args)

            m_hx, m_cx = m_lstm
            m_lstm = (m_hx.detach(), m_cx.detach())
            w_hx, w_cx = w_lstm
            w_lstm = (w_hx.detach(), w_cx.detach())
            goals_horizon = goals_horizon.detach()
            # avg_loss.append(loss.cpu().data)

        if count % args.save_interval == 0:
            ckpt_path = args.save_path + 'model.pt'
            torch.save(net.state_dict(), ckpt_path)
def main(): print({section: dict(config[section]) for section in config.sections()}) train_method = default_config['TrainMethod'] env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'mario': env = JoypadSpace(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) elif env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() is_load_model = True is_render = False model_path = 'models/{}.model'.format(env_id) predictor_path = 'models/{}.pred'.format(env_id) target_path = 'models/{}.target'.format(env_id) writer = SummaryWriter() use_cuda = default_config.getboolean('UseGPU') use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = int(default_config['NumEnv']) num_step = int(default_config['NumStep']) ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = int(num_step * num_worker / mini_batch) learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) int_gamma = float(default_config['IntGamma']) clip_grad_norm = float(default_config['ClipGradNorm']) ext_coef = float(default_config['ExtCoef']) int_coef = float(default_config['IntCoef']) sticky_action = default_config.getboolean('StickyAction') action_prob = float(default_config['ActionProb']) life_done = default_config.getboolean('LifeDone') reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) pre_obs_norm_step = int(default_config['ObsNormStep']) discounted_reward = RewardForwardFilter(int_gamma) agent = RNDAgent if default_config['EnvType'] == 'atari': env_type = AtariEnvironment elif default_config['EnvType'] == 'mario': env_type = MarioEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net) if is_load_model: print('load model...') if use_cuda: agent.model.load_state_dict(torch.load(model_path)) agent.rnd.predictor.load_state_dict(torch.load(predictor_path)) agent.rnd.target.load_state_dict(torch.load(target_path)) else: agent.model.load_state_dict( torch.load(model_path, map_location='cpu')) agent.rnd.predictor.load_state_dict( torch.load(predictor_path, map_location='cpu')) agent.rnd.target.load_state_dict( torch.load(target_path, map_location='cpu')) print('load finished!') works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob, life_done=life_done) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize obs print('Start to initailize observation normalization parameter.....') next_obs = [] for step in range(num_step * pre_obs_norm_step): actions = np.random.randint(0, output_size, size=(num_worker, )) for parent_conn, action in zip(parent_conns, 
actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_obs.append(s[3, :, :].reshape([1, 84, 84])) if len(next_obs) % (num_step * num_worker) == 0: next_obs = np.stack(next_obs) obs_rms.update(next_obs) next_obs = [] print('End to initalize...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy, total_policy_np = \ [], [], [], [], [], [], [], [], [], [], [] global_step += (num_worker * num_step) global_update += 1 # Step 1. n-step rollout for _ in range(num_step): actions, value_ext, value_int, policy = agent.get_action( np.float32(states) / 255.) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_obs.append(s[3, :, :].reshape([1, 84, 84])) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) next_obs = np.stack(next_obs) # total reward = int reward + ext Reward intrinsic_reward = agent.compute_intrinsic_reward( ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] total_next_obs.append(next_obs) total_int_reward.append(intrinsic_reward) total_state.append(states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_ext_values.append(value_ext) total_int_values.append(value_int) total_policy.append(policy) total_policy_np.append(policy.cpu().numpy()) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value_ext, value_int, _ = agent.get_action( np.float32(states) / 255.) total_ext_values.append(value_ext) total_int_values.append(value_int) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_reward = np.stack(total_reward).transpose().clip(-1, 1) total_action = np.stack(total_action).transpose().reshape([-1]) total_done = np.stack(total_done).transpose() total_next_obs = np.stack(total_next_obs).transpose( [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84]) total_ext_values = np.stack(total_ext_values).transpose() total_int_values = np.stack(total_int_values).transpose() total_logging_policy = np.vstack(total_policy_np) # Step 2. 
calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. make target and advantage # extrinsic reward calculate ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values, gamma, num_step, num_worker) # intrinsic reward calculate # None Episodic int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward), total_int_values, int_gamma, num_step, num_worker) # add ext adv and int adv total_adv = int_adv * int_coef + ext_adv * ext_coef # ----------------------------------------------- # Step 4. update obs normalize param obs_rms.update(total_next_obs) # ----------------------------------------------- # Step 5. Training! agent.train_model( np.float32(total_state) / 255., ext_target, int_target, total_action, total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip( -5, 5), total_policy) if global_step % (num_worker * num_step * 100) == 0: print('Now Global Step :{}'.format(global_step)) torch.save(agent.model.state_dict(), model_path) torch.save(agent.rnd.predictor.state_dict(), predictor_path) torch.save(agent.rnd.target.state_dict(), target_path)
def main(): print({section: dict(config[section]) for section in config.sections()}) train_method = default_config['TrainMethod'] env_id = default_config['EnvID'] env_type = default_config['EnvType'] if env_type == 'mario': env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT) elif env_type == 'atari': env = gym.make(env_id) else: raise NotImplementedError input_size = env.observation_space.shape # 4 output_size = env.action_space.n # 2 if 'Breakout' in env_id: output_size -= 1 env.close() is_load_model = False is_render = False model_path = 'models/{}.model'.format(env_id) icm_path = 'models/{}.icm'.format(env_id) writer = SummaryWriter() use_cuda = default_config.getboolean('UseGPU') use_gae = default_config.getboolean('UseGAE') use_noisy_net = default_config.getboolean('UseNoisyNet') lam = float(default_config['Lambda']) num_worker = 32 num_step = 128 ppo_eps = float(default_config['PPOEps']) epoch = int(default_config['Epoch']) mini_batch = int(default_config['MiniBatch']) batch_size = 256 learning_rate = float(default_config['LearningRate']) entropy_coef = float(default_config['Entropy']) gamma = float(default_config['Gamma']) eta = float(default_config['ETA']) clip_grad_norm = float(default_config['ClipGradNorm']) reward_rms = RunningMeanStd() obs_rms = RunningMeanStd(shape=(1, 1, 84, 84)) pre_obs_norm_step = int(default_config['ObsNormStep']) discounted_reward = RewardForwardFilter(gamma) agent = ICMAgent if default_config['EnvType'] == 'atari': env_type = AtariEnvironment elif default_config['EnvType'] == 'mario': env_type = MarioEnvironment else: raise NotImplementedError agent = agent(input_size, output_size, num_worker, num_step, gamma, lam=lam, learning_rate=learning_rate, ent_coef=entropy_coef, clip_grad_norm=clip_grad_norm, epoch=epoch, batch_size=batch_size, ppo_eps=ppo_eps, eta=eta, use_cuda=use_cuda, use_gae=use_gae, use_noisy_net=use_noisy_net) if is_load_model: if use_cuda: agent.model.load_state_dict(torch.load(model_path)) else: agent.model.load_state_dict( torch.load(model_path, map_location='cpu')) works = [] parent_conns = [] child_conns = [] for idx in range(num_worker): parent_conn, child_conn = Pipe() work = env_type(env_id, is_render, idx, child_conn) work.start() works.append(work) parent_conns.append(parent_conn) child_conns.append(child_conn) states = np.zeros([num_worker, 4, 84, 84]) sample_episode = 0 sample_rall = 0 sample_step = 0 sample_env_idx = 0 sample_i_rall = 0 global_update = 0 global_step = 0 # normalize obs print('Start to initailize observation normalization parameter.....') next_obs = [] steps = 0 while steps < pre_obs_norm_step: steps += num_worker actions = np.random.randint(0, output_size, size=(num_worker, )) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_obs.append(s[3, :, :].reshape([1, 84, 84])) next_obs = np.stack(next_obs) obs_rms.update(next_obs) print('End to initalize...') while True: total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_values, total_policy = \ [], [], [], [], [], [], [], [], [] global_step += (num_worker * num_step) global_update += 1 # Step 1. 
n-step rollout for _ in range(num_step): actions, value, policy = agent.get_action( (np.float32(states) - obs_rms.mean) / np.sqrt(obs_rms.var)) for parent_conn, action in zip(parent_conns, actions): parent_conn.send(action) next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], [] for parent_conn in parent_conns: s, r, d, rd, lr = parent_conn.recv() next_states.append(s) rewards.append(r) dones.append(d) real_dones.append(rd) log_rewards.append(lr) next_states = np.stack(next_states) rewards = np.hstack(rewards) dones = np.hstack(dones) real_dones = np.hstack(real_dones) # total reward = int reward intrinsic_reward = agent.compute_intrinsic_reward( (states - obs_rms.mean) / np.sqrt(obs_rms.var), (next_states - obs_rms.mean) / np.sqrt(obs_rms.var), actions) intrinsic_reward = np.hstack(intrinsic_reward) sample_i_rall += intrinsic_reward[sample_env_idx] total_int_reward.append(intrinsic_reward) total_state.append(states) total_next_state.append(next_states) total_reward.append(rewards) total_done.append(dones) total_action.append(actions) total_values.append(value) total_policy.append(policy) states = next_states[:, :, :, :] sample_rall += log_rewards[sample_env_idx] sample_step += 1 if real_dones[sample_env_idx]: sample_episode += 1 writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode) writer.add_scalar('data/reward_per_rollout', sample_rall, global_update) writer.add_scalar('data/step', sample_step, sample_episode) sample_rall = 0 sample_step = 0 sample_i_rall = 0 # calculate last next value _, value, _ = agent.get_action( (np.float32(states) - obs_rms.mean) / np.sqrt(obs_rms.var)) total_values.append(value) # -------------------------------------------------- total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape( [-1, 4, 84, 84]) total_next_state = np.stack(total_next_state).transpose( [1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84]) total_action = np.stack(total_action).transpose().reshape([-1]) total_reward = np.stack(total_reward).transpose() total_done = np.stack(total_done).transpose() total_values = np.stack(total_values).transpose() total_logging_policy = np.vstack(total_policy) # Step 2. calculate intrinsic reward # running mean intrinsic reward total_int_reward = np.stack(total_int_reward).transpose() total_reward_per_env = np.array([ discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T ]) mean, std, count = np.mean(total_reward_per_env), np.std( total_reward_per_env), len(total_reward_per_env) reward_rms.update_from_moments(mean, std**2, count) # normalize intrinsic reward total_int_reward /= np.sqrt(reward_rms.var) writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode) writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update) # ------------------------------------------------------------------------------------------- # logging Max action probability writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode) # Step 3. make target and advantage target, adv = make_train_data_icm(total_int_reward, np.zeros_like(total_int_reward), total_values, gamma, num_step, num_worker) adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8) # ----------------------------------------------- # Step 5. Training! 
        print('training')
        agent.train_model(
            (np.float32(total_state) - obs_rms.mean) / np.sqrt(obs_rms.var),
            (np.float32(total_next_state) - obs_rms.mean) / np.sqrt(obs_rms.var),
            target, total_action, adv, total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.icm.state_dict(), icm_path)
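The ICM agent's compute_intrinsic_reward(states, next_states, actions) used in the rollout above is the curiosity bonus of Pathak et al.: the forward-model prediction error in feature space, scaled by eta. A hedged PyTorch sketch of that computation follows; the feature and forward networks here are small placeholders, not the actual ICM module used in this code.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyICM(nn.Module):
    # Illustrative ICM core: features phi(s) and a forward model predicting phi(s') from (phi(s), a).
    def __init__(self, obs_dim, num_actions, feat_dim=32, eta=0.01):
        super().__init__()
        self.eta = eta
        self.num_actions = num_actions
        self.feature = nn.Sequential(nn.Linear(obs_dim, feat_dim), nn.ReLU())
        self.forward_model = nn.Sequential(
            nn.Linear(feat_dim + num_actions, feat_dim), nn.ReLU(),
            nn.Linear(feat_dim, feat_dim))

    def compute_intrinsic_reward(self, state, next_state, action):
        with torch.no_grad():
            phi = self.feature(state)
            phi_next = self.feature(next_state)
            one_hot = F.one_hot(action, self.num_actions).float()
            phi_next_pred = self.forward_model(torch.cat([phi, one_hot], dim=1))
            # eta / 2 * || phi_hat(s') - phi(s') ||^2, one bonus per sample
            return self.eta * 0.5 * (phi_next_pred - phi_next).pow(2).sum(dim=1)

# usage sketch: one intrinsic reward per transition in the batch
icm = TinyICM(obs_dim=8, num_actions=4)
s, s2 = torch.randn(5, 8), torch.randn(5, 8)
a = torch.randint(0, 4, (5,))
print(icm.compute_intrinsic_reward(s, s2, a).shape)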