def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    seed = np.random.randint(0, 100)

    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed,
                           retro=True, config={'total-floors': 12},
                           greyscale=True, timeout_wait=300)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()  # log_dir=args.log_dir

    # note: the agent-based script below constructs this filter with the
    # intrinsic gamma, not the extrinsic one
    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()),
                           lr=args.lr)

    if args.load_model:
        print('Loading model...')
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # normalize observation
        print('Initializing observation normalization...')
        next_obs = []
        for step in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))
            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)
            for parent_conn in parent_conns:
                next_state, reward, done, realdone, log_reward = parent_conn.recv()
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))
            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []
        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_rms, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_rms, f)

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_next_obs, total_ext_values, total_int_values, \
            total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(
                model, device, np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv()
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext reward
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]
            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array(
            [discounted_reward.update(reward_per_step)
             for reward_per_step in total_int_reward.T])
        mean, std, count = (np.mean(total_reward_per_env),
                            np.std(total_reward_per_env),
                            len(total_reward_per_env))
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / args.num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / args.num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging max action probability
        writer.add_scalar('data/max_prob',
                          total_logging_action_probs.max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              args.ext_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              args.int_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalization parameters
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. training!
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target,
                    total_action, total_adv,
                    ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                    total_action_probs)

        if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)
            """
            checkpoint_list = np.array([int(re.search(r"\d+(\.\d+)?", x)[0])
                                        for x in glob.glob(os.path.join('trained_models', args.env_name + '*.model'))])
            if len(checkpoint_list) == 0:
                last_checkpoint = -1
            else:
                last_checkpoint = checkpoint_list.max()
            next_checkpoint = last_checkpoint + 1
            print("Latest Checkpoint is #{}, saving checkpoint is #{}.".format(last_checkpoint, next_checkpoint))
            incre_model_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.model')
            incre_predictor_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.pred')
            incre_target_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.target')
            with open(incre_model_path, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(incre_predictor_path, 'wb') as f:
                torch.save(rnd.predictor.state_dict(), f)
            with open(incre_target_path, 'wb') as f:
                torch.save(rnd.target.state_dict(), f)
            """

        if args.terminate and (global_step > args.terminate_steps):
            with open('reward_rms.pkl', 'wb') as f:
                dill.dump(reward_rms, f)
            with open('obs_rms.pkl', 'wb') as f:
                dill.dump(obs_rms, f)
            break
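# ---------------------------------------------------------------------------
# Editorial note: every script in this section uses `RunningMeanStd` and
# `RewardForwardFilter` without defining them. The sketches below are minimal
# reconstructions, assuming the parallel-moments update from OpenAI's RND code
# (the attribute names .mean/.var/.count/.rewems and the method
# update_from_moments match how the scripts use them; the Trainer class below
# uses equivalents named RunningStdMean / update_from_mean_std). Treat these
# as illustrations, not the repo's exact sources.
import numpy as np

class RunningMeanStd:
    # Tracks a running mean and variance over batched updates.
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, np.float64)
        self.var = np.ones(shape, np.float64)
        self.count = epsilon

    def update(self, x):
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # standard parallel-variance combination of two sample moments
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count

class RewardForwardFilter:
    # Maintains a discounted running sum of the (non-episodic) intrinsic
    # rewards; its statistics are used to normalize them.
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems
# ---------------------------------------------------------------------------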
def main():
    args = parse_arguments()

    train_method = args.train_method
    env_id = args.env_id
    env_type = args.env_type

    if env_type == 'atari':
        env = gym.make(env_id)
        input_size = env.observation_space.shape
        output_size = env.action_space.n
        env.close()
    else:
        raise NotImplementedError

    is_load_model = False
    is_render = False
    os.makedirs('models', exist_ok=True)
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    results_dir = os.path.join('outputs', args.env_id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.env_id))

    use_cuda = args.use_gpu
    use_gae = args.use_gae
    use_noisy_net = args.use_noisynet
    lam = args.lam
    num_worker = args.num_worker
    num_step = args.num_step
    ppo_eps = args.ppo_eps
    epoch = args.epoch
    mini_batch = args.minibatch
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = args.learning_rate
    entropy_coef = args.entropy
    gamma = args.gamma
    int_gamma = args.int_gamma
    clip_grad_norm = args.clip_grad_norm
    ext_coef = args.ext_coef
    int_coef = args.int_coef
    sticky_action = args.sticky_action
    action_prob = args.action_prob
    life_done = args.life_done
    pre_obs_norm_step = args.obs_norm_step

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(int_gamma)

    if args.train_method == 'RND':
        agent = RNDAgent
    else:
        raise NotImplementedError

    if args.env_type == 'atari':
        env_type = AtariEnvironment
    else:
        raise NotImplementedError

    agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net
    )

    logger.info('Initializing workers...')
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id, is_render, idx, child_conn,
                        sticky_action=sticky_action, p=action_prob,
                        life_done=life_done,
                        max_step_per_episode=args.max_step_per_episode)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    logger.info('Initializing the observation normalization parameters...')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))
        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    logger.info('Finished initializing the observation normalization parameters.')

    pbar = tqdm.tqdm(total=args.total_frames)
    while True:
        logger.info('Iteration: {}'.format(global_update))
        total_state, total_reward, total_done, total_next_state, \
            total_action, total_int_reward, total_next_obs, total_ext_values, \
            total_int_values, total_policy, total_policy_np = \
            [], [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]
            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/returns_vs_frames', sample_rall, global_step)
                writer.add_scalar('data/lengths_vs_frames', sample_step, global_step)
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array(
            [discounted_reward.update(reward_per_step)
             for reward_per_step in total_int_reward.T])
        mean, std, count = (np.mean(total_reward_per_env),
                            np.std(total_reward_per_env),
                            len(total_reward_per_env))
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)

        # logging max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef

        # Step 4. update obs normalization parameters
        obs_rms.update(total_next_obs)

        # Step 5. training!
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target,
                          total_action, total_adv,
                          ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy)

        if args.save_models and global_update % 1000 == 0:
            torch.save(agent.model.state_dict(),
                       'models/{}-{}.model'.format(env_id, global_update))
            logger.info('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)

        pbar.update(num_worker * num_step)
        if global_step >= args.total_frames:
            break
    pbar.close()
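# ---------------------------------------------------------------------------
# Editorial note: `make_train_data` is called by several scripts here but
# defined elsewhere. Below is a plausible sketch, assuming GAE(lambda) over
# arrays shaped (num_worker, num_step) for rewards/dones and
# (num_worker, num_step + 1) for values, which matches how the call sites
# stack and transpose their rollouts. The signature matches the six-argument
# variant used just above; the argparse-based scripts pass gae_lambda and a
# use_gae flag as extra arguments. The lam default is an assumption.
import numpy as np

def make_train_data(reward, done, value, gamma, num_step, num_worker, lam=0.95):
    # reward, done: (num_worker, num_step); value: (num_worker, num_step + 1)
    discounted_return = np.empty([num_worker, num_step])
    gae = np.zeros(num_worker)
    for t in range(num_step - 1, -1, -1):
        # one-step TD error, masked by episode termination
        delta = reward[:, t] + gamma * value[:, t + 1] * (1 - done[:, t]) - value[:, t]
        gae = delta + gamma * lam * (1 - done[:, t]) * gae
        discounted_return[:, t] = gae + value[:, t]
    # flatten worker-major, consistent with how total_state is reshaped above
    target = discounted_return.reshape([-1])
    adv = (discounted_return - value[:, :-1]).reshape([-1])
    return target, adv
# ---------------------------------------------------------------------------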
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = gym.make(args.env_name)
    input_size = env.observation_space.shape
    output_size = env.action_space.n
    if 'Breakout' in args.env_name:
        output_size -= 1
    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    predictor_path = os.path.join(args.save_dir, args.env_name + '.pred')
    target_path = os.path.join(args.save_dir, args.env_name + '.target')

    writer = SummaryWriter(log_dir=args.log_dir)

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    # note: the agent-based script above constructs this filter with the
    # intrinsic gamma, not the extrinsic one
    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()),
                           lr=args.lr)

    if args.load_model:
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(args.env_name,
                                is_render,
                                idx,
                                child_conn,
                                sticky_action=args.sticky_action,
                                p=args.sticky_action_prob,
                                max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize observation
    print('Initializing observation normalization...')
    next_obs = []
    for step in range(args.num_step * args.pre_obs_norm_steps):
        actions = np.random.randint(0, output_size, size=(args.num_worker,))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            next_state, reward, done, realdone, log_reward = parent_conn.recv()
            next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))
        if len(next_obs) % (args.num_step * args.num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_next_obs, total_ext_values, total_int_values, \
            total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(
                model, device, np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv()
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext reward
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]
            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array(
            [discounted_reward.update(reward_per_step)
             for reward_per_step in total_int_reward.T])
        mean, std, count = (np.mean(total_reward_per_env),
                            np.std(total_reward_per_env),
                            len(total_reward_per_env))
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / args.num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / args.num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging max action probability
        writer.add_scalar('data/max_prob',
                          total_logging_action_probs.max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              args.ext_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              args.int_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalization parameters
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. training!
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target,
                    total_action, total_adv,
                    ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                    total_action_probs)

        if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)
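# ---------------------------------------------------------------------------
# Editorial note: the two argparse-based scripts above call free functions
# `get_action` and `compute_intrinsic_reward` that are defined elsewhere in
# the repo. Minimal sketches follow, assuming the actor-critic returns
# (policy_logits, value_ext, value_int) and that actions are sampled from the
# softmax policy; the returned `action_prob` array is what feeds the
# `total_logging_action_probs.max(1)` logging above. Illustrations only.
import numpy as np
import torch
import torch.nn.functional as F

def get_action(model, device, state):
    state = torch.Tensor(state).to(device)
    policy, value_ext, value_int = model(state)
    action_prob = F.softmax(policy, dim=-1).data.cpu().numpy()
    # sample one action per worker from its categorical distribution
    action = [np.random.choice(len(p), p=p) for p in action_prob]
    return (np.array(action),
            value_ext.data.cpu().numpy().squeeze(),
            value_int.data.cpu().numpy().squeeze(),
            action_prob)

def compute_intrinsic_reward(rnd, device, next_obs):
    # prediction error of the trainable predictor against the fixed random
    # target network, per observation (sum over features / 2, as in the
    # Trainer.get_intrinsic_rewards method below)
    next_obs = torch.FloatTensor(next_obs).to(device)
    target_next_feature = rnd.target(next_obs)
    predict_next_feature = rnd.predictor(next_obs)
    intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2
    return intrinsic_reward.data.cpu().numpy()
# ---------------------------------------------------------------------------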
class Trainer:
    def __init__(self, num_training_steps, num_env, num_game_steps, num_epoch,
                 learning_rate, discount_factor, int_discount_factor, num_action,
                 value_coef, clip_range, save_interval, entropy_coef, lam,
                 mini_batch_num, num_action_repeat, load_path, ext_adv_coef,
                 int_adv_coef, num_pre_norm_steps, predictor_update_proportion):
        self.training_steps = num_training_steps
        self.num_epoch = num_epoch
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.num_game_steps = num_game_steps
        self.num_env = num_env
        self.batch_size = num_env * num_game_steps
        self.clip_range = clip_range
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.mini_batch_num = mini_batch_num
        self.num_action = num_action
        self.num_pre_norm_steps = num_pre_norm_steps
        self.int_discount_factor = int_discount_factor
        self.predictor_update_proportion = predictor_update_proportion
        assert self.batch_size % self.mini_batch_num == 0
        self.mini_batch_size = int(self.batch_size / self.mini_batch_num)
        self.current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = 'logs/' + self.current_time + '/log'
        self.save_interval = save_interval
        self.lam = lam
        self.num_action_repeat = num_action_repeat
        self.load_path = load_path
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.new_model = Model(self.num_action).to(self.device)
        self.ext_adv_coef = ext_adv_coef
        self.int_adv_coef = int_adv_coef
        self.writer = SummaryWriter(log_dir)

        print("-----------------------------------------")
        print("program configuration")
        print("time: ", self.current_time)
        print("number of train steps: ", self.training_steps)
        print("normalization steps parameter: ", self.num_pre_norm_steps)
        print("num_env: ", self.num_env)
        print("number of epochs: ", self.num_epoch)
        print("steps: ", self.num_game_steps)
        print("mini batch: ", self.mini_batch_size)
        print("lr: ", self.learning_rate)
        print("gamma: ", self.discount_factor)
        print("intrinsic gamma: ", self.int_discount_factor)
        print("lambda: ", self.lam)
        print("clip: ", self.clip_range)
        print("v_coef: ", self.value_coef)
        print("ent_coef: ", self.entropy_coef)
        print("the predictor's update proportion: ", self.predictor_update_proportion)
        print("intrinsic advantages coefficient: ", self.int_adv_coef)
        print("extrinsic advantages coefficient: ", self.ext_adv_coef)
        print("-----------------------------------------")

        self.target_model = TargetModel().to(self.device)
        self.predictor_model = PredictorModel().to(self.device)
        self.mse_loss = nn.MSELoss()
        self.predictor_mse_loss = nn.MSELoss(reduction='none')
        self.optimizer = optim.Adam(
            list(self.new_model.parameters()) + list(self.predictor_model.parameters()),
            lr=self.learning_rate)
        self.reward_rms = RunningStdMean()
        self.obs_rms = RunningStdMean(shape=(1, 1, 84, 84))
        self.reward_filter = RewardForwardFilter(self.int_discount_factor)

    def collect_experiance_and_train(self):
        start_train_step = 0
        sample_episode_num = 0
        if flag.LOAD:
            if self.device.type == "cpu":
                checkpoint = torch.load(self.load_path, map_location=self.device)
            else:
                checkpoint = torch.load(self.load_path)
            self.new_model.load_state_dict(checkpoint['new_model_state_dict'])
            self.predictor_model.load_state_dict(checkpoint['predictor_state_dict'])
            self.target_model.load_state_dict(checkpoint['target_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_train_step = checkpoint['train_step']
            sample_episode_num = checkpoint['ep_num']
            self.obs_rms.mean = checkpoint['obs_mean']
            self.obs_rms.var = checkpoint['obs_var']
            self.obs_rms.count = checkpoint['obs_count']
            self.reward_rms.mean = checkpoint['rew_mean']
            self.reward_rms.var = checkpoint['rew_var']
            self.reward_rms.count = checkpoint['rew_count']
            self.reward_filter.rewems = checkpoint['rewems']
            print("loaded model weights from checkpoint")

        current_observations = []
        parents = []
        childs = []
        envs = []
        for i in range(self.num_env):
            parent, child = Pipe()
            if flag.ENV == "MR":
                new_env = montezuma_revenge_env.MontezumaRevenge(
                    i, child, self.num_action_repeat, 0.25, 6000)
                new_env.start()
                envs.append(new_env)
            parents.append(parent)
            childs.append(child)

        if flag.LOAD:
            actions = np.random.randint(0, self.num_action, size=(self.num_env))
            for i in range(0, len(parents)):
                parents[i].send(actions[i])
            current_observations = []
            for i in range(0, len(parents)):
                obs, rew, done = parents[i].recv()
                current_observations.append(obs)
        else:
            # normalize observations
            observations_to_normalize = []
            for step in range(self.num_game_steps * self.num_pre_norm_steps):
                actions = np.random.randint(0, self.num_action, size=(self.num_env))
                for i in range(0, len(parents)):
                    parents[i].send(actions[i])
                current_observations = []
                for i in range(0, len(parents)):
                    obs, rew, done = parents[i].recv()
                    current_observations.append(obs)
                observations_to_normalize.extend(current_observations)
                if (len(observations_to_normalize) %
                        (self.num_game_steps * self.num_env) == 0):
                    observations_to_normalize = np.stack(
                        observations_to_normalize)[:, 3, :, :].reshape(-1, 1, 84, 84)
                    self.obs_rms.update(observations_to_normalize)
                    observations_to_normalize = []
            print("normalization ended")

        sample_ext_reward = 0
        sample_int_reward = 0
        for train_step in range(start_train_step, self.training_steps):
            total_observations = []
            total_int_rewards = []
            total_ext_rewards = []
            total_dones = []
            total_int_values = []
            total_ext_values = []
            total_actions = []
            for game_step in range(self.num_game_steps):
                total_observations.extend(current_observations)
                with torch.no_grad():
                    current_observations_tensor = torch.from_numpy(
                        np.array(current_observations)).float().to(self.device)
                    decided_actions, predicted_ext_values, predicted_int_values = \
                        self.new_model.step(current_observations_tensor / 255.)
                one_channel_observations = np.array(
                    current_observations)[:, 3, :, :].reshape(-1, 1, 84, 84)
                one_channel_observations = (
                    (one_channel_observations - self.obs_rms.mean) /
                    np.sqrt(self.obs_rms.var)).clip(-5, 5)
                one_channel_observations_tensor = torch.from_numpy(
                    one_channel_observations).float().to(self.device)
                int_reward = self.get_intrinsic_rewards(one_channel_observations_tensor)
                total_int_rewards.append(int_reward)
                total_int_values.append(predicted_int_values)
                total_ext_values.append(predicted_ext_values)
                total_actions.extend(decided_actions)

                current_observations = []
                for i in range(0, len(parents)):
                    parents[i].send(decided_actions[i])
                step_rewards = []
                step_dones = []
                for i in range(0, len(parents)):
                    observation, reward, done = parents[i].recv()
                    current_observations.append(observation)
                    step_rewards.append(reward)
                    step_dones.append(done)

                sample_ext_reward += step_rewards[0]
                sample_int_reward += int_reward[0]
                if step_dones[0]:
                    self.writer.add_scalar('ext_reward_per_episode_for_one_env',
                                           sample_ext_reward, sample_episode_num)
                    self.writer.add_scalar('int_reward_per_episode_for_one_env',
                                           sample_int_reward, sample_episode_num)
                    sample_ext_reward = 0
                    sample_int_reward = 0
                    sample_episode_num += 1
                total_ext_rewards.append(step_rewards)
                total_dones.append(step_dones)

            # next state value, required for computing advantages
            with torch.no_grad():
                current_observations_tensor = torch.from_numpy(
                    np.array(current_observations)).float().to(self.device)
                decided_actions, predicted_ext_values, predicted_int_values = \
                    self.new_model.step(current_observations_tensor / 255.)
            total_int_values.append(predicted_int_values)
            total_ext_values.append(predicted_ext_values)

            # convert lists to numpy arrays
            observations_array = np.array(total_observations)
            total_one_channel_observations_array = (
                observations_array[:, 3, :, :].reshape(-1, 1, 84, 84))
            self.obs_rms.update(total_one_channel_observations_array)
            total_one_channel_observations_array = (
                (total_one_channel_observations_array - self.obs_rms.mean) /
                np.sqrt(self.obs_rms.var)).clip(-5, 5)
            ext_rewards_array = np.array(total_ext_rewards).clip(-1, 1)
            dones_array = np.array(total_dones)
            ext_values_array = np.array(total_ext_values)
            int_values_array = np.array(total_int_values)
            actions_array = np.array(total_actions)
            int_rewards_array = np.stack(total_int_rewards)

            # calculating a running estimate of discounted intrinsic returns
            total_reward_per_env = np.array([
                self.reward_filter.update(reward_per_env)
                for reward_per_env in int_rewards_array.T
            ])
            mean, std, count = (np.mean(total_reward_per_env),
                                np.std(total_reward_per_env),
                                len(total_reward_per_env))
            self.reward_rms.update_from_mean_std(mean, std ** 2, count)

            # normalize intrinsic reward
            int_rewards_array /= np.sqrt(self.reward_rms.var)
            self.writer.add_scalar('avg_int_reward_per_train_step_for_all_envs',
                                   np.sum(int_rewards_array) / self.num_env, train_step)
            self.writer.add_scalar('int_reward_for_one_env_per_train_step',
                                   int_rewards_array.T[0].mean(), train_step)

            ext_advantages_array, ext_returns_array = self.compute_advantage(
                ext_rewards_array, ext_values_array, dones_array, 0)
            int_advantages_array, int_returns_array = self.compute_advantage(
                int_rewards_array, int_values_array, dones_array, 1)
            advantages_array = (self.ext_adv_coef * ext_advantages_array +
                                self.int_adv_coef * int_advantages_array)
            if flag.DEBUG:
                print("all actions are", total_actions)

            observations_tensor = torch.from_numpy(
                np.array(observations_array)).float().to(self.device)
            observations_tensor = observations_tensor / 255.
            ext_returns_tensor = torch.from_numpy(
                np.array(ext_returns_array)).float().to(self.device)
            int_returns_tensor = torch.from_numpy(
                np.array(int_returns_array)).float().to(self.device)
            actions_tensor = torch.from_numpy(
                np.array(actions_array)).long().to(self.device)
            advantages_tensor = torch.from_numpy(
                np.array(advantages_array)).float().to(self.device)
            one_channel_observations_tensor = torch.from_numpy(
                total_one_channel_observations_array).float().to(self.device)

            random_indexes = np.arange(self.batch_size)
            np.random.shuffle(random_indexes)
            with torch.no_grad():
                old_policy, _, _ = self.new_model(observations_tensor)
                dist_old = Categorical(F.softmax(old_policy, dim=1))
                old_log_prob = dist_old.log_prob(actions_tensor)

            loss_avg = []
            policy_loss_avg = []
            value_loss_avg = []
            entropy_avg = []
            predictor_loss_avg = []
            for epoch in range(0, self.num_epoch):
                for n in range(0, self.mini_batch_num):
                    start_index = n * self.mini_batch_size
                    index_slice = random_indexes[start_index:start_index + self.mini_batch_size]
                    if flag.DEBUG:
                        print("indexes chosen are:", index_slice)
                    experience_slice = (arr[index_slice] for arr in (
                        observations_tensor, ext_returns_tensor, int_returns_tensor,
                        actions_tensor, advantages_tensor,
                        one_channel_observations_tensor))
                    loss, policy_loss, value_loss, predictor_loss, entropy = \
                        self.train_model(*experience_slice, old_log_prob[index_slice])
                    # collect the losses of the final epoch for logging
                    if epoch == self.num_epoch - 1:
                        loss = loss.detach().cpu().numpy()
                        policy_loss = policy_loss.detach().cpu().numpy()
                        predictor_loss = predictor_loss.detach().cpu().numpy()
                        value_loss = value_loss.detach().cpu().numpy()
                        entropy = entropy.detach().cpu().numpy()
                        loss_avg.append(loss)
                        policy_loss_avg.append(policy_loss)
                        value_loss_avg.append(value_loss)
                        entropy_avg.append(entropy)
                        predictor_loss_avg.append(predictor_loss)

            loss_avg_result = np.array(loss_avg).mean()
            policy_loss_avg_result = np.array(policy_loss_avg).mean()
            value_loss_avg_result = np.array(value_loss_avg).mean()
            entropy_avg_result = np.array(entropy_avg).mean()
            predictor_loss_avg_result = np.array(predictor_loss_avg).mean()
            print("training step {:03d}, Epoch {:03d}: Loss: {:.3f}, policy loss"
                  ": {:.3f}, value loss: {:.3f}, predictor loss: {:.3f},"
                  " entropy: {:.3f}".format(train_step, epoch, loss_avg_result,
                                            policy_loss_avg_result,
                                            value_loss_avg_result,
                                            predictor_loss_avg_result,
                                            entropy_avg_result))
            if flag.TENSORBOARD_AVALAIBLE:
                self.writer.add_scalar('loss_avg', loss_avg_result, train_step)
                self.writer.add_scalar('policy_loss_avg', policy_loss_avg_result, train_step)
                self.writer.add_scalar('value_loss_avg', value_loss_avg_result, train_step)
                self.writer.add_scalar('predictor_loss_avg', predictor_loss_avg_result, train_step)
                self.writer.add_scalar('entropy_avg', entropy_avg_result, train_step)

            if train_step % self.save_interval == 0:
                train_checkpoint_dir = 'logs/' + self.current_time + str(train_step)
                torch.save(
                    {
                        'train_step': train_step,
                        'new_model_state_dict': self.new_model.state_dict(),
                        'predictor_state_dict': self.predictor_model.state_dict(),
                        'target_state_dict': self.target_model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'obs_mean': self.obs_rms.mean,
                        'obs_var': self.obs_rms.var,
                        'obs_count': self.obs_rms.count,
                        'rew_mean': self.reward_rms.mean,
                        'rew_var': self.reward_rms.var,
                        'rew_count': self.reward_rms.count,
                        'rewems': self.reward_filter.rewems,
                        'ep_num': sample_episode_num
                    }, train_checkpoint_dir)

    def compute_advantage(self, rewards, values, dones, int_flag=0):
        if flag.DEBUG:
            print("---------computing advantage---------")
            print("rewards are", rewards)
            print("values from steps are", values)
        if int_flag == 1:
            discount_factor = self.int_discount_factor
        else:
            discount_factor = self.discount_factor
        advantages = []
        advantage = 0
        for step in reversed(range(self.num_game_steps)):
            if int_flag == 1:
                # intrinsic rewards are treated as non-episodic
                is_there_a_next_state = 1
            else:
                is_there_a_next_state = 1.0 - dones[step]
            delta = rewards[step] + (is_there_a_next_state * discount_factor *
                                     values[step + 1]) - values[step]
            if flag.USE_GAE:
                advantage = delta + (discount_factor * self.lam *
                                     is_there_a_next_state * advantage)
                advantages.append(advantage)
            else:
                advantages.append(delta)
        advantages.reverse()
        advantages = np.array(advantages)
        advantages = advantages.flatten()
        values = values[:-1]
        returns = advantages + values.flatten()
        if flag.DEBUG:
            print("all advantages are", advantages)
            print("all returns are", returns)
        return advantages, returns

    def train_model(self, observations_tensor, ext_returns_tensor,
                    int_returns_tensor, actions_tensor, advantages_tensor,
                    one_channel_observations_tensor, old_log_prob):
        if flag.DEBUG:
            print("input observations shape", observations_tensor.shape)
            print("ext returns shape", ext_returns_tensor.shape)
            print("int returns shape", int_returns_tensor.shape)
            print("input actions shape", actions_tensor.shape)
            print("input advantages shape", advantages_tensor.shape)
            print("one channel observations", one_channel_observations_tensor.shape)
        self.new_model.train()
        self.predictor_model.train()

        target_value = self.target_model(one_channel_observations_tensor)
        predictor_value = self.predictor_model(one_channel_observations_tensor)
        predictor_loss = self.predictor_mse_loss(predictor_value, target_value).mean(-1)
        # train the predictor on a random subset of the samples only
        mask = torch.rand(len(predictor_loss)).to(self.device)
        mask = (mask < self.predictor_update_proportion).type(
            torch.FloatTensor).to(self.device)
        predictor_loss = (predictor_loss * mask).sum() / torch.max(
            mask.sum(), torch.Tensor([1]).to(self.device))

        new_policy, ext_new_values, int_new_values = self.new_model(observations_tensor)
        ext_value_loss = self.mse_loss(ext_new_values, ext_returns_tensor)
        int_value_loss = self.mse_loss(int_new_values, int_returns_tensor)
        value_loss = ext_value_loss + int_value_loss

        softmax_policy = F.softmax(new_policy, dim=1)
        new_dist = Categorical(softmax_policy)
        new_log_prob = new_dist.log_prob(actions_tensor)
        ratio = torch.exp(new_log_prob - old_log_prob)
        clipped_policy_loss = torch.clamp(ratio, 1.0 - self.clip_range,
                                          1 + self.clip_range) * advantages_tensor
        policy_loss = ratio * advantages_tensor
        selected_policy_loss = -torch.min(clipped_policy_loss, policy_loss).mean()
        entropy = new_dist.entropy().mean()

        self.optimizer.zero_grad()
        loss = (selected_policy_loss + (self.value_coef * value_loss) -
                (self.entropy_coef * entropy) + predictor_loss)
        loss.backward()
        global_grad_norm_(list(self.new_model.parameters()) +
                          list(self.predictor_model.parameters()))
        self.optimizer.step()
        return loss, selected_policy_loss, value_loss, predictor_loss, entropy

    def get_intrinsic_rewards(self, input_observation):
        target_value = self.target_model(input_observation)        # shape: [n, 512]
        predictor_value = self.predictor_model(input_observation)  # shape: [n, 512]
        intrinsic_reward = (target_value - predictor_value).pow(2).sum(1) / 2
        intrinsic_reward = intrinsic_reward.data.cpu().numpy()
        return intrinsic_reward
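# ---------------------------------------------------------------------------
# Editorial note: the predictor_update_proportion masking in train_model above
# trains the RND predictor on only a random fraction of each mini-batch, which
# slows the predictor's convergence toward the target when many parallel
# environments supply data. A tiny worked example of the masking arithmetic
# (hypothetical loss values, proportion 0.25):
import torch

per_sample_loss = torch.tensor([0.5, 0.2, 0.9, 0.4])
mask = (torch.rand(4) < 0.25).float()  # keep ~25% of samples
masked_loss = (per_sample_loss * mask).sum() / torch.max(mask.sum(),
                                                         torch.tensor(1.0))
# masked_loss averages the loss over the kept samples only; the max() guard
# avoids dividing by zero when the random mask happens to drop every sample.
# ---------------------------------------------------------------------------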
class RNDagent(object):
    def __init__(self,
                 input_size,
                 output_size,
                 seed,
                 num_env,
                 pre_obs_norm_step,
                 num_step,
                 gamma=0.99,
                 gamma_int=0.99,
                 lam=0.95,
                 int_coef=1.,
                 ext_coef=2.,
                 ent_coef=0.001,
                 cliprange=0.1,
                 max_grad_norm=0.0,
                 lr=1e-4,
                 nepochs=4,
                 batch_size=128,
                 update_proportion=0.25,
                 use_gae=True):
        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        self.seed = np.random.seed(seed)
        self.pre_obs_norm_step = pre_obs_norm_step
        self.num_step = num_step
        self.gamma = gamma
        self.gamma_int = gamma_int
        self.lam = lam
        self.nepochs = nepochs
        self.batch_size = batch_size
        self.use_gae = use_gae
        self.int_coef = int_coef
        self.ext_coef = ext_coef
        self.ent_coef = ent_coef
        self.cliprange = cliprange
        self.max_grad_norm = max_grad_norm
        self.update_proportion = update_proportion
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = CnnActorCritic(input_size, output_size, seed).to(self.device)
        self.rnd = RNDModel(input_size, output_size, seed).to(self.device)
        self.optimizer = optim.Adam(
            list(self.model.parameters()) + list(self.rnd.predictor.parameters()), lr=lr)

        # note: constructed with the extrinsic gamma here; the other scripts
        # in this section build this filter with the intrinsic gamma
        self.rff_int = RewardForwardFilter(gamma)
        self.rff_rms_int = RunningMeanStd_openAI()
        self.obs_rms = RunningMeanStd_openAI(shape=(1, 84, 84))

        self.rooms = None
        self.n_rooms = []
        self.best_nrooms = -np.inf
        self.scores = []
        self.scores_window = deque(maxlen=100)
        self.stats = defaultdict(float)  # count episodes and timesteps
        self.stats['epcount'] = 0
        self.stats['tcount'] = 0

    def collect_random_statistics(self, envs):
        """Initializes observation normalization with data from a random agent."""
        all_ob = []
        all_ob.append(envs.reset())
        for _ in range(self.pre_obs_norm_step):
            actions = np.random.randint(0, self.output_size, size=(self.num_env,))
            ob, _, _, _ = envs.step(actions)
            all_ob.append(ob)
            if len(all_ob) % (128 * self.num_env) == 0:
                ob_ = np.asarray(all_ob).astype(np.float32).reshape(
                    (-1, *envs.observation_space.shape))
                self.obs_rms.update(ob_[:, -1:, :, :])
                all_ob.clear()

    def act(self, state, action=None, calc_ent=False):
        """Returns a dict of trajectory info.

        Shape
        =====
        state (uint8) : (batch_size, framestack=4, 84, 84)

        Returns example
        {'a': tensor([10, 5, 1]),
         'ent': None,
         'log_pi_a': tensor([-2.8904, -2.8904, -2.8904], grad_fn=<SqueezeBackward1>),
         'v_ext': tensor([0.0012, 0.0012, 0.0012], grad_fn=<SqueezeBackward0>),
         'v_int': tensor([-0.0013, -0.0013, -0.0013], grad_fn=<SqueezeBackward0>)}
        """
        assert state.dtype == 'uint8'
        state = torch.tensor(state / 255., dtype=torch.float, device=self.device)
        action_probs, value_ext, value_int = self.model(state)
        dist = Categorical(action_probs)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy() if calc_ent else None
        return {
            'a': action,
            'log_pi_a': log_prob,
            'ent': entropy,
            'v_ext': value_ext.squeeze(),
            'v_int': value_int.squeeze()
        }

    def compute_intrinsic_reward(self, next_obs):
        """next_obs is the latest frame and must be normalized by
        RunningMeanStd(shape=(1, 84, 84)).

        Shape
        =====
        next_obs : (batch_size, 1, 84, 84)
        """
        next_obs = torch.tensor(next_obs, dtype=torch.float, device=self.device)
        target_next_feature = self.rnd.target(next_obs)
        predict_next_feature = self.rnd.predictor(next_obs)
        # mean over features; note the RND paper uses sum(1) / 2 instead:
        # intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2
        intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).mean(1)
        return intrinsic_reward.data.cpu().numpy()

    def step(self, envs):
        """Collects an n-step rollout from `envs` and runs one PPO/RND update."""
        # Step 1. n-step rollout
        next_obs_batch, int_reward_batch, state_batch, reward_batch, done_batch, \
            action_batch, values_ext_batch, values_int_batch, log_prob_old_batch = \
            [], [], [], [], [], [], [], [], []
        epinfos = []
        states = envs.reset()
        for _ in range(self.num_step):
            traj_info = self.act(states)
            log_prob_old = traj_info['log_pi_a'].detach().cpu().numpy()
            actions = traj_info['a'].cpu().numpy()
            value_ext = traj_info['v_ext'].detach().cpu().numpy()
            value_int = traj_info['v_int'].detach().cpu().numpy()

            next_states, rewards, dones, infos = envs.step(actions)
            next_obs = next_states[:, -1:, :, :]
            intrinsic_reward = self.compute_intrinsic_reward(
                ((next_obs - self.obs_rms.mean) /
                 (np.sqrt(self.obs_rms.var))).clip(-5, 5))  # +1e-10

            next_obs_batch.append(next_obs)
            int_reward_batch.append(intrinsic_reward)
            state_batch.append(states)
            reward_batch.append(rewards)
            done_batch.append(dones)
            action_batch.append(actions)
            values_ext_batch.append(value_ext)
            values_int_batch.append(value_int)
            log_prob_old_batch.append(log_prob_old)

            for info in infos:
                if 'episode' in info:
                    epinfos.append(info['episode'])

            states = next_states

        # calculate last next value
        last_traj_info = self.act(states)
        values_ext_batch.append(last_traj_info['v_ext'].detach().cpu().numpy())
        values_int_batch.append(last_traj_info['v_int'].detach().cpu().numpy())

        # convert to numpy arrays and transpose to (num_env, num_step)
        # from (num_step, num_env) for the later calculations.
        # For self.update():
        state_batch = np.stack(state_batch).transpose(1, 0, 2, 3, 4).reshape(-1, 4, 84, 84)
        next_obs_batch = np.stack(next_obs_batch).transpose(1, 0, 2, 3, 4).reshape(-1, 1, 84, 84)
        action_batch = np.stack(action_batch).transpose().reshape(-1,)
        log_prob_old_batch = np.stack(log_prob_old_batch).transpose().reshape(-1,)
        # For get_advantage_and_value_target_from():
        reward_batch = np.stack(reward_batch).transpose()
        done_batch = np.stack(done_batch).transpose()
        values_ext_batch = np.stack(values_ext_batch).transpose()
        values_int_batch = np.stack(values_int_batch).transpose()
        # --------------------------------------------------

        # Step 2. calculate intrinsic reward
        # running estimate of the intrinsic returns
        int_reward_batch = np.stack(int_reward_batch).transpose()
        # note: iterates the steps in reverse order, unlike the other scripts above
        discounted_reward_per_env = np.array([
            self.rff_int.update(reward_per_step)
            for reward_per_step in int_reward_batch.T[::-1]
        ])
        mean, std, count = (np.mean(discounted_reward_per_env),
                            np.std(discounted_reward_per_env),
                            len(discounted_reward_per_env))
        self.rff_rms_int.update_from_moments(mean, std ** 2, count)  # TODO: consider ddof

        # normalize intrinsic reward
        int_reward_batch /= np.sqrt(self.rff_rms_int.var)
        # -------------------------------------------------------------------------------------------

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = get_advantage_and_value_target_from(
            reward_batch, done_batch, values_ext_batch, self.gamma,
            self.lam, self.num_step, self.num_env, self.use_gae)

        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = get_advantage_and_value_target_from(
            int_reward_batch, np.zeros_like(int_reward_batch), values_int_batch,
            self.gamma_int, self.lam, self.num_step, self.num_env, self.use_gae)

        # add ext adv and int adv
        total_advs = self.int_coef * int_adv + self.ext_coef * ext_adv
        # -----------------------------------------------

        # Step 4. update obs normalization parameters
        self.obs_rms.update(next_obs_batch)
        # -----------------------------------------------

        # Step 5. train
        loss_infos = self.update(
            state_batch, ext_target, int_target, action_batch, total_advs,
            ((next_obs_batch - self.obs_rms.mean) /
             (np.sqrt(self.obs_rms.var))).clip(-5, 5),  # +1e-10
            log_prob_old_batch)
        # -----------------------------------------------

        # Collect info for reporting.
        vals_info = dict(
            advextmean=ext_adv.mean(),
            retextmean=ext_target.mean(),
            advintmean=int_adv.mean(),
            retintmean=int_target.mean(),
            rewintsample=int_reward_batch[1]  # env_number = 1
        )

        # Some reporting logic
        for epinfo in epinfos:
            # if self.testing:
            #     self.I.statlists['eprew_test'].append(epinfo['r'])
            #     self.I.statlists['eplen_test'].append(epinfo['l'])
            # else:
            if "visited_rooms" in epinfo:
                self.n_rooms.append(len(epinfo["visited_rooms"]))
                if self.best_nrooms is None:
                    self.best_nrooms = len(epinfo["visited_rooms"])
                elif len(epinfo["visited_rooms"]) > self.best_nrooms:
                    self.best_nrooms = len(epinfo["visited_rooms"])
                    self.rooms = sorted(list(epinfo["visited_rooms"]))
                # self.rooms += list(epinfo["visited_rooms"])
                # self.rooms = sorted(list(set(self.rooms)))
                # self.I.statlists['eprooms'].append(len(epinfo["visited_rooms"]))
            self.scores.append(epinfo['r'])
            self.scores_window.append(epinfo['r'])
            self.stats['epcount'] += 1
            self.stats['tcount'] += epinfo['l']
            # self.I.statlists['eprew'].append(epinfo['r'])
            # self.I.statlists['eplen'].append(epinfo['l'])
            # self.stats['rewtotal'] += epinfo['r']

        return {'loss': loss_infos, 'vals': vals_info}

    def update(self, s_batch, target_ext_batch, target_int_batch, action_batch,
               adv_batch, next_obs_batch, log_prob_old_batch):
        # s_batch stays uint8 here; act() converts and rescales it
        target_ext_batch = torch.FloatTensor(target_ext_batch).to(self.device)
        target_int_batch = torch.FloatTensor(target_int_batch).to(self.device)
        action_batch = torch.LongTensor(action_batch).to(self.device)
        adv_batch = torch.FloatTensor(adv_batch).to(self.device)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device)
        log_prob_old_batch = torch.FloatTensor(log_prob_old_batch).to(self.device)

        sample_range = np.arange(len(s_batch))
        forward_mse = nn.MSELoss(reduction='none')
        loss_infos = defaultdict(list)

        for _ in range(self.nepochs):
            np.random.shuffle(sample_range)
            for j in range(int(len(s_batch) / self.batch_size)):
                sample_idx = sample_range[self.batch_size * j:self.batch_size * (j + 1)]

                # --------------------------------------------------------------------------------
                # for Curiosity-driven (Random Network Distillation)
                predict_next_state_feature, target_next_state_feature = self.rnd(
                    next_obs_batch[sample_idx])
                forward_loss = forward_mse(
                    predict_next_state_feature,
                    target_next_state_feature.detach()).mean(-1)
                # proportion of experience used for the predictor update
                # --- cf. cnn_policy_param_matched.py in the OpenAI code
                mask = torch.rand(len(forward_loss)).to(self.device)
                mask = (mask < self.update_proportion).float().to(self.device)
                forward_loss = (forward_loss * mask).sum() / torch.max(
                    mask.sum(), torch.Tensor([1]).to(self.device))
                # ---------------------------------------------------------------------------------

                traj_info = self.act(s_batch[sample_idx],
                                     action_batch[sample_idx],
                                     calc_ent=True)
                ratio = torch.exp(traj_info['log_pi_a'] - log_prob_old_batch[sample_idx])
                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(ratio, 1.0 - self.cliprange,
                                    1.0 + self.cliprange) * adv_batch[sample_idx]
                policy_loss = -torch.min(surr1, surr2).mean()

                critic_ext_loss = F.mse_loss(traj_info['v_ext'],
                                             target_ext_batch[sample_idx])
                critic_int_loss = F.mse_loss(traj_info['v_int'],
                                             target_int_batch[sample_idx])
                value_loss = critic_ext_loss + critic_int_loss

                entropy = traj_info['ent'].mean()

                self.optimizer.zero_grad()
                loss = policy_loss + 0.5 * value_loss - self.ent_coef * entropy + forward_loss
                loss.backward()
                if self.max_grad_norm:
                    nn.utils.clip_grad_norm_(
                        list(self.model.parameters()) +
                        list(self.rnd.predictor.parameters()),
                        self.max_grad_norm)
                self.optimizer.step()

                _data = dict(policy=policy_loss.data.cpu().numpy(),
                             value_ext=critic_ext_loss.data.cpu().numpy(),
                             value_int=critic_int_loss.data.cpu().numpy(),
                             entropy=entropy.data.cpu().numpy(),
                             forward=forward_loss.data.cpu().numpy())
                for k, v in _data.items():
                    loss_infos[k].append(v)

        return loss_infos
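# ---------------------------------------------------------------------------
# Editorial note: `RNDModel` is used throughout this section (rnd.predictor,
# rnd.target, and `self.rnd(next_obs)` returning a (predictor, target)
# feature pair) but defined elsewhere. A minimal sketch follows, assuming the
# fixed random target and trainable predictor CNNs from the RND paper (Burda
# et al., 2018), each mapping one normalized 84x84 frame to a 512-d feature.
# The repo's constructor also takes input/output sizes (and a seed); they are
# accepted but unused here, and the paper's extra fully-connected layers on
# the predictor are omitted, so treat this as an illustration only.
import torch.nn as nn

class RNDModel(nn.Module):
    def __init__(self, input_size=None, output_size=None, feature_size=512):
        super().__init__()

        def cnn():
            # 1x84x84 -> 32x20x20 -> 64x9x9 -> 64x7x7 -> feature_size
            return nn.Sequential(
                nn.Conv2d(1, 32, kernel_size=8, stride=4), nn.LeakyReLU(),
                nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.LeakyReLU(),
                nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.LeakyReLU(),
                nn.Flatten(),
                nn.Linear(64 * 7 * 7, feature_size),
            )

        self.predictor = cnn()
        self.target = cnn()
        # the randomly initialized target stays fixed; only the predictor trains
        for p in self.target.parameters():
            p.requires_grad = False

    def forward(self, next_obs):
        return self.predictor(next_obs), self.target(next_obs)
# ---------------------------------------------------------------------------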
def main():
    if 'NAME' in os.environ.keys():
        NAME = os.environ['NAME']
    else:
        raise ValueError('set NAME via env variable')

    try:
        env_settings = json.load(open(default_config['CarIntersectConfigPath'], 'r'))
    except json.JSONDecodeError:
        env_settings = yaml.safe_load(open(default_config['CarIntersectConfigPath'], 'r'))

    if 'home-test' not in NAME:
        wandb.init(
            project='CarRacing_RND',
            reinit=True,
            name=f'rnd_{NAME}',
            config={'env_config': env_settings, 'agent_config': default_config},
        )

    # print({section: dict(config[section]) for section in config.sections()})

    train_method = default_config['TrainMethod']
    env_id = default_config['EnvID']
    # env_type = default_config['EnvType']
    # if env_type == 'mario':
    #     env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    # elif env_type == 'atari':
    #     env = gym.make(env_id)
    # else:
    #     raise NotImplementedError

    seed = np.random.randint(0, 2 ** 16 - 1)

    print(f'use name : {NAME}')
    print(f"use env config : {default_config['CarIntersectConfigPath']}")
    print(f'use seed : {seed}')
    print(f"use device : {os.environ['DEVICE']}")

    os.chdir('..')
    env = makeCarIntersect(env_settings)
    eval_env = create_eval_env(makeCarIntersect(env_settings))

    input_size = env.observation_space.shape
    assert isinstance(env.action_space, gym.spaces.Box)
    action_size = env.action_space.shape[0]

    env.close()

    is_load_model = True
    is_render = False
    # model_path = 'models/{}.model'.format(NAME)
    # predictor_path = 'models/{}.pred'.format(NAME)
    # target_path = 'models/{}.target'.format(NAME)

    # writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')
    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])
    num_step = int(default_config['NumStep'])
    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])
    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent(
        input_size,
        action_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        device=os.environ['DEVICE'],
    )

    # if is_load_model:
    #     print('load model...')
    #     if use_cuda:
    #         agent.model.load_state_dict(torch.load(model_path))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
    #         agent.rnd.target.load_state_dict(torch.load(target_path))
    #     else:
    #         agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu'))
    #         agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu'))
    #     print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(env_id, is_render, idx, child_conn,
                                sticky_action=sticky_action, p=action_prob,
                                life_done=life_done, settings=env_settings)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    os.chdir('rnd_continues')

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    logger = Logger(None, use_console=True, use_wandb=True, log_interval=1)

    print('Test evaluater:')
    evaluate_and_log(
        eval_env=eval_env,
        action_get_method=lambda eval_state: agent.get_action(
            np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
        )[0][0].cpu().numpy(),
        logger=logger,
        log_animation=False,
        exp_class='RND',
        exp_name=NAME,
        debug=True,
    )
    print('end evaluater test.')

    # normalize obs
    print('Initializing the observation normalization parameters...')
    # print('ALERT! pass section')
    # assert 'home-test' in NAME
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.uniform(-1, 1, size=(num_worker, action_size))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)
        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))
        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('Finished initializing the observation normalization parameters.')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_next_obs, total_ext_values, total_int_values, \
            total_policy_log_prob, total_policy_log_prob_np = \
            [], [], [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout
        for _ in range(num_step):
            global_step += num_worker
            # actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)
            actions, value_ext, value_int, policy_log_prob = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action.cpu().numpy())

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions.cpu().numpy())
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            # total_policy.append(policy)
            # total_policy_np.append(policy.cpu().numpy())
            total_policy_log_prob.extend(policy_log_prob.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]
            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                # writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                # writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                # writer.add_scalar('data/step', sample_step, sample_episode)
                logger.log_it({
                    'reward_per_episode': sample_rall,
                    'intrinsic_reward': sample_i_rall,
                    'episode_steps': sample_step,
                    'global_step_cnt': global_step,
                    'updates_cnt': global_update,
                })
                logger.publish_logs(step=global_step)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        # total_action = np.stack(total_action).transpose().reshape([-1, action_size])
        total_action = np.array(total_action).reshape((-1, action_size))
        # total_log_prob_old = np.array(total_policy_log_prob).reshape((-1))
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        # total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array(
            [discounted_reward.update(reward_per_step)
             for reward_per_step in total_int_reward.T])
        mean, std, count = (np.mean(total_reward_per_env),
                            np.std(total_reward_per_env),
                            len(total_reward_per_env))
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        # writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        # writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging max action probability
        # writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward, total_done, total_ext_values,
                                              gamma, num_step, num_worker)

        # intrinsic reward calculation (non-episodic)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, int_gamma,
                                              num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalization parameters
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        global_update += 1
        # Step 5. training!
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target,
                          total_action, total_adv,
                          ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy_log_prob)

        # if global_step % (num_worker * num_step * 100) == 0:
        #     print('Now Global Step :{}'.format(global_step))
        #     torch.save(agent.model.state_dict(), model_path)
        #     torch.save(agent.rnd.predictor.state_dict(), predictor_path)
        #     torch.save(agent.rnd.target.state_dict(), target_path)

        if global_update % 100 == 0:
            evaluate_and_log(
                eval_env=eval_env,
                action_get_method=lambda eval_state: agent.get_action(
                    np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
                )[0][0].cpu().numpy(),
                logger=logger,
                log_animation=True,
                exp_class='RND',
                exp_name=NAME,
            )
            logger.publish_logs(step=global_step)
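# ---------------------------------------------------------------------------
# Editorial note: every script in this section repeats the same random-rollout
# warm-up to seed obs_rms before training. A hedged refactor into a shared
# helper follows (hypothetical name and signature, matching the worker-pipe
# protocol used above; `sample_actions` would be a closure over
# np.random.randint for the discrete scripts and np.random.uniform for the
# continuous CarIntersect one). A sketch, not part of the repo's sources.
import numpy as np

def prewarm_obs_rms(parent_conns, obs_rms, sample_actions, num_steps, flush_every):
    buffer = []
    for _ in range(num_steps):
        for conn, action in zip(parent_conns, sample_actions()):
            conn.send(action)
        for conn in parent_conns:
            state, _, _, _, _ = conn.recv()
            buffer.append(state[3, :, :].reshape([1, 84, 84]))  # newest frame only
        if len(buffer) >= flush_every:
            obs_rms.update(np.stack(buffer))
            buffer = []
    if buffer:  # flush any remainder instead of silently dropping it
        obs_rms.update(np.stack(buffer))

# Example usage for the discrete scripts (names assumed from the code above):
# prewarm_obs_rms(parent_conns, obs_rms,
#                 lambda: np.random.randint(0, output_size, size=(num_worker,)),
#                 num_step * pre_obs_norm_step, num_step * num_worker)
# ---------------------------------------------------------------------------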