Example no. 1
0
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    seed = np.random.randint(0, 100)

    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed,
                               retro=True, config={'total-floors': 12}, greyscale=True, timeout_wait=300)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    input_size = env.observation_space.shape
    output_size = env.action_space.n  # 12 actions after flattening [2, 3, 2, 1]

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()  # log_dir=args.log_dir

    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr)
   
    if args.load_model:
        "Loading model..."
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))


    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0   # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # normalize observation
        print('Initializing observation normalization...')
        next_obs = []
        for step in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            for parent_conn in parent_conns:
                next_state, reward, done, realdone, log_reward = parent_conn.recv()
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []
        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_rms, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_rms, f)

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(model, device, np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv()
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
            intrinsic_reward = compute_intrinsic_reward(rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # calculate extrinsic reward targets and advantages
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              args.ext_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # calculate intrinsic reward targets and advantages
        # non-episodic: done flags are zeroed out
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              args.int_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        train_model(args, device, output_size, model, rnd, optimizer,
                        np.float32(total_state) / 255., ext_target, int_target, total_action,
                        total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                        total_action_probs)

        if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
            print('Now Global Step: {}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)

            """
            checkpoint_list = np.array([int(re.search(r"\d+(\.\d+)?", x)[0]) for x in glob.glob(os.path.join('trained_models', args.env_name+'*.model'))])
            if len(checkpoint_list) == 0:
                last_checkpoint = -1
            else:
                last_checkpoint = checkpoint_list.max()
            next_checkpoint = last_checkpoint + 1
            print("Latest Checkpoint is #{}, saving checkpoint is #{}.".format(last_checkpoint, next_checkpoint))

            incre_model_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.model')
            incre_predictor_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.pred')
            incre_target_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.target')
            with open(incre_model_path, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(incre_predictor_path, 'wb') as f:
                torch.save(rnd.predictor.state_dict(), f)
            with open(incre_target_path, 'wb') as f:
                torch.save(rnd.target.state_dict(), f)
            """
            if args.terminate and (global_step > args.terminate_steps):
                with open('reward_rms.pkl', 'wb') as f:
                    dill.dump(reward_rms, f)
                with open('obs_rms.pkl', 'wb') as f:
                    dill.dump(obs_rms, f)
                break
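Example no. 1 (and the examples that follow) call RewardForwardFilter and RunningMeanStd without showing them; these are also the objects serialized to reward_rms.pkl and obs_rms.pkl above. A minimal sketch of what they are assumed to look like, following the usual RND reference code:

import numpy as np


class RewardForwardFilter:
    """Discounted running sum of intrinsic rewards, one entry per worker."""
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems


class RunningMeanStd:
    """Running mean/variance using the parallel-moments update."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, 'float64')
        self.var = np.ones(shape, 'float64')
        self.count = epsilon

    def update(self, x):
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m2 = (self.var * self.count + batch_var * batch_count
              + np.square(delta) * self.count * batch_count / tot_count)
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count

The per-worker discounted sums produced by RewardForwardFilter feed reward_rms.update_from_moments, which in turn normalizes the intrinsic rewards in Step 2 of the rollout loop.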
Example no. 2
0
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = gym.make(args.env_name)

    input_size = env.observation_space.shape
    output_size = env.action_space.n

    if 'Breakout' in args.env_name:
        output_size -= 1

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    predictor_path = os.path.join(args.save_dir, args.env_name + '.pred')
    target_path = os.path.join(args.save_dir, args.env_name + '.target')

    writer = SummaryWriter(log_dir=args.log_dir)

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) +
                           list(rnd.predictor.parameters()),
                           lr=args.lr)

    if args.load_model:
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(args.env_name,
                                is_render,
                                idx,
                                child_conn,
                                sticky_action=args.sticky_action,
                                p=args.sticky_action_prob,
                                max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize observation
    print('Initializing observation normalization...')
    next_obs = []
    for step in range(args.num_step * args.pre_obs_norm_steps):
        actions = np.random.randint(0, output_size, size=(args.num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            next_state, reward, done, realdone, log_reward = parent_conn.recv()
            next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (args.num_step * args.num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(
                model, device,
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv(
                )
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device,
                                                np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / args.num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / args.num_worker,
                          global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob',
                          total_logging_action_probs.max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # calculate extrinsic reward targets and advantages
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, args.ext_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # calculate intrinsic reward targets and advantages
        # non-episodic: done flags are zeroed out
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, args.int_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target,
                    total_action, total_adv,
                    ((total_next_obs - obs_rms.mean) /
                     np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs)

        if global_step % (args.num_worker * args.num_step *
                          args.save_interval) == 0:
            print('Now Global Step: {}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)
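Examples no. 1 and 2 also rely on get_action and compute_intrinsic_reward, which are defined elsewhere. A plausible sketch, assuming CnnActorCriticNetwork returns (policy logits, extrinsic value, intrinsic value) and mirroring Example no. 5's get_intrinsic_rewards:

import numpy as np
import torch
import torch.nn.functional as F


def get_action(model, device, state):
    # state: [num_worker, 4, 84, 84], already scaled to [0, 1]
    state = torch.Tensor(state).float().to(device)
    policy, value_ext, value_int = model(state)
    action_prob = F.softmax(policy, dim=-1).detach().cpu().numpy()
    # sample one action per worker from the categorical policy
    actions = np.array([np.random.choice(len(p), p=p) for p in action_prob])
    return (actions,
            value_ext.detach().cpu().numpy().squeeze(),
            value_int.detach().cpu().numpy().squeeze(),
            action_prob)


def compute_intrinsic_reward(rnd, device, next_obs):
    # squared error of the trainable predictor against the frozen target network
    next_obs = torch.FloatTensor(next_obs).to(device)
    target_feature = rnd.target(next_obs)
    predict_feature = rnd.predictor(next_obs)
    intrinsic_reward = (target_feature - predict_feature).pow(2).sum(1) / 2
    return intrinsic_reward.detach().cpu().numpy()

The intrinsic reward is therefore large for observations the predictor has rarely seen, which is the exploration signal RND is built on.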
Example no. 3
0
def main():

    args = parse_arguments()

    train_method = args.train_method
    env_id = args.env_id
    env_type = args.env_type

    if env_type == 'atari':
        env = gym.make(env_id)
        input_size = env.observation_space.shape 
        output_size = env.action_space.n 
        env.close()
    else:
        raise NotImplementedError

    is_load_model = False
    is_render = False
    os.makedirs('models', exist_ok=True)
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    results_dir = os.path.join('outputs', args.env_id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    writer = SummaryWriter(os.path.join(results_dir, 'tensorboard', args.env_id))   

    use_cuda = args.use_gpu
    use_gae = args.use_gae
    use_noisy_net = args.use_noisynet
    lam = args.lam
    num_worker = args.num_worker
    num_step = args.num_step
    ppo_eps = args.ppo_eps
    epoch = args.epoch
    mini_batch = args.minibatch 
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = args.learning_rate
    entropy_coef = args.entropy
    gamma = args.gamma
    int_gamma = args.int_gamma
    clip_grad_norm = args.clip_grad_norm
    ext_coef = args.ext_coef
    int_coef = args.int_coef
    sticky_action = args.sticky_action
    action_prob = args.action_prob
    life_done = args.life_done
    pre_obs_norm_step = args.obs_norm_step

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(int_gamma)

    if args.train_method == 'RND':
        agent = RNDAgent
    else:
        raise NotImplementedError

    if args.env_type == 'atari':
        env_type = AtariEnvironment
    else:
        raise NotImplementedError

    agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net
    )

    logger.info('Initializing workers')
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id, is_render, idx, child_conn, 
            sticky_action=sticky_action, p=action_prob, life_done=life_done, 
            max_step_per_episode=args.max_step_per_episode)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    logger.info('Initializing observation normalization parameters...')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    logger.info('Observation normalization initialized.')

    pbar = tqdm.tqdm(total=args.total_frames)
    while True:
        logger.info('Iteration: {}'.format(global_update))
        total_state, total_reward, total_done, total_next_state, \
            total_action, total_int_reward, total_next_obs, total_ext_values, \
            total_int_values, total_policy, total_policy_np = \
            [], [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = \
                [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/returns_vs_frames', sample_rall, global_step)
                writer.add_scalar('data/lengths_vs_frames', sample_step, global_step)
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)
        
        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)

        # logging Max action probability
        writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # calculate extrinsic reward targets and advantages
        ext_target, ext_adv = make_train_data(total_reward, total_done, 
            total_ext_values, gamma, num_step, num_worker)

        # calculate intrinsic reward targets and advantages
        # non-episodic: done flags are zeroed out
        int_target, int_adv = make_train_data(total_int_reward, np.zeros_like(total_int_reward),
            total_int_values, int_gamma, num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)

        # Step 5. Training!
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action,
                          total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy)

        if args.save_models and global_update % 1000 == 0:
            torch.save(agent.model.state_dict(), 'models/{}-{}.model'.format(env_id, global_update))
            logger.info('Now Global Step: {}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)

        pbar.update(num_worker * num_step)
        if global_step >= args.total_frames:
            break

    pbar.close()
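make_train_data, shared by Examples no. 1-3, is another helper that is not shown. The sketch below matches the call sites of Examples no. 1 and 2, where reward and done arrive as [num_worker, num_step] arrays and value as [num_worker, num_step + 1]; Example no. 3 passes a shorter argument list, presumably fixing lambda and the GAE switch internally. This is a sketch under those assumptions, not the authors' exact implementation:

import numpy as np


def make_train_data(reward, done, value, gamma, gae_lambda,
                    num_step, num_worker, use_gae):
    discounted_return = np.empty([num_worker, num_step])
    if use_gae:
        gae = np.zeros(num_worker)
        for t in range(num_step - 1, -1, -1):
            delta = (reward[:, t]
                     + gamma * value[:, t + 1] * (1 - done[:, t])
                     - value[:, t])
            gae = delta + gamma * gae_lambda * (1 - done[:, t]) * gae
            discounted_return[:, t] = gae + value[:, t]
    else:
        running_add = value[:, -1]
        for t in range(num_step - 1, -1, -1):
            running_add = reward[:, t] + gamma * running_add * (1 - done[:, t])
            discounted_return[:, t] = running_add
    adv = discounted_return - value[:, :-1]
    # flatten to [num_worker * num_step] to line up with the reshaped states
    return discounted_return.reshape([-1]), adv.reshape([-1])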
Example no. 4
0
    def __init__(self, num_training_steps, num_env, num_game_steps, num_epoch,
                 learning_rate, discount_factor, int_discount_factor,
                 num_action, value_coef, clip_range, save_interval,
                 entropy_coef, lam, mini_batch_num, num_action_repeat,
                 load_path, ext_adv_coef, int_adv_coef, num_pre_norm_steps,
                 predictor_update_proportion):
        self.training_steps = num_training_steps
        self.num_epoch = num_epoch
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.num_game_steps = num_game_steps
        self.num_env = num_env
        self.batch_size = num_env * num_game_steps
        self.clip_range = clip_range
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.mini_batch_num = mini_batch_num
        self.num_action = num_action
        self.num_pre_norm_steps = num_pre_norm_steps
        self.int_discount_factor = int_discount_factor
        self.predictor_update_proportion = predictor_update_proportion

        assert self.batch_size % self.mini_batch_num == 0
        self.mini_batch_size = int(self.batch_size / self.mini_batch_num)
        self.current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = 'logs/' + self.current_time + '/log'
        self.save_interval = save_interval
        self.lam = lam

        self.num_action_repeat = num_action_repeat
        self.clip_range = clip_range

        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.load_path = load_path

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.new_model = Model(self.num_action).to(self.device)

        self.ext_adv_coef = ext_adv_coef
        self.int_adv_coef = int_adv_coef
        self.writer = SummaryWriter(log_dir)
        print("-----------------------------------------")
        print("program configuration")
        print("time: ", self.current_time)
        print("number of train steps: ", self.training_steps)
        print("normilization steps parameter: ", self.num_pre_norm_steps)
        print("num_env: ", self.num_env)
        print("number of epochs: ", self.num_epoch)
        print("steps: ", self.num_game_steps)
        print("mini batch: ", self.mini_batch_size)
        print("lr: ", self.learning_rate)
        print("gamma: ", self.discount_factor)
        print("intrinsic gamma: ", self.int_discount_factor)
        print("lambda: ", self.lam)
        print("clip: ", self.clip_range)
        print("v_coef: ", self.value_coef)
        print("ent_coef: ", self.entropy_coef)
        print("the predictor's update proportion: ",
              self.predictor_update_proportion)

        print("intrinsic advantages coefficient: ", self.int_adv_coef)
        print("extrinsic advantages coefficient: ", self.ext_adv_coef)
        print("-----------------------------------------")

        self.target_model = TargetModel().to(self.device)
        self.predictor_model = PredictorModel().to(self.device)
        self.mse_loss = nn.MSELoss()
        self.predictor_mse_loss = nn.MSELoss(reduction='none')
        self.optimizer = optim.Adam(list(self.new_model.parameters()) +
                                    list(self.predictor_model.parameters()),
                                    lr=self.learning_rate)

        self.reward_rms = RunningStdMean()
        self.obs_rms = RunningStdMean(shape=(1, 1, 84, 84))
        self.reward_filter = RewardForwardFilter(self.int_discount_factor)
Example no. 5
0
class Trainer:
    def __init__(self, num_training_steps, num_env, num_game_steps, num_epoch,
                 learning_rate, discount_factor, int_discount_factor,
                 num_action, value_coef, clip_range, save_interval,
                 entropy_coef, lam, mini_batch_num, num_action_repeat,
                 load_path, ext_adv_coef, int_adv_coef, num_pre_norm_steps,
                 predictor_update_proportion):
        self.training_steps = num_training_steps
        self.num_epoch = num_epoch
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.num_game_steps = num_game_steps
        self.num_env = num_env
        self.batch_size = num_env * num_game_steps
        self.clip_range = clip_range
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.mini_batch_num = mini_batch_num
        self.num_action = num_action
        self.num_pre_norm_steps = num_pre_norm_steps
        self.int_discount_factor = int_discount_factor
        self.predictor_update_proportion = predictor_update_proportion

        assert self.batch_size % self.mini_batch_num == 0
        self.mini_batch_size = int(self.batch_size / self.mini_batch_num)
        self.current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        log_dir = 'logs/' + self.current_time + '/log'
        self.save_interval = save_interval
        self.lam = lam

        self.num_action_repeat = num_action_repeat
        self.clip_range = clip_range

        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.load_path = load_path

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.new_model = Model(self.num_action).to(self.device)

        self.ext_adv_coef = ext_adv_coef
        self.int_adv_coef = int_adv_coef
        self.writer = SummaryWriter(log_dir)
        print("-----------------------------------------")
        print("program configuration")
        print("time: ", self.current_time)
        print("number of train steps: ", self.training_steps)
        print("normilization steps parameter: ", self.num_pre_norm_steps)
        print("num_env: ", self.num_env)
        print("number of epochs: ", self.num_epoch)
        print("steps: ", self.num_game_steps)
        print("mini batch: ", self.mini_batch_size)
        print("lr: ", self.learning_rate)
        print("gamma: ", self.discount_factor)
        print("intrinsic gamma: ", self.int_discount_factor)
        print("lambda: ", self.lam)
        print("clip: ", self.clip_range)
        print("v_coef: ", self.value_coef)
        print("ent_coef: ", self.entropy_coef)
        print("the predictor's update proportion: ",
              self.predictor_update_proportion)

        print("intrinsic advantages coefficient: ", self.int_adv_coef)
        print("extrinsic advantages coefficient: ", self.ext_adv_coef)
        print("-----------------------------------------")

        self.target_model = TargetModel().to(self.device)
        self.predictor_model = PredictorModel().to(self.device)
        self.mse_loss = nn.MSELoss()
        self.predictor_mse_loss = nn.MSELoss(reduction='none')
        self.optimizer = optim.Adam(list(self.new_model.parameters()) +
                                    list(self.predictor_model.parameters()),
                                    lr=self.learning_rate)

        self.reward_rms = RunningStdMean()
        self.obs_rms = RunningStdMean(shape=(1, 1, 84, 84))
        self.reward_filter = RewardForwardFilter(self.int_discount_factor)

    def collect_experiance_and_train(self):
        start_train_step = 0
        sample_episode_num = 0

        if flag.LOAD:
            if self.device.type == "cpu":
                checkpoint = torch.load(self.load_path,
                                        map_location=self.device)
            else:
                checkpoint = torch.load(self.load_path)

            self.new_model.load_state_dict(checkpoint['new_model_state_dict'])
            self.predictor_model.load_state_dict(
                checkpoint['predictor_state_dict'])
            self.target_model.load_state_dict(checkpoint['target_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            start_train_step = checkpoint['train_step']
            sample_episode_num = checkpoint['ep_num']
            self.obs_rms.mean = checkpoint['obs_mean']
            self.obs_rms.var = checkpoint['obs_var']
            self.obs_rms.count = checkpoint['obs_count']
            self.reward_rms.mean = checkpoint['rew_mean']
            self.reward_rms.var = checkpoint['rew_var']
            self.reward_rms.count = checkpoint['rew_count']
            self.reward_filter.rewems = checkpoint['rewems']
            print("loaded model weights from checkpoint")

        current_observations = []
        parents = []
        childs = []
        envs = []

        for i in range(self.num_env):
            parent, child = Pipe()
            if flag.ENV == "MR":
                new_env = montezuma_revenge_env \
                          .MontezumaRevenge(i, child,
                                            self.num_action_repeat,
                                            0.25, 6000)
            else:
                raise NotImplementedError
            new_env.start()
            envs.append(new_env)
            parents.append(parent)
            childs.append(child)
        if flag.LOAD:

            actions = np.random.randint(0,
                                        self.num_action,
                                        size=(self.num_env))

            for i in range(0, len(parents)):
                parents[i].send(actions[i])
            current_observations = []
            for i in range(0, len(parents)):
                obs, rew, done = parents[i].recv()
                current_observations.append(obs)
        else:
            # normalize observations

            observations_to_normalize = []
            for step in range(self.num_game_steps * self.num_pre_norm_steps):

                actions = np.random.randint(0,
                                            self.num_action,
                                            size=(self.num_env))

                for i in range(0, len(parents)):
                    parents[i].send(actions[i])
                current_observations = []
                for i in range(0, len(parents)):
                    obs, rew, done = parents[i].recv()
                    current_observations.append(obs)
                observations_to_normalize.extend(current_observations)
                if (len(observations_to_normalize) %
                    (self.num_game_steps * self.num_env) == 0):
                    observations_to_normalize = np.stack(
                        observations_to_normalize)[:, 3, :, :].reshape(
                            -1, 1, 84, 84)
                    self.obs_rms.update(observations_to_normalize)
                    observations_to_normalize = []
            print("normalization ended")

        sample_ext_reward = 0
        sample_int_reward = 0

        for train_step in range(start_train_step, self.training_steps):

            total_observations = []
            total_int_rewards = []
            total_ext_rewards = []
            total_dones = []
            total_int_values = []
            total_ext_values = []
            total_actions = []

            for game_step in range(self.num_game_steps):

                total_observations.extend(current_observations)

                with torch.no_grad():
                    current_observations_tensor = torch.from_numpy(
                        np.array(current_observations)).float().to(self.device)
                    decided_actions, predicted_ext_values, \
                        predicted_int_values \
                        = self.new_model.step(
                                             current_observations_tensor / 255.
                                             )
                    one_channel_observations = np.array(
                        current_observations)[:,
                                              3, :, :].reshape(-1, 1, 84, 84)
                    one_channel_observations = (
                        (one_channel_observations - self.obs_rms.mean) /
                        np.sqrt(self.obs_rms.var)).clip(-5, 5)
                    one_channel_observations_tensor = torch.from_numpy(
                        one_channel_observations).float().to(self.device)
                    int_reward = self.get_intrinsic_rewards(
                        one_channel_observations_tensor)
                    total_int_rewards.append(int_reward)

                total_int_values.append(predicted_int_values)
                total_ext_values.append(predicted_ext_values)
                total_actions.extend(decided_actions)

                current_observations = []
                for i in range(0, len(parents)):
                    parents[i].send(decided_actions[i])

                step_rewards = []
                step_dones = []
                for i in range(0, len(parents)):
                    observation, reward, done = parents[i].recv()
                    current_observations.append(observation)
                    step_rewards.append(reward)
                    step_dones.append(done)
                sample_ext_reward += step_rewards[0]
                sample_int_reward += int_reward[0]

                if step_dones[0]:
                    self.writer.add_scalar(
                        'ext_reward_per_episode_for_one_env',
                        sample_ext_reward, sample_episode_num)
                    self.writer.add_scalar(
                        'int_reward_per_episode_for_one_env',
                        sample_int_reward, sample_episode_num)
                    sample_ext_reward = 0
                    sample_int_reward = 0
                    sample_episode_num += 1

                total_ext_rewards.append(step_rewards)
                total_dones.append(step_dones)
            # next state value, required for computing advantages
            with torch.no_grad():
                current_observations_tensor = torch.from_numpy(
                    np.array(current_observations)).float().to(self.device)
                decided_actions, predicted_ext_values, predicted_int_values = \
                    self.new_model.step(
                        current_observations_tensor / 255.)

            total_int_values.append(predicted_int_values)
            total_ext_values.append(predicted_ext_values)

            # convert lists to numpy arrays
            observations_array = np.array(total_observations)
            total_one_channel_observations_array = (
                observations_array[:, 3, :, :].reshape(-1, 1, 84, 84))

            self.obs_rms.update(total_one_channel_observations_array)
            total_one_channel_observations_array \
                = ((total_one_channel_observations_array - self.obs_rms.mean)
                    / np.sqrt(
                             self.obs_rms.var)).clip(-5, 5)
            ext_rewards_array = np.array(total_ext_rewards).clip(-1, 1)

            dones_array = np.array(total_dones)
            ext_values_array = np.array(total_ext_values)
            int_values_array = np.array(total_int_values)
            actions_array = np.array(total_actions)
            int_rewards_array = np.stack(total_int_rewards)

            total_reward_per_env = np.array([
                self.reward_filter.update(reward_per_env)
                for reward_per_env in int_rewards_array.T
            ])  # calculating returns for every env

            mean, std, count = np.mean(total_reward_per_env), np.std(
                total_reward_per_env), len(total_reward_per_env)
            self.reward_rms.update_from_mean_std(mean, std**2, count)

            # normalize intrinsic reward
            int_rewards_array /= np.sqrt(self.reward_rms.var)
            self.writer.add_scalar(
                'avg_int_reward_per_train_step_for_all_envs',
                np.sum(int_rewards_array) / self.num_env, train_step)
            self.writer.add_scalar('int_reward_for_one_env_per_train_step',
                                   int_rewards_array.T[0].mean(), train_step)

            ext_advantages_array, ext_returns_array = self.compute_advantage(
                ext_rewards_array, ext_values_array, dones_array, 0)
            int_advantages_array, int_returns_array = self.compute_advantage(
                int_rewards_array, int_values_array, dones_array, 1)

            advantages_array = self.ext_adv_coef * ext_advantages_array \
                                                 + self.int_adv_coef \
                                                 * int_advantages_array

            if flag.DEBUG:
                print("all actions are", total_actions)

            observations_tensor = torch.from_numpy(
                np.array(observations_array)).float().to(self.device)
            observations_tensor = observations_tensor / 255.
            ext_returns_tensor = torch.from_numpy(
                np.array(ext_returns_array)).float().to(self.device)
            int_returns_tensor = torch.from_numpy(
                np.array(int_returns_array)).float().to(self.device)
            actions_tensor = torch.from_numpy(
                np.array(actions_array)).long().to(self.device)
            advantages_tensor = torch.from_numpy(
                np.array(advantages_array)).float().to(self.device)
            one_channel_observations_tensor = torch.from_numpy(
                total_one_channel_observations_array).float().to(self.device)

            random_indexes = np.arange(self.batch_size)
            np.random.shuffle(random_indexes)

            with torch.no_grad():
                old_policy, _, _ = self.new_model(observations_tensor)
                dist_old = Categorical(F.softmax(old_policy, dim=1))
                old_log_prob = dist_old.log_prob(actions_tensor)

            loss_avg = []
            policy_loss_avg = []
            value_loss_avg = []
            entropy_avg = []
            predictor_loss_avg = []

            for epoch in range(0, self.num_epoch):
                # print("----------------next epoch----------------")

                for n in range(0, self.mini_batch_num):
                    # print("----------------next mini batch-------------")
                    start_index = n * self.mini_batch_size
                    index_slice = random_indexes[start_index:start_index +
                                                 self.mini_batch_size]
                    if flag.DEBUG:
                        print("indexed chosen are:", index_slice)

                    experience_slice = (arr[index_slice] for arr in (
                        observations_tensor, ext_returns_tensor,
                        int_returns_tensor, actions_tensor, advantages_tensor,
                        one_channel_observations_tensor))

                    loss, policy_loss, value_loss, predictor_loss, entropy \
                        = self.train_model(
                                           *experience_slice,
                                           old_log_prob[index_slice]
                                           )

                    if epoch == self.num_epoch - 1:
                        loss = loss.detach().cpu().numpy()
                        policy_loss = policy_loss.detach().cpu().numpy()
                        predictor_loss = predictor_loss.detach().cpu().numpy()
                        value_loss = value_loss.detach().cpu().numpy()
                        entropy = entropy.detach().cpu().numpy()
                        loss_avg.append(loss)
                        policy_loss_avg.append(policy_loss)
                        value_loss_avg.append(value_loss)
                        entropy_avg.append(entropy)
                        predictor_loss_avg.append(predictor_loss)

            loss_avg_result = np.array(loss_avg).mean()
            policy_loss_avg_result = np.array(policy_loss_avg).mean()
            value_loss_avg_result = np.array(value_loss_avg).mean()
            entropy_avg_result = np.array(entropy_avg).mean()
            predictor_loss_avg_result = np.array(predictor_loss_avg).mean()
            print(
                "training step {:03d}, Epoch {:03d}: Loss: {:.3f}, policy loss"
                ": {:.3f}, value loss: {:.3f},predictor loss: {:.3f},"
                " entropy: {:.3f} ".format(train_step, epoch, loss_avg_result,
                                           policy_loss_avg_result,
                                           value_loss_avg_result,
                                           predictor_loss_avg_result,
                                           entropy_avg_result))

            if flag.TENSORBOARD_AVALAIBLE:
                self.writer.add_scalar('loss_avg', loss_avg_result, train_step)
                self.writer.add_scalar('policy_loss_avg',
                                       policy_loss_avg_result, train_step)
                self.writer.add_scalar('value_loss_avg', value_loss_avg_result,
                                       train_step)
                self.writer.add_scalar('predictor_loss_avg',
                                       predictor_loss_avg_result, train_step)
                self.writer.add_scalar('entropy_avg', entropy_avg_result,
                                       train_step)

            if train_step % self.save_interval == 0:
                train_checkpoint_dir = 'logs/' + self.current_time + str(
                    train_step)

                torch.save(
                    {
                        'train_step': train_step,
                        'new_model_state_dict': self.new_model.state_dict(),
                        'predictor_state_dict':
                        self.predictor_model.state_dict(),
                        'target_state_dict': self.target_model.state_dict(),
                        'optimizer_state_dict': self.optimizer.state_dict(),
                        'obs_mean': self.obs_rms.mean,
                        'obs_var': self.obs_rms.var,
                        'obs_count': self.obs_rms.count,
                        'rew_mean': self.reward_rms.mean,
                        'rew_var': self.reward_rms.var,
                        'rew_count': self.reward_rms.count,
                        'rewems': self.reward_filter.rewems,
                        'ep_num': sample_episode_num
                    }, train_checkpoint_dir)

    def compute_advantage(self, rewards, values, dones, int_flag=0):

        if flag.DEBUG:
            print("---------computing advantage---------")
            print("rewards are", rewards)
            print("values from steps are", values)
        if int_flag == 1:
            discount_factor = self.int_discount_factor
        else:
            discount_factor = self.discount_factor
        advantages = []
        advantage = 0
        for step in reversed(range(self.num_game_steps)):

            if int_flag == 1:
                is_there_a_next_state = 1
            else:
                is_there_a_next_state = 1.0 - dones[step]
            delta = rewards[step] + (is_there_a_next_state * discount_factor *
                                     values[step + 1]) - values[step]
            if flag.USE_GAE:
                advantage = delta + discount_factor * \
                            self.lam * is_there_a_next_state * advantage
                advantages.append(advantage)
            else:
                advantages.append(delta)
        advantages.reverse()

        advantages = np.array(advantages)
        advantages = advantages.flatten()
        values = values[:-1]
        returns = advantages + values.flatten()
        if flag.DEBUG:
            print("all advantages are", advantages)
            print("all returns are", returns)
        return advantages, returns

    def train_model(self, observations_tensor, ext_returns_tensor,
                    int_returns_tensor, actions_tensor, advantages_tensor,
                    one_channel_observations_tensor, old_log_prob):

        if flag.DEBUG:
            print("input observations shape", observations_tensor.shape)
            print("ext returns shape", ext_returns_tensor.shape)
            print("int returns shape", int_returns_tensor.shape)
            print("input actions shape", actions_tensor.shape)
            print("input advantages shape", advantages_tensor.shape)
            print("one channel observations",
                  one_channel_observations_tensor.shape)

        self.new_model.train()
        self.predictor_model.train()
        target_value = self.target_model(one_channel_observations_tensor)
        predictor_value = self.predictor_model(one_channel_observations_tensor)
        predictor_loss = self.predictor_mse_loss(predictor_value,
                                                 target_value).mean(-1)

        mask = torch.rand(len(predictor_loss)).to(self.device)
        mask = (mask < self.predictor_update_proportion).type(
            torch.FloatTensor).to(self.device)
        predictor_loss = (predictor_loss * mask).sum() / torch.max(
            mask.sum(),
            torch.Tensor([1]).to(self.device))
        new_policy, ext_new_values, int_new_values = self.new_model(
            observations_tensor)
        ext_value_loss = self.mse_loss(ext_new_values, ext_returns_tensor)
        int_value_loss = self.mse_loss(int_new_values, int_returns_tensor)
        value_loss = ext_value_loss + int_value_loss
        softmax_policy = F.softmax(new_policy, dim=1)
        new_dist = Categorical(softmax_policy)
        new_log_prob = new_dist.log_prob(actions_tensor)

        ratio = torch.exp(new_log_prob - old_log_prob)

        clipped_policy_loss = torch.clamp(ratio, 1.0 - self.clip_range,
                                          1 + self.clip_range) \
                                          * advantages_tensor
        policy_loss = ratio * advantages_tensor

        selected_policy_loss = -torch.min(clipped_policy_loss,
                                          policy_loss).mean()
        entropy = new_dist.entropy().mean()
        self.optimizer.zero_grad()

        loss = selected_policy_loss + (self.value_coef * value_loss) \
            - (self.entropy_coef * entropy) + predictor_loss
        loss.backward()

        global_grad_norm_(
            list(self.new_model.parameters()) +
            list(self.predictor_model.parameters()))

        self.optimizer.step()
        return loss, selected_policy_loss, value_loss, predictor_loss, entropy

    def get_intrinsic_rewards(self, input_observation):
        target_value = self.target_model(input_observation)  # shape: [n,512]
        predictor_value = self.predictor_model(
            input_observation)  # shape [n,512]
        intrinsic_reward = (target_value - predictor_value).pow(2).sum(1) / 2
        intrinsic_reward = intrinsic_reward.data.cpu().numpy()
        return intrinsic_reward
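    # Minimal usage sketch (assumed driver code, not part of the snippet):
    # observations are first whitened with a per-pixel RunningMeanStd
    # (`obs_rms`, an assumed name) and clipped, as in the other examples here:
    #   norm_obs = ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5)
    #   int_rew = agent.get_intrinsic_rewards(
    #       torch.tensor(norm_obs, dtype=torch.float, device=agent.device))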
Example no. 7
class RNDagent(object):
    def __init__(self,
                 input_size,
                 output_size,
                 seed,
                 num_env,
                 pre_obs_norm_step,
                 num_step,
                 gamma=0.99,
                 gamma_int=0.99,
                 lam=0.95,
                 int_coef=1.,
                 ext_coef=2.,
                 ent_coef=0.001,
                 cliprange=0.1,
                 max_grad_norm=0.0,
                 lr=1e-4,
                 nepochs=4,
                 batch_size=128,
                 update_proportion=0.25,
                 use_gae=True):

        self.num_env = num_env
        self.output_size = output_size
        self.input_size = input_size
        np.random.seed(seed)
        self.seed = seed

        self.pre_obs_norm_step = pre_obs_norm_step
        self.num_step = num_step
        self.gamma = gamma
        self.gamma_int = gamma_int
        self.lam = lam
        self.nepochs = nepochs
        self.batch_size = batch_size
        self.use_gae = use_gae
        self.int_coef = int_coef
        self.ext_coef = ext_coef
        self.ent_coef = ent_coef
        self.cliprange = cliprange
        self.max_grad_norm = max_grad_norm
        self.update_proportion = update_proportion

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = CnnActorCritic(input_size, output_size,
                                    seed).to(self.device)
        self.rnd = RNDModel(input_size, output_size, seed).to(self.device)
        self.optimizer = optim.Adam(list(self.model.parameters()) +
                                    list(self.rnd.predictor.parameters()),
                                    lr=lr)

        # Forward filter for the running estimate of *intrinsic* returns.
        self.rff_int = RewardForwardFilter(gamma_int)
        #self.rff_rms_int = RunningMeanStd()
        #self.obs_rms = RunningMeanStd(shape=(1,84,84))
        self.rff_rms_int = RunningMeanStd_openAI()
        self.obs_rms = RunningMeanStd_openAI(shape=(1, 84, 84))

        self.rooms = None
        self.n_rooms = []
        self.best_nrooms = -np.inf
        self.scores = []
        self.scores_window = deque(maxlen=100)

        self.stats = defaultdict(float)  # Count episodes and timesteps
        self.stats['epcount'] = 0
        self.stats['tcount'] = 0

    def collect_random_statistics(self, envs):
        """Initializes observation normalization with data from random agent."""
        all_ob = []
        all_ob.append(envs.reset())
        for _ in range(self.pre_obs_norm_step):
            actions = np.random.randint(0,
                                        self.output_size,
                                        size=(self.num_env, ))
            ob, _, _, _ = envs.step(actions)
            all_ob.append(ob)

            if len(all_ob) % (128 * self.num_env) == 0:
                ob_ = np.asarray(all_ob).astype(np.float32).reshape(
                    (-1, *envs.observation_space.shape))
                self.obs_rms.update(ob_[:, -1:, :, :])
                all_ob.clear()

        # Flush the remainder so obs_rms is still updated when the number of
        # collected batches is not a multiple of 128 * num_env.
        if all_ob:
            ob_ = np.asarray(all_ob).astype(np.float32).reshape(
                (-1, *envs.observation_space.shape))
            self.obs_rms.update(ob_[:, -1:, :, :])

    def act(self, state, action=None, calc_ent=False):
        """Returns dict of trajectory info.
        Shape
        ======
            state (uint8) : (batch_size, framestack=4, 84, 84)
        
        Returns example
            {'a': tensor([10,  5,  1]),
             'ent': None,
             'log_pi_a': tensor([-2.8904, -2.8904, -2.8904], grad_fn=<SqueezeBackward1>),
             'v_ext': tensor([0.0012, 0.0012, 0.0012], grad_fn=<SqueezeBackward0>),
             'v_int': tensor([-0.0013, -0.0013, -0.0013], grad_fn=<SqueezeBackward0>)}
        """
        #state = torch.FloatTensor(state / 255).to(self.device)
        assert state.dtype == 'uint8'
        state = torch.tensor(state / 255.,
                             dtype=torch.float,
                             device=self.device)
        #state = torch.from_numpy(state /255).float().to(self.device)

        action_probs, value_ext, value_int = self.model(state)
        dist = Categorical(action_probs)
        if action is None:
            action = dist.sample()
        log_prob = dist.log_prob(action)
        entropy = dist.entropy() if calc_ent else None

        return {
            'a': action,
            'log_pi_a': log_prob,
            'ent': entropy,
            'v_ext': value_ext.squeeze(),
            'v_int': value_int.squeeze()
        }

    def compute_intrinsic_reward(self, next_obs):
        """next_obs is the latest frame and must be normalized by RunningMeanStd(shape=(1, 84, 84))
        Shape
        ======
            next_obs : (batch_size, 1, 84, 84)
        """
        next_obs = torch.tensor(next_obs,
                                dtype=torch.float,
                                device=self.device)
        #next_obs = torch.FloatTensor(next_obs).to(self.device)

        target_next_feature = self.rnd.target(next_obs)
        predict_next_feature = self.rnd.predictor(next_obs)
        # Per-feature squared error between the fixed target and the predictor.
        # Note: the RND paper uses .pow(2).sum(1) / 2 (kept below for reference);
        # .mean(1) only rescales the reward, and the running-std normalization
        # later absorbs that constant factor.
        intrinsic_reward = (target_next_feature -
                            predict_next_feature).pow(2).mean(1)
        # intrinsic_reward = (target_next_feature - predict_next_feature).pow(2).sum(1) / 2

        return intrinsic_reward.data.cpu().numpy()

    def step(self, envs):
        """
        """
        # Step 1. n-step rollout
        next_obs_batch, int_reward_batch, state_batch, reward_batch, done_batch, action_batch, values_ext_batch, values_int_batch, log_prob_old_batch = [],[],[],[],[],[],[],[],[]
        epinfos = []

        states = envs.reset()
        for _ in range(self.num_step):

            traj_info = self.act(states)

            log_prob_old = traj_info['log_pi_a'].detach().cpu().numpy()
            actions = traj_info['a'].cpu().numpy()
            value_ext = traj_info['v_ext'].detach().cpu().numpy()
            value_int = traj_info['v_int'].detach().cpu().numpy()

            next_states, rewards, dones, infos = envs.step(actions)

            next_obs = next_states[:, -1:, :, :]
            intrinsic_reward = self.compute_intrinsic_reward(
                ((next_obs - self.obs_rms.mean) /
                 (np.sqrt(self.obs_rms.var))).clip(-5, 5))  #+1e-10

            next_obs_batch.append(next_obs)
            int_reward_batch.append(intrinsic_reward)

            state_batch.append(states)
            reward_batch.append(rewards)
            done_batch.append(dones)
            action_batch.append(actions)
            values_ext_batch.append(value_ext)
            values_int_batch.append(value_int)
            log_prob_old_batch.append(log_prob_old)

            for info in infos:
                if 'episode' in info:
                    epinfos.append(info['episode'])

            states = next_states

        # calculate last next value
        last_traj_info = self.act(states)
        values_ext_batch.append(last_traj_info['v_ext'].detach().cpu().numpy())
        values_int_batch.append(last_traj_info['v_int'].detach().cpu().numpy())

        # Convert to numpy arrays and transpose from (num_step, num_env) to
        # (num_env, num_step) for the calculations below.
        # For self.update()
        state_batch = np.stack(state_batch).transpose(1, 0, 2, 3, 4).reshape(
            -1, 4, 84, 84)
        next_obs_batch = np.stack(next_obs_batch).transpose(1, 0, 2, 3,
                                                            4).reshape(
                                                                -1, 1, 84, 84)
        action_batch = np.stack(action_batch).transpose().reshape(-1, )
        log_prob_old_batch = np.stack(log_prob_old_batch).transpose().reshape(
            -1, )

        # For get_advantage_and_value_target_from()
        reward_batch = np.stack(reward_batch).transpose()
        done_batch = np.stack(done_batch).transpose()
        values_ext_batch = np.stack(values_ext_batch).transpose()
        values_int_batch = np.stack(values_int_batch).transpose()
        # --------------------------------------------------

        # Step 2. calculate intrinsic reward
        # running estimate of the intrinsic returns
        int_reward_batch = np.stack(int_reward_batch).transpose()
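        # RewardForwardFilter keeps a per-env running discounted sum of
        # intrinsic rewards (rewems = gamma * rewems + r); its batch statistics
        # feed the running-moments tracker used to rescale intrinsic rewards.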
        discounted_reward_per_env = np.array([
            self.rff_int.update(reward_per_step)
            for reward_per_step in int_reward_batch.T[::-1]
        ])
        mean, std, count = np.mean(discounted_reward_per_env), np.std(
            discounted_reward_per_env), len(discounted_reward_per_env)
        # np.std uses ddof=0 (population std); its square is passed as the batch
        # variance to the running-moments update.
        self.rff_rms_int.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        int_reward_batch /= np.sqrt(self.rff_rms_int.var)
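        # Only the scale is normalized (no mean subtraction), following the RND
        # paper, so the non-negative novelty signal keeps its structure.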
        # -------------------------------------------------------------------------------------------

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = get_advantage_and_value_target_from(
            reward_batch, done_batch, values_ext_batch, self.gamma, self.lam,
            self.num_step, self.num_env, self.use_gae)

        # intrinsic reward calculate
        # Non-episodic: intrinsic returns ignore episode boundaries, so the
        # dones passed in are all zeros.
        int_target, int_adv = get_advantage_and_value_target_from(
            int_reward_batch, np.zeros_like(int_reward_batch),
            values_int_batch, self.gamma_int, self.lam, self.num_step,
            self.num_env, self.use_gae)

        # add ext adv and int adv
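        # The defaults (int_coef=1, ext_coef=2) match the weighting used for the
        # two value heads in the RND paper.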
        total_advs = self.int_coef * int_adv + self.ext_coef * ext_adv
        # -----------------------------------------------

        # Step 4. update obs normalize param
        self.obs_rms.update(next_obs_batch)
        # -----------------------------------------------

        # Step 5. Train
        loss_infos = self.update(
            state_batch,
            ext_target,
            int_target,
            action_batch,
            total_advs,
            ((next_obs_batch - self.obs_rms.mean) /
             (np.sqrt(self.obs_rms.var))).clip(-5, 5),  #+1e-10
            log_prob_old_batch)
        # -----------------------------------------------

        # Collects info for reporting.
        vals_info = dict(
            advextmean=ext_adv.mean(),
            retextmean=ext_target.mean(),
            advintmean=int_adv.mean(),
            retintmean=int_target.mean(),
            rewintsample=int_reward_batch[1]  # env_number = 1
        )

        # Some reporting logic
        for epinfo in epinfos:
            #if self.testing:
            #    self.I.statlists['eprew_test'].append(epinfo['r'])
            #    self.I.statlists['eplen_test'].append(epinfo['l'])
            #else:
            if "visited_rooms" in epinfo:
                self.n_rooms.append(len(epinfo["visited_rooms"]))

                # best_nrooms starts at -inf, so a plain comparison suffices.
                if len(epinfo["visited_rooms"]) > self.best_nrooms:
                    self.best_nrooms = len(epinfo["visited_rooms"])
                    self.rooms = sorted(list(epinfo["visited_rooms"]))
                #self.rooms += list(epinfo["visited_rooms"])
                #self.rooms = sorted(list(set(self.rooms)))
                #self.I.statlists['eprooms'].append(len(epinfo["visited_rooms"]))
            self.scores.append(epinfo['r'])
            self.scores_window.append(epinfo['r'])
            self.stats['epcount'] += 1
            self.stats['tcount'] += epinfo['l']
            #self.I.statlists['eprew'].append(epinfo['r'])
            #self.I.statlists['eplen'].append(epinfo['l'])
            #self.stats['rewtotal'] += epinfo['r']

        return {'loss': loss_infos, 'vals': vals_info}
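    # Rough training-loop sketch (assumed driver code, not part of the snippet):
    #   agent.collect_random_statistics(envs)
    #   for update in range(num_updates):
    #       infos = agent.step(envs)
    #       # infos['loss'] holds per-epoch policy/value/entropy/forward losses;
    #       # infos['vals'] holds advantage, return and intrinsic-reward stats.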

    def update(self, s_batch, target_ext_batch, target_int_batch, action_batch,
               adv_batch, next_obs_batch, log_prob_old_batch):
        #s_batch = torch.FloatTensor(s_batch).to(self.device)
        target_ext_batch = torch.FloatTensor(target_ext_batch).to(self.device)
        target_int_batch = torch.FloatTensor(target_int_batch).to(self.device)
        action_batch = torch.LongTensor(action_batch).to(self.device)
        adv_batch = torch.FloatTensor(adv_batch).to(self.device)
        next_obs_batch = torch.FloatTensor(next_obs_batch).to(self.device)
        log_prob_old_batch = torch.FloatTensor(log_prob_old_batch).to(
            self.device)

        sample_range = np.arange(len(s_batch))
        forward_mse = nn.MSELoss(reduction='none')

        loss_infos = defaultdict(list)

        for _ in range(self.nepochs):
            np.random.shuffle(sample_range)
            for j in range(len(s_batch) // self.batch_size):
                sample_idx = sample_range[self.batch_size * j:
                                          self.batch_size * (j + 1)]

                # --------------------------------------------------------------------------------
                # for Curiosity-driven(Random Network Distillation)
                predict_next_state_feature, target_next_state_feature = self.rnd(
                    next_obs_batch[sample_idx])
                forward_loss = forward_mse(
                    predict_next_state_feature,
                    target_next_state_feature.detach()).mean(-1)

                # Proportion of exp used for predictor update   ---  cf. cnn_policy_param_matched.py
                mask = torch.rand(len(forward_loss)).to(self.device)
                mask = (mask < self.update_proportion).float().to(self.device)
                forward_loss = (forward_loss * mask).sum() / torch.max(
                    mask.sum(),
                    torch.Tensor([1]).to(self.device))
                # ---------------------------------------------------------------------------------

                traj_info = self.act(s_batch[sample_idx],
                                     action_batch[sample_idx],
                                     calc_ent=True)

                ratio = torch.exp(traj_info['log_pi_a'] -
                                  log_prob_old_batch[sample_idx])

                surr1 = ratio * adv_batch[sample_idx]
                surr2 = torch.clamp(ratio, 1.0 - self.cliprange, 1.0 +
                                    self.cliprange) * adv_batch[sample_idx]

                policy_loss = -torch.min(surr1, surr2).mean()

                critic_ext_loss = F.mse_loss(traj_info['v_ext'],
                                             target_ext_batch[sample_idx])
                critic_int_loss = F.mse_loss(traj_info['v_int'],
                                             target_int_batch[sample_idx])
                value_loss = critic_ext_loss + critic_int_loss

                entropy = traj_info['ent'].mean()

                self.optimizer.zero_grad()
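                # Joint objective: PPO policy loss + 0.5 * (ext + int) value
                # loss - entropy bonus + RND predictor (forward) loss, all
                # optimized in a single Adam step.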
                loss = policy_loss + 0.5 * value_loss - self.ent_coef * entropy + forward_loss
                loss.backward()
                if self.max_grad_norm:
                    nn.utils.clip_grad_norm_(
                        list(self.model.parameters()) +
                        list(self.rnd.predictor.parameters()),
                        self.max_grad_norm)
                self.optimizer.step()

            _data = dict(policy=policy_loss.data.cpu().numpy(),
                         value_ext=critic_ext_loss.data.cpu().numpy(),
                         value_int=critic_int_loss.data.cpu().numpy(),
                         entropy=entropy.data.cpu().numpy(),
                         forward=forward_loss.data.cpu().numpy())
            for k, v in _data.items():
                loss_infos[k].append(v)

        return loss_infos
Example no. 8
def main():
    if 'NAME' in os.environ:
        NAME = os.environ['NAME']
    else:
        raise ValueError('set NAME via the NAME environment variable')

    try:
        with open(default_config['CarIntersectConfigPath'], 'r') as f:
            env_settings = json.load(f)
    except json.JSONDecodeError:
        # Fall back to YAML when the config file is not valid JSON.
        with open(default_config['CarIntersectConfigPath'], 'r') as f:
            env_settings = yaml.safe_load(f)

    if 'home-test' not in NAME:
        wandb.init(
            project='CarRacing_RND',
            reinit=True,
            name=f'rnd_{NAME}',
            config={'env_config': env_settings, 'agent_config': default_config},
        )

    # print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']

    env_id = default_config['EnvID']
    # env_type = default_config['EnvType']

    # if env_type == 'mario':
    #     env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    # elif env_type == 'atari':
    #     env = gym.make(env_id)
    # else:
    #     raise NotImplementedError

    seed = np.random.randint(0, 2 ** 16 - 1)

    print(f'use name : {NAME}')
    print(f"use env config : {default_config['CarIntersectConfigPath']}")
    print(f'use seed : {seed}')
    print(f"use device : {os.environ['DEVICE']}")

    os.chdir('..')
    env = makeCarIntersect(env_settings)
    eval_env = create_eval_env(makeCarIntersect(env_settings))

    # input_size = env.observation_space.shape  # 4
    input_size = env.observation_space.shape
    assert isinstance(env.action_space, gym.spaces.Box)
    action_size = env.action_space.shape[0]  # 2

    env.close()

    is_load_model = True
    is_render = False
    # model_path = 'models/{}.model'.format(NAME)
    # predictor_path = 'models/{}.pred'.format(NAME)
    # target_path = 'models/{}.target'.format(NAME)

    # writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
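    # The rollout of num_step * num_worker transitions is split into
    # `mini_batch` PPO minibatches, giving the minibatch size below.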
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent(
        input_size,
        action_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        device=os.environ['DEVICE'],
    )

    # if is_load_model:
    #     print('load model...')
    #     if use_cuda:
    #         agent.model.load_state_dict(torch.load(model_path))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
    #         agent.rnd.target.load_state_dict(torch.load(target_path))
    #     else:
    #         agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu'))
    #         agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu'))
    #     print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob,
                        life_done=life_done, settings=env_settings)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    os.chdir('rnd_continues')

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    logger = Logger(None, use_console=True, use_wandb=True, log_interval=1)

    print('Test evaluator:')
    evaluate_and_log(
        eval_env=eval_env,
        action_get_method=lambda eval_state: agent.get_action(
            np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
        )[0][0].cpu().numpy(),
        logger=logger,
        log_animation=False,
        exp_class='RND',
        exp_name=NAME,
        debug=True,
    )
    print('End of evaluator test.')

    # normalize obs
    print('Initializing observation normalization parameters...')

    # print('ALERT! pass section')
    # assert 'home-test' in NAME
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.uniform(-1, 1, size=(num_worker, action_size))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('Finished initializing observation normalization.')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy_log_prob, total_policy_log_prob_np = \
            [], [], [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout
        for _ in range(num_step):
            global_step += num_worker
            # actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)
            actions, value_ext, value_int, policy_log_prob = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action.cpu().numpy())

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
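            # Observations are whitened with the running obs statistics and
            # clipped to [-5, 5] before the RND networks, as in the RND paper.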
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions.cpu().numpy())
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)

            # total_policy.append(policy)
            # total_policy_np.append(policy.cpu().numpy())

            total_policy_log_prob.extend(policy_log_prob.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                # writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                # writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                # writer.add_scalar('data/step', sample_step, sample_episode)
                logger.log_it({
                    'reward_per_episode': sample_rall,
                    'intrinsic_reward': sample_i_rall,
                    'episode_steps': sample_step,
                    'global_step_cnt': global_step,
                    'updates_cnt': global_update,
                })
                logger.publish_logs(step=global_step)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)

        # total_action = np.stack(total_action).transpose().reshape([-1, action_size])
        total_action = np.array(total_action).reshape((-1, action_size))
        # total_log_prob_old = np.array(total_policy_log_prob).reshape((-1))

        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        # total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        # writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        # writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        # writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculate
        # Non-episodic: episode boundaries are ignored for the intrinsic return.
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        global_update += 1
        # Step 5. Training!
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action,
                          total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy_log_prob)

        # if global_step % (num_worker * num_step * 100) == 0:
        #     print('Now Global Step :{}'.format(global_step))
        #     torch.save(agent.model.state_dict(), model_path)
        #     torch.save(agent.rnd.predictor.state_dict(), predictor_path)
        #     torch.save(agent.rnd.target.state_dict(), target_path)

        if global_update % 100 == 0:
            evaluate_and_log(
                eval_env=eval_env,
                action_get_method=lambda eval_state: agent.get_action(
                    np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
                )[0][0].cpu().numpy(),
                logger=logger,
                log_animation=True,
                exp_class='RND',
                exp_name=NAME,
            )
            logger.publish_logs(step=global_step)