Code Example #1
File: sc2_environment.py Project: voiler/starcraft
class SC2Environment(environment.Environment):
    def __init__(self, env_args):
        super(SC2Environment, self).__init__()
        env = partial(make_sc2env, **env_args)
        self.conn, child_conn = Pipe()
        self.proc = Process(target=worker,
                            args=(child_conn, CloudpickleWrapper(env)))
        self.proc.start()
        self.reset()

    @staticmethod
    def get_action_size():
        return len(FUNCTIONS)

    def reset(self):
        self.conn.send([COMMAND_RESET, None])
        return [self.conn.recv()]

    def close(self):
        self.conn.send([COMMAND_TERMINATE, None])
        self.conn.close()
        self.proc.join()
        print("SC2 environment closed")

    def step(self, actions):
        self.conn.send([COMMAND_STEP, actions])
        obs = self.conn.recv()
        return [obs], obs.reward, obs.last()
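
The class above only shows the parent side of the Pipe command protocol; the worker target passed to Process is defined elsewhere in the project. Below is a minimal sketch of what such a worker loop could look like, assuming the COMMAND_* constants are simple integers and that CloudpickleWrapper exposes the wrapped environment factory as .x (as in OpenAI Baselines); the actual worker in voiler/starcraft may differ.

COMMAND_RESET, COMMAND_STEP, COMMAND_TERMINATE = range(3)  # placeholder values (assumption)

def worker(conn, env_fn_wrapper):
    # Build the environment inside the child process from the pickled factory.
    env = env_fn_wrapper.x()
    try:
        while True:
            command, arg = conn.recv()
            if command == COMMAND_STEP:
                # pysc2's step()/reset() return a list of TimeSteps; forward the first one.
                conn.send(env.step(arg)[0])
            elif command == COMMAND_RESET:
                conn.send(env.reset()[0])
            elif command == COMMAND_TERMINATE:
                break
    finally:
        env.close()
        conn.close()
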
Code Example #2
File: evaluate.py Project: Kavka1/RL
def evaluate(args):
    env = gym.make(args.env)
    env_params = get_env_params(env, args)
    env.close()

    agent = PPOAgent(args, env_params)
    agent.load_model(load_model_remark=args.load_model_remark)

    parent_conn, child_conn = Pipe()
    worker = AtariEnvironment(args.env,
                              1,
                              child_conn,
                              is_render=True,
                              max_episode_step=args.max_episode_step)
    worker.start()

    for i_episode in range(100):
        obs = worker.reset()
        while True:
            obs = np.expand_dims(obs, axis=0)
            action = agent.choose_action(obs / 255)

            parent_conn.send(action[0])
            obs_, r, done, info = parent_conn.recv()

            obs = obs_

            if done:
                break
Code Example #3
File: enjoy.py Project: speedcell4/RND-Pytorch
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = gym.make(args.env_name)

    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in args.env_name:
        output_size -= 1

    env.close()

    is_render = True
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    if not os.path.exists(model_path):
        print("Model file not found")
        return
    num_worker = 1
    sticky_action = False

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    model = model.to(device)

    if args.cuda:
        model.load_state_dict(torch.load(model_path))
    else:
        model.load_state_dict(torch.load(model_path, map_location='cpu'))

    parent_conn, child_conn = Pipe()
    work = AtariEnvironment(
        args.env_name,
        is_render,
        0,
        child_conn,
        sticky_action=sticky_action,
        p=args.sticky_action_prob,
        max_episode_steps=args.max_episode_steps)
    work.start()

    # states = np.zeros([num_worker, 4, 84, 84])
    states = torch.zeros(num_worker, 4, 84, 84)

    while True:
        actions = get_action(model, device, torch.div(states, 255.))

        parent_conn.send(actions)

        next_states = []
        next_state, reward, done, real_done, log_reward = parent_conn.recv()
        next_states.append(next_state)
        states = torch.from_numpy(np.stack(next_states))
        states = states.type(torch.FloatTensor)
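
AtariEnvironment in these snippets is a multiprocessing.Process subclass that owns the gym environment in a child process and answers action messages sent over the Pipe. Its implementation is not shown here; the stand-in below is a deliberately simplified, hypothetical version of the pattern (no frame stacking, sticky actions, or episode bookkeeping), not the class used by these projects.

import gym
from multiprocessing import Pipe, Process

class EnvWorker(Process):
    """Owns a gym env in a child process and serves step requests over a Pipe."""

    def __init__(self, env_id, child_conn):
        super().__init__(daemon=True)
        self.env_id = env_id
        self.conn = child_conn

    def run(self):
        env = gym.make(self.env_id)          # create the env inside the child process
        obs = env.reset()
        while True:
            action = self.conn.recv()        # block until the trainer sends an action
            obs, reward, done, info = env.step(action)
            if done:
                obs = env.reset()            # auto-reset so the trainer never stalls
            self.conn.send((obs, reward, done, info))
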
Code Example #4
class DummyServer(INeuralNetworkAPI, IFlightControl):
    def __init__(self, **kwargs):
        self.handler_conn, server_conn = Pipe()
        self.handler = HandlerProcess(server_conn=server_conn, **kwargs)
        self.handler.start()

    def forward(self, batch: TikTensor) -> None:
        pass

        # self.handler_conn.send(
        #     (
        #         "forward",
        #         {"keys": [a.id for a in batch], "data": torch.stack([torch.from_numpy(a.as_numpy()) for a in batch])},
        #     )
        # )

    def active_children(self):
        self.handler_conn.send(("active_children", {}))

    def listen(self, timeout: float = 10) -> Union[None, Tuple[str, dict]]:
        if self.handler_conn.poll(timeout=timeout):
            answer = self.handler_conn.recv()
            logger.debug("got answer: %s", answer)
            return answer
        else:
            return None

    def shutdown(self):
        self.handler_conn.send(SHUTDOWN)
        got_shutdown_answer = False
        while self.handler.is_alive():
            if self.handler_conn.poll(timeout=2):
                answer = self.handler_conn.recv()
                if answer == SHUTDOWN_ANSWER:
                    got_shutdown_answer = True

        assert got_shutdown_answer
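
A hypothetical usage of the class above; the keyword arguments forwarded to HandlerProcess and the config values are placeholders, not the library's real parameters.

# Hypothetical usage sketch (constructor kwargs are placeholders).
server = DummyServer(config={"devices": ["cpu"]})
server.active_children()              # fire-and-forget request to the handler process
reply = server.listen(timeout=5)      # (method_name, payload) tuple, or None on timeout
server.shutdown()                     # loops until SHUTDOWN_ANSWER is received
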
Code Example #5
    def play(self):
        parent, child = Pipe()
        if flag.ENV == "MR":
            env = montezuma_revenge_env.MontezumaRevenge(0, child, 1, 0, 18000)
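        # Note: env is only assigned when flag.ENV == "MR"; any other value of
        # flag.ENV would make the env.start() call below raise a NameError.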
        env.start()
        self.current_observation = np.zeros((4, 84, 84))

        while True:
            observation_tensor = torch.from_numpy(
                np.expand_dims(self.current_observation, 0)).float().to(
                self.device)

            predicted_action, value1, value2 = self.model.step(
                observation_tensor / 255)
            parent.send(predicted_action[0])
            self.current_observation, rew, done = parent.recv()
Code Example #6
class OnlineVaeAlgorithmSegmented(TorchBatchRLAlgorithm):
    def __init__(self,
                 vae_original,
                 vae_segmented,
                 vae_trainer_original,
                 vae_trainer_segmented,
                 *base_args,
                 vae_save_period=1,
                 vae_training_schedule=vae_schedules.never_train,
                 oracle_data=False,
                 parallel_vae_train=True,
                 vae_min_num_steps_before_training=0,
                 uniform_dataset=None,
                 keep_train_segmentation_vae=False,
                 **base_kwargs):
        super().__init__(*base_args, **base_kwargs)
        assert isinstance(self.replay_buffer,
                          OnlineVaeRelabelingBufferSegmented)
        self.vae_original = vae_original
        self.vae_segmented = vae_segmented
        self.vae_trainer_original = vae_trainer_original
        self.vae_trainer_segmented = vae_trainer_segmented
        self.vae_trainer_original.model = self.vae_original
        self.vae_trainer_segmented.model = self.vae_segmented

        self.vae_save_period = vae_save_period
        self.vae_training_schedule = vae_training_schedule
        self.oracle_data = oracle_data

        self.parallel_vae_train = parallel_vae_train
        self.vae_min_num_steps_before_training = vae_min_num_steps_before_training
        self.uniform_dataset = uniform_dataset

        self._vae_training_process = None
        self._update_subprocess_vae_thread = None
        self._vae_conn_pipe = None

        self.keep_train_segmentation_vae = keep_train_segmentation_vae

    def _train(self):
        super()._train()
        self._cleanup()

    def _end_epoch(self, epoch):
        # self.check_replay_buffer()
        self._train_vae(epoch)
        gt.stamp('vae training')
        super()._end_epoch(epoch)

    def _log_stats(self, epoch):
        self._log_vae_stats()
        super()._log_stats(epoch)

    def to(self, device):
        self.vae_original.to(device)
        self.vae_segmented.to(device)
        super().to(device)

    def _get_snapshot(self):
        snapshot = super()._get_snapshot()
        assert 'vae' not in snapshot
        snapshot['vae_original'] = self.vae_original
        snapshot['vae_segmented'] = self.vae_segmented
        return snapshot

    """
    debug code
    """

    def check_replay_buffer(self):
        batch = self.replay_buffer.random_batch(self.batch_size)
        rewards = batch['rewards']
        terminals = batch['terminals']
        obs = batch['observations']
        actions = batch['actions']
        next_obs = batch['next_observations']
        goals = batch['resampled_goals']

        print("obs: ", type(obs))
        print("obs shape: ", obs.shape)
        decoded_obs = self.eval_env._decode(obs, self.eval_env.vae_original)
        for idx in range(10):
            self.eval_env.show_obs(decoded_obs[idx], "sac policy obs")

        print("next_obs: ", type(next_obs))
        print("next obs shape: ", next_obs.shape)
        decoded_next_obs = self.eval_env._decode(next_obs,
                                                 self.eval_env.vae_original)
        for idx in range(10):
            self.eval_env.show_obs(decoded_next_obs[idx],
                                   "sac policy next_obs")

        decoded_goal = self.eval_env._decode(goals,
                                             self.eval_env.vae_segmented)
        for idx in range(10):
            self.eval_env.show_obs(decoded_goal[idx], "sac policy goal")

    """
    VAE-specific Code
    """

    def _train_vae(self, epoch):
        if self.parallel_vae_train and self._vae_training_process is None:
            self.init_vae_training_subprocess()
        should_train, amount_to_train = self.vae_training_schedule(epoch)
        rl_start_epoch = int(self.min_num_steps_before_training /
                             (self.num_expl_steps_per_train_loop *
                              self.num_train_loops_per_epoch))
        print(" _train_vae called, should_train, amount_to_train",
              should_train, amount_to_train)
        if should_train or epoch <= (rl_start_epoch - 1):
            if self.parallel_vae_train:
                assert self._vae_training_process.is_alive()
                # Make sure the last vae update has finished before starting
                # another one
                if self._update_subprocess_vae_thread is not None:
                    self._update_subprocess_vae_thread.join()
                self._update_subprocess_vae_thread = Thread(
                    target=OnlineVaeAlgorithmSegmented.
                    update_vae_in_training_subprocess,
                    args=(self, epoch, ptu.device))
                self._update_subprocess_vae_thread.start()
                self._vae_conn_pipe.send((amount_to_train, epoch))
            else:
                _train_vae(self.vae_trainer_original,
                           self.replay_buffer,
                           epoch,
                           amount_to_train,
                           key='image_observation')

                # train segmentation vae using both oracle data and newly collected data
                # train using newly collected data
                if self.keep_train_segmentation_vae:
                    _train_vae(self.vae_trainer_segmented,
                               self.replay_buffer,
                               epoch,
                               amount_to_train // 3 * 2,
                               key='image_observation_segmented')

                    # train using pre-collected oracle data
                    _train_vae(self.vae_trainer_segmented,
                               self.replay_buffer,
                               epoch,
                               amount_to_train // 3,
                               key='image_observation_segmented',
                               oracle_data=True)

                self.replay_buffer.refresh_latents(epoch)

                _test_vae(self.vae_trainer_original,
                          epoch,
                          self.replay_buffer,
                          vae_save_period=self.vae_save_period,
                          uniform_dataset=self.uniform_dataset,
                          save_prefix='r_original_')
                _test_vae(self.vae_trainer_segmented,
                          epoch,
                          self.replay_buffer,
                          vae_save_period=self.vae_save_period,
                          uniform_dataset=self.uniform_dataset,
                          save_prefix='r_segmented_')

    def _log_vae_stats(self):
        logger.record_dict(
            self.vae_trainer_original.get_diagnostics(),
            prefix='vae_trainer_original/',
        )
        logger.record_dict(
            self.vae_trainer_segmented.get_diagnostics(),
            prefix='vae_trainer_segmented/',
        )

    def _cleanup(self):
        if self.parallel_vae_train:
            self._vae_conn_pipe.close()
            self._vae_training_process.terminate()

    def init_vae_training_subprocess(self):
        assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer)
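        # NOTE: this parallel-training path still references self.vae and
        # self.vae_trainer, which this segmented variant never assigns (its
        # __init__ sets vae_original / vae_segmented instead); it appears to be
        # carried over unchanged from the non-segmented implementation.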

        self._vae_conn_pipe, process_pipe = Pipe()
        self._vae_training_process = Process(
            target=subprocess_train_vae_loop,
            args=(
                process_pipe,
                self.vae,
                self.vae.state_dict(),
                self.replay_buffer,
                self.replay_buffer.get_mp_info(),
                ptu.device,
            ))
        self._vae_training_process.start()
        self._vae_conn_pipe.send(self.vae_trainer)

    def update_vae_in_training_subprocess(self, epoch, device):
        self.vae.__setstate__(self._vae_conn_pipe.recv())
        self.vae.to(device)
        _test_vae(
            self.vae_trainer,
            epoch,
            self.replay_buffer,
            vae_save_period=self.vae_save_period,
            uniform_dataset=self.uniform_dataset,
        )
Code Example #7
        for _ in range(num_step):
            if not is_training:
                time.sleep(0.05)

            agent.model.eval()
            agent.icm.eval()

            actions = agent.get_action(states)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards = [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards) * reward_scale
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            # total reward = int reward + ext reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                states, next_states, actions)
            rewards += intrinsic_reward
Code Example #8
def main(run_id=0, checkpoint=None, save_interval=1000):
    print({section: dict(config[section]) for section in config.sections()})

    train_method = default_config['TrainMethod']

    # Create environment
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        print('Mario environment not fully implemented - thomaseh')
        raise NotImplementedError
        env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    # Load configuration parameters
    is_load_model = checkpoint is not None
    is_render = False
    model_path = 'models/{}_{}_run{}_model'.format(env_id, train_method, run_id)
    if train_method == 'RND':
        predictor_path = 'models/{}_{}_run{}_pred'.format(env_id, train_method, run_id)
        target_path = 'models/{}_{}_run{}_target'.format(env_id, train_method, run_id)
    elif train_method == 'generative':
        predictor_path = 'models/{}_{}_run{}_vae'.format(env_id, train_method, run_id)
   

    writer = SummaryWriter(comment='_{}_{}_run{}'.format(env_id, train_method, run_id))

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])
    num_rollouts = int(default_config['NumRollouts'])
    num_pretrain_rollouts = int(default_config['NumPretrainRollouts'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    if train_method == 'RND':
        agent = RNDAgent
    elif train_method == 'generative':
        agent = GenerativeAgent
    else:
        raise NotImplementedError

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    # Initialize agent
    agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net
    )

    # Load pre-existing model
    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            if train_method == 'RND':
                agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
                agent.rnd.target.load_state_dict(torch.load(target_path))
            elif train_method == 'generative':
                agent.vae.load_state_dict(torch.load(predictor_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))
            if train_method == 'RND':
                agent.rnd.predictor.load_state_dict(
                    torch.load(predictor_path, map_location='cpu'))
                agent.rnd.target.load_state_dict(
                    torch.load(target_path, map_location='cpu'))
            elif train_method == 'generative':
                agent.vae.load_state_dict(torch.load(predictor_path, map_location='cpu'))
        print('load finished!')

    # Create workers to run in environments
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(
            env_id, is_render, idx, child_conn, sticky_action=sticky_action,
            p=action_prob, life_done=life_done,
        )
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84], dtype='float32')

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # Initialize observation normalizers
    print('Start to initialize observation normalization parameter...')
    next_obs = np.zeros([num_worker * num_step, 1, 84, 84])
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker,))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for idx, parent_conn in enumerate(parent_conns):
            s, r, d, rd, lr, _ = parent_conn.recv()
            next_obs[(step % num_step) * num_worker + idx, 0, :, :] = s[3, :, :]

        if (step % num_step) == num_step - 1:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = np.zeros([num_worker * num_step, 1, 84, 84])
    print('End to initialize...')

    # Initialize stats dict
    stats = {
        'total_reward': [],
        'ep_length': [],
        'num_updates': [],
        'frames_seen': [],
    }

    # Main training loop
    while True:
        total_state = np.zeros([num_worker * num_step, 4, 84, 84], dtype='float32')
        total_next_obs = np.zeros([num_worker * num_step, 1, 84, 84])
        total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_ext_values, total_int_values, total_policy, \
            total_policy_np = [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout (collect data)
        for step in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(states/255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_obs = np.zeros([num_worker, 1, 84, 84])
            next_states = np.zeros([num_worker, 4, 84, 84])
            rewards, dones, real_dones, log_rewards = [], [], [], []
            for idx, parent_conn in enumerate(parent_conns):
                s, r, d, rd, lr, stat = parent_conn.recv()
                next_states[idx] = s
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs[idx, 0] = s[3, :, :]
                total_next_obs[idx * num_step + step, 0] = s[3, :, :]

                if rd:
                    stats['total_reward'].append(stat[0])
                    stats['ep_length'].append(stat[1])
                    stats['num_updates'].append(global_update)
                    stats['frames_seen'].append(global_step)

            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            # Compute total reward = intrinsic reward + external reward
            next_obs -= obs_rms.mean
            next_obs /= np.sqrt(obs_rms.var)
            next_obs.clip(-5, 5, out=next_obs)
            intrinsic_reward = agent.compute_intrinsic_reward(next_obs)
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            for idx, state in enumerate(states):
                total_state[idx * num_step + step] = state
            total_int_reward.append(intrinsic_reward)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculate
        # Non-episodic
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        total_state /= 255.
        total_next_obs -= obs_rms.mean
        total_next_obs /= np.sqrt(obs_rms.var)
        total_next_obs.clip(-5, 5, out=total_next_obs)

        agent.train_model(total_state, ext_target, int_target, total_action,
                          total_adv, total_next_obs, total_policy)

        global_step += (num_worker * num_step)
        global_update += 1
        if global_update % save_interval == 0:
            print('Saving model at global step={}, num rollouts={}.'.format(
                global_step, global_update))
            torch.save(agent.model.state_dict(), model_path + "_{}.pt".format(global_update))
            if train_method == 'RND':
                torch.save(agent.rnd.predictor.state_dict(), predictor_path + '_{}.pt'.format(global_update))
                torch.save(agent.rnd.target.state_dict(), target_path + '_{}.pt'.format(global_update))
            elif train_method == 'generative':
                torch.save(agent.vae.state_dict(), predictor_path + '_{}.pt'.format(global_update))

            # Save stats to pickle file
            with open('models/{}_{}_run{}_stats_{}.pkl'.format(env_id, train_method, run_id, global_update),'wb') as f:
                pickle.dump(stats, f)

        if global_update == num_rollouts + num_pretrain_rollouts:
            print('Finished Training.')
            break
Code Example #9
        work.start()
        works.append(work)
        child_conns.append(child_conn)
        parent_conns.append(parent_conn)

    steps = 0
    next_obs = []
    print('Start to initialize observation normalization ...')
    while steps < pre_obs_norm_step:
        steps += num_worker
        actions = np.random.randint(0, output_size, size=(num_worker, ))
        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d = parent_conn.recv()
            next_obs.append(s)

        print('initializing...:', steps, '/', pre_obs_norm_step)

    next_obs = np.stack(next_obs)
    obs_rms.update(next_obs)
    print('End to initialize')

    states = np.zeros([num_worker, 2])
    global_update = 0
    global_step = 0
    sample_i_rall = 0
    sample_episode = 0
    sample_env_idx = 0
    sample_rall = 0
Code Example #10
File: train.py Project: sidorovTV/CarRacing_agents
def main():
    if 'NAME' in os.environ.keys():
        NAME = os.environ['NAME']
    else:
        raise ValueError('set NAME via env variable')

    try:
        env_settings = json.load(open(default_config['CarIntersectConfigPath'], 'r'))
    except:
        env_settings = yaml.load(open(default_config['CarIntersectConfigPath'], 'r'))

    if 'home-test' not in NAME:
        wandb.init(
            project='CarRacing_RND',
            reinit=True,
            name=f'rnd_{NAME}',
            config={'env_config': env_settings, 'agent_config': default_config},
        )

    # print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']

    env_id = default_config['EnvID']
    # env_type = default_config['EnvType']

    # if env_type == 'mario':
    #     env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    # elif env_type == 'atari':
    #     env = gym.make(env_id)
    # else:
    #     raise NotImplementedError

    seed = np.random.randint(0, 2 ** 16 - 1)

    print(f'use name : {NAME}')
    print(f"use env config : {default_config['CarIntersectConfigPath']}")
    print(f'use seed : {seed}')
    print(f"use device : {os.environ['DEVICE']}")

    os.chdir('..')
    env = makeCarIntersect(env_settings)
    eval_env = create_eval_env(makeCarIntersect(env_settings))

    # input_size = env.observation_space.shape  # 4
    input_size = env.observation_space.shape
    assert isinstance(env.action_space, gym.spaces.Box)
    action_size = env.action_space.shape[0]  # 2

    env.close()

    is_load_model = True
    is_render = False
    # model_path = 'models/{}.model'.format(NAME)
    # predictor_path = 'models/{}.pred'.format(NAME)
    # target_path = 'models/{}.target'.format(NAME)

    # writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent(
        input_size,
        action_size,
        num_worker,
        num_step,
        gamma,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        device=os.environ['DEVICE'],
    )

    # if is_load_model:
    #     print('load model...')
    #     if use_cuda:
    #         agent.model.load_state_dict(torch.load(model_path))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
    #         agent.rnd.target.load_state_dict(torch.load(target_path))
    #     else:
    #         agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
    #         agent.rnd.predictor.load_state_dict(torch.load(predictor_path, map_location='cpu'))
    #         agent.rnd.target.load_state_dict(torch.load(target_path, map_location='cpu'))
    #     print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(env_id, is_render, idx, child_conn, sticky_action=sticky_action, p=action_prob,
                        life_done=life_done, settings=env_settings)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    os.chdir('rnd_continues')

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    logger = Logger(None, use_console=True, use_wandb=True, log_interval=1)

    print('Test evaluater:')
    evaluate_and_log(
        eval_env=eval_env,
        action_get_method=lambda eval_state: agent.get_action(
            np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
        )[0][0].cpu().numpy(),
        logger=logger,
        log_animation=False,
        exp_class='RND',
        exp_name=NAME,
        debug=True,
    )
    print('end evaluater test.')

    # normalize obs
    print('Start to initialize observation normalization parameter.....')

    # print('ALERT! pass section')
    # assert 'home-test' in NAME
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.uniform(-1, 1, size=(num_worker, action_size))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End to initialize...')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy_log_prob, total_policy_log_prob_np = \
            [], [], [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout
        for _ in range(num_step):
            global_step += num_worker
            # actions, value_ext, value_int, policy = agent.get_action(np.float32(states) / 255.)
            actions, value_ext, value_int, policy_log_prob = agent.get_action(np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action.cpu().numpy())

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions.cpu().numpy())
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)

            # total_policy.append(policy)
            # total_policy_np.append(policy.cpu().numpy())

            total_policy_log_prob.extend(policy_log_prob.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                # writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                # writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                # writer.add_scalar('data/step', sample_step, sample_episode)
                logger.log_it({
                    'reward_per_episode': sample_rall,
                    'intrinsic_reward': sample_i_rall,
                    'episode_steps': sample_step,
                    'global_step_cnt': global_step,
                    'updates_cnt': global_update,
                })
                logger.publish_logs(step=global_step)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)

        # total_action = np.stack(total_action).transpose().reshape([-1, action_size])
        total_action = np.array(total_action).reshape((-1, action_size))
        # total_log_prob_old = np.array(total_policy_log_prob).reshape((-1))

        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        # total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        # writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        # writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        # writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculate
        # Non-episodic
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        global_update += 1
        # Step 5. Training!
        agent.train_model(np.float32(total_state) / 255., ext_target, int_target, total_action,
                          total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                          total_policy_log_prob)

        # if global_step % (num_worker * num_step * 100) == 0:
        #     print('Now Global Step :{}'.format(global_step))
        #     torch.save(agent.model.state_dict(), model_path)
        #     torch.save(agent.rnd.predictor.state_dict(), predictor_path)
        #     torch.save(agent.rnd.target.state_dict(), target_path)

        if global_update % 100 == 0:
            evaluate_and_log(
                eval_env=eval_env,
                action_get_method=lambda eval_state: agent.get_action(
                    np.tile(np.float32(eval_state), (1, 4, 1, 1)) / 255.
                )[0][0].cpu().numpy(),
                logger=logger,
                log_animation=True,
                exp_class='RND',
                exp_name=NAME,
            )
            logger.publish_logs(step=global_step)
Code Example #11
def main():

    args = parse_arguments()

    train_method = args.train_method
    env_id = args.env_id
    env_type = args.env_type

    if env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError

    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    env.close()

    is_load_model = False
    is_render = False
    os.makedirs('models', exist_ok=True)
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    results_dir = os.path.join('outputs', args.env_id)
    os.makedirs(results_dir, exist_ok=True)
    logger = Logger(results_dir)
    writer = SummaryWriter(
        os.path.join(results_dir, 'tensorboard', args.env_id))

    use_cuda = args.use_gpu
    use_gae = args.use_gae
    use_noisy_net = args.use_noisynet
    lam = args.lam
    num_worker = args.num_env
    num_step = args.num_step
    ppo_eps = args.ppo_eps
    epoch = args.epoch
    mini_batch = args.minibatch
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = args.learning_rate
    entropy_coef = args.entropy
    gamma = args.gamma
    int_gamma = args.int_gamma
    clip_grad_norm = args.clip_grad_norm
    ext_coef = args.ext_coef
    int_coef = args.int_coef
    sticky_action = args.sticky_action
    action_prob = args.action_prob
    life_done = args.life_done
    pre_obs_norm_step = args.obs_norm_step

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent

    if args.env_type == 'atari':
        env_type = AtariEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    logger.info('Start to initialize workers')
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id,
                        is_render,
                        idx,
                        child_conn,
                        sticky_action=sticky_action,
                        p=action_prob,
                        life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    logger.info('Start to initialize observation normalization parameter.....')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr, nr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    logger.info('End to initialize...')

    while True:
        logger.info('Iteration: {}'.format(global_update))
        #####################################################################################################
        total_state, total_reward, total_done, total_next_state, \
            total_action, total_int_reward, total_next_obs, total_ext_values, \
            total_int_values, total_policy, total_policy_np, total_num_rooms = \
            [], [], [], [], [], [], [], [], [], [], [], []
        #####################################################################################################
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            #################################################################################################
            next_states, rewards, dones, real_dones, log_rewards, next_obs, num_rooms = \
                [], [], [], [], [], [], []
            #################################################################################################
            for parent_conn in parent_conns:
                s, r, d, rd, lr, nr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                #############################################################################################
                num_rooms.append(nr)
                #############################################################################################
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)
            #################################################################################################
            num_rooms = np.hstack(num_rooms)
            #################################################################################################

            # total reward = int reward + ext Reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())
            #####################################################################################################
            total_num_rooms.append(num_rooms)
            #####################################################################################################

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/returns_vs_frames', sample_rall,
                                  global_step)
                writer.add_scalar('data/lengths_vs_frames', sample_step,
                                  global_step)
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(
            np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)
        #####################################################################################################
        total_num_rooms = np.stack(total_num_rooms).transpose().reshape(-1)
        total_done_cal = total_done.reshape(-1)
        if np.any(total_done_cal):
            avg_num_rooms = np.mean(total_num_rooms[total_done_cal])
        else:
            avg_num_rooms = 0
        #####################################################################################################

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)
        #####################################################################################################
        writer.add_scalar('data/avg_num_rooms_per_iteration', avg_num_rooms,
                          global_update)
        writer.add_scalar('data/avg_num_rooms_per_step', avg_num_rooms,
                          global_step)
        #####################################################################################################
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculate
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, gamma,
                                              num_step, num_worker)

        # intrinsic reward calculate
        # Non-episodic
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, int_gamma,
                                              num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        agent.train_model(
            np.float32(total_state) / 255., ext_target, int_target,
            total_action, total_adv,
            ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(
                -5, 5), total_policy)

        if global_update % 1000 == 0:
            torch.save(agent.model.state_dict(),
                       'models/{}-{}.model'.format(env_id, global_update))
            logger.info('Now Global Step :{}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)
Code Example #12
File: atari_a2c.py Project: dnddnjs/mario_rl
    global_step = 0
    recent_prob = deque(maxlen=10)

    while True:
        total_state, total_reward, total_done, total_next_state, total_action = [], [], [], [], []
        global_step += (num_worker * num_step)

        for _ in range(num_step):
            actions = agent.get_action(states)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones = [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            total_state.append(states)
            total_next_state.append(next_states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
Code Example #13
def sim_games(N_games, N_MCTS, model, number_of_processes, v_resign, model2=None, duel=False, batch_size=8, board_size=9):
    #### Function for generating games
    print("Starting sim games")
    process_workers = []
    torch.multiprocessing.set_start_method('spawn', force=True)
    # Make queues for sending data
    gpu_Q = Queue()
    if (duel==False):
        data_Q = Queue()
        # Also make pipe for receiving v_resign
        conn_rec, conn_send = Pipe(False)
        
        p_data = Process(target=data_handler, args=(data_Q, N_games, conn_send))
        process_workers.append(p_data)
    else:
        winner_Q = Queue()
        gpu_Q2 = Queue()
        process_workers.append(Process(target=gpu_worker, args=(gpu_Q2, batch_size, board_size, model2)))
    # Make counter and lock
    game_counter = Value('i', 0)
    lock = Lock()
    
    # Make process for gpu worker and data_loader
    
    process_workers.append(Process(target=gpu_worker, args=(gpu_Q, batch_size, board_size, model)))
    # Start gpu and data_loader worker
    print("GPU processes")
    for p in process_workers:
        p.start()
    # Construct tasks for workers
    procs = []
    torch.multiprocessing.set_start_method('fork', force=True)
    print("defining worker processes")
    for i in range(number_of_processes):
        seed = np.random.randint(int(2**31))
        if (duel==True):
            procs.append(Process(target=sim_duel_game_worker, args=(gpu_Q, gpu_Q2, N_MCTS, winner_Q, N_games, lock, game_counter, seed)))
        else:
            procs.append(Process(target=sim_game_worker, args=(gpu_Q, N_MCTS, data_Q, v_resign, N_games, lock, game_counter, seed)))
    
    print("Starting worker processes")
     # Begin running games
    for p in procs:
        p.start()
    # Join processes

    if (duel==False):
        # Receive new v_resign
        v_resign = conn_rec.recv()
    else:
        player1_wins = 0
        player2_wins = 0
        for i in range(N_games):
            player1_won = winner_Q.get(True)
            if (player1_won==1):
                player1_wins += 1
            else:
                player2_wins += 1
    
    for p in procs: 
        p.join()
                
    # Close processes
    for p in process_workers:
        p.terminate()
    
    # Return v_resign when training, otherwise the win counts from the duel
    if (duel==False):
        return v_resign
    else:
        return player1_wins, player2_wins
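data_handler itself does not appear in this listing. Below is a sketch of a consumer that drains data_Q for N_games games and reports an updated v_resign back through conn_send; the percentile rule and the -0.95 fallback are illustrative assumptions, and only the (S, P, z, false_positive) tuple format is taken from sim_game later in this listing.

import numpy as np

def data_handler_sketch(data_Q, N_games, conn_send):
    # Hypothetical consumer for the (S_array, P_array, z_array, false_positive)
    # tuples produced by the game workers.
    games, false_positives = [], []
    for _ in range(N_games):
        S_array, P_array, z_array, false_positive = data_Q.get(True)   # block until a game arrives
        games.append((S_array, P_array, z_array))
        if false_positive is not None:
            false_positives.append(false_positive)
    # Training data in games would be written to disk or a buffer here.
    # Pick a threshold below the winning side's observed minima so that
    # resignation rarely throws away games that would have been won.
    new_v_resign = float(np.percentile(false_positives, 5)) if false_positives else -0.95
    conn_send.send(new_v_resign)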
コード例 #14
0
class OnlineVaeOffpolicyAlgorithm(TorchBatchRLAlgorithm):
    def __init__(self,
                 vae,
                 vae_trainer,
                 *base_args,
                 vae_save_period=1,
                 vae_training_schedule=vae_schedules.never_train,
                 oracle_data=False,
                 parallel_vae_train=True,
                 vae_min_num_steps_before_training=0,
                 uniform_dataset=None,
                 dataset_path=None,
                 rl_offpolicy_num_training_steps=0,
                 **base_kwargs):
        super().__init__(*base_args, **base_kwargs)
        assert isinstance(self.replay_buffer, OnlineVaeRelabelingBuffer)
        self.vae = vae
        self.vae_trainer = vae_trainer
        self.vae_trainer.model = self.vae
        self.vae_save_period = vae_save_period
        self.vae_training_schedule = vae_training_schedule
        self.oracle_data = oracle_data

        self.parallel_vae_train = parallel_vae_train
        self.vae_min_num_steps_before_training = vae_min_num_steps_before_training
        self.uniform_dataset = uniform_dataset

        self._vae_training_process = None
        self._update_subprocess_vae_thread = None
        self._vae_conn_pipe = None

        self.dataset_path = dataset_path
        if self.dataset_path:
            self.load_dataset(dataset_path)

        # train Q and policy rl_offpolicy_num_training_steps times
        self.rl_offpolicy_num_training_steps = rl_offpolicy_num_training_steps

    def pretrain(self):
        for _ in range(self.rl_offpolicy_num_training_steps):
            train_data = self.replay_buffer.random_batch(self.batch_size)
            self.trainer.train(train_data)

    def load_dataset(self, dataset_path):
        dataset = load_local_or_remote_file(dataset_path)
        dataset = dataset.item()

        observations = dataset['observations']
        actions = dataset['actions']

        # dataset['observations'].shape # (2000, 50, 6912)
        # dataset['actions'].shape # (2000, 50, 2)
        # dataset['env'].shape # (2000, 6912)
        N, H, imlength = observations.shape

        self.vae.eval()
        for n in range(N):
            x0 = ptu.from_numpy(dataset['env'][n:n + 1, :] / 255.0)
            x = ptu.from_numpy(observations[n, :, :] / 255.0)
            latents = self.vae.encode(x, x0, distrib=False)

            r1, r2 = self.vae.latent_sizes
            conditioning = latents[0, r1:]
            goal = torch.cat(
                [ptu.randn(self.vae.latent_sizes[0]), conditioning])
            goal = ptu.get_numpy(goal)  # latents[-1, :]

            latents = ptu.get_numpy(latents)
            latent_delta = latents - goal
            distances = np.zeros((H - 1, 1))
            for i in range(H - 1):
                distances[i, 0] = np.linalg.norm(latent_delta[i + 1, :])

            terminals = np.zeros((H - 1, 1))
            # terminals[-1, 0] = 1
            path = dict(
                observations=[],
                actions=actions[n, :H - 1, :],
                next_observations=[],
                rewards=-distances,
                terminals=terminals,
            )

            for t in range(H - 1):
                # reward = -np.linalg.norm(latent_delta[i, :])

                obs = dict(
                    latent_observation=latents[t, :],
                    latent_achieved_goal=latents[t, :],
                    latent_desired_goal=goal,
                )
                next_obs = dict(
                    latent_observation=latents[t + 1, :],
                    latent_achieved_goal=latents[t + 1, :],
                    latent_desired_goal=goal,
                )

                path['observations'].append(obs)
                path['next_observations'].append(next_obs)

            # import ipdb; ipdb.set_trace()
            self.replay_buffer.add_path(path)

    def _end_epoch(self):
        timer.start_timer('vae training')
        self._train_vae(self.epoch)
        timer.stop_timer('vae training')
        super()._end_epoch()

    def _get_diagnostics(self):
        vae_log = self._get_vae_diagnostics().copy()
        vae_log.update(super()._get_diagnostics())
        return vae_log

    def to(self, device):
        self.vae.to(device)
        super().to(device)

    """
    VAE-specific Code
    """

    def _train_vae(self, epoch):
        if self.parallel_vae_train and self._vae_training_process is None:
            self.init_vae_training_subprocess()
        should_train, amount_to_train = self.vae_training_schedule(epoch)
        rl_start_epoch = int(self.min_num_steps_before_training /
                             (self.num_expl_steps_per_train_loop *
                              self.num_train_loops_per_epoch))
        if should_train:  # or epoch <= (rl_start_epoch - 1):
            if self.parallel_vae_train:
                assert self._vae_training_process.is_alive()
                # Make sure the last vae update has finished before starting
                # another one
                if self._update_subprocess_vae_thread is not None:
                    self._update_subprocess_vae_thread.join()
                self._update_subprocess_vae_thread = Thread(
                    target=OnlineVaeAlgorithm.
                    update_vae_in_training_subprocess,
                    args=(self, epoch, ptu.device))
                self._update_subprocess_vae_thread.start()
                self._vae_conn_pipe.send((amount_to_train, epoch))
            else:
                _train_vae(self.vae_trainer, epoch, self.replay_buffer,
                           amount_to_train)
                self.replay_buffer.refresh_latents(epoch)
                _test_vae(
                    self.vae_trainer,
                    epoch,
                    self.replay_buffer,
                    vae_save_period=self.vae_save_period,
                    uniform_dataset=self.uniform_dataset,
                )

    def _get_vae_diagnostics(self):
        return add_prefix(
            self.vae_trainer.get_diagnostics(),
            prefix='vae_trainer/',
        )

    def _cleanup(self):
        if self.parallel_vae_train:
            self._vae_conn_pipe.close()
            self._vae_training_process.terminate()

    def init_vae_training_subprocess(self):
        assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer)

        self._vae_conn_pipe, process_pipe = Pipe()
        self._vae_training_process = Process(
            target=subprocess_train_vae_loop,
            args=(
                process_pipe,
                self.vae,
                self.vae.state_dict(),
                self.replay_buffer,
                self.replay_buffer.get_mp_info(),
                ptu.device,
            ))
        self._vae_training_process.start()
        self._vae_conn_pipe.send(self.vae_trainer)

    def update_vae_in_training_subprocess(self, epoch, device):
        self.vae.__setstate__(self._vae_conn_pipe.recv())
        self.vae.to(device)
        _test_vae(
            self.vae_trainer,
            epoch,
            self.replay_buffer,
            vae_save_period=self.vae_save_period,
            uniform_dataset=self.uniform_dataset,
        )
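subprocess_train_vae_loop is referenced above but not listed. Below is a skeleton of the loop it is assumed to run on the other end of _vae_conn_pipe, matching the Process arguments and the send/recv pattern in init_vae_training_subprocess, _train_vae and update_vae_in_training_subprocess; init_from_mp_info and the exact _train_vae argument order are assumptions, not the project's actual code.

def subprocess_train_vae_loop_sketch(conn_pipe, vae, vae_params, replay_buffer, mp_info, device):
    # Hypothetical worker-side loop for the six Process(...) arguments above.
    vae.load_state_dict(vae_params)
    vae.to(device)
    replay_buffer.init_from_mp_info(mp_info)   # assumed shared-memory hookup, not shown in this listing
    vae_trainer = conn_pipe.recv()             # the trainer object sent right after the process starts
    vae_trainer.model = vae
    while True:
        amount_to_train, epoch = conn_pipe.recv()                        # request sent by _train_vae
        _train_vae(vae_trainer, epoch, replay_buffer, amount_to_train)   # same helper as the serial branch above
        conn_pipe.send(vae.__getstate__())     # picked up by update_vae_in_training_subprocess via __setstate__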
コード例 #15
0
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')
    seed = np.random.randint(0, 100)

    env = ObstacleTowerEnv('../ObstacleTower/obstacletower', worker_id=seed,
                               retro=True, config={'total-floors': 12}, greyscale=True, timeout_wait=300)
    env._flattener = ActionFlattener([2, 3, 2, 1])
    env._action_space = env._flattener.action_space
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, 'main.model')
    predictor_path = os.path.join(args.save_dir, 'main.pred')
    target_path = os.path.join(args.save_dir, 'main.target')

    writer = SummaryWriter()  # log_dir=args.log_dir



    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) + list(rnd.predictor.parameters()), lr=args.lr)
   
    if args.load_model:
        print("Loading model...")
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))


    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(
            args.env_name,
            is_render,
            idx,
            child_conn,
            sticky_action=args.sticky_action,
            p=args.sticky_action_prob,
            max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0   # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    print("Load RMS =", args.load_rms)
    if args.load_rms:
        print("Loading RMS values for observation and reward normalization")
        with open('reward_rms.pkl', 'rb') as f:
            reward_rms = dill.load(f)
        with open('obs_rms.pkl', 'rb') as f:
            obs_rms = dill.load(f)
    else:
        reward_rms = RunningMeanStd()
        obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

        # normalize observation
        print('Initializing observation normalization...')
        next_obs = []
        for step in range(args.num_step * args.pre_obs_norm_steps):
            actions = np.random.randint(0, output_size, size=(args.num_worker,))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            for parent_conn in parent_conns:
                next_state, reward, done, realdone, log_reward = parent_conn.recv()
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            if len(next_obs) % (args.num_step * args.num_worker) == 0:
                next_obs = np.stack(next_obs)
                obs_rms.update(next_obs)
                next_obs = []
        with open('reward_rms.pkl', 'wb') as f:
            dill.dump(reward_rms, f)
        with open('obs_rms.pkl', 'wb') as f:
            dill.dump(obs_rms, f)

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(model, device, np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv()
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = int reward + ext Reward
            intrinsic_reward = compute_intrinsic_reward(rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device, np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose([1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / args.num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / args.num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', total_logging_action_probs.max(1).mean(), sample_episode)

        # Step 3. make targets and advantages
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              args.ext_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # intrinsic reward calculation
        # non-episodic: intrinsic returns ignore episode boundaries (dones are all zero)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              args.int_gamma,
                                              args.gae_lambda,
                                              args.num_step,
                                              args.num_worker,
                                              args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        train_model(args, device, output_size, model, rnd, optimizer,
                        np.float32(total_state) / 255., ext_target, int_target, total_action,
                        total_adv, ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5),
                        total_action_probs)

        if global_step % (args.num_worker * args.num_step * args.save_interval) == 0:
            print('Now Global Step :{}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)

            """
            checkpoint_list = np.array([int(re.search(r"\d+(\.\d+)?", x)[0]) for x in glob.glob(os.path.join('trained_models', args.env_name+'*.model'))])
            if len(checkpoint_list) == 0:
                last_checkpoint = -1
            else:
                last_checkpoint = checkpoint_list.max()
            next_checkpoint = last_checkpoint + 1
            print("Latest Checkpoint is #{}, saving checkpoint is #{}.".format(last_checkpoint, next_checkpoint))

            incre_model_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.model')
            incre_predictor_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.pred')
            incre_target_path = os.path.join(args.save_dir, args.env_name + str(next_checkpoint) + '.target')
            with open(incre_model_path, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(incre_predictor_path, 'wb') as f:
                torch.save(rnd.predictor.state_dict(), f)
            with open(incre_target_path, 'wb') as f:
                torch.save(rnd.target.state_dict(), f)
            """
            if args.terminate and (global_step > args.terminate_steps):
                with open('reward_rms.pkl', 'wb') as f:
                    dill.dump(reward_rms, f)
                with open('obs_rms.pkl', 'wb') as f:
                    dill.dump(obs_rms, f)
                break
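RunningMeanStd and the ((x - mean) / sqrt(var)).clip(-5, 5) normalization above follow the usual running-moments pattern; the class itself is not part of this listing, so here is a minimal sketch under that assumption.

import numpy as np

class RunningMeanStdSketch:
    # Hypothetical running mean/variance tracker using the parallel-moments update.
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x has shape (batch,) + shape
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count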
コード例 #16
0
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker * num_worker_per_env, input_size])
    while True:
        total_state, total_reward, total_done, total_next_state, total_action = [], [], [], [], []

        for _ in range(num_step):
            actions = agent.get_action(states)
            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            rewards, dones, next_states = [], [], []
            for parent_conn in parent_conns:
                s, r, d, _ = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)

            next_states = np.vstack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)

            total_next_state.append(next_states)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)

            states = next_states[:, :]
コード例 #17
0
def MCTS(root_node, gpu_Q, N, go_board, color, number_passes):
    start_color = color
    turn_switcher = {"black": "white",
                     "white": "black"}
    # Switch for adding calculated v relative to black
    relative_value =  {"black": 1,
                     "white": -1}
    # Define pipe for GPU process
    conn_rec, conn_send = Pipe(False)

    # Define variables to be used
    legal_board = np.empty((82), dtype=float)
    for i in range(N):
        #print(i)
        current_path = deque([])
        current_node = root_node
        color = start_color
        while True:
            current_node.N_total +=  1
            # Choose action
            current_node.U = (5*np.sqrt(current_node.N_total))*current_node.P*current_node.N_inv
            
            # Depending on current color Q_values are multiplied by -1
            if (color=="black"):
                a_chosen = np.argmax(current_node.U+current_node.illigal_board+current_node.Q)
            else:
                a_chosen = np.argmax(current_node.U+current_node.illigal_board-current_node.Q)
            # Add action and node to path and change color
            # Add current node to path
            current_path.append((current_node, a_chosen))
            if (current_node.N[a_chosen]!=0):
                # Case where edge is already explored
                #print("going down explored edge")
                # Increment visit count and update its inverse
                current_node.N[a_chosen] += 1
                current_node.N_inv[a_chosen] = 1/(1+current_node.N[a_chosen])
                
                # Left over code from virtual loss (not important)
                current_node.Q[a_chosen] = current_node.W[a_chosen]/current_node.N[a_chosen]
                
                # Case of already explored game end
                if (a_chosen==81):
                    game_done = False
                    if (number_passes==1) & (len(current_path)==1):
                        # Case where last action in game was a pass
                        game_done = True
                    else: 
                        # Normal rule of each game done after two passes in a row
                        try:
                            game_done = (81==current_path[-1][1]==current_path[-2][1])
                        except IndexError:
                            game_done = False
                    
                    # Count backwards if game has ended
                    if game_done:                        
                        v = current_node.W[81]/(current_node.N[81]-1)
                        for node, action in current_path:
                            node.W[action] += v
                            node.Q[action] = node.W[action]/node.N[action]
                        break
                
                
                # Update current node, color of turn, and repeat
                current_node = current_node.action_edges[a_chosen]
                # Switch color
                color = turn_switcher[color]
                continue
            else:
                # Case where edge is not explored
                
                # Update visit count of action
                current_node.N[a_chosen] = 1
                current_node.N_inv[a_chosen] = 1/(1+current_node.N[a_chosen])
                
                new_go_state = current_node.go_state.copy_game()
                
                # First check if game is done
                # Simulate action
                if (a_chosen==81):
                    # Calculate if game has ended
                    game_done = False
                    if (number_passes==1) & (len(current_path)==1):
                        # Case where last action in game was a pass
                        game_done = True
                    else: 
                        # Normal rule of each game done after two passes in a row
                        try:
                            game_done = (81==current_path[-1][1]==current_path[-2][1])
                        except IndexError:
                            game_done = False
                    
                    # Count backwards if game has ended
                    if game_done:                        
                        # Compute who won
                        counted_points = new_go_state.count_points()
                        v = (counted_points>0)-int(counted_points<0)
                        for node, action in current_path:
                            node.W[action] += v
                            node.Q[action] = node.W[action]/node.N[action]
                        break
                    
                    # Take pass move
                    new_go_state.move('pass', color)
                else:
                    new_go_state.move(np.unravel_index(a_chosen, (9,9)), color)
                    
                # Get state
                color = turn_switcher[color]
                S = new_go_state.get_state(color)
                # Rotate and reflect state randomly
                S, rotation, reflection  = rotate_S(S)
                
                # Get policy and value
                gpu_Q.put([S, conn_send])
                
                # Construct the legal and illegal boards in the meantime
                legal_board[0:81] = np.ndarray.flatten(new_go_state.get_legal_board(color))
                legal_board[81] = 1
                illegal_board = (legal_board-1)*1000
                # Receive P, v
                P, v = conn_rec.recv()
                v = relative_value[color]*v
                # Reverse rotation of P 
                P = reverse_rotate(P, rotation, reflection)
                # Rescale P based on legal moves
                P = np.multiply(P,legal_board)
                P = P/np.sum(P)
                
                # Generate new node
                new_node = state_node(new_go_state, P, color)
                
                # Apply a large negative penalty to stop the search from choosing illegal moves
                new_node.illigal_board = illegal_board
                
                # Add new node to tree
                current_node.action_edges[a_chosen] = new_node
                
                # Now back up 
                for node, action in current_path:
                    node.W[action] += v
                    node.Q[action] = node.W[action]/node.N[action]
                    # Normally we would update the visit count N as well,
                    #   but since virtual loss is not used, we can instead do it
                    #   at the start of the visit
                break
            
    return root_node
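The selection step inside the MCTS loop above combines a prior-weighted exploration bonus with color-signed Q values and a large negative mask on illegal moves. The same computation on dummy arrays, as a standalone illustration (the exploration constant 5 mirrors the code above; all of the node statistics are made up).

import numpy as np

rng = np.random.default_rng(0)
P = rng.dirichlet(np.ones(82))              # prior over 81 board points + pass
N = rng.integers(0, 10, size=82)            # per-action visit counts
N_total = N.sum()
N_inv = 1.0 / (1.0 + N)
Q = rng.uniform(-1, 1, size=82)             # mean action values, relative to black
illegal_board = np.zeros(82)
illegal_board[:5] = -1000.0                 # pretend the first five moves are illegal

U = 5 * np.sqrt(N_total) * P * N_inv        # exploration bonus, as in the MCTS above
a_black = np.argmax(U + illegal_board + Q)  # black maximizes Q
a_white = np.argmax(U + illegal_board - Q)  # white minimizes Q, hence the sign flip
print(a_black, a_white)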
コード例 #18
0
ファイル: paac.py プロジェクト: DaomingLyu/paac-pytorch
def train(args):

    torch.multiprocessing.set_start_method('forkserver')

    num_envs = args.num_envs
    num_workers = args.num_workers
    total_envs = num_workers * num_envs
    game_name = args.env_name
    max_train_steps = args.max_train_steps
    n_steps = args.n_steps
    init_lr = args.lr
    gamma = args.gamma
    clip_grad_norm = args.clip_grad_norm
    num_action = gym.make(game_name).action_space.n
    image_size = 84
    n_stack = 4

    model = paac_ff(min_act=num_action).cuda()

    x = Variable(torch.zeros(total_envs, n_stack, image_size, image_size),
                 volatile=True).cuda()
    xs = [
        Variable(torch.zeros(total_envs, n_stack, image_size,
                             image_size)).cuda() for i in range(n_steps)
    ]

    share_reward = [
        Variable(torch.zeros(total_envs)).cuda() for _ in range(n_steps)
    ]
    share_mask = [
        Variable(torch.zeros(total_envs)).cuda() for _ in range(n_steps)
    ]
    constant_one = torch.ones(total_envs).cuda()

    optimizer = optim.Adam(model.parameters(), lr=init_lr)

    workers = []
    parent_conns = []
    child_conns = []
    for i in range(num_workers):
        parent_conn, child_conn = Pipe()
        w = worker(i, num_envs, game_name, n_stack, child_conn, args)
        w.start()
        workers.append(w)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    new_s = np.zeros((total_envs, n_stack, image_size, image_size))

    for global_step in range(1, max_train_steps + 1):

        cache_v_series = []
        entropies = []
        sampled_log_probs = []

        for step in range(n_steps):

            xs[step].data.copy_(torch.from_numpy(new_s))
            v, pi = model(xs[step])
            cache_v_series.append(v)

            sampling_action = pi.data.multinomial(1)

            log_pi = (pi + 1e-12).log()
            entropy = -(log_pi * pi).sum(1)
            sampled_log_prob = log_pi.gather(
                1, Variable(sampling_action)).squeeze()
            sampled_log_probs.append(sampled_log_prob)
            entropies.append(entropy)

            send_action = sampling_action.squeeze().cpu().numpy()
            send_action = np.split(send_action, num_workers)

            # send action and then get state
            for parent_conn, action in zip(parent_conns, send_action):
                parent_conn.send(action)

            batch_s, batch_r, batch_mask = [], [], []
            for parent_conn in parent_conns:
                s, r, mask = parent_conn.recv()
                batch_s.append(s)
                batch_r.append(r)
                batch_mask.append(mask)

            new_s = np.vstack(batch_s)
            r = np.hstack(batch_r).clip(-1, 1)  # clip reward
            mask = np.hstack(batch_mask)

            share_reward[step].data.copy_(torch.from_numpy(r))
            share_mask[step].data.copy_(torch.from_numpy(mask))

        x.data.copy_(torch.from_numpy(new_s))
        v, _ = model(x)  # v is volatile
        R = Variable(v.data.clone())
        v_loss = 0.0
        policy_loss = 0.0
        entropy_loss = 0.0

        for i in reversed(range(n_steps)):

            R = share_reward[i] + 0.99 * share_mask[i] * R
            advantage = R - cache_v_series[i]
            v_loss += advantage.pow(2).mul(0.5).mean()

            policy_loss -= sampled_log_probs[i].mul(advantage.detach()).mean()
            entropy_loss -= entropies[i].mean()

        total_loss = policy_loss + entropy_loss.mul(0.02) + v_loss * 0.5
        total_loss = total_loss.mul(1 / (n_steps))

        # adjust learning rate
        new_lr = init_lr - (global_step / max_train_steps) * init_lr
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr

        optimizer.zero_grad()
        total_loss.backward()
        grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(),
                                                  clip_grad_norm)

        optimizer.step()

        if global_step % 10000 == 0:
            torch.save(model.state_dict(), './model/model_%s.pth' % game_name)

    for parent_conn in parent_conns:
        parent_conn.send(None)

    for w in workers:
        w.join()
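The backward loop over n_steps above accumulates masked n-step returns (R = r + gamma * mask * R) starting from the bootstrap value of the last state. The same recursion as a small standalone numpy function, a sketch rather than code from the project.

import numpy as np

def n_step_returns(rewards, masks, bootstrap_value, gamma=0.99):
    # rewards, masks: shape (n_steps, num_envs); mask is 0 where the episode ended.
    R = bootstrap_value.astype(np.float64)
    returns = np.zeros_like(rewards, dtype=np.float64)
    for t in reversed(range(rewards.shape[0])):
        R = rewards[t] + gamma * masks[t] * R
        returns[t] = R
    return returns

# Two environments, three steps; the second environment terminates after step 1.
rewards = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])
masks = np.array([[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
print(n_step_returns(rewards, masks, bootstrap_value=np.array([0.5, 0.5])))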
コード例 #19
0
    valueloss_sample = []
    POLICYLOSS = []
    POLICYLOSS_MEAN = []
    policyloss_sample = []
    ENTROPY = []
    ENTROPY_MEAN = []
    entropy_sample = []

    EPISODES = []
    REWARDS = []
    REWARDS_MEAN = []

    episode = 0
    while True:
        (cpu, is_nstep, value_loss, policy_loss, entropy, reward,
         complete) = receiver.recv()

        dones[cpu] = complete

        # Exit once every worker has reported completion
        if all(dones):
            break

        if complete:
            continue

        if is_nstep:
コード例 #20
0
class OnlineVaeAlgorithm(TorchBatchRLAlgorithm):
    def __init__(self,
                 vae,
                 vae_trainer,
                 *base_args,
                 vae_save_period=1,
                 vae_training_schedule=vae_schedules.never_train,
                 oracle_data=False,
                 parallel_vae_train=True,
                 vae_min_num_steps_before_training=0,
                 uniform_dataset=None,
                 **base_kwargs):
        super().__init__(*base_args, **base_kwargs)
        assert isinstance(self.replay_buffer, OnlineVaeRelabelingBuffer)
        self.vae = vae
        self.vae_trainer = vae_trainer
        self.vae_trainer.model = self.vae
        self.vae_save_period = vae_save_period
        self.vae_training_schedule = vae_training_schedule
        self.oracle_data = oracle_data

        self.parallel_vae_train = parallel_vae_train
        self.vae_min_num_steps_before_training = vae_min_num_steps_before_training
        self.uniform_dataset = uniform_dataset

        self._vae_training_process = None
        self._update_subprocess_vae_thread = None
        self._vae_conn_pipe = None

    def _train(self):
        super()._train()
        print("_train")
        self._cleanup()

    def _end_epoch(self, epoch):
        self._train_vae(epoch)
        gt.stamp('vae training')
        super()._end_epoch(epoch)

    def _log_stats(self, epoch):
        self._log_vae_stats()
        super()._log_stats(epoch)

    def to(self, device):
        self.vae.to(device)
        super().to(device)

    def _get_snapshot(self):
        snapshot = super()._get_snapshot()
        assert 'vae' not in snapshot
        snapshot['vae'] = self.vae
        return snapshot

    """
    VAE-specific Code
    """

    def _train_vae(self, epoch):
        if self.parallel_vae_train and self._vae_training_process is None:
            self.init_vae_training_subprocess()
        should_train, amount_to_train = self.vae_training_schedule(epoch)
        rl_start_epoch = int(self.min_num_steps_before_training /
                             (self.num_expl_steps_per_train_loop *
                              self.num_train_loops_per_epoch))
        if should_train or epoch <= (rl_start_epoch - 1):
            if self.parallel_vae_train:
                assert self._vae_training_process.is_alive()
                # Make sure the last vae update has finished before starting
                # another one
                if self._update_subprocess_vae_thread is not None:
                    self._update_subprocess_vae_thread.join()
                self._update_subprocess_vae_thread = Thread(
                    target=OnlineVaeAlgorithm.
                    update_vae_in_training_subprocess,
                    args=(self, epoch, ptu.device))
                self._update_subprocess_vae_thread.start()
                self._vae_conn_pipe.send((amount_to_train, epoch))
            else:
                _train_vae(self.vae_trainer, self.replay_buffer, epoch,
                           amount_to_train)
                self.replay_buffer.refresh_latents(epoch)
                _test_vae(
                    self.vae_trainer,
                    epoch,
                    self.replay_buffer,
                    vae_save_period=self.vae_save_period,
                    uniform_dataset=self.uniform_dataset,
                )

    def _log_vae_stats(self):
        logger.record_dict(
            self.vae_trainer.get_diagnostics(),
            prefix='vae_trainer/',
        )

    def _cleanup(self):
        if self.parallel_vae_train:
            self._vae_conn_pipe.close()
            self._vae_training_process.terminate()

    def init_vae_training_subprocess(self):
        assert isinstance(self.replay_buffer, SharedObsDictRelabelingBuffer)

        self._vae_conn_pipe, process_pipe = Pipe()
        self._vae_training_process = Process(
            target=subprocess_train_vae_loop,
            args=(
                process_pipe,
                self.vae,
                self.vae.state_dict(),
                self.replay_buffer,
                self.replay_buffer.get_mp_info(),
                ptu.device,
            ))
        self._vae_training_process.start()
        self._vae_conn_pipe.send(self.vae_trainer)

    def update_vae_in_training_subprocess(self, epoch, device):
        self.vae.__setstate__(self._vae_conn_pipe.recv())
        self.vae.to(device)
        _test_vae(
            self.vae_trainer,
            epoch,
            self.replay_buffer,
            vae_save_period=self.vae_save_period,
            uniform_dataset=self.uniform_dataset,
        )
コード例 #21
0
def main():
    print({section: dict(config[section]) for section in config.sections()})
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id),
                                            COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    elif env_type == 'vizdoom':
        input_size = (image_size, image_size)
        if env_id == 'battle':
            output_size = 4
            print('vizdoom battle init')
        elif env_id == 'my_way_home':
            output_size = 3
            print('vizdoom my way home init')
    else:
        raise NotImplementedError

    if env_type == 'mario' or env_type == 'atari':
        input_size = env.observation_space.shape  # 4
        output_size = env.action_space.n  # 2

        if 'Breakout' in env_id:
            output_size -= 1

        env.close()

    is_render = True
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    use_cuda = False
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = 1

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])

    sticky_action = False
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    agent = RNDAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    elif default_config['EnvType'] == 'vizdoom':
        print('Doom Environment')
        env_type = DoomEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    print('Loading Pre-trained model....')
    if use_cuda:
        print('using cuda')
        agent.model.load_state_dict(torch.load(model_path))
        agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
        agent.rnd.target.load_state_dict(torch.load(target_path))
    else:
        print('not using cuda')
        agent.model.load_state_dict(torch.load(model_path, map_location='cpu'))
        agent.rnd.predictor.load_state_dict(
            torch.load(predictor_path, map_location='cpu'))
        agent.rnd.target.load_state_dict(
            torch.load(target_path, map_location='cpu'))
    print('End load...')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id,
                        is_render,
                        idx,
                        child_conn,
                        sticky_action=sticky_action,
                        p=action_prob,
                        life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    print('start enjoy!')
    for i in range(1, 10):
        states = np.zeros([num_worker, 4, image_size, image_size])
        steps = 0
        rall = 0
        rd = False
        intrinsic_reward_list = []
        while not rd:
            if default_config['EnvType'] == 'vizdoom':
                time.sleep(0.05)

            steps += 1
            actions, value_ext, value_int, policy = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                rall += r
                next_states = s.reshape([1, 4, image_size, image_size])
                next_obs = s[3, :, :].reshape([1, 1, image_size, image_size])

            # total reward = int reward + ext Reward
            intrinsic_reward = agent.compute_intrinsic_reward(next_obs)
            intrinsic_reward_list.append(intrinsic_reward)
            states = next_states[:, :, :, :]

            if rd:
                intrinsic_reward_list = (intrinsic_reward_list -
                                         np.mean(intrinsic_reward_list)
                                         ) / np.std(intrinsic_reward_list)
                '''
                with open('int_reward', 'wb') as f:
                    pickle.dump(intrinsic_reward_list, f)
                '''
                steps = 0
                rall = 0
コード例 #22
0
def sim_game(gpu_Q, N, data_Q, v_resign):
    print("Starting game")
    no_resign = np.random.rand(1)[0]>0.95
    
    # Hyperparameters
    temp_switch = 16  # Number of turns played before switching to greedy (argmax) move selection
    eta_par = 0.03
    epsilon = 0.25
    
    # Switch for adding calculated v relative to black
    relative_value =  {"black": 1,
                     "white": -1}
    turn_switcher = {"black": "white",
                     "white": "black"}
    # Define pipe for GPU process
    conn_rec, conn_send = Pipe(False)
    
    # List for storing resignation values
    resign_list_black = []
    resign_list_white = []
    
    # Start game
    n_rounds = 0
    turn_color = "white"
    number_passes = 0
    go_game = go_board()
    data = []
    resign = False
    
    # Evaluate first node
    S = go_game.get_state(turn_switcher[turn_color])
    # Get policy and value
    
    gpu_Q.put([S, conn_send])
    P, v = conn_rec.recv()
    # Generate start node
    root_node = state_node(go_game, P, turn_switcher[turn_color])
    root_node.illigal_board = np.zeros(82)
    
    # Run next moves
    while True:
        n_rounds += 1
        turn_color = turn_switcher[turn_color]
        
        # Case where early temperature is used
        if (n_rounds<=temp_switch):    
            # Simulate MCTS
            root_node = MCTS(root_node, gpu_Q , N, go_game, turn_color, number_passes)
            
            # Compute legal policy
            pi_legal = root_node.N/root_node.N_total
            
            # Select action
            action = np.random.choice(82, size=1, p=pi_legal)[0]
            
        # Case where later temperature is used
        else:
            # Get noise
            eta = np.random.dirichlet(np.ones(82)*eta_par)
            root_node.P = (1-epsilon)*root_node.P+epsilon*eta
            
            # Simulate MCTS
            root_node = MCTS(root_node, gpu_Q , N, go_game, turn_color, number_passes)
            
            # Compute legal actions visit count (needed for storing)
            pi_legal = root_node.N/root_node.N_total
            
            # Pick move
            action = np.argmax(root_node.N)
        
        # Save Data
        S = go_game.get_state(turn_color)
        data.append([S.copy(), pi_legal.copy(), turn_color])
        
        # Check for resignation
        if (turn_color=="black"):
            try:
                resign_req = max([np.max(root_node.action_edges[action].Q), root_node.Q[action]])
            except:
                resign_req = relative_value[turn_color]*root_node.Q[action]
        else:
            try:
                # To account for flipped v
                resign_req = -1*min([np.min(root_node.action_edges[action].Q), root_node.Q[action]])
            except:
                resign_req = relative_value[turn_color]*root_node.Q[action]
            
        # Add resign values for color
        if (turn_color=="black"):
            resign_list_black.append(resign_req)
        else:
            resign_list_white.append(resign_req)
        
        # Check if game ends
        if ((no_resign==False) & (resign_req<v_resign)):
            # resign
            resign = True
            break
        
        # Convert and take action
        #print("Move n. ",n_rounds, "New move was: ", action, "color was: ", turn_color)
        if (action==81):
                go_game.move('pass', turn_color)
                number_passes += 1
        else:
                go_game.move(np.unravel_index(action, (9,9)), turn_color)
                number_passes = 0
        # Check if game is over or too long (9*9*2)
        if ((number_passes==2) | (n_rounds>162)):
            break
        # Pick move
        root_node = root_node.action_edges[action]
        
    # Game is over
    
    # Find winner
    if (resign==True):
        # Set winner depending on resigned color
        if (turn_color=="black"):
            z = {"black": -1,
                 "white": 1}
        else:
            z = {"black": 1,
                 "white": -1}
    else:
        # No resignation: count points to determine the winner
        points = go_game.count_points()
        
        # Black is winner
        if (points>0):
            z = {"black": 1,
                 "white": -1}
        else:
            z = {"black": -1,
                 "white": 1}
        
    # Define data arrays
    S_array = np.empty((n_rounds, 17, 9, 9), dtype=bool)
    P_array = np.empty((n_rounds, 82), dtype=float)
    z_array = np.empty((n_rounds), dtype=int)
    
    # Loop over each move and fill in arrays
    i = 0
    for S, P , turn_color in data:
        S_array[i] = S
        P_array[i] = P
        z_array[i] = z[turn_color]
        i += 1
    # Send data
    
    # In case game was used to check for false positives, compute lowest value
    if (no_resign==True):
        if (z["black"]==1):
            false_positive = min(resign_list_black)
        else:
            false_positive = min(resign_list_white)
        # Send data
        data_Q.put([S_array, P_array, z_array, false_positive])
    else:
        data_Q.put([S_array, P_array, z_array, None])
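In the later-temperature branch above, the root prior is mixed with Dirichlet noise before running MCTS, which keeps some exploration at the root even when the final move is chosen greedily. The mixing step in isolation, with the same epsilon and eta_par values as in sim_game (standalone sketch; the stand-in prior is random).

import numpy as np

epsilon, eta_par = 0.25, 0.03
P = np.random.dirichlet(np.ones(82))            # stand-in for the network prior at the root
eta = np.random.dirichlet(np.ones(82) * eta_par)
P_noisy = (1 - epsilon) * P + epsilon * eta     # same mixing rule applied to root_node.P above
assert np.isclose(P_noisy.sum(), 1.0)           # still a probability distribution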
コード例 #23
0
def main():
    print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']
    assert train_method == 'RND'
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    is_load_model = False
    is_render = False
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    run_path = Path(
        f'runs/{env_id}_{datetime.now().strftime("%b%d_%H-%M-%S")}')
    log_path = run_path / 'logs'
    subgoals_path = run_path / 'subgoal_plots'
    data_path = run_path / 'json_data'

    run_path.mkdir(parents=True)
    log_path.mkdir()
    subgoals_path.mkdir()
    data_path.mkdir()

    writer = SummaryWriter(log_path)

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')
    torch.set_default_tensor_type(
        'torch.cuda.FloatTensor' if use_cuda else 'torch.FloatTensor')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)
    drn_model = DeepRelNov(agent.rnd,
                           input_size,
                           output_size,
                           use_cuda=use_cuda)

    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
            agent.rnd.target.load_state_dict(torch.load(target_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))
            agent.rnd.predictor.load_state_dict(
                torch.load(predictor_path, map_location='cpu'))
            agent.rnd.target.load_state_dict(
                torch.load(target_path, map_location='cpu'))
        print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id,
                        is_render,
                        idx,
                        child_conn,
                        sticky_action=sticky_action,
                        p=action_prob,
                        life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    print('Start to initialize observation normalization parameters...')
    next_obs = []
    for _ in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr, i = parent_conn.recv()
            next_obs.append(s[-1, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('End of initialization...')

    # this is for all envs
    accumulated_worker_episode_reward = np.zeros((num_worker, ))
    # this is for a single env (env = 0)
    accumulated_worker_episode_info = {
        "images": [],
        "visited_rooms": [],
        "current_room": [],
        "player_pos": []
    }
    episode_traj_buffer = []
    episode_counter = 0

    episode_rewards = [[] for _ in range(num_worker)]
    step_rewards = [[] for _ in range(num_worker)]
    global_ep = 0

    while True:
        total_state, total_reward, total_done, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy, total_policy_np = \
            [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for cur_step in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs, info = [], [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr, i = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[-1, :, :].reshape([1, 84, 84]))
                info.append(i)

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            accumulated_worker_episode_reward += rewards
            for i in range(len(rewards)):
                step_rewards[i].append(rewards[i])
                if real_dones[i]:
                    episode_rewards[i].append(
                        accumulated_worker_episode_reward[i])
                    accumulated_worker_episode_reward[i] = 0

            accumulated_worker_episode_info["images"].append(next_obs[0])
            accumulated_worker_episode_info["visited_rooms"].append(
                info[0].get('episode', {}).get('visited_rooms', {}))
            accumulated_worker_episode_info["current_room"].append(info[0].get(
                'current_room', {}))
            accumulated_worker_episode_info["player_pos"].append(info[0].get(
                'player_pos', {}))
            if real_dones[0]:
                episode_traj_buffer.append(accumulated_worker_episode_info)
                accumulated_worker_episode_info = {
                    "images": [],
                    "visited_rooms": [],
                    "current_room": [],
                    "player_pos": []
                }

            # total reward = intrinsic reward + extrinsic reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

            writer.add_scalar('data/avg_reward_per_step', np.mean(rewards),
                              global_step + num_worker * (cur_step - num_step))

        while all(episode_rewards):
            global_ep += 1
            avg_ep_reward = np.mean(
                [env_ep_rewards.pop(0) for env_ep_rewards in episode_rewards])
            writer.add_scalar('data/avg_reward_per_episode', avg_ep_reward,
                              global_ep)

        _, value_ext, value_int, _ = agent.get_action(
            np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. normalize intrinsic reward
        # update the running estimate of the discounted intrinsic return
        total_int_reward = np.stack(total_int_reward).transpose()
        #total_int_reward = np.stack(total_int_reward).swapaxes(0, 1)
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # log max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, gamma,
                                              num_step, num_worker)

        # intrinsic reward calculation
        # non-episodic (dones are all zero)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, int_gamma,
                                              num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        agent.train_model(
            np.float32(total_state) / 255., ext_target, int_target,
            total_action, total_adv,
            ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(
                -5, 5), total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step: {}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)

        #############################

        for traj_num, episode_dict in enumerate(episode_traj_buffer):
            traj = np.array(episode_dict["images"])
            obs_traj = ((traj - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(
                -5, 5)
            drn_model.train_rel_nov(obs_traj)
            episode_counter += 1

            if episode_counter % 100 == 0:
                subgoals = drn_model.get_filtered_subgoals(obs_traj, 1)

                # TODO: make option

        episode_traj_buffer = []
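The rollout above relies on two helpers that are not shown in this listing, `RunningMeanStd` and `RewardForwardFilter`. A minimal sketch of how these are typically implemented in RND-style code (a parallel-variance moment tracker and a per-env discounted return filter); the names match the calls above, but the bodies are an assumption, not this project's exact code:

import numpy as np

class RunningMeanStd:
    """Running mean/variance via the parallel (Chan et al.) update rule."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        self.update_from_moments(np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / tot_count
        self.mean, self.var, self.count = new_mean, m_2 / tot_count, tot_count


class RewardForwardFilter:
    """Per-env discounted sum of intrinsic rewards; reward_rms normalizes against its spread."""
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems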
コード例 #24
0
ファイル: train.py プロジェクト: unchartech-user/rnd
def main():
    args = get_args()
    device = torch.device('cuda' if args.cuda else 'cpu')

    env = gym.make(args.env_name)

    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in args.env_name:
        output_size -= 1

    env.close()

    is_render = False
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    model_path = os.path.join(args.save_dir, args.env_name + '.model')
    predictor_path = os.path.join(args.save_dir, args.env_name + '.pred')
    target_path = os.path.join(args.save_dir, args.env_name + '.target')

    writer = SummaryWriter(log_dir=args.log_dir)

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    discounted_reward = RewardForwardFilter(args.ext_gamma)

    model = CnnActorCriticNetwork(input_size, output_size, args.use_noisy_net)
    rnd = RNDModel(input_size, output_size)
    model = model.to(device)
    rnd = rnd.to(device)
    optimizer = optim.Adam(list(model.parameters()) +
                           list(rnd.predictor.parameters()),
                           lr=args.lr)

    if args.load_model:
        if args.cuda:
            model.load_state_dict(torch.load(model_path))
        else:
            model.load_state_dict(torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(args.num_worker):
        parent_conn, child_conn = Pipe()
        work = AtariEnvironment(args.env_name,
                                is_render,
                                idx,
                                child_conn,
                                sticky_action=args.sticky_action,
                                p=args.sticky_action_prob,
                                max_episode_steps=args.max_episode_steps)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([args.num_worker, 4, 84, 84])

    sample_env_index = 0  # Sample Environment index to log
    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize observation
    print('Initializing observation normalization...')
    next_obs = []
    for step in range(args.num_step * args.pre_obs_norm_steps):
        actions = np.random.randint(0, output_size, size=(args.num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            next_state, reward, done, realdone, log_reward = parent_conn.recv()
            next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (args.num_step * args.num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []

    print('Training...')
    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_action_probs = \
            [], [], [], [], [], [], [], [], [], []
        global_step += (args.num_worker * args.num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(args.num_step):
            actions, value_ext, value_int, action_probs = get_action(
                model, device,
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                next_state, reward, done, real_done, log_reward = parent_conn.recv(
                )
                next_states.append(next_state)
                rewards.append(reward)
                dones.append(done)
                real_dones.append(real_done)
                log_rewards.append(log_reward)
                next_obs.append(next_state[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = intrinsic reward + extrinsic reward
            intrinsic_reward = compute_intrinsic_reward(
                rnd, device,
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_index]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_action_probs.append(action_probs)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_index]

            sample_step += 1
            if real_dones[sample_env_index]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = get_action(model, device,
                                                np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_action_probs = np.vstack(total_action_probs)

        # Step 2. normalize intrinsic reward
        # update the running estimate of the discounted intrinsic return
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / args.num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / args.num_worker,
                          global_update)
        # -------------------------------------------------------------------------------------------

        # log max action probability
        writer.add_scalar('data/max_prob',
                          total_logging_action_probs.max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, args.ext_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # intrinsic reward calculation
        # non-episodic (dones are all zero)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, args.int_gamma,
                                              args.gae_lambda, args.num_step,
                                              args.num_worker, args.use_gae)

        # add ext adv and int adv
        total_adv = int_adv * args.int_coef + ext_adv * args.ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        train_model(args, device, output_size, model, rnd, optimizer,
                    np.float32(total_state) / 255., ext_target, int_target,
                    total_action, total_adv,
                    ((total_next_obs - obs_rms.mean) /
                     np.sqrt(obs_rms.var)).clip(-5, 5), total_action_probs)

        if global_step % (args.num_worker * args.num_step *
                          args.save_interval) == 0:
            print('Now Global Step: {}'.format(global_step))
            torch.save(model.state_dict(), model_path)
            torch.save(rnd.predictor.state_dict(), predictor_path)
            torch.save(rnd.target.state_dict(), target_path)
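make_train_data is called above with args.use_gae and args.gae_lambda but is not reproduced in this listing. A sketch of the GAE(lambda) branch under the common convention that `values` carries one extra bootstrap column appended after the rollout; this is an illustrative reconstruction, not the project's exact function:

import numpy as np

def make_train_data_gae(rewards, dones, values, gamma, gae_lambda, num_step, num_worker):
    # rewards, dones: shape (num_worker, num_step); values: (num_worker, num_step + 1)
    advantages = np.zeros((num_worker, num_step), dtype=np.float32)
    gae = np.zeros(num_worker, dtype=np.float32)
    for t in reversed(range(num_step)):
        # TD residual, zeroing the bootstrap when the episode ended at step t
        delta = rewards[:, t] + gamma * values[:, t + 1] * (1 - dones[:, t]) - values[:, t]
        gae = delta + gamma * gae_lambda * (1 - dones[:, t]) * gae
        advantages[:, t] = gae
    # value targets are advantages plus the value baseline
    targets = advantages + values[:, :-1]
    return targets.reshape(-1), advantages.reshape(-1)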
コード例 #25
0
ファイル: adapt_e.py プロジェクト: zhenchangXia/MORL
        for _ in range(REPEAT):

            total_state, total_reward, total_target_reward, total_done, total_action, total_moreward\
                = [], [], [], [], [], []

            while True:
                actions = agent.get_action(states, explore_w)

                for parent_conn, action in zip(parent_conns, actions):
                    parent_conn.send(action)

                next_states, rewards, target_rewards, dones, real_dones, morewards, scores\
                    = [], [], [], [], [], [], []

                for parent_conn in parent_conns:
                    s, r, d, rd, mor, sc = parent_conn.recv()
                    next_states.append(s)
                    rewards.append(explore_w.dot(mor))
                    target_rewards.append(UNKNOWN_PREFERENCE.dot(mor))
                    dones.append(d)
                    real_dones.append(rd)
                    morewards.append(mor)
                    scores.append(sc)
                    # resample if done
                    # if d:
                    #     explore_w = renew_w(explore_w, cnt, pref_param)

                next_states = np.stack(next_states)
                rewards = np.hstack(rewards) * args.reward_scale
                target_rewards = np.hstack(target_rewards) * args.reward_scale
                dones = np.hstack(dones)
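In this multi-objective snippet the vector reward `mor` is collapsed to a scalar with a preference vector, once with `explore_w` for training and once with `UNKNOWN_PREFERENCE` for evaluation. A hypothetical sketch of that linear scalarization and of drawing a fresh preference from the simplex (the helper names below are illustrative, not from the project):

import numpy as np

def sample_preference(num_objectives, rng=np.random):
    # Draw a random weight vector on the probability simplex (non-negative, sums to 1).
    return rng.dirichlet(np.ones(num_objectives)).astype(np.float32)

def scalarize(mo_reward, preference):
    # Linear scalarization: vector reward -> scalar via dot product with the preference.
    return float(np.dot(preference, mo_reward))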
コード例 #26
0
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = env.action_space.n - 1
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = FuN(num_actions, args, device)
    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    workers = []
    parent_conns = []
    child_conns = []

    for i in range(args.num_envs):
        parent_conn, child_conn = Pipe()
        worker = EnvWorker(args.env_name, args.render, child_conn)
        worker.start()
        workers.append(worker)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    net.to(device)
    net.train()

    global_steps = 0
    score = np.zeros(args.num_envs)
    count = 0
    grad_norm = 0

    histories = torch.zeros([args.num_envs, 3, 84, 84]).to(device)

    m_hx = torch.zeros(args.num_envs, num_actions * 16).to(device)
    m_cx = torch.zeros(args.num_envs, num_actions * 16).to(device)
    m_lstm = (m_hx, m_cx)

    w_hx = torch.zeros(args.num_envs, num_actions * 16).to(device)
    w_cx = torch.zeros(args.num_envs, num_actions * 16).to(device)
    w_lstm = (w_hx, w_cx)

    goals_horizon = torch.zeros(args.num_envs, args.horizon + 1,
                                num_actions * 16).to(device)

    while True:
        count += 1
        memory = Memory()
        global_steps += (args.num_envs * args.num_step)

        # gather samples from the environment
        for i in range(args.num_step):
            # TODO: think about net output
            net_output = net(histories.to(device), m_lstm, w_lstm,
                             goals_horizon)
            policies, goal, goals_horizon, m_lstm, w_lstm, m_value, w_value_ext, w_value_int, m_state = net_output

            actions = get_action(policies, num_actions)

            # send action to each worker environment and get state information
            next_histories, rewards, masks, dones = [], [], [], []

            for i, (parent_conn,
                    action) in enumerate(zip(parent_conns, actions)):
                parent_conn.send(action)
                next_history, reward, dead, done = parent_conn.recv()
                next_histories.append(next_history)
                rewards.append(reward)
                masks.append(1 - dead)
                dones.append(done)

                if dead:
                    m_hx_mask = torch.ones(args.num_envs,
                                           num_actions * 16).to(device)
                    m_hx_mask[i, :] = m_hx_mask[i, :] * 0
                    m_cx_mask = torch.ones(args.num_envs,
                                           num_actions * 16).to(device)
                    m_cx_mask[i, :] = m_cx_mask[i, :] * 0
                    m_hx, m_cx = m_lstm
                    m_hx = m_hx * m_hx_mask
                    m_cx = m_cx * m_cx_mask
                    m_lstm = (m_hx, m_cx)

                    w_hx_mask = torch.ones(args.num_envs,
                                           num_actions * 16).to(device)
                    w_hx_mask[i, :] = w_hx_mask[i, :] * 0
                    w_cx_mask = torch.ones(args.num_envs,
                                           num_actions * 16).to(device)
                    w_cx_mask[i, :] = w_cx_mask[i, :] * 0
                    w_hx, w_cx = w_lstm
                    w_hx = w_hx * w_hx_mask
                    w_cx = w_cx * w_cx_mask
                    w_lstm = (w_hx, w_cx)

                    goal_init = torch.zeros(args.horizon + 1,
                                            num_actions * 16).to(device)
                    goals_horizon[i] = goal_init

            score += rewards[0]

            # if agent in first environment dies, print and log score
            for i in range(args.num_envs):
                if dones[i]:
                    entropy = -policies * torch.log(policies + 1e-5)
                    entropy = entropy.mean().data.cpu()
                    print(
                        'global steps {} | score: {} | entropy: {:.4f} | grad norm: {:.3f} '
                        .format(global_steps, score[i], entropy, grad_norm))
                    if i == 0:
                        writer.add_scalar('log/score', score[i], global_steps)
                    score[i] = 0

            next_histories = torch.Tensor(next_histories).to(device)
            rewards = np.hstack(rewards)
            masks = np.hstack(masks)
            memory.push(histories, next_histories, actions, rewards, masks,
                        goal, policies, m_lstm, w_lstm, m_value, w_value_ext,
                        w_value_int, m_state)
            histories = next_histories

        # Train every args.num_step
        if (global_steps % args.num_step) == 0:  # Need to fix logic
            transitions = memory.sample()
            loss, grad_norm = train_model(net, optimizer, transitions, args)
            m_hx, m_cx = m_lstm
            m_lstm = (m_hx.detach(), m_cx.detach())
            w_hx, w_cx = w_lstm
            w_lstm = (w_hx.detach(), w_cx.detach())
            goals_horizon = goals_horizon.detach()
            # avg_loss.append(loss.cpu().data)

        if count % args.save_interval == 0:
            ckpt_path = args.save_path + 'model.pt'
            torch.save(net.state_dict(), ckpt_path)
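When a worker dies, the FuN loop above rebuilds full-size masks just to zero one row of the manager and worker LSTM states. A compact sketch of an equivalent per-row reset (a hypothetical helper, shown only to make the masking logic explicit):

import torch

def reset_lstm_row(lstm_state, env_idx):
    # Zero a single environment's (hx, cx) row, leaving the other envs untouched;
    # equivalent to multiplying by a mask that is 0 only at env_idx.
    hx, cx = lstm_state
    hx = hx.clone()
    cx = cx.clone()
    hx[env_idx].zero_()
    cx[env_idx].zero_()
    return hx, cx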
コード例 #27
0
def main():
    print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = JoypadSpace(gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    is_load_model = True
    is_render = False
    model_path = 'models/{}.model'.format(env_id)
    predictor_path = 'models/{}.pred'.format(env_id)
    target_path = 'models/{}.target'.format(env_id)

    writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = int(default_config['NumEnv'])

    num_step = int(default_config['NumStep'])

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    int_gamma = float(default_config['IntGamma'])
    clip_grad_norm = float(default_config['ClipGradNorm'])
    ext_coef = float(default_config['ExtCoef'])
    int_coef = float(default_config['IntCoef'])

    sticky_action = default_config.getboolean('StickyAction')
    action_prob = float(default_config['ActionProb'])
    life_done = default_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    agent = RNDAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            agent.rnd.predictor.load_state_dict(torch.load(predictor_path))
            agent.rnd.target.load_state_dict(torch.load(target_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))
            agent.rnd.predictor.load_state_dict(
                torch.load(predictor_path, map_location='cpu'))
            agent.rnd.target.load_state_dict(
                torch.load(target_path, map_location='cpu'))
        print('load finished!')

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id,
                        is_render,
                        idx,
                        child_conn,
                        sticky_action=sticky_action,
                        p=action_prob,
                        life_done=life_done)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    print('Start to initialize observation normalization parameters...')
    next_obs = []
    for step in range(num_step * pre_obs_norm_step):
        actions = np.random.randint(0, output_size, size=(num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

        if len(next_obs) % (num_step * num_worker) == 0:
            next_obs = np.stack(next_obs)
            obs_rms.update(next_obs)
            next_obs = []
    print('Finished initializing observation normalization.')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_ext_values, total_int_values, total_policy, total_policy_np = \
            [], [], [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(
                np.float32(states) / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs.append(s[3, :, :].reshape([1, 84, 84]))

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)
            next_obs = np.stack(next_obs)

            # total reward = intrinsic reward + extrinsic reward
            intrinsic_reward = agent.compute_intrinsic_reward(
                ((next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(-5, 5))
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_next_obs.append(next_obs)
            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(
            np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_next_obs = np.stack(total_next_obs).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 1, 84, 84])
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. normalize intrinsic reward
        # update the running estimate of the discounted intrinsic return
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # log max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward, total_done,
                                              total_ext_values, gamma,
                                              num_step, num_worker)

        # intrinsic reward calculation
        # non-episodic (dones are all zero)
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values, int_gamma,
                                              num_step, num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        agent.train_model(
            np.float32(total_state) / 255., ext_target, int_target,
            total_action, total_adv,
            ((total_next_obs - obs_rms.mean) / np.sqrt(obs_rms.var)).clip(
                -5, 5), total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step: {}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.rnd.predictor.state_dict(), predictor_path)
            torch.save(agent.rnd.target.state_dict(), target_path)
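agent.compute_intrinsic_reward is called but never defined in these listings. In RND it is usually the per-sample prediction error between the trainable predictor and the frozen, randomly initialized target network, evaluated on the whitened next observation. A minimal sketch under that assumption, using the `rnd.predictor` / `rnd.target` submodules that the save/load code above already implies:

import numpy as np
import torch

def compute_intrinsic_reward(rnd, device, next_obs):
    # RND novelty bonus: squared error between target and predictor features.
    next_obs = torch.FloatTensor(next_obs).to(device)
    with torch.no_grad():
        target_feature = rnd.target(next_obs)
        predict_feature = rnd.predictor(next_obs)
        intrinsic_reward = (target_feature - predict_feature).pow(2).sum(dim=1) / 2
    return intrinsic_reward.cpu().numpy()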
コード例 #28
0
def main():
    print({section: dict(config[section]) for section in config.sections()})
    train_method = default_config['TrainMethod']
    env_id = default_config['EnvID']
    env_type = default_config['EnvType']

    if env_type == 'mario':
        env = BinarySpaceToDiscreteSpaceEnv(gym_super_mario_bros.make(env_id),
                                            COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    input_size = env.observation_space.shape  # 4
    output_size = env.action_space.n  # 2

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    is_load_model = False
    is_render = False
    model_path = 'models/{}.model'.format(env_id)
    icm_path = 'models/{}.icm'.format(env_id)

    writer = SummaryWriter()

    use_cuda = default_config.getboolean('UseGPU')
    use_gae = default_config.getboolean('UseGAE')
    use_noisy_net = default_config.getboolean('UseNoisyNet')

    lam = float(default_config['Lambda'])
    num_worker = 32

    num_step = 128

    ppo_eps = float(default_config['PPOEps'])
    epoch = int(default_config['Epoch'])
    mini_batch = int(default_config['MiniBatch'])
    batch_size = 256
    learning_rate = float(default_config['LearningRate'])
    entropy_coef = float(default_config['Entropy'])
    gamma = float(default_config['Gamma'])
    eta = float(default_config['ETA'])
    clip_grad_norm = float(default_config['ClipGradNorm'])

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))

    pre_obs_norm_step = int(default_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(gamma)

    agent = ICMAgent

    if default_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif default_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    else:
        raise NotImplementedError

    agent = agent(input_size,
                  output_size,
                  num_worker,
                  num_step,
                  gamma,
                  lam=lam,
                  learning_rate=learning_rate,
                  ent_coef=entropy_coef,
                  clip_grad_norm=clip_grad_norm,
                  epoch=epoch,
                  batch_size=batch_size,
                  ppo_eps=ppo_eps,
                  eta=eta,
                  use_cuda=use_cuda,
                  use_gae=use_gae,
                  use_noisy_net=use_noisy_net)

    if is_load_model:
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))

    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(env_id, is_render, idx, child_conn)
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 4, 84, 84])

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # normalize obs
    print('Start to initialize observation normalization parameters...')
    next_obs = []
    steps = 0
    while steps < pre_obs_norm_step:
        steps += num_worker
        actions = np.random.randint(0, output_size, size=(num_worker, ))

        for parent_conn, action in zip(parent_conns, actions):
            parent_conn.send(action)

        for parent_conn in parent_conns:
            s, r, d, rd, lr = parent_conn.recv()
            next_obs.append(s[3, :, :].reshape([1, 84, 84]))

    next_obs = np.stack(next_obs)
    obs_rms.update(next_obs)
    print('Finished initializing observation normalization.')

    while True:
        total_state, total_reward, total_done, total_next_state, total_action, total_int_reward, total_next_obs, total_values, total_policy = \
            [], [], [], [], [], [], [], [], []
        global_step += (num_worker * num_step)
        global_update += 1

        # Step 1. n-step rollout
        for _ in range(num_step):
            actions, value, policy = agent.get_action(
                (np.float32(states) - obs_rms.mean) / np.sqrt(obs_rms.var))

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_states, rewards, dones, real_dones, log_rewards, next_obs = [], [], [], [], [], []
            for parent_conn in parent_conns:
                s, r, d, rd, lr = parent_conn.recv()
                next_states.append(s)
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)

            next_states = np.stack(next_states)
            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            # total reward = intrinsic reward only
            intrinsic_reward = agent.compute_intrinsic_reward(
                (states - obs_rms.mean) / np.sqrt(obs_rms.var),
                (next_states - obs_rms.mean) / np.sqrt(obs_rms.var), actions)
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            total_int_reward.append(intrinsic_reward)
            total_state.append(states)
            total_next_state.append(next_states)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_values.append(value)
            total_policy.append(policy)

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall,
                                  sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall,
                                  global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value, _ = agent.get_action(
            (np.float32(states) - obs_rms.mean) / np.sqrt(obs_rms.var))
        total_values.append(value)
        # --------------------------------------------------

        total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape(
            [-1, 4, 84, 84])
        total_next_state = np.stack(total_next_state).transpose(
            [1, 0, 2, 3, 4]).reshape([-1, 4, 84, 84])
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_reward = np.stack(total_reward).transpose()
        total_done = np.stack(total_done).transpose()
        total_values = np.stack(total_values).transpose()
        total_logging_policy = np.vstack(total_policy)

        # Step 2. normalize intrinsic reward
        # update the running estimate of the discounted intrinsic return
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([
            discounted_reward.update(reward_per_step)
            for reward_per_step in total_int_reward.T
        ])
        mean, std, count = np.mean(total_reward_per_env), np.std(
            total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std**2, count)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi',
                          np.sum(total_int_reward) / num_worker,
                          sample_episode)
        writer.add_scalar('data/int_reward_per_rollout',
                          np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # log max action probability
        writer.add_scalar('data/max_prob',
                          softmax(total_logging_policy).max(1).mean(),
                          sample_episode)

        # Step 3. make target and advantage
        target, adv = make_train_data_icm(total_int_reward,
                                          np.zeros_like(total_int_reward),
                                          total_values, gamma, num_step,
                                          num_worker)

        adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
        # -----------------------------------------------

        # Step 5. Training!
        print('training')
        agent.train_model(
            (np.float32(total_state) - obs_rms.mean) / np.sqrt(obs_rms.var),
            (np.float32(total_next_state) - obs_rms.mean) /
            np.sqrt(obs_rms.var), target, total_action, adv, total_policy)

        if global_step % (num_worker * num_step * 100) == 0:
            print('Now Global Step: {}'.format(global_step))
            torch.save(agent.model.state_dict(), model_path)
            torch.save(agent.icm.state_dict(), icm_path)
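For the ICM variant, agent.compute_intrinsic_reward(state, next_state, action) is likewise not shown; curiosity is typically the forward-model error in the learned feature space, scaled by eta. A minimal sketch under that assumption (the ICM module is assumed to return real and predicted next-state features plus a predicted-action head, which is an illustrative interface rather than this project's exact one):

import torch

def compute_icm_reward(icm, eta, device, state, next_state, action):
    # Curiosity bonus: eta * || phi(s') - f(phi(s), a) ||^2 in feature space.
    state = torch.FloatTensor(state).to(device)
    next_state = torch.FloatTensor(next_state).to(device)
    action = torch.LongTensor(action).to(device)
    with torch.no_grad():
        real_next_feature, pred_next_feature, _ = icm(state, next_state, action)
        reward = eta * (real_next_feature - pred_next_feature).pow(2).mean(dim=1)
    return reward.cpu().numpy()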