def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=1,
                 h=84,
                 w=84,
                 life_done=True,
                 sticky_action=False,
                 p=0.25):
        super(GridEnvironment, self).__init__()
        self.daemon = True
        self.env = ImgObsWrapper(
            RGBImgObsWrapper(ReseedWrapper(gym.make(env_id))))
        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()
def which_env(name):
    if 'Grid' in name:
        env = ImgObsWrapper(gym.make(name))
        test_env = ImgObsWrapper(gym.make(name))
    else:
        env = make_atari(name)
        test_env = make_atari(name)
    return env, test_env, (env.observation_space.shape, env.action_space.n)
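
A hedged usage sketch for which_env above, exercising only the MiniGrid branch (assumes gym_minigrid is installed so the MiniGrid-* IDs are registered; make_atari is never reached for this name):

import gym
import gym_minigrid  # noqa: F401 -- registers the MiniGrid-* environment IDs
from gym_minigrid.wrappers import ImgObsWrapper

env, test_env, (obs_shape, n_actions) = which_env('MiniGrid-Empty-8x8-v0')
obs = env.reset()
print(obs_shape, n_actions)  # (7, 7, 3) partial view and 7 discrete actions by default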
Example #3
    def init_env(self):
        env = ImgObsWrapper(self.init())
        env.reset()
        print("agent pos: {}".format(env.agent_pos))
        self.action_space = env.action_space
        self.action_dim = env.action_space.n
        self.obs_dim = env.observation_space.shape
        return env
Example #4
    def thunk():
        env = gym.make(gym_id)
        env = ImgObsWrapper(env)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        if args.capture_video:
            if idx == 0:
                env = Monitor(env, f'videos/{experiment_name}')
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env
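
Factories like thunk above are usually handed to a vectorized environment. A minimal sketch of that pattern, assuming a MiniGrid environment and dropping the video-capture branch (gym.vector.SyncVectorEnv and gym.wrappers.RecordEpisodeStatistics are standard gym utilities):

import gym
import gym_minigrid  # noqa: F401
from gym_minigrid.wrappers import ImgObsWrapper

def make_env(gym_id, seed):
    def thunk():
        env = ImgObsWrapper(gym.make(gym_id))
        env = gym.wrappers.RecordEpisodeStatistics(env)
        env.seed(seed)
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env
    return thunk

envs = gym.vector.SyncVectorEnv([make_env('MiniGrid-Empty-8x8-v0', s) for s in range(4)])
obs = envs.reset()  # batched observations, e.g. shape (4, 7, 7, 3)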
Example #5
def main(argv):
    env = e_lib.EmptyMultigoal(size=FLAGS.size,
                               n_goals=FLAGS.n_goals,
                               n_traps=FLAGS.n_traps)
    env = e_lib.SymbolicObsWrapper(env)
    env = ImgObsWrapper(env)
    env = helx.rl.environment.from_gym(env)

    n_features = jnp.prod(env.observation_spec().shape)
    logger = helx.logging.TerminalLogger()
    agent = a_lib.SarsaLambda(env, FLAGS.alpha, FLAGS.lamda, n_features,
                              logger)

    helx.rl.experiment.run(env, agent, FLAGS.train_episodes)
    helx.rl.experiment.run(env, agent, FLAGS.eval_episodes, True)
Example #6
def inner_objective(
    ind: cgp.IndividualSingleGenome,
    network_params: dict,
    curriculum_params: dict,
    seeds
) -> float:

    rule = ind.to_torch()

    reward_per_seed = []
    reward_per_seed_mean = []
    for seed in seeds:
        seed = int(seed)

        torch.manual_seed(seed=seed)
        rng = np.random.default_rng(seed=seed)

        # environment and network initialization
        env = DynamicMiniGrid(seed=seed)
        env = ImgObsWrapper(env)
        state = env.respawn()["image"][:,:,0].flatten()

        policy_net = Network(n_inputs=np.size(state), **network_params)

        rewards_over_alterations = run_curriculum(env=env, net=policy_net, rule=rule, **curriculum_params, rng=rng)

        reward_per_seed.append(rewards_over_alterations)
        reward_per_seed_mean.append(np.mean(rewards_over_alterations))

    ind.reward_matrix = reward_per_seed
    reward_mean = np.mean(reward_per_seed_mean)

    return float(reward_mean)
Example #7
def get_env(env_name):
    env = gym.make(env_name)
    if env_name.startswith('MiniGrid'):
        env = ImgObsWrapper(env)

    # TODO include atari here or put get_env in gym_utils
    # env = make_atari('SpaceInvadersNoFrameskip-v0')
    # env = WrapFrame(env)

    return env
Example #8
def mini_grid_wrapper(env_id, max_frames=0, clip_rewards=True):
    env = gym.make(env_id)
    env = ReseedWrapper(env, seeds=[0])
    env = RGBImgObsWrapper(env)
    env = ImgObsWrapper(env)
    if max_frames:
        env = pfrl.wrappers.ContinuingTimeLimit(
            env, max_episode_steps=max_frames)
    # env = atari_wrappers.MaxAndSkipEnv(env, skip=0)
    env = atari_wrappers.wrap_deepmind(
        env, episode_life=False, clip_rewards=clip_rewards)
    return env
Example #9
    def __init__(self, name, horizon=None, gamma=0.99, history_length=4,
                 fixed_seed=None, use_pixels=False):
        """
        Constructor.

        Args:
             name (str): name of the environment;
             horizon (int, None): the horizon;
             gamma (float, 0.99): the discount factor;
             history_length (int, 4): number of frames to form a state;
             fixed_seed (int, None): if passed, it fixes the seed of the
                environment at every reset. This way, the environment is fixed
                rather than procedurally generated;
             use_pixels (bool, False): if True, MiniGrid's default 7x7x3
                observation is converted to an image of resolution 56x56x3.

        """
        # MDP creation
        self._not_pybullet = True
        self._first = True

        env = gym.make(name)
        obs_high = 10.
        if use_pixels:
            env = RGBImgPartialObsWrapper(env) # Get pixel observations
            obs_high = 255.
        env = ImgObsWrapper(env) # Get rid of the 'mission' field
        self.env = env

        self._fixed_seed = fixed_seed

        self._img_size = env.observation_space.shape[0:2]
        self._history_length = history_length

        # Get the default horizon
        if horizon is None:
            horizon = self.env.max_steps

        # MDP properties
        action_space = Discrete(self.env.action_space.n)
        observation_space = Box(
            low=0., high=obs_high, shape=(history_length, self._img_size[1], self._img_size[0]))
        self.env.max_steps = horizon + 1 # Hack to ignore gym time limit (do not use np.inf, since MiniGrid returns r(t) = 1 - 0.9t/T)
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        Environment.__init__(self, mdp_info)

        self._state = None
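
The docstring above describes the two observation modes; a standalone sketch of the same wrapper chains, outside the class (the environment name is only illustrative):

import gym
import gym_minigrid  # noqa: F401
from gym_minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper

symbolic = ImgObsWrapper(gym.make('MiniGrid-Empty-8x8-v0'))
pixels = ImgObsWrapper(RGBImgPartialObsWrapper(gym.make('MiniGrid-Empty-8x8-v0')))
print(symbolic.observation_space.shape)  # (7, 7, 3) symbolic grid
print(pixels.observation_space.shape)    # (56, 56, 3) rendered partial view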
Example #10
def calculate_validation_fitness(champion, seed, network_params, curriculum_params):

    rule = champion.to_torch()

    torch.manual_seed(seed=seed)
    rng = np.random.default_rng(seed=seed)

    # environment and network initialization
    env = DynamicMiniGrid(seed=seed)
    env = ImgObsWrapper(env)
    state = env.respawn()["image"][:, :, 0].flatten()

    policy_net = Network(n_inputs=np.size(state), **network_params)

    rewards_over_alterations = run_curriculum(env=env, net=policy_net, rule=rule, **curriculum_params, rng=rng)

    return rewards_over_alterations
Example #11
    def get_env_constructor(self, env_name):
        env_type = self.get_env_type(env_name)
        if env_type == 'mg':
            constructor = lambda: MiniGridRewardNormalize(
                ImgObsWrapper(gym.make(env_name)),
                scale=self.env_infos[env_name].reward_scale,
                shift=self.env_infos[env_name].reward_shift)
        elif env_type == 'gym':
            constructor = lambda: GymRewardNormalize(
                gym.make(env_name),
                scale=self.env_infos[env_name].reward_scale,
                shift=self.env_infos[env_name].reward_shift)
        elif env_type in ['tab', 'vcomp']:
            constructor = self.env_infos[env_name].constructor
        else:
            assert False

        return constructor
Example #12
def set_env(params):

    if params.env == 'hanoi':
        from hanoi_env.env import HanoiEnv
        params.model_type = 'rnn'
        env = HanoiEnv()
        env.set_env_parameters(max_count=params.max_count,
                               num_disks=params.num_disks,
                               num_pegs=params.num_pegs,
                               allow_impossible=params.allow_impossible,
                               continual=params.continual,
                               initial_peg=params.initial_peg)

    elif params.env == 'lightbot_minigrid':
        from gym_minigrid.envs import LightbotEnv as LightbotMinigridEnv
        from gym_minigrid.wrappers import ImgObsWrapper, AgentViewWrapper
        params.model_type = 'cnn'
        env = LightbotMinigridEnv(params.puzzle_name,
                                  reward_fn=params.rewards,
                                  max_steps=params.max_count,
                                  toggle_ontop=False)
        env = ImgObsWrapper(AgentViewWrapper(env, agent_view_size=9))

    elif params.env == 'lightbot':
        from lightbot_env.env import LightbotEnv
        params.model_type = 'mlp'
        env = LightbotEnv(params.puzzle_name)
        env.set_env_parameters(max_count=params.max_count,
                               testing=params.testing,
                               reward_fn=params.rewards,
                               random_init=params.random_init,
                               allow_impossible=params.allow_impossible)

    elif params.env == 'fourrooms':
        from fourrooms.fourrooms import Fourrooms
        params.model_type = 'mlp'
        env = Fourrooms(max_count=params.max_count)

    elif params.env == 'fourrooms_minigrid':
        params.model_type = 'cnn'
        raise NotImplementedError
    return env, params
Example #13
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    gin.parse_config_files_and_bindings(
        [os.path.join(mon_minigrid.GIN_FILES_PREFIX, 'classic_fourrooms.gin')],
        bindings=FLAGS.gin_bindings,
        skip_unknown=False)
    env_id = mon_minigrid.register_environment()
    env = gym.make(env_id)
    env = RGBImgObsWrapper(env)  # Get pixel observations
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    env.reset()

    num_frames = 0
    max_num_frames = 500

    if not tf.io.gfile.exists(FLAGS.file_path):
        tf.io.gfile.makedirs(FLAGS.file_path)

    undisc_return = 0
    while num_frames < max_num_frames:
        # Act randomly
        obs, reward, done, _ = env.step(env.action_space.sample())
        undisc_return += reward
        num_frames += 1

        # Draw environment frame just for simple visualization
        plt.imshow(obs)
        path = os.path.join(FLAGS.file_path, 'obs_{}.png'.format(num_frames))

        plt.savefig(path)
        plt.clf()

        if done:
            break

    print('Undiscounted return: %.2f' % undisc_return)
    env.close()
Example #14
    def get_env_constructor(self, env_name):
        env_type = self.get_env_type(env_name)
        if env_type == 'mg':
            constructor = lambda: MiniGridRewardNormalize(
                ImgObsWrapper(gym.make(env_name)),
                scale=self.env_infos[env_name].reward_scale,
                shift=self.env_infos[env_name].reward_shift)

        elif env_type == 'gym':
            constructor = lambda: GymRewardNormalize(
                gym.make(env_name),
                scale=self.env_infos[env_name].reward_scale,
                shift=self.env_infos[env_name].reward_shift)
        elif env_type == 'tab':
            """
            Here you should explicitly design the reward structure
            """
            constructor = self.env_infos[env_name].constructor
        else:
            assert False

        return constructor
Example #15
        self.grid.wall_rect(0, 0, width, height)

        # Place the goals
        for _ in range(self.n_goals):
            self.place_obj(Goal())

        # Place the traps
        for _ in range(self.n_traps):
            self.place_obj(Lava())

        # Place the agent
        if self.agent_start_pos is not None:
            self.agent_pos = self.agent_start_pos
            self.agent_dir = self.agent_start_dir
        else:
            self.place_agent()

        self.mission = "get to the green goal square, avoid the lava"


if __name__ == "__main__":
    # debugging
    env = EmptyMultigoal(size=5, n_goals=1, n_traps=1)
    env = SymbolicObsWrapper(env)
    env = PartialObsWrapper(env, agent_view_size=1)
    env = ImgObsWrapper(env)
    o = env.reset()
    print(o.shape)
    env.render()
    print(o)
Example #16
def main(args):
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)
    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)
    q_target = QNetwork(obs_shape, act_shape)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1), final_p=0.01)

    avg_rw = deque(maxlen=40)
    avg_len = deque(maxlen=40)

    def get_action(s, t):

        s = torch.Tensor(s[None,:])
        _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
            scheduler.update(t)
        return best_action

    def train(batch):
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)

        value = (q(s) * a).sum(dim=-1)
        next_value = r + args.gamma * (1. - d) * torch.max(q_target(s1), dim=-1)[0]
        loss = (.5 * (next_value - value) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()

    q_target.load_state_dict(q.state_dict())

    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1

        state = next_state.copy()
        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()

        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)

        if t % args.update_every == 0:
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    env = Monitor(env, directory=path)

    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break
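
In train() above, the bootstrapped target is usually detached so gradients only flow through q(s); a hedged, self-contained variant of that update step (Transition here mirrors the namedtuple the snippet's Memory presumably stores):

from collections import namedtuple
import numpy as np
import torch
import torch.nn.functional as F

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward', 'done'))

def train_step(q, q_target, opt, batch, gamma, act_shape):
    batch = Transition(*zip(*batch))
    s = torch.Tensor(np.array(batch.state))
    a = F.one_hot(torch.LongTensor(batch.action), num_classes=act_shape).float()
    r = torch.Tensor(batch.reward)
    d = torch.Tensor(batch.done)
    s1 = torch.Tensor(np.array(batch.next_state))

    value = (q(s) * a).sum(dim=-1)
    with torch.no_grad():  # do not backpropagate through the target network
        target = r + gamma * (1. - d) * q_target(s1).max(dim=-1)[0]
    loss = F.mse_loss(value, target)
    opt.zero_grad()
    loss.backward()
    opt.step()
    return loss.item()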
Example #17
    import wandb
    wandb.init(project=args.wandb_project_name,
               entity=args.wandb_entity,
               sync_tensorboard=True,
               config=vars(args),
               name=experiment_name,
               monitor_gym=True,
               save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
device = torch.device(
    'cuda' if torch.cuda.is_available() and args.cuda else 'cpu')
env = gym.make(args.gym_id)
#env = wrap_atari(env)
env = ImgObsWrapper(env)

#env = gym.wrappers.RecordEpisodeStatistics(env) # records episode reward in `info['episode']['r']`
if args.capture_video:
    env = Monitor(env, f'videos/{experiment_name}')
#env = wrap_deepmind(
#    env,
#    clip_rewards=True,
#    frame_stack=True,
#    scale=False,
#)

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic
def main(run_id=0, checkpoint=None, rec_interval=10, save_interval=100):
    print({section: dict(config[section]) for section in config.sections()})

    train_method = grid_config['TrainMethod']

    # Create environment
    env_id = grid_config['EnvID']
    env_type = grid_config['EnvType']

    if env_type == 'mario':
        print('Mario environment not fully implemented - thomaseh')
        raise NotImplementedError
        env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), COMPLEX_MOVEMENT)
    elif env_type == 'atari':
        env = gym.make(env_id)
    elif env_type == 'grid':
        env = ImgObsWrapper(RGBImgObsWrapper(gym.make(env_id))) 
    else:
        raise NotImplementedError

    input_size = env.observation_space.shape
    output_size = env.action_space.n

    if 'Breakout' in env_id:
        output_size -= 1

    env.close()

    # Load configuration parameters
    is_load_model = checkpoint is not None
    is_render = False
    model_path = 'models/{}_{}_run{}_model'.format(env_id, train_method, run_id)
    predictor_path = 'models/{}_{}_run{}_vae'.format(env_id, train_method, run_id)
   

    writer = SummaryWriter(logdir='runs/{}_{}_run{}'.format(env_id, train_method, run_id))

    use_cuda = grid_config.getboolean('UseGPU')
    use_gae = grid_config.getboolean('UseGAE')
    use_noisy_net = grid_config.getboolean('UseNoisyNet')

    lam = float(grid_config['Lambda'])
    num_worker = int(grid_config['NumEnv'])

    num_step = int(grid_config['NumStep'])
    num_rollouts = int(grid_config['NumRollouts'])
    num_pretrain_rollouts = int(grid_config['NumPretrainRollouts'])

    ppo_eps = float(grid_config['PPOEps'])
    epoch = int(grid_config['Epoch'])
    mini_batch = int(grid_config['MiniBatch'])
    batch_size = int(num_step * num_worker / mini_batch)
    learning_rate = float(grid_config['LearningRate'])
    entropy_coef = float(grid_config['Entropy'])
    gamma = float(grid_config['Gamma'])
    int_gamma = float(grid_config['IntGamma'])
    clip_grad_norm = float(grid_config['ClipGradNorm'])
    ext_coef = float(grid_config['ExtCoef'])
    int_coef = float(grid_config['IntCoef'])

    sticky_action = grid_config.getboolean('StickyAction')
    action_prob = float(grid_config['ActionProb'])
    life_done = grid_config.getboolean('LifeDone')

    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd(shape=(1, 1, 84, 84))
    pre_obs_norm_step = int(grid_config['ObsNormStep'])
    discounted_reward = RewardForwardFilter(int_gamma)

    hidden_dim = int(grid_config['HiddenDim'])

    if train_method == 'RND':
        agent = RNDAgent
    elif train_method == 'generative':
        agent = GenerativeAgent
    else:
        raise NotImplementedError

    if grid_config['EnvType'] == 'atari':
        env_type = AtariEnvironment
    elif grid_config['EnvType'] == 'mario':
        env_type = MarioEnvironment
    elif grid_config['EnvType'] == 'grid':
        env_type = GridEnvironment 
    else:
        raise NotImplementedError

    # Initialize agent
    agent = agent(
        input_size,
        output_size,
        num_worker,
        num_step,
        gamma,
        history_size=1,
        lam=lam,
        learning_rate=learning_rate,
        ent_coef=entropy_coef,
        clip_grad_norm=clip_grad_norm,
        epoch=epoch,
        batch_size=batch_size,
        ppo_eps=ppo_eps,
        use_cuda=use_cuda,
        use_gae=use_gae,
        use_noisy_net=use_noisy_net,
        update_proportion=1.0,
        hidden_dim=hidden_dim
    )

    # Load pre-existing model
    if is_load_model:
        print('load model...')
        if use_cuda:
            agent.model.load_state_dict(torch.load(model_path))
            agent.vae.load_state_dict(torch.load(predictor_path))
        else:
            agent.model.load_state_dict(
                torch.load(model_path, map_location='cpu'))
            agent.vae.load_state_dict(torch.load(predictor_path, map_location='cpu'))
        print('load finished!')

    # Create workers to run in environments
    works = []
    parent_conns = []
    child_conns = []
    for idx in range(num_worker):
        parent_conn, child_conn = Pipe()
        work = env_type(
            env_id, is_render, idx, child_conn, sticky_action=sticky_action,
            p=action_prob, life_done=life_done,
        )
        work.start()
        works.append(work)
        parent_conns.append(parent_conn)
        child_conns.append(child_conn)

    states = np.zeros([num_worker, 1, 84, 84], dtype='float32')

    sample_episode = 0
    sample_rall = 0
    sample_step = 0
    sample_env_idx = 0
    sample_i_rall = 0
    global_update = 0
    global_step = 0

    # Initialize stats dict
    stats = {
        'total_reward': [],
        'ep_length': [],
        'num_updates': [],
        'frames_seen': [],
    }

    # Main training loop
    while True:
        total_state = np.zeros([num_worker * num_step, 1, 84, 84], dtype='float32')
        total_next_obs = np.zeros([num_worker * num_step, 1, 84, 84])
        total_reward, total_done, total_next_state, total_action, \
            total_int_reward, total_ext_values, total_int_values, total_policy, \
            total_policy_np = [], [], [], [], [], [], [], [], []

        # Step 1. n-step rollout (collect data)
        for step in range(num_step):
            actions, value_ext, value_int, policy = agent.get_action(states / 255.)

            for parent_conn, action in zip(parent_conns, actions):
                parent_conn.send(action)

            next_obs = np.zeros([num_worker, 1, 84, 84])
            next_states = np.zeros([num_worker, 1, 84, 84])
            rewards, dones, real_dones, log_rewards = [], [], [], []
            for idx, parent_conn in enumerate(parent_conns):
                s, r, d, rd, lr, stat = parent_conn.recv()
                next_states[idx] = s
                rewards.append(r)
                dones.append(d)
                real_dones.append(rd)
                log_rewards.append(lr)
                next_obs[idx, 0] = s[0, :, :]
                total_next_obs[idx * num_step + step, 0] = s[0, :, :]

                if rd:
                    stats['total_reward'].append(stat[0])
                    stats['ep_length'].append(stat[1])
                    stats['num_updates'].append(global_update)
                    stats['frames_seen'].append(global_step)

            rewards = np.hstack(rewards)
            dones = np.hstack(dones)
            real_dones = np.hstack(real_dones)

            # Compute total reward = intrinsic reward + external reward
            intrinsic_reward = agent.compute_intrinsic_reward(next_obs / 255.)
            intrinsic_reward = np.hstack(intrinsic_reward)
            sample_i_rall += intrinsic_reward[sample_env_idx]

            for idx, state in enumerate(states):
                total_state[idx * num_step + step] = state
            total_int_reward.append(intrinsic_reward)
            total_reward.append(rewards)
            total_done.append(dones)
            total_action.append(actions)
            total_ext_values.append(value_ext)
            total_int_values.append(value_int)
            total_policy.append(policy)
            total_policy_np.append(policy.cpu().numpy())

            states = next_states[:, :, :, :]

            sample_rall += log_rewards[sample_env_idx]

            sample_step += 1
            if real_dones[sample_env_idx]:
                sample_episode += 1
                writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
                writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
                writer.add_scalar('data/step', sample_step, sample_episode)
                sample_rall = 0
                sample_step = 0
                sample_i_rall = 0

        # calculate last next value
        _, value_ext, value_int, _ = agent.get_action(np.float32(states) / 255.)
        total_ext_values.append(value_ext)
        total_int_values.append(value_int)
        # --------------------------------------------------

        total_reward = np.stack(total_reward).transpose().clip(-1, 1)
        total_action = np.stack(total_action).transpose().reshape([-1])
        total_done = np.stack(total_done).transpose()
        total_ext_values = np.stack(total_ext_values).transpose()
        total_int_values = np.stack(total_int_values).transpose()
        total_logging_policy = np.vstack(total_policy_np)

        # Step 2. calculate intrinsic reward
        # running mean intrinsic reward
        total_int_reward = np.stack(total_int_reward).transpose()
        total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                         total_int_reward.T])
        mean, std, count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
        reward_rms.update_from_moments(mean, std ** 2, count)

        writer.add_scalar('data/raw_int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/raw_int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)

        # normalize intrinsic reward
        total_int_reward /= np.sqrt(reward_rms.var)
        writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
        writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
        # -------------------------------------------------------------------------------------------

        # logging Max action probability
        writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

        # Step 3. make target and advantage
        # extrinsic reward calculation
        ext_target, ext_adv = make_train_data(total_reward,
                                              total_done,
                                              total_ext_values,
                                              gamma,
                                              num_step,
                                              num_worker)

        # intrinsic reward calculation
        # non-episodic
        int_target, int_adv = make_train_data(total_int_reward,
                                              np.zeros_like(total_int_reward),
                                              total_int_values,
                                              int_gamma,
                                              num_step,
                                              num_worker)

        # add ext adv and int adv
        total_adv = int_adv * int_coef + ext_adv * ext_coef
        # -----------------------------------------------

        # Step 4. update obs normalize param
        # obs_rms.update(total_next_obs)
        # -----------------------------------------------

        # Step 5. Training!
        # random_obs_choice = np.random.randint(total_next_obs.shape[0])
        # random_obs = total_next_obs[random_obs_choice].copy()
        total_next_obs /= 255.
        if global_update < num_pretrain_rollouts:
            recon_losses, kld_losses = agent.train_just_vae(total_state / 255., total_next_obs)
        else:
            recon_losses, kld_losses = agent.train_model(total_state / 255., ext_target, int_target, total_action,
                        total_adv, total_next_obs, total_policy)

        writer.add_scalar('data/reconstruction_loss_per_rollout', np.mean(recon_losses), global_update)
        writer.add_scalar('data/kld_loss_per_rollout', np.mean(kld_losses), global_update)

        global_step += (num_worker * num_step)
        
        if global_update % rec_interval == 0:
            with torch.no_grad():
                # random_obs_norm = total_next_obs[random_obs_choice]
                # reconstructed_state = agent.reconstruct(random_obs_norm)

                # random_obs_norm = (random_obs_norm - random_obs_norm.min()) / (random_obs_norm.max() - random_obs_norm.min())
                # reconstructed_state = (reconstructed_state - reconstructed_state.min()) / (reconstructed_state.max() - reconstructed_state.min())

                # writer.add_image('Original', random_obs, global_update)
                # writer.add_image('Original Normalized', random_obs_norm, global_update)

                random_state = total_next_obs[np.random.randint(total_next_obs.shape[0])]
                reconstructed_state = agent.reconstruct(random_state)

                writer.add_image('Original', random_state, global_update)
                writer.add_image('Reconstructed', reconstructed_state, global_update)

        if global_update % save_interval == 0:
            print('Saving model at global step={}, num rollouts={}.'.format(
                global_step, global_update))
            torch.save(agent.model.state_dict(), model_path + "_{}.pt".format(global_update))
            torch.save(agent.vae.state_dict(), predictor_path + '_{}.pt'.format(global_update))

            # Save stats to pickle file
            with open('models/{}_{}_run{}_stats_{}.pkl'.format(env_id, train_method, run_id, global_update),'wb') as f:
                pickle.dump(stats, f)

        global_update += 1

        if global_update == num_rollouts + num_pretrain_rollouts:
            print('Finished Training.')
            break
class GridEnvironment(Environment):
    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=1,
                 h=84,
                 w=84,
                 life_done=True,
                 sticky_action=False,
                 p=0.25):
        super(GridEnvironment, self).__init__()
        self.daemon = True
        self.env = ImgObsWrapper(
            RGBImgObsWrapper(ReseedWrapper(gym.make(env_id))))
        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(GridEnvironment, self).run()
        while True:
            action = self.child_conn.recv()

            # sticky action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            s, reward, done, info = self.env.step(action)

            if max_step_per_episode < self.steps:
                done = True

            log_reward = reward
            force_done = done

            self.history[0, :, :] = self.pre_proc(s)

            self.rall += reward
            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {}  Reward: {}  Recent Reward: {}  Visited Room: [{}]"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist),
                            info.get('episode', {}).get('visited_rooms', {})))

                self.history = self.reset()

            self.child_conn.send([
                self.history[:, :, :], reward, force_done, done, log_reward,
                [self.rall, self.steps]
            ])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        self.get_init_state(self.pre_proc(s))
        return self.history[:, :, :]

    def pre_proc(self, X):
        X = np.array(Image.fromarray(X).convert('L')).astype('float32')
        x = cv2.resize(X, (self.h, self.w))
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
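
The "Step 2" block in main() above normalizes intrinsic rewards by the running standard deviation of a discounted intrinsic-return estimate. A minimal self-contained sketch of that idea, with simplified stand-ins for RewardForwardFilter and RunningMeanStd (the real implementations live in the repository's utilities and may differ):

import numpy as np

class RewardForwardFilter:
    """Per-environment discounted running sum of intrinsic rewards."""
    def __init__(self, gamma):
        self.gamma = gamma
        self.rewems = None
    def update(self, rews):
        self.rewems = rews if self.rewems is None else self.rewems * self.gamma + rews
        return self.rewems

class RunningMeanStd:
    """Streaming mean/variance via parallel moment merging."""
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean, self.var, self.count = np.zeros(shape), np.ones(shape), epsilon
    def update_from_moments(self, batch_mean, batch_var, batch_count):
        delta = batch_mean - self.mean
        tot = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / tot)
        self.mean, self.var, self.count = new_mean, m2 / tot, tot

# toy usage on a fake (num_step, num_worker) block of intrinsic rewards
rff, rms = RewardForwardFilter(gamma=0.99), RunningMeanStd()
total_int_reward = np.random.rand(128, 16)
returns = np.array([rff.update(step_rewards) for step_rewards in total_int_reward])
rms.update_from_moments(returns.mean(), returns.var(), returns.size)
normalized = total_int_reward / np.sqrt(rms.var)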
Example #20
def BobEnv(size):
    return ImgObsWrapper(RGBImgPartialObsWrapper(_BobEnv(size)))
Example #21
def _wrap_minigrid_env(env):
    from gym_minigrid.wrappers import ImgObsWrapper
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    env = bench.Monitor(env, logger.get_dir())
    return env
Example #22
def callback(_locals, _globals):
    n_steps = _locals['_']
    if n_steps and (n_steps % 1000 == 0):
        print(n_steps)
        print(_locals['episode_successes'])
        # env.render()
        # time.sleep(0.2)

    n_steps += 1
    # Returning False will stop training early
    return True


# Create log dir
log_dir = f"{EXPERIMENT_DIR}/sb/gym"
os.makedirs(log_dir, exist_ok=True)

# Create environment
env_name = 'MiniGrid-FourRooms-v1'
env = FullyObsWrapper(ImgObsWrapper(gym.make(env_name)))
env.max_steps = 100000
# env.step = partial(stochastic_step, env)
env = DummyVecEnv([lambda: env])

# Train a model
model = DQN(policy=MlpPolicy,
            env=env,
            tensorboard_log=f"{EXPERIMENT_DIR}/sb/tensorboard/{env_name}")
model.learn(total_timesteps=10000000, callback=callback)
Example #23
def wrap_env(env, opt):
    env = ImgObsWrapper(env)
    env = FrameStack(env, k=opt.er.hist_len)
    env = TorchWrapper(env, device=opt.device)
    env = SeedWrapper(env, opt.seed) if opt.seed is not None else env
    return env
Example #24
              'max_timesteps': 500,
              'action_mode': 'discrete'}

    # create env
    ENV = gym.make('BallBeamThrow-v0', **kwargs)
    BD_BOUNDS = [[0, 3]]
    INITIAL_GENOTYPE_SIZE = 11
    MINI = False
    EVALUATE_INDIVIDUAL = evaluate_beam
    BD_GENOTYPE = 1

if ENV_NAME == 'grid':
    import gym_minigrid  # must still be imported
    from gym_minigrid.wrappers import ImgObsWrapper  # must still be imported
    # create env
    ENV = ImgObsWrapper(gym.make('MiniGrid-Empty-8x8-v0'))
    BD_BOUNDS = [[0, 7], [0, 7]]
    NB_CELLS = 64
    INITIAL_GENOTYPE_SIZE = 11
    MINI = False
    EVALUATE_INDIVIDUAL = evaluate_grid
    BD_GENOTYPE = 1

if ENV_NAME == 'bipedal':
    # global variable for the environment
    ENV = gym.make('BipedalWalker-v3')
    BD_BOUNDS = [[-1, 1], [0, 1]]
    INITIAL_GENOTYPE_SIZE = 118
    MINI = False
    EVALUATE_INDIVIDUAL = evaluate_bipedal
    BD_GENOTYPE = 1
            if reward_mean > 500:
                break

    def play(self, num_episodes, render=True):
        """Test the trained agent.
        """
        for episode in range(num_episodes):
            state = self.env.reset()
            total_reward = 0.0
            while True:
                if render:
                    self.env.render()
                action = self.get_action(state)
                state, reward, done, _ = self.env.step(action)
                total_reward += reward
                if done:
                    print(
                        f"Total reward: {total_reward} in episode {episode + 1}"
                    )
                    break


if __name__ == "__main__":
    env = gym.make("MiniGrid-Empty-8x8-v0")
    env = RGBImgPartialObsWrapper(env)  # Get pixel observations
    env = ImgObsWrapper(env)  # Get rid of the 'mission' field
    agent = Agent(env)
    print("Number of actions: ", agent.actions)
    agent.train(percentile=99.9, num_iterations=64, num_episodes=128)
    agent.play(num_episodes=3)