Code example #1
def experiment_factory(opt, only_env=False):
    env = gym_wrapper.GymFromDMEnv(bsuite.load_from_id(opt.env.name))
    env = TorchWrapper(env, opt.device)
    if only_env:
        return env

    replay = ExperienceReplay(**opt.replay)
    layers = [
        reduce(lambda x, y: x * y, env.observation_space.shape),  # input
        *opt.estimator["layers"],  # hidden
        env.action_space.n,  # output
    ]
    estimator = MLP(layers, spectral=opt.spectral, **opt.estimator)
    estimator.to(opt.device)

    optimizer = getattr(torch.optim, opt.optim.name)(
        estimator.parameters(), **opt.optim.kwargs
    )
    policy_improvement = C51PolicyImprovement(
        estimator, opt.epsilon, env.action_space.n
    )
    policy_evaluation = C51PolicyEvaluation(estimator, optimizer, opt.gamma)
    rlog.info(replay)
    rlog.info(estimator)
    return env, (replay, policy_improvement, policy_evaluation)
Code example #2
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_from_id(FLAGS.bsuite_id)
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env

    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment)
    # Combine with demonstration dataset.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1,
                                   additional_discount=1.)

    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [environment_spec.observations])

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors_tf2.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluation_network,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(network=policy_network,
                                 learning_rate=FLAGS.learning_rate,
                                 dataset=dataset,
                                 counter=learner_counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
Code example #3
 def __init__(self, name):
     super().__init__()
     self.env = bsuite.load_from_id(name)
     self.cache = None
     self.reset()
     print('Env', name, 'has observation spec', self.env.observation_spec())
     print('Env', name, 'has action spec', self.env.action_spec())
Code example #4
    def __init__(self, env_id):
        self.id = env_id
        self.env = bsuite.load_from_id(env_id)
        self.action_space = ActionSpace(np.random.RandomState(0),
                                        self.env.action_spec())

        shape = self.env.observation_spec().shape
        if shape[0] < 2:
            self.observation_space = np.zeros(shape=shape[1:])
        else:
            self.observation_space = np.zeros(shape=shape)
Code example #5
File: run_dqn.py  Project: yyht/acme
def main(_):
    # Create an environment and grab the spec.
    environment = bsuite.load_from_id('catch/0')
    environment = wrappers.SinglePrecisionWrapper(environment)
    environment_spec = specs.make_environment_spec(environment)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values])
    ])

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec, network=network)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
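The loop above discards its scores once it finishes. A minimal variation, sketched below, records every episode to CSV via bsuite's load_and_record_to_csv helper so the bsuite analysis notebooks can read the results back; the '/tmp/bsuite' results directory and the acme import paths are assumptions, not part of the original example.

import acme
import bsuite
import sonnet as snt
from acme import specs
from acme import wrappers
from acme.agents.tf import dqn  # import path as in recent acme releases; may differ

# Record every episode to CSV under results_dir (hypothetical path) so the
# bsuite analysis tools can read the scores back later.
environment = bsuite.load_and_record_to_csv('catch/0', results_dir='/tmp/bsuite')
environment = wrappers.SinglePrecisionWrapper(environment)
environment_spec = specs.make_environment_spec(environment)

network = snt.Sequential([
    snt.Flatten(),
    snt.nets.MLP([50, 50, environment_spec.actions.num_values]),
])

agent = dqn.DQN(environment_spec=environment_spec, network=network)
acme.EnvironmentLoop(environment, agent).run(
    num_episodes=environment.bsuite_num_episodes)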
Code example #6
def main(_):
  # Create an environment and grab the spec.
  raw_environment = bsuite.load_from_id(FLAGS.bsuite_id)
  environment = wrappers.SinglePrecisionWrapper(raw_environment)
  environment_spec = specs.make_environment_spec(environment)

  # Construct the agent.
  agent = dqfd.DQfD(
      environment_spec=environment_spec,
      network=make_network(environment_spec.actions),
      demonstration_dataset=bsuite_demonstrations.make_dataset(raw_environment),
      demonstration_ratio=FLAGS.demonstration_ratio,
      samples_per_insert=FLAGS.samples_per_insert,
      learning_rate=FLAGS.learning_rate)

  # Run the environment loop.
  loop = acme.EnvironmentLoop(environment, agent)
  loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
Code example #7
 def __init__(self,
              id: str,
              exp_kwargs: dict = None,
              external_logging: str = 'none',
              save_path: str = '',
              overwrite: bool = True):
     assert (id in VALID_ENV_SWEEP_IDS) or (
         id in VALID_ENV_IDS and exp_kwargs is not None
     )  # Either using one of presets or using base experiment with other settings
     aug_path = osp.join(LOG_DIR, save_path)  # LOG_DIR + save_path
     if id in VALID_ENV_SWEEP_IDS:  # Pre-parameterized experiments
         if external_logging == 'none':
             env = bsuite.load_from_id(id)  # No recording
         else:
             env = bsuite.load_and_record(
                 id, aug_path, external_logging, overwrite=overwrite
             )  # Record in sql or csv. same sql for each id
         self.num_episodes = env.bsuite_num_episodes
     else:
         noise_scale = exp_kwargs.pop('noise_scale', 0.)
         noise_scale_seed = exp_kwargs.pop('noise_scale_seed', 0.)
         reward_scale = exp_kwargs.pop('reward_scale', 0.)
         env = bsuite.load(id, **exp_kwargs)
         if noise_scale:
             env = RewardNoise(env, noise_scale, noise_scale_seed)
         if reward_scale: env = RewardScale(env, reward_scale)
         self.num_episodes = 1e4  # Default
     self.env = env
     self._action_space = IntBox(low=0,
                                 high=self.env.action_spec().num_values)
     o_spec = self.env.observation_spec()
     if isinstance(o_spec, specs.BoundedArray):
         self._observation_space = FloatBox(low=o_spec.minimum.item(),
                                            high=o_spec.maximum.item(),
                                            shape=o_spec.shape,
                                            dtype=o_spec.dtype)
     else:
         self._observation_space = FloatBox(low=-float('inf'),
                                            high=float('inf'),
                                            shape=o_spec.shape,
                                            dtype=o_spec.dtype)
     self._last_observation = None
      self.game_over = False
     self.viewer = None
Code example #8
def load_offline_bsuite_dataset(
    bsuite_id: str,
    random_prob: float,
    path: str,
    batch_size: int,
    valid_batch_size: int,
    num_shards: int = 1,
    num_valid_shards: int = 1,
    num_threads: int = 1,
    single_precision_wrapper: bool = True,
    shuffle_buffer_size: int = 100000,
    shuffle: bool = True,
    repeat: bool = True
) -> Tuple[tf.data.Dataset, tf.data.Dataset, dm_env.Environment]:
    """Load bsuite offline dataset."""
    # Data file path format: {path}-?????-of-{num_shards:05d}
    # The dataset is not deterministic and not repeated if shuffle = False.
    environment = bsuite.load_from_id(bsuite_id)
    if single_precision_wrapper:
        environment = single_precision.SinglePrecisionWrapper(environment)
    if random_prob > 0.:
        environment = RandomActionWrapper(environment, random_prob)
    params = bsuite_offline_dataset.dataset_params(environment)
    if os.path.basename(path):
        path += '_'
    train_path = path + 'train'
    train_dataset = bsuite_offline_dataset.dataset(
        path=train_path,
        num_threads=num_threads,
        batch_size=batch_size,
        num_shards=num_shards,
        shuffle_buffer_size=shuffle_buffer_size,
        shuffle=shuffle,
        repeat=repeat,
        **params)
    valid_path = path + 'valid'
    valid_dataset = bsuite_offline_dataset.dataset(path=valid_path,
                                                   num_threads=num_threads,
                                                   batch_size=valid_batch_size,
                                                   num_shards=num_valid_shards,
                                                   shuffle=False,
                                                   repeat=False,
                                                   **params)
    return train_dataset, valid_dataset, environment
Code example #9
def make_env_and_model(
        bsuite_id: str, results_dir: str,
        overwrite: bool) -> Tuple[dm_env.Environment, models.Model]:
    """Create environment and corresponding model (learned or simulator)."""
    raw_env = bsuite.load_from_id(bsuite_id)
    if FLAGS.simulator:
        model = simulator.Simulator(raw_env)  # pytype: disable=attribute-error
    else:
        model = mlp.MLPModel(
            specs.make_environment_spec(raw_env),
            replay_capacity=1000,
            batch_size=16,
            hidden_sizes=(50, ),
        )
    environment = csv_logging.wrap_environment(raw_env, bsuite_id, results_dir,
                                               overwrite)
    environment = wrappers.SinglePrecisionWrapper(environment)

    return environment, model
Code example #10
        def _thunk():
            random_seed(seed)
            if env_id.startswith('bsuite'):
                id = env_id.split('bsuite-')[1]
                self.video_enabled = False
                bsuite_env = bsuite.load_from_id(id)
                env = gym_wrapper.GymFromDMEnv(bsuite_env)

            elif env_id.startswith("dm"):
                import dm_control2gym
                _, domain, task = env_id.split('-')
                env = dm_control2gym.make(domain_name=domain, task_name=task)

            else:
                # Fall back to a plain Gym environment; special_args is only
                # used to pass the chain length to NChain-style tasks.
                if special_args is not None and 'NChain' in special_args[0]:
                    print('starting chain N =', special_args[1])
                    env = gym.make(env_id, n=special_args[1])
                else:
                    env = gym.make(env_id)

            if self.video_enabled:
                env = Monitor(env,
                              self.log_dir,
                              video_callable=self.video_callable)

            is_atari = hasattr(gym.envs, 'atari') and isinstance(
                env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
            if is_atari:
                env = make_atari(env_id)
            env.seed(seed + rank)
            env = OriginalReturnWrapper(env)
            if is_atari:
                env = wrap_deepmind(env,
                                    episode_life=episode_life,
                                    clip_rewards=False,
                                    frame_stack=False,
                                    scale=False)
                obs_shape = env.observation_space.shape
                if len(obs_shape) == 3:
                    env = TransposeImage(env)
                env = FrameStack(env, 4)
            return env
Code example #11
import bsuite

from bsuite import sweep

# Valid Ids across all experiments:
print('All possible values for bsuite_id:')
print(sweep.SWEEP)

# Ids for an example experiment:
print('List bsuite_id for "bandit_noise" experiment:')
print(sweep.BANDIT_NOISE)

# List the configurations for the given experiment
for bsuite_id in sweep.BANDIT_NOISE:
    env = bsuite.load_from_id(bsuite_id)
    print('bsuite_id={}, settings={}, num_episodes={}'.format(
        bsuite_id, sweep.SETTINGS[bsuite_id], env.bsuite_num_episodes))
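Every environment returned by bsuite.load_from_id follows the dm_env interface, so running its full recommended episode budget only needs a reset/step loop. The sketch below uses uniformly random actions purely for illustration; the random agent is an assumption, not part of the sweep listing above.

import numpy as np
import bsuite

# Random-action baseline over one bsuite environment (illustrative only).
env = bsuite.load_from_id('bandit_noise/0')
rng = np.random.RandomState(0)
num_actions = env.action_spec().num_values

for _ in range(env.bsuite_num_episodes):
    timestep = env.reset()
    while not timestep.last():
        action = rng.randint(num_actions)
        timestep = env.step(action)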
Code example #12
def load_env(env_id):
    env = bsuite.load_from_id(env_id)
    env = gym_wrapper.GymFromDMEnv(env)
    return env
Code example #13
#
# Gym Wrapper (Incomplete)
#
import bsuite

from bsuite import sweep
import gym

from bsuite.utils import gym_wrapper
raw_env = bsuite.load_from_id(bsuite_id='memory_len/0')
env = gym_wrapper.GymFromDMEnv(raw_env)
isinstance(env, gym.Env)
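Once wrapped, the environment is driven through the familiar Gym API. The sketch below, with random actions for illustration, assumes the classic four-tuple step() signature exposed by GymFromDMEnv:

# Illustrative episode with the Gym-style interface (classic 4-tuple step API).
obs = env.reset()
done = False
episode_return = 0.0
while not done:
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    episode_return += reward
print('episode return:', episode_return)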


Code example #14
import torch
import os
from base_dqns import Agent
import numpy as np
import bsuite
from utils import save_results



if __name__ == '__main__':
    path = 'saves/'
    env = bsuite.load_from_id('deep_sea/0')
    num_actions = env.action_spec().num_values

    agent = Agent(gamma=0.99, eps=1.0, lr=(0.0002 * 0.0001), input_dims=100,
                  output_dims=2, batch_size=128, n_actions=2,
                  max_mem_size=100000, eps_end=0.01, eps_dec=1e-4,
                  langevin=True)

    scores = []
    avg_scores = []
    eps_history = []
    episodes = 10000
    try:
        for i in range(episodes):
            score = 0
            eps_history.append(agent.eps)
            timestep = env.reset()
            while not timestep.last():
                observation = timestep.observation
                observation = np.reshape(observation, (-1))
                action = agent.choose_action(observation)
                timestep_ = env.step(action)