Code Example #1
File: runner.py Project: nimishsantosh107/bsuite
    def __init__(self,
                 env_id,
                 agent,
                 verbose=True,
                 log_interval=100,
                 eval=False):
        '''
        PARAMETERS:
        'env_id'       - Environment ID, e.g. environments.CARTPOLE
        'agent'        - Instance of an Agent class with the necessary methods implemented
        'verbose'      - True: prints logs, False: doesn't print logs
        'log_interval' - Number of episodes between log prints
        'eval'         - True: use the PRIVATE_RESULTS_DIR env var as the results dir, False: use RESULTS_DIR
        '''
        self.agent = agent
        self.env_id = env_id
        self.verbose = verbose
        self.log_interval = log_interval

        if eval:
            results_dir = os.environ.get('PRIVATE_RESULTS_DIR')
        else:
            results_dir = os.environ.get('RESULTS_DIR')

        env = bsuite.load_and_record_to_csv(env_id,
                                            results_dir=results_dir,
                                            overwrite=True)
        self.env = gym_wrapper.GymFromDMEnv(env)
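The docstring above spells out each constructor argument; a minimal usage sketch, assuming the enclosing class is named Runner and that MyAgent is some Agent implementation (neither name appears in the snippet), might look like:

# Hypothetical usage sketch; Runner, MyAgent and the 'cartpole/0' id are assumed here.
agent = MyAgent()
runner = Runner(env_id='cartpole/0',   # or a constant such as environments.CARTPOLE
                agent=agent,
                verbose=True,
                log_interval=50,
                eval=False)            # False -> results recorded under RESULTS_DIR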
Code Example #2
def make_bsuite_environment(bsuite_id: str = 'deep_sea/0',
                            results_dir: str = '/tmp/bsuite',
                            overwrite: bool = False) -> dm_env.Environment:
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=bsuite_id,
        results_dir=results_dir,
        overwrite=overwrite,
    )
    return wrappers.SinglePrecisionWrapper(raw_environment)
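A call site for this factory might look like the following sketch; the id and directory simply reuse the defaults from the signature, and specs refers to acme's specs module as used in the examples further down:

# Illustrative call; any bsuite id from the sweep can be substituted.
environment = make_bsuite_environment(bsuite_id='deep_sea/0',
                                      results_dir='/tmp/bsuite',
                                      overwrite=True)
environment_spec = specs.make_environment_spec(environment)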
Code Example #3
def get_env(*args, **kwargs):
    # bsuite_id, results_dir and gym_id are expected to be defined in the enclosing scope.
    if not gym_id:
        # Load a bsuite environment, record results to CSV, and expose it through Gym.
        env = gym_wrapper.GymFromDMEnv(
            bsuite.load_and_record_to_csv(
                bsuite_id=bsuite_id,
                results_dir=results_dir,
                overwrite=True,
            ))
    else:
        env = gym.make(gym_id)

    # Stack the last 4 observations and flatten the stack into a single vector.
    env = FrameStack(env=env, num_stack=4)
    env = TransformObservation(
        env=env,
        f=lambda lazy_frames: np.reshape(np.stack(lazy_frames._frames), -1))
    return GymEnvWrapper(env)
Code Example #4
def run_random():
    for env_name in sweep.SWEEP:  #  Or for a specific suite: sweep.DEEP_SEA
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=RANDOM_RESULTS_PATH,
                                               overwrite=True)

        #  Instantiate the agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Runner(env)
        policy = ch.models.RandomPolicy(env)

        #  Generate the results
        print('Running', env_name)
        env.run(policy, episodes=env.bsuite_num_episodes)
Code Example #5
File: run_dqn.py Project: deepmind/acme
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    network = snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, environment_spec.actions.num_values])
    ])

    # Construct the agent.
    agent = dqn.DQN(environment_spec=environment_spec, network=network)

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
Code Example #6
def main(_):
  # Create an environment and grab the spec.
  raw_environment = bsuite.load_and_record_to_csv(
      bsuite_id=FLAGS.bsuite_id,
      results_dir=FLAGS.results_dir,
      overwrite=FLAGS.overwrite,
  )
  environment = wrappers.SinglePrecisionWrapper(raw_environment)
  environment_spec = specs.make_environment_spec(environment)

  # Construct the agent.
  agent = dqfd.DQfD(
      environment_spec=environment_spec,
      network=make_network(environment_spec.actions),
      demonstration_dataset=bsuite_demonstrations.make_dataset(raw_environment),
      demonstration_ratio=FLAGS.demonstration_ratio,
      samples_per_insert=FLAGS.samples_per_insert,
      learning_rate=FLAGS.learning_rate)

  # Run the environment loop.
  loop = acme.EnvironmentLoop(environment, agent)
  loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
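make_network is referenced here but not defined in the snippet; a plausible sketch, modeled on the MLP from Code Example #5 (the layer sizes are an assumption), would be:

# Hypothetical helper, not part of the original snippet; mirrors Code Example #5.
def make_network(action_spec: specs.DiscreteArray) -> snt.Module:
    return snt.Sequential([
        snt.Flatten(),
        snt.nets.MLP([50, 50, action_spec.num_values]),
    ])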
Code Example #7
def run_trpo():
    ch.debug.debug()
    for i, env_name in enumerate(sweep.SWEEP):
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=TRPO_RESULTS_PATH,
                                               overwrite=True)

        #  Instantiate the env and agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Torch(env)
        env = ch.envs.Runner(env)
        policy = Policy(env)
        baseline = LinearValue(env.state_size)

        #  Generate the results
        replay = ch.ExperienceReplay()
        for episode in tqdm(range(1, 1 + env.bsuite_num_episodes),
                            desc=env_name):
            replay += env.run(policy, episodes=1)
            if episode % 10 == 0:
                trpo_update(replay, policy, baseline)
                replay.empty()
Code Example #8
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = wrappers.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Create the networks to optimize.
    network = make_network(environment_spec.actions)

    agent = impala.IMPALA(
        environment_spec=environment_spec,
        network=network,
        sequence_length=3,
        sequence_period=3,
    )

    # Run the environment loop.
    loop = acme.EnvironmentLoop(environment, agent)
    loop.run(num_episodes=environment.bsuite_num_episodes)  # pytype: disable=attribute-error
Code Example #9
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env

    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment,
                                                       stochastic=False)
    # Combine with demonstration dataset.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1,
                                   additional_discount=1.)

    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    evaluator_network = snt.Sequential([
        policy_network,
        lambda q: trfl.epsilon_greedy(q, epsilon=FLAGS.epsilon).sample(),
    ])

    # Ensure that we create the variables before proceeding (maybe not needed).
    tf2_utils.create_variables(policy_network, [environment_spec.observations])

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # Create the actor which defines how we take actions.
    evaluation_network = actors.FeedForwardActor(evaluator_network)

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluation_network,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(network=policy_network,
                                 learning_rate=FLAGS.learning_rate,
                                 dataset=dataset,
                                 counter=learner_counter)

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)
Code Example #10
    plot_ext = "." + plot_format

    #envs = sweep.BANDIT
    envs = ["bandit/0"]

    for bsuite_id in envs:
        b_env = 'bandit'
        env_plot_path = Path(plot_dir + bsuite_id.replace("/", "-") + "/")
        env_plot_path.mkdir(parents=True, exist_ok=True)
        env_plot_path = str(env_plot_path.resolve())

        args = get_args()

        # Initialize the environment
        bsuite_env = load_and_record_to_csv(bsuite_id,
                                            results_dir=csv_dir,
                                            overwrite=True)
        gym_env = gym_wrapper.GymFromDMEnv(bsuite_env)
        env = GymEnv(gym_env)
        env_builder = lambda: env

        algo = setup_test(args, env)

        off_policy_trainer = OffPolicyTrainer()
        off_policy_trainer.train(args, env_builder, algo)

        # Analyze performance
        df, sweep_vars = csv_load.load_bsuite(csv_dir)

        bandit_df = df[df.bsuite_env == b_env].copy()
Code Example #11
File: run_bc_jax.py Project: pchtsp/acme
def main(_):
    # Create an environment and grab the spec.
    raw_environment = bsuite.load_and_record_to_csv(
        bsuite_id=FLAGS.bsuite_id,
        results_dir=FLAGS.results_dir,
        overwrite=FLAGS.overwrite,
    )
    environment = single_precision.SinglePrecisionWrapper(raw_environment)
    environment_spec = specs.make_environment_spec(environment)

    # Build demonstration dataset.
    if hasattr(raw_environment, 'raw_env'):
        raw_environment = raw_environment.raw_env

    batch_dataset = bsuite_demonstrations.make_dataset(raw_environment)
    # Combine with demonstration dataset.
    transition = functools.partial(_n_step_transition_from_episode,
                                   n_step=1,
                                   additional_discount=1.)

    dataset = batch_dataset.map(transition)

    # Batch and prefetch.
    dataset = dataset.batch(FLAGS.batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    dataset = tfds.as_numpy(dataset)

    # Create the networks to optimize.
    policy_network = make_policy_network(environment_spec.actions)
    policy_network = hk.without_apply_rng(hk.transform(policy_network))

    # If the agent is non-autoregressive use epsilon=0 which will be a greedy
    # policy.
    def evaluator_network(params: hk.Params, key: jnp.DeviceArray,
                          observation: jnp.DeviceArray) -> jnp.DeviceArray:
        action_values = policy_network.apply(params, observation)
        return rlax.epsilon_greedy(FLAGS.epsilon).sample(key, action_values)

    counter = counting.Counter()
    learner_counter = counting.Counter(counter, prefix='learner')

    # The learner updates the parameters (and initializes them).
    learner = learning.BCLearner(network=policy_network,
                                 optimizer=optax.adam(FLAGS.learning_rate),
                                 obs_spec=environment.observation_spec(),
                                 dataset=dataset,
                                 counter=learner_counter,
                                 rng=hk.PRNGSequence(FLAGS.seed))

    # Create the actor which defines how we take actions.
    variable_client = variable_utils.VariableClient(learner, '')
    evaluator = actors.FeedForwardActor(evaluator_network,
                                        variable_client=variable_client,
                                        rng=hk.PRNGSequence(FLAGS.seed))

    eval_loop = acme.EnvironmentLoop(environment=environment,
                                     actor=evaluator,
                                     counter=counter,
                                     logger=loggers.TerminalLogger(
                                         'evaluation', time_delta=1.))

    # Run the environment loop.
    while True:
        for _ in range(FLAGS.evaluate_every):
            learner.step()
        learner_counter.increment(learner_steps=FLAGS.evaluate_every)
        eval_loop.run(FLAGS.evaluation_episodes)