def main(_):
  environment_factory = lp_utils.partial_kwargs(
      helpers.make_environment,
      domain_name=FLAGS.domain,
      task_name=FLAGS.task)

  batch_size = 32
  sequence_length = 20
  gradient_steps_per_actor_step = 1.0
  samples_per_insert = (
      gradient_steps_per_actor_step * batch_size * sequence_length)
  num_actors = 1

  program = svg0_prior.DistributedSVG0(
      environment_factory=environment_factory,
      network_factory=lp_utils.partial_kwargs(
          svg0_prior.make_default_networks),
      batch_size=batch_size,
      sequence_length=sequence_length,
      samples_per_insert=samples_per_insert,
      entropy_regularizer_cost=1e-4,
      max_replay_size=int(2e6),
      target_update_period=250,
      num_actors=num_actors).build()

  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_: Any) -> None:
  # Environment.
  environment_factory = functools.partial(
      debugging_utils.make_environment,
      env_name=FLAGS.env_name,
      action_space=FLAGS.action_space,
  )

  # Networks.
  network_factory = lp_utils.partial_kwargs(
      madqn.make_default_networks, archecture_type=ArchitectureType.recurrent)

  # Checkpointer appends "Checkpoints" to checkpoint_dir.
  checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

  # Log every [log_every] seconds.
  log_every = 10
  logger_factory = functools.partial(
      logger_utils.make_logger,
      directory=FLAGS.base_dir,
      to_terminal=True,
      to_tensorboard=True,
      time_stamp=FLAGS.mava_id,
      time_delta=log_every,
  )

  # Distributed program.
  program = madqn.MADQN(
      environment_factory=environment_factory,
      network_factory=network_factory,
      logger_factory=logger_factory,
      num_executors=1,
      exploration_scheduler_fn=LinearExplorationScheduler,
      epsilon_min=0.05,
      epsilon_decay=5e-4,
      optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      checkpoint_subpath=checkpoint_dir,
      trainer_fn=madqn.training.MADQNRecurrentTrainer,
      executor_fn=madqn.execution.MADQNRecurrentExecutor,
      batch_size=32,
  ).build()

  # Ensure only the trainer runs on GPU, while other processes run on CPU.
  gpu_id = -1
  env_vars = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
  local_resources = {
      "trainer": [],
      "evaluator": PythonProcess(env=env_vars),
      "executor": PythonProcess(env=env_vars),
  }

  # Launch.
  lp.launch(
      program,
      lp.LaunchType.LOCAL_MULTI_PROCESSING,
      terminal="current_terminal",
      local_resources=local_resources,
  )
def test_agent(self, distributional_critic):
  # Create objectives.
  reward_objectives, qvalue_objectives = make_objectives()

  network_factory = lp_utils.partial_kwargs(
      make_networks, distributional_critic=distributional_critic)

  agent = mompo.DistributedMultiObjectiveMPO(
      reward_objectives,
      qvalue_objectives,
      environment_factory=make_environment,
      network_factory=network_factory,
      num_actors=2,
      batch_size=32,
      min_replay_size=32,
      max_replay_size=1000,
  )
  program = agent.build()

  (learner_node,) = program.groups['learner']
  learner_node.disable_run()

  lp.launch(program, launch_type='test_mt')

  learner: acme.Learner = learner_node.create_handle().dereference()

  for _ in range(5):
    learner.step()
def main(_: Any) -> None:
  # Environment.
  environment_factory = functools.partial(
      smac_utils.make_environment, map_name=FLAGS.map_name)

  # Networks.
  network_factory = lp_utils.partial_kwargs(
      vdn.make_default_networks, policy_networks_layer_sizes=[64, 64])

  # Checkpointer appends "Checkpoints" to checkpoint_dir.
  checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

  # Log every [log_every] seconds.
  log_every = 10
  logger_factory = functools.partial(
      logger_utils.make_logger,
      directory=FLAGS.base_dir,
      to_terminal=True,
      to_tensorboard=True,
      time_stamp=FLAGS.mava_id,
      time_delta=log_every,
  )

  # Distributed program.
  program = vdn.VDN(
      environment_factory=environment_factory,
      network_factory=network_factory,
      logger_factory=logger_factory,
      num_executors=1,
      exploration_scheduler_fn=LinearExplorationScheduler,
      epsilon_min=0.05,
      epsilon_decay=1e-5,
      optimizer=snt.optimizers.SGD(learning_rate=1e-2),
      checkpoint_subpath=checkpoint_dir,
      batch_size=512,
      executor_variable_update_period=100,
      target_update_period=200,
      max_gradient_norm=10.0,
      eval_loop_fn=MonitorParallelEnvironmentLoop,
      eval_loop_fn_kwargs={
          "path": checkpoint_dir,
          "record_every": 100
      },
  ).build()

  # Launch.
  gpu_id = -1
  env_vars = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
  local_resources = {
      "trainer": [],
      "evaluator": PythonProcess(env=env_vars),
      "executor": PythonProcess(env=env_vars),
  }
  lp.launch(
      program,
      lp.LaunchType.LOCAL_MULTI_PROCESSING,
      terminal="current_terminal",
      local_resources=local_resources,
  )
def test_agent(self):
  env_factory = lambda x: fakes.fake_atari_wrapped(oar_wrapper=True)
  net_factory = lambda spec: networks.R2D2AtariNetwork(spec.num_values)

  agent = r2d2.DistributedR2D2(
      environment_factory=env_factory,
      network_factory=net_factory,
      num_actors=2,
      batch_size=32,
      min_replay_size=32,
      max_replay_size=1000,
      replay_period=1,
      burn_in_length=1,
      trace_length=10,
  )
  program = agent.build()

  (learner_node,) = program.groups['learner']
  learner_node.disable_run()

  lp.launch(program, launch_type='test_mt')

  learner: acme.Learner = learner_node.create_handle().dereference()

  for _ in range(5):
    learner.step()
def main(_: Any) -> None:
  # Environment.
  environment_factory = functools.partial(
      debugging_utils.make_environment,
      env_name=FLAGS.env_name,
      action_space=FLAGS.action_space,
  )

  # Networks.
  network_factory = lp_utils.partial_kwargs(
      maddpg.make_default_networks, shared_weights=False)

  # Checkpointer appends "Checkpoints" to checkpoint_dir.
  checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

  # Log every [log_every] seconds.
  log_every = 10
  logger_factory = functools.partial(
      logger_utils.make_logger,
      directory=FLAGS.base_dir,
      to_terminal=True,
      to_tensorboard=True,
      time_stamp=FLAGS.mava_id,
      time_delta=log_every,
  )

  # Distributed program.
  program = maddpg.MADDPG(
      environment_factory=environment_factory,
      network_factory=network_factory,
      logger_factory=logger_factory,
      num_executors=1,
      policy_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      critic_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      checkpoint_subpath=checkpoint_dir,
      max_gradient_norm=40.0,
      trainer_fn=maddpg.MADDPGNetworkedTrainer,
      architecture=architectures.NetworkedQValueCritic,
      connection_spec=custom_connected_network_spec,
      shared_weights=False,
  ).build()

  # Ensure only the trainer runs on GPU, while other processes run on CPU.
  gpu_id = -1
  env_vars = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
  local_resources = {
      "trainer": [],
      "evaluator": PythonProcess(env=env_vars),
      "executor": PythonProcess(env=env_vars),
  }

  # Launch.
  lp.launch(
      program,
      lp.LaunchType.LOCAL_MULTI_PROCESSING,
      terminal="current_terminal",
      local_resources=local_resources,
  )
def main(_: Any) -> None:
  # Environment.
  environment_factory = functools.partial(
      pettingzoo_utils.make_environment,
      env_class=FLAGS.env_class,
      env_name=FLAGS.env_name,
  )

  # Networks.
  network_factory = lp_utils.partial_kwargs(
      maddpg.make_default_networks, archecture_type=ArchitectureType.recurrent)

  # Checkpointer appends "Checkpoints" to checkpoint_dir.
  checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

  # Log every [log_every] seconds.
  log_every = 10
  logger_factory = functools.partial(
      logger_utils.make_logger,
      directory=FLAGS.base_dir,
      to_terminal=True,
      to_tensorboard=True,
      time_stamp=FLAGS.mava_id,
      time_delta=log_every,
  )

  # Distributed program.
  program = maddpg.MADDPG(
      environment_factory=environment_factory,
      network_factory=network_factory,
      logger_factory=logger_factory,
      num_executors=1,
      policy_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      critic_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      checkpoint_subpath=checkpoint_dir,
      max_gradient_norm=40.0,
      trainer_fn=maddpg.training.MADDPGDecentralisedRecurrentTrainer,
      executor_fn=maddpg.execution.MADDPGRecurrentExecutor,
      batch_size=32,
  ).build()

  # Ensure only the trainer runs on GPU, while other processes run on CPU.
  gpu_id = -1
  env_vars = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
  local_resources = {
      "trainer": [],
      "evaluator": PythonProcess(env=env_vars),
      "executor": PythonProcess(env=env_vars),
  }

  # Launch.
  lp.launch(
      program,
      lp.LaunchType.LOCAL_MULTI_PROCESSING,
      terminal="current_terminal",
      local_resources=local_resources,
  )
def main(_):
  # Define a program which describes the topology of communicating nodes and
  # edges. In more involved examples, several programs can be defined and
  # launched at once.
  program = make_program(num_producers=FLAGS.num_producers)

  # Note that at launch time, none of the producers has been instantiated.
  # Producers are instantiated only at runtime.
  lp.launch(program)
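# A minimal sketch of what a `make_program` like the one used above might look
# like, to show how a Launchpad topology is assembled. The `Producer` and
# `Consumer` classes here are hypothetical stand-ins; the real example defines
# its own node implementations.
def make_program(num_producers: int) -> lp.Program:
  program = lp.Program('consumer_producers')

  # Each producer becomes its own courier node; `add_node` returns a handle
  # that other nodes can use to call into the producer at runtime.
  with program.group('producer'):
    producers = [
        program.add_node(lp.CourierNode(Producer))
        for _ in range(num_producers)
    ]

  # The consumer receives the producer handles so it can fan work out to them.
  with program.group('consumer'):
    program.add_node(lp.CourierNode(Consumer, producers=producers))

  return program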
def main(_):
  config = build_experiment_config()
  # Evaluation is disabled for performance reasons. Set `num_eval_episodes` to
  # a positive number and remove `evaluator_factories=[]` to enable it.
  if FLAGS.run_distributed:
    program = experiments.make_distributed_experiment(
        experiment=config, num_actors=4)
    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
  else:
    experiments.run_experiment(experiment=config, num_eval_episodes=0)
def main(_):
  environment_factory = lp_utils.partial_kwargs(
      helpers.make_environment, task=FLAGS.task)

  program = d4pg.DistributedD4PG(
      environment_factory=environment_factory,
      network_factory=lp_utils.partial_kwargs(helpers.make_networks),
      num_actors=2).build()

  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  environment_factory = lp_utils.partial_kwargs(
      helpers.make_environment, task=FLAGS.task)

  program = d4pg.DistributedD4PG(
      environment_factory=environment_factory,
      network_factory=lp_utils.partial_kwargs(helpers.make_networks),
      num_actors=2).build()

  lp.launch(program, lp.LaunchType.LOCAL_MULTI_PROCESSING)
def main(_):
  config = build_experiment_config()
  if FLAGS.run_distributed:
    program = experiments.make_distributed_experiment(
        experiment=config, num_actors=4)
    lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
  else:
    experiments.run_experiment(
        experiment=config,
        eval_every=FLAGS.eval_every,
        num_eval_episodes=FLAGS.evaluation_episodes)
def main(_):
  task = FLAGS.task
  environment_factory = lambda seed: helpers.make_environment(task)

  program = sac.DistributedSAC(
      environment_factory=environment_factory,
      network_factory=sac.make_networks,
      config=sac.SACConfig(num_sgd_steps_per_step=64),
      num_actors=4,
      seed=1,
      max_number_of_steps=100).build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_: Any) -> None:
  # Environment.
  environment_factory = lp_utils.partial_kwargs(robocup_utils.make_environment)

  # Networks.
  network_factory = lp_utils.partial_kwargs(
      mad4pg.make_default_networks,
      archecture_type=ArchitectureType.recurrent,
  )

  # Checkpointer appends "Checkpoints" to checkpoint_dir.
  checkpoint_dir = f"{FLAGS.base_dir}/{FLAGS.mava_id}"

  # Log every [log_every] seconds.
  log_every = 10
  logger_factory = functools.partial(
      logger_utils.make_logger,
      directory=FLAGS.base_dir,
      to_terminal=True,
      to_tensorboard=True,
      time_stamp=FLAGS.mava_id,
      time_delta=log_every,
  )

  program = mad4pg.MAD4PG(
      architecture=StateBasedQValueCritic,
      environment_factory=environment_factory,
      network_factory=network_factory,
      logger_factory=logger_factory,
      num_executors=int(FLAGS.num_executors),
      samples_per_insert=None,
      trainer_fn=MAD4PGStateBasedRecurrentTrainer,
      executor_fn=MAD4PGRecurrentExecutor,
      shared_weights=True,
      checkpoint_subpath=checkpoint_dir,
      batch_size=265,
  ).build()

  # Launch.
  gpu_id = -1
  env_vars = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
  local_resources = {
      "trainer": [],
      "evaluator": PythonProcess(env=env_vars),
      "executor": PythonProcess(env=env_vars),
  }
  lp.launch(
      program,
      lp.LaunchType.LOCAL_MULTI_PROCESSING,
      terminal="current_terminal",
      local_resources=local_resources,
  )
def test_maddpg_on_debugging_env(self) -> None:
  """Tests that the system can run on the simple spread debugging
  environment without crashing."""
  # Environment.
  environment_factory = functools.partial(
      debugging_utils.make_environment,
      env_name="simple_spread",
      action_space="continuous",
  )

  # Networks.
  network_factory = lp_utils.partial_kwargs(make_networks)

  # System.
  system = maddpg.MADDPG(
      environment_factory=environment_factory,
      network_factory=network_factory,
      num_executors=2,
      batch_size=32,
      min_replay_size=32,
      max_replay_size=1000,
      policy_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      critic_optimizer=snt.optimizers.Adam(learning_rate=1e-4),
      checkpoint=False,
  )
  program = system.build()

  (trainer_node,) = program.groups["trainer"]
  trainer_node.disable_run()

  # Launch gpu config - don't use gpu.
  gpu_id = -1
  env_vars = {"CUDA_VISIBLE_DEVICES": str(gpu_id)}
  local_resources = {
      "trainer": PythonProcess(env=env_vars),
      "evaluator": PythonProcess(env=env_vars),
      "executor": PythonProcess(env=env_vars),
  }

  lp.launch(
      program,
      launch_type="test_mt",
      local_resources=local_resources,
  )

  trainer: mava.Trainer = trainer_node.create_handle().dereference()

  for _ in range(5):
    trainer.step()
def main(_):
  task = FLAGS.task
  environment_factory = lambda seed: helpers.make_environment(task)

  sac_config = sac.SACConfig(num_sgd_steps_per_step=64)
  sac_builder = sac.SACBuilder(sac_config)
  ail_config = ail.AILConfig(
      direct_rl_batch_size=(sac_config.batch_size *
                            sac_config.num_sgd_steps_per_step))

  def network_factory(spec: specs.EnvironmentSpec) -> ail.AILNetworks:

    def discriminator(*args, **kwargs) -> networks_lib.Logits:
      return ail.DiscriminatorModule(
          environment_spec=spec,
          use_action=True,
          use_next_obs=True,
          network_core=ail.DiscriminatorMLP([4, 4]),
      )(*args, **kwargs)

    discriminator_transformed = hk.without_apply_rng(
        hk.transform_with_state(discriminator))

    return ail.AILNetworks(
        ail.make_discriminator(spec, discriminator_transformed),
        imitation_reward_fn=ail.rewards.gail_reward(),
        direct_rl_networks=sac.make_networks(spec))

  def policy_network(
      network: ail.AILNetworks,
      eval_mode: bool = False) -> actor_core_lib.FeedForwardPolicy:
    return sac.apply_policy_and_sample(
        network.direct_rl_networks, eval_mode=eval_mode)

  program = ail.DistributedAIL(
      environment_factory=environment_factory,
      rl_agent=sac_builder,
      config=ail_config,
      network_factory=network_factory,
      seed=0,
      batch_size=sac_config.batch_size * sac_config.num_sgd_steps_per_step,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name),
      policy_network=policy_network,
      evaluator_policy_network=(lambda n: policy_network(n, eval_mode=True)),
      num_actors=4,
      max_number_of_steps=100,
      discriminator_loss=ail.losses.gail_loss()).build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  # Configure the environment factory with requested task.
  make_environment = functools.partial(
      helpers.make_environment,
      domain_name=_DOMAIN.value,
      task_name=_TASK.value)

  # Construct the program.
  program_builder = mpo.DistributedMPO(
      make_environment,
      make_networks,
      target_policy_update_period=25,
      max_actor_steps=_MAX_ACTOR_STEPS.value,
      num_actors=4)

  lp.launch(programs=program_builder.build())
def main(_):
  task = FLAGS.task
  env_factory = lambda seed: helpers.make_environment(task)

  environment_spec = specs.make_environment_spec(env_factory(True))
  program = td3.DistributedTD3(
      environment_factory=env_factory,
      environment_spec=environment_spec,
      network_factory=td3.make_networks,
      config=td3.TD3Config(),
      num_actors=4,
      seed=1,
      max_number_of_steps=100).build()

  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  # Configure the environment factory with requested task.
  make_environment = functools.partial(
      helpers.make_environment,
      domain_name=_DOMAIN.value,
      task_name=_TASK.value)

  # Construct the program.
  program_builder = d4pg.DistributedD4PG(
      make_environment,
      make_networks,
      max_actor_steps=_MAX_ACTOR_STEPS.value,
      num_actors=4)

  # Launch experiment.
  lp.launch(programs=program_builder.build())
def main(_):
  task = FLAGS.task
  environment_factory = lambda seed: helpers.make_environment(task)

  config = ppo.PPOConfig(
      unroll_length=16,
      num_minibatches=32,
      num_epochs=10,
      batch_size=2048 // 16)
  program = ppo.DistributedPPO(
      environment_factory=environment_factory,
      network_factory=ppo.make_continuous_networks,
      config=config,
      seed=FLAGS.seed,
      num_actors=4,
      max_number_of_steps=100).build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def main(_):
  task = FLAGS.env_name
  environment_factory = lambda seed: helpers.make_environment(task)

  config = value_dice.ValueDiceConfig(num_sgd_steps_per_step=64)
  agent = value_dice.DistributedValueDice(
      environment_factory=environment_factory,
      network_factory=value_dice.make_networks,
      config=config,
      num_actors=4,
      log_to_bigtable=True,
      max_number_of_steps=100,
      seed=1,
      make_demonstrations=functools.partial(
          helpers.make_demonstration_iterator,
          dataset_name=FLAGS.dataset_name))
  program = agent.build()

  # Launch experiment.
  lp.launch(program, xm_resources=lp_utils.make_xm_docker_resources(program))
def test_agent(self):
  env_factory = lambda seed: fakes.fake_atari_wrapped(oar_wrapper=True)

  config = r2d2.R2D2Config(
      batch_size=1,
      trace_length=5,
      sequence_period=1,
      samples_per_insert=1.,
      min_replay_size=32,
      burn_in_length=1,
      prefetch_size=2,
      target_update_period=2500,
      max_replay_size=100_000,
      importance_sampling_exponent=0.6,
      priority_exponent=0.9,
      max_priority_weight=0.9,
      bootstrap_n=5,
      clip_rewards=False,
      variable_update_period=400)

  dummy_seed = 1
  agent = r2d2.DistributedR2D2FromConfig(
      environment_factory=env_factory,
      environment_spec=acme.make_environment_spec(env_factory(dummy_seed)),
      network_factory=functools.partial(r2d2.make_atari_networks,
                                        config.batch_size),
      config=config,
      seed=0,
      num_actors=1,
  )

  program = agent.build()
  (learner_node,) = program.groups['learner']
  learner_node.disable_run()  # pytype: disable=attribute-error

  lp.launch(program, launch_type='test_mt')

  learner: acme.Learner = learner_node.create_handle().dereference()

  for _ in range(5):
    learner.step()
def test_distributed_sac_fd(self):

  def make_env(seed):
    del seed
    return fakes.ContinuousEnvironment(
        episode_length=10, action_dim=3, observation_dim=5, bounded=True)

  spec = specs.make_environment_spec(make_env(seed=0))

  batch_size = 10
  sac_config = sac.SACConfig(
      batch_size=batch_size,
      target_entropy=sac.target_entropy_from_env_spec(spec),
      min_replay_size=16,
      samples_per_insert=2)
  lfd_config = config.LfdConfig(
      initial_insert_count=0, demonstration_ratio=0.2)
  sac_fd_config = sacfd_agents.SACfDConfig(
      lfd_config=lfd_config, sac_config=sac_config)

  agent = sacfd_agents.DistributedSACfD(
      environment_factory=make_env,
      network_factory=sac.make_networks,
      sac_fd_config=sac_fd_config,
      lfd_iterator_fn=fake_demonstration_iterator,
      seed=0,
      num_actors=2)

  program = agent.build()
  (learner_node,) = program.groups['learner']
  learner_node.disable_run()  # pytype: disable=attribute-error

  lp.launch(program, launch_type='test_mt')

  learner: acme.Learner = learner_node.create_handle().dereference()

  for _ in range(5):
    learner.step()
def main(_):
  # Configure the environment factory with requested task.
  make_environment = functools.partial(
      helpers.make_environment,
      domain_name=_DOMAIN.value,
      task_name=_TASK.value,
      from_pixels=True,
      frames_to_stack=3,
      num_action_repeats=2)

  # Construct the program.
  program_builder = dmpo.DistributedDistributionalMPO(
      make_environment,
      make_networks,
      n_step=3,  # Reduce the n-step to account for action-repeat.
      max_actor_steps=_MAX_ACTOR_STEPS.value,
      num_actors=4)

  # Launch experiment.
  lp.launch(programs=program_builder.build())
def test_control_suite(self):
  """Tests that the agent can run on the control suite without crashing."""
  agent = svg0_prior.DistributedSVG0(
      environment_factory=lambda x: fakes.ContinuousEnvironment(),
      network_factory=make_networks,
      num_actors=2,
      batch_size=32,
      min_replay_size=32,
      max_replay_size=1000,
  )
  program = agent.build()

  (learner_node,) = program.groups['learner']
  learner_node.disable_run()

  lp.launch(program, launch_type='test_mt')

  learner: acme.Learner = learner_node.create_handle().dereference()

  for _ in range(5):
    learner.step()
def test_agent(self):
  agent = mpo.DistributedMPO(
      environment_factory=lambda x: fakes.ContinuousEnvironment(bounded=True),
      network_factory=make_networks,
      num_actors=2,
      batch_size=32,
      min_replay_size=32,
      max_replay_size=1000,
  )
  program = agent.build()

  (learner_node,) = program.groups['learner']
  learner_node.disable_run()

  lp.launch(program, launch_type='test_mt')

  learner: acme.Learner = learner_node.create_handle().dereference()

  for _ in range(5):
    learner.step()
def test_consumer_steps(self):
  """Runs the program and makes sure the consumer can run 10 steps."""
  program = launch.make_program(num_producers=2)

  # Retrieve the consumer node from the program. Nodes are organized as a
  # mapping of label->nodes, stored as a dict in `program.groups`.
  (consumer_node,) = program.groups['consumer']

  # Disable the automatic execution of its `run()` method.
  consumer_node.disable_run()  # pytype: disable=attribute-error

  # Launch all workers declared by the program. Remember to set the launch
  # type here (test & multithreaded).
  lp.launch(program, launch_type='test_mt')

  # Dereference `consumer_node`'s courier handle explicitly to obtain a
  # courier client for it.
  consumer = consumer_node.create_handle().dereference()

  # The success criterion for this integration test is that the consumer can
  # take 10 steps.
  for _ in range(10):
    consumer.step()
def test_atari(self): """Tests that the agent can run for some steps without crashing.""" env_factory = lambda x: fakes.fake_atari_wrapped(oar_wrapper=True) net_factory = lambda spec: networks.IMPALAAtariNetwork(spec.num_values) agent = impala.DistributedIMPALA( environment_factory=env_factory, network_factory=net_factory, num_actors=2, batch_size=32, sequence_length=5, sequence_period=1, ) program = agent.build() (learner_node, ) = program.groups['learner'] learner_node.disable_run() lp.launch(program, launch_type='test_mt') learner: acme.Learner = learner_node.create_handle().dereference() for _ in range(5): learner.step()
def test_atari(self): """Tests that the agent can run for some steps without crashing.""" env_factory = lambda x: fakes.fake_atari_wrapped() net_factory = lambda spec: networks.DQNAtariNetwork(spec.num_values) agent = dqn.DistributedDQN( environment_factory=env_factory, network_factory=net_factory, num_actors=2, batch_size=32, min_replay_size=32, max_replay_size=1000, ) program = agent.build() (learner_node, ) = program.groups['learner'] learner_node.disable_run() lp.launch(program, launch_type='test_mt') learner: acme.Learner = learner_node.create_handle().dereference() for _ in range(5): learner.step()
def main(_):
  # Configure the environment factory with requested task.
  make_environment = functools.partial(
      helpers.make_environment,
      domain_name=_DOMAIN.value,
      task_name=_TASK.value,
      from_pixels=True,
      frames_to_stack=3,
      flatten_stack=True,
      num_action_repeats=2)

  # Construct the program.
  program_builder = dmpo.DistributedDistributionalMPO(
      make_environment,
      make_networks,
      target_policy_update_period=100,
      max_actor_steps=_MAX_ACTOR_STEPS.value,
      num_actors=4,
      samples_per_insert=256,
      n_step=3,  # Reduce the n-step to account for action-repeat.
      observation_augmentation=image_augmentation.pad_and_crop,
  )

  # Launch experiment.
  lp.launch(programs=program_builder.build())