def test_worker_weight_syncing(self):
    """
    Tests weight synchronization with a local agent and a remote worker.
    """
    # First, create a local agent.
    env_spec = dict(
        type="openai",
        gym_env="PongNoFrameskip-v4",
        # The frameskip in the agent config will trigger worker skips, this
        # is used for internal env.
        frameskip=4,
        max_num_noops=30,
        episodic_life=True
    )
    env = Environment.from_spec(env_spec)
    agent_config = config_from_path("configs/ray_apex_for_pong.json")

    # Remove unneeded apex params.
    if "apex_replay_spec" in agent_config:
        agent_config.pop("apex_replay_spec")

    ray_spec = agent_config["execution_spec"].pop("ray_spec")
    local_agent = Agent.from_spec(agent_config, state_space=env.state_space, action_space=env.action_space)
    ray_spec["worker_spec"]["worker_sample_size"] = 50

    # Create a remote worker with the same agent config.
    worker = RayValueWorker.as_remote().remote(agent_config, ray_spec["worker_spec"], env_spec, auto_build=True)

    # This imitates the initial executor sync without ray.put.
    weights = RayWeight(local_agent.get_weights())
    print('Weight type in init sync = {}'.format(type(weights)))
    ret = worker.set_weights.remote(weights)
    ray.wait([ret])
    print('Init weight sync successful.')

    # Replicate worker syncing steps as done in e.g. Ape-X executor:
    weights = RayWeight(local_agent.get_weights())
    print('Weight type returned by ray put = {}'.format(type(weights)))
    ret = worker.set_weights.remote(weights)
    ray.wait([ret])
    print('Object store weight sync successful.')
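# The test above mirrors the sync pattern the executors use at runtime: wrap the local
# agent's weights in RayWeight, push them to each remote worker via set_weights.remote(),
# and wait on the returned object ids. A condensed sketch of that pattern; the helper
# name below is illustrative and not part of the library:
def sync_worker_weights(local_agent, remote_workers):
    weights = RayWeight(local_agent.get_weights())
    pending = [worker.set_weights.remote(weights) for worker in remote_workers]
    # Block until every worker has acknowledged the new weights.
    ray.wait(pending, num_returns=len(pending))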
def __init__(self, environment_spec, agent_config):
    """
    Args:
        environment_spec (dict, callable): Environment spec or callable returning an environment. Each worker
            in the cluster will instantiate an environment using this spec or callable.
        agent_config (dict): Config dict containing agent and execution specs.
    """
    ray_spec = agent_config["execution_spec"].pop("ray_spec")
    self.worker_spec = ray_spec.pop("worker_spec")
    self.compress_states = self.worker_spec["compress_states"]
    super(SyncBatchExecutor, self).__init__(
        executor_spec=ray_spec.pop("executor_spec"),
        environment_spec=environment_spec,
        worker_spec=self.worker_spec
    )

    # Must specify an agent type.
    assert "type" in agent_config
    self.agent_config = agent_config

    environment = None
    if isinstance(self.environment_spec, dict):
        environment = Environment.from_spec(self.environment_spec)
    elif hasattr(self.environment_spec, '__call__'):
        environment = self.environment_spec()
    self.agent_config["state_space"] = environment.state_space
    self.agent_config["action_space"] = environment.action_space

    self.local_agent = self.build_agent_from_config(self.agent_config)
    self.update_batch_size = self.agent_config["update_spec"]["batch_size"]

    # Create remote sample workers based on ray cluster spec.
    self.num_sample_workers = self.executor_spec["num_sample_workers"]
    # These are the tasks actually interacting with the environment.
    self.worker_sample_size = self.executor_spec["num_worker_samples"]

    assert not ray_spec, "ERROR: ray_spec still contains items: {}".format(ray_spec)
    self.logger.info("Setting up execution for sync-batch executor.")
    self.setup_execution()
def __init__(self, num_envs, env_spec):
    """
    Args:
        num_envs (int): Number of environments to create.
        env_spec (Union[callable, dict]): Environment spec dict or callable returning a new environment.
    """
    self.num_envs = num_envs
    self.environments = list()

    for _ in range_(num_envs):
        if isinstance(env_spec, dict):
            env = Environment.from_spec(env_spec)
        elif hasattr(env_spec, '__call__'):
            env = env_spec()
        else:
            raise ValueError(
                "Env_spec must be either a dict containing an environment spec or a callable "
                "returning a new environment object."
            )
        self.environments.append(env)

    super(VectorEnv, self).__init__(
        state_space=self.environments[0].state_space,
        action_space=self.environments[0].action_space
    )
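# Usage sketch for the vectorized wrapper above: env_spec may be a spec dict or a
# zero-argument callable. The spec values here are illustrative placeholders.
pong_spec = dict(type="openai", gym_env="PongNoFrameskip-v4")
vector_env = VectorEnv(num_envs=4, env_spec=pong_spec)
# Equivalent, using a callable that builds a fresh environment per slot:
vector_env = VectorEnv(num_envs=4, env_spec=lambda: Environment.from_spec(pong_spec))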
def create_env():
    return Environment.from_spec(env_spec)
def test_environment_stepper_component_with_large_impala_architecture(self):
    env_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01",
        observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space

    actor_component = ActorComponent(
        # Preprocessor spec (only for image and prev-action channel).
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                ## The images from the env are divided by 255.
                #RGB_INTERLEAVED=[dict(type="divide", divisor=255)],
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1,)),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec.
        dict(network_spec=LargeIMPALANetwork(), action_space=action_space),
        # Exploration spec.
        Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1, start_timestep=0, num_timesteps=100
        )))
    )

    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=100,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
    )

    # Reset the stepper.
    test.test("reset")

    # Step n times through the Env and collect results.
    # 1st return value is the step-op (None), 2nd return value is the tuple of items, with each
    # step containing: Preprocessed state, actions, rewards, episode returns, terminals, (raw) next-states.
    time_start = time.perf_counter()
    steps = 10
    for _ in range(steps):
        out = test.test("step")
    time_total = time.perf_counter() - time_start
    print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec).".format(
        steps, environment_stepper.num_steps, time_total, environment_stepper.num_steps * steps / time_total
    ))

    # Check types of outputs.
    self.assertTrue(isinstance(out, DataOpTuple))  # the step results as a tuple (see below)
    # Check types of single data.
    self.assertTrue(out[0]["INSTR"].dtype == np.object)
    self.assertTrue(out[0]["RGB_INTERLEAVED"].dtype == np.float32)
    self.assertTrue(out[0]["RGB_INTERLEAVED"].min() >= 0.0)  # make sure we have pixels / 255
    self.assertTrue(out[0]["RGB_INTERLEAVED"].max() <= 1.0)
    self.assertTrue(out[1].dtype == np.int32)  # actions
    self.assertTrue(out[2].dtype == np.float32)  # rewards
    self.assertTrue(out[3].dtype == np.float32)  # episode return
    self.assertTrue(out[4].dtype == np.bool_)  # next-state is terminal?
    self.assertTrue(out[5]["INSTR"].dtype == np.object)  # next state (raw, not preprocessed)
    self.assertTrue(out[5]["RGB_INTERLEAVED"].dtype == np.uint8)  # next state (raw, not preprocessed)
    self.assertTrue(out[5]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
    self.assertTrue(out[5]["RGB_INTERLEAVED"].max() <= 255)
    # action probs (test whether they sum to one).
    self.assertTrue(out[6].dtype == np.float32)
    self.assertTrue(out[6].min() >= 0.0)
    self.assertTrue(out[6].max() <= 1.0)
    recursive_assert_almost_equal(
        out[6].sum(axis=-1, keepdims=False),
        np.ones(shape=(environment_stepper.num_steps,)),
        decimals=4
    )
    # internal states (c- and h-state)
    self.assertTrue(out[7][0].dtype == np.float32)
    self.assertTrue(out[7][1].dtype == np.float32)
    self.assertTrue(out[7][0].shape == (environment_stepper.num_steps, 256))
    self.assertTrue(out[7][1].shape == (environment_stepper.num_steps, 256))

    # Check whether episode returns match single rewards (including terminal signals).
    episode_returns = 0.0
    for i in range(environment_stepper.num_steps):
        episode_returns += out[2][i]
        self.assertAlmostEqual(episode_returns, out[3][i])
        # Terminal: Reset for next step.
        if out[4][i] is np.bool_(True):
            episode_returns = 0.0

    test.terminate()
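# For reference, the layout of the step-output tuple asserted in the test above (as implied by
# its own comments and assertions): out[0] preprocessed states, out[1] actions, out[2] rewards,
# out[3] episode returns, out[4] terminals, out[5] raw next states, out[6] action probabilities,
# out[7] LSTM internal states (c- and h-state).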
def setup_execution(self):
    # Create local worker agent according to spec.
    # Extract state and action spaces.
    environment = None
    if isinstance(self.environment_spec, dict):
        environment = Environment.from_spec(self.environment_spec)
    elif hasattr(self.environment_spec, '__call__'):
        environment = self.environment_spec()
    self.agent_config["state_space"] = environment.state_space
    self.agent_config["action_space"] = environment.action_space

    self.local_agent = Agent.from_spec(self.agent_config)

    # Set up worker thread for performing updates.
    self.update_worker = UpdateWorker(
        agent=self.local_agent,
        in_queue_size=self.executor_spec["learn_queue_size"]
    )

    # Start Ray cluster and connect to it.
    self.ray_init()

    # Create remote sample workers based on ray cluster spec.
    self.num_replay_workers = self.executor_spec["num_replay_workers"]
    self.num_sample_workers = self.executor_spec["num_sample_workers"]

    self.logger.info("Initializing {} local replay memories.".format(self.num_replay_workers))

    # Update memory size for the number of replay workers (each worker holds one shard).
    shard_size = int(self.apex_replay_spec["memory_spec"]["capacity"] / self.num_replay_workers)
    self.apex_replay_spec["memory_spec"]["capacity"] = shard_size
    self.logger.info("Shard size per memory: {}".format(self.apex_replay_spec["memory_spec"]["capacity"]))

    min_sample_size = self.apex_replay_spec["min_sample_memory_size"]
    self.apex_replay_spec["min_sample_memory_size"] = int(min_sample_size / self.num_replay_workers)
    self.logger.info("Sampling for learning starts at: {}".format(self.apex_replay_spec["min_sample_memory_size"]))

    # Set sample batch size.
    self.apex_replay_spec["sample_batch_size"] = self.agent_config["update_spec"]["batch_size"]
    self.logger.info("Sampling batch size {}".format(self.apex_replay_spec["sample_batch_size"]))

    self.ray_local_replay_memories = create_colocated_ray_actors(
        cls=RayMemoryActor.as_remote(num_cpus=self.num_cpus_per_replay_actor),
        config=self.apex_replay_spec,
        num_agents=self.num_replay_workers
    )

    # Create remote workers for data collection.
    self.worker_spec["worker_sample_size"] = self.worker_sample_size
    self.logger.info("Initializing {} remote data collection agents, sample size: {}".format(
        self.num_sample_workers, self.worker_spec["worker_sample_size"]
    ))
    self.ray_env_sample_workers = self.create_remote_workers(
        RayValueWorker, self.num_sample_workers, self.agent_config,  # *args
        self.worker_spec, self.environment_spec, self.worker_frame_skip
    )
    self.init_tasks()
def test_environment_stepper_component_with_large_impala_architecture(self):
    worker_sample_size = 100
    env_spec = dict(
        type="deepmind_lab", level_id="seekavoid_arena_01",
        observations=["RGB_INTERLEAVED", "INSTR"], frameskip=4
    )
    dummy_env = Environment.from_spec(env_spec)
    state_space = dummy_env.state_space
    action_space = dummy_env.action_space

    actor_component = ActorComponent(
        # Preprocessor spec (only for image and prev-action channel).
        dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                # The prev. action/reward from the env must be flattened/bumped-up-to-(1,).
                previous_action=[
                    dict(type="reshape", flatten=True, flatten_categories=action_space.num_categories)
                ],
                previous_reward=[
                    dict(type="reshape", new_shape=(1,)),
                    dict(type="convert_type", to_dtype="float32")
                ],
            )
        ),
        # Policy spec. worker_sample_size=1 as it's an actor network.
        dict(network_spec=LargeIMPALANetwork(worker_sample_size=1), action_space=action_space)
    )

    environment_stepper = EnvironmentStepper(
        environment_spec=env_spec,
        actor_component_spec=actor_component,
        state_space=state_space,
        reward_space="float32",
        internal_states_space=self.internal_states_space,
        num_steps=worker_sample_size,
        # Add both prev-action and -reward into the state sent through the network.
        add_previous_action_to_state=True,
        add_previous_reward_to_state=True,
        add_action_probs=True,
        action_probs_space=self.action_probs_space
    )

    test = ComponentTest(
        component=environment_stepper,
        action_space=action_space,
        execution_spec=dict(disable_monitoring=True)
    )
    environment_stepper.environment_server.start_server()

    # Step n times through the Env and collect results.
    # 1st return value is the step-op (None), 2nd return value is the tuple of items, with each
    # step containing: Preprocessed state, actions, rewards, episode returns, terminals, (raw) next-states.
    time_start = time.perf_counter()
    steps = 10
    for _ in range(steps):
        out = test.test("step")
    time_total = time.perf_counter() - time_start
    print("Done running {}x{} steps in Deepmind Lab env using IMPALA network in {}sec ({} actions/sec).".format(
        steps, environment_stepper.num_steps, time_total, environment_stepper.num_steps * steps / time_total
    ))

    # Check types of outputs.
    self.assertTrue(isinstance(out, DataOpTuple))  # the step results as a tuple (see below)
    # Check types of single data.
    self.assertTrue(out[0].dtype == np.bool_)  # next-state is terminal?
    self.assertTrue(out[1]["INSTR"].dtype == np.object)
    self.assertTrue(out[1]["RGB_INTERLEAVED"].dtype == np.uint8)
    self.assertTrue(
        out[1]["RGB_INTERLEAVED"].shape == (worker_sample_size + 1,) + state_space["RGB_INTERLEAVED"].shape
    )
    self.assertTrue(out[1]["RGB_INTERLEAVED"].min() >= 0)  # make sure we have pixels
    self.assertTrue(out[1]["RGB_INTERLEAVED"].max() <= 255)
    self.assertTrue(out[1]["previous_action"].dtype == np.int32)  # actions
    self.assertTrue(out[1]["previous_action"].shape == (worker_sample_size + 1,))
    self.assertTrue(out[1]["previous_reward"].dtype == np.float32)  # rewards
    self.assertTrue(out[1]["previous_reward"].shape == (worker_sample_size + 1,))
    # action probs (test whether they sum to one).
    self.assertTrue(out[2].dtype == np.float32)
    self.assertTrue(out[2].shape == (100, action_space.num_categories))
    self.assertTrue(out[2].min() >= 0.0)
    self.assertTrue(out[2].max() <= 1.0)
    recursive_assert_almost_equal(
        out[2].sum(axis=-1, keepdims=False),
        np.ones(shape=(worker_sample_size,)),
        decimals=4
    )
    # internal states (c- and h-state)
    self.assertTrue(out[3][0].dtype == np.float32)
    self.assertTrue(out[3][0].shape == (worker_sample_size + 1, 256))
    self.assertTrue(out[3][1].dtype == np.float32)
    self.assertTrue(out[3][1].shape == (worker_sample_size + 1, 256))

    environment_stepper.environment_server.stop_server()
    test.terminate()
def main(argv):
    try:
        FLAGS(argv)
    except flags.Error as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))

    agent_config_path = os.path.join(os.getcwd(), FLAGS.config)
    with open(agent_config_path, 'rt') as fp:
        agent_config = json.load(fp)

    cluster_spec_config_path = os.path.join(os.getcwd(), FLAGS.cluster_spec)
    with open(cluster_spec_config_path, 'rt') as fp:
        cluster_spec = json.load(fp)

    # Environment options.
    env_spec = {
        "type": "deepmind-lab",
        "level_id": FLAGS.level,
        "frameskip": 4,
        "observations": ["RGB_INTERLEAVED", "INSTR"]
    }

    # Verbose usage errors.
    if FLAGS.actor and FLAGS.learner:
        print("Please only use either --actor or --learner, not both.")
        sys.exit(1)

    # We dynamically update the distributed spec according to the job and task index.
    if FLAGS.actor:
        agent_type = 'actor'
        distributed_spec = dict(
            job='actor',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
        # Actors should only act on CPUs.
        agent_config['execution_spec']['gpu_spec'].update({
            "gpus_enabled": False,
            "max_usable_gpus": 0,
            "num_gpus": 0
        })
    elif FLAGS.learner:
        agent_type = 'learner'
        distributed_spec = dict(
            job='learner',
            task_index=FLAGS.task,
            cluster_spec=cluster_spec
        )
    else:
        print("Please pass either --learner or --actor (or look at the CartPole example for single-training mode).")
        sys.exit(1)

    # Set the sample size for the workers.
    worker_sample_size = 100

    # Since we dynamically update the cluster spec according to the job and task index, we need to
    # manually update the execution spec as well.
    execution_spec = agent_config['execution_spec']
    execution_spec.update(dict(
        mode="distributed",
        distributed_spec=distributed_spec
    ))

    # Now, create the environment.
    env = Environment.from_spec(env_spec)
    agent_spec = dict(
        type=agent_type,
        architecture="large",
        environment_spec=env_spec,
        worker_sample_size=worker_sample_size,
        state_space=env.state_space,
        action_space=env.action_space,
        # TODO: automate this (by lookup from NN).
        internal_states_space=IMPALAAgent.default_internal_states_space,
        execution_spec=execution_spec,
        # update_spec=dict(batch_size=2),
        # Summarize time-steps to have an overview of the env-stepping speed.
        summary_spec=dict(
            summary_regexp="time-step",
            directory="/tmp/impala_{}_{}/".format(agent_type, FLAGS.task)
        )
    )
    agent_config.update(agent_spec)
    agent = IMPALAAgent(**agent_config)

    if FLAGS.learner:
        print("Starting learner for {} updates.".format(FLAGS.updates))
        for _ in range(FLAGS.updates):
            start_time = time.perf_counter()
            results = agent.update()
    else:
        # Actor just acts.
        print("Starting actor. Terminate with SIGINT (Ctrl+C).")
        while True:
            agent.call_api_method("perform_n_steps_and_insert_into_fifo")  # .monitored_session.run([agent.enqueue_op])

    learn_updates = 100
    mean_returns = []
    for i in range(learn_updates):
        ret = agent.update()
        mean_return = _calc_mean_return(ret)
        mean_returns.append(mean_return)
        print("Iteration={} Loss={:.4f} Avg-reward={:.2f}".format(i, float(ret[1]), mean_return))

    print("Mean return: {:.2f} / over the last 10 episodes: {:.2f}".format(
        np.nanmean(mean_returns), np.nanmean(mean_returns[-10:])
    ))

    time.sleep(1)
    agent.terminate()
    time.sleep(1)
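# Hypothetical launch commands for the script above, using only the flags it references
# (--config, --cluster_spec, --level, --task, --learner/--actor, --updates); the script and
# file names are placeholders, not taken from the repository:
#   python impala_dmlab_example.py --config agent.json --cluster_spec cluster.json \
#       --level seekavoid_arena_01 --learner --task 0 --updates 100
#   python impala_dmlab_example.py --config agent.json --cluster_spec cluster.json \
#       --level seekavoid_arena_01 --actor --task 0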