def main(_):
  while True:
    try:
      # Client to communicate with the learner.
      client = grpc.Client(FLAGS.server_address)
      env = config.create_environment(FLAGS.task)
      # Unique ID to identify a specific run of an actor.
      run_id = np.random.randint(np.iinfo(np.int64).max)
      observation = env.reset()
      reward = 0.0
      raw_reward = 0.0
      done = False
      while True:
        env_output = utils.EnvOutput(reward, done, np.array(observation))
        action = client.inference(
            (FLAGS.task, run_id, env_output, raw_reward))
        observation, reward, done, info = env.step(action.numpy())
        raw_reward = float(info.get('score_reward', reward))
        if done:
          observation = env.reset()
    except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
      logging.exception(e)
      env.close()
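# The loops in this file exchange a small `EnvOutput` tuple with the learner.
# Below is a minimal sketch of what `utils.EnvOutput` might look like,
# assuming a plain namedtuple; the project's `utils` module is authoritative,
# and the batched loops further down use an extended five-field variant
# (reward, done, observation, abandoned, episode_step).
import collections

import numpy as np

EnvOutput = collections.namedtuple('EnvOutput',
                                   ['reward', 'done', 'observation'])

# Example: the tuple sent on the very first inference call of an episode.
example_output = EnvOutput(reward=0.0, done=False, observation=np.zeros(4))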
def testRunLearner(self):
  FLAGS.unroll_length = 6
  FLAGS.batch_size = 2
  logdir = FLAGS.test_tmpdir
  mock_problem = testing_utils.MockProblem(unroll_length=FLAGS.unroll_length)
  actor_output_spec = mock_problem.get_actor_output_spec()
  utils.write_specs(logdir, actor_output_spec)
  # Create dummy tensors with the right structure.
  zero_actor_output = tf.nest.map_structure(
      lambda sp: tf.zeros(shape=sp.shape, dtype=sp.dtype), actor_output_spec)
  server_address = 'unix:/tmp/learner_test_grpc'
  hparams = {}
  hparams['logdir'] = logdir
  hparams['final_iteration'] = 5
  hparams['iter_frame_ratio'] = FLAGS.batch_size * FLAGS.unroll_length
  # Create a learner in a background thread. (Otherwise this call would
  # block.)
  thread = threading.Thread(
      target=learner.run_with_address,
      args=(mock_problem, server_address, hparams))
  thread.start()
  # Creating a client blocks until the learner responds.
  client = grpc.Client(server_address)
  # Send a number of enqueue RPCs to the learner.
  for _ in range(FLAGS.batch_size * hparams['final_iteration']):
    client.enqueue(tf.nest.flatten(zero_actor_output))  # pytype: disable=attribute-error
  # The learner should terminate after a fixed number of iterations.
  thread.join()
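# The test above builds its dummy enqueue payload from TensorSpecs. A tiny,
# self-contained illustration of the same spec -> zero-tensor pattern, using
# made-up specs (`MockProblem` and its actor output spec are project-specific):
import tensorflow as tf

_example_spec = {
    'reward': tf.TensorSpec(shape=[6], dtype=tf.float32),
    'action': tf.TensorSpec(shape=[6], dtype=tf.int32),
}
_zeros = tf.nest.map_structure(
    lambda sp: tf.zeros(shape=sp.shape, dtype=sp.dtype), _example_spec)
assert _zeros['action'].dtype == tf.int32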
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
  """
  logging.info('Starting actor loop')
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000,
        max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        env = create_env_fn(FLAGS.task)
        # Unique ID to identify a specific run of an actor.
        run_id = np.random.randint(np.iinfo(np.int64).max)
        observation = env.reset()
        reward = 0.0
        raw_reward = 0.0
        done = False
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(reward, done, observation)
          with timer_cls('actor/elapsed_inference_s', 1000):
            action = client.inference(
                (FLAGS.task, run_id, env_output, raw_reward))
          with timer_cls('actor/elapsed_env_step_s', 1000):
            observation, reward, done, info = env.step(action.numpy())
          raw_reward = float(info.get('score_reward', reward))
          if done:
            with timer_cls('actor/elapsed_env_reset_s', 10):
              observation = env.reset()
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()
def test_run_eval_aggregator_many_times(self):
  server_address = 'unix:/tmp/eval_aggregator_test_grpc'
  hparams = {}
  hparams['logdir'] = os.path.join(FLAGS.test_tmpdir, 'mode')
  hparams['num_samples'] = 10
  # Create an eval aggregator in a background thread. (Otherwise this call
  # would block.)
  thread = threading.Thread(
      target=eval_aggregator.run_with_address, args=(server_address, hparams))
  thread.start()
  # Creating a client blocks until the aggregator responds.
  client = grpc.Client(server_address)
  # Send a number of eval_enqueue RPCs to the aggregator.
  for i in range(hparams['num_samples'] + 1):
    msg = pickle.dumps({common.STEP: i / 2, 'eval/a_number': 1})
    client.eval_enqueue(msg)  # pytype: disable=attribute-error
  # The aggregator should terminate after num_samples RPCs. Wait for it to
  # exit.
  thread.join()
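# For reference, the aggregator payload is just a pickled dict of metrics
# keyed by step. A standalone round-trip sketch ('step' stands in for the
# project's `common.STEP` constant):
import pickle

_msg = pickle.dumps({'step': 3, 'eval/a_number': 1})
assert pickle.loads(_msg)['eval/a_number'] == 1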
def run_with_learner(problem_type: framework_problem_type.ProblemType,
                     learner_address: Text, hparams: Dict[Text, Any]):
  """Runs actor with the given learner address and problem type.

  Args:
    problem_type: An instance of `framework_problem_type.ProblemType`.
    learner_address: The network address of a learner exposing two methods:
      `variable_values`: returns the latest values of trainable variables.
      `enqueue`: accepts nested tensors of type `ActorOutput` tuple.
    hparams: A dict containing hyperparameter settings.
  """
  env = problem_type.get_environment()
  agent = problem_type.get_agent()
  env_output = env.reset()
  initial_agent_state = agent.get_initial_state(
      utils.add_batch_dim(env_output.observation), batch_size=1)
  # Agent always expects time,batch dimensions. First add and then remove.
  env_output = utils.add_time_batch_dim(env_output)
  agent_output, _ = agent(env_output, initial_agent_state)
  env_output, agent_output = utils.remove_time_batch_dim(
      env_output, agent_output)
  actor_action = common.ActorAction(
      chosen_action_idx=tf.zeros([], dtype=tf.int32),
      oracle_next_action_idx=tf.zeros([], dtype=tf.int32))
  # Remove batch_dim from the agent's returned initial state.
  initial_agent_state = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                              initial_agent_state)
  # Write TensorSpecs the learner can use for initialization.
  logging.info('My task id is %d', FLAGS.task)
  if FLAGS.task == 0:
    _write_tensor_specs(initial_agent_state, env_output, agent_output,
                        actor_action)
  # gRPC client creation blocks until the server responds to an RPC. Since
  # the server blocks at startup looking for TensorSpecs, and will not
  # respond to gRPC calls until these TensorSpecs are written, client creation
  # must happen after the actor writes TensorSpecs in order to prevent a
  # deadlock.
  logging.info('Connecting to learner: %s', learner_address)
  client = grpc.Client(learner_address)

  iter_steps = 0
  num_steps = 0
  sum_reward = 0.
  # Add batch_dim.
  agent_state = tf.nest.map_structure(lambda t: tf.expand_dims(t, 0),
                                      initial_agent_state)
  iterations = 0
  while iter_steps < hparams['max_iter'] or hparams['max_iter'] == -1:
    logging.info('Iteration %d of %d', iter_steps + 1, hparams['max_iter'])
    # Get fresh parameters from the trainer.
    var_dtypes = [v.dtype for v in agent.trainable_variables]
    # The trainer also adds `iterations` to the list of variables -- a
    # counter tracking the number of iterations done so far.
    var_dtypes.append(tf.int64)
    new_values = []
    if iter_steps % hparams['sync_agent_every_n_steps'] == 0:
      new_values = client.variable_values()  # pytype: disable=attribute-error
    if new_values:
      logging.debug('Fetched variables from learner.')
      iterations = new_values[-1].numpy()
      updated_agent_vars = new_values[:-1]
      assert len(updated_agent_vars) == len(agent.trainable_variables)
      for x, y in zip(agent.trainable_variables, updated_agent_vars):
        x.assign(y)

    infos = []
    # Unroll agent.
    # Every episode sent by the actor includes the previous episode's final
    # agent state and output as well as the final environment output.
    initial_agent_state = tf.nest.map_structure(lambda t: tf.squeeze(t, 0),
                                                agent_state)
    env_outputs = [env_output]
    agent_outputs = [agent_output]
    actor_actions = [actor_action]
    loss_type = problem_type.get_episode_loss_type(iterations)

    for i in range(FLAGS.unroll_length):
      logging.debug('Unroll step %d of %d', i + 1, FLAGS.unroll_length)
      # Agent expects time,batch dimensions in `env_output` and batch
      # dimension in `agent_state`. `agent_state` already has batch_dim.
      env_output = utils.add_time_batch_dim(env_output)
      agent_output, agent_state = agent(env_output, agent_state)
      env_output, agent_output = utils.remove_time_batch_dim(
          env_output, agent_output)
      actor_action, action_val = problem_type.select_actor_action(
          env_output, agent_output)
      env_output = env.step(action_val)
      env_outputs.append(env_output)
      agent_outputs.append(agent_output)
      actor_actions.append(actor_action)
      num_steps += 1
      sum_reward += env_output.reward
      if env_output.done:
        infos.append(
            problem_type.get_actor_info(env_output, sum_reward, num_steps))
        num_steps = 0
        sum_reward = 0.

    processed_env_output = problem_type.postprocessing(
        utils.stack_nested_tensors(env_outputs))
    actor_output = common.ActorOutput(
        initial_agent_state=initial_agent_state,
        env_output=processed_env_output,
        agent_output=utils.stack_nested_tensors(agent_outputs),
        actor_action=utils.stack_nested_tensors(actor_actions),
        loss_type=tf.convert_to_tensor(loss_type, tf.int32),
        info=pickle.dumps(infos))
    flattened = tf.nest.flatten(actor_output)
    client.enqueue(flattened)  # pytype: disable=attribute-error
    iter_steps += 1
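# The unroll above repeatedly adds and strips [time, batch] dimensions around
# agent calls. A minimal sketch of how such helpers could be written, assuming
# they simply expand/squeeze the two leading axes of every tensor in a nest;
# the project's `utils` module is the authoritative implementation:
import tensorflow as tf


def _add_time_batch_dim(nest):
  # Shape [...] -> [1, 1, ...].
  return tf.nest.map_structure(
      lambda t: tf.expand_dims(tf.expand_dims(t, 0), 0), nest)


def _remove_time_batch_dim(nest):
  # Shape [1, 1, ...] -> [...].
  return tf.nest.map_structure(lambda t: tf.squeeze(t, axis=[0, 1]), nest)


_x = tf.zeros([5])
assert _add_time_batch_dim(_x).shape == [1, 1, 5]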
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
  """
  logging.info('Starting actor eval loop')
  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
      flush_millis=20000,
      max_queue=1000)
  timer_cls = profiling.ExportingTimer
  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        env = create_env_fn(FLAGS.task, color='black')
        env1 = create_env_fn(FLAGS.task, color='white')
        # Unique ID to identify a specific run of an actor.
        run_id = np.random.randint(np.iinfo(np.int64).max)
        observation = env.reset()
        reward = 0.0
        raw_reward = 0.0
        done = False
        episode_step = 0
        episode_return = 0
        episode_raw_return = 0
        eval_times = 0
        eval_state = 'black'
        logging.info('Starting eval: %s', eval_state)
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(
              tf.cast(reward, tf.float32), done,
              tf.cast(observation, tf.float32))
          with timer_cls('actor/elapsed_inference_s', 1000):
            action = client.inference_eval(FLAGS.task, run_id, env_output,
                                           raw_reward)
          if eval_state == 'black':
            with timer_cls('actor/elapsed_env_step_s', 1000):
              observation, reward, done, info = env.step(action.numpy())
          else:
            with timer_cls('actor/elapsed_env_step_s', 1000):
              observation, reward, done, info = env1.step(action.numpy())
          if is_rendering_enabled():
            env.render()
          episode_step += 1
          episode_return += reward
          raw_reward = float((info or {}).get('score_reward', reward))
          episode_raw_return += raw_reward
          if done:
            eval_times += 1
            if eval_times >= 50:
              tf.summary.scalar('actor/eval_return_' + eval_state,
                                episode_return)
              logging.info('%s win/all: %d/%d Raw return: %f Steps: %i',
                           eval_state, (episode_return + eval_times) / 2,
                           eval_times, episode_raw_return, episode_step)
              episode_step = 0
              episode_return = 0
              episode_raw_return = 0
              time.sleep(300)
              eval_times = 0
              eval_state = 'white' if eval_state == 'black' else 'black'
              logging.info('Starting eval: %s', eval_state)
            if eval_state == 'black':
              with timer_cls('actor/elapsed_env_reset_s', 10):
                observation = env.reset()
            else:
              with timer_cls('actor/elapsed_env_reset_s', 10):
                observation = env1.reset()
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()
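# Why `(episode_return + eval_times) / 2` counts wins in the loop above: with
# +1/-1 terminal rewards, W wins out of N games give a summed return of
# W - (N - W), so W = (return + N) / 2. A standalone numeric check:
_n_games, _wins = 50, 30
_total_return = _wins - (_n_games - _wins)  # = 10
assert (_total_return + _n_games) / 2 == _wins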
def actor_loop(create_env_fn,
               mzconfig,
               share_of_supervised_episodes_fn=lambda _: 0.):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
    mzconfig: MuZeroConfig instance.
    share_of_supervised_episodes_fn: Function that specifies the share of
      episodes that should be supervised based on the learner iteration.
  """
  logging.info('Starting actor loop')
  actor_log_dir = os.path.join(FLAGS.logdir, 'actor_{}'.format(TASK.value))
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        actor_log_dir, flush_millis=20000, max_queue=1000)
    timer_cls = profiling.ExportingTimer
    if FLAG_FILE.value:
      mzutils.write_flags(FLAGS.__flags, FLAG_FILE.value)  # pylint: disable=protected-access
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  batch_queue = collections.deque()
  actor_step = tf.Variable(0, dtype=tf.int64)
  num_episodes = tf.Variable(0, dtype=tf.int64)
  # We use the checkpoint to keep track of `actor_step` and `num_episodes`.
  actor_checkpoint = tf.train.Checkpoint(
      actor_step=actor_step, num_episodes=num_episodes)
  ckpt_manager = tf.train.CheckpointManager(
      checkpoint=actor_checkpoint, directory=actor_log_dir, max_to_keep=1)
  if ckpt_manager.latest_checkpoint:
    logging.info('Restoring actor checkpoint: %s',
                 ckpt_manager.latest_checkpoint)
    actor_checkpoint.restore(ckpt_manager.latest_checkpoint).assert_consumed()
  reward_agg, length_agg = profiling.Aggregator(), profiling.Aggregator()

  with summary_writer.as_default():
    tf.summary.experimental.set_step(actor_step)
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)

        def _create_training_samples(episode, start_idx=0):
          start_idx += random.choice(range(ACTOR_SKIP.value + 1))
          for i in range(start_idx, len(episode.history),
                         ACTOR_SKIP.value + 1):
            target = episode.make_target(
                state_index=i,
                num_unroll_steps=mzconfig.num_unroll_steps,
                td_steps=mzconfig.td_steps,
                rewards=episode.rewards,
                policy_distributions=episode.child_visits,
                discount=episode.discount,
                value_approximations=episode.root_values)
            priority = np.float32(1e-2)  # Prevents all-zero priorities.
            if len(episode) > 0:  # pylint: disable=g-explicit-length-test
              last_value_idx = min(len(episode) - 1 - i,
                                   len(target.value) - 1)
              priority = np.maximum(
                  priority,
                  np.float32(
                      np.abs(episode.root_values[i + last_value_idx] -
                             target.value[last_value_idx])))
            # This will be batched and given to add_to_replay_buffer on the
            # learner.
            sample = (
                priority,
                episode.make_image(i),
                tf.stack(
                    episode.history_range(i, i + mzconfig.num_unroll_steps)),
            ) + tuple(
                map(lambda x: tf.cast(tf.stack(x), tf.float32), target))
            batch_queue.append(sample)
          if ENABLE_ACTOR_LOGGING.value:
            logging.info(
                'Added %d samples to the batch_queue. Size: %d of needed %d',
                len(episode.history) - start_idx, len(batch_queue),
                mzconfig.train_batch_size)

        def _add_queue_to_replay_buffer():
          with timer_cls('actor/elapsed_add_to_buffer_s',
                         10 * ACTOR_LOG_FREQUENCY.value):
            while len(batch_queue) >= mzconfig.train_batch_size:
              batch = [
                  batch_queue.popleft()
                  for _ in range(mzconfig.train_batch_size)
              ]
              flat_batch = [tf.nest.flatten(b) for b in batch]
              stacked_batch = list(map(tf.stack, zip(*flat_batch)))
              structured_batch = tf.nest.pack_sequence_as(
                  batch[0], stacked_batch)
              client.add_to_replay_buffer(*structured_batch)
              if ENABLE_ACTOR_LOGGING.value:
                logging.info('Added batch of size %d into replay_buffer.',
                             len(batch))

        env = create_env_fn(TASK.value, training=is_training_actor())

        def recurrent_inference_fn(*args, **kwargs):
          with timer_cls('actor/elapsed_recurrent_inference_s',
                         100 * ACTOR_LOG_FREQUENCY.value):
            output = client.recurrent_inference(*args, **kwargs)
            output = tf.nest.map_structure(lambda t: t.numpy(), output)
          return output

        def get_legal_actions_fn(episode):

          def legal_actions_fn(*args, **kwargs):
            with timer_cls('actor/elapsed_get_legal_actions_s',
                           100 * ACTOR_LOG_FREQUENCY.value):
              output = episode.legal_actions(*args, **kwargs)
            return output

          return legal_actions_fn

        while True:
          episode = mzconfig.new_episode(env)
          is_supervised_episode = (
              is_training_actor() and
              random.random() < share_of_supervised_episodes_fn(
                  client.learning_iteration().numpy()))

          if is_supervised_episode:
            if ENABLE_ACTOR_LOGGING.value:
              logging.info('Supervised Episode.')
            try:
              with timer_cls('actor/elapsed_load_supervised_episode_s',
                             ACTOR_LOG_FREQUENCY.value):
                episode_example = env.load_supervised_episode()
              with timer_cls('actor/elapsed_run_supervised_episode_s',
                             ACTOR_LOG_FREQUENCY.value):
                targets, samples = env.run_supervised_episode(episode_example)
              episode.rewards = samples['reward']
              episode.history = samples['to_predict']
              for target in targets:
                batch_queue.append(target)
            except core.RLEnvironmentError as e:
              logging.warning('Environment not ready %s', str(e))
              # Restart episode.
              continue
            except core.BadSupervisedEpisodeError as e:
              logging.warning('Abort supervised episode: %s', str(e))
              # Restart episode.
              continue
          else:
            if ENABLE_ACTOR_LOGGING.value:
              logging.info('RL Episode.')
            try:
              last_enqueued_idx = 0
              legal_actions_fn = get_legal_actions_fn(episode)
            except core.RLEnvironmentError as e:
              logging.warning('Environment not ready: %s', str(e))
              # Restart episode.
              continue
            except core.SkipEpisode as e:
              logging.warning('Episode is skipped due to: %s', str(e))
              # Restart episode.
              continue
            while (not episode.terminal() and
                   len(episode.history) < mzconfig.max_moves):
              # This loop is the agent playing the episode.
              current_observation = episode.make_image(-1)
              # Map the observation to hidden space.
              with timer_cls('actor/elapsed_initial_inference_s',
                             10 * ACTOR_LOG_FREQUENCY.value):
                initial_inference_output = client.initial_inference(
                    current_observation)
                initial_inference_output = tf.nest.map_structure(
                    lambda t: t.numpy(), initial_inference_output)
              # Run MCTS using recurrent_inference_fn.
              with timer_cls('actor/elapsed_mcts_s',
                             10 * ACTOR_LOG_FREQUENCY.value):
                legal_actions = legal_actions_fn()
                root = core.prepare_root_node(mzconfig, legal_actions,
                                              initial_inference_output)
                with timer_cls('actor/elapsed_run_mcts_s',
                               10 * ACTOR_LOG_FREQUENCY.value):
                  core.run_mcts(mzconfig, root, episode.action_history(),
                                legal_actions_fn, recurrent_inference_fn,
                                episode.visualize_mcts)
              action = core.select_action(
                  mzconfig,
                  len(episode.history),
                  root,
                  train_step=actor_step.numpy(),
                  use_softmax=mzconfig.use_softmax_for_action_selection,
                  is_training=is_training_actor())
              try:
                # Perform chosen action.
                with timer_cls('actor/elapsed_env_step_s',
                               10 * ACTOR_LOG_FREQUENCY.value):
                  training_steps = client.learning_iteration().numpy()
                  episode.apply(action=action, training_steps=training_steps)
              except core.RLEnvironmentError as env_error:
                logging.warning('Environment failed: %s', str(env_error))
                episode.failed = True
                # Terminate episode.
                break
              episode.store_search_statistics(
                  root, use_softmax=(USE_SOFTMAX_FOR_TARGET.value == 1))
              actor_step.assign_add(delta=1)
              if (is_training_actor() and ACTOR_ENQUEUE_EVERY.value > 0 and
                  (len(episode.history) - last_enqueued_idx) >=
                  ACTOR_ENQUEUE_EVERY.value):
                _create_training_samples(episode, start_idx=last_enqueued_idx)
                last_enqueued_idx = len(episode.history)
                _add_queue_to_replay_buffer()

            if episode.failed:
              # Restart episode.
              logging.warning('Episode failed, restarting.')
              continue

            # Post-episode stats.
            num_episodes.assign_add(delta=1)
            reward_agg.add(episode.total_reward())
            length_agg.add(len(episode))
            if ENABLE_ACTOR_LOGGING.value:
              logging.info(
                  'Episode done. Length: %d, '
                  'Total Reward: %d, Min Reward: %d, Max Reward: %d',
                  len(episode), episode.total_reward(), min(episode.rewards),
                  max(episode.rewards))
            if reward_agg.count % ACTOR_LOG_FREQUENCY.value == 0:
              tf.summary.experimental.set_step(actor_step)
              tf.summary.scalar('actor/total_reward', reward_agg.average())
              tf.summary.scalar('actor/episode_length', length_agg.average())
              tf.summary.scalar('actor/num_episodes', num_episodes)
              tf.summary.scalar('actor/step', actor_step)
              tf.summary.scalar(
                  'actor/share_of_supervised_episodes',
                  share_of_supervised_episodes_fn(
                      client.learning_iteration().numpy()))
              if episode.mcts_visualizations:
                tf.summary.text('actor/mcts_vis',
                                '\n\n'.join(episode.mcts_visualizations))
                if are_summaries_enabled() and MCTS_VIS_FILE.value is not None:
                  # Also write it into a txt file.
                  with tf.io.gfile.GFile(MCTS_VIS_FILE.value, 'a') as f:
                    f.write('Step {}\n{}\n\n\n\n'.format(
                        actor_step, '\n\n'.join(episode.mcts_visualizations)))
              special_stats = episode.special_statistics()
              for stat_name, stat_value in special_stats.items():
                if isinstance(stat_value, (float, int)):
                  tf.summary.scalar('actor/{}'.format(stat_name), stat_value)
                elif isinstance(stat_value, str):
                  tf.summary.text('actor/{}'.format(stat_name), stat_value)
                else:
                  logging.warning(
                      'Special statistic %s could not be tracked. '
                      'Type %s is not supported.', stat_name,
                      type(stat_value))
              ckpt_manager.save()
              reward_agg.reset()
              length_agg.reset()

            if is_training_actor():
              # Create samples for training.
              _create_training_samples(episode, start_idx=last_enqueued_idx)

          # Send training samples to the learner after the episode finishes.
          if is_training_actor():
            _add_queue_to_replay_buffer()
          summary_name = 'train' if is_training_actor() else 'test'
          if is_supervised_episode:
            summary_name += ' (supervised)'
          with timer_cls('actor/elapsed_add_to_reward_s',
                         10 * ACTOR_LOG_FREQUENCY.value):
            # This is just for statistics.
            client.add_to_reward_queue(summary_name,
                                       np.float32(episode.total_reward()),
                                       np.int64(len(episode)),
                                       *episode.special_statistics_learner())
          del episode
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()
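# The sample priority computed in `_create_training_samples` above is the
# absolute error between the stored MCTS root value and the corresponding
# value target, floored at 1e-2 so no sample gets zero replay probability.
# A standalone numeric sketch with made-up values:
import numpy as np

_root_value = np.float32(0.70)    # episode.root_values[i + last_value_idx]
_target_value = np.float32(0.55)  # target.value[last_value_idx]
_priority = np.maximum(np.float32(1e-2), np.abs(_root_value - _target_value))
assert np.isclose(_priority, 0.15)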
def run_with_aggregator(problem_type, aggregator_address: Text, hparams):
  """Runs evaluation actor with the given problem_type, aggregator and hparams.

  Args:
    problem_type: An instance of `framework_problem_type.ProblemType`.
    aggregator_address: The aggregator address to which we will send data for
      batching.
    hparams: A dict containing hyperparameter settings.
  """
  assert isinstance(problem_type, framework_problem_type.ProblemType)
  env = problem_type.get_environment()
  agent = problem_type.get_agent()
  env_output = env.reset()
  agent_state = agent.get_initial_state(
      utils.add_batch_dim(env_output.observation), batch_size=1)
  # Agent always expects time,batch dimensions.
  _, _ = agent(utils.add_time_batch_dim(env_output), agent_state)
  logging.info('Connecting to aggregator %s', aggregator_address)
  aggregator = grpc.Client(aggregator_address)
  iter_steps = 0
  latest_checkpoint_path = ''
  while hparams['max_iter'] == -1 or iter_steps < hparams['max_iter']:
    logging.info('Iteration %d of %d', iter_steps + 1, hparams['max_iter'])
    checkpoint_directory = os.path.join(hparams['logdir'], 'model.ckpt')
    checkpoint_path = tf.train.latest_checkpoint(checkpoint_directory)
    if checkpoint_path == latest_checkpoint_path or not checkpoint_path:
      logging.info(
          'Waiting for next checkpoint. Previously evaluated checkpoint %s',
          latest_checkpoint_path)
      time.sleep(30)
      continue
    ckpt = tf.train.Checkpoint(agent=agent)
    ckpt.restore(checkpoint_path)
    latest_checkpoint_path = checkpoint_path
    logging.info('Evaluating latest checkpoint - %s', latest_checkpoint_path)
    step = int(latest_checkpoint_path.split('-')[-1])
    logging.debug('Step %d', step)
    for i in range(hparams['num_episodes_per_iter']):
      logging.debug('Episode number %d of %d', i + 1,
                    hparams['num_episodes_per_iter'])
      action_list = []
      env_output_list = [env_output]
      while True:
        env_output = utils.add_time_batch_dim(env_output)
        agent_output, agent_state = agent(env_output, agent_state)
        env_output, agent_output = utils.remove_time_batch_dim(
            env_output, agent_output)
        _, action_val = problem_type.select_actor_action(
            env_output, agent_output)
        env_output = env.step(action_val)
        action_list.append(action_val)
        env_output_list.append(env_output)
        if env_output.done:
          eval_result = problem_type.eval(action_list, env_output_list)
          # eval_result is a dict.
          eval_result[common.STEP] = step
          aggregator.eval_enqueue(pickle.dumps(eval_result))  # pytype: disable=attribute-error
          break
    iter_steps += 1
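# The eval loop above recovers the global step from the checkpoint filename:
# TF checkpoints are named like 'ckpt-<step>', so splitting on '-' yields the
# step. Illustration with a hypothetical path:
_latest = '/tmp/logdir/model.ckpt/ckpt-1200'
assert int(_latest.split('-')[-1]) == 1200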
def actor_loop(create_env_fn, config=None, log_period=1):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
    config: Configuration of the training.
    log_period: How often to log, in seconds.
  """
  if not config:
    config = FLAGS
  env_batch_size = FLAGS.env_batch_size
  logging.info('Starting actor loop. Task: %r. Environment batch size: %r',
               FLAGS.task, env_batch_size)
  is_rendering_enabled = FLAGS.render and FLAGS.task == 0
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000,
        max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        utils.update_config(config, client)
        batched_env = env_wrappers.BatchedEnvironment(
            create_env_fn, env_batch_size, FLAGS.task * env_batch_size,
            config)

        env_id = batched_env.env_ids
        run_id = np.random.randint(
            low=0,
            high=np.iinfo(np.int64).max,
            size=env_batch_size,
            dtype=np.int64)
        observation = batched_env.reset()
        reward = np.zeros(env_batch_size, np.float32)
        raw_reward = np.zeros(env_batch_size, np.float32)
        done = np.zeros(env_batch_size, bool)
        abandoned = np.zeros(env_batch_size, bool)
        global_step = 0
        episode_step = np.zeros(env_batch_size, np.int32)
        episode_return = np.zeros(env_batch_size, np.float32)
        episode_raw_return = np.zeros(env_batch_size, np.float32)
        episode_step_sum = 0
        episode_return_sum = 0
        episode_raw_return_sum = 0
        episodes_in_report = 0

        elapsed_inference_s_timer = timer_cls('actor/elapsed_inference_s',
                                              1000)
        last_log_time = timeit.default_timer()
        last_global_step = 0
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(reward, done, observation, abandoned,
                                       episode_step)
          with elapsed_inference_s_timer:
            action = client.inference(env_id, run_id, env_output, raw_reward)
          with timer_cls('actor/elapsed_env_step_s', 1000):
            observation, reward, done, info = batched_env.step(action.numpy())
          if is_rendering_enabled:
            batched_env.render()
          for i in range(env_batch_size):
            episode_step[i] += 1
            episode_return[i] += reward[i]
            raw_reward[i] = float((info[i] or {}).get('score_reward',
                                                      reward[i]))
            episode_raw_return[i] += raw_reward[i]
            # If the info dict contains an entry abandoned=True and the
            # episode was ended (done=True), then we need to specially handle
            # the final transition as per the explanations below.
            abandoned[i] = (info[i] or {}).get('abandoned', False)
            assert done[i] if abandoned[i] else True
            if done[i]:
              # If the episode was abandoned, we need to report the final
              # transition including the final observation as if the episode
              # has not terminated yet. This way, learning algorithms can use
              # the transition for learning.
              if abandoned[i]:
                # We do not signal yet that the episode was abandoned. This
                # will happen for the transition from the terminal state to
                # the reset state.
                assert env_batch_size == 1 and i == 0, (
                    'Mixing of batched and non-batched inference calls is '
                    'not yet supported')
                env_output = utils.EnvOutput(reward, np.array([False]),
                                             observation, np.array([False]),
                                             episode_step)
                with elapsed_inference_s_timer:
                  # The action is ignored.
                  client.inference(env_id, run_id, env_output, raw_reward)
                reward[i] = 0.0
                raw_reward[i] = 0.0

              # Periodically log statistics.
              current_time = timeit.default_timer()
              episode_step_sum += episode_step[i]
              episode_return_sum += episode_return[i]
              episode_raw_return_sum += episode_raw_return[i]
              global_step += episode_step[i]
              episodes_in_report += 1
              if current_time - last_log_time >= log_period:
                logging.info(
                    'Actor steps: %i, Return: %f Raw return: %f '
                    'Episode steps: %f, Speed: %f steps/s', global_step,
                    episode_return_sum / episodes_in_report,
                    episode_raw_return_sum / episodes_in_report,
                    episode_step_sum / episodes_in_report,
                    (global_step - last_global_step) /
                    (current_time - last_log_time))
                last_global_step = global_step
                episode_return_sum = 0
                episode_raw_return_sum = 0
                episode_step_sum = 0
                episodes_in_report = 0
                last_log_time = current_time
              episode_step[i] = 0
              episode_return[i] = 0
              episode_raw_return[i] = 0

          # Finally, we reset the episode which will report the transition
          # from the terminal state to the reset state in the next loop
          # iteration (with zero rewards).
          with timer_cls('actor/elapsed_env_reset_s', 10):
            observation = batched_env.reset_if_done(done)

          if is_rendering_enabled and done[0]:
            batched_env.render()
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError):
        logging.info('Inference call failed. This is normal at the end of '
                     'training.')
        batched_env.close()
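# The `assert done[i] if abandoned[i] else True` idiom above encodes the
# implication "abandoned implies done": it only fails when an environment
# reports abandoned=True on a non-terminal step. Truth-table sketch:
for _abandoned, _done in [(False, False), (False, True), (True, True)]:
  assert _done if _abandoned else True  # All valid combinations pass.
# (True, False) would fail, which is exactly the invalid case.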
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
  """
  project = neptune.init('pmtest/marl-vtrace')
  experiment = DummyExperiment()
  if FLAGS.task == 0 and not FLAGS.is_local:
    # The first actor logs the winning rate.
    while True:
      time.sleep(5)
      experiments = project.get_experiments(tag=FLAGS.nonce)
      if not experiments:
        logging.info('Experiment not found, retry...')
      else:
        experiment = experiments[-1]
        break

  log_period = 5
  log_period_growth = 1.05
  log_period_max = 600
  last_replay_time = timeit.default_timer()
  replay_period = 600
  replay_period_growth = 1.2
  replay_period_max = 3600

  env_batch_size = FLAGS.env_batch_size
  logging.info('Starting actor loop. Task: %r. Environment batch size: %r',
               FLAGS.task, env_batch_size)
  is_rendering_enabled = FLAGS.render and FLAGS.task == 0
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000,
        max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        batched_env = env_wrappers.BatchedEnvironment(
            create_env_fn, env_batch_size, FLAGS.task * env_batch_size)

        env_id = batched_env.env_ids
        run_id = np.random.randint(
            low=0,
            high=np.iinfo(np.int64).max,
            size=env_batch_size,
            dtype=np.int64)
        observation = batched_env.reset()
        reward = np.zeros(env_batch_size, np.float32)
        raw_reward = np.zeros(env_batch_size, np.float32)
        done = np.zeros(env_batch_size, bool)
        abandoned = np.zeros(env_batch_size, bool)
        global_step = 0
        episode_step = np.zeros(env_batch_size, np.int32)
        episode_return = np.zeros(env_batch_size, np.float32)
        episode_raw_return = np.zeros(env_batch_size, np.float32)
        episode_step_sum = 0
        episode_return_sum = 0
        episode_raw_return_sum = 0
        episode_won = 0
        episodes_in_report = 0

        elapsed_inference_s_timer = timer_cls('actor/elapsed_inference_s',
                                              1000)
        last_log_time = timeit.default_timer()
        last_global_step = 0
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(reward, done, observation, abandoned,
                                       episode_step)
          with elapsed_inference_s_timer:
            action = client.inference(env_id, run_id, env_output, raw_reward)
          with timer_cls('actor/elapsed_env_step_s', 1000):
            observation, reward, done, info = batched_env.step(action.numpy())
          if is_rendering_enabled:
            batched_env.render()
          for i in range(env_batch_size):
            episode_step[i] += 1
            episode_return[i] += reward[i]
            raw_reward[i] = float((info[i] or {}).get('score_reward',
                                                      reward[i]))
            episode_raw_return[i] += raw_reward[i]
            # If the info dict contains an entry abandoned=True and the
            # episode was ended (done=True), then we need to specially handle
            # the final transition as per the explanations below.
            abandoned[i] = (info[i] or {}).get('abandoned', False)
            assert done[i] if abandoned[i] else True
            if done[i]:
              # If the episode was abandoned, we need to report the final
              # transition including the final observation as if the episode
              # has not terminated yet. This way, learning algorithms can use
              # the transition for learning.
              if abandoned[i]:
                # We do not signal yet that the episode was abandoned. This
                # will happen for the transition from the terminal state to
                # the reset state.
                assert env_batch_size == 1 and i == 0, (
                    'Mixing of batched and non-batched inference calls is '
                    'not yet supported')
                env_output = utils.EnvOutput(reward, np.array([False]),
                                             observation, np.array([False]),
                                             episode_step)
                with elapsed_inference_s_timer:
                  # The action is ignored.
                  client.inference(env_id, run_id, env_output, raw_reward)
                reward[i] = 0.0
                raw_reward[i] = 0.0

              # Periodically log statistics.
              current_time = timeit.default_timer()
              episode_step_sum += episode_step[i]
              episode_return_sum += episode_return[i]
              episode_raw_return_sum += episode_raw_return[i]
              global_step += episode_step[i]
              episode_won += (info[i] or {}).get('battle_won', False)
              episodes_in_report += 1

              if (FLAGS.task == 0 and
                  current_time - last_replay_time > replay_period):
                replay_period = min(replay_period_max,
                                    replay_period * replay_period_growth)
                last_replay_time = current_time
                batched_env.envs[0].save_replay()

              if current_time - last_log_time > log_period:
                log_period = min(log_period_max,
                                 log_period * log_period_growth)
                logging.info(
                    'Actor steps: %i, Return: %f Raw return: %f '
                    'Episode steps: %f, Speed: %f steps/s, Won: %.2f',
                    global_step,
                    episode_return_sum / episodes_in_report,
                    episode_raw_return_sum / episodes_in_report,
                    episode_step_sum / episodes_in_report,
                    (global_step - last_global_step) /
                    (current_time - last_log_time),
                    episode_won / episodes_in_report)
                tf.summary.scalar('episodes win rate',
                                  episode_won / episodes_in_report,
                                  step=global_step)
                if FLAGS.task == 0:
                  experiment.log_metric(
                      log_name='episode win rate',
                      x=global_step,
                      y=episode_won / episodes_in_report)
                last_global_step = global_step
                episode_return_sum = 0
                episode_raw_return_sum = 0
                episode_step_sum = 0
                episode_won = 0
                episodes_in_report = 0
                last_log_time = current_time
              episode_step[i] = 0
              episode_return[i] = 0
              episode_raw_return[i] = 0

          # Finally, we reset the episode which will report the transition
          # from the terminal state to the reset state in the next loop
          # iteration (with zero rewards).
          with timer_cls('actor/elapsed_env_reset_s', 10):
            observation = batched_env.reset_if_done(done)

          if is_rendering_enabled and done[0]:
            batched_env.render()
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        batched_env.close()
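# The logging and replay cadence above backs off geometrically: each time a
# report (or replay save) fires, the period is multiplied by its growth
# factor and capped at a maximum. Standalone sketch of the schedule:
_period, _growth, _cap = 5.0, 1.05, 600.0
for _ in range(3):
  _period = min(_cap, _period * _growth)
assert abs(_period - 5.0 * 1.05**3) < 1e-9  # ~5.79s after three reports.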
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
  """
  logging.info('Starting actor loop')
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000,
        max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        env = create_env_fn(FLAGS.task)
        # Unique ID to identify a specific run of an actor.
        run_id = np.random.randint(np.iinfo(np.int64).max)
        observation = env.reset()
        reward = 0.0
        raw_reward = 0.0
        done = False
        abandoned = False
        global_step = 0
        episode_step = 0
        episode_step_sum = 0
        episode_return_sum = 0
        episode_raw_return_sum = 0
        episodes_in_report = 0

        elapsed_inference_s_timer = timer_cls('actor/elapsed_inference_s',
                                              1000)
        last_log_time = timeit.default_timer()
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(reward, done, observation, abandoned,
                                       episode_step)
          with elapsed_inference_s_timer:
            action = client.inference(FLAGS.task, run_id, env_output,
                                      raw_reward)
          with timer_cls('actor/elapsed_env_step_s', 1000):
            observation, reward, done, info = env.step(action.numpy())
          if is_rendering_enabled():
            env.render()
          episode_step += 1
          episode_return_sum += reward
          raw_reward = float((info or {}).get('score_reward', reward))
          episode_raw_return_sum += raw_reward
          # If the info dict contains an entry abandoned=True and the episode
          # was ended (done=True), then we need to specially handle the final
          # transition as per the explanations below.
          abandoned = (info or {}).get('abandoned', False)
          assert done if abandoned else True
          if done:
            # If the episode was abandoned, we need to report the final
            # transition including the final observation as if the episode
            # has not terminated yet. This way, learning algorithms can use
            # the transition for learning.
            if abandoned:
              # We do not signal yet that the episode was abandoned. This
              # will happen for the transition from the terminal state to the
              # reset state.
              env_output = utils.EnvOutput(reward, False, observation, False,
                                           episode_step)
              with elapsed_inference_s_timer:
                action = client.inference(FLAGS.task, run_id, env_output,
                                          raw_reward)
              reward = 0.0
              raw_reward = 0.0

            # Periodically log statistics.
            current_time = timeit.default_timer()
            episode_step_sum += episode_step
            global_step += episode_step
            episodes_in_report += 1
            if current_time - last_log_time > 1:
              logging.info(
                  'Actor steps: %i, Return: %f Raw return: %f '
                  'Episode steps: %f', global_step,
                  episode_return_sum / episodes_in_report,
                  episode_raw_return_sum / episodes_in_report,
                  episode_step_sum / episodes_in_report)
              episode_return_sum = 0
              episode_raw_return_sum = 0
              episode_step_sum = 0
              episodes_in_report = 0
              last_log_time = current_time

            # Finally, we reset the episode which will report the transition
            # from the terminal state to the reset state in the next loop
            # iteration (with zero rewards).
            with timer_cls('actor/elapsed_env_reset_s', 10):
              observation = env.reset()
            episode_step = 0
            if is_rendering_enabled():
              env.render()
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()
def actor_loop(create_env_fn):
  """Main actor loop.

  Args:
    create_env_fn: Callable (taking the task ID as argument) that must return
      a newly created environment.
  """
  logging.info('Starting actor loop')
  if are_summaries_enabled():
    summary_writer = tf.summary.create_file_writer(
        os.path.join(FLAGS.logdir, 'actor_{}'.format(FLAGS.task)),
        flush_millis=20000,
        max_queue=1000)
    timer_cls = profiling.ExportingTimer
  else:
    summary_writer = tf.summary.create_noop_writer()
    timer_cls = utils.nullcontext

  actor_step = 0
  with summary_writer.as_default():
    while True:
      try:
        # Client to communicate with the learner.
        client = grpc.Client(FLAGS.server_address)
        env = create_env_fn(FLAGS.task)
        # Unique ID to identify a specific run of an actor.
        run_id = np.random.randint(np.iinfo(np.int64).max)
        run_id1 = np.random.randint(np.iinfo(np.int64).max)
        observation = env.reset()
        reward = 0.0
        raw_reward = 0.0
        done = False
        episode_step = 0
        episode_return = 0
        color_state = 0
        episode_end = False
        while True:
          tf.summary.experimental.set_step(actor_step)
          env_output = utils.EnvOutput(
              tf.cast(reward, tf.float32), done,
              tf.cast(observation, tf.float32))
          if color_state == 0:
            with timer_cls('actor/elapsed_inference_s', 1000):
              action = client.inference(FLAGS.task, run_id, env_output,
                                        reward)
            with timer_cls('actor/elapsed_env_step_s', 1000):
              observation, _reward, _done, info = env.step(action.numpy())
          else:
            with timer_cls('actor/elapsed_inference_s', 1000):
              action = client.inference(
                  int(FLAGS.num_actors / 2 + FLAGS.task), run_id1, env_output,
                  reward)
            with timer_cls('actor/elapsed_env_step_s', 1000):
              observation, _reward, _done, info = env.step(action.numpy())
          episode_step += 1
          if _done:
            random_num_ = np.random.random()
            if random_num_ > 0.98:
              if is_rendering_enabled():
                env.render()
            with timer_cls('actor/elapsed_env_reset_s', 10):
              observation = env.reset()
            color_state = 0
          else:
            color_state = 1 - color_state
          if episode_end:
            # This color must be white.
            assert color_state == 1
            if random_num_ > 0.98:
              logging.info('Return: %f Steps: %i', episode_return,
                           episode_step)
            episode_step = 0
            episode_return = 0
            done = episode_end
            reward = -reward
            episode_end = _done
          else:
            reward = _reward
            episode_end = _done
            done = episode_end
          episode_return += reward
          actor_step += 1
      except (tf.errors.UnavailableError, tf.errors.CancelledError) as e:
        logging.exception(e)
        env.close()
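# The two-player loop above negates the terminal reward once per game
# (`reward = -reward`) when handing it to the second logical actor, the usual
# zero-sum self-play convention: each side learns from its own perspective.
# Tiny illustration of the sign flip:
_black_terminal_reward = 1.0             # Black wins.
assert -_black_terminal_reward == -1.0   # The same outcome seen by White.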