def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["lr_decay_rate"] = float(hp_dict.get("lr_decay_rate", 0))
    params["lr_decay_steps"] = float(hp_dict.get("lr_decay_steps", 0))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))
    params["tensorboard"] = hp_dict.get("tensorboard", False)
    params["dump_mp4"] = hp_dict.get("dump_mp4", False)
    params["dump_gifs"] = hp_dict.get("dump_gifs", False)

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].learning_rate_decay_rate = params["lr_decay_rate"]
    agent_params.network_wrappers['main'].learning_rate_decay_steps = params["lr_decay_steps"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    # Replace the default CNN with single layer Conv2d(32, 3, 1)
    # agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Shallow
    # agent_params.network_wrappers['main'].input_embedders_parameters['observation'].dropout_rate = 0.3
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    # agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Shallow
    # agent_params.network_wrappers['main'].middleware_parameters.dropout_rate = 0.3
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
    # agent_params.network_wrappers['main'].l2_regularization = 2e-5

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                   params["e_greedy_value"],
                                                                   params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    # Add an observation image perturbation for many aspects
    # DeepRacerInputFilter.add_observation_filter('observation', 'perturb_color',
    #                                             ObservationColorPerturbation(0.2))
    # Rescale to much smaller input when using shallow networks to avoid OOM
    # DeepRacerInputFilter.add_observation_filter('observation', 'rescaling',
    #                                             ObservationRescaleToSizeFilter(
    #                                                 ImageObservationSpace(np.array([84, 84, 3]), high=255)))
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter('observation', 'stacking',
                                                ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.tensorboard = params["tensorboard"]
    vis_params.dump_mp4 = params["dump_mp4"]
    vis_params.dump_gifs = params["dump_gifs"]
    # AlwaysDumpFilter, MaxDumpFilter, EveryNEpisodesDumpFilter, SelectedPhaseOnlyDumpFilter
    vis_params.video_dump_filters = [AlwaysDumpFilter()]

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                        env_params=env_params,
                                        schedule_params=schedule_params,
                                        vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
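# A minimal usage sketch for the preset above (hedged). The hyper-parameter values
# are illustrative, and TaskParameters comes from rl_coach.base_parameters:
#
#     from rl_coach.base_parameters import TaskParameters
#
#     graph_manager, params_json = get_graph_manager(batch_size=64, lr=0.0003,
#                                                    loss_type="huber")
#     graph_manager.create_graph(TaskParameters(experiment_path="./experiment"))
#     graph_manager.improve()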
def get_graph_manager(hp_dict, agent_list, run_phase_subject):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        agent_params = DeepRacerAgentParams()
        if agent.network_settings:
            agent_params.env_agent = agent
            agent_params.network_wrappers['main'].learning_rate = params["lr"]

            agent_params.network_wrappers['main'].input_embedders_parameters = \
                create_input_embedder(agent.network_settings['input_embedders'],
                                      agent.network_settings['embedder_type'],
                                      agent.network_settings['activation_function'])
            agent_params.network_wrappers['main'].middleware_parameters = \
                create_middle_embedder(agent.network_settings['middleware_embedders'],
                                       agent.network_settings['embedder_type'],
                                       agent.network_settings['activation_function'])

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping',
                                                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()

            agent_params.network_wrappers['main'].batch_size = params["batch_size"]
            agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
            agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

            if params["loss_type"] == "huber":
                agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

            agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
            agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
            agent_params.algorithm.beta_entropy = params["beta_entropy"]
            agent_params.algorithm.gae_lambda = 0.95
            agent_params.algorithm.discount = params["discount_factor"]
            agent_params.algorithm.optimization_epochs = params["num_epochs"]
            agent_params.algorithm.estimate_state_value_using_gae = True
            agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.num_consecutive_playing_steps = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.distributed_coach_synchronization_type = \
                DistributedCoachSynchronizationType.SYNC

            if params["exploration_type"] == "categorical":
                agent_params.exploration = CategoricalParameters()
            else:
                agent_params.exploration = EGreedyParameters()
                agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                           params["e_greedy_value"],
                                                                           params["epsilon_steps"])
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params)
    return graph_manager, params_json
    def __init__(self, improve_steps=TrainingSteps(10000000000)):
        super().__init__()
        self.heatup_steps = EnvironmentSteps(0)
        self.evaluation_steps = EnvironmentEpisodes(0)
        self.steps_between_evaluation_periods = improve_steps
        self.improve_steps = improve_steps
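# Usage sketch for the schedule above (hedged): assuming the __init__ belongs to a
# ScheduleParameters subclass (the name TrainOnlyScheduleParameters below is
# hypothetical), pointing steps_between_evaluation_periods at the whole
# improve_steps budget means evaluation never triggers during training:
#
#     schedule_params = TrainOnlyScheduleParameters(improve_steps=TrainingSteps(100000))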
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, s3_writer):
    """ wait for first checkpoint then perform rollouts using the model """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    subscribe_to_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir, checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range((graph_manager.improve_steps / act_steps.num_steps).num_steps):
        graph_manager.phase = RunPhase.TRAIN
        exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
        unpause_physics(EmptyRequest())
        graph_manager.reset_internal_state(True)
        graph_manager.act(act_steps,
                          wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
        graph_manager.reset_internal_state(True)
        time.sleep(1)
        pause_physics(EmptyRequest())

        graph_manager.phase = RunPhase.UNDEFINED
        new_checkpoint = -1
        if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                == DistributedCoachSynchronizationType.SYNC:
            unpause_physics(EmptyRequest())
            is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None) and rollout_idx == 0
            if is_save_mp4_enabled:
                subscribe_to_save_mp4(EmptyRequest())
            if rollout_idx == 0:
                for _ in range(MIN_EVAL_TRIALS):
                    graph_manager.evaluate(EnvironmentSteps(1))
            while new_checkpoint < last_checkpoint + 1:
                exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
                if rollout_idx == 0:
                    graph_manager.evaluate(EnvironmentSteps(1))
                new_checkpoint = data_store.get_chkpoint_num('agent')
            if is_save_mp4_enabled:
                unsubscribe_from_save_mp4(EmptyRequest())
            s3_writer.upload_to_s3()
            pause_physics(EmptyRequest())
            data_store.load_from_store(expected_checkpoint_number=last_checkpoint + 1)
            graph_manager.restore_checkpoint()

        if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                == DistributedCoachSynchronizationType.ASYNC:
            if new_checkpoint > last_checkpoint:
                graph_manager.restore_checkpoint()

        last_checkpoint = new_checkpoint
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, s3_writer):
    """ wait for first checkpoint then perform rollouts using the model """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)
    subscribe_to_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir, checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range((graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.SYNC:
                is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    unpause_physics(EmptyRequest())
                    for _ in range(int(rospy.get_param('MIN_EVAL_TRIALS', '5'))):
                        graph_manager.evaluate(EnvironmentSteps(1))
                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
                    if rollout_idx == 0:
                        print("Additional evaluation. New Checkpoint: {}, Last Checkpoint: {}"
                              .format(new_checkpoint, last_checkpoint))
                        graph_manager.evaluate(EnvironmentSteps(1))
                    else:
                        time.sleep(5)
                    new_checkpoint = data_store.get_chkpoint_num('agent')
                if is_save_mp4_enabled:
                    unsubscribe_from_save_mp4(EmptyRequest())
                logger.info("Completed iteration tasks. Writing results to S3.")
                s3_writer.upload_to_s3()
                pause_physics(EmptyRequest())
                logger.info("Preparing to load checkpoint {}".format(last_checkpoint + 1))
                data_store.load_from_store(expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint

    logger.info("Exited main loop. Done.")
        context_vector = attention_weights * conv
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector

    def __str__(self):
        return "Convolution (num filters = {}, kernel size = {}, stride = {})" \
            .format(self.num_filters, self.kernel_size, self.strides)


####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ClippedPPOAgentParameters()
# agent_params.network_wrappers['main'].input_embedders_parameters = {
#     'left_camera': InputEmbedderParameters(activation_function='relu', dropout_rate=0.3),
#     'stereo': InputEmbedderParameters(activation_function='relu', dropout_rate=0.3)
# }
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'left_camera':
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ACERAgentParameters()
agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 2000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 100000)
agent_params.input_filter = InputFilter()
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    tf.reset_default_graph()  # just to clean things up; only needed for the tutorial

    # Experience-generating agent parameters
    experience_generating_agent_params = DDQNAgentParameters()

    # schedule parameters
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size - experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentSteps(100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)

    # NN configuration
    experience_generating_agent_params.network_wrappers['main'].learning_rate = 0.0001
    experience_generating_agent_params.network_wrappers['main'].batch_size = 128
    experience_generating_agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # ER size
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule
    experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, DATASET_SIZE)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)

    # set the agent params as before
    # agent_params = set_agent_params(DDQNAgentParameters)
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

    # 50 epochs of training (the entire dataset is used each epoch)
    # schedule_params.improve_steps = TrainingSteps(50)

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)
    # `task_parameters` is defined at module scope in the tutorial this function comes from
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
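# A hedged usage sketch for the tutorial function above: env_params would typically be
# a GymVectorEnvironment pointing at the level of interest, and DATASET_SIZE is the
# module-level constant the body references (the values here are illustrative):
#
#     from rl_coach.environments.gym_environment import GymVectorEnvironment
#
#     env_params = GymVectorEnvironment(level='CartPole-v0')
#     train_using_experience_agent(env_params, n_epochs=50, dataset_size=DATASET_SIZE)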
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import MujocoInputFilter, GymEnvironmentParameters, MujocoOutputFilter
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.truncated_normal import TruncatedNormalParameters
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import ConstantSchedule, PieceWiseSchedule, ExponentialSchedule
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.architectures.tensorflow_components.architecture import Dense

steps_per_episode = 13

####################
# Block Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(400)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(3)  # Neta: (1000)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)  # Neta: 0
schedule_params.heatup_steps = EnvironmentSteps(2)  # 120 * steps_per_episode  # Neta: (2)

#####################
# DDPG Agent Params #
#####################
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([300])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([300])]
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.graph_managers.hac_graph_manager import HACGraphManager
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \
    EpisodicHindsightExperienceReplayParameters
from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \
    EpisodicHRLHindsightExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import GoalsSpace, ReachingGoal

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(40 * 4 * 64)  # 40 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(4 * 64)  # 4 small batches of 64 episodes
schedule_params.evaluation_steps = EnvironmentEpisodes(64)
schedule_params.heatup_steps = EnvironmentSteps(0)

polar_coordinates = False

#########
# Agent #
#########
if polar_coordinates:
    distance_from_goal_threshold = np.array([0.075, 0.75])
else:
    distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, simtrace_video_s3_writers):
    """ wait for first checkpoint then perform rollouts using the model """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store
    # TODO: change "agent" to the specific agent name for the multi-agent case
    checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path, "agent")
    graph_manager.data_store.wait_for_checkpoints()
    graph_manager.data_store.wait_for_trainer_ready()
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)
    subscribe_to_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir, checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range((graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.SYNC:
                unpause_physics(EmptyRequest())
                is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    for _ in range(MIN_EVAL_TRIALS):
                        graph_manager.evaluate(EnvironmentSteps(1))
                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx)
                    if rollout_idx == 0:
                        graph_manager.evaluate(EnvironmentSteps(1))
                    new_checkpoint = data_store.get_coach_checkpoint_number('agent')
                if is_save_mp4_enabled:
                    unsubscribe_from_save_mp4(EmptyRequest())
                # upload simtrace and mp4 into s3 bucket
                for s3_writer in simtrace_video_s3_writers:
                    s3_writer.persist(utils.get_s3_kms_extra_args())
                pause_physics(EmptyRequest())
                data_store.load_from_store(expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint
def get_graph_manager(hp_dict, agent_list, run_phase_subject, enable_domain_randomization=False,
                      done_condition=any, run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent, params, run_type)
            else:
                agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping',
                                                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params,
                                           done_condition=done_condition)
    return graph_manager, params_json
def get_clipped_ppo_params(agent_params, agent, params):
    """Apply the algorithm-specific settings required by the Clipped PPO algorithm.

    Args:
        agent_params (DeepRacerClippedPPOAgentParams): the agent parameters that will be used to create the RL agent
        agent (Agent): the agent object that was created either as part of create_rollout_agent or
                       create_training_agent
        params (dict): dictionary of hyperparameters

    Returns:
        DeepRacerClippedPPOAgentParams: updated agent params object with hyperparameters and other required details
    """
    agent_params.network_wrappers['main'].learning_rate = params[HyperParameterKeys.LEARNING_RATE.value]
    agent_params.network_wrappers['main'].input_embedders_parameters = \
        create_input_embedder(agent.network_settings['input_embedders'],
                              agent.network_settings['embedder_type'],
                              agent.network_settings['activation_function'])
    agent_params.network_wrappers['main'].middleware_parameters = \
        create_middle_embedder(agent.network_settings['middleware_embedders'],
                               agent.network_settings['embedder_type'],
                               agent.network_settings['activation_function'])
    agent_params.network_wrappers['main'].batch_size = params[HyperParameterKeys.BATCH_SIZE.value]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
    if params[HyperParameterKeys.LOSS_TYPE.value] == LossTypes.HUBER.value:
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.beta_entropy = params[HyperParameterKeys.BETA_ENTROPY.value]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params[HyperParameterKeys.DISCOUNT_FACTOR.value]
    agent_params.algorithm.optimization_epochs = params[HyperParameterKeys.NUM_EPOCHS.value]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.num_consecutive_playing_steps = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.distributed_coach_synchronization_type = \
        DistributedCoachSynchronizationType.SYNC

    if params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip() == ExplorationTypes.CATEGORICAL.value:
        agent_params.exploration = {
            DiscreteActionSpace: DeepRacerCategoricalParameters(use_stochastic_evaluation_policy=False),
            ScalableBoxActionSpace: AdditiveNoiseParameters()}
    elif params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip() == ExplorationTypes.E_GREEDY.value:
        agent_params.exploration = {
            DiscreteActionSpace: EGreedyParameters(),
            ScalableBoxActionSpace: AdditiveNoiseParameters()}
        agent_params.exploration[DiscreteActionSpace].epsilon_schedule = \
            LinearSchedule(1.0,
                           params[HyperParameterKeys.E_GREEDY_VALUE.value],
                           params[HyperParameterKeys.EPSILON_STEPS.value])
    else:
        log_and_exit("Unknown exploration_type found in hyper parameters. exploration_type: {}"
                     .format(params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip()),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)

    agent_params.memory = DeepRacerMemoryParameters()
    return agent_params
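# Hedged usage sketch: in the graph-manager builders above, this helper is applied once
# per trainable agent, roughly as follows (agent and params come from the surrounding
# loop and are not constructed here):
#
#     agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
#     agent_params.env_agent = agent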
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter, InputFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.memories.memory import MemoryGranularity
from markov import environments

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(100000)  # changed to 100K
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
def rollout_worker(graph_manager, checkpoint_dir, data_store, num_workers, memory_backend_params):
    """ wait for first checkpoint then perform rollouts using the model """
    wait_for_checkpoint(checkpoint_dir, data_store)

    task_parameters = TaskParameters()
    task_parameters.__dict__['checkpoint_restore_dir'] = checkpoint_dir

    graph_manager.create_graph(task_parameters)
    graph_manager.reset_internal_state()

    for level in graph_manager.level_managers:
        for agent in level.agents.values():
            agent.memory.memory_backend = deepracer_memory.DeepRacerRolloutBackEnd(
                memory_backend_params,
                graph_manager.agent_params.algorithm.num_consecutive_playing_steps)

    with graph_manager.phase_context(RunPhase.TRAIN):
        last_checkpoint = 0
        act_steps = math.ceil(
            (graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps) / num_workers)

        for i in range(int(graph_manager.improve_steps.num_steps / act_steps)):
            if should_stop(checkpoint_dir):
                break

            try:
                # This will only work for DeepRacerRacetrackEnv environments
                graph_manager.top_level_manager.environment.env.env.set_allow_servo_step_signals(True)
            except Exception as ex:
                utils.json_format_logger(
                    "Method not defined in environment class: {}".format(ex),
                    **utils.build_system_error_dict(utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                                                    utils.SIMAPP_EVENT_ERROR_CODE_500))

            if type(graph_manager.agent_params.algorithm.num_consecutive_playing_steps) == EnvironmentSteps:
                graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                                  wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
            elif type(graph_manager.agent_params.algorithm.num_consecutive_playing_steps) == EnvironmentEpisodes:
                graph_manager.act(EnvironmentEpisodes(num_steps=act_steps))

            try:
                # This will only work for DeepRacerRacetrackEnv environments
                graph_manager.top_level_manager.environment.env.env.set_allow_servo_step_signals(False)
                graph_manager.top_level_manager.environment.env.env.stop_car()
            except Exception as ex:
                utils.json_format_logger(
                    "Method not defined in environment class: {}".format(ex),
                    **utils.build_system_error_dict(utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                                                    utils.SIMAPP_EVENT_ERROR_CODE_500))

            new_checkpoint = get_latest_checkpoint(checkpoint_dir)

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == \
                    DistributedCoachSynchronizationType.SYNC:
                while new_checkpoint is None or new_checkpoint <= last_checkpoint:
                    if should_stop(checkpoint_dir):
                        break
                    if data_store:
                        data_store.load_from_store(expected_checkpoint_number=new_checkpoint)
                    new_checkpoint = get_latest_checkpoint(checkpoint_dir)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == \
                    DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint is not None and new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            if new_checkpoint is not None:
                last_checkpoint = new_checkpoint
from rl_coach.architectures.head_parameters import QHeadParameters
from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters
from rl_coach.agents.ddqn_bcq_agent import KNNParameters, NNImitationModelParameters

DATASET_SIZE = 200000

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
# schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.improve_steps = TrainingSteps(400)  # 400 epochs
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)

#########
# Agent #
#########
agent_params = DDQNBCQAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
# TODO: cross-DL framework abstraction for a constant initializer?
# The strongly negative output bias presumably initializes Q-values pessimistically
# for actions not covered by the batch dataset.
agent_params.network_wrappers['main'].heads_parameters = [
    QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
from rl_coach.agents.ppo_agent import PPOAgentParameters
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2, MujocoInputFilter
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2000)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = PPOAgentParameters()
agent_params.network_wrappers['actor'].learning_rate = 0.001
agent_params.network_wrappers['critic'].learning_rate = 0.001
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([64])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([64])]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense([64])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([64])]
agent_params.input_filter = MujocoInputFilter()
def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                   params["e_greedy_value"],
                                                                   params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter('observation', 'stacking',
                                                ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False
    vis_params.dump_csv = True
    vis_params.print_networks_summary = True
    vis_params.dump_parameters_documentation = False
    vis_params.dump_signals_to_csv_every_x_episodes = 1
    vis_params.tensorboard = True

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                        env_params=env_params,
                                        schedule_params=schedule_params,
                                        vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
from rl_coach.agents.td3_agent import TD3AgentParameters
from rl_coach.architectures.layers import Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.exploration_policies.truncated_normal import TruncatedNormalParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(800)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(5000)
schedule_params.evaluation_steps = EnvironmentEpisodes(0)  # Neta: 0
schedule_params.heatup_steps = EnvironmentEpisodes(100)

#########
# Agent #
#########
agent_params = TD3AgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['critic'].input_embedders_parameters[
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, simtrace_video_s3_writers,
                   pause_physics, unpause_physics):
    """ wait for first checkpoint then perform rollouts using the model """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    is_sageonly = check_is_sageonly()

    data_store = graph_manager.data_store
    # TODO: change "agent" to the specific agent name for the multi-agent case
    checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path, "agent")
    graph_manager.data_store.wait_for_checkpoints()
    graph_manager.data_store.wait_for_trainer_ready()
    # Wait for the required cancel services to become available.
    # Do this only for a RoboMaker job.
    # if not is_sageonly:
    #     rospy.wait_for_service('/robomaker/job/cancel')
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    subscribe_to_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)

    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir, checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range((graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.SYNC:
                unpause_physics(EmptyRequest())
                is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    for _ in range(int(rospy.get_param('MIN_EVAL_TRIALS', '5'))):
                        graph_manager.evaluate(EnvironmentSteps(1))
                # For a SageMaker-only job, run only a limited number of evaluations for better
                # performance. Pausing the physics makes its performance the same as
                # RoboMaker + SageMaker.
                if is_sageonly:
                    if is_save_mp4_enabled:
                        unsubscribe_from_save_mp4(EmptyRequest())
                    # upload simtrace and mp4 into s3 bucket
                    for s3_writer in simtrace_video_s3_writers:
                        s3_writer.persist(utils.get_s3_kms_extra_args())
                    graph_manager.phase = RunPhase.WAITING
                    pause_physics(EmptyRequest())
                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers, rollout_idx)
                    # Continuously run the evaluation only for a SageMaker + RoboMaker job
                    if not is_sageonly and rollout_idx == 0:
                        print("Additional evaluation. New Checkpoint: {}, Last Checkpoint: {}"
                              .format(new_checkpoint, last_checkpoint))
                        graph_manager.evaluate(EnvironmentSteps(1))
                    else:
                        time.sleep(5)
                    new_checkpoint = data_store.get_coach_checkpoint_number('agent')
                # Save the mp4 for RoboMaker + SageMaker jobs
                if not is_sageonly:
                    if is_save_mp4_enabled:
                        unsubscribe_from_save_mp4(EmptyRequest())
                    # upload simtrace and mp4 into s3 bucket
                    for s3_writer in simtrace_video_s3_writers:
                        s3_writer.persist(utils.get_s3_kms_extra_args())
                    pause_physics(EmptyRequest())
                data_store.load_from_store(expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type \
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint

    logger.info("Exited main loop. Done.")
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import \
    EpisodicHindsightExperienceReplayParameters, HindsightGoalSelectionMethod
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import GoalsSpace, ReachingGoal

bit_length = 20

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(16 * 50 * 200)  # 200 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(16 * 50)  # 50 cycles
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].batch_size = 128
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(256)]
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'state': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
agent_params.algorithm.discount = 0.98
def get_graph_manager(hp_dict, agent_list, run_phase_subject, enable_domain_randomization=False,
                      done_condition=any, run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    # Note: The following three lines are hard-coded to pick the first agent's training
    # algorithm and dump the hyper-parameters for that algorithm into json for training
    # jobs (so that the console displays the training hyper-parameters correctly), since
    # right now we only support training one model at a time.
    # TODO: clean these lines up when we support multi-agent training.
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            training_algorithm = agent.ctrl.model_metadata.training_algorithm
            params = get_updated_hyper_parameters(hp_dict, training_algorithm)
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent, params, run_type)
            else:
                agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping',
                                                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    sector_binary_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=NUMBER_OF_LIDAR_SECTORS,
                        num_values_per_sector=1,
                        clipping_dist=SECTOR_LIDAR_CLIPPING_DIST)
                    input_filter.add_observation_filter(observation, 'binary', sector_binary_filter)
                if observation == Input.DISCRETIZED_SECTOR_LIDAR.value:
                    num_sectors = agent.ctrl.model_metadata.lidar_num_sectors
                    num_values_per_sector = agent.ctrl.model_metadata.lidar_num_values_per_sector
                    clipping_dist = agent.ctrl.model_metadata.lidar_clipping_dist
                    sector_discretize_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=num_sectors,
                        num_values_per_sector=num_values_per_sector,
                        clipping_dist=clipping_dist)
                    input_filter.add_observation_filter(observation, 'discrete', sector_discretize_filter)
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params,
                                           done_condition=done_condition)
    return graph_manager, params_json