def __init__(
        self,
        agent_params: AgentParameters,
        env_params: EnvironmentParameters,
        schedule_params: ScheduleParameters,
        vis_params: VisualizationParameters = None,
        preset_validation_params: PresetValidationParameters = None):
    """
    Set up a graph manager that holds a single agent and a single environment.

    :param agent_params: the agent parameters. Its `visualization` attribute is overwritten with
        vis_params, and missing input / output filters are filled in from env_params.
    :param env_params: the environment parameters. Must provide default_input_filter() and
        default_output_filter().
    :param schedule_params: the schedule parameters, forwarded to the parent constructor
    :param vis_params: the visualization parameters. When omitted, a fresh VisualizationParameters
        instance is created for this graph manager.
    :param preset_validation_params: the preset validation parameters. When omitted, a fresh
        PresetValidationParameters instance is created for this graph manager.
    """
    # Default instances placed in the signature are created once, at definition time, and would
    # then be shared (and mutated) by every graph manager relying on the defaults.
    if vis_params is None:
        vis_params = VisualizationParameters()
    if preset_validation_params is None:
        preset_validation_params = PresetValidationParameters()

    super().__init__('simple_rl_graph', schedule_params, vis_params)

    self.agent_params = agent_params
    self.env_params = env_params
    self.preset_validation_params = preset_validation_params
    self.agent_params.visualization = vis_params

    # fall back to the environment's default filters when the agent does not define its own
    if self.agent_params.input_filter is None:
        self.agent_params.input_filter = env_params.default_input_filter()
    if self.agent_params.output_filter is None:
        self.agent_params.output_filter = env_params.default_output_filter()
def train_on_csv_file(csv_file, n_epochs, dataset_size, obs_dim, act_dim):
    """
    Build a batch-RL graph around a DDQN-BCQ agent whose memory is loaded from a CSV file, then
    create the graph and run `improve()` on it.

    Note: `task_parameters` is not an argument; it is looked up outside of this function.

    :param csv_file: path of the CSV dataset handed to CsvDataset (loaded as episodic)
    :param n_epochs: forwarded to set_schedule_params
    :param dataset_size: forwarded to set_schedule_params
    :param obs_dim: shape given to the 'observation' VectorObservationSpace
    :param act_dim: argument given to DiscreteActionSpace
    :return: None
    """
    # just to clean things up; only needed for the tutorial
    tf.reset_default_graph()

    schedule = set_schedule_params(n_epochs, dataset_size)

    #########
    # Agent #
    #########
    # note that we have moved to BCQ, which will help the training to converge better and faster
    bcq_agent_params = set_agent_params(DDQNBCQAgentParameters)

    # additional setting for DDQNBCQAgentParameters agent parameters
    # can use either a kNN or a NN based model for predicting which actions not to max over in the
    # bellman equation (KNNParameters() is the kNN alternative)
    bcq_agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

    bcq_agent_params.memory.load_memory_from_file_path = CsvDataset(csv_file, is_episodic=True)

    # there is no environment in this setup, so the spaces are spelled out explicitly
    state_space = StateSpace({'observation': VectorObservationSpace(shape=obs_dim)})
    spaces = SpacesDefinition(state=state_space,
                              goal=None,
                              action=DiscreteActionSpace(act_dim),
                              reward=RewardSpace(1))

    manager = BatchRLGraphManager(
        agent_params=bcq_agent_params,
        env_params=None,
        spaces_definition=spaces,
        schedule_params=schedule,
        vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.4)
    manager.create_graph(task_parameters)
    manager.improve()
def __init__(
        self,
        name: str,
        schedule_params: ScheduleParameters,
        vis_params: VisualizationParameters = None):
    """
    Initialize the bookkeeping state of a graph manager.

    :param name: the name stored on the graph manager
    :param schedule_params: supplies heatup_steps, evaluation_steps,
        steps_between_evaluation_periods and improve_steps
    :param vis_params: the visualization parameters to store. When omitted, a fresh
        VisualizationParameters instance is created for this graph manager.
    """
    # A default instance in the signature would be created once and shared by every graph manager
    # built without explicit visualization parameters.
    if vis_params is None:
        vis_params = VisualizationParameters()

    self.sess = None
    self.level_managers = []  # type: List[LevelManager]
    self.top_level_manager = None
    self.environments = []

    # schedule
    self.heatup_steps = schedule_params.heatup_steps
    self.evaluation_steps = schedule_params.evaluation_steps
    self.steps_between_evaluation_periods = schedule_params.steps_between_evaluation_periods
    self.improve_steps = schedule_params.improve_steps

    self.visualization_parameters = vis_params
    self.name = name
    self.task_parameters = None
    self._phase = self.phase = RunPhase.UNDEFINED
    self.preset_validation_params = PresetValidationParameters()
    self.reset_required = False

    # timers
    self.graph_creation_time = None
    self.last_checkpoint_saving_time = time.time()

    # counters - one total-steps counter per run phase
    self.total_steps_counters = {
        RunPhase.HEATUP: TotalStepsCounter(),
        RunPhase.TRAIN: TotalStepsCounter(),
        RunPhase.TEST: TotalStepsCounter()
    }

    # checkpointing
    self.checkpoint_id = 0
    self.checkpoint_saver = None
    self.checkpoint_state_updater = None

    self.graph_logger = Logger()
    self.data_store = None
    self.is_batch_rl = False
    self.time_metric = TimeTypes.EpisodeNumber
def __init__(
        self,
        agent_params: AgentParameters,
        env_params: Union[EnvironmentParameters, None],
        schedule_params: ScheduleParameters,
        vis_params: VisualizationParameters = None,
        preset_validation_params: PresetValidationParameters = None,
        name='batch_rl_graph',
        spaces_definition: SpacesDefinition = None,
        reward_model_num_epochs: int = 100,
        train_to_eval_ratio: float = 0.8):
    """
    Set up a graph manager for batch RL.

    :param agent_params: the agent parameters; its memory's train_to_eval_ratio is overwritten
    :param env_params: the environment parameters, or None
    :param schedule_params: the schedule parameters, forwarded to the parent constructor
    :param vis_params: the visualization parameters. When omitted, a fresh VisualizationParameters
        instance is created for this graph manager.
    :param preset_validation_params: the preset validation parameters. When omitted, a fresh
        PresetValidationParameters instance is created for this graph manager.
    :param name: the graph manager name, forwarded to the parent constructor
    :param spaces_definition: stored as-is on the graph manager
    :param reward_model_num_epochs: stored as-is on the graph manager
    :param train_to_eval_ratio: the value written to agent_params.memory.train_to_eval_ratio
    """
    # Resolve the defaults here (and not in the signature) so that graph managers never share a
    # single mutable parameters object; the parent always receives real instances.
    if vis_params is None:
        vis_params = VisualizationParameters()
    if preset_validation_params is None:
        preset_validation_params = PresetValidationParameters()

    super().__init__(agent_params, env_params, schedule_params, vis_params,
                     preset_validation_params, name)

    self.is_batch_rl = True
    self.time_metric = TimeTypes.Epoch
    self.reward_model_num_epochs = reward_model_num_epochs
    self.spaces_definition = spaces_definition

    # setting this here to make sure that, by default, train_to_eval_ratio gets a value < 1
    # (its default value in the memory is 1)
    self.agent_params.memory.train_to_eval_ratio = train_to_eval_ratio
def __init__(
        self,
        name: str,
        schedule_params: ScheduleParameters,
        vis_params: VisualizationParameters = None):
    """
    Initialize the bookkeeping state of a graph manager.

    :param name: the name stored on the graph manager
    :param schedule_params: supplies heatup_steps, evaluation_steps,
        steps_between_evaluation_periods and improve_steps
    :param vis_params: the visualization parameters to store. When omitted, a fresh
        VisualizationParameters instance is created for this graph manager.
    """
    # A default instance in the signature would be created once and shared by every graph manager
    # built without explicit visualization parameters.
    if vis_params is None:
        vis_params = VisualizationParameters()

    self.sess = None
    self.level_managers = []
    self.top_level_manager = None
    self.environments = []

    # schedule
    self.heatup_steps = schedule_params.heatup_steps
    self.evaluation_steps = schedule_params.evaluation_steps
    self.steps_between_evaluation_periods = schedule_params.steps_between_evaluation_periods
    self.improve_steps = schedule_params.improve_steps

    self.visualization_parameters = vis_params
    self.name = name
    self.task_parameters = None
    self._phase = self.phase = RunPhase.UNDEFINED
    self.preset_validation_params = PresetValidationParameters()

    # timers
    self.graph_initialization_time = time.time()
    self.heatup_start_time = None
    self.training_start_time = None
    self.last_evaluation_start_time = None
    self.last_checkpoint_saving_time = time.time()

    # counters - one total-steps counter per run phase
    self.total_steps_counters = {
        RunPhase.HEATUP: TotalStepsCounter(),
        RunPhase.TRAIN: TotalStepsCounter(),
        RunPhase.TEST: TotalStepsCounter()
    }

    # checkpointing
    self.checkpoint_id = 0
    self.checkpoint_saver = None

    self.graph_logger = Logger()
Dense(200) ] agent_params.network_wrappers['main'].clip_gradients = 1000 agent_params.network_wrappers[ 'main'].gradients_clipping_method = GradientClippingMethod.ClipByValue ############### # Environment # ############### import jsbsim import gym_jsbsim from rl_coach.filters.filter import NoInputFilter, NoOutputFilter from rl_coach.filters.filter import InputFilter from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter vis_params = VisualizationParameters(native_rendering=True) input = InputFilter(is_a_reference_filter=True) input.add_observation_filter('observation', 'stacking', ObservationStackingFilter(10)) class MyGymVectorEnvironment(GymVectorEnvironment): def __init__(self, level=None): super().__init__(level=level) self.frame_skip = 1 self.default_input_filter = NoInputFilter( ) # hrmm.. my custom input filter errored out self.default_output_filter = NoOutputFilter()
def get_graph_manager_from_args(
        self, args: argparse.Namespace) -> 'GraphManager':
    """
    Return the graph manager according to the command line arguments given by the user.

    The graph manager is taken from the preset (if given) or built for human play, and is then
    updated in place with the framework, level, seed, visualization flags and custom parameters
    found in args.

    :param args: the arguments given by the user
    :return: the graph manager, not bound to task_parameters yet.
    """
    graph_manager = None

    # if a preset was given we will load the graph manager for the preset
    if args.preset is not None:
        graph_manager = short_dynamic_import(args.preset, ignore_module_case=True)

    # for human play we need to create a custom graph manager
    if args.play:
        from rl_coach.agents.human_agent import HumanAgentParameters

        env_params = short_dynamic_import(args.environment_type, ignore_module_case=True)()
        env_params.human_control = True
        schedule_params = HumanPlayScheduleParameters()
        graph_manager = BasicRLGraphManager(HumanAgentParameters(), env_params, schedule_params,
                                            VisualizationParameters())

    # Set framework
    # Note: Some graph managers (e.g. HAC preset) create multiple agents and the attribute is
    # called agents_params
    if hasattr(graph_manager, 'agent_params'):
        for network_parameters in graph_manager.agent_params.network_wrappers.values():
            network_parameters.framework = args.framework
    elif hasattr(graph_manager, 'agents_params'):
        for ap in graph_manager.agents_params:
            for network_parameters in ap.network_wrappers.values():
                network_parameters.framework = args.framework

    if args.level:
        if isinstance(graph_manager.env_params.level, SingleLevelSelection):
            graph_manager.env_params.level.select(args.level)
        else:
            graph_manager.env_params.level = args.level

    # set the seed for the environment
    if args.seed is not None:
        graph_manager.env_params.seed = args.seed

    # visualization
    # dumping gifs / mp4 stays enabled if the preset enabled it, even when the flag was not passed
    vis_params = graph_manager.visualization_parameters
    vis_params.dump_gifs = vis_params.dump_gifs or args.dump_gifs
    vis_params.dump_mp4 = vis_params.dump_mp4 or args.dump_mp4
    vis_params.render = args.render
    vis_params.tensorboard = args.tensorboard
    vis_params.print_networks_summary = args.print_networks_summary

    # update the custom parameters
    if args.custom_parameter is not None:
        for assignment in args.custom_parameter.split(';'):
            # Split on the first '=' only, so that values which themselves contain '='
            # (e.g. keyword arguments in a constructor call) are not silently discarded.
            key, separator, value = assignment.partition('=')
            if not separator:
                # no '=' in this segment (e.g. the empty segment after a trailing ';')
                continue
            # NOTE(security): exec() runs arbitrary code taken from the command line. This is
            # only acceptable as long as the custom parameter string comes from a trusted user.
            exec("graph_manager.{}={}".format(key.strip(), value.strip()))

    return graph_manager
def get_graph_manager(hp_dict, agent_list, run_phase_subject):
    """
    Build a MultiAgentGraphManager for the DeepRacer racetrack environment.

    :param hp_dict: hyper-parameter overrides; missing keys fall back to the defaults below
    :param agent_list: the agents to configure. Agents with truthy `network_settings` get agent
        parameters and are treated as trainable; the others are handed to the environment as
        non-trainable agents.
    :param run_phase_subject: stored on the environment parameters
    :return: a (graph_manager, params_json) tuple, where params_json is the JSON dump of the
        resolved hyper-parameters
    """
    ##########################
    # All Default Parameters #
    ##########################
    params = {
        "batch_size": int(hp_dict.get("batch_size", 64)),
        "num_epochs": int(hp_dict.get("num_epochs", 10)),
        "stack_size": int(hp_dict.get("stack_size", 1)),
        "lr": float(hp_dict.get("lr", 0.0003)),
        "exploration_type": (hp_dict.get("exploration_type", "categorical")).lower(),
        "e_greedy_value": float(hp_dict.get("e_greedy_value", .05)),
        "epsilon_steps": int(hp_dict.get("epsilon_steps", 10000)),
        "beta_entropy": float(hp_dict.get("beta_entropy", .01)),
        "discount_factor": float(hp_dict.get("discount_factor", .999)),
        "loss_type": hp_dict.get("loss_type", "Mean squared error").lower(),
        "num_episodes_between_training": int(hp_dict.get("num_episodes_between_training", 20)),
        "term_cond_max_episodes": int(hp_dict.get("term_cond_max_episodes", 100000)),
        "term_cond_avg_score": float(hp_dict.get("term_cond_avg_score", 100000)),
    }

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = []
    non_trainable_agents_list = []

    for agent in agent_list:
        agent_params = DeepRacerAgentParams()

        # agents without network settings are not trained
        if not agent.network_settings:
            non_trainable_agents_list.append(agent)
            continue

        settings = agent.network_settings
        main_network = agent_params.network_wrappers['main']

        agent_params.env_agent = agent
        main_network.learning_rate = params["lr"]
        main_network.input_embedders_parameters = create_input_embedder(
            settings['input_embedders'],
            settings['embedder_type'],
            settings['activation_function'])
        main_network.middleware_parameters = create_middle_embedder(
            settings['middleware_embedders'],
            settings['embedder_type'],
            settings['activation_function'])

        # attach the observation filters matching each configured input embedder
        input_filter = InputFilter(is_a_reference_filter=True)
        camera_inputs = (Input.LEFT_CAMERA.value, Input.CAMERA.value, Input.OBSERVATION.value)
        for observation in settings['input_embedders'].keys():
            if observation in camera_inputs:
                input_filter.add_observation_filter(
                    observation, 'to_grayscale', ObservationRGBToYFilter())
                input_filter.add_observation_filter(
                    observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                input_filter.add_observation_filter(
                    observation, 'stacking', ObservationStackingFilter(1))
            if observation == Input.STEREO.value:
                input_filter.add_observation_filter(
                    observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
            if observation == Input.LIDAR.value:
                input_filter.add_observation_filter(
                    observation, 'clipping', ObservationClippingFilter(0.15, 1.0))
            if observation == Input.SECTOR_LIDAR.value:
                input_filter.add_observation_filter(
                    observation, 'binary', ObservationBinarySectorFilter())
        agent_params.input_filter = input_filter()

        # optimizer
        main_network.batch_size = params["batch_size"]
        main_network.optimizer_epsilon = 1e-5
        main_network.adam_optimizer_beta2 = 0.999
        if params["loss_type"] == "huber":
            main_network.replace_mse_with_huber_loss = True

        # algorithm
        algorithm = agent_params.algorithm
        algorithm.clip_likelihood_ratio_using_epsilon = 0.2
        algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
        algorithm.beta_entropy = params["beta_entropy"]
        algorithm.gae_lambda = 0.95
        algorithm.discount = params["discount_factor"]
        algorithm.optimization_epochs = params["num_epochs"]
        algorithm.estimate_state_value_using_gae = True
        algorithm.num_steps_between_copying_online_weights_to_target = \
            EnvironmentEpisodes(params["num_episodes_between_training"])
        algorithm.num_consecutive_playing_steps = \
            EnvironmentEpisodes(params["num_episodes_between_training"])
        algorithm.distributed_coach_synchronization_type = \
            DistributedCoachSynchronizationType.SYNC

        # exploration - anything other than "categorical" selects e-greedy
        if params["exploration_type"] == "categorical":
            agent_params.exploration = CategoricalParameters()
        else:
            agent_params.exploration = EGreedyParameters()
            agent_params.exploration.epsilon_schedule = LinearSchedule(
                1.0, params["e_greedy_value"], params["epsilon_steps"])

        trainable_agents_list.append(agent_params)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
# Preset script: builds a module-level `graph_manager` for a Bootstrapped DQN agent on an Atari
# environment, using the Atari schedule imported from the gym environment module.
from rl_coach.agents.bootstrapped_dqn_agent import BootstrappedDQNAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule
from rl_coach.exploration_policies.ucb import UCBParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager

#########
# Agent #
#########
agent_params = BootstrappedDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
# replace the agent's exploration parameters with UCB parameters
agent_params.exploration = UCBParameters()

###############
# Environment #
###############
# the level is left as a selection over atari_deterministic_v4, to be picked later
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=atari_schedule, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
def get_graph_manager(**hp_dict):
    """
    Build a BasicRLGraphManager holding a Clipped PPO agent for the Silverstone racetrack level.

    :param hp_dict: hyper-parameter overrides given as keyword arguments; missing keys fall back
        to the defaults below
    :return: a (graph_manager, params_json) tuple, where params_json is the JSON dump of the
        resolved hyper-parameters
    """
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    # The default used to be "huber", which is a loss type and not an exploration type; it made
    # the default silently fall through to the e-greedy branch below.
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(
        hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(
        hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(
        hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    main_network = agent_params.network_wrappers['main']
    main_network.learning_rate = params["lr"]
    main_network.input_embedders_parameters['observation'].activation_function = 'relu'
    main_network.middleware_parameters.activation_function = 'relu'
    main_network.batch_size = params["batch_size"]
    main_network.optimizer_epsilon = 1e-5
    main_network.adam_optimizer_beta2 = 0.999
    if params["loss_type"] == "huber":
        main_network.replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = \
        DistributedCoachSynchronizationType.SYNC

    # anything other than "categorical" selects e-greedy exploration
    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    SilverstoneInputFilter = InputFilter(is_a_reference_filter=True)
    SilverstoneInputFilter.add_observation_filter(
        'observation', 'to_grayscale', ObservationRGBToYFilter())
    SilverstoneInputFilter.add_observation_filter(
        'observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    SilverstoneInputFilter.add_observation_filter(
        'observation', 'stacking', ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = SilverstoneInputFilter
    env_params.level = 'SilverstoneRacetrack-Discrete-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
# ER - we'll be needing an episodic replay buffer for off-policy evaluation
# (this is the only place the memory parameters are created; the dataset is attached to this
# same instance further down)
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule - there is no exploration in Batch RL. Disabling E-Greedy.
agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
agent_params.exploration.evaluation_epsilon = 0

# can use either a kNN or a NN based model for predicting which actions not to max over in the bellman equation
#agent_params.algorithm.action_drop_method_parameters = KNNParameters()

# load the replay memory from a CSV dataset instead of collecting it from an environment
DATATSET_PATH = 'acrobot_dataset.csv'
agent_params.memory.load_memory_from_file_path = CsvDataset(DATATSET_PATH, is_episodic = True)

# there is no environment (env_params=None below), so the spaces are spelled out explicitly
spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
                          action=DiscreteActionSpace(3),
                          reward=RewardSpace(1))

graph_manager = BatchRLGraphManager(agent_params=agent_params,
                                    env_params=None,
                                    spaces_definition=spaces,
                                    schedule_params=schedule_params,
                                    vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
                                    reward_model_num_epochs=30,
                                    train_to_eval_ratio=0.4)
graph_manager.create_graph(task_parameters)
graph_manager.improve()
def get_graph_manager(hp_dict, agent_list, run_phase_subject,
                      enable_domain_randomization=False, done_condition=any,
                      run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    """
    Build a MultiAgentGraphManager for the DeepRacer racetrack environment.

    The training algorithm is read from the first agent's model metadata (None when agent_list is
    empty) and decides whether SAC or Clipped PPO agent parameters are created.

    :param hp_dict: hyper-parameter overrides, resolved through get_updated_hyper_parameters
    :param agent_list: the agents to configure. Agents with truthy `network_settings` are
        trainable; the others are handed to the environment as non-trainable agents.
    :param run_phase_subject: stored on the environment parameters
    :param enable_domain_randomization: stored on the environment parameters
    :param done_condition: stored on the environment parameters and passed to the graph manager
    :param run_type: forwarded to get_sac_params when SAC is used
    :param pause_physics: stored on the environment parameters
    :param unpause_physics: stored on the environment parameters
    :return: a (graph_manager, params_json) tuple, where params_json is the JSON dump of the
        resolved hyper-parameters
    """
    ###################
    # Hyperparameters #
    ###################
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = []
    non_trainable_agents_list = []

    for agent in agent_list:
        # agents without network settings are not trained
        if not agent.network_settings:
            non_trainable_agents_list.append(agent)
            continue

        if TrainingAlgorithm.SAC.value == training_algorithm:
            agent_params = get_sac_params(DeepRacerSACAgentParams(), agent, params, run_type)
        else:
            agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
        agent_params.env_agent = agent

        # attach the observation filters matching each configured input embedder
        input_filter = InputFilter(is_a_reference_filter=True)
        for observation in agent.network_settings['input_embedders'].keys():
            if observation in (Input.LEFT_CAMERA.value, Input.CAMERA.value,
                               Input.OBSERVATION.value):
                input_filter.add_observation_filter(
                    observation, 'to_grayscale', ObservationRGBToYFilter())
                input_filter.add_observation_filter(
                    observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                input_filter.add_observation_filter(
                    observation, 'stacking', ObservationStackingFilter(1))
            if observation == Input.STEREO.value:
                input_filter.add_observation_filter(
                    observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
            if observation == Input.LIDAR.value:
                input_filter.add_observation_filter(
                    observation, 'clipping', ObservationClippingFilter(0.15, 1.0))
            if observation == Input.SECTOR_LIDAR.value:
                input_filter.add_observation_filter(
                    observation, 'binary', ObservationBinarySectorFilter())
        agent_params.input_filter = input_filter()

        trainable_agents_list.append(agent_params)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params,
        done_condition=done_condition)
    return graph_manager, params_json
def __init__(
        self,
        agents_params: List[AgentParameters],
        env_params: EnvironmentParameters,
        schedule_params: ScheduleParameters,
        vis_params: VisualizationParameters = None,
        preset_validation_params: PresetValidationParameters = None,
        done_condition=any):
    """
    Set up a graph manager that holds several agents acting in a single environment.

    Every agent is renamed ("agent" when there is exactly one, "agent_<index>" otherwise), gets
    its own shallow copy of the visualization parameters, and falls back to copies of the
    environment's default filters when it does not define its own.

    :param agents_params: the parameters of the agents; must not be empty, since the first entry
        is also exposed as `self.agent_params`
    :param env_params: the environment parameters. Must provide default_input_filter() and
        default_output_filter().
    :param schedule_params: the schedule parameters, applied through set_schedule_params
    :param vis_params: the visualization parameters. When omitted, a fresh VisualizationParameters
        instance is created for this graph manager.
    :param preset_validation_params: the preset validation parameters. When omitted, a fresh
        PresetValidationParameters instance is created for this graph manager.
    :param done_condition: stored as-is on the graph manager
    """
    # Default instances placed in the signature are created once, at definition time, and would
    # then be shared by every graph manager relying on the defaults.
    if vis_params is None:
        vis_params = VisualizationParameters()
    if preset_validation_params is None:
        preset_validation_params = PresetValidationParameters()

    self.done_condition = done_condition

    # Give the agents their final names first: the per-agent dictionaries below are keyed by
    # name, and keying them by the names the agents arrived with would leave stale keys behind.
    single_agent = len(agents_params) == 1
    for agent_index, agent_params in enumerate(agents_params):
        agent_params.name = "agent" if single_agent else "agent_{}".format(agent_index)

    self.sess = {agent_params.name: None for agent_params in agents_params}
    self.level_managers = []  # type: List[MultiAgentLevelManager]
    self.top_level_manager = None
    self.environments = []
    self.set_schedule_params(schedule_params)
    self.visualization_parameters = vis_params
    self.name = 'multi_agent_graph'
    self.task_parameters = None
    self._phase = self.phase = RunPhase.UNDEFINED
    self.preset_validation_params = preset_validation_params
    self.reset_required = False
    self.num_checkpoints_to_keep = 4  # TODO: make this a parameter

    # timers
    self.graph_creation_time = None
    self.last_checkpoint_saving_time = time.time()

    # counters - one total-steps counter per run phase
    self.total_steps_counters = {
        RunPhase.HEATUP: TotalStepsCounter(),
        RunPhase.TRAIN: TotalStepsCounter(),
        RunPhase.TEST: TotalStepsCounter()
    }

    # checkpointing
    self.checkpoint_id = 0
    self.checkpoint_saver = {agent_params.name: None for agent_params in agents_params}
    self.checkpoint_state_updater = None

    self.graph_logger = Logger()
    self.data_store = None
    self.is_batch_rl = False
    self.time_metric = TimeTypes.EpisodeNumber

    self.env_params = env_params
    self.agents_params = agents_params
    self.agent_params = agents_params[0]  # ...(find a better way)...

    for agent_params in agents_params:
        # each agent works on its own copy, so per-agent changes do not affect the others
        agent_params.visualization = copy.copy(vis_params)
        if agent_params.input_filter is None:
            agent_params.input_filter = copy.copy(env_params.default_input_filter())
        if agent_params.output_filter is None:
            agent_params.output_filter = copy.copy(env_params.default_output_filter())
# Tail of a preset script: `agent_params`, `schedule_params` and `fetch_v1` are defined before
# this point.

# set the evaluation noise of the continuous exploration policy to zero
agent_params.exploration.continuous_exploration_policy_parameters.evaluation_noise = 0

# input filter: clip the 'observation' entry to [-200, 200]
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_observation_filter('observation', 'clipping', ObservationClippingFilter(-200, 200))

# pre-network filter: one normalization filter per observation entry
agent_params.pre_network_filter = InputFilter()
agent_params.pre_network_filter.add_observation_filter('observation', 'normalize_observation',
                                                       ObservationNormalizationFilter(name='normalize_observation'))
agent_params.pre_network_filter.add_observation_filter('achieved_goal', 'normalize_achieved_goal',
                                                       ObservationNormalizationFilter(name='normalize_achieved_goal'))
agent_params.pre_network_filter.add_observation_filter('desired_goal', 'normalize_desired_goal',
                                                       ObservationNormalizationFilter(name='normalize_desired_goal'))

###############
# Environment #
###############
# the level is left as a selection over fetch_v1, to be picked later
env_params = GymVectorEnvironment(level=SingleLevelSelection(fetch_v1))
env_params.custom_reward_threshold = -49

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['slide', 'pick_and_place', 'push', 'reach']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
def get_graph_manager(**hp_dict):
    """
    Build a BasicRLGraphManager holding a Clipped PPO agent for the DeepRacer racetrack level
    with a custom action space.

    :param hp_dict: hyper-parameter overrides given as keyword arguments; missing keys fall back
        to the defaults below
    :return: a (graph_manager, params_json) tuple, where params_json is the JSON dump of the
        resolved hyper-parameters
    """
    ##########################
    # All Default Parameters #
    ##########################
    params = {
        "batch_size": int(hp_dict.get("batch_size", 64)),
        "num_epochs": int(hp_dict.get("num_epochs", 10)),
        "stack_size": int(hp_dict.get("stack_size", 1)),
        "lr": float(hp_dict.get("lr", 0.0003)),
        "lr_decay_rate": float(hp_dict.get("lr_decay_rate", 0)),
        "lr_decay_steps": float(hp_dict.get("lr_decay_steps", 0)),
        "exploration_type": (hp_dict.get("exploration_type", "categorical")).lower(),
        "e_greedy_value": float(hp_dict.get("e_greedy_value", .05)),
        "epsilon_steps": int(hp_dict.get("epsilon_steps", 10000)),
        "beta_entropy": float(hp_dict.get("beta_entropy", .01)),
        "discount_factor": float(hp_dict.get("discount_factor", .999)),
        "loss_type": hp_dict.get("loss_type", "Mean squared error").lower(),
        "num_episodes_between_training": int(hp_dict.get("num_episodes_between_training", 20)),
        "term_cond_max_episodes": int(hp_dict.get("term_cond_max_episodes", 100000)),
        "term_cond_avg_score": float(hp_dict.get("term_cond_avg_score", 100000)),
        "tensorboard": hp_dict.get("tensorboard", False),
        "dump_mp4": hp_dict.get("dump_mp4", False),
        "dump_gifs": hp_dict.get("dump_gifs", False),
    }

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    main_network = agent_params.network_wrappers['main']
    main_network.learning_rate = params["lr"]
    main_network.learning_rate_decay_rate = params["lr_decay_rate"]
    main_network.learning_rate_decay_steps = params["lr_decay_steps"]
    main_network.input_embedders_parameters['observation'].activation_function = 'relu'
    # Replace the default CNN with single layer Conv2d(32, 3, 1)
    # main_network.input_embedders_parameters['observation'].scheme = EmbedderScheme.Shallow
    # main_network.input_embedders_parameters['observation'].dropout_rate = 0.3
    main_network.middleware_parameters.activation_function = 'relu'
    # main_network.middleware_parameters.scheme = MiddlewareScheme.Shallow
    # main_network.middleware_parameters.dropout_rate = 0.3
    main_network.batch_size = params["batch_size"]
    main_network.optimizer_epsilon = 1e-5
    main_network.adam_optimizer_beta2 = 0.999
    # main_network.l2_regularization = 2e-5
    if params["loss_type"] == "huber":
        main_network.replace_mse_with_huber_loss = True

    algorithm = agent_params.algorithm
    algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    algorithm.beta_entropy = params["beta_entropy"]
    algorithm.gae_lambda = 0.95
    algorithm.discount = params["discount_factor"]
    algorithm.optimization_epochs = params["num_epochs"]
    algorithm.estimate_state_value_using_gae = True
    algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    # anything other than "categorical" selects e-greedy exploration
    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    # Add an observation image pertubation for many aspects
    # DeepRacerInputFilter.add_observation_filter('observation', 'perturb_color', ObservationColorPerturbation(0.2))
    # Rescale to much smaller input when using shallow networks to avoid OOM
    # DeepRacerInputFilter.add_observation_filter('observation', 'rescaling',
    #                                             ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
    #                                                                                                  high=255)))
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'to_grayscale', ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'stacking', ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.tensorboard = params["tensorboard"]
    vis_params.dump_mp4 = params["dump_mp4"]
    vis_params.dump_gifs = params["dump_gifs"]
    # AlwaysDumpFilter, MaxDumpFilter, EveryNEpisodesDumpFilter, SelectedPhaseOnlyDumpFilter
    vis_params.video_dump_filters = [AlwaysDumpFilter()]

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = BasicRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    """Configure and run a batch-RL graph fed by an experience-generating agent.

    Builds parameters for a DDQN agent that interacts with the environment,
    builds parameters for a DDQN-BCQ agent via ``set_agent_params``, hands
    both to a ``BatchRLGraphManager``, creates the graph and calls
    ``improve()``.

    Args:
        env_params: environment parameters forwarded to the graph manager.
        n_epochs: forwarded to ``set_schedule_params``.
        dataset_size: number of transitions; sizes the experience-generating
            schedule, the replay memory and the epsilon decay.

    Returns:
        None.

    Note:
        ``task_parameters`` is not defined in this function; it is read from
        the enclosing (module) scope -- NOTE(review): confirm it is set
        before this function is called.
    """
    tf.reset_default_graph()  # just to clean things up; only needed for the tutorial

    # Experience Generating Agent parameters
    experience_generating_agent_params = DDQNAgentParameters()

    # schedule parameters: 1000 heat-up steps, the remainder as improve steps
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size - experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = \
        EnvironmentEpisodes(10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentSteps(100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = \
        EnvironmentSteps(1)

    # NN configuration
    main_network = experience_generating_agent_params.network_wrappers['main']
    main_network.learning_rate = 0.0001
    main_network.batch_size = 128
    main_network.replace_mse_with_huber_loss = False
    main_network.heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

    # ER size: heat-up steps plus improve steps
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule.
    # Decay over the caller-supplied dataset_size (the original read a global
    # DATASET_SIZE here, which ignored the function argument).
    experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(
        1.0, 0.01, dataset_size)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)

    # set the agent params as before
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
# Load the batch-RL dataset from the CSV file at DATATSET_PATH.
agent_params.memory.load_memory_from_file_path = CsvDataset(
    DATATSET_PATH,
    is_episodic=True,
)

# The string below is a disabled block holding two alternative space
# definitions; it is evaluated and discarded, so it has no effect.
'''
spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
                          action=DiscreteActionSpace(3),
                          reward=RewardSpace(1))
spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=23)}),
                          goal=None,
                          action=DiscreteActionSpace(31),
                          reward=RewardSpace(1))
'''

# Active definition: a vector observation of shape 23 and 21 discrete actions.
spaces = SpacesDefinition(
    state=StateSpace({'observation': VectorObservationSpace(shape=23)}),
    goal=None,
    action=DiscreteActionSpace(21),
    reward=RewardSpace(1),
)

# No environment is supplied (env_params=None); the spaces are given explicitly.
graph_manager = BatchRLGraphManager(
    agent_params=agent_params,
    env_params=None,
    spaces_definition=spaces,
    schedule_params=schedule_params,
    vis_params=VisualizationParameters(
        tensorboard=True,
        dump_csv=True,
        dump_signals_to_csv_every_x_episodes=1,
    ),
    reward_model_num_epochs=10,
    train_to_eval_ratio=0.4,
)
graph_manager.create_graph(task_parameters)
graph_manager.improve()
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.graph_managers.graph_manager import SimpleSchedule, SimpleScheduleWithoutEvaluation
from rl_coach.core_types import EnvironmentSteps, TrainingSteps
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.environments.first_test import ControlSuiteEnvironmentParameters

# Preset: a DQN agent on ControlSuiteEnvironmentParameters, exposed as the
# module-level ``graph_manager``.

#########
# Agent #
#########
agent_params = DQNAgentParameters()
# rename the input embedder key from 'observation' to 'measurements'
# agent_params.network_wrappers['main'].input_embedders_parameters['measurements'] = agent_params.network_wrappers['main'].input_embedders_parameters.pop('observation')

############
# Schedule #
############
schedule_params = SimpleSchedule()
schedule_params.heatup_steps = EnvironmentSteps(10)

########
# Test #
########
preset_validation_params = PresetValidationParameters()
# preset_validation_params.test = True
# preset_validation_params.min_reward_threshold = 20
# preset_validation_params.max_episodes_to_achieve_reward = 400

#################
# Visualization #
#################
vis_params = VisualizationParameters(render=False)

###############
# Environment #
###############
env_params = ControlSuiteEnvironmentParameters()

# Pass the vis_params configured above; the original built it and then handed
# a fresh VisualizationParameters() to the graph manager, discarding it.
graph_manager = BasicRLGraphManager(
    agent_params=agent_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
    preset_validation_params=preset_validation_params)
def lab_env():
    """Construct and return a ``LabEnvironment`` for level 'nav_maze_static_01'.

    The environment is built with a fixed seed (10), an 84x84 frame size,
    frame skip of 4, default visualization parameters and no human control.
    """
    return LabEnvironment(
        level='nav_maze_static_01',
        seed=10,
        frame_skip=4,
        human_control=False,
        rotation=20,
        width=84,
        height=84,
        fps=60,
        custom_reward_threshold=None,
        visualization_parameters=VisualizationParameters(),
        random_initialization_steps=30,
    )
# Environment # ############### SilverstoneInputFilter = InputFilter(is_a_reference_filter=True) SilverstoneInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter()) SilverstoneInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255)) SilverstoneInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(1)) env_params = GymVectorEnvironment() env_params.default_input_filter = SilverstoneInputFilter env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0' vis_params = VisualizationParameters() vis_params.dump_mp4 = False ######## # Test # ######## preset_validation_params = PresetValidationParameters() preset_validation_params.test = True preset_validation_params.min_reward_threshold = 400 preset_validation_params.max_episodes_to_achieve_reward = 1000 graph_manager = BasicRLGraphManager( agent_params=agent_params, env_params=env_params, schedule_params=schedule_params, vis_params=vis_params,
def _build_agent_input_filter(agent):
    """Return the reference ``InputFilter`` holding the observation filters for *agent*.

    One set of filters is registered per key of
    ``agent.network_settings['input_embedders']``, chosen by comparing the
    key against the ``Input`` enum values.
    """
    reference_filter = InputFilter(is_a_reference_filter=True)
    for sensor in agent.network_settings['input_embedders']:
        # Camera-style inputs: grayscale -> uint8 -> stack of 1.
        if sensor in (Input.LEFT_CAMERA.value, Input.CAMERA.value,
                      Input.OBSERVATION.value):
            reference_filter.add_observation_filter(
                sensor, 'to_grayscale', ObservationRGBToYFilter())
            reference_filter.add_observation_filter(
                sensor, 'to_uint8', ObservationToUInt8Filter(0, 255))
            reference_filter.add_observation_filter(
                sensor, 'stacking', ObservationStackingFilter(1))
        if sensor == Input.STEREO.value:
            reference_filter.add_observation_filter(
                sensor, 'to_uint8', ObservationToUInt8Filter(0, 255))
        if sensor == Input.LIDAR.value:
            reference_filter.add_observation_filter(
                sensor, 'clipping', ObservationClippingFilter(0.15, 1.0))
        if sensor == Input.SECTOR_LIDAR.value:
            # Module-level constants, one value per sector.
            reference_filter.add_observation_filter(
                sensor, 'binary',
                ObservationSectorDiscretizeFilter(
                    num_sectors=NUMBER_OF_LIDAR_SECTORS,
                    num_values_per_sector=1,
                    clipping_dist=SECTOR_LIDAR_CLIPPING_DIST))
        if sensor == Input.DISCRETIZED_SECTOR_LIDAR.value:
            # Sector layout comes from the agent's model metadata.
            metadata = agent.ctrl.model_metadata
            reference_filter.add_observation_filter(
                sensor, 'discrete',
                ObservationSectorDiscretizeFilter(
                    num_sectors=metadata.lidar_num_sectors,
                    num_values_per_sector=metadata.lidar_num_values_per_sector,
                    clipping_dist=metadata.lidar_clipping_dist))
    return reference_filter


def get_graph_manager(hp_dict, agent_list, run_phase_subject,
                      enable_domain_randomization=False, done_condition=any,
                      run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    """Build the ``MultiAgentGraphManager`` and the JSON dump of the hyperparameters.

    Args:
        hp_dict: hyperparameter overrides passed to ``get_updated_hyper_parameters``.
        agent_list: agents; those with truthy ``network_settings`` are set up
            as trainable, the rest are passed to the environment as
            non-trainable agents.
        run_phase_subject: stored on the environment parameters.
        enable_domain_randomization: stored on the environment parameters.
        done_condition: stored on the environment parameters and passed to
            the graph manager.
        run_type: forwarded to ``get_sac_params`` for SAC agents.
        pause_physics: stored on the environment parameters.
        unpause_physics: stored on the environment parameters.

    Returns:
        Tuple ``(graph_manager, params_json)``, where ``params_json`` is the
        JSON string of the hyperparameters resolved for the first agent's
        training algorithm (or for ``None`` when ``agent_list`` is empty).
    """
    ####################
    # Hyperparameters  #
    ####################
    # Only the first agent's training algorithm is used for the printed and
    # returned hyperparameter dump, since only one model is trained at a time.
    # TODO: clean this up when multi-agent training is supported.
    if agent_list:
        first_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm
    else:
        first_algorithm = None
    params = get_updated_hyper_parameters(hp_dict, first_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    max_episodes_key = HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value
    schedule_params.improve_steps = TrainingSteps(params[max_episodes_key])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = []
    non_trainable_agents_list = []
    for agent in agent_list:
        if not agent.network_settings:
            non_trainable_agents_list.append(agent)
            continue

        # Hyperparameters are resolved again per agent for its own algorithm.
        agent_algorithm = agent.ctrl.model_metadata.training_algorithm
        agent_hyper_params = get_updated_hyper_parameters(hp_dict, agent_algorithm)
        if TrainingAlgorithm.SAC.value == agent_algorithm:
            agent_params = get_sac_params(DeepRacerSACAgentParams(), agent,
                                          agent_hyper_params, run_type)
        else:
            agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(),
                                                  agent, agent_hyper_params)
        agent_params.env_agent = agent

        reference_filter = _build_agent_input_filter(agent)
        # The reference filter is called and its result is stored.
        # NOTE(review): presumably this yields a per-agent copy -- confirm.
        agent_params.input_filter = reference_filter()
        trainable_agents_list.append(agent_params)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params,
        done_condition=done_condition)
    return graph_manager, params_json