Example #1
def get_graph_manager(**hp_dict):
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["lr_decay_rate"] = float(hp_dict.get("lr_decay_rate", 0))
    params["lr_decay_steps"] = float(hp_dict.get("lr_decay_steps", 0))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))
    params["tensorboard"] = hp_dict.get("tensorboard", False)
    params["dump_mp4"] = hp_dict.get("dump_mp4", False)
    params["dump_gifs"] = hp_dict.get("dump_gifs", False)

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].learning_rate_decay_rate = params["lr_decay_rate"]
    agent_params.network_wrappers['main'].learning_rate_decay_steps = params["lr_decay_steps"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    # Replace the default CNN with single layer Conv2d(32, 3, 1)
#   agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = EmbedderScheme.Shallow

#   agent_params.network_wrappers['main'].input_embedders_parameters['observation'].dropout_rate = 0.3
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
#   agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Shallow
#   agent_params.network_wrappers['main'].middleware_parameters.dropout_rate = 0.3
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
#   agent_params.network_wrappers['main'].l2_regularization = 2e-5

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(params["num_episodes_between_training"])

    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                   params["e_greedy_value"],
                                                                   params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    # Optionally add an observation image color-perturbation filter
#   DeepRacerInputFilter.add_observation_filter('observation', 'perturb_color', ObservationColorPerturbation(0.2))
    # Rescale to a much smaller input when using shallow networks to avoid OOM
#   DeepRacerInputFilter.add_observation_filter('observation', 'rescaling',
#                                           ObservationRescaleToSizeFilter(ImageObservationSpace(np.array([84, 84, 3]),
#                                                                                            high=255)))
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter('observation', 'stacking',
                                                  ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.tensorboard = params["tensorboard"]
    vis_params.dump_mp4 = params["dump_mp4"]
    vis_params.dump_gifs = params["dump_gifs"]
    # AlwaysDumpFilter, MaxDumpFilter, EveryNEpisodesDumpFilter, SelectedPhaseOnlyDumpFilter
    vis_params.video_dump_filters = [AlwaysDumpFilter()]

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                        schedule_params=schedule_params, vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
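A minimal invocation sketch for the preset factory above; the override values are illustrative assumptions, and any key omitted from hp_dict falls back to the defaults listed at the top of the function:

# Hypothetical usage (override values are assumptions, not recommendations)
hp_dict = {"batch_size": 128, "lr": 0.0001, "loss_type": "huber"}
graph_manager, params_json = get_graph_manager(**hp_dict)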
Example #2
def get_graph_manager(hp_dict, agent_list, run_phase_subject):
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type",
                                              "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type",
                                      "Mean squared error").lower()
    params["num_episodes_between_training"] = int(
        hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(
        hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(
        hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        agent_params = DeepRacerAgentParams()
        if agent.network_settings:
            agent_params.env_agent = agent
            agent_params.network_wrappers['main'].learning_rate = params["lr"]

            agent_params.network_wrappers['main'].input_embedders_parameters = \
                create_input_embedder(agent.network_settings['input_embedders'],
                                      agent.network_settings['embedder_type'],
                                      agent.network_settings['activation_function'])
            agent_params.network_wrappers['main'].middleware_parameters = \
                create_middle_embedder(agent.network_settings['middleware_embedders'],
                                       agent.network_settings['embedder_type'],
                                       agent.network_settings['activation_function'])

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or\
                observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(
                        observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(
                        observation, 'stacking', ObservationStackingFilter(1))

                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))

                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'clipping',
                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()

            agent_params.network_wrappers['main'].batch_size = params[
                "batch_size"]
            agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
            agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

            if params["loss_type"] == "huber":
                agent_params.network_wrappers[
                    'main'].replace_mse_with_huber_loss = True

            agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
            agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
                1.0, 0, 1000000)
            agent_params.algorithm.beta_entropy = params["beta_entropy"]
            agent_params.algorithm.gae_lambda = 0.95
            agent_params.algorithm.discount = params["discount_factor"]
            agent_params.algorithm.optimization_epochs = params["num_epochs"]
            agent_params.algorithm.estimate_state_value_using_gae = True
            agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.num_consecutive_playing_steps = \
                EnvironmentEpisodes(params["num_episodes_between_training"])

            agent_params.algorithm.distributed_coach_synchronization_type = \
                DistributedCoachSynchronizationType.SYNC

            if params["exploration_type"] == "categorical":
                agent_params.exploration = CategoricalParameters()
            else:
                agent_params.exploration = EGreedyParameters()
                agent_params.exploration.epsilon_schedule = LinearSchedule(
                    1.0, params["e_greedy_value"], params["epsilon_steps"])

            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
Example #3
    def __init__(self, improve_steps=TrainingSteps(10000000000)):
        super().__init__()
        self.heatup_steps = EnvironmentSteps(0)
        self.evaluation_steps = EnvironmentEpisodes(0)
        self.steps_between_evaluation_periods = improve_steps
        self.improve_steps = improve_steps
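With these defaults the schedule amounts to pure training: no heatup, zero evaluation episodes, and an evaluation period as long as the improve phase itself. Spelled out against a plain ScheduleParameters (a sketch restating the constructor above, assuming it belongs to a ScheduleParameters subclass):

# Equivalent explicit configuration: evaluation is effectively disabled
# because evaluation_steps is zero and the period between evaluations
# spans the entire improve phase.
schedule_params = ScheduleParameters()
schedule_params.heatup_steps = EnvironmentSteps(0)
schedule_params.evaluation_steps = EnvironmentEpisodes(0)
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = schedule_params.improve_steps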
Example #4
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters, s3_writer):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics')
    rospy.wait_for_service('/gazebo/unpause_physics')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics', Empty)
    subscribe_to_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper('/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)
    graph_manager.create_graph(task_parameters=task_parameters, stop_physics=pause_physics,
                               start_physics=unpause_physics, empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir, checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range((graph_manager.improve_steps / act_steps.num_steps).num_steps):
        graph_manager.phase = RunPhase.TRAIN
        exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
        unpause_physics(EmptyRequest())
        graph_manager.reset_internal_state(True)
        graph_manager.act(act_steps, wait_for_full_episodes=graph_manager.agent_params.algorithm.act_for_full_episodes)
        graph_manager.reset_internal_state(True)
        time.sleep(1)
        pause_physics(EmptyRequest())

        graph_manager.phase = RunPhase.UNDEFINED
        new_checkpoint = -1
        if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                == DistributedCoachSynchronizationType.SYNC:
            unpause_physics(EmptyRequest())
            is_save_mp4_enabled = rospy.get_param('MP4_S3_BUCKET', None) and rollout_idx == 0
            if is_save_mp4_enabled:
                subscribe_to_save_mp4(EmptyRequest())
            if rollout_idx == 0:
                for _ in range(MIN_EVAL_TRIALS):
                    graph_manager.evaluate(EnvironmentSteps(1))

            while new_checkpoint < last_checkpoint + 1:
                exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
                if rollout_idx == 0:
                    graph_manager.evaluate(EnvironmentSteps(1))
                new_checkpoint = data_store.get_chkpoint_num('agent')
            if is_save_mp4_enabled:
                unsubscribe_from_save_mp4(EmptyRequest())
            s3_writer.upload_to_s3()

            pause_physics(EmptyRequest())
            data_store.load_from_store(expected_checkpoint_number=last_checkpoint+1)
            graph_manager.restore_checkpoint()

        if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                == DistributedCoachSynchronizationType.ASYNC:
            if new_checkpoint > last_checkpoint:
                graph_manager.restore_checkpoint()

        last_checkpoint = new_checkpoint
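The act_steps arithmetic above spreads the episodes of one rollout as evenly as possible across workers, giving the first episode_steps_per_rollout % num_workers workers one extra episode. As a standalone sketch (hypothetical helper name):

def split_episodes_across_workers(total_episodes, num_workers, rollout_idx):
    # Each worker plays floor(total / num_workers) episodes; the first
    # (total % num_workers) workers play one extra episode each.
    episodes = total_episodes // num_workers
    if rollout_idx < total_episodes % num_workers:
        episodes += 1
    return episodes

# e.g. 20 episodes over 3 workers -> workers 0 and 1 play 7, worker 2 plays 6
assert [split_episodes_across_workers(20, 3, i) for i in range(3)] == [7, 7, 6]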
Example #5
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters,
                   s3_writer):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    checkpoint_dir = task_parameters.checkpoint_restore_path
    wait_for_checkpoint(checkpoint_dir, data_store)
    wait_for_trainer_ready(checkpoint_dir, data_store)
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)
    subscribe_to_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir,
                                               checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range(
        (graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, s3_writer, rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.
                              agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.SYNC:
                is_save_mp4_enabled = rospy.get_param(
                    'MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    unpause_physics(EmptyRequest())
                    for _ in range(int(rospy.get_param('MIN_EVAL_TRIALS',
                                                       '5'))):
                        graph_manager.evaluate(EnvironmentSteps(1))

                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir, s3_writer,
                                         rollout_idx)
                    if rollout_idx == 0:
                        print(
                            "Additional evaluation. New Checkpoint: {}, Last Checkpoint: {}"
                            .format(new_checkpoint, last_checkpoint))
                        graph_manager.evaluate(EnvironmentSteps(1))
                    else:
                        time.sleep(5)
                    new_checkpoint = data_store.get_chkpoint_num('agent')
                if is_save_mp4_enabled:
                    unsubscribe_from_save_mp4(EmptyRequest())
                logger.info(
                    "Completed iteration tasks. Writing results to S3.")
                s3_writer.upload_to_s3()

                pause_physics(EmptyRequest())
                logger.info(
                    "Preparing to load checkpoint {}".format(last_checkpoint +
                                                             1))
                data_store.load_from_store(
                    expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint

    logger.info("Exited main loop. Done.")
Example #6
        context_vector = attention_weights * conv
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector

    def __str__(self):
        return "Convolution (num filters = {}, kernel size = {}, stride = {})"\
            .format(self.num_filters, self.kernel_size, self.strides)


####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ClippedPPOAgentParameters()

# agent_params.network_wrappers['main'].input_embedders_parameters = {
#         'left_camera': InputEmbedderParameters(activation_function='relu', dropout_rate=0.3),
#         'stereo': InputEmbedderParameters(activation_function='relu', dropout_rate=0.3)
#         }

agent_params.network_wrappers['main'].input_embedders_parameters = {
    'left_camera': InputEmbedderParameters(activation_function='relu'),
    'stereo': InputEmbedderParameters(activation_function='relu')
}  # completion assumed, mirroring the commented-out block above
Example #7
from rl_coach.agents.acer_agent import ACERAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.doom_environment import DoomEnvironmentParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.reward.reward_rescale_filter import RewardRescaleFilter

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ACERAgentParameters()

agent_params.algorithm.num_steps_between_gradient_updates = 30
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.algorithm.ratio_of_replay = 4
agent_params.algorithm.num_transitions_to_start_replay = 2000
agent_params.memory.max_size = (MemoryGranularity.Transitions, 100000)
agent_params.input_filter = InputFilter()
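RewardRescaleFilter is imported above but never attached. A plausible continuation, following the pattern used in stock Coach presets (an assumption, not part of the original snippet):

# Assumed continuation: rescale rewards by 1/100 before the agent sees them
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1 / 100.))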
Example #8
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    tf.reset_default_graph()  # just to clean things up; only needed for the tutorial

    # Experience Generating Agent parameters
    experience_generating_agent_params = DDQNAgentParameters()
    # schedule parameters
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size -
        experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(
        10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(
        1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
        100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(
        1)

    # NN configuration
    experience_generating_agent_params.network_wrappers[
        'main'].learning_rate = 0.0001
    experience_generating_agent_params.network_wrappers[
        'main'].batch_size = 128
    experience_generating_agent_params.network_wrappers[
        'main'].replace_mse_with_huber_loss = False
    experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # ER size
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters()
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule
    experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(
        1.0, 0.01, DATASET_SIZE)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)
    # set the agent params as before
    # agent_params = set_agent_params(DDQNAgentParameters)
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

    # 50 epochs of training (the entire dataset is used each epoch)
    # schedule_params.improve_steps = TrainingSteps(50)

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=
        experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(
            dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
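A hedged usage sketch; env_params is assumed to be a Gym-style environment parameters object built elsewhere in the tutorial, and the epoch count mirrors the "50 epochs" comment inside the function:

# Hypothetical call: generate dataset_size transitions with the DDQN agent,
# then run batch-RL training on that dataset.
train_using_experience_agent(env_params, n_epochs=50, dataset_size=DATASET_SIZE)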
Example #9
from rl_coach.agents.ddpg_agent import DDPGAgentParameters
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import EmbedderScheme
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import MujocoInputFilter, GymEnvironmentParameters, MujocoOutputFilter
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.exploration_policies.truncated_normal import TruncatedNormalParameters
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule, PieceWiseSchedule, ExponentialSchedule

steps_per_episode = 13

####################
# Block Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(400)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(3)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(2)

#####################
# DDPG Agent Params #
#####################
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters[
    'observation'].scheme = [Dense([300])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [
    Dense([300])
]
Example #10
import numpy as np

from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.graph_managers.hac_graph_manager import HACGraphManager
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import HindsightGoalSelectionMethod, \
    EpisodicHindsightExperienceReplayParameters
from rl_coach.memories.episodic.episodic_hrl_hindsight_experience_replay import \
    EpisodicHRLHindsightExperienceReplayParameters
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import GoalsSpace, ReachingGoal

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(40 * 4 * 64)  # 40 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(
    4 * 64)  # 4 small batches of 64 episodes
schedule_params.evaluation_steps = EnvironmentEpisodes(64)
schedule_params.heatup_steps = EnvironmentSteps(0)

polar_coordinates = False

#########
# Agent #
#########

if polar_coordinates:
    distance_from_goal_threshold = np.array([0.075, 0.75])
else:
    distance_from_goal_threshold = np.array([0.075, 0.075, 0.75])
Example #11
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters,
                   simtrace_video_s3_writers):
    """
    wait for first checkpoint then perform rollouts using the model
    """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    data_store = graph_manager.data_store

    # TODO: change 'agent' to the specific agent name for the multi-agent case
    checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path,
                                  "agent")
    graph_manager.data_store.wait_for_checkpoints()
    graph_manager.data_store.wait_for_trainer_ready()
    # Make the clients that will allow us to pause and unpause the physics
    rospy.wait_for_service('/gazebo/pause_physics_dr')
    rospy.wait_for_service('/gazebo/unpause_physics_dr')
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')
    pause_physics = ServiceProxyWrapper('/gazebo/pause_physics_dr', Empty)
    unpause_physics = ServiceProxyWrapper('/gazebo/unpause_physics_dr', Empty)
    subscribe_to_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir,
                                               checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range(
        (graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers,
                                 rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.
                              agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.SYNC:
                unpause_physics(EmptyRequest())
                is_save_mp4_enabled = rospy.get_param(
                    'MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    for _ in range(MIN_EVAL_TRIALS):
                        graph_manager.evaluate(EnvironmentSteps(1))

                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir,
                                         simtrace_video_s3_writers,
                                         rollout_idx)
                    if rollout_idx == 0:
                        graph_manager.evaluate(EnvironmentSteps(1))
                    new_checkpoint = data_store.get_coach_checkpoint_number(
                        'agent')
                if is_save_mp4_enabled:
                    unsubscribe_from_save_mp4(EmptyRequest())
                # upload simtrace and mp4 into s3 bucket
                for s3_writer in simtrace_video_s3_writers:
                    s3_writer.persist(utils.get_s3_kms_extra_args())
                pause_physics(EmptyRequest())
                data_store.load_from_store(
                    expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint
Example #12
def get_graph_manager(hp_dict,
                      agent_list,
                      run_phase_subject,
                      enable_domain_randomization=False,
                      done_condition=any,
                      run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None,
                      unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm \
        if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent,
                                              params, run_type)
            else:
                agent_params = get_clipped_ppo_params(
                    DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or\
                observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(
                        observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(
                        observation, 'stacking', ObservationStackingFilter(1))

                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))

                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'clipping',
                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics
    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params,
        done_condition=done_condition)
    return graph_manager, params_json
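A hedged invocation sketch; agent_list and run_phase_subject are assumed to be constructed by the surrounding training worker, and RunType.TRAINER is an assumption mirroring the RunType.ROLLOUT_WORKER default above:

# Hypothetical usage (all argument values are assumptions)
graph_manager, params_json = get_graph_manager(
    hp_dict={"batch_size": 64, "lr": 0.0003},
    agent_list=agent_list,
    run_phase_subject=run_phase_subject,
    run_type=str(RunType.TRAINER))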
Example #13
def get_clipped_ppo_params(agent_params, agent, params):
    """This function is algorithm specific settings required for Clipped PPO algorithm

    Args:
        agent_params (DeepRacerClippedPPOAgentParams): the agent parameters that will be used to create the RL agent
        agent (Agent): The agent object that was created either as part of create_rollout_agent or create_training_agent
        params (dict): dictionary of hyperparameters

    Returns:
        DeepRacerClippedPPOAgentParams: updated agent params object with hyperparameters and other required details
    """
    agent_params.network_wrappers['main'].learning_rate = params[
        HyperParameterKeys.LEARNING_RATE.value]

    agent_params.network_wrappers['main'].input_embedders_parameters = \
        create_input_embedder(agent.network_settings['input_embedders'],
                              agent.network_settings['embedder_type'],
                              agent.network_settings['activation_function'])
    agent_params.network_wrappers['main'].middleware_parameters = \
        create_middle_embedder(agent.network_settings['middleware_embedders'],
                               agent.network_settings['embedder_type'],
                               agent.network_settings['activation_function'])

    agent_params.network_wrappers['main'].batch_size = params[
        HyperParameterKeys.BATCH_SIZE.value]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params[HyperParameterKeys.LOSS_TYPE.value] == LossTypes.HUBER.value:
        agent_params.network_wrappers[
            'main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.beta_entropy = params[
        HyperParameterKeys.BETA_ENTROPY.value]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params[
        HyperParameterKeys.DISCOUNT_FACTOR.value]
    agent_params.algorithm.optimization_epochs = params[
        HyperParameterKeys.NUM_EPOCHS.value]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.num_consecutive_playing_steps = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])

    agent_params.algorithm.distributed_coach_synchronization_type = \
        DistributedCoachSynchronizationType.SYNC
    exploration_type = params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip()
    if exploration_type == ExplorationTypes.CATEGORICAL.value:
        agent_params.exploration = {
            DiscreteActionSpace:
            DeepRacerCategoricalParameters(
                use_stochastic_evaluation_policy=False),
            ScalableBoxActionSpace:
            AdditiveNoiseParameters()
        }
    elif exploration_type == ExplorationTypes.E_GREEDY.value:
        agent_params.exploration = {
            DiscreteActionSpace: EGreedyParameters(),
            ScalableBoxActionSpace: AdditiveNoiseParameters()
        }
        agent_params.exploration[DiscreteActionSpace].epsilon_schedule = \
            LinearSchedule(1.0,
                           params[HyperParameterKeys.E_GREEDY_VALUE.value],
                           params[HyperParameterKeys.EPSILON_STEPS.value])
    else:
        log_and_exit("Unknown exploration_type found in hyper parameters. "
                     "exploration_type: {}".format(exploration_type),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)

    agent_params.memory = DeepRacerMemoryParameters()
    return agent_params
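A usage sketch consistent with the docstring and with Example #12 above, where this helper is called with freshly constructed DeepRacer Clipped PPO parameters:

# Start from fresh DeepRacer Clipped PPO params and apply the tuned
# hyper-parameters for one trainable agent.
agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)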
Example #14
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter, InputFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.memories.memory import MemoryGranularity
from markov import environments

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(100000)  # 100K training steps
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = DQNAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
Example #15
def rollout_worker(graph_manager, checkpoint_dir, data_store, num_workers,
                   memory_backend_params):
    """
    wait for first checkpoint then perform rollouts using the model
    """

    wait_for_checkpoint(checkpoint_dir, data_store)

    task_parameters = TaskParameters()
    task_parameters.__dict__['checkpoint_restore_dir'] = checkpoint_dir

    graph_manager.create_graph(task_parameters)
    graph_manager.reset_internal_state()

    for level in graph_manager.level_managers:
        for agent in level.agents.values():
            agent.memory.memory_backend = deepracer_memory.DeepRacerRolloutBackEnd(
                memory_backend_params, graph_manager.agent_params.algorithm.
                num_consecutive_playing_steps)

    with graph_manager.phase_context(RunPhase.TRAIN):
        last_checkpoint = 0
        act_steps = math.ceil(
            (graph_manager.agent_params.algorithm.
             num_consecutive_playing_steps.num_steps) / num_workers)

        for i in range(int(graph_manager.improve_steps.num_steps / act_steps)):

            if should_stop(checkpoint_dir):
                break

            try:
                # This will only work for DeepRacerRacetrackEnv environments
                graph_manager.top_level_manager.environment.env.env.set_allow_servo_step_signals(
                    True)
            except Exception as ex:
                utils.json_format_logger(
                    "Method not defined in enviroment class: {}".format(ex),
                    **utils.build_system_error_dict(
                        utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        utils.SIMAPP_EVENT_ERROR_CODE_500))

            if type(graph_manager.agent_params.algorithm.
                    num_consecutive_playing_steps) == EnvironmentSteps:
                graph_manager.act(EnvironmentSteps(num_steps=act_steps),
                                  wait_for_full_episodes=graph_manager.
                                  agent_params.algorithm.act_for_full_episodes)
            elif type(graph_manager.agent_params.algorithm.
                      num_consecutive_playing_steps) == EnvironmentEpisodes:
                graph_manager.act(EnvironmentEpisodes(num_steps=act_steps))

            try:
                # This will only work for DeepRacerRacetrackEnv environments
                graph_manager.top_level_manager.environment.env.env.set_allow_servo_step_signals(
                    False)
                graph_manager.top_level_manager.environment.env.env.stop_car()
            except Exception as ex:
                utils.json_format_logger(
                    "Method not defined in enviroment class: {}".format(ex),
                    **utils.build_system_error_dict(
                        utils.SIMAPP_SIMULATION_WORKER_EXCEPTION,
                        utils.SIMAPP_EVENT_ERROR_CODE_500))

            new_checkpoint = get_latest_checkpoint(checkpoint_dir)

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.SYNC:
                while new_checkpoint is None or new_checkpoint <= last_checkpoint:
                    if should_stop(checkpoint_dir):
                        break
                    if data_store:
                        data_store.load_from_store(
                            expected_checkpoint_number=new_checkpoint)
                    new_checkpoint = get_latest_checkpoint(checkpoint_dir)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint is not None and new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            if new_checkpoint is not None:
                last_checkpoint = new_checkpoint
Example #16
import tensorflow as tf

from rl_coach.agents.ddqn_bcq_agent import DDQNBCQAgentParameters, KNNParameters, NNImitationModelParameters
from rl_coach.architectures.head_parameters import QHeadParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from rl_coach.graph_managers.graph_manager import ScheduleParameters

DATASET_SIZE = 200000

####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
# schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.improve_steps = TrainingSteps(400)  # 400 epochs
schedule_params.steps_between_evaluation_periods = TrainingSteps(1)
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(DATASET_SIZE)

#########
# Agent #
#########

agent_params = DDQNBCQAgentParameters()
agent_params.network_wrappers['main'].batch_size = 128
# TODO cross-DL framework abstraction for a constant initializer?
agent_params.network_wrappers['main'].heads_parameters = [
    QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))
]

agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(
    100)
Example #17
from rl_coach.agents.ppo_agent import PPOAgentParameters
from rl_coach.architectures.tensorflow_components.architecture import Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase
from rl_coach.environments.environment import MaxDumpMethod, SelectedPhaseOnlyDumpMethod, SingleLevelSelection
from rl_coach.environments.gym_environment import Mujoco, mujoco_v2, MujocoInputFilter
from rl_coach.exploration_policies.continuous_entropy import ContinuousEntropyParameters
from rl_coach.filters.observation.observation_normalization_filter import ObservationNormalizationFilter
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2000)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = PPOAgentParameters()
agent_params.network_wrappers['actor'].learning_rate = 0.001
agent_params.network_wrappers['critic'].learning_rate = 0.001

agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([64])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([64])]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense([64])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([64])]

agent_params.input_filter = MujocoInputFilter()
Example #18
def get_graph_manager(**hp_dict):
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type",
                                              "huber")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type",
                                      "Mean squared error").lower()
    params["num_episodes_between_training"] = int(
        hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(
        hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(
        hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters[
        'observation'].activation_function = 'relu'
    agent_params.network_wrappers[
        'main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers[
            'main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
        1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])

    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale',
                                                ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'stacking',
        ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False
    vis_params.dump_csv = True
    vis_params.print_networks_summary = True
    vis_params.dump_parameters_documentation = False
    vis_params.dump_signals_to_csv_every_x_episodes = 1
    vis_params.tensorboard = True

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
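As a usage note, a minimal sketch of how this factory might be invoked; the keyword names are the ones read via hp_dict above, while the values are arbitrary examples.

# Hypothetical invocation; any omitted key falls back to the defaults above.
graph_manager, params_json = get_graph_manager(batch_size=128,
                                               lr=1e-4,
                                               loss_type="huber",
                                               term_cond_max_episodes=50000)
print(params_json)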
Example #19
from rl_coach.agents.td3_agent import TD3AgentParameters
from rl_coach.architectures.layers import Dense
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, EmbedderScheme
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import GymVectorEnvironment, mujoco_v2
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.exploration_policies.truncated_normal import TruncatedNormalParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(800)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(5000)
schedule_params.evaluation_steps = EnvironmentEpisodes(0)  # Neta: 0
schedule_params.heatup_steps = EnvironmentEpisodes(100)

#########
# Agent #
#########
agent_params = TD3AgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters[
    'observation'].scheme = [Dense(400)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [
    Dense(300)
]

agent_params.network_wrappers['critic'].input_embedders_parameters[
    'observation'].scheme = [Dense(400)]
Example #20
def rollout_worker(graph_manager, num_workers, rollout_idx, task_parameters,
                   simtrace_video_s3_writers, pause_physics, unpause_physics):
    """
    Wait for the first checkpoint, then perform rollouts using the model.
    """
    if not graph_manager.data_store:
        raise AttributeError("None type for data_store object")

    is_sageonly = check_is_sageonly()

    data_store = graph_manager.data_store

    # TODO: change 'agent' to the specific agent name for the multi-agent case
    checkpoint_dir = os.path.join(task_parameters.checkpoint_restore_path,
                                  "agent")
    graph_manager.data_store.wait_for_checkpoints()
    graph_manager.data_store.wait_for_trainer_ready()
    # Wait for the required cancel services to become available.
    # Do this only for a RoboMaker job.
    # if not is_sageonly:
    #     rospy.wait_for_service('/robomaker/job/cancel')

    # Make the clients that will allow us to start and stop saving mp4 videos
    rospy.wait_for_service('/racecar/save_mp4/subscribe_to_save_mp4')
    rospy.wait_for_service('/racecar/save_mp4/unsubscribe_from_save_mp4')

    subscribe_to_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/subscribe_to_save_mp4', Empty)
    unsubscribe_from_save_mp4 = ServiceProxyWrapper(
        '/racecar/save_mp4/unsubscribe_from_save_mp4', Empty)
    graph_manager.create_graph(task_parameters=task_parameters,
                               stop_physics=pause_physics,
                               start_physics=unpause_physics,
                               empty_service_call=EmptyRequest)

    chkpt_state_reader = CheckpointStateReader(checkpoint_dir,
                                               checkpoint_state_optional=False)
    last_checkpoint = chkpt_state_reader.get_latest().num

    # this worker should play a fraction of the total playing steps per rollout
    episode_steps_per_rollout = graph_manager.agent_params.algorithm.num_consecutive_playing_steps.num_steps
    act_steps = int(episode_steps_per_rollout / num_workers)
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    act_steps = EnvironmentEpisodes(act_steps)

    configure_environment_randomizer()

    for _ in range(
        (graph_manager.improve_steps / act_steps.num_steps).num_steps):
        # Collect profiler information only if IS_PROFILER_ON is true
        with utils.Profiler(s3_bucket=PROFILER_S3_BUCKET,
                            s3_prefix=PROFILER_S3_PREFIX,
                            output_local_path=ROLLOUT_WORKER_PROFILER_PATH,
                            enable_profiling=IS_PROFILER_ON):
            graph_manager.phase = RunPhase.TRAIN
            exit_if_trainer_done(checkpoint_dir, simtrace_video_s3_writers,
                                 rollout_idx)
            unpause_physics(EmptyRequest())
            graph_manager.reset_internal_state(True)
            graph_manager.act(act_steps,
                              wait_for_full_episodes=graph_manager.
                              agent_params.algorithm.act_for_full_episodes)
            graph_manager.reset_internal_state(True)
            time.sleep(1)
            pause_physics(EmptyRequest())

            graph_manager.phase = RunPhase.UNDEFINED
            new_checkpoint = -1
            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.SYNC:
                unpause_physics(EmptyRequest())
                is_save_mp4_enabled = rospy.get_param(
                    'MP4_S3_BUCKET', None) and rollout_idx == 0
                if is_save_mp4_enabled:
                    subscribe_to_save_mp4(EmptyRequest())
                if rollout_idx == 0:
                    for _ in range(int(rospy.get_param('MIN_EVAL_TRIALS',
                                                       '5'))):
                        graph_manager.evaluate(EnvironmentSteps(1))

                # For a SageMaker-only job, run only a limited number of evaluations
                # for better performance. Pausing the physics makes its performance
                # the same as a RoboMaker + SageMaker job.
                if is_sageonly:
                    if is_save_mp4_enabled:
                        unsubscribe_from_save_mp4(EmptyRequest())
                    # upload simtrace and mp4 into s3 bucket
                    for s3_writer in simtrace_video_s3_writers:
                        s3_writer.persist(utils.get_s3_kms_extra_args())
                    graph_manager.phase = RunPhase.WAITING
                    pause_physics(EmptyRequest())

                while new_checkpoint < last_checkpoint + 1:
                    exit_if_trainer_done(checkpoint_dir,
                                         simtrace_video_s3_writers,
                                         rollout_idx)
                    # Continuously run the evaluation only for a SageMaker + RoboMaker job
                    if not is_sageonly and rollout_idx == 0:
                        print(
                            "Additional evaluation. New Checkpoint: {}, Last Checkpoint: {}"
                            .format(new_checkpoint, last_checkpoint))
                        graph_manager.evaluate(EnvironmentSteps(1))
                    else:
                        time.sleep(5)
                    new_checkpoint = data_store.get_coach_checkpoint_number(
                        'agent')

                # Save the mp4 for Robo+Sage jobs
                if not is_sageonly:
                    if is_save_mp4_enabled:
                        unsubscribe_from_save_mp4(EmptyRequest())
                    # upload simtrace and mp4 into s3 bucket
                    for s3_writer in simtrace_video_s3_writers:
                        s3_writer.persist(utils.get_s3_kms_extra_args())
                    pause_physics(EmptyRequest())
                data_store.load_from_store(
                    expected_checkpoint_number=last_checkpoint + 1)
                graph_manager.restore_checkpoint()

            if graph_manager.agent_params.algorithm.distributed_coach_synchronization_type\
                    == DistributedCoachSynchronizationType.ASYNC:
                if new_checkpoint > last_checkpoint:
                    graph_manager.restore_checkpoint()

            last_checkpoint = new_checkpoint

    logger.info("Exited main loop. Done.")
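The per-worker split near the top of this function distributes the episodes in num_consecutive_playing_steps as evenly as possible, handing the remainder to the lowest-indexed workers. A standalone sketch of the same arithmetic, with illustrative numbers only:

# Illustrative only: 20 episodes per rollout split across 3 workers,
# using the same remainder rule as rollout_worker above.
episode_steps_per_rollout = 20
num_workers = 3
for rollout_idx in range(num_workers):
    act_steps = episode_steps_per_rollout // num_workers
    if rollout_idx < episode_steps_per_rollout % num_workers:
        act_steps += 1
    print(rollout_idx, act_steps)  # prints: 0 7, then 1 7, then 2 6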
Example #21
from rl_coach.agents.dqn_agent import DQNAgentParameters
from rl_coach.architectures.layers import Dense
from rl_coach.base_parameters import EmbedderScheme, InputEmbedderParameters
from rl_coach.core_types import EnvironmentEpisodes, EnvironmentSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.memories.episodic.episodic_hindsight_experience_replay import \
    EpisodicHindsightExperienceReplayParameters, HindsightGoalSelectionMethod
from rl_coach.memories.memory import MemoryGranularity
from rl_coach.schedules import ConstantSchedule
from rl_coach.spaces import GoalsSpace, ReachingGoal

bit_length = 20

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(16 * 50 * 200)  # 200 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(16 * 50)  # 50 cycles
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].batch_size = 128
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(256)]
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'state': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
agent_params.algorithm.discount = 0.98
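This snippet is cut off before the memory is configured, but the imports above point at the hindsight-experience-replay wiring. A sketch of the likely continuation, modeled on rl_coach's bit-flip HER preset; the threshold, reward, and epsilon values here are assumptions, not taken from this snippet.

# Assumed continuation, modeled on rl_coach's bit-flip HER preset.
agent_params.memory = EpisodicHindsightExperienceReplayParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
agent_params.memory.hindsight_transitions_per_regular_transition = 4
agent_params.memory.hindsight_goal_selection_method = \
    HindsightGoalSelectionMethod.Future
agent_params.memory.goals_space = GoalsSpace(
    goal_name='state',
    reward_type=ReachingGoal(distance_from_goal_threshold=0,
                             goal_reaching_reward=0,
                             default_reward=-1),
    distance_metric=GoalsSpace.DistanceMetric.Euclidean)
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)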
Example #22
def get_graph_manager(hp_dict,
                      agent_list,
                      run_phase_subject,
                      enable_domain_randomization=False,
                      done_condition=any,
                      run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None,
                      unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    # Note: The following three lines are hard-coded to pick the first agent's
    # training algorithm and dump the hyper-parameters for that algorithm into
    # JSON for training jobs (so that the console displays the training
    # hyper-parameters correctly), since right now we only support training
    # one model at a time.
    # TODO: clean these lines up when we support multi-agent training.
    training_algorithm = agent_list[
        0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            training_algorithm = agent.ctrl.model_metadata.training_algorithm
            params = get_updated_hyper_parameters(hp_dict, training_algorithm)
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent,
                                              params, run_type)
            else:
                agent_params = get_clipped_ppo_params(
                    DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation in (Input.LEFT_CAMERA.value, Input.CAMERA.value,
                                   Input.OBSERVATION.value):
                    input_filter.add_observation_filter(
                        observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(
                        observation, 'stacking', ObservationStackingFilter(1))

                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))

                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'clipping',
                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    sector_binary_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=NUMBER_OF_LIDAR_SECTORS,
                        num_values_per_sector=1,
                        clipping_dist=SECTOR_LIDAR_CLIPPING_DIST)
                    input_filter.add_observation_filter(
                        observation, 'binary', sector_binary_filter)
                if observation == Input.DISCRETIZED_SECTOR_LIDAR.value:
                    num_sectors = agent.ctrl.model_metadata.lidar_num_sectors
                    num_values_per_sector = agent.ctrl.model_metadata.lidar_num_values_per_sector
                    clipping_dist = agent.ctrl.model_metadata.lidar_clipping_dist

                    sector_discretize_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=num_sectors,
                        num_values_per_sector=num_values_per_sector,
                        clipping_dist=clipping_dist)
                    input_filter.add_observation_filter(
                        observation, 'discrete', sector_discretize_filter)
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics
    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params,
        done_condition=done_condition)
    return graph_manager, params_json
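A minimal sketch of invocation; agent_list and run_phase_subject are hypothetical placeholders for objects the DeepRacer training entry point builds elsewhere.

# Hypothetical caller; agent_list and run_phase_subject come from the
# surrounding DeepRacer code, not from this example.
graph_manager, params_json = get_graph_manager(
    hp_dict={"batch_size": 64, "lr": 3e-4},
    agent_list=agent_list,
    run_phase_subject=run_phase_subject,
    run_type=str(RunType.TRAINER))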