def evaluate(params):
    # file params
    experiment_path = os.path.join(params.output_data_dir)
    logger.experiment_path = os.path.join(experiment_path, 'evaluation')
    params.checkpoint_restore_dir = os.path.join(params.input_data_dir, 'checkpoint')
    checkpoint_file = os.path.join(params.checkpoint_restore_dir, 'checkpoint')

    inplace_change(checkpoint_file, "/opt/ml/output/data/checkpoint", ".")
    # Note that due to a tensorflow issue (https://github.com/tensorflow/tensorflow/issues/9146) we need to replace
    # the absolute path for the evaluation-from-a-checkpointed-model to work

    vis_params = VisualizationParameters()
    vis_params.dump_gifs = True

    task_params = TaskParameters(evaluate_only=True, experiment_path=logger.experiment_path)
    task_params.__dict__ = add_items_to_dict(task_params.__dict__, params.__dict__)

    graph_manager = BasicRLGraphManager(
        agent_params=ClippedPPOAgentParameters(),
        env_params=GymVectorEnvironment(level='TSP_env:TSPEasyEnv'),
        schedule_params=ScheduleParameters(),
        vis_params=vis_params
    )
    graph_manager = graph_manager.create_graph(task_parameters=task_params)
    graph_manager.evaluate(EnvironmentSteps(5))
Ejemplo n.º 2
0
# critic (q) network parameters
agent_params.network_wrappers['q'].heads_parameters[0].network_layers_sizes = (
    32, 32)
agent_params.network_wrappers['q'].batch_size = 32
agent_params.network_wrappers['q'].learning_rate = 0.0003
agent_params.network_wrappers['q'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['q'].adam_optimizer_beta2 = 0.999
agent_params.network_wrappers['q'].input_embedders_parameters['forward_camera'] = \
    agent_params.network_wrappers['q'].input_embedders_parameters.pop('observation')

# actor (policy) network parameters
agent_params.network_wrappers['policy'].batch_size = 32
agent_params.network_wrappers['policy'].learning_rate = 0.0003
agent_params.network_wrappers['policy'].middleware_parameters.scheme = [
    Dense(32)
]
agent_params.network_wrappers['policy'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['policy'].adam_optimizer_beta2 = 0.999
agent_params.network_wrappers['policy'].input_embedders_parameters['forward_camera'] = \
    agent_params.network_wrappers['policy'].input_embedders_parameters.pop('observation')

###############
# Environment #
###############
env_params = CarlaEnvironmentParameters(level='town2')

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=VisualizationParameters())
# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER size
agent_params.memory.max_size = (MemoryGranularity.Transitions, 40000)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)
agent_params.pre_network_filter = InputFilter()
agent_params.pre_network_filter.add_observation_filter('observation', 'input_noise',
                                                       ObservationAdversarialNoiseFilter(0.1))

###############
# Environment #
###############
env_params = GymVectorEnvironment(level='MountainCar-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
agent_params.memory.load_memory_from_file_path = CsvDataset(DATATSET_PATH,
                                                            is_episodic=True)
'''
spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
                          action=DiscreteActionSpace(3),
                          reward=RewardSpace(1))

spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=23)}),
                          goal=None,
                          action=DiscreteActionSpace(31),
                          reward=RewardSpace(1))
'''
spaces = SpacesDefinition(state=StateSpace(
    {'observation': VectorObservationSpace(shape=23)}),
                          goal=None,
                          action=DiscreteActionSpace(21),
                          reward=RewardSpace(1))

graph_manager = BatchRLGraphManager(
    agent_params=agent_params,
    env_params=None,
    spaces_definition=spaces,
    schedule_params=schedule_params,
    vis_params=VisualizationParameters(tensorboard=True,
                                       dump_csv=True,
                                       dump_signals_to_csv_every_x_episodes=1),
    reward_model_num_epochs=10,
    train_to_eval_ratio=0.4)
graph_manager.create_graph(task_parameters)
graph_manager.improve()
Ejemplo n.º 5
0
def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type",
                                              "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type",
                                      "Mean squared error").lower()
    params["num_episodes_between_training"] = int(
        hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(
        hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(
        hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters[
        'observation'].activation_function = 'relu'
    agent_params.network_wrappers[
        'main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers[
            'main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(
        1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])

    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale',
                                                ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter(
        'observation', 'stacking',
        ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = BasicRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
agent_params.pre_network_filter.add_observation_filter('observation', 'normalize_observation',
    ObservationNormalizationFilter(name='normalize_observation'))

###############
# Environment #
###############
config = {
    'eplus_path': '/usr/local/EnergyPlus-8-8-0/',
    'weather_file': 'weather/USA_CA_San.Francisco.Intl.AP.724940_TMY3.epw'
}
env_params = GymVectorEnvironment(level='eplus.envs.data_center_env:DataCenterEnv')
env_params.additional_simulator_parameters = {'config': config }

#################
# Visualization #
#################

vis_params = VisualizationParameters()
vis_params.dump_gifs = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 400

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
def get_graph_manager(**hp_dict):
    ####################
    # All Default Parameters #
    ####################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "huber")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()

    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(params["num_episodes_between_training"])

    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(1.0,
                                                                   params["e_greedy_value"],
                                                                   params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    SilverstoneInputFilter = InputFilter(is_a_reference_filter=True)
    SilverstoneInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    SilverstoneInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    SilverstoneInputFilter.add_observation_filter('observation', 'stacking',
                                                  ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = SilverstoneInputFilter
    env_params.level = 'SilverstoneRacetrack-Discrete-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                        schedule_params=schedule_params, vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
Ejemplo n.º 8
0
]
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'observation'].scheme = [Dense([400])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [
    Dense([300])
]
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'action'].scheme = EmbedderScheme.Empty

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = SingleLevelSelection(mujoco_v2)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [
    SelectedPhaseOnlyDumpMethod(RunPhase.TEST),
    MaxDumpMethod()
]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 400
preset_validation_params.max_episodes_to_achieve_reward = 1000
preset_validation_params.reward_test_level = 'inverted_pendulum'
preset_validation_params.trace_test_levels = ['inverted_pendulum', 'hopper']
Ejemplo n.º 9
0
bottom_critic.learning_rate = 0.001
bottom_critic.batch_size = 4096

agents_params = [top_agent_params, bottom_agent_params]

###############
# Environment #
###############
time_limit = 1000

env_params = GymVectorEnvironment(
    level="rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals")
env_params.additional_simulator_parameters = {
    "time_limit": time_limit,
    "random_goals_instead_of_standing_goal": False,
    "polar_coordinates": polar_coordinates,
    "goal_reaching_thresholds": distance_from_goal_threshold
}
env_params.frame_skip = 10
env_params.custom_reward_threshold = -time_limit + 1

vis_params = VisualizationParameters()
vis_params.native_rendering = False

graph_manager = HACGraphManager(
    agents_params=agents_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=vis_params,
    consecutive_steps_to_run_non_top_levels=EnvironmentSteps(40))
Ejemplo n.º 10
0
    def get_graph_manager_from_args(self, args: argparse.Namespace) -> 'GraphManager':
        """
        Return the graph manager according to the command line arguments given by the user.
        :param args: the arguments given by the user
        :return: the graph manager, not bound to task_parameters yet.
        """
        graph_manager = None

        # if a preset was given we will load the graph manager for the preset
        if args.preset is not None:
            graph_manager = short_dynamic_import(args.preset, ignore_module_case=True)

        # for human play we need to create a custom graph manager
        if args.play:
            from rl_coach.agents.human_agent import HumanAgentParameters

            env_params = short_dynamic_import(args.environment_type, ignore_module_case=True)()
            env_params.human_control = True
            schedule_params = HumanPlayScheduleParameters()
            graph_manager = BasicRLGraphManager(HumanAgentParameters(), env_params, schedule_params, VisualizationParameters())

        # Set framework
        # Note: Some graph managers (e.g. HAC preset) create multiple agents and the attribute is called agents_params
        if hasattr(graph_manager, 'agent_params'):
            for network_parameters in graph_manager.agent_params.network_wrappers.values():
                network_parameters.framework = args.framework
        elif hasattr(graph_manager, 'agents_params'):
            for ap in graph_manager.agents_params:
                for network_parameters in ap.network_wrappers.values():
                    network_parameters.framework = args.framework

        if args.level:
            if isinstance(graph_manager.env_params.level, SingleLevelSelection):
                graph_manager.env_params.level.select(args.level)
            else:
                graph_manager.env_params.level = args.level

        # set the seed for the environment
        if args.seed is not None and graph_manager.env_params is not None:
            graph_manager.env_params.seed = args.seed

        # visualization
        graph_manager.visualization_parameters.dump_gifs = graph_manager.visualization_parameters.dump_gifs or args.dump_gifs
        graph_manager.visualization_parameters.dump_mp4 = graph_manager.visualization_parameters.dump_mp4 or args.dump_mp4
        graph_manager.visualization_parameters.render = args.render
        graph_manager.visualization_parameters.tensorboard = args.tensorboard
        graph_manager.visualization_parameters.print_networks_summary = args.print_networks_summary

        # update the custom parameters
        if args.custom_parameter is not None:
            unstripped_key_value_pairs = [pair.split('=') for pair in args.custom_parameter.split(';')]
            stripped_key_value_pairs = [tuple([pair[0].strip(), pair[1].strip()]) for pair in
                                        unstripped_key_value_pairs if len(pair) == 2]

            # load custom parameters into run_dict
            for key, value in stripped_key_value_pairs:
                exec("graph_manager.{}={}".format(key, value))

        return graph_manager
Ejemplo n.º 11
0
agent_params.pre_network_filter.add_observation_filter(
    "observation",
    "normalize_observation",
    ObservationNormalizationFilter(name="normalize_observation"),
)

###############
# Environment #
###############
env_params = GymVectorEnvironment(
    level="patient_envs:PatientContinuousMountainCar")

#################
# Visualization #
#################
vis_params = VisualizationParameters()
vis_params.dump_gifs = True
vis_params.video_dump_filters = [
    SelectedPhaseOnlyDumpFilter(RunPhase.TEST),
    MaxDumpFilter()
]

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(
# ER size
agent_params.memory.max_size = (MemoryGranularity.Transitions, 50000)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.05, 100000)

################
#  Environment #
################
env_params = GymVectorEnvironment(level='patient_envs:PatientMountainCar')

#################
# Visualization #
#################
vis_params = VisualizationParameters()
vis_params.dump_gifs = True
vis_params.video_dump_filters = [SelectedPhaseOnlyDumpFilter(RunPhase.TEST), MaxDumpFilter()]

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = -200
preset_validation_params.max_episodes_to_achieve_reward = 125

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
Ejemplo n.º 13
0
def get_graph_manager(hp_dict,
                      agent_list,
                      run_phase_subject,
                      enable_domain_randomization=False,
                      done_condition=any,
                      run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None,
                      unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    training_algorithm = agent_list[
        0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent,
                                              params, run_type)
            else:
                agent_params = get_clipped_ppo_params(
                    DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys(
            ):
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or\
                observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(
                        observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(
                        observation, 'stacking', ObservationStackingFilter(1))

                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))

                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'clipping',
                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics
    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params,
        done_condition=done_condition)
    return graph_manager, params_json
from rl_coach.agents.qr_dqn_agent import QuantileRegressionDQNAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters
from rl_coach.environments.environment import SingleLevelSelection
from rl_coach.environments.gym_environment import Atari, atari_deterministic_v4, atari_schedule
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager

#########
# Agent #
#########
agent_params = QuantileRegressionDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00005  # called alpha in the paper
agent_params.algorithm.huber_loss_interval = 1  # k = 0 for strict quantile loss, k = 1 for Huber quantile loss

###############
# Environment #
###############
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders']

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=atari_schedule, vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)
Ejemplo n.º 15
0
                                            'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)}
bottom_critic.embedding_merger_type = EmbeddingMergerType.Concat
bottom_critic.middleware_parameters.scheme = [Dense([64])] * 3
bottom_critic.learning_rate = 0.001
bottom_critic.batch_size = 4096

agents_params = [top_agent_params, bottom_agent_params]

###############
# Environment #
###############
time_limit = 1000

env_params = Mujoco()
env_params.level = "rl_coach.environments.mujoco.pendulum_with_goals:PendulumWithGoals"
env_params.additional_simulator_parameters = {"time_limit": time_limit,
                                              "random_goals_instead_of_standing_goal": False,
                                              "polar_coordinates": polar_coordinates,
                                              "goal_reaching_thresholds": distance_from_goal_threshold}
env_params.frame_skip = 10
env_params.custom_reward_threshold = -time_limit + 1

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST)]
vis_params.dump_mp4 = False
vis_params.native_rendering = False

graph_manager = HACGraphManager(agents_params=agents_params, env_params=env_params,
                                schedule_params=schedule_params, vis_params=vis_params,
                                consecutive_steps_to_run_non_top_levels=EnvironmentSteps(40))
Ejemplo n.º 16
0
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'observation'].scheme = EmbedderScheme.Empty
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'action'].scheme = EmbedderScheme.Empty
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [
    Dense(400), Dense(300)
]

agent_params.exploration = TruncatedNormalParameters()
agent_params.exploration.noise_as_percentage_from_action_space = False
agent_params.exploration.evaluation_noise = 0  # Neta new
agent_params.algorithm.use_target_network_for_evaluation = True
agent_params.algorithm.act_for_full_episodes = True

###############
# Environment #
###############
env_params = GymVectorEnvironment()
env_params.level = './environment.py:DistillerWrapperEnvironment'

vis_params = VisualizationParameters()
vis_params.dump_parameters_documentation = False
vis_params.render = True
vis_params.native_rendering = True
vis_params.dump_signals_to_csv_every_x_episodes = 1

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=vis_params)
Ejemplo n.º 17
0
env_params = GymVectorEnvironment(level='TSP_env:TSPEasyEnv')

#########
# Agent #
#########

agent_params = ClippedPPOAgentParameters()

#################
# Visualization #
#################

env_params.frame_skip = 5  #to make sure the gifs work without skipping steps

vis_params = VisualizationParameters()
vis_params.dump_gifs = True
#vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]

#experiment_name = "TSPEasy"
#experiment_name = logger.get_experiment_name(experiment_name)
#experiment_path = logger.get_experiment_path(experiment_name)

#task_params = TaskParameters(experiment_path=experiment_path)

####################
# Graph Scheduling #
####################

schedule_params = SimpleSchedule()
schedule_params.improve_steps = TrainingSteps(100000)
    def __init__(
            self,
            agents_params: List[AgentParameters],
            env_params: EnvironmentParameters,
            schedule_params: ScheduleParameters,
            vis_params: VisualizationParameters = VisualizationParameters(),
            preset_validation_params:
        PresetValidationParameters = PresetValidationParameters(),
            done_condition=any):
        self.done_condition = done_condition
        self.sess = {agent_params.name: None for agent_params in agents_params}
        self.level_managers = []  # type: List[MultiAgentLevelManager]
        self.top_level_manager = None
        self.environments = []
        self.set_schedule_params(schedule_params)
        self.visualization_parameters = vis_params
        self.name = 'multi_agent_graph'
        self.task_parameters = None
        self._phase = self.phase = RunPhase.UNDEFINED
        self.preset_validation_params = preset_validation_params
        self.reset_required = False
        self.num_checkpoints_to_keep = 4  # TODO: make this a parameter

        # timers
        self.graph_creation_time = None
        self.last_checkpoint_saving_time = time.time()

        # counters
        self.total_steps_counters = {
            RunPhase.HEATUP: TotalStepsCounter(),
            RunPhase.TRAIN: TotalStepsCounter(),
            RunPhase.TEST: TotalStepsCounter()
        }
        self.checkpoint_id = 0

        self.checkpoint_saver = {
            agent_params.name: None
            for agent_params in agents_params
        }
        self.checkpoint_state_updater = None
        self.graph_logger = Logger()
        self.data_store = None
        self.is_batch_rl = False
        self.time_metric = TimeTypes.EpisodeNumber

        self.env_params = env_params
        self.agents_params = agents_params
        self.agent_params = agents_params[0]  # ...(find a better way)...

        for agent_index, agent_params in enumerate(agents_params):
            if len(agents_params) == 1:
                agent_params.name = "agent"
            else:
                agent_params.name = "agent_{}".format(agent_index)
            agent_params.visualization = copy.copy(vis_params)
            if agent_params.input_filter is None:
                agent_params.input_filter = copy.copy(
                    env_params.default_input_filter())
            if agent_params.output_filter is None:
                agent_params.output_filter = copy.copy(
                    env_params.default_output_filter())
# ER size
agent_params.memory.max_size = (MemoryGranularity.Transitions, 40000)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

################
#  Environment #
################
env_params = GymVectorEnvironment(level='CartPole-v0')

#################
# Visualization #
#################

vis_params = VisualizationParameters()
vis_params.dump_gifs = True

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
Ejemplo n.º 20
0
# E-Greedy schedule
experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(
    1.0, 0.01, DATASET_SIZE)
experience_generating_agent_params.exploration.evaluation_epsilon = 0

################
#  Environment #
################
env_params = GymVectorEnvironment(level='MountainCar-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 50
preset_validation_params.read_csv_tries = 500

graph_manager = BatchRLGraphManager(
    agent_params=agent_params,
    experience_generating_agent_params=experience_generating_agent_params,
    experience_generating_schedule_params=experience_generating_schedule_params,
    env_params=env_params,
    schedule_params=schedule_params,
    vis_params=VisualizationParameters(dump_signals_to_csv_every_x_episodes=1),
    preset_validation_params=preset_validation_params,
    reward_model_num_epochs=30,
    train_to_eval_ratio=0.4)
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(20)
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(20)
agent_params.exploration = CategoricalParameters()

###############
# Environment #
###############
turtlebot3_input_filter = InputFilter(is_a_reference_filter=True)
turtlebot3_input_filter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
turtlebot3_input_filter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
turtlebot3_input_filter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(1))

env_params = GymVectorEnvironment()
env_params.default_input_filter = turtlebot3_input_filter
env_params.level = 'SageMaker-TurtleBot3-Discrete-v0'

vis_params = VisualizationParameters()
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 400
preset_validation_params.max_episodes_to_achieve_reward = 1000

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
Ejemplo n.º 22
0
def train_using_experience_agent(env_params, n_epochs, dataset_size):
    tf.reset_default_graph(
    )  # just to clean things up; only needed for the tutorial

    # Experience Generating Agent parameters
    experience_generating_agent_params = DDQNAgentParameters()
    # schedule parameters
    experience_generating_schedule_params = ScheduleParameters()
    experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
    experience_generating_schedule_params.improve_steps = TrainingSteps(
        dataset_size -
        experience_generating_schedule_params.heatup_steps.num_steps)
    experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(
        10)
    experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(
        1)

    # DQN params
    experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(
        100)
    experience_generating_agent_params.algorithm.discount = 0.99
    experience_generating_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(
        1)

    # NN configuration
    experience_generating_agent_params.network_wrappers[
        'main'].learning_rate = 0.0001
    experience_generating_agent_params.network_wrappers[
        'main'].batch_size = 128
    experience_generating_agent_params.network_wrappers[
        'main'].replace_mse_with_huber_loss = False
    experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # experience_generating_agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # ER size
    experience_generating_agent_params.memory = EpisodicExperienceReplayParameters(
    )
    experience_generating_agent_params.memory.max_size = \
        (MemoryGranularity.Transitions,
         experience_generating_schedule_params.heatup_steps.num_steps +
         experience_generating_schedule_params.improve_steps.num_steps)

    # E-Greedy schedule
    experience_generating_agent_params.exploration.epsilon_schedule = LinearSchedule(
        1.0, 0.01, DATASET_SIZE)
    experience_generating_agent_params.exploration.evaluation_epsilon = 0

    schedule_params = set_schedule_params(n_epochs, dataset_size)
    # set the agent params as before
    # agent_params = set_agent_params(DDQNAgentParameters)
    agent_params = set_agent_params(DDQNBCQAgentParameters)
    agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters(
    )

    # 50 epochs of training (the entire dataset is used each epoch)
    # schedule_params.improve_steps = TrainingSteps(50)

    graph_manager = BatchRLGraphManager(
        agent_params=agent_params,
        experience_generating_agent_params=experience_generating_agent_params,
        experience_generating_schedule_params=
        experience_generating_schedule_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=VisualizationParameters(
            dump_signals_to_csv_every_x_episodes=1),
        reward_model_num_epochs=30,
        train_to_eval_ratio=0.5)
    graph_manager.create_graph(task_parameters)
    graph_manager.improve()
    return
Ejemplo n.º 23
0
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(20)
agent_params.exploration = CategoricalParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**5)

###############
# Environment #
###############
DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
DeepRacerInputFilter.add_observation_filter('observation', 'stacking', ObservationStackingFilter(1))

env_params = GymVectorEnvironment()
env_params.default_input_filter = DeepRacerInputFilter
env_params.level = 'RoboMaker-DeepRacer-v0'

vis_params = VisualizationParameters()
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 400
preset_validation_params.max_episodes_to_achieve_reward = 10000

graph_manager = BasicRLGraphManager(agent_params=agent_params, env_params=env_params,
                                    schedule_params=schedule_params, vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)
Ejemplo n.º 24
0
def get_graph_manager(hp_dict,
                      agent_list,
                      run_phase_subject,
                      enable_domain_randomization=False,
                      done_condition=any,
                      run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None,
                      unpause_physics=None):
    ####################
    # Hyperparameters #
    ####################
    # Note: The following three line hard-coded to pick the first agent's trainig algorithm
    # and dump the hyper parameters for the particular training algorithm into json
    # for training jobs (so that the console display the training hyperparameters correctly)
    # since right now, we only support training one model at a time.
    # TODO: clean these lines up when we support multi-agent training.
    training_algorithm = agent_list[
        0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(
        params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()

    for agent in agent_list:
        if agent.network_settings:
            training_algorithm = agent.ctrl.model_metadata.training_algorithm
            params = get_updated_hyper_parameters(hp_dict, training_algorithm)
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent,
                                              params, run_type)
            else:
                agent_params = get_clipped_ppo_params(
                    DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys(
            ):
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(
                        observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(
                        observation, 'stacking', ObservationStackingFilter(1))

                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(
                        observation, 'to_uint8',
                        ObservationToUInt8Filter(0, 255))

                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(
                        observation, 'clipping',
                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    sector_binary_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=NUMBER_OF_LIDAR_SECTORS,
                        num_values_per_sector=1,
                        clipping_dist=SECTOR_LIDAR_CLIPPING_DIST)
                    input_filter.add_observation_filter(
                        observation, 'binary', sector_binary_filter)
                if observation == Input.DISCRETIZED_SECTOR_LIDAR.value:
                    num_sectors = agent.ctrl.model_metadata.lidar_num_sectors
                    num_values_per_sector = agent.ctrl.model_metadata.lidar_num_values_per_sector
                    clipping_dist = agent.ctrl.model_metadata.lidar_clipping_dist

                    sector_discretize_filter = ObservationSectorDiscretizeFilter(
                        num_sectors=num_sectors,
                        num_values_per_sector=num_values_per_sector,
                        clipping_dist=clipping_dist)
                    input_filter.add_observation_filter(
                        observation, 'discrete', sector_discretize_filter)
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics
    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params,
        done_condition=done_condition)
    return graph_manager, params_json