def evaluate(params):
    # file params
    experiment_path = os.path.join(params.output_data_dir)
    logger.experiment_path = os.path.join(experiment_path, 'evaluation')

    params.checkpoint_restore_dir = os.path.join(params.input_data_dir, 'checkpoint')
    checkpoint_file = os.path.join(params.checkpoint_restore_dir, 'checkpoint')

    # Note that due to a tensorflow issue (https://github.com/tensorflow/tensorflow/issues/9146)
    # we need to replace the absolute path for evaluation from a checkpointed model to work.
    inplace_change(checkpoint_file, "/opt/ml/output/data/checkpoint", ".")

    vis_params = VisualizationParameters()
    vis_params.dump_gifs = True

    task_params = TaskParameters(evaluate_only=True, experiment_path=logger.experiment_path)
    task_params.__dict__ = add_items_to_dict(task_params.__dict__, params.__dict__)

    graph_manager = BasicRLGraphManager(
        agent_params=ClippedPPOAgentParameters(),
        env_params=GymVectorEnvironment(level='TSP_env:TSPEasyEnv'),
        schedule_params=ScheduleParameters(),
        vis_params=vis_params
    )
    graph_manager = graph_manager.create_graph(task_parameters=task_params)
    graph_manager.evaluate(EnvironmentSteps(5))

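# `inplace_change` is not defined in this snippet; below is a minimal sketch of
# a helper with the behavior the call above implies (rewriting the checkpoint
# metadata file in place). This is an assumption, not the original definition.
def inplace_change(filename, old_string, new_string):
    # Read the file, substitute the string if present, and write the result back.
    with open(filename) as f:
        contents = f.read()
    if old_string not in contents:
        return
    with open(filename, 'w') as f:
        f.write(contents.replace(old_string, new_string))
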
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER size
agent_params.memory.max_size = (MemoryGranularity.Transitions, 40000)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter(
    'adversarial_inversion', RewardAdversarialInversionFilter(0.3))

###############
# Environment #
###############
env_params = GymVectorEnvironment(level='MountainCar-v0')

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)

agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1/10.))

###############
# Environment #
###############
env_params = ControlSuiteEnvironmentParameters()
env_params.level = SingleLevelSelection(control_suite_envs)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
# this preset is too slow to test on a regular basis
# preset_validation_params = PresetValidationParameters()
# preset_validation_params.test = True
# preset_validation_params.min_reward_threshold = 150
# preset_validation_params.max_episodes_to_achieve_reward = 250

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=vis_params)
                                    # preset_validation_params=preset_validation_params)

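# `SingleLevelSelection(control_suite_envs)` defers the concrete level choice to
# runtime; when a preset like this is launched through the Coach CLI, the level
# is selected with the `-lvl` flag. The preset name below is a placeholder, not
# something defined in this snippet:
#
#   coach -p ControlSuite_Preset -lvl cartpole:swingup
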
from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters
from rl_coach.core_types import EnvironmentSteps
from rl_coach.environments.gym_environment import GymVectorEnvironment
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager
from rl_coach.graph_managers.graph_manager import SimpleSchedule

graph_manager = BasicRLGraphManager(
    agent_params=ClippedPPOAgentParameters(),
    env_params=GymVectorEnvironment(level='CartPole-v0'),
    schedule_params=SimpleSchedule())

graph_manager.heatup(EnvironmentSteps(100))
graph_manager.train_and_act(EnvironmentSteps(100))

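# Note: the heatup/train_and_act calls above step through the phases manually.
# Coach's GraphManager also exposes improve(), which runs the full
# heatup/train/evaluate schedule in a single call; as an alternative ending
# for the snippet above:
#
#   graph_manager.improve()
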
agent_params.network_wrappers['main'].clip_gradients = 40.
agent_params.exploration = CategoricalParameters()

###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST),
                                 MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 20
preset_validation_params.max_episodes_to_achieve_reward = 400

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=vis_params,
                                    preset_validation_params=preset_validation_params)

agent_params.network_wrappers['main'].learning_rate = 0.025
agent_params.exploration.epsilon_schedule = LinearSchedule(1, 0, 500)

###############
# Environment #
###############
level = 'gym_dynamic_multi_armed_bandit.envs:BasicEnv'
env_params = GymVectorEnvironment(level)

########################
# Create Graph Manager #
########################
graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule)

#######################
# Add task parameters #
#######################
log_path = './experiments/log'  # where training logs are saved
checkpoint_sec = 60             # how often to save checkpoints, which are used to restore the model

if not os.path.exists(log_path):
    os.makedirs(log_path)

task_parameters = TaskParameters(evaluate_only=False,
                                 experiment_path=log_path,
                                 checkpoint_save_secs=checkpoint_sec)

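# The task parameters above are defined but not yet consumed; in a typical
# Coach script the next step (assumed here, not shown in the original snippet)
# is to hand them to the graph manager and start training:
graph_manager.create_graph(task_parameters)
graph_manager.improve()
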
def get_graph_manager(**hp_dict):
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()
    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    DeepRacerInputFilter = InputFilter(is_a_reference_filter=True)
    DeepRacerInputFilter.add_observation_filter('observation', 'to_grayscale',
                                                ObservationRGBToYFilter())
    DeepRacerInputFilter.add_observation_filter('observation', 'to_uint8',
                                                ObservationToUInt8Filter(0, 255))
    DeepRacerInputFilter.add_observation_filter('observation', 'stacking',
                                                ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = DeepRacerInputFilter
    env_params.level = 'DeepRacerRacetrackCustomActionSpaceEnv-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                        env_params=env_params,
                                        schedule_params=schedule_params,
                                        vis_params=vis_params,
                                        preset_validation_params=preset_validation_params)
    return graph_manager, params_json

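# A minimal usage sketch for the factory above. The hyper-parameter overrides
# are illustrative assumptions, not values mandated by the original code;
# TaskParameters comes from rl_coach.base_parameters.
graph_manager, params_json = get_graph_manager(batch_size=32, lr=1e-4)
graph_manager.create_graph(TaskParameters())
graph_manager.improve()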