def __init__(self):
    super().__init__()
    self.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
    self.evaluation_epsilon = 0.05
    self.continuous_exploration_policy_parameters = AdditiveNoiseParameters()
    self.continuous_exploration_policy_parameters.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
def __init__(self):
    super().__init__()
    self.architecture_num_q_heads = 10
    self.bootstrapped_data_sharing_probability = 1.0
    self.epsilon_schedule = PieceWiseSchedule([
        (LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
        (LinearSchedule(0.1, 0.01, 4000000), EnvironmentSteps(4000000))
    ])
    self.lamb = 0.1
def __init__(self):
    super().__init__()
    self.noise_schedule = LinearSchedule(0.1, 0.1, 50000)
    self.evaluation_noise = 0.05
    self.clip_low = 0
    self.clip_high = 1
    self.noise_as_percentage_from_action_space = True
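# A minimal sketch of what noise_as_percentage_from_action_space = True implies,
# assuming the schedule value is interpreted as a fraction of the bounded action
# range (the helper below is illustrative, not part of Coach's API):
import numpy as np

def noise_std_from_percentage(percentage, low, high):
    """Scale a percentage-based noise level into an absolute per-dimension std."""
    return percentage * (high - low)

# e.g. a 0.1 schedule value on actions bounded in [-1, 1] gives std = 0.1 * 2 = 0.2
print(noise_std_from_percentage(0.1, np.array([-1.0]), np.array([1.0])))  # [0.2]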
def __init__(self): super().__init__(algorithm=DQNAlgorithmParameters(), exploration=EGreedyParameters(), memory=ExperienceReplayParameters(), networks={"main": DQNNetworkParameters()}) self.exploration.epsilon_schedule = LinearSchedule(1, 0.1, 1000000) self.exploration.evaluation_epsilon = 0.05
def set_agent_params(agent_params_func):
    #########
    # Agent #
    #########
    agent_params = agent_params_func()
    agent_params.network_wrappers['main'].batch_size = 128
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(100)
    agent_params.algorithm.discount = 0.99

    # to jump-start the agent's q values and speed things up, we'll initialize the last Dense layer's bias
    # with a number on the order of the discounted reward of a random policy
    agent_params.network_wrappers['main'].heads_parameters = \
        [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]
    # agent_params.network_wrappers['main'].heads_parameters = \
    #     [QHeadParameters(output_bias_initializer=tf.constant_initializer(0))]

    # NN configuration
    agent_params.network_wrappers['main'].learning_rate = 0.0001
    agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
    agent_params.network_wrappers['main'].softmax_temperature = 0.2

    # ER - we'll need an episodic replay buffer for off-policy evaluation
    agent_params.memory = EpisodicExperienceReplayParameters()

    # E-Greedy schedule - there is no exploration in Batch RL, so disable E-Greedy
    agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
    agent_params.exploration.evaluation_epsilon = 0

    return agent_params
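# Why -100 is a sensible bias: for discount gamma = 0.99, the discounted return of a
# policy collecting a roughly constant per-step reward r is about r / (1 - gamma).
# Assuming a typical per-step reward of -1 for a random policy (an assumption, not
# stated above), that gives -1 / 0.01 = -100, matching the Q-head bias initializer.
gamma = 0.99
per_step_reward = -1.0  # assumed reward scale of a random policy
print(per_step_reward / (1 - gamma))  # ≈ -100 (up to floating-point rounding)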
def __init__(self):
    super().__init__()
    self.algorithm = QuantileRegressionDQNAlgorithmParameters()
    self.network_wrappers = {"main": QuantileRegressionDQNNetworkParameters()}
    self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
    self.exploration.evaluation_epsilon = 0.001
def test_init():
    # discrete control
    action_space = DiscreteActionSpace(3)
    noise_schedule = LinearSchedule(1.0, 1.0, 1000)

    # additive noise requires a bounded range for the actions
    action_space = BoxActionSpace(np.array([10]))
    with pytest.raises(ValueError):
        policy = AdditiveNoise(action_space, noise_schedule, 0)
def __init__(self): super().__init__(algorithm=DQNAlgorithmParameters(), exploration=EGreedyParameters(), memory=ExperienceReplayParameters(), networks={ "main": DQNNetworkParameters(), "predictor": RNDNetworkParameters(), "constant": RNDNetworkParameters() }) self.exploration.epsilon_schedule = LinearSchedule(1.0, 0.15, 15000)
def test_get_action():
    # make sure noise is in range
    action_space = BoxActionSpace(np.array([10]), -1, 1)
    noise_schedule = LinearSchedule(1.0, 1.0, 1000)
    policy = AdditiveNoise(action_space, noise_schedule, 0)

    # the action range is 2, so there is a ~0.1% chance that the noise will be larger than 3*std = 3*2 = 6
    for i in range(1000):
        action = policy.get_action(np.zeros([10]))
        assert np.all(action < 10)
        # make sure there is no clipping of the action, since it should be the environment that clips actions
        assert np.all(action != 1.0)
        assert np.all(action != -1.0)
        # make sure that each action element has a different value
        assert np.all(action[0] != action[1:])
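# Unpacking the comment above: with a schedule value of 1.0 and percentage-based noise
# over a range of 2, the std is 2, and exceeding 3*std corresponds to the one-sided
# Gaussian tail P(Z > 3) = 0.5 * erfc(3 / sqrt(2)), i.e. roughly 0.1%.
import math
print(0.5 * math.erfc(3 / math.sqrt(2)))  # ≈ 0.00135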
def test_change_phase():
    # discrete control
    action_space = DiscreteActionSpace(3)
    epsilon_schedule = LinearSchedule(1.0, 0.1, 1000)
    policy = EGreedy(action_space, epsilon_schedule, evaluation_epsilon=0.01)

    # verify the schedule does not advance outside the training phase
    assert policy.get_control_param() == 1.0

    policy.change_phase(RunPhase.TEST)
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.epsilon_schedule.current_value == 1.0

    policy.change_phase(RunPhase.HEATUP)
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.epsilon_schedule.current_value == 1.0

    policy.change_phase(RunPhase.UNDEFINED)
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.epsilon_schedule.current_value == 1.0
def test_piece_wise_schedule():
    # increasing, then constant, then exponentially decaying schedule
    schedule = PieceWiseSchedule(
        [(LinearSchedule(1, 3, 10), EnvironmentSteps(5)),
         (ConstantSchedule(4), EnvironmentSteps(10)),
         (ExponentialSchedule(3, 1, 0.99), EnvironmentSteps(10))]
    )
    target_values = np.append(np.linspace(1, 2, 6), np.ones(11) * 4)
    for i in range(16):
        assert round(schedule.current_value, 4) == round(target_values[i], 4)
        schedule.step()

    current_power = 1
    for i in range(10):
        assert round(schedule.current_value, 4) == round(3 * current_power, 4)
        current_power *= 0.99
        schedule.step()
def test_get_action():
    # discrete control
    action_space = DiscreteActionSpace(3)
    epsilon_schedule = LinearSchedule(1.0, 1.0, 1000)
    policy = EGreedy(action_space, epsilon_schedule, evaluation_epsilon=0)

    # verify that the test phase gives greedy actions (evaluation_epsilon = 0)
    policy.change_phase(RunPhase.TEST)
    for i in range(100):
        best_action, _ = policy.get_action(np.array([10, 20, 30]))
        assert best_action == 2

    # verify that the train phase gives uniform actions (exploration = 1)
    policy.change_phase(RunPhase.TRAIN)
    counters = np.array([0, 0, 0])
    for i in range(30000):
        best_action, _ = policy.get_action(np.array([10, 20, 30]))
        counters[best_action] += 1
    assert np.all(counters > 9500)  # this is noisy, so we allow 5% error
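# Sanity check on the 9500 threshold: each action is expected 30000 / 3 = 10000 times,
# with binomial std sqrt(n * p * (1 - p)) ≈ 82, so the 5% (500-count) margin is over
# six standard deviations and the test is effectively deterministic.
import math
n, p = 30000, 1 / 3
print(n * p, math.sqrt(n * p * (1 - p)))  # ≈ 10000, ≈ 81.6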
def test_get_control_param():
    # discrete control
    action_space = DiscreteActionSpace(3)
    epsilon_schedule = LinearSchedule(1.0, 0.1, 1000)
    policy = EGreedy(action_space, epsilon_schedule, evaluation_epsilon=0.01)

    # verify the schedule applies during the TRAIN phase
    policy.change_phase(RunPhase.TRAIN)
    for i in range(999):
        best_action = policy.get_action(np.array([10, 20, 30]))
        assert 1.0 > policy.get_control_param() > 0.1
    best_action = policy.get_action(np.array([10, 20, 30]))
    assert policy.get_control_param() == 0.1

    # test phases
    policy.change_phase(RunPhase.TEST)
    assert policy.get_control_param() == 0.01

    policy.change_phase(RunPhase.TRAIN)
    assert policy.get_control_param() == 0.1

    policy.change_phase(RunPhase.HEATUP)
    assert policy.get_control_param() == 0.1
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(1000000)
schedule_params.evaluation_steps = EnvironmentSteps(125000)
schedule_params.heatup_steps = EnvironmentSteps(20000)

#########
# Agent #
#########
agent_params = RainbowDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0000625
agent_params.network_wrappers['main'].optimizer_epsilon = 1.5e-4
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(32000 // 4)  # 32k frames
agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames
agent_params.memory.alpha = 0.5

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False
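# The "12.5M training iterations = 50M steps = 200M frames" comment assumes the
# standard Atari setup: a frame skip of 4 (one environment step = 4 frames) and one
# training iteration every 4 environment steps. Checking that arithmetic:
frames = 200_000_000
steps = frames // 4              # frame skip of 4
train_iterations = steps // 4    # train once every 4 environment steps
print(steps, train_iterations)   # 50000000 12500000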
def get_graph_manager(**hp_dict):
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    # note: any value other than "categorical" falls through to e-greedy below
    params["exploration_type"] = (hp_dict.get("exploration_type", "huber")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    agent_params = ClippedPPOAgentParameters()
    agent_params.network_wrappers['main'].learning_rate = params["lr"]
    agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
    agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
    agent_params.network_wrappers['main'].batch_size = params["batch_size"]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
    if params["loss_type"] == "huber":
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
    agent_params.algorithm.beta_entropy = params["beta_entropy"]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params["discount_factor"]
    agent_params.algorithm.optimization_epochs = params["num_epochs"]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(
        params["num_episodes_between_training"])
    agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

    if params["exploration_type"] == "categorical":
        agent_params.exploration = CategoricalParameters()
    else:
        agent_params.exploration = EGreedyParameters()
        agent_params.exploration.epsilon_schedule = LinearSchedule(
            1.0, params["e_greedy_value"], params["epsilon_steps"])

    ###############
    # Environment #
    ###############
    SilverstoneInputFilter = InputFilter(is_a_reference_filter=True)
    SilverstoneInputFilter.add_observation_filter('observation', 'to_grayscale', ObservationRGBToYFilter())
    SilverstoneInputFilter.add_observation_filter('observation', 'to_uint8', ObservationToUInt8Filter(0, 255))
    SilverstoneInputFilter.add_observation_filter('observation', 'stacking',
                                                  ObservationStackingFilter(params["stack_size"]))

    env_params = GymVectorEnvironment()
    env_params.default_input_filter = SilverstoneInputFilter
    env_params.level = 'SilverstoneRacetrack-Discrete-v0'

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 1000

    graph_manager = BasicRLGraphManager(
        agent_params=agent_params,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
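# A minimal usage sketch (the hyper-parameter values here are illustrative only):
# keyword arguments override the defaults above, and the resolved parameters come
# back as a JSON string alongside the graph manager.
graph_manager, params_json = get_graph_manager(batch_size=32, lr=0.001, exploration_type="categorical")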
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(N)

####################
# DQN Agent Params #
####################
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
agent_params.memory.max_size = (MemoryGranularity.Transitions, 1000000)
agent_params.algorithm.discount = 0.99
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(4)
agent_params.exploration.epsilon_schedule = LinearSchedule(1, 0.1, (N + 7) * 2000)
agent_params.input_filter = NoInputFilter()
agent_params.output_filter = NoOutputFilter()

###############
# Environment #
###############
env_params = GymEnvironmentParameters()
env_params.level = 'rl_coach.environments.toy_problems.exploration_chain:ExplorationChain'
env_params.additional_simulator_parameters = {'chain_length': N, 'max_steps': N + 7}

vis_params = VisualizationParameters()
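# The epsilon decay horizon of (N + 7) * 2000 steps above lines up with the episode
# length: max_steps is N + 7, so epsilon anneals from 1 to 0.1 over roughly 2000
# episodes regardless of the chosen chain length N. For example, with a hypothetical
# N = 13: (13 + 7) * 2000 = 40000 steps = 2000 episodes of 20 steps each.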
schedule_params.improve_steps = EnvironmentSteps(6250000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(62500)
schedule_params.evaluation_steps = EnvironmentSteps(6250)
schedule_params.heatup_steps = EnvironmentSteps(1)

#########
# Agent #
#########
agent_params = DFPAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001

# the original DFP code decays epsilon over ~1.5M steps. Unlike most other papers, these are 1.5M
# *training* steps; a training batch is sampled once every 8 playing steps, so this is
# 1.5M * 8 =~ 12M playing steps per worker.
# TODO: allow the epsilon schedule to be defined in terms of training steps.
agent_params.exploration.epsilon_schedule = LinearSchedule(1, 0, 12000000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.use_accumulated_reward_as_measurement = False
agent_params.algorithm.goal_vector = [0.5, 0.5, 1]  # ammo, health, frag count
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_rescaling['vector'] = 100.
agent_params.algorithm.scale_measurements_targets['GameVariable.HEALTH'] = 30.0
agent_params.algorithm.scale_measurements_targets['GameVariable.AMMO2'] = 7.5
agent_params.algorithm.scale_measurements_targets['GameVariable.USER2'] = 1.0
agent_params.network_wrappers['main'].learning_rate_decay_rate = 0.3
agent_params.network_wrappers['main'].learning_rate_decay_steps = 250000
agent_params.network_wrappers['main'].input_embedders_parameters['measurements'].input_offset['vector'] = 0.5
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].input_offset['vector'] = 0.5
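# The 12M decay steps in the schedule above are just the training-to-playing-step
# conversion from the comment spelled out:
training_steps = 1_500_000
playing_steps_per_training_step = 8  # one batch sampled every 8 playing steps
print(training_steps * playing_steps_per_training_step)  # 12000000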
def __init__(self):
    super().__init__()
    self.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(30000)
    self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
    self.exploration.evaluation_epsilon = 0.001
def __init__(self):
    super().__init__()
    self.architecture_num_q_heads = 10
    self.bootstrapped_data_sharing_probability = 1.0
    self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
def __init__(self):
    super().__init__()
    self.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
    self.evaluation_epsilon = 0.001
def __init__(self):
    super().__init__()
    self.algorithm = DDQNBCQAlgorithmParameters()
    self.exploration.epsilon_schedule = LinearSchedule(1, 0.01, 1000000)
    self.exploration.evaluation_epsilon = 0.001
schedule_params.improve_steps = EnvironmentSteps(50000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(250000)
schedule_params.evaluation_steps = EnvironmentSteps(135000)
schedule_params.heatup_steps = EnvironmentSteps(50000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].middleware_parameters.scheme = MiddlewareScheme.Empty
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]
agent_params.network_wrappers['main'].clip_gradients = 10
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(40000)
agent_params.exploration.epsilon_schedule = PieceWiseSchedule(
    [(LinearSchedule(1, 0.1, 1000000), EnvironmentSteps(1000000)),
     (LinearSchedule(0.1, 0.01, 10000000), EnvironmentSteps(1000000)),
     (ConstantSchedule(0.001), EnvironmentSteps(10000000))]
)
agent_params.memory = PrioritizedExperienceReplayParameters()
agent_params.memory.beta = LinearSchedule(0.4, 1, 12500000)  # 12.5M training iterations = 50M steps = 200M frames

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False
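# Note on the piecewise schedule above: judging by the PieceWiseSchedule test earlier
# in this section, each segment steps its own sub-schedule only while active. The
# middle segment decays over 10M steps but runs for just 1M, so epsilon reaches about
# 0.1 - (0.1 - 0.01) * (1M / 10M) = 0.091 before jumping to the constant 0.001 segment.
print(0.1 - (0.1 - 0.01) * (1_000_000 / 10_000_000))  # ≈ 0.091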
def __init__(self):
    super().__init__()
    self.noise_percentage_schedule = LinearSchedule(0.1, 0.1, 50000)
    self.evaluation_noise_percentage = 0.05
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(2048)
agent_params.network_wrappers["main"].learning_rate = 0.0003
agent_params.network_wrappers["main"].input_embedders_parameters["observation"].activation_function = "tanh"
agent_params.network_wrappers["main"].input_embedders_parameters["observation"].scheme = [Dense(64)]
agent_params.network_wrappers["main"].middleware_parameters.scheme = [Dense(64)]
agent_params.network_wrappers["main"].middleware_parameters.activation_function = "tanh"
agent_params.network_wrappers["main"].batch_size = 64
agent_params.network_wrappers["main"].optimizer_epsilon = 1e-5
agent_params.network_wrappers["main"].clip_gradients = 40.0
agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

###############
# Environment #
###############
env_params = GymVectorEnvironment(level="autoscalesim:SimpleScalableWebserviceSim")

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
preset_validation_params.min_reward_threshold = 150
preset_validation_params.max_episodes_to_achieve_reward = 400

graph_manager = BasicRLGraphManager(
def get_graph_manager(hp_dict, agent_list, run_phase_subject):
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params["batch_size"] = int(hp_dict.get("batch_size", 64))
    params["num_epochs"] = int(hp_dict.get("num_epochs", 10))
    params["stack_size"] = int(hp_dict.get("stack_size", 1))
    params["lr"] = float(hp_dict.get("lr", 0.0003))
    params["exploration_type"] = (hp_dict.get("exploration_type", "categorical")).lower()
    params["e_greedy_value"] = float(hp_dict.get("e_greedy_value", .05))
    params["epsilon_steps"] = int(hp_dict.get("epsilon_steps", 10000))
    params["beta_entropy"] = float(hp_dict.get("beta_entropy", .01))
    params["discount_factor"] = float(hp_dict.get("discount_factor", .999))
    params["loss_type"] = hp_dict.get("loss_type", "Mean squared error").lower()
    params["num_episodes_between_training"] = int(hp_dict.get("num_episodes_between_training", 20))
    params["term_cond_max_episodes"] = int(hp_dict.get("term_cond_max_episodes", 100000))
    params["term_cond_avg_score"] = float(hp_dict.get("term_cond_avg_score", 100000))

    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = TrainingSteps(params["term_cond_max_episodes"])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()
    for agent in agent_list:
        agent_params = DeepRacerAgentParams()
        if agent.network_settings:
            agent_params.env_agent = agent
            agent_params.network_wrappers['main'].learning_rate = params["lr"]
            agent_params.network_wrappers['main'].input_embedders_parameters = \
                create_input_embedder(agent.network_settings['input_embedders'],
                                      agent.network_settings['embedder_type'],
                                      agent.network_settings['activation_function'])
            agent_params.network_wrappers['main'].middleware_parameters = \
                create_middle_embedder(agent.network_settings['middleware_embedders'],
                                       agent.network_settings['embedder_type'],
                                       agent.network_settings['activation_function'])

            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale', ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking', ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8', ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping', ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    input_filter.add_observation_filter(observation, 'binary', ObservationBinarySectorFilter())
            agent_params.input_filter = input_filter()

            agent_params.network_wrappers['main'].batch_size = params["batch_size"]
            agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
            agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
            if params["loss_type"] == "huber":
                agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True

            agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
            agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
            agent_params.algorithm.beta_entropy = params["beta_entropy"]
            agent_params.algorithm.gae_lambda = 0.95
            agent_params.algorithm.discount = params["discount_factor"]
            agent_params.algorithm.optimization_epochs = params["num_epochs"]
            agent_params.algorithm.estimate_state_value_using_gae = True
            agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.num_consecutive_playing_steps = \
                EnvironmentEpisodes(params["num_episodes_between_training"])
            agent_params.algorithm.distributed_coach_synchronization_type = \
                DistributedCoachSynchronizationType.SYNC

            if params["exploration_type"] == "categorical":
                agent_params.exploration = CategoricalParameters()
            else:
                agent_params.exploration = EGreedyParameters()
                agent_params.exploration.epsilon_schedule = LinearSchedule(
                    1.0, params["e_greedy_value"], params["epsilon_steps"])

            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(
        agents_params=trainable_agents_list,
        env_params=env_params,
        schedule_params=schedule_params,
        vis_params=vis_params,
        preset_validation_params=preset_validation_params)
    return graph_manager, params_json
agent_params.algorithm.discount = 0.99

# to jump-start the agent's q values and speed things up, we'll initialize the last Dense layer
# with something on the order of the discounted reward of a random policy
agent_params.network_wrappers['main'].heads_parameters = \
    [QHeadParameters(output_bias_initializer=tf.constant_initializer(-100))]

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False

# ER - we'll need an episodic replay buffer for off-policy evaluation
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule - there is no exploration in Batch RL, so disable E-Greedy
agent_params.exploration.epsilon_schedule = LinearSchedule(initial_value=0, final_value=0, decay_steps=1)
agent_params.exploration.evaluation_epsilon = 0

# can use either a kNN or an NN based model for predicting which actions not to max over in the bellman equation
# agent_params.algorithm.action_drop_method_parameters = KNNParameters()

DATASET_PATH = 'acrobot_dataset.csv'
agent_params.memory = EpisodicExperienceReplayParameters()
agent_params.memory.load_memory_from_file_path = CsvDataset(DATASET_PATH, is_episodic=True)

spaces = SpacesDefinition(state=StateSpace({'observation': VectorObservationSpace(shape=6)}),
                          goal=None,
                          action=DiscreteActionSpace(3),
                          reward=RewardSpace(1))
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'tanh'
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'tanh'
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.99
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(2048)

# Distributed Coach synchronization type.
agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

agent_params.exploration = EGreedyParameters()
agent_params.exploration.epsilon_schedule = LinearSchedule(1.0, 0.01, 10000)

agent_params.pre_network_filter.add_observation_filter(
    'observation', 'normalize_observation',
# agent_params.algorithm.action_drop_method_parameters = KNNParameters()
agent_params.algorithm.action_drop_method_parameters = NNImitationModelParameters()

# NN configuration
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].softmax_temperature = 0.2

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()
# DATASET_PATH = 'acrobot.csv'
# agent_params.memory.load_memory_from_file_path = CsvDataset(DATASET_PATH, True)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

# Experience Generating Agent parameters
experience_generating_agent_params = DDQNAgentParameters()

# schedule parameters
experience_generating_schedule_params = ScheduleParameters()
experience_generating_schedule_params.heatup_steps = EnvironmentSteps(1000)
experience_generating_schedule_params.improve_steps = TrainingSteps(
    DATASET_SIZE - experience_generating_schedule_params.heatup_steps.num_steps)
experience_generating_schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(10)
experience_generating_schedule_params.evaluation_steps = EnvironmentEpisodes(1)

# DQN params
experience_generating_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(100)
]
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'relu'
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'relu'
# agent_params.network_wrappers['main'].middleware_parameters.scheme = [
#     Conv2dWithAttention(64, 3, 1, 1000)
# ]
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0.01  # also try 0.001
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.999
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentEpisodes(20)
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(20)

agent_params.exploration = CategoricalParameters()
agent_params.algorithm.distributed_coach_synchronization_type = DistributedCoachSynchronizationType.SYNC

###############
# Environment #
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(50)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDQNAgentParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, 5000)
agent_params.network_wrappers['main'].learning_rate = 0.00025
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = EnvironmentSteps(1000)
agent_params.exploration.epsilon_schedule = LinearSchedule(0.5, 0.01, 50000)
agent_params.exploration.evaluation_epsilon = 0
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentSteps(1)
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].heads_parameters = [DuelingQHeadParameters()]

###############
# Environment #
###############
env_params = DoomEnvironmentParameters()
env_params.level = 'basic'

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False