class FCMiddleware(Middleware):
    schemes = {
        MiddlewareScheme.Empty:
            [],

        # ppo
        MiddlewareScheme.Shallow:
            [
                Dense([64])
            ],

        # dqn
        MiddlewareScheme.Medium:
            [
                Dense([512])
            ],

        MiddlewareScheme.Deep:
            [
                Dense([128]),
                Dense([128]),
                Dense([128])
            ]
    }

    def __init__(self, activation_function=tf.nn.relu,
                 scheme: MiddlewareScheme = MiddlewareScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name="middleware_fc_embedder"):
        super().__init__(activation_function=activation_function, batchnorm=batchnorm,
                         dropout=dropout, scheme=scheme, name=name)
        self.return_type = Middleware_FC_Embedding
        self.layers = []

    def _build_module(self):
        self.layers.append(self.input)

        if isinstance(self.scheme, MiddlewareScheme):
            layers_params = FCMiddleware.schemes[self.scheme]
        else:
            layers_params = self.scheme

        for idx, layer_params in enumerate(layers_params):
            self.layers.append(
                layer_params(self.layers[-1],
                             name='{}_{}'.format(layer_params.__class__.__name__, idx)))
            self.layers.extend(
                batchnorm_activation_dropout(self.layers[-1], self.batchnorm,
                                             self.activation_function, self.dropout,
                                             self.dropout_rate, idx))

        self.output = self.layers[-1]
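# Illustrative sketch (not part of the original module): the isinstance() check in
# _build_module() above means the scheme argument accepts either one of the predefined
# MiddlewareScheme values or an explicit list of layer definitions. The variable names
# below are assumptions made for the example only.
middleware_from_enum = FCMiddleware(scheme=MiddlewareScheme.Medium)       # expands to Dense([512])
middleware_from_list = FCMiddleware(scheme=[Dense([256]), Dense([256])],  # custom layer stack
                                    batchnorm=True)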
class VectorEmbedder(InputEmbedder):
    """
    An input embedder that is intended for inputs that can be represented as vectors.
    The embedder flattens the input, applies several dense layers to it and returns the output.
    """
    schemes = {
        EmbedderScheme.Empty:
            [],

        EmbedderScheme.Shallow:
            [
                Dense([128])
            ],

        # dqn
        EmbedderScheme.Medium:
            [
                Dense([256])
            ],

        # carla
        EmbedderScheme.Deep:
            [
                Dense([128]),
                Dense([128]),
                Dense([128])
            ]
    }

    def __init__(self, input_size: List[int], activation_function=tf.nn.relu,
                 scheme: EmbedderScheme = EmbedderScheme.Medium,
                 batchnorm: bool = False, dropout: bool = False,
                 name: str = "embedder", input_rescaling: float = 1.0,
                 input_offset: float = 0.0, input_clipping=None):
        super().__init__(input_size, activation_function, scheme, batchnorm, dropout,
                         name, input_rescaling, input_offset, input_clipping)

        self.return_type = InputVectorEmbedding
        if len(self.input_size) != 1 and scheme != EmbedderScheme.Empty:
            raise ValueError("The input size of a vector embedder must contain only a single dimension")
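# Illustrative sketch (not part of the original module): a VectorEmbedder is constructed
# with a one-dimensional input_size; any multi-dimensional size triggers the ValueError
# above unless the Empty scheme is used. The sizes and names below are placeholders.
observation_embedder = VectorEmbedder(input_size=[17], scheme=EmbedderScheme.Medium)
# VectorEmbedder(input_size=[84, 84, 3]) would raise ValueError for any non-Empty scheme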
def __init__(self):
    super().__init__()
    self.input_embedders_parameters = {
        'observation': InputEmbedderParameters(activation_function='leaky_relu'),
        'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
        'goal': InputEmbedderParameters(activation_function='leaky_relu')
    }

    self.input_embedders_parameters['observation'].scheme = [
        Conv2d([32, 8, 4]),
        Conv2d([64, 4, 2]),
        Conv2d([64, 3, 1]),
        Dense([512]),
    ]

    self.input_embedders_parameters['measurements'].scheme = [
        Dense([128]),
        Dense([128]),
        Dense([128]),
    ]

    self.input_embedders_parameters['goal'].scheme = [
        Dense([128]),
        Dense([128]),
        Dense([128]),
    ]

    self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
                                                        scheme=MiddlewareScheme.Empty)
    self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
    self.loss_weights = [1.0]
    self.async_training = False
    self.batch_size = 64
    self.adam_optimizer_beta1 = 0.95
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['actor'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['critic'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'].scheme = [Dense([300])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([200])]
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'].scheme = [Dense([400])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty

agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1 / 10.))

###############
################
# Agent Params #
################
agent_params = CILAgentParameters()

# forward camera and measurements input
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'CameraRGB': InputEmbedderParameters(scheme=[Conv2d([32, 5, 2]),
                                                 Conv2d([32, 3, 1]),
                                                 Conv2d([64, 3, 2]),
                                                 Conv2d([64, 3, 1]),
                                                 Conv2d([128, 3, 2]),
                                                 Conv2d([128, 3, 1]),
                                                 Conv2d([256, 3, 1]),
                                                 Conv2d([256, 3, 1]),
                                                 Dense([512]),
                                                 Dense([512])],
                                         dropout=True,
                                         batchnorm=True),
    'measurements': InputEmbedderParameters(scheme=[Dense([128]),
                                                    Dense([128])])
}

# TODO: batch norm is currently applied to the fc layers, which is not desired
# TODO: dropout should be configured differently per layer:
#       [1.0] * 8 + [0.7] * 2 + [0.5] * 2 + [0.5] * 1 + [0.5, 1.] * 5

# simple fc middleware
agent_params.network_wrappers['main'].middleware_parameters = FCMiddlewareParameters(scheme=[Dense([512])])

# output branches
agent_params.network_wrappers['main'].heads_parameters = [
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps, RunPhase, \
    GradientClippingMethod

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = NAFAgentParameters()
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense([200])]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([200])]
agent_params.network_wrappers['main'].clip_gradients = 1000
agent_params.network_wrappers['main'].gradients_clipping_method = GradientClippingMethod.ClipByValue

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = SingleLevelSelection(mujoco_v2)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False
top_agent_params.memory.hindsight_transitions_per_regular_transition = 3
top_agent_params.memory.hindsight_goal_selection_method = HindsightGoalSelectionMethod.Future
top_agent_params.memory.goals_space = goals_space
top_agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(32)
top_agent_params.algorithm.num_consecutive_training_steps = 40
top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)

# exploration - OU process
top_agent_params.exploration = OUProcessParameters()
top_agent_params.exploration.theta = 0.1

# actor
top_actor = top_agent_params.network_wrappers['actor']
top_actor.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
top_actor.middleware_parameters.scheme = [Dense([64])] * 3
top_actor.learning_rate = 0.001
top_actor.batch_size = 4096

# critic
top_critic = top_agent_params.network_wrappers['critic']
top_critic.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
top_critic.embedding_merger_type = EmbeddingMergerType.Concat
top_critic.middleware_parameters.scheme = [Dense([64])] * 3
top_critic.learning_rate = 0.001
top_critic.batch_size = 4096

# ----------
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2000)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = PPOAgentParameters()
agent_params.network_wrappers['actor'].learning_rate = 0.001
agent_params.network_wrappers['critic'].learning_rate = 0.001
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([64])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([64])]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense([64])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([64])]

agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_observation_filter('observation', 'normalize', ObservationNormalizationFilter())
agent_params.exploration = ContinuousEntropyParameters()

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = SingleLevelSelection(mujoco_v2)
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ActorCriticAgentParameters()
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.beta_entropy = 0.005
agent_params.network_wrappers['main'].learning_rate = 0.00002
agent_params.network_wrappers['main'].input_embedders_parameters['observation'] = \
    InputEmbedderParameters(scheme=[Dense([200])])
agent_params.network_wrappers['main'].middleware_parameters = \
    LSTMMiddlewareParameters(scheme=MiddlewareScheme.Empty, number_of_lstm_cells=128)

agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1 / 20.))
agent_params.input_filter.add_observation_filter('observation', 'normalize', ObservationNormalizationFilter())
agent_params.exploration = ContinuousEntropyParameters()

###############
# Environment #
###############
env_params = Mujoco()
env_params.level = SingleLevelSelection(mujoco_v2)
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(2000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense([400])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([300])]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense([400])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty

###############
# Environment #
###############
env_params = Mujoco()
################
agent_params = CILAgentParameters()

# forward camera and measurements input
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'forward_camera': InputEmbedderParameters(scheme=[Conv2d([32, 5, 2]),
                                                      Conv2d([32, 3, 1]),
                                                      Conv2d([64, 3, 2]),
                                                      Conv2d([64, 3, 1]),
                                                      Conv2d([128, 3, 2]),
                                                      Conv2d([128, 3, 1]),
                                                      Conv2d([256, 3, 1]),
                                                      Conv2d([256, 3, 1]),
                                                      Dense([512]),
                                                      Dense([512])],
                                              dropout=True,
                                              batchnorm=True),
    'measurements': InputEmbedderParameters(scheme=[Dense([128]),
                                                    Dense([128])])
}

# TODO: batch norm will apply to the fc layers, which is not desirable
# TODO: the dropout rate cannot currently be configured
# TODO: dropout should be configured differently per layer:
#       [1.0] * 8 + [0.7] * 2 + [0.5] * 2 + [0.5] * 1 + [0.5, 1.] * 5

# simple fc middleware
agent_params.network_wrappers['main'].middleware_parameters = FCMiddlewareParameters(
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentEpisodes(16 * 50 * 200)  # 200 epochs
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(16 * 50)  # 50 cycles
schedule_params.evaluation_steps = EnvironmentEpisodes(10)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = DQNAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.001
agent_params.network_wrappers['main'].batch_size = 128
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([256])]
agent_params.network_wrappers['main'].input_embedders_parameters = {
    'state': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
agent_params.algorithm.discount = 0.98
agent_params.algorithm.num_consecutive_playing_steps = EnvironmentEpisodes(16)
agent_params.algorithm.num_consecutive_training_steps = 40
agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)
agent_params.algorithm.rate_for_copying_weights_to_target = 0.05
agent_params.memory.max_size = (MemoryGranularity.Transitions, 10**6)
agent_params.exploration.epsilon_schedule = ConstantSchedule(0.2)
agent_params.exploration.evaluation_epsilon = 0
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = NStepQAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Conv2d([16, 8, 4]),
                                                                                          Conv2d([32, 4, 2])]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense([256])]

###############
# Environment #
###############
env_params = Atari()
env_params.level = SingleLevelSelection(atari_deterministic_v4)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
vis_params.dump_mp4 = False

########
# Test #
########
preset_validation_params = PresetValidationParameters()
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['actor'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['critic'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'].scheme = [Dense([300])]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense([200])]
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'].scheme = [Dense([400])]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense([300])]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty

agent_params.input_filter = MujocoInputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1 / 10.))

###############
# Environment #
###############
env_params = ControlSuiteEnvironmentParameters()
env_params.level = SingleLevelSelection(control_suite_envs)

vis_params = VisualizationParameters()
vis_params.video_dump_methods = [SelectedPhaseOnlyDumpMethod(RunPhase.TEST), MaxDumpMethod()]
# Agent Params #
################
agent_params = DDPGAgentParameters()

# actor
actor_network = agent_params.network_wrappers['actor']
actor_network.learning_rate = 0.001
actor_network.batch_size = 256
actor_network.optimizer_epsilon = 1e-08
actor_network.adam_optimizer_beta1 = 0.9
actor_network.adam_optimizer_beta2 = 0.999
actor_network.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
actor_network.middleware_parameters = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256]), Dense([256])])
actor_network.heads_parameters[0].batchnorm = False

# critic
critic_network = agent_params.network_wrappers['critic']
critic_network.learning_rate = 0.001
critic_network.batch_size = 256
critic_network.optimizer_epsilon = 1e-08
critic_network.adam_optimizer_beta1 = 0.9
critic_network.adam_optimizer_beta2 = 0.999
critic_network.input_embedders_parameters = {
    'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
critic_network.middleware_parameters = FCMiddlewareParameters(scheme=[Dense([256]), Dense([256]), Dense([256])])