def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
             head_idx: int = 0, loss_weight: float = 1., is_local: bool = True,
             activation_function: str = 'relu', dense_layer=Dense, scheme=None):
    super().__init__(agent_parameters, spaces, network_name, head_idx, loss_weight, is_local,
                     activation_function, dense_layer=dense_layer)
    self.name = 'regression_head'
    # avoid the mutable-default-argument pitfall: build the default scheme per instance
    self.scheme = scheme if scheme is not None else [Dense(256), Dense(256)]
    self.layers = []
    # the number of head outputs depends on the type of the action space
    if isinstance(self.spaces.action, BoxActionSpace):
        self.num_actions = self.spaces.action.shape[0]
    elif isinstance(self.spaces.action, DiscreteActionSpace):
        self.num_actions = len(self.spaces.action.actions)
    self.return_type = QActionStateValue
    # optionally swap the MSE loss for a Huber loss, which is more robust to outliers
    if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
        self.loss_type = tf.losses.huber_loss
    else:
        self.loss_type = tf.losses.mean_squared_error
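# A minimal numpy sketch (not part of the head) of why replace_mse_with_huber_loss
# can help: the Huber loss grows linearly for large errors, so a single outlier
# TD-error dominates it far less than it dominates the MSE. All names below are
# illustrative only.
import numpy as np

def mse(err):
    return 0.5 * err ** 2

def huber(err, delta=1.0):
    # quadratic near zero, linear beyond +/- delta
    return np.where(np.abs(err) <= delta,
                    0.5 * err ** 2,
                    delta * (np.abs(err) - 0.5 * delta))

errors = np.array([0.1, 1.0, 10.0])
print(mse(errors))    # [ 0.005  0.5   50.   ]
print(huber(errors))  # [ 0.005  0.5    9.5  ]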
def __call__(self, input_layer, name: str = None, is_training=None):
    """
    Applies a conv2d layer followed by soft attention pooling over its output
    :param input_layer: previous layer
    :param name: layer name
    :return: the attention-weighted context vector
    """
    conv = tf.layers.conv2d(input_layer, filters=self.num_filters, kernel_size=self.kernel_size,
                            strides=self.strides, data_format='channels_last', name=name)
    # score every position, then normalize the scores into attention weights
    W1 = Dense(self.units)
    V = Dense(1)
    score = tf.nn.tanh(W1(conv))
    attention_weights = tf.nn.softmax(V(score), axis=1)
    # attention-weighted sum over axis 1 collapses the attended positions
    context_vector = attention_weights * conv
    context_vector = tf.reduce_sum(context_vector, axis=1)
    return context_vector
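# A shape-level numpy sketch of the soft attention pooling above, assuming the
# conv output has been flattened to (batch, positions, channels); the real conv2d
# output is 4D, so this only demonstrates the softmax-weighted reduction itself.
# W1 and V are stood in for by random weight matrices, not learned parameters.
import numpy as np

batch, positions, channels, units = 2, 8, 16, 32
conv = np.random.randn(batch, positions, channels)
W1 = np.random.randn(channels, units)
V = np.random.randn(units, 1)

score = np.tanh(conv @ W1)                                            # (batch, positions, units)
logits = score @ V                                                    # (batch, positions, 1)
weights = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)  # softmax over positions
context = (weights * conv).sum(axis=1)                                # (batch, channels)
print(weights.sum(axis=1).ravel())  # each row of weights sums to 1
print(context.shape)                # (2, 16)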
def __init__(self, activation_function: str = 'relu', name: str = 'regression_head_params',
             num_output_head_copies: int = 1, rescale_gradient_from_head_by_factor: float = 1.0,
             loss_weight: float = 1.0, dense_layer=Dense, scheme=None):
    super().__init__(parameterized_class=RegressionHead,
                     activation_function=activation_function,
                     name=name,
                     dense_layer=dense_layer,
                     num_output_head_copies=num_output_head_copies,
                     rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
                     loss_weight=loss_weight)
    # keep the scheme on the parameters object; it was previously accepted but silently dropped
    self.scheme = scheme if scheme is not None else [Dense(256), Dense(256)]
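# A hypothetical usage sketch (the wrapper name 'main' and the layer sizes are
# illustrative, not taken from a real preset): attaching this head to a network
# wrapper in a preset, the same way other head parameter classes are used.
agent_params.network_wrappers['main'].heads_parameters = [
    RegressionHeadParameters(activation_function='tanh',
                             scheme=[Dense(128), Dense(128)])
]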
def __init__(self):
    super().__init__()
    self.input_embedders_parameters = {
        'observation': InputEmbedderParameters(activation_function='leaky_relu'),
        'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
        'goal': InputEmbedderParameters(activation_function='leaky_relu')
    }
    self.input_embedders_parameters['observation'].scheme = [
        Conv2d(32, 8, 4),
        Conv2d(64, 4, 2),
        Conv2d(64, 3, 1),
        Dense(512),
    ]
    self.input_embedders_parameters['measurements'].scheme = [
        Dense(128),
        Dense(128),
        Dense(128),
    ]
    self.input_embedders_parameters['goal'].scheme = [
        Dense(128),
        Dense(128),
        Dense(128),
    ]
    self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
                                                        scheme=MiddlewareScheme.Empty)
    self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
    self.async_training = False
    self.batch_size = 64
    self.adam_optimizer_beta1 = 0.95
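# A quick sanity-check sketch (plain Python, illustrative only) of how the
# observation embedder's Conv2d(filters, kernel, stride) scheme shrinks the
# spatial dimensions, assuming an 84x84 input and 'valid' padding:
# out = floor((in - kernel) / stride) + 1.
def conv_out(size, kernel, stride):
    return (size - kernel) // stride + 1

size = 84
for kernel, stride in [(8, 4), (4, 2), (3, 1)]:
    size = conv_out(size, kernel, stride)
    print(size)  # 20, 9, 7 -> a 7x7x64 feature map feeds the Dense(512) layer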
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = NStepQAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0001
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [
    Conv2d(16, 8, 4),
    Conv2d(32, 4, 2)
]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(256)]

###############
# Environment #
###############
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = ['breakout', 'pong', 'space_invaders']
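# A sketch of how such a preset is typically wired up and launched. The graph
# manager call mirrors other Coach presets; the default VisualizationParameters()
# and the import paths are assumptions for this excerpt.
from rl_coach.base_parameters import VisualizationParameters
from rl_coach.graph_managers.basic_rl_graph_manager import BasicRLGraphManager

graph_manager = BasicRLGraphManager(agent_params=agent_params,
                                    env_params=env_params,
                                    schedule_params=schedule_params,
                                    vis_params=VisualizationParameters(),
                                    preset_validation_params=preset_validation_params)

# e.g. from the command line, picking one of the traced levels:
#   coach -p <this_preset_name> -lvl breakout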
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2000)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = PPOAgentParameters()
agent_params.network_wrappers['actor'].learning_rate = 0.001
agent_params.network_wrappers['critic'].learning_rate = 0.001
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(64)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(64)]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense(64)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(64)]
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_observation_filter('observation', 'normalize',
                                                 ObservationNormalizationFilter())

###############
# Environment #
        Conv2d(64, 3, 1),
        BatchnormActivationDropout(batchnorm=True, activation_function=tf.tanh),
        Conv2d(128, 3, 2),
        BatchnormActivationDropout(batchnorm=True, activation_function=tf.tanh),
        Conv2d(128, 3, 1),
        BatchnormActivationDropout(batchnorm=True, activation_function=tf.tanh),
        Conv2d(256, 3, 1),
        BatchnormActivationDropout(batchnorm=True, activation_function=tf.tanh),
        Conv2d(256, 3, 1),
        BatchnormActivationDropout(batchnorm=True, activation_function=tf.tanh),
        Dense(512),
        BatchnormActivationDropout(activation_function=tf.tanh, dropout_rate=0.3),
        Dense(512),
        BatchnormActivationDropout(activation_function=tf.tanh, dropout_rate=0.3)
    ],
    activation_function='none'  # we define the activation function for each layer explicitly
),
'measurements': InputEmbedderParameters(
    scheme=[
        Dense(128),
        BatchnormActivationDropout(activation_function=tf.tanh, dropout_rate=0.5),
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = NAFAgentParameters()
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense(200)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(200)]
agent_params.network_wrappers['main'].clip_gradients = 1000
agent_params.network_wrappers['main'].gradients_clipping_method = GradientClippingMethod.ClipByValue

###############
# Environment #
###############
import jsbsim
import gym_jsbsim
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter, InputFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
    32)
top_agent_params.algorithm.num_consecutive_training_steps = 40
top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(40)

# exploration - OU process
top_agent_params.exploration = OUProcessParameters()
top_agent_params.exploration.theta = 0.1

# actor
top_actor = top_agent_params.network_wrappers['actor']
top_actor.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
top_actor.middleware_parameters.scheme = [Dense(64)] * 3
top_actor.learning_rate = 0.001
top_actor.batch_size = 4096

# critic
top_critic = top_agent_params.network_wrappers['critic']
top_critic.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
top_critic.embedding_merger_type = EmbeddingMergerType.Concat
top_critic.middleware_parameters.scheme = [Dense(64)] * 3
top_critic.learning_rate = 0.001
top_critic.batch_size = 4096
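# A minimal numpy sketch of the Ornstein-Uhlenbeck exploration noise configured
# above: theta pulls the noise back toward mu, so consecutive samples are
# temporally correlated (useful for continuous control). sigma, dt, and mu are
# illustrative values, not taken from OUProcessParameters defaults.
import numpy as np

def ou_noise(n_steps, theta=0.1, mu=0.0, sigma=0.2, dt=1.0, x0=0.0):
    x = x0
    samples = []
    for _ in range(n_steps):
        x += theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.randn()
        samples.append(x)
    return np.array(samples)

noise = ou_noise(1000)
print(noise[:5])  # smoothly wandering values rather than i.i.d. samples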
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2048)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ClippedPPOAgentParameters()
agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'tanh'
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'tanh'
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.99
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True
agent_params.input_filter = InputFilter()
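# A sketch of what LinearSchedule(1.0, 0, 1000000) above does to the clipping
# epsilon decay: linear interpolation from the initial to the final value over
# the given number of steps, then held constant. Reimplemented in plain Python
# for illustration; Coach's own LinearSchedule class is the source of truth.
def linear_schedule(step, initial=1.0, final=0.0, decay_steps=1000000):
    fraction = min(step / decay_steps, 1.0)
    return initial + fraction * (final - initial)

for step in [0, 250000, 500000, 1000000, 2000000]:
    print(step, linear_schedule(step))  # 1.0, 0.75, 0.5, 0.0, 0.0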
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(2000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty

###############
# Environment #
###############
env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].l2_regularization = 0.0001
agent_params.network_wrappers['main'].softmax_temperature = 0.2

# reward model params
agent_params.network_wrappers['reward_model'] = deepcopy(agent_params.network_wrappers['main'])
agent_params.network_wrappers['reward_model'].learning_rate = 0.0001
agent_params.network_wrappers['reward_model'].l2_regularization = 0

# imitation model params
agent_params.network_wrappers['imitation_model'] = deepcopy(agent_params.network_wrappers['main'])
agent_params.network_wrappers['imitation_model'].learning_rate = 0.0001
agent_params.network_wrappers['imitation_model'].l2_regularization = 0
agent_params.network_wrappers['imitation_model'].heads_parameters = [ClassificationHeadParameters()]
agent_params.network_wrappers['imitation_model'].input_embedders_parameters['observation'].scheme = \
    [Dense(1024), Dense(1024), Dense(512), Dense(512), Dense(256)]
agent_params.network_wrappers['imitation_model'].middleware_parameters.scheme = [Dense(128), Dense(64)]

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, DATASET_SIZE)

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1 / 200.))
from rl_coach.filters.reward import RewardEwmaNormalizationFilter
import numpy as np

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(0)
schedule_params.evaluation_steps = EnvironmentEpisodes(0)

#####################
# DDPG Agent Params #
#####################
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(300)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['actor'].heads_parameters[0].activation_function = 'sigmoid'
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense(300)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = [Dense(300)]
agent_params.network_wrappers['critic'].embedding_merger_type = EmbeddingMergerType.Sum
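# A small numpy illustration of why the actor head's activation is set to
# 'sigmoid' above: the head output is squashed into (0, 1), which can then be
# mapped onto the environment's action bounds. The rescaling shown here is a
# generic sketch with hypothetical bounds, not Coach's internal mechanism.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

raw_head_output = np.array([-3.0, 0.0, 3.0])
unit_actions = sigmoid(raw_head_output)  # all values land in (0, 1)
low, high = -2.0, 2.0                    # hypothetical action bounds
actions = low + unit_actions * (high - low)
print(unit_actions)  # [0.047 0.5   0.953]
print(actions)       # [-1.81  0.    1.81]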
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ActorCriticAgentParameters()
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.beta_entropy = 0.005
agent_params.network_wrappers['main'].learning_rate = 0.00002
agent_params.network_wrappers['main'].input_embedders_parameters['observation'] = \
    InputEmbedderParameters(scheme=[Dense(200)])
agent_params.network_wrappers['main'].middleware_parameters = LSTMMiddlewareParameters(
    scheme=MiddlewareScheme.Empty, number_of_lstm_cells=128)
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1 / 20.))
agent_params.input_filter.add_observation_filter('observation', 'normalize',
                                                 ObservationNormalizationFilter())

###############
# Environment #
###############
env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
# rename the 'observation' embedders to 'measurements' so they read from that state key
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['actor'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['critic'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'].scheme = [Dense(300)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(200)]
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'].scheme = [Dense(400)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter("rescale", RewardRescaleFilter(1 / 10.))

###############
################
agent_params = DDPGAgentParameters()

# actor
actor_network = agent_params.network_wrappers['actor']
actor_network.learning_rate = 0.001
actor_network.batch_size = 256
actor_network.optimizer_epsilon = 1e-08
actor_network.adam_optimizer_beta1 = 0.9
actor_network.adam_optimizer_beta2 = 0.999
actor_network.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
actor_network.middleware_parameters = FCMiddlewareParameters(
    scheme=[Dense(256), Dense(256), Dense(256)])
actor_network.heads_parameters[0].batchnorm = False

# critic
critic_network = agent_params.network_wrappers['critic']
critic_network.learning_rate = 0.001
critic_network.batch_size = 256
critic_network.optimizer_epsilon = 1e-08
critic_network.adam_optimizer_beta1 = 0.9
critic_network.adam_optimizer_beta2 = 0.999
critic_network.input_embedders_parameters = {
    'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
critic_network.middleware_parameters = FCMiddlewareParameters(
# reward model params
agent_params.network_wrappers['reward_model'] = deepcopy(agent_params.network_wrappers['main'])
agent_params.network_wrappers['reward_model'].learning_rate = 0.0001
agent_params.network_wrappers['reward_model'].l2_regularization = 0

# imitation model params
agent_params.network_wrappers['imitation_model'] = deepcopy(agent_params.network_wrappers['main'])
agent_params.network_wrappers['imitation_model'].learning_rate = 0.0001
agent_params.network_wrappers['imitation_model'].l2_regularization = 0
agent_params.network_wrappers['imitation_model'].heads_parameters = [ClassificationHeadParameters()]
agent_params.network_wrappers['imitation_model'].input_embedders_parameters['observation'].scheme = \
    [Dense(1024), Dense(1024), Dense(512), Dense(512), Dense(256)]
agent_params.network_wrappers['imitation_model'].middleware_parameters.scheme = [Dense(128), Dense(64)]

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

# Input filtering
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1 / 200.))