Example #1
    def __init__(self,
                 agent_parameters: AgentParameters,
                 spaces: SpacesDefinition,
                 network_name: str,
                 head_idx: int = 0,
                 loss_weight: float = 1.,
                 is_local: bool = True,
                 activation_function: str = 'relu',
                 dense_layer=Dense,
                 scheme=[Dense(256), Dense(256)]):
        super().__init__(agent_parameters,
                         spaces,
                         network_name,
                         head_idx,
                         loss_weight,
                         is_local,
                         activation_function,
                         dense_layer=dense_layer)
        self.name = 'regression_head'
        self.scheme = scheme
        self.layers = []
        if isinstance(self.spaces.action, BoxActionSpace):
            self.num_actions = self.spaces.action.shape[0]
        elif isinstance(self.spaces.action, DiscreteActionSpace):
            self.num_actions = len(self.spaces.action.actions)
        self.return_type = QActionStateValue
        if agent_parameters.network_wrappers[self.network_name].replace_mse_with_huber_loss:
            self.loss_type = tf.losses.huber_loss
        else:
            self.loss_type = tf.losses.mean_squared_error
Example #2
    def __call__(self, input_layer, name: str = None, is_training=None):
        """
        Applies a conv2d layer to the input and pools its output with an
        additive-attention step.
        :param input_layer: previous layer
        :param name: layer name
        :return: attention-weighted context vector
        """

        conv = tf.layers.conv2d(input_layer,
                                filters=self.num_filters,
                                kernel_size=self.kernel_size,
                                strides=self.strides,
                                data_format='channels_last',
                                name=name)
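        # additive-attention pooling: score each position with small Dense layers,
        # softmax the scores along axis 1, then take the attention-weighted sum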
        W1 = Dense(self.units)
        V = Dense(1)
        score = tf.nn.tanh(W1(conv))
        attention_weights = tf.nn.softmax(V(score), axis=1)
        context_vector = attention_weights * conv
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector
Example #3
    def __init__(self,
                 activation_function: str = 'relu',
                 name: str = 'q_head_params',
                 num_output_head_copies: int = 1,
                 rescale_gradient_from_head_by_factor: float = 1.0,
                 loss_weight: float = 1.0,
                 dense_layer=Dense,
                 scheme=[Dense(256), Dense(256)]):
        super().__init__(parameterized_class=RegressionHead,
                         activation_function=activation_function,
                         name=name,
                         dense_layer=dense_layer,
                         num_output_head_copies=num_output_head_copies,
                         rescale_gradient_from_head_by_factor=rescale_gradient_from_head_by_factor,
                         loss_weight=loss_weight)
Example #4
    def __init__(self):
        super().__init__()
        self.input_embedders_parameters = {'observation': InputEmbedderParameters(activation_function='leaky_relu'),
                                            'measurements': InputEmbedderParameters(activation_function='leaky_relu'),
                                            'goal': InputEmbedderParameters(activation_function='leaky_relu')}

        self.input_embedders_parameters['observation'].scheme = [
            Conv2d(32, 8, 4),
            Conv2d(64, 4, 2),
            Conv2d(64, 3, 1),
            Dense(512),
        ]

        self.input_embedders_parameters['measurements'].scheme = [
            Dense(128),
            Dense(128),
            Dense(128),
        ]

        self.input_embedders_parameters['goal'].scheme = [
            Dense(128),
            Dense(128),
            Dense(128),
        ]

        self.middleware_parameters = FCMiddlewareParameters(activation_function='leaky_relu',
                                                            scheme=MiddlewareScheme.Empty)
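        # MeasurementsPredictionHead predicts future measurements
        # (as in Direct Future Prediction-style agents)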
        self.heads_parameters = [MeasurementsPredictionHeadParameters(activation_function='leaky_relu')]
        self.async_training = False
        self.batch_size = 64
        self.adam_optimizer_beta1 = 0.95
Example #5
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(100)
schedule_params.evaluation_steps = EnvironmentEpisodes(3)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = NStepQAgentParameters()

agent_params.network_wrappers['main'].learning_rate = 0.0001
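# Conv2d arguments are (num_filters, kernel_size, stride)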
agent_params.network_wrappers['main'].input_embedders_parameters[
    'observation'].scheme = [Conv2d(16, 8, 4),
                             Conv2d(32, 4, 2)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [
    Dense(256)
]

###############
# Environment #
###############
env_params = Atari(level=SingleLevelSelection(atari_deterministic_v4))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.trace_test_levels = [
    'breakout', 'pong', 'space_invaders'
]
Example #6
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2000)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = PPOAgentParameters()
agent_params.network_wrappers['actor'].learning_rate = 0.001
agent_params.network_wrappers['critic'].learning_rate = 0.001

agent_params.network_wrappers['actor'].input_embedders_parameters[
    'observation'].scheme = [Dense(64)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [
    Dense(64)
]
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'observation'].scheme = [Dense(64)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [
    Dense(64)
]

agent_params.input_filter = InputFilter()
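# standardize observations online with a running mean/std normalization filter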
agent_params.input_filter.add_observation_filter(
    'observation', 'normalize', ObservationNormalizationFilter())

###############
# Environment #
Example #7
         Conv2d(64, 3, 1),
         BatchnormActivationDropout(batchnorm=True,
                                    activation_function=tf.tanh),
         Conv2d(128, 3, 2),
         BatchnormActivationDropout(batchnorm=True,
                                    activation_function=tf.tanh),
         Conv2d(128, 3, 1),
         BatchnormActivationDropout(batchnorm=True,
                                    activation_function=tf.tanh),
         Conv2d(256, 3, 1),
         BatchnormActivationDropout(batchnorm=True,
                                    activation_function=tf.tanh),
         Conv2d(256, 3, 1),
         BatchnormActivationDropout(batchnorm=True,
                                    activation_function=tf.tanh),
         Dense(512),
         BatchnormActivationDropout(activation_function=tf.tanh,
                                    dropout_rate=0.3),
         Dense(512),
         BatchnormActivationDropout(activation_function=tf.tanh,
                                    dropout_rate=0.3)
     ],
     activation_function='none'  # we define the activation function for each layer explicitly
 ),
 'measurements':
 InputEmbedderParameters(
     scheme=[
         Dense(128),
         BatchnormActivationDropout(activation_function=tf.tanh,
                                    dropout_rate=0.5),
Example #8
####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = NAFAgentParameters()
agent_params.network_wrappers['main'].input_embedders_parameters[
    'observation'].scheme = [Dense(200)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [
    Dense(200)
]
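# ClipByValue: clip each gradient element to [-1000, 1000] before the update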
agent_params.network_wrappers['main'].clip_gradients = 1000
agent_params.network_wrappers[
    'main'].gradients_clipping_method = GradientClippingMethod.ClipByValue

###############
# Environment #
###############
import jsbsim
import gym_jsbsim
from rl_coach.filters.filter import NoInputFilter, NoOutputFilter
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
Example #9
    32)
top_agent_params.algorithm.num_consecutive_training_steps = 40
top_agent_params.algorithm.num_steps_between_copying_online_weights_to_target = TrainingSteps(
    40)

# exploration - OU process
top_agent_params.exploration = OUProcessParameters()
top_agent_params.exploration.theta = 0.1

# actor
top_actor = top_agent_params.network_wrappers['actor']
top_actor.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
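# three fully connected layers of 64 units each for the actor middleware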
top_actor.middleware_parameters.scheme = [Dense(64)] * 3
top_actor.learning_rate = 0.001
top_actor.batch_size = 4096

# critic
top_critic = top_agent_params.network_wrappers['critic']
top_critic.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
top_critic.embedding_merger_type = EmbeddingMergerType.Concat
top_critic.middleware_parameters.scheme = [Dense(64)] * 3
top_critic.learning_rate = 0.001
top_critic.batch_size = 4096
Example #10
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000)
schedule_params.steps_between_evaluation_periods = EnvironmentSteps(2048)
schedule_params.evaluation_steps = EnvironmentEpisodes(5)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ClippedPPOAgentParameters()


agent_params.network_wrappers['main'].learning_rate = 0.0003
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].activation_function = 'tanh'
agent_params.network_wrappers['main'].input_embedders_parameters['observation'].scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.scheme = [Dense(64)]
agent_params.network_wrappers['main'].middleware_parameters.activation_function = 'tanh'
agent_params.network_wrappers['main'].batch_size = 64
agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999

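# clipped-surrogate objective: likelihood ratios are clipped with epsilon = 0.2,
# and the clipping term is annealed linearly to 0 over 1M steps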
agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
agent_params.algorithm.clipping_decay_schedule = LinearSchedule(1.0, 0, 1000000)
agent_params.algorithm.beta_entropy = 0
agent_params.algorithm.gae_lambda = 0.95
agent_params.algorithm.discount = 0.99
agent_params.algorithm.optimization_epochs = 10
agent_params.algorithm.estimate_state_value_using_gae = True

agent_params.input_filter = InputFilter()
Example #11
####################
# Graph Scheduling #
####################

schedule_params = ScheduleParameters()
schedule_params.improve_steps = EnvironmentSteps(2000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [Dense(300)]
agent_params.network_wrappers['critic'].input_embedders_parameters['observation'].scheme = [Dense(400)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [Dense(300)]
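# EmbedderScheme.Empty: the action input reaches the critic without embedder hidden layers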
agent_params.network_wrappers['critic'].input_embedders_parameters['action'].scheme = EmbedderScheme.Empty

###############
# Environment #
###############
env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))

########
# Test #
########
preset_validation_params = PresetValidationParameters()
preset_validation_params.test = True
Example #12
agent_params.network_wrappers['main'].replace_mse_with_huber_loss = False
agent_params.network_wrappers['main'].l2_regularization = 0.0001
agent_params.network_wrappers['main'].softmax_temperature = 0.2

# reward model params
agent_params.network_wrappers['reward_model'] = deepcopy(agent_params.network_wrappers['main'])
agent_params.network_wrappers['reward_model'].learning_rate = 0.0001
agent_params.network_wrappers['reward_model'].l2_regularization = 0

agent_params.network_wrappers['imitation_model'] = deepcopy(agent_params.network_wrappers['main'])
agent_params.network_wrappers['imitation_model'].learning_rate = 0.0001
agent_params.network_wrappers['imitation_model'].l2_regularization = 0

agent_params.network_wrappers['imitation_model'].heads_parameters = [ClassificationHeadParameters()]
agent_params.network_wrappers['imitation_model'].input_embedders_parameters['observation'].scheme = \
    [Dense(1024), Dense(1024), Dense(512), Dense(512), Dense(256)]
agent_params.network_wrappers['imitation_model'].middleware_parameters.scheme = [Dense(128), Dense(64)]


# ER size
agent_params.memory = EpisodicExperienceReplayParameters()
agent_params.memory.max_size = (MemoryGranularity.Transitions, DATASET_SIZE)


# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0


agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale', RewardRescaleFilter(1/200.))
Example #13
from rl_coach.filters.reward import RewardEwmaNormalizationFilter
import numpy as np

####################
# Graph Scheduling #
####################
schedule_params = ScheduleParameters()
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(0)
schedule_params.evaluation_steps = EnvironmentEpisodes(0)

#####################
# DDPG Agent Params #
#####################
agent_params = DDPGAgentParameters()
agent_params.network_wrappers['actor'].input_embedders_parameters[
    'observation'].scheme = [Dense(300)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [
    Dense(300)
]
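# a sigmoid output on the actor head bounds each action to (0, 1)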
agent_params.network_wrappers['actor'].heads_parameters[
    0].activation_function = 'sigmoid'
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'observation'].scheme = [Dense(300)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [
    Dense(300)
]
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'action'].scheme = [Dense(300)]

agent_params.network_wrappers[
    'critic'].embedding_merger_type = EmbeddingMergerType.Sum
Example #14
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(0)

#########
# Agent #
#########
agent_params = ActorCriticAgentParameters()
agent_params.algorithm.apply_gradients_every_x_episodes = 1
agent_params.algorithm.num_steps_between_gradient_updates = 20
agent_params.algorithm.beta_entropy = 0.005
agent_params.network_wrappers['main'].learning_rate = 0.00002
agent_params.network_wrappers['main'].input_embedders_parameters['observation'] = \
    InputEmbedderParameters(scheme=[Dense(200)])
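# replace the default fully connected middleware with a single LSTM layer of 128 cells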
agent_params.network_wrappers[
    'main'].middleware_parameters = LSTMMiddlewareParameters(
        scheme=MiddlewareScheme.Empty, number_of_lstm_cells=128)

agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale',
                                            RewardRescaleFilter(1 / 20.))
agent_params.input_filter.add_observation_filter(
    'observation', 'normalize', ObservationNormalizationFilter())

###############
# Environment #
###############
env_params = GymVectorEnvironment(level=SingleLevelSelection(mujoco_v2))
Example #15
schedule_params = ScheduleParameters()
schedule_params.improve_steps = TrainingSteps(10000000000)
schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(20)
schedule_params.evaluation_steps = EnvironmentEpisodes(1)
schedule_params.heatup_steps = EnvironmentSteps(1000)

#########
# Agent #
#########
agent_params = DDPGAgentParameters()
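# reuse the default 'observation' embedder settings under the 'measurements' input key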
agent_params.network_wrappers['actor'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['actor'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['critic'].input_embedders_parameters['measurements'] = \
    agent_params.network_wrappers['critic'].input_embedders_parameters.pop('observation')
agent_params.network_wrappers['actor'].input_embedders_parameters[
    'measurements'].scheme = [Dense(300)]
agent_params.network_wrappers['actor'].middleware_parameters.scheme = [
    Dense(200)
]
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'measurements'].scheme = [Dense(400)]
agent_params.network_wrappers['critic'].middleware_parameters.scheme = [
    Dense(300)
]
agent_params.network_wrappers['critic'].input_embedders_parameters[
    'action'].scheme = EmbedderScheme.Empty
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter("rescale",
                                            RewardRescaleFilter(1 / 10.))

###############
Example #16
################
agent_params = DDPGAgentParameters()

# actor
actor_network = agent_params.network_wrappers['actor']
actor_network.learning_rate = 0.001
actor_network.batch_size = 256
actor_network.optimizer_epsilon = 1e-08
actor_network.adam_optimizer_beta1 = 0.9
actor_network.adam_optimizer_beta2 = 0.999
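# EmbedderScheme.Empty: inputs pass through their embedders with no hidden layers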
actor_network.input_embedders_parameters = {
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
actor_network.middleware_parameters = FCMiddlewareParameters(
    scheme=[Dense(256), Dense(256), Dense(256)])
actor_network.heads_parameters[0].batchnorm = False

# critic
critic_network = agent_params.network_wrappers['critic']
critic_network.learning_rate = 0.001
critic_network.batch_size = 256
critic_network.optimizer_epsilon = 1e-08
critic_network.adam_optimizer_beta1 = 0.9
critic_network.adam_optimizer_beta2 = 0.999
critic_network.input_embedders_parameters = {
    'action': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'desired_goal': InputEmbedderParameters(scheme=EmbedderScheme.Empty),
    'observation': InputEmbedderParameters(scheme=EmbedderScheme.Empty)
}
critic_network.middleware_parameters = FCMiddlewareParameters(
Example #17
# reward model params
agent_params.network_wrappers['reward_model'] = deepcopy(
    agent_params.network_wrappers['main'])
agent_params.network_wrappers['reward_model'].learning_rate = 0.0001
agent_params.network_wrappers['reward_model'].l2_regularization = 0

agent_params.network_wrappers['imitation_model'] = deepcopy(
    agent_params.network_wrappers['main'])
agent_params.network_wrappers['imitation_model'].learning_rate = 0.0001
agent_params.network_wrappers['imitation_model'].l2_regularization = 0

agent_params.network_wrappers['imitation_model'].heads_parameters = [
    ClassificationHeadParameters()
]
agent_params.network_wrappers['imitation_model'].input_embedders_parameters['observation'].scheme = \
    [Dense(1024), Dense(1024), Dense(512), Dense(512), Dense(256)]
agent_params.network_wrappers[
    'imitation_model'].middleware_parameters.scheme = [Dense(128),
                                                       Dense(64)]

# ER size
agent_params.memory = EpisodicExperienceReplayParameters()

# E-Greedy schedule
agent_params.exploration.epsilon_schedule = LinearSchedule(0, 0, 10000)
agent_params.exploration.evaluation_epsilon = 0

# Input filtering
agent_params.input_filter = InputFilter()
agent_params.input_filter.add_reward_filter('rescale',
                                            RewardRescaleFilter(1 / 200.))