class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: boolean indicating whether to use tensorflow summary file writer.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions
      enforced by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with clip_loss as the linear bound.
    """

    name = 'DQFDAgent'
    model = DQFDModel
    default_config = dict(
        target_update_frequency=10000,
        demo_memory_capacity=1000000,
        demo_sampling_ratio=0.01
    )

    def __init__(self, config, model=None):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config, model)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online
        # training samples: p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size ' \
            'is positive. (Calculated {} based on current parameters)'.format(self.demo_batch_size)

    def observe(self, reward, terminal):
        """Adds observations, updates via sampling from memories according to update rate.

        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

        if self.timestep >= self.first_update and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    def import_demonstrations(self, demonstrations):
        """Imports demonstrations, i.e. expert observations.

        Args:
            demonstrations:
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['state'])
            else:
                state = observation['state']
            if self.unique_action:
                action = dict(action=observation['action'])
            else:
                action = observation['action']

            self.demo_memory.add_observation(
                state=state,
                action=action,
                reward=observation['reward'],
                terminal=observation['terminal'],
                internal=observation['internal']
            )

    def pretrain(self, steps):
        """Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(batch)
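
# --- Illustrative sketch (not part of the library source): how the demo batch size above is
# derived from the expert sampling ratio p = n_demo / (n_demo + n_replay). The parameter
# values below are hypothetical.
batch_size = 32
demo_sampling_ratio = 0.2
demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
assert demo_batch_size == 8  # 8 expert samples accompany every 32 online samples (p = 8 / 40 = 0.2)
# With the default ratio of 0.01 the result truncates to 0 for any batch_size below 99, which is
# exactly the situation the assertion in __init__ guards against.
assert int(0.01 * 32 / (1.0 - 0.01)) == 0
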
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: string directory to write tensorflow summaries. Default None.
    * `tf_summary_level`: int indicating which tensorflow summaries to create.
    * `tf_summary_interval`: int number of calls to get_action until writing tensorflow
      summaries on update.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions
      enforced by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with clip_loss as the linear bound.
    """

    default_config = dict(
        # Agent
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        # Model
        optimizer=dict(
            type='adam',
            learning_rate=1e-3
        ),
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,  # not documented!!!
        # DistributionModel
        distributions=None,  # not documented!!!
        entropy_regularization=None,
        # QModel
        target_sync_frequency=10000,  # not documented!!!
        target_update_weight=1.0,  # not documented!!!
        huber_loss=0.0,  # not documented!!!
        # Logging
        log_level='info',
        model_directory=None,
        save_frequency=600,  # TensorFlow default
        summary_labels=['total-loss'],
        summary_frequency=120,  # TensorFlow default
        # TensorFlow distributed configuration
        cluster_spec=None,
        parameter_server=False,
        task_index=0,
        device=None,
        local_model=False,
        replica_model=False,
        scope='dqfd'
    )

    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.network_spec = network_spec
        config = config.copy()
        config.default(DQFDAgent.default_config)
        # DQFD always uses double DQN, which is a required key for a Q-model.
        config.obligatory(double_dqn=True)

        self.target_update_frequency = config.target_update_frequency
        self.demo_memory_capacity = config.demo_memory_capacity

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online
        # training samples: p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.demo_memory_capacity, states_spec, actions_spec)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            config=config
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of
        observations, set_demonstrations is more appropriate, which directly sets memory
        contents to an array and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internal'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict whose values are arrays
        containing all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def initialize_model(self, states_spec, actions_spec, config):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            config=config
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(batch)
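
# --- Illustrative sketch (not part of the library source): the per-observation dict layout
# consumed by import_demonstrations above. The state, action and reward values are hypothetical.
demonstrations = [
    dict(
        states=[0.1, 0.3, -0.2, 0.0],  # wrapped into dict(state=...) when there is a single state
        actions=1,                     # expert action; wrapped into dict(action=...) when single
        reward=0.0,
        terminal=False,
        internal=[]                    # internal (e.g. RNN) state; empty for feed-forward networks
    )
]
# agent.import_demonstrations(demonstrations) followed by agent.pretrain(steps=...) would then
# pre-train from this expert data before the online loop starts.
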
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `device`: string of tensorflow device name.
    * `tf_summary`: string directory to write tensorflow summaries. Default None.
    * `tf_summary_level`: int indicating which tensorflow summaries to create.
    * `tf_summary_interval`: int number of calls to get_action until writing tensorflow
      summaries on update.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions
      enforced by the large margin function.
    * `clip_loss`: float; if not 0, uses the Huber loss with clip_loss as the linear bound.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        scope='dqfd',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        normalize_rewards=False,
        variable_noise=None,
        distributions_spec=None,
        entropy_regularization=None,
        target_sync_frequency=10000,
        target_update_weight=1.0,
        huber_loss=None,
        preprocessing=None,
        exploration=None,
        reward_preprocessing=None,
        batched_observe=1000,
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1,
        expert_margin=0.5,
        supervised_weight=0.1,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent
        ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
        pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec:
            actions_spec:
            network_spec:
            device:
            scope:
            saver_spec:
            summary_spec:
            distributed_spec:
            optimizer:
            discount:
            normalize_rewards:
            variable_noise:
            distributions_spec:
            entropy_regularization:
            target_sync_frequency:
            target_update_weight:
            huber_loss:
            preprocessing:
            exploration:
            reward_preprocessing:
            batched_observe:
            batch_size:
            memory:
            first_update:
            update_frequency:
            repeat_update:
            expert_margin:
            supervised_weight:
            demo_memory_capacity:
            demo_sampling_ratio:
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer

        if memory is None:
            memory = dict(
                type='replay',
                capacity=100000
            )

        self.network_spec = network_spec
        self.device = device
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.normalize_rewards = normalize_rewards
        self.variable_noise = variable_noise
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double DQN, which is a required key for a Q-model.
        self.double_q_model = True

        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online
        # training samples: p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            preprocessing=preprocessing,
            exploration=exploration,
            reward_preprocessing=reward_preprocessing,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self, states_spec, actions_spec):
        return QDemoModel(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            normalize_rewards=self.normalize_rewards,
            variable_noise=self.variable_noise,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            double_q_model=self.double_q_model,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of
        observations, set_demonstrations is more appropriate, which directly sets memory
        contents to an array and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict whose values are arrays
        containing all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(batch)
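
# --- Illustrative usage sketch (not part of the library source): constructing the DQFD agent
# above and pre-training from expert data. The import path, spec values and layer sizes are
# assumptions and may need adjusting to the installed tensorforce version.
from tensorforce.agents import DQFDAgent  # assumed public import path

agent = DQFDAgent(
    states_spec=dict(shape=(4,), type='float'),
    actions_spec=dict(type='int', num_actions=2),
    network_spec=[dict(type='dense', size=32), dict(type='dense', size=32)],
    batch_size=32,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2
)
# A single hypothetical expert transition; in practice the demonstration memory would hold many.
expert_data = [dict(states=[0.1, 0.3, -0.2, 0.0], actions=1, reward=0.0, terminal=False, internals=[])]
agent.import_demonstrations(expert_data)  # fill the demonstration memory
agent.pretrain(steps=100)                 # supervised + double-Q pre-training updates
# ...then run the usual act()/observe() loop, which mixes demo and online batches.
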
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data.

    Configuration:

    Each agent requires the following configuration parameters:

    * `states`: dict containing one or more state definitions.
    * `actions`: dict containing one or more action definitions.
    * `preprocessing`: dict or list containing state preprocessing configuration.
    * `exploration`: dict containing action exploration configuration.

    Each model requires the following configuration parameters:

    * `discount`: float of discount factor (gamma).
    * `learning_rate`: float of learning rate (alpha).
    * `optimizer`: string of optimizer to use (e.g. 'adam').
    * `optimizer_args`: list of arguments for optimizer.
    * `optimizer_kwargs`: dict of keyword arguments for optimizer.
    * `device`: string of tensorflow device name.
    * `tf_saver`: boolean whether to save model parameters.
    * `tf_summary`: boolean indicating whether to use tensorflow summary file writer.
    * `log_level`: string containing log level (e.g. 'info').
    * `distributed`: boolean indicating whether to use distributed tensorflow.
    * `global_model`: global model.
    * `session`: session to use.

    The `DQFDAgent` class additionally requires the following parameters:

    * `batch_size`: integer of the batch size.
    * `memory_capacity`: integer of maximum experiences to store.
    * `memory`: string indicating memory type ('replay' or 'prioritized_replay').
    * `memory_args`: list of arguments to pass to replay memory constructor.
    * `memory_kwargs`: dict of keyword arguments to pass to replay memory constructor.
    * `min_replay_size`: integer of minimum replay size before the first update.
    * `update_rate`: float of the update rate (e.g. 0.25 = every 4 steps).
    * `target_network_update_rate`: float of target network update rate (e.g. 0.01 = every 100 steps).
    * `use_target_network`: boolean indicating whether to use a target network.
    * `update_repeat`: integer of how many times to repeat an update.
    * `update_target_weight`: float of update target weight (tau parameter).
    * `demo_sampling_ratio`: float, ratio of expert data used at runtime to train from.
    * `supervised_weight`: float, weight of large margin classifier loss.
    * `expert_margin`: float of difference in Q-values between expert action and other actions
      enforced by the large margin function.
    * `clip_gradients`: float of maximum value for gradients before clipping.
    """

    name = 'DQFDAgent'
    model = DQFDModel
    default_config = dict(
        target_update_frequency=10000,
        demo_memory_capacity=1000000,
        demo_sampling_ratio=0.01
    )

    def __init__(self, config):
        config.default(DQFDAgent.default_config)
        super(DQFDAgent, self).__init__(config)
        self.target_update_frequency = config.target_update_frequency

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online
        # training samples: p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size ' \
            'is positive. (Calculated {} based on current parameters)'.format(self.demo_batch_size)

    def observe(self, reward, terminal):
        """Adds observations, updates via sampling from memories according to update rate.

        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(self.demo_batch_size)
                self.model.demonstration_update(batch=batch)

        if self.timestep >= self.first_update and self.timestep % self.target_update_frequency == 0:
            self.model.update_target()

    def import_demonstrations(self, demonstrations):
        """Imports demonstrations, i.e. expert observations.

        Args:
            demonstrations:
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['state'])
            else:
                state = observation['state']
            if self.unique_action:
                action = dict(action=observation['action'])
            else:
                action = observation['action']

            self.demo_memory.add_observation(
                state=state,
                action=action,
                reward=observation['reward'],
                terminal=observation['terminal'],
                internal=observation['internal']
            )

    def pretrain(self, steps):
        """Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(self.batch_size)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(batch)
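
# --- Illustrative sketch (not part of the library source): the rate-style parameters documented
# above translate into step intervals as interval = 1 / rate.
update_rate = 0.25                 # an update every 4 timesteps
target_network_update_rate = 0.01  # a target network update every 100 timesteps
assert int(1.0 / update_rate) == 4
assert int(1.0 / target_network_update_rate) == 100
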
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data via an additional supervised loss term.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device=None,
        session_config=None,
        scope='dqfd',
        saver_spec=None,
        summary_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        variable_noise=None,
        states_preprocessing_spec=None,
        explorations_spec=None,
        reward_preprocessing_spec=None,
        distributions_spec=None,
        entropy_regularization=None,
        target_sync_frequency=10000,
        target_update_weight=1.0,
        huber_loss=None,
        batched_observe=1000,
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1,
        expert_margin=0.5,
        supervised_weight=0.1,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent
        ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
        pre-train from demonstration data in combination with a supervised loss.

        Args:
            states_spec: Dict containing at least one state definition. In the case of a single state,
                keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where
                each state is a dict itself with a unique name as its key.
            actions_spec: Dict containing at least one action definition. Actions have types and either
                `num_actions` for discrete actions or a `shape` for continuous actions. Consult
                documentation and tests for more.
            network_spec: List of layers specifying a neural network via layer types, sizes and optional
                arguments such as activation or regularisation. Full examples are in the examples/configs
                folder.
            device: Device string specifying model device.
            session_config: Optional tf.ConfigProto with additional desired session configurations.
            scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
            saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints
                are saved. Use either `seconds` or `steps` to specify how often the model should be saved.
                The `load` flag specifies if a model is initially loaded (set to True) from the file `file`.
            summary_spec: Dict specifying summaries for TensorBoard. Requires a 'directory' to store
                summaries, `steps` or `seconds` to specify how often to save summaries, and a list of
                `labels` to indicate which values to export, e.g. `losses`, `variables`. Consult neural
                network class and model for all available labels.
            distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and
                `replica_model` Boolean flags to indicate workers and parameter servers. Use a
                `cluster_spec` key to pass a TensorFlow cluster spec.
            optimizer: Dict specifying optimizer type and its optional parameters, typically a
                `learning_rate`. Available optimizer types include standard TensorFlow optimizers,
                `natural_gradient`, and `evolutionary`. Consult the optimizer test or example
                configurations for more.
            discount: Float specifying reward discount factor.
            variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
            states_preprocessing_spec: Optional list of states preprocessors to apply to state
                (e.g. `image_resize`, `grayscale`).
            explorations_spec: Optional dict specifying action exploration type (epsilon greedy or
                Gaussian noise).
            reward_preprocessing_spec: Optional dict specifying reward preprocessing.
            distributions_spec: Optional dict specifying action distributions to override default
                distribution choices. Must match action names.
            entropy_regularization: Optional positive float specifying an entropy regularization value.
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from
                training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            batched_observe: Optional int specifying how many observe calls are batched into one session
                run. Without batching, throughput will be lower because every `observe` triggers a
                session invocation to update rewards in the graph.
            batch_size: Int specifying batch size used to sample from memory. Should be smaller than
                memory size.
            memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
            first_update: Int describing at which time step the first update is performed. Should be
                larger than batch size.
            update_frequency: Int specifying number of observe steps to perform until an update is
                executed.
            repeat_update: Int specifying how many update steps are performed per update, where each
                update step implies sampling a batch from the memory and passing it to the model.
            expert_margin: Positive float specifying enforced supervised margin between expert action
                Q-value and other Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        if network_spec is None:
            raise TensorForceError("No network_spec provided.")

        if optimizer is None:
            self.optimizer = dict(
                type='adam',
                learning_rate=1e-3
            )
        else:
            self.optimizer = optimizer

        if memory is None:
            memory = dict(
                type='replay',
                capacity=100000
            )

        self.network_spec = network_spec
        self.device = device
        self.session_config = session_config
        self.scope = scope
        self.saver_spec = saver_spec
        self.summary_spec = summary_spec
        self.distributed_spec = distributed_spec
        self.discount = discount
        self.variable_noise = variable_noise
        self.states_preprocessing_spec = states_preprocessing_spec
        self.explorations_spec = explorations_spec
        self.reward_preprocessing_spec = reward_preprocessing_spec
        self.distributions_spec = distributions_spec
        self.entropy_regularization = entropy_regularization
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss

        # DQFD always uses double DQN, which is a required key for a Q-model.
        self.double_q_model = True

        self.demo_memory_capacity = demo_memory_capacity
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online
        # training samples: p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self):
        return QDemoModel(
            states_spec=self.states_spec,
            actions_spec=self.actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            session_config=self.session_config,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            variable_noise=self.variable_noise,
            states_preprocessing_spec=self.states_preprocessing_spec,
            explorations_spec=self.explorations_spec,
            reward_preprocessing_spec=self.reward_preprocessing_spec,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            double_q_model=self.double_q_model,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.

        Args:
            reward:
            terminal:
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(batch_size=self.demo_batch_size, next_states=True)
                self.model.demonstration_update(
                    states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                    internals=batch['internals'],
                    actions=batch['actions'],
                    terminal=batch['terminal'],
                    reward=batch['reward']
                )

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of
        observations, set_demonstrations is more appropriate, which directly sets memory
        contents to an array and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict whose values are arrays
        containing all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pretrain updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(
                states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                internals=batch['internals'],
                actions=batch['actions'],
                terminal=batch['terminal'],
                reward=batch['reward']
            )
class DQFDAgent(MemoryAgent):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data via an additional supervised loss term.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        batched_observe=1000,
        scope='dqfd',
        # parameters specific to LearningAgents
        summary_spec=None,
        network_spec=None,
        device=None,
        session_config=None,
        saver_spec=None,
        distributed_spec=None,
        optimizer=None,
        discount=0.99,
        variable_noise=None,
        states_preprocessing_spec=None,
        explorations_spec=None,
        reward_preprocessing_spec=None,
        distributions_spec=None,
        entropy_regularization=None,
        # parameters specific to MemoryAgents
        batch_size=32,
        memory=None,
        first_update=10000,
        update_frequency=4,
        repeat_update=1,
        # parameters specific to DQFD agents
        target_sync_frequency=10000,
        target_update_weight=1.0,
        huber_loss=None,
        expert_margin=0.5,
        supervised_weight=0.1,
        demo_memory_capacity=10000,
        demo_sampling_ratio=0.2
    ):
        """
        Deep Q-learning from demonstration (DQFD) agent
        ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
        pre-train from demonstration data in combination with a supervised loss.

        Args:
            target_sync_frequency: Interval between optimization calls synchronizing the target network.
            target_update_weight: Update weight, 1.0 meaning a full assignment to target network from
                training network.
            huber_loss: Optional float specifying Huber-loss clipping.
            expert_margin: Positive float specifying enforced supervised margin between expert action
                Q-value and other Q-values.
            supervised_weight: Weight of supervised loss term.
            demo_memory_capacity: Int describing capacity of expert demonstration memory.
            demo_sampling_ratio: Runtime sampling ratio of expert data.
        """
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight
        self.huber_loss = huber_loss
        self.expert_margin = expert_margin
        self.supervised_weight = supervised_weight

        super(DQFDAgent, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            batched_observe=batched_observe,
            scope=scope,
            # parameters specific to LearningAgent
            summary_spec=summary_spec,
            network_spec=network_spec,
            discount=discount,
            device=device,
            session_config=session_config,
            saver_spec=saver_spec,
            distributed_spec=distributed_spec,
            optimizer=optimizer,
            variable_noise=variable_noise,
            states_preprocessing_spec=states_preprocessing_spec,
            explorations_spec=explorations_spec,
            reward_preprocessing_spec=reward_preprocessing_spec,
            distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization,
            # parameters specific to MemoryAgents
            batch_size=batch_size,
            memory=memory,
            first_update=first_update,
            update_frequency=update_frequency,
            repeat_update=repeat_update
        )

        # The demo_sampling_ratio, called p in the paper, controls the ratio of expert vs online
        # training samples: p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
        self.demo_memory_capacity = demo_memory_capacity
        self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
        assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
            'demo_batch_size is positive. (Calculated {} based on current' \
            ' parameters)'.format(self.demo_batch_size)

        # This is the demonstration memory that we will fill with observations before starting
        # the main training loop.
        self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)

    def initialize_model(self):
        return QDemoModel(
            states_spec=self.states_spec,
            actions_spec=self.actions_spec,
            network_spec=self.network_spec,
            device=self.device,
            session_config=self.session_config,
            scope=self.scope,
            saver_spec=self.saver_spec,
            summary_spec=self.summary_spec,
            distributed_spec=self.distributed_spec,
            optimizer=self.optimizer,
            discount=self.discount,
            variable_noise=self.variable_noise,
            states_preprocessing_spec=self.states_preprocessing_spec,
            explorations_spec=self.explorations_spec,
            reward_preprocessing_spec=self.reward_preprocessing_spec,
            distributions_spec=self.distributions_spec,
            entropy_regularization=self.entropy_regularization,
            target_sync_frequency=self.target_sync_frequency,
            target_update_weight=self.target_update_weight,
            # DQFD always uses double DQN, which is a required key for a Q-model.
            double_q_model=True,
            huber_loss=self.huber_loss,
            # TEMP: Random sampling fix
            random_sampling_fix=True,
            expert_margin=self.expert_margin,
            supervised_weight=self.supervised_weight
        )

    def observe(self, reward, terminal):
        """
        Adds observations, updates via sampling from memories according to update rate.
        DQFD samples from the online replay memory and the demo memory, with the fractions
        controlled by a hyperparameter p called the 'expert sampling ratio'.
        """
        super(DQFDAgent, self).observe(reward=reward, terminal=terminal)

        if self.timestep >= self.first_update and self.timestep % self.update_frequency == 0:
            for _ in xrange(self.repeat_update):
                batch = self.demo_memory.get_batch(batch_size=self.demo_batch_size, next_states=True)
                self.model.demonstration_update(
                    states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                    internals=batch['internals'],
                    actions=batch['actions'],
                    terminal=batch['terminal'],
                    reward=batch['reward']
                )

    def import_demonstrations(self, demonstrations):
        """
        Imports demonstrations, i.e. expert observations. Note that for large numbers of
        observations, set_demonstrations is more appropriate, which directly sets memory
        contents to an array and expects a different layout.

        Args:
            demonstrations: List of observation dicts
        """
        for observation in demonstrations:
            if self.unique_state:
                state = dict(state=observation['states'])
            else:
                state = observation['states']
            if self.unique_action:
                action = dict(action=observation['actions'])
            else:
                action = observation['actions']

            self.demo_memory.add_observation(
                states=state,
                internals=observation['internals'],
                actions=action,
                terminal=observation['terminal'],
                reward=observation['reward']
            )

    def set_demonstrations(self, batch):
        """
        Sets all demonstrations from batch data. Expects a dict whose values are arrays
        containing all states, actions, rewards, terminals and internals, respectively.

        Args:
            batch:
        """
        self.demo_memory.set_memory(
            states=batch['states'],
            internals=batch['internals'],
            actions=batch['actions'],
            terminal=batch['terminal'],
            reward=batch['reward']
        )

    def pretrain(self, steps):
        """
        Computes pre-train updates.

        Args:
            steps: Number of updates to execute.
        """
        for _ in xrange(steps):
            # Sample from demo memory.
            batch = self.demo_memory.get_batch(batch_size=self.batch_size, next_states=True)

            # Update using both the double Q-learning loss and the supervised loss.
            self.model.demonstration_update(
                states={name: np.stack((batch['states'][name], batch['next_states'][name])) for name in batch['states']},
                internals=batch['internals'],
                actions=batch['actions'],
                terminal=batch['terminal'],
                reward=batch['reward']
            )
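
# --- Illustrative sketch (not part of the library source): the dict comprehension in observe()
# and pretrain() above stacks current and next states along a new leading axis of size 2 before
# handing them to demonstration_update. The shapes below are hypothetical.
import numpy as np

batch_states = dict(state=np.zeros((32, 4)))   # 32 sampled transitions, 4-dimensional state
batch_next_states = dict(state=np.ones((32, 4)))
stacked = {name: np.stack((batch_states[name], batch_next_states[name])) for name in batch_states}
assert stacked['state'].shape == (2, 32, 4)    # index 0: states, index 1: next states
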