def initialize(self, custom_getter):
    super(QDemoModel, self).initialize(custom_getter=custom_getter)

    self.demo_memory = Replay(
        states=self.states_spec,
        internals=self.internals_spec,
        actions=self.actions_spec,
        include_next_states=True,
        capacity=self.demo_memory_capacity,
        scope='demo-replay',
        summary_labels=self.summary_labels
    )

    # Import demonstration experience.
    self.fn_import_demo_experience = tf.make_template(
        name_='import-demo-experience',
        func_=self.tf_import_demo_experience,
        custom_getter_=custom_getter
    )

    # Demonstration loss.
    self.fn_demo_loss = tf.make_template(
        name_='demo-loss',
        func_=self.tf_demo_loss,
        custom_getter_=custom_getter
    )

    # Combined loss.
    self.fn_combined_loss = tf.make_template(
        name_='combined-loss',
        func_=self.tf_combined_loss,
        custom_getter_=custom_getter
    )

    # Demonstration optimization.
    self.fn_demo_optimization = tf.make_template(
        name_='demo-optimization',
        func_=self.tf_demo_optimization,
        custom_getter_=custom_getter
    )
def __init__(self, states_spec, actions_spec, network_spec, config):
    self.network_spec = network_spec
    config = config.copy()
    config.default(DQFDAgent.default_config)

    # DQFD always uses double dqn, which is a required key for a q-model.
    config.obligatory(double_dqn=True)

    self.target_update_frequency = config.target_update_frequency
    self.demo_memory_capacity = config.demo_memory_capacity

    # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
        'demo_batch_size is positive. (Calculated {} based on current ' \
        'parameters)'.format(self.demo_batch_size)

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop. Built from the constructor arguments since self.states_spec and
    # self.actions_spec are only set by the super constructor below.
    self.demo_memory = Replay(self.demo_memory_capacity, states_spec, actions_spec)

    super(DQFDAgent, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        config=config
    )
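# A minimal standalone sketch (not part of the agent code) illustrating the demo_batch_size
# formula used above. With, for example, demo_sampling_ratio p = 0.2 and batch_size = 32, each
# update draws 8 demonstration samples alongside the 32 online samples, so demonstrations make
# up 8 / (8 + 32) = 0.2 of the combined batch, as intended.
def compute_demo_batch_size(demo_sampling_ratio, batch_size):
    # p = n_demo / (n_demo + n_replay)  =>  n_demo = p * n_replay / (1 - p)
    demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
    assert demo_batch_size > 0, 'demo_sampling_ratio {} with batch_size {} yields an empty ' \
        'demonstration batch'.format(demo_sampling_ratio, batch_size)
    return demo_batch_size

assert compute_demo_batch_size(0.2, 32) == 8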
def __init__(self, config):
    config.default(PPOModel.default_config)
    super(PPOModel, self).__init__(config)
    self.epochs = config.epochs
    self.optimizer_batch_size = config.optimizer_batch_size

    # Use replay memory so memory logic can be used to sample batches
    self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)
def __init__(self, config):
    config.default(PPOModel.default_config)
    super(PPOModel, self).__init__(config)
    self.optimizer_batch_size = config.optimizer_batch_size
    self.batch_size = config.batch_size
    self.updates = int(config.batch_size / self.optimizer_batch_size) * config.epochs
    if self.batch_size % self.optimizer_batch_size != 0:
        raise TensorForceError('batch_size must be a multiple of optimizer_batch_size')

    # Use replay memory as a cache so it can be used to sample minibatches
    self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)
def __init__(self, config, model=None):
    config.default(DQFDAgent.default_config)
    super(DQFDAgent, self).__init__(config, model)
    self.target_update_frequency = config.target_update_frequency

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop
    self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

    # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to make sure demo_batch_size is positive. ' \
        '(Calculated {} based on current parameters)'.format(self.demo_batch_size)
def __init__(self, config):
    config.default(PPOModel.default_config)
    super(PPOModel, self).__init__(config)
    self.optimizer_batch_size = config.optimizer_batch_size
    if self.optimizer_batch_size > config.batch_size:
        raise Exception(
            "optimizer_batch_size > batch_size ({}, {})".format(self.optimizer_batch_size, config.batch_size)
        )
    self.updates = int(config.batch_size / self.optimizer_batch_size) * config.epochs

    # Use replay memory so memory logic can be used to sample batches
    self.memory = Replay(config.batch_size, config.states, config.actions, config.random_sampling)
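# A small standalone sketch (illustrative values, not defaults taken from the code above) of how
# the PPO update count is derived: the collected batch is split into minibatches of
# optimizer_batch_size, and each epoch passes over all of them once.
batch_size = 4000
optimizer_batch_size = 100
epochs = 10
assert optimizer_batch_size <= batch_size and batch_size % optimizer_batch_size == 0
updates = int(batch_size / optimizer_batch_size) * epochs  # 40 minibatches * 10 epochs = 400 updates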
def __init__(self, config):
    config.default(DQFDAgent.default_config)
    super(DQFDAgent, self).__init__(config)
    self.target_update_frequency = config.target_update_frequency

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop
    self.demo_memory = Replay(config.demo_memory_capacity, config.states, config.actions)

    # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(config.demo_sampling_ratio * config.batch_size / (1.0 - config.demo_sampling_ratio))
def setup_components_and_tf_funcs(self, custom_getter=None):
    """
    Constructs the extra Replay memory.
    """
    custom_getter = super(QDemoModel, self).setup_components_and_tf_funcs(custom_getter)

    self.demo_memory = Replay(
        states=self.states_spec,
        internals=self.internals_spec,
        actions=self.actions_spec,
        include_next_states=True,
        capacity=self.demo_memory_capacity,
        scope='demo-replay',
        summary_labels=self.summary_labels
    )

    # Import demonstration experience.
    self.fn_import_demo_experience = tf.make_template(
        name_='import-demo-experience',
        func_=self.tf_import_demo_experience,
        custom_getter_=custom_getter
    )

    # Demonstration loss.
    self.fn_demo_loss = tf.make_template(
        name_='demo-loss',
        func_=self.tf_demo_loss,
        custom_getter_=custom_getter
    )

    # Combined loss.
    self.fn_combined_loss = tf.make_template(
        name_='combined-loss',
        func_=self.tf_combined_loss,
        custom_getter_=custom_getter
    )

    # Demonstration optimization.
    self.fn_demo_optimization = tf.make_template(
        name_='demo-optimization',
        func_=self.tf_demo_optimization,
        custom_getter_=custom_getter
    )

    return custom_getter
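# A minimal TensorFlow 1.x sketch (independent of the model above) of why tf.make_template is
# used for the fn_* functions: wrapping a graph-building function in a template makes every call
# share one set of variables instead of creating new ones on each invocation.
import tensorflow as tf

def build_linear(x):
    w = tf.get_variable(name='w', shape=(3, 1), initializer=tf.zeros_initializer())
    return tf.matmul(x, w)

linear = tf.make_template(name_='linear', func_=build_linear)
y1 = linear(tf.ones(shape=(2, 3)))  # first call creates variable 'linear/w'
y2 = linear(tf.ones(shape=(4, 3)))  # second call reuses 'linear/w' rather than defining a new one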
def __init__(
    self,
    states_spec,
    actions_spec,
    network_spec,
    device=None,
    scope='dqfd',
    saver_spec=None,
    summary_spec=None,
    distributed_spec=None,
    optimizer=None,
    discount=0.99,
    normalize_rewards=False,
    variable_noise=None,
    distributions_spec=None,
    entropy_regularization=None,
    target_sync_frequency=10000,
    target_update_weight=1.0,
    huber_loss=None,
    preprocessing=None,
    exploration=None,
    reward_preprocessing=None,
    batched_observe=1000,
    batch_size=32,
    memory=None,
    first_update=10000,
    update_frequency=4,
    repeat_update=1,
    expert_margin=0.5,
    supervised_weight=0.1,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2
):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data in combination with a supervised loss.

    Args:
        states_spec:
        actions_spec:
        network_spec:
        device:
        scope:
        saver_spec:
        summary_spec:
        distributed_spec:
        optimizer:
        discount:
        normalize_rewards:
        variable_noise:
        distributions_spec:
        entropy_regularization:
        target_sync_frequency:
        target_update_weight:
        huber_loss:
        preprocessing:
        exploration:
        reward_preprocessing:
        batched_observe:
        batch_size:
        memory:
        first_update:
        update_frequency:
        repeat_update:
        expert_margin:
        supervised_weight:
        demo_memory_capacity:
        demo_sampling_ratio:
    """
    if network_spec is None:
        raise TensorForceError("No network_spec provided.")

    if optimizer is None:
        self.optimizer = dict(
            type='adam',
            learning_rate=1e-3
        )
    else:
        self.optimizer = optimizer

    if memory is None:
        memory = dict(
            type='replay',
            capacity=100000
        )
    self.memory = memory

    self.network_spec = network_spec
    self.device = device
    self.scope = scope
    self.saver_spec = saver_spec
    self.summary_spec = summary_spec
    self.distributed_spec = distributed_spec
    self.discount = discount
    self.normalize_rewards = normalize_rewards
    self.variable_noise = variable_noise
    self.distributions_spec = distributions_spec
    self.entropy_regularization = entropy_regularization
    self.target_sync_frequency = target_sync_frequency
    self.target_update_weight = target_update_weight
    self.huber_loss = huber_loss

    # DQFD always uses double dqn, which is a required key for a q-model.
    self.double_q_model = True

    self.demo_memory_capacity = demo_memory_capacity
    self.expert_margin = expert_margin
    self.supervised_weight = supervised_weight

    # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
        'demo_batch_size is positive. (Calculated {} based on current ' \
        'parameters)'.format(self.demo_batch_size)

    super(DQFDAgent, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        preprocessing=preprocessing,
        exploration=exploration,
        reward_preprocessing=reward_preprocessing,
        batched_observe=batched_observe,
        batch_size=batch_size,
        memory=memory,
        first_update=first_update,
        update_frequency=update_frequency,
        repeat_update=repeat_update
    )

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop
    self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)
def __init__(
    self,
    states_spec,
    actions_spec,
    network_spec,
    device=None,
    session_config=None,
    scope='dqfd',
    saver_spec=None,
    summary_spec=None,
    distributed_spec=None,
    optimizer=None,
    discount=0.99,
    variable_noise=None,
    states_preprocessing_spec=None,
    explorations_spec=None,
    reward_preprocessing_spec=None,
    distributions_spec=None,
    entropy_regularization=None,
    target_sync_frequency=10000,
    target_update_weight=1.0,
    huber_loss=None,
    batched_observe=1000,
    batch_size=32,
    memory=None,
    first_update=10000,
    update_frequency=4,
    repeat_update=1,
    expert_margin=0.5,
    supervised_weight=0.1,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2
):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data in combination with a supervised loss.

    Args:
        states_spec: Dict containing at least one state definition. In the case of a single state,
            keys `shape` and `type` are necessary. For multiple states, pass a dict of dicts where each state
            is a dict itself with a unique name as its key.
        actions_spec: Dict containing at least one action definition. Actions have types and either `num_actions`
            for discrete actions or a `shape` for continuous actions. Consult documentation and tests for more.
        network_spec: List of layers specifying a neural network via layer types, sizes and optional arguments
            such as activation or regularisation. Full examples are in the examples/configs folder.
        device: Device string specifying model device.
        session_config: Optional tf.ConfigProto with additional desired session configurations.
        scope: TensorFlow scope, defaults to agent name (e.g. `dqn`).
        saver_spec: Dict specifying automated saving. Use `directory` to specify where checkpoints are saved. Use
            either `seconds` or `steps` to specify how often the model should be saved. The `load` flag specifies
            if a model is initially loaded (set to True) from a file `file`.
        summary_spec: Dict specifying summaries for TensorBoard. Requires a `directory` to store summaries, `steps`
            or `seconds` to specify how often to save summaries, and a list of `labels` to indicate which values
            to export, e.g. `losses`, `variables`. Consult neural network class and model for all available labels.
        distributed_spec: Dict specifying distributed functionality. Use `parameter_server` and `replica_model`
            Boolean flags to indicate workers and parameter servers. Use a `cluster_spec` key to pass a TensorFlow
            cluster spec.
        optimizer: Dict specifying optimizer type and its optional parameters, typically a `learning_rate`.
            Available optimizer types include standard TensorFlow optimizers, `natural_gradient`, and
            `evolutionary`. Consult the optimizer test or example configurations for more.
        discount: Float specifying reward discount factor.
        variable_noise: Experimental optional parameter specifying variable noise (NoisyNet).
        states_preprocessing_spec: Optional list of states preprocessors to apply to state
            (e.g. `image_resize`, `grayscale`).
        explorations_spec: Optional dict specifying action exploration type (epsilon greedy or Gaussian noise).
        reward_preprocessing_spec: Optional dict specifying reward preprocessing.
        distributions_spec: Optional dict specifying action distributions to override default distribution choices.
            Must match action names.
        entropy_regularization: Optional positive float specifying an entropy regularization value.
        target_sync_frequency: Interval between optimization calls synchronizing the target network.
        target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
        huber_loss: Optional float specifying Huber-loss clipping.
        batched_observe: Optional int specifying how many observe calls are batched into one session run.
            Without batching, throughput will be lower because every `observe` triggers a session invocation to
            update rewards in the graph.
        batch_size: Int specifying batch size used to sample from memory. Should be smaller than memory size.
        memory: Dict describing memory via `type` (e.g. `replay`) and `capacity`.
        first_update: Int describing at which time step the first update is performed. Should be larger
            than batch size.
        update_frequency: Int specifying number of observe steps to perform until an update is executed.
        repeat_update: Int specifying how many update steps are performed per update, where each update step implies
            sampling a batch from the memory and passing it to the model.
        expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
            Q-values.
        supervised_weight: Weight of supervised loss term.
        demo_memory_capacity: Int describing capacity of expert demonstration memory.
        demo_sampling_ratio: Runtime sampling ratio of expert data.
    """
    if network_spec is None:
        raise TensorForceError("No network_spec provided.")

    if optimizer is None:
        self.optimizer = dict(
            type='adam',
            learning_rate=1e-3
        )
    else:
        self.optimizer = optimizer

    if memory is None:
        memory = dict(
            type='replay',
            capacity=100000
        )
    self.memory = memory

    self.network_spec = network_spec
    self.device = device
    self.session_config = session_config
    self.scope = scope
    self.saver_spec = saver_spec
    self.summary_spec = summary_spec
    self.distributed_spec = distributed_spec
    self.discount = discount
    self.variable_noise = variable_noise
    self.states_preprocessing_spec = states_preprocessing_spec
    self.explorations_spec = explorations_spec
    self.reward_preprocessing_spec = reward_preprocessing_spec
    self.distributions_spec = distributions_spec
    self.entropy_regularization = entropy_regularization
    self.target_sync_frequency = target_sync_frequency
    self.target_update_weight = target_update_weight
    self.huber_loss = huber_loss

    # DQFD always uses double dqn, which is a required key for a q-model.
    self.double_q_model = True

    self.demo_memory_capacity = demo_memory_capacity
    self.expert_margin = expert_margin
    self.supervised_weight = supervised_weight

    # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
        'demo_batch_size is positive. (Calculated {} based on current ' \
        'parameters)'.format(self.demo_batch_size)

    super(DQFDAgent, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        batched_observe=batched_observe,
        batch_size=batch_size,
        memory=memory,
        first_update=first_update,
        update_frequency=update_frequency,
        repeat_update=repeat_update
    )

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop
    self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)
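# A hedged sketch of the spec formats described in the docstring above; the concrete shapes and
# sizes are illustrative, only the keys (`shape`/`type` for states, `type` plus `num_actions` or
# `shape` for actions) come from the documentation.
states_spec = dict(shape=(8,), type='float')    # single state: shape and type
actions_spec = dict(type='int', num_actions=4)  # discrete action: type and num_actions
# For multiple states, pass a dict of dicts keyed by unique state names:
multi_states_spec = dict(
    screen=dict(shape=(84, 84, 3), type='float'),
    velocity=dict(shape=(2,), type='float')
)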
def __init__(
    self,
    states_spec,
    actions_spec,
    batched_observe=1000,
    scope='dqfd',
    # parameters specific to LearningAgents
    summary_spec=None,
    network_spec=None,
    device=None,
    session_config=None,
    saver_spec=None,
    distributed_spec=None,
    optimizer=None,
    discount=0.99,
    variable_noise=None,
    states_preprocessing_spec=None,
    explorations_spec=None,
    reward_preprocessing_spec=None,
    distributions_spec=None,
    entropy_regularization=None,
    # parameters specific to MemoryAgents
    batch_size=32,
    memory=None,
    first_update=10000,
    update_frequency=4,
    repeat_update=1,
    # parameters specific to DQFD agents
    target_sync_frequency=10000,
    target_update_weight=1.0,
    huber_loss=None,
    expert_margin=0.5,
    supervised_weight=0.1,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2
):
    """
    Deep Q-learning from demonstration (DQFD) agent
    ([Hester et al., 2017](https://arxiv.org/abs/1704.03732)). This agent uses DQN to
    pre-train from demonstration data in combination with a supervised loss.

    Args:
        target_sync_frequency: Interval between optimization calls synchronizing the target network.
        target_update_weight: Update weight, 1.0 meaning a full assignment to target network from training network.
        huber_loss: Optional float specifying Huber-loss clipping.
        expert_margin: Positive float specifying enforced supervised margin between expert action Q-value and other
            Q-values.
        supervised_weight: Weight of supervised loss term.
        demo_memory_capacity: Int describing capacity of expert demonstration memory.
        demo_sampling_ratio: Runtime sampling ratio of expert data.
    """
    self.target_sync_frequency = target_sync_frequency
    self.target_update_weight = target_update_weight
    self.huber_loss = huber_loss
    self.expert_margin = expert_margin
    self.supervised_weight = supervised_weight

    super(DQFDAgent, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        batched_observe=batched_observe,
        scope=scope,
        # parameters specific to LearningAgent
        summary_spec=summary_spec,
        network_spec=network_spec,
        discount=discount,
        device=device,
        session_config=session_config,
        saver_spec=saver_spec,
        distributed_spec=distributed_spec,
        optimizer=optimizer,
        variable_noise=variable_noise,
        states_preprocessing_spec=states_preprocessing_spec,
        explorations_spec=explorations_spec,
        reward_preprocessing_spec=reward_preprocessing_spec,
        distributions_spec=distributions_spec,
        entropy_regularization=entropy_regularization,
        # parameters specific to MemoryAgents
        batch_size=batch_size,
        memory=memory,
        first_update=first_update,
        update_frequency=update_frequency,
        repeat_update=repeat_update
    )

    # The demo_sampling_ratio, called p in paper, controls ratio of expert vs online training samples
    # p = n_demo / (n_demo + n_replay) => n_demo = p * n_replay / (1 - p)
    self.demo_memory_capacity = demo_memory_capacity
    self.demo_batch_size = int(demo_sampling_ratio * batch_size / (1.0 - demo_sampling_ratio))
    assert self.demo_batch_size > 0, 'Check DQFD sampling parameters to ensure ' \
        'demo_batch_size is positive. (Calculated {} based on current ' \
        'parameters)'.format(self.demo_batch_size)

    # This is the demonstration memory that we will fill with observations before starting
    # the main training loop
    self.demo_memory = Replay(self.states_spec, self.actions_spec, self.demo_memory_capacity)
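# A hedged usage sketch for the constructor above. The network layout, spec shapes and
# hyperparameter values are illustrative, and the import_demonstrations / pretrain calls are an
# assumption about the rest of the agent's interface; they are not defined in this section.
from tensorforce.agents import DQFDAgent

agent = DQFDAgent(
    states_spec=dict(shape=(8,), type='float'),
    actions_spec=dict(type='int', num_actions=4),
    network_spec=[
        dict(type='dense', size=64),
        dict(type='dense', size=64)
    ],
    batch_size=32,
    demo_memory_capacity=10000,
    demo_sampling_ratio=0.2  # with batch_size=32 this yields demo_batch_size = 8
)

# Fill the demonstration memory with expert transitions, then pre-train before the main loop:
# demonstrations = [dict(states=..., internals=..., actions=..., terminal=..., reward=...), ...]
# agent.import_demonstrations(demonstrations)
# agent.pretrain(steps=10000)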