def initialize(self, custom_getter):
    super(QModel, self).initialize(custom_getter)

    # TEMP: Random sampling fix
    if self.random_sampling_fix:
        self.next_state_inputs = dict()
        for name, state in self.states_spec.items():
            self.next_state_inputs[name] = tf.placeholder(
                dtype=util.tf_dtype(state['type']),
                shape=(None,) + tuple(state['shape']),
                name=('next-' + name)
            )

    # Target network
    self.target_network = Network.from_spec(
        spec=self.network_spec,
        kwargs=dict(scope='target', summary_labels=self.summary_labels)
    )

    # Target network optimizer
    self.target_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    # Target network distributions
    self.target_distributions = self.generate_distributions(
        self.actions_spec, self.distributions_spec, self.summary_labels
    )

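# Hedged illustration (not from the source): the 'next-' placeholders created above
# mirror entries of states_spec, which is assumed to map state names to type/shape
# dicts roughly like the following. Names and values are illustrative only.
example_states_spec = dict(
    state=dict(type='float', shape=(4,))  # would yield placeholder 'next-state' with shape (None, 4)
)
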
def setup_components_and_tf_funcs(self, custom_getter=None):
    custom_getter = super(DPGTargetModel, self).setup_components_and_tf_funcs(custom_getter)

    # Target network
    self.target_network = Network.from_spec(
        spec=self.network_spec,
        kwargs=dict(scope='target-network', summary_labels=self.summary_labels)
    )

    # Target network optimizer
    self.target_network_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    # Target network distributions
    self.target_distributions = self.create_distributions()

    # Critic: layer sizes are taken from the first two entries of the critic network spec.
    size_t0 = self.critic_network_spec[0]['size']
    size_t1 = self.critic_network_spec[1]['size']

    self.critic = DDPGCriticNetwork(scope='critic', size_t0=size_t0, size_t1=size_t1)
    self.critic_optimizer = Optimizer.from_spec(
        spec=self.critic_optimizer_spec,
        kwargs=dict(summary_labels=self.summary_labels)
    )

    self.target_critic = DDPGCriticNetwork(scope='target-critic', size_t0=size_t0, size_t1=size_t1)

    # Target critic optimizer
    self.target_critic_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    self.fn_target_actions_and_internals = tf.make_template(
        name_='target-actions-and-internals',
        func_=self.tf_target_actions_and_internals,
        custom_getter_=custom_getter
    )
    self.fn_predict_target_q = tf.make_template(
        name_='predict-target-q',
        func_=self.tf_predict_target_q,
        custom_getter_=custom_getter
    )

    return custom_getter

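# Hedged illustration (assumption, not from the source): this variant reads
# critic_network_spec[0]['size'] and critic_network_spec[1]['size'], so the critic
# spec is presumably a list of layer dicts along these lines. Sizes are illustrative.
example_critic_network_spec = [
    dict(type='dense', size=64),  # provides size_t0
    dict(type='dense', size=64),  # provides size_t1
]
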
def initialize(self, custom_getter):
    super(QModel, self).initialize(custom_getter)

    # Target network
    self.target_network = Network.from_spec(
        spec=self.network_spec,
        kwargs=dict(scope='target', summary_labels=self.summary_labels)
    )

    # Target network optimizer
    self.target_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

def setup_components_and_tf_funcs(self, custom_getter=None):
    custom_getter = super(DPGTargetModel, self).setup_components_and_tf_funcs(custom_getter)

    # Target network
    self.target_network = Network.from_spec(
        spec=self.network_spec,
        kwargs=dict(scope='target-network', summary_labels=self.summary_labels)
    )

    # Target network optimizer
    self.target_network_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    # Target network distributions
    self.target_distributions = self.create_distributions()

    # Critic
    self.critic_network = Network.from_spec(
        spec=self.critic_network_spec,
        kwargs=dict(scope='critic')
    )
    self.target_critic_network = Network.from_spec(
        spec=self.critic_network_spec,
        kwargs=dict(scope='target-critic')
    )
    self.critic_optimizer = Optimizer.from_spec(
        spec=self.critic_optimizer_spec,
        kwargs=dict(summary_labels=self.summary_labels)
    )

    # Target critic optimizer
    self.target_critic_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    self.fn_target_actions_and_internals = tf.make_template(
        name_='target-actions-and-internals',
        func_=self.tf_target_actions_and_internals,
        custom_getter_=custom_getter
    )
    self.fn_predict_target_q = tf.make_template(
        name_='predict-target-q',
        func_=self.tf_predict_target_q,
        custom_getter_=custom_getter
    )

    return custom_getter

def initialize(self, custom_getter):
    super(DPGTargetModel, self).initialize(custom_getter)

    # Target network
    self.target_network = Network.from_spec(
        spec=self.network_spec,
        kwargs=dict(scope='target-network', summary_labels=self.summary_labels)
    )

    # Target network optimizer
    self.target_network_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    # Target network distributions
    self.target_distributions = self.create_distributions()

    # Critic
    size_t0 = self.critic_network_spec['size_t0']
    size_t1 = self.critic_network_spec['size_t1']

    self.critic = DDPGCriticNetwork(scope='critic', size_t0=size_t0, size_t1=size_t1)
    self.critic_optimizer = Optimizer.from_spec(
        spec=self.critic_optimizer_spec,
        kwargs=dict(summary_labels=self.summary_labels)
    )

    self.target_critic = DDPGCriticNetwork(scope='target-critic', size_t0=size_t0, size_t1=size_t1)

    # Target critic optimizer
    self.target_critic_optimizer = Synchronization(
        sync_frequency=self.target_sync_frequency,
        update_weight=self.target_update_weight
    )

    self.fn_target_actions_and_internals = tf.make_template(
        name_='target-actions-and-internals',
        func_=self.tf_target_actions_and_internals,
        custom_getter_=custom_getter
    )
    self.fn_predict_target_q = tf.make_template(
        name_='predict-target-q',
        func_=self.tf_predict_target_q,
        custom_getter_=custom_getter
    )

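# Hedged illustration (assumption, not from the source): this variant indexes
# critic_network_spec['size_t0'] and critic_network_spec['size_t1'], so the critic
# spec is presumably a plain dict of the two DDPGCriticNetwork layer sizes, and the
# critic optimizer a standard optimizer spec. Values are illustrative only.
example_critic_network_spec = dict(size_t0=64, size_t1=64)
example_critic_optimizer_spec = dict(type='adam', learning_rate=1e-3)
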
def __init__(self, states_spec, actions_spec, network_spec, config):
    with tf.name_scope(name=config.scope):
        # Target network
        self.target_network = Network.from_spec(
            spec=network_spec,
            kwargs=dict(scope='target')
        )

        # Target network optimizer
        self.target_optimizer = Synchronization(
            sync_frequency=config.target_sync_frequency,
            update_weight=config.target_update_weight
        )

    self.double_q_model = config.double_q_model

    assert config.huber_loss is None or config.huber_loss > 0.0
    self.huber_loss = config.huber_loss

    super(QModel, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        network_spec=network_spec,
        config=config
    )

class QModel(DistributionModel):
    """
    Q-value model.
    """

    def __init__(
        self,
        states_spec,
        actions_spec,
        network_spec,
        device,
        scope,
        saver_spec,
        summary_spec,
        distributed_spec,
        optimizer,
        discount,
        normalize_rewards,
        variable_noise,
        distributions_spec,
        entropy_regularization,
        target_sync_frequency,
        target_update_weight,
        double_q_model,
        huber_loss,
        # TEMP: Random sampling fix
        random_sampling_fix
    ):
        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight

        self.double_q_model = double_q_model

        assert huber_loss is None or huber_loss > 0.0
        self.huber_loss = huber_loss

        # TEMP: Random sampling fix
        self.random_sampling_fix = random_sampling_fix

        super(QModel, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=network_spec,
            device=device,
            scope=scope,
            saver_spec=saver_spec,
            summary_spec=summary_spec,
            distributed_spec=distributed_spec,
            optimizer=optimizer,
            discount=discount,
            normalize_rewards=normalize_rewards,
            variable_noise=variable_noise,
            distributions_spec=distributions_spec,
            entropy_regularization=entropy_regularization
        )

    def initialize(self, custom_getter):
        super(QModel, self).initialize(custom_getter)

        # TEMP: Random sampling fix
        if self.random_sampling_fix:
            self.next_state_inputs = dict()
            for name, state in self.states_spec.items():
                self.next_state_inputs[name] = tf.placeholder(
                    dtype=util.tf_dtype(state['type']),
                    shape=(None,) + tuple(state['shape']),
                    name=('next-' + name)
                )

        # Target network
        self.target_network = Network.from_spec(
            spec=self.network_spec,
            kwargs=dict(scope='target', summary_labels=self.summary_labels)
        )

        # Target network optimizer
        self.target_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight
        )

        # Target network distributions
        self.target_distributions = self.generate_distributions(
            self.actions_spec, self.distributions_spec, self.summary_labels
        )

    def tf_q_value(self, embedding, distr_params, action, name):
        # Mainly for NAF.
        return self.distributions[name].state_action_value(distr_params=distr_params, action=action)

    def tf_q_delta(self, q_value, next_q_value, terminal, reward):
        """
        Creates the deltas (or advantage) of the Q values.

        :return: A list of deltas per action
        """
        for _ in range(util.rank(q_value) - 1):
            terminal = tf.expand_dims(input=terminal, axis=1)
            reward = tf.expand_dims(input=reward, axis=1)

        multiples = (1,) + util.shape(q_value)[1:]
        terminal = tf.tile(input=terminal, multiples=multiples)
        reward = tf.tile(input=reward, multiples=multiples)

        zeros = tf.zeros_like(tensor=next_q_value)
        next_q_value = tf.where(condition=terminal, x=zeros, y=(self.discount * next_q_value))

        return reward + next_q_value - q_value  # tf.stop_gradient(q_target)

    def tf_loss_per_instance(self, states, internals, actions, terminal, reward, update):
        # TEMP: Random sampling fix
        if self.random_sampling_fix:
            next_states = self.get_states(states=self.next_state_inputs)
            next_states = {name: tf.stop_gradient(input=state) for name, state in next_states.items()}

            embedding, next_internals = self.network.apply(
                x=states,
                internals=internals,
                update=update,
                return_internals=True
            )

            # Both networks can use the same internals, could that be a problem?
            # Otherwise need to handle internals indices correctly everywhere
            target_embedding = self.target_network.apply(
                x=next_states,
                internals=next_internals,
                update=update
            )

        else:
            embedding = self.network.apply(
                x={name: state[:-1] for name, state in states.items()},
                internals=[internal[:-1] for internal in internals],
                update=update
            )

            # Both networks can use the same internals, could that be a problem?
            # Otherwise need to handle internals indices correctly everywhere
            target_embedding = self.target_network.apply(
                x={name: state[1:] for name, state in states.items()},
                internals=[internal[1:] for internal in internals],
                update=update
            )

            actions = {name: action[:-1] for name, action in actions.items()}
            terminal = terminal[:-1]
            reward = reward[:-1]

        deltas = list()
        for name, distribution in self.distributions.items():
            target_distribution = self.target_distributions[name]

            distr_params = distribution.parameterize(x=embedding)
            target_distr_params = target_distribution.parameterize(x=target_embedding)

            q_value = self.tf_q_value(embedding=embedding, distr_params=distr_params, action=actions[name], name=name)

            if self.double_q_model:
                action_taken = distribution.sample(distr_params=distr_params, deterministic=True)
            else:
                action_taken = target_distribution.sample(distr_params=target_distr_params, deterministic=True)

            next_q_value = target_distribution.state_action_value(distr_params=target_distr_params, action=action_taken)

            delta = self.tf_q_delta(q_value=q_value, next_q_value=next_q_value, terminal=terminal, reward=reward)

            collapsed_size = util.prod(util.shape(delta)[1:])
            delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))

            deltas.append(delta)

        # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)

        # Optional Huber loss
        if self.huber_loss is not None and self.huber_loss > 0.0:
            return tf.where(
                condition=(tf.abs(x=loss_per_instance) <= self.huber_loss),
                x=(0.5 * tf.square(x=loss_per_instance)),
                y=(self.huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss))
            )
        else:
            return tf.square(x=loss_per_instance)

    def tf_optimization(self, states, internals, actions, terminal, reward, update):
        optimization = super(QModel, self).tf_optimization(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward,
            update=update
        )

        network_distributions_variables = self.get_distributions_variables(self.distributions)
        target_distributions_variables = self.get_distributions_variables(self.target_distributions)

        target_optimization = self.target_optimizer.minimize(
            time=self.timestep,
            variables=self.target_network.get_variables() + target_distributions_variables,
            source_variables=self.network.get_variables() + network_distributions_variables
        )

        return tf.group(optimization, target_optimization)

    def get_variables(self, include_non_trainable=False):
        model_variables = super(QModel, self).get_variables(include_non_trainable=include_non_trainable)

        if include_non_trainable:
            # Target network and optimizer variables only included if 'include_non_trainable' set
            target_variables = self.target_network.get_variables(include_non_trainable=include_non_trainable)
            target_distributions_variables = self.get_distributions_variables(self.target_distributions)
            target_optimizer_variables = self.target_optimizer.get_variables()
            return model_variables + target_variables + target_optimizer_variables + target_distributions_variables
        else:
            return model_variables

    def get_summaries(self):
        target_distributions_summaries = self.get_distributions_summaries(self.target_distributions)

        return super(QModel, self).get_summaries() + self.target_network.get_summaries() + target_distributions_summaries

    # TEMP: Random sampling fix
    def update(self, states, internals, actions, terminal, reward, return_loss_per_instance=False):
        fetches = [self.optimization]

        # Optionally fetch loss per instance
        if return_loss_per_instance:
            fetches.append(self.loss_per_instance)

        terminal = np.asarray(terminal)
        batched = (terminal.ndim == 1)
        if batched:
            # TEMP: Random sampling fix
            if self.random_sampling_fix:
                feed_dict = {state_input: states[name][0] for name, state_input in self.state_inputs.items()}
                feed_dict.update(
                    {state_input: states[name][1] for name, state_input in self.next_state_inputs.items()}
                )
            else:
                feed_dict = {state_input: states[name] for name, state_input in self.state_inputs.items()}
            feed_dict.update(
                {internal_input: internals[n] for n, internal_input in enumerate(self.internal_inputs)}
            )
            feed_dict.update(
                {action_input: actions[name] for name, action_input in self.action_inputs.items()}
            )
            feed_dict[self.terminal_input] = terminal
            feed_dict[self.reward_input] = reward
        else:
            # TEMP: Random sampling fix
            if self.random_sampling_fix:
                raise TensorForceError("Unbatched version not covered by fix.")
            else:
                feed_dict = {state_input: (states[name],) for name, state_input in self.state_inputs.items()}
                feed_dict.update(
                    {internal_input: (internals[n],) for n, internal_input in enumerate(self.internal_inputs)}
                )
                feed_dict.update(
                    {action_input: (actions[name],) for name, action_input in self.action_inputs.items()}
                )
                feed_dict[self.terminal_input] = (terminal,)
                feed_dict[self.reward_input] = (reward,)

        feed_dict[self.deterministic_input] = True
        feed_dict[self.update_input] = True

        fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

        if return_loss_per_instance:
            return fetched[1]

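# Hedged usage sketch (assumption, not from the source): with random_sampling_fix
# enabled, update() above indexes states[name][0] (current batch) and states[name][1]
# (next-state batch), so each state entry is assumed to be passed as such a pair.
# Names and shapes below are illustrative only.
import numpy as np

batch_size = 32
example_states = dict(
    state=(
        np.zeros((batch_size, 4)),  # current states, fed into self.state_inputs['state']
        np.zeros((batch_size, 4)),  # next states, fed into self.next_state_inputs['state']
    )
)
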
class QModel(DistributionModel):
    """
    Q-value model.
    """

    def __init__(self, states_spec, actions_spec, network_spec, config):
        self.target_sync_frequency = config.target_sync_frequency
        self.target_update_weight = config.target_update_weight

        self.double_q_model = config.double_q_model

        assert config.huber_loss is None or config.huber_loss > 0.0
        self.huber_loss = config.huber_loss

        super(QModel, self).__init__(
            states_spec=states_spec,
            actions_spec=actions_spec,
            network_spec=network_spec,
            config=config
        )

    def initialize(self, custom_getter):
        super(QModel, self).initialize(custom_getter)

        # Target network
        self.target_network = Network.from_spec(
            spec=self.network_spec,
            kwargs=dict(scope='target', summary_labels=self.summary_labels)
        )

        # Target network optimizer
        self.target_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight
        )

    def tf_q_value(self, embedding, distr_params, action, name):
        # Mainly for NAF.
        return self.distributions[name].state_action_value(distr_params=distr_params, action=action)

    def tf_q_delta(self, q_value, next_q_value, terminal, reward):
        """
        Creates the deltas (or advantage) of the Q values.

        :return: A list of deltas per action
        """
        for _ in range(util.rank(q_value) - 1):
            terminal = tf.expand_dims(input=terminal, axis=1)
            reward = tf.expand_dims(input=reward, axis=1)

        multiples = (1,) + util.shape(q_value)[1:]
        terminal = tf.tile(input=terminal, multiples=multiples)
        reward = tf.tile(input=reward, multiples=multiples)

        zeros = tf.zeros_like(tensor=next_q_value)
        next_q_value = tf.where(condition=terminal, x=zeros, y=(self.discount * next_q_value))

        return reward + next_q_value - q_value  # tf.stop_gradient(q_target)

    def tf_loss_per_instance(self, states, internals, actions, terminal, reward, update):
        embedding = self.network.apply(
            x={name: state[:-1] for name, state in states.items()},
            internals=[internal[:-1] for internal in internals],
            update=update
        )

        # Both networks can use the same internals, could that be a problem?
        # Otherwise need to handle internals indices correctly everywhere
        target_embedding = self.target_network.apply(
            x={name: state[1:] for name, state in states.items()},
            internals=[internal[1:] for internal in internals],
            update=update
        )

        deltas = list()
        for name, distribution in self.distributions.items():
            distr_params = distribution.parameterize(x=embedding)
            target_distr_params = distribution.parameterize(x=target_embedding)  # TODO: separate distribution parameters?
            q_value = self.tf_q_value(embedding=embedding, distr_params=distr_params, action=actions[name][:-1], name=name)

            if self.double_q_model:
                action_taken = distribution.sample(distr_params=distr_params, deterministic=True)
            else:
                action_taken = distribution.sample(distr_params=target_distr_params, deterministic=True)

            next_q_value = distribution.state_action_value(distr_params=target_distr_params, action=action_taken)

            delta = self.tf_q_delta(q_value=q_value, next_q_value=next_q_value, terminal=terminal[:-1], reward=reward[:-1])

            collapsed_size = util.prod(util.shape(delta)[1:])
            delta = tf.reshape(tensor=delta, shape=(-1, collapsed_size))

            deltas.append(delta)

        # Surrogate loss as the mean squared error between actual observed rewards and expected rewards
        loss_per_instance = tf.reduce_mean(input_tensor=tf.concat(values=deltas, axis=1), axis=1)

        # Optional Huber loss
        if self.huber_loss is not None and self.huber_loss > 0.0:
            return tf.where(
                condition=(tf.abs(x=loss_per_instance) <= self.huber_loss),
                x=(0.5 * tf.square(x=loss_per_instance)),
                y=(self.huber_loss * (tf.abs(x=loss_per_instance) - 0.5 * self.huber_loss))
            )
        else:
            return tf.square(x=loss_per_instance)

    def tf_optimization(self, states, internals, actions, terminal, reward, update):
        optimization = super(QModel, self).tf_optimization(
            states=states,
            internals=internals,
            actions=actions,
            terminal=terminal,
            reward=reward,
            update=update
        )

        target_optimization = self.target_optimizer.minimize(
            time=self.timestep,
            variables=self.target_network.get_variables(),
            source_variables=self.network.get_variables()
        )

        return tf.group(optimization, target_optimization)

    def get_variables(self, include_non_trainable=False):
        model_variables = super(QModel, self).get_variables(include_non_trainable=include_non_trainable)

        if include_non_trainable:
            # Target network and optimizer variables only included if 'include_non_trainable' set
            target_variables = self.target_network.get_variables(include_non_trainable=include_non_trainable)
            target_optimizer_variables = self.target_optimizer.get_variables()
            return model_variables + target_variables + target_optimizer_variables
        else:
            return model_variables

    def get_summaries(self):
        return super(QModel, self).get_summaries() + self.target_network.get_summaries()

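# Hedged reference sketch (not part of the model code): a standalone NumPy version of
# the optional Huber loss branch in tf_loss_per_instance above, to make the piecewise
# form explicit. The function name and test values are illustrative only.
import numpy as np

def huber_loss(delta, kappa):
    # Quadratic for |delta| <= kappa, linear beyond, matching the tf.where expression above.
    abs_delta = np.abs(delta)
    return np.where(
        abs_delta <= kappa,
        0.5 * np.square(delta),
        kappa * (abs_delta - 0.5 * kappa)
    )

print(huber_loss(np.array([0.3, 2.0]), kappa=1.0))  # [0.045 1.5]
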
class DPGTargetModel(DistributionModel):
    """
    Deterministic policy gradient model with target networks (e.g. DDPG).
    """

    COMPONENT_CRITIC = "critic"
    COMPONENT_TARGET_NETWORK = "target_network"
    COMPONENT_TARGET_DISTRIBUTION = "target_distribution"

    def __init__(
        self,
        states,
        actions,
        scope,
        device,
        saver,
        summarizer,
        execution,
        batching_capacity,
        variable_noise,
        states_preprocessing,
        actions_exploration,
        reward_preprocessing,
        update_mode,
        memory,
        optimizer,
        discount,
        network,
        distributions,
        entropy_regularization,
        critic_network,
        critic_optimizer,
        target_sync_frequency,
        target_update_weight
    ):
        self.critic_network_spec = critic_network
        self.critic_optimizer_spec = critic_optimizer

        self.target_sync_frequency = target_sync_frequency
        self.target_update_weight = target_update_weight

        # self.network is the actor, self.critic_network is the critic
        self.target_network = None
        self.target_network_optimizer = None
        self.critic_network = None
        self.critic_optimizer = None
        self.target_critic_network = None
        self.target_critic_optimizer = None

        super(DPGTargetModel, self).__init__(
            states=states,
            actions=actions,
            scope=scope,
            device=device,
            saver=saver,
            summarizer=summarizer,
            execution=execution,
            batching_capacity=batching_capacity,
            variable_noise=variable_noise,
            states_preprocessing=states_preprocessing,
            actions_exploration=actions_exploration,
            reward_preprocessing=reward_preprocessing,
            update_mode=update_mode,
            memory=memory,
            optimizer=optimizer,
            discount=discount,
            network=network,
            distributions=distributions,
            entropy_regularization=entropy_regularization,
            requires_deterministic=True
        )

        assert self.memory_spec["include_next_states"]
        assert self.requires_deterministic

    def setup_components_and_tf_funcs(self, custom_getter=None):
        custom_getter = super(DPGTargetModel, self).setup_components_and_tf_funcs(custom_getter)

        # Target network
        self.target_network = Network.from_spec(
            spec=self.network_spec,
            kwargs=dict(scope='target-network', summary_labels=self.summary_labels)
        )

        # Target network optimizer
        self.target_network_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight
        )

        # Target network distributions
        self.target_distributions = self.create_distributions()

        # Critic
        size_t0 = self.critic_network_spec['size_t0']
        size_t1 = self.critic_network_spec['size_t1']

        self.critic_network = DDPGCriticNetwork(scope='critic', size_t0=size_t0, size_t1=size_t1)
        self.critic_optimizer = Optimizer.from_spec(
            spec=self.critic_optimizer_spec,
            kwargs=dict(summary_labels=self.summary_labels)
        )

        self.target_critic_network = DDPGCriticNetwork(scope='target-critic', size_t0=size_t0, size_t1=size_t1)

        # Target critic optimizer
        self.target_critic_optimizer = Synchronization(
            sync_frequency=self.target_sync_frequency,
            update_weight=self.target_update_weight
        )

        self.fn_target_actions_and_internals = tf.make_template(
            name_='target-actions-and-internals',
            func_=self.tf_target_actions_and_internals,
            custom_getter_=custom_getter
        )
        self.fn_predict_target_q = tf.make_template(
            name_='predict-target-q',
            func_=self.tf_predict_target_q,
            custom_getter_=custom_getter
        )

        return custom_getter

    def tf_target_actions_and_internals(self, states, internals, deterministic=True):
        embedding, internals = self.target_network.apply(
            x=states,
            internals=internals,
            update=tf.constant(value=False),
            return_internals=True
        )

        actions = dict()
        for name in sorted(self.target_distributions):
            distribution = self.target_distributions[name]
            distr_params = distribution.parameterize(x=embedding)
            actions[name] = distribution.sample(
                distr_params=distr_params,
                deterministic=tf.logical_or(x=deterministic, y=self.requires_deterministic)
            )

        return actions, internals

    def tf_loss_per_instance(self, states, internals, actions, terminal, reward, next_states, next_internals, update, reference=None):
        q = self.critic_network.apply(dict(states=states, actions=actions), internals=internals, update=update)
        return -q

    def tf_predict_target_q(self, states, internals, terminal, actions, reward, update):
        q_value = self.target_critic_network.apply(dict(states=states, actions=actions), internals=internals, update=update)
        return reward + (1. - tf.cast(terminal, dtype=tf.float32)) * self.discount * q_value

    def tf_optimization(self, states, internals, actions, terminal, reward, next_states=None, next_internals=None):
        update = tf.constant(value=True)

        # Predict actions from target actor
        next_target_actions, next_target_internals = self.fn_target_actions_and_internals(
            states=next_states, internals=next_internals, deterministic=True
        )

        # Predicted Q value of next states
        predicted_q = self.fn_predict_target_q(
            states=next_states,
            internals=next_internals,
            actions=next_target_actions,
            terminal=terminal,
            reward=reward,
            update=update
        )
        predicted_q = tf.stop_gradient(input=predicted_q)

        real_q = self.critic_network.apply(dict(states=states, actions=actions), internals=internals, update=update)

        # Update critic
        def fn_critic_loss(predicted_q, real_q):
            return tf.reduce_mean(tf.square(real_q - predicted_q))

        critic_optimization = self.critic_optimizer.minimize(
            time=self.timestep,
            variables=self.critic_network.get_variables(),
            arguments=dict(predicted_q=predicted_q, real_q=real_q),
            fn_loss=fn_critic_loss
        )

        # Update actor
        predicted_actions, predicted_internals = self.fn_actions_and_internals(
            states=states, internals=internals, deterministic=True
        )

        optimization = super(DPGTargetModel, self).tf_optimization(
            states=states,
            internals=internals,
            actions=predicted_actions,
            terminal=terminal,
            reward=reward,
            next_states=next_states,
            next_internals=next_internals
        )

        # Update target actor (network) and critic
        network_distributions_variables = [
            variable
            for name in sorted(self.distributions)
            for variable in self.distributions[name].get_variables(include_nontrainable=False)
        ]
        target_distributions_variables = [
            variable
            for name in sorted(self.target_distributions)
            for variable in self.target_distributions[name].get_variables(include_nontrainable=False)
        ]

        target_optimization = self.target_network_optimizer.minimize(
            time=self.timestep,
            variables=self.target_network.get_variables() + target_distributions_variables,
            source_variables=self.network.get_variables() + network_distributions_variables
        )

        target_critic_optimization = self.target_critic_optimizer.minimize(
            time=self.timestep,
            variables=self.target_critic_network.get_variables(),
            source_variables=self.critic_network.get_variables()
        )

        return tf.group(critic_optimization, optimization, target_optimization, target_critic_optimization)

    def get_variables(self, include_submodules=False, include_nontrainable=False):
        model_variables = super(DPGTargetModel, self).get_variables(
            include_submodules=include_submodules,
            include_nontrainable=include_nontrainable
        )

        critic_variables = self.critic_network.get_variables(include_nontrainable=include_nontrainable)
        model_variables += critic_variables

        if include_nontrainable:
            critic_optimizer_variables = self.critic_optimizer.get_variables()
            for variable in critic_optimizer_variables:
                if variable in model_variables:
                    model_variables.remove(variable)
            model_variables += critic_optimizer_variables

        if include_submodules:
            target_variables = self.target_network.get_variables(include_nontrainable=include_nontrainable)
            model_variables += target_variables

            target_distributions_variables = [
                variable
                for name in sorted(self.target_distributions)
                for variable in self.target_distributions[name].get_variables(include_nontrainable=include_nontrainable)
            ]
            model_variables += target_distributions_variables

            target_critic_variables = self.target_critic_network.get_variables(include_nontrainable=include_nontrainable)
            model_variables += target_critic_variables

            if include_nontrainable:
                target_optimizer_variables = self.target_network_optimizer.get_variables()
                model_variables += target_optimizer_variables

                target_critic_optimizer_variables = self.target_critic_optimizer.get_variables()
                model_variables += target_critic_optimizer_variables

        return model_variables

    def get_components(self):
        result = dict(super(DPGTargetModel, self).get_components())
        result[DPGTargetModel.COMPONENT_CRITIC] = self.critic_network
        result[DPGTargetModel.COMPONENT_TARGET_NETWORK] = self.target_network
        for name in sorted(self.target_distributions):
            result["%s_%s" % (DPGTargetModel.COMPONENT_TARGET_DISTRIBUTION, name)] = self.target_distributions[name]
        if len(self.target_distributions) == 1:
            result[DPGTargetModel.COMPONENT_TARGET_DISTRIBUTION] = self.target_distributions[next(iter(sorted(self.target_distributions)))]
        return result

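# Hedged sketch (assumption, not from the source): the Synchronization optimizers used
# above are assumed to implement a periodic hard/soft target update roughly equivalent
# to the NumPy logic below, i.e. target <- target + update_weight * (source - target)
# every sync_frequency timesteps. Function and variable names are illustrative only.
import numpy as np

def sync_target(target_vars, source_vars, timestep, sync_frequency, update_weight):
    # Leave the target untouched between synchronization points.
    if timestep % sync_frequency != 0:
        return target_vars
    # update_weight == 1.0 gives a hard copy; smaller values give Polyak averaging.
    return [t + update_weight * (s - t) for t, s in zip(target_vars, source_vars)]

print(sync_target([np.zeros(2)], [np.ones(2)], timestep=10000, sync_frequency=10000, update_weight=1.0))
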