def __init__(self, config, scope, define_network=None):
    """
    Policy-gradient model: builds state/action placeholders, the policy
    network with its action distribution, and a baseline value function.

    :param config: Configuration parameters
    :param scope: TensorFlow scope
    :param define_network: Optional network definition function; defaults to
        a layered network built from ``config.network_layers``
    """
    super(PGModel, self).__init__(config, scope)
    self.batch_size = self.config.batch_size
    self.action_count = self.config.actions
    # Generalized advantage estimation settings
    self.use_gae = self.config.use_gae
    self.gae_lambda = self.config.gae_lambda
    self.gamma = self.config.gamma
    self.continuous = self.config.continuous
    # NOTE: the config key uses British spelling ('normalise'), the attribute American
    self.normalize_advantage = self.config.normalise_advantage
    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()
    # NOTE(review): self.batch_shape is not assigned in this constructor —
    # presumably set by the superclass __init__; verify.
    self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
    self.episode = 0
    self.input_feed = None
    self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
    self.policy = None
    # Optional prefix so multiple models can coexist in one graph
    scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'
    if define_network is None:
        define_network = NeuralNetwork.layered_network(self.config.network_layers)
    self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')
    # NOTE(review): the Saver is constructed before the policy and baseline
    # value-function variables below are created, so those variables may not
    # be captured by it — confirm whether this is intended.
    self.saver = tf.train.Saver()
    self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
    self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')
    # From an API perspective, continuous vs discrete might be easier than
    # requiring to set the concrete policy, at least currently
    if self.continuous:
        self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                     self.action_count, 'gaussian_policy')
        self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])
        self.prev_dist = dict(policy_output=self.prev_action_means,
                              policy_log_std=self.prev_action_log_stds)
    else:
        self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                              self.action_count, 'categorical_policy')
        self.prev_dist = dict(policy_output=self.prev_action_means)
    # Probability distribution used in the current policy
    self.dist = self.policy.get_distribution()
    # TODO configurable value functions — hidden sizes 100/64 are hard-coded here
    self.baseline_value_function = MLPValueFunction(self.session, 100, 64)
def __init__(self, config, scope, task_index, cluster_spec, define_network=None):
    """
    A distributed agent must synchronise local and global parameters under
    different scopes. This constructor builds the shared ("global") side of
    the graph on the parameter server; the worker-local side is created in
    :meth:`create_training_operations`, which is called at the end.

    :param config: Configuration parameters
    :param scope: TensorFlow scope
    :param task_index: Index of this worker task (used for device placement)
    :param cluster_spec: TensorFlow cluster specification
    :param define_network: Optional network definition function; defaults to
        a layered network built from ``config.network_layers``
    """
    self.session = None
    self.saver = None
    self.config = create_config(config, default=self.default_config)
    self.scope = scope
    self.task_index = task_index
    self.batch_size = self.config.batch_size
    self.action_count = self.config.actions
    # Generalized advantage estimation settings
    self.use_gae = self.config.use_gae
    self.gae_lambda = self.config.gae_lambda
    self.gamma = self.config.gamma
    self.continuous = self.config.continuous
    # NOTE: the config key uses British spelling ('normalise'), the attribute American
    self.normalize_advantage = self.config.normalise_advantage
    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()
    if define_network is None:
        self.define_network = NeuralNetwork.layered_network(self.config.network_layers)
    else:
        self.define_network = define_network
    # This is the scope used to prefix variable creation for distributed TensorFlow
    self.batch_shape = [None]
    # NOTE(review): these read the raw config dict rather than self.config — verify intended.
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.alpha = config.get('alpha', 0.001)
    self.optimizer = None
    # Pin worker-local ops to this task's CPU device
    self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)
    with tf.device(
            tf.train.replica_device_setter(
                1, worker_device=self.worker_device, cluster=cluster_spec)):
        with tf.variable_scope("global"):
            self.global_state = tf.placeholder(
                tf.float32,
                self.batch_shape + list(self.config.state_shape),
                name="global_state")
            self.global_network = NeuralNetwork(self.define_network, [self.global_state])
            # Shared step counter, incremented by every worker's updates
            self.global_step = tf.get_variable(
                "global_step", [],
                tf.int32,
                initializer=tf.constant_initializer(0, dtype=tf.int32),
                trainable=False)
            self.global_prev_action_means = tf.placeholder(
                tf.float32, [None, self.action_count], name='prev_actions')
            # NOTE(review): self.session is still None at this point and is
            # passed to the policy constructors — presumably the policies only
            # store it for later use; verify.
            if self.continuous:
                self.global_policy = GaussianPolicy(
                    self.global_network, self.session, self.global_state,
                    self.random, self.action_count, 'gaussian_policy')
                self.global_prev_action_log_stds = tf.placeholder(
                    tf.float32, [None, self.action_count])
                self.global_prev_dist = dict(
                    policy_output=self.global_prev_action_means,
                    policy_log_std=self.global_prev_action_log_stds)
            else:
                self.global_policy = CategoricalOneHotPolicy(
                    self.global_network, self.session, self.global_state,
                    self.random, self.action_count, 'categorical_policy')
                self.global_prev_dist = dict(
                    policy_output=self.global_prev_action_means)
            # Probability distribution used in the current policy
            self.global_baseline_value_function = LinearValueFunction()
    # NOTE(review): optimizer configuration is disabled here, but
    # create_training_operations still reads self.optimizer — confirm.
    # self.optimizer = config.get('optimizer')
    # self.optimizer_args = config.get('optimizer_args', [])
    # self.optimizer_kwargs = config.get('optimizer_kwargs', {})
    exploration = config.get('exploration')
    if not exploration:
        # Default: constant zero exploration
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)
    self.create_training_operations()
def create_training_operations(self):
    """
    Builds the worker-local policy network and the training operations that
    apply locally computed gradients to the shared global network
    (A3C-style: gradients w.r.t. local variables, applied to global ones).

    Currently a duplicate of the pg agent logic, to be made generic later to
    allow all models to be executed asynchronously/distributed seamlessly.
    """
    # TODO rewrite agent logic so core update logic can be composed into
    # TODO distributed logic
    with tf.device(self.worker_device):
        with tf.variable_scope("local"):
            # Worker-local input placeholders; shapes mirror the global graph.
            self.state = tf.placeholder(
                tf.float32,
                self.batch_shape + list(self.config.state_shape),
                name="state")
            self.prev_action_means = tf.placeholder(
                tf.float32, [None, self.action_count], name='prev_actions')
            self.local_network = NeuralNetwork(self.define_network, [self.state])
            # TODO possibly problematic, check
            self.local_step = self.global_step

            if self.continuous:
                self.policy = GaussianPolicy(
                    self.local_network, self.session, self.state, self.random,
                    self.action_count, 'gaussian_policy')
                self.prev_action_log_stds = tf.placeholder(
                    tf.float32, [None, self.action_count])
                self.prev_dist = dict(
                    policy_output=self.prev_action_means,
                    policy_log_std=self.prev_action_log_stds)
            else:
                self.policy = CategoricalOneHotPolicy(
                    self.local_network, self.session, self.state, self.random,
                    self.action_count, 'categorical_policy')
                self.prev_dist = dict(policy_output=self.prev_action_means)

            self.baseline_value_function = LinearValueFunction()
            self.actions = tf.placeholder(
                tf.float32, [None, self.action_count], name='actions')
            self.advantage = tf.placeholder(
                tf.float32, shape=[None, 1], name='advantage')
            # Probability distribution used in the current policy
            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # Concise: Get log likelihood of actions, weigh by advantages,
            # compute gradient on that
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")
            # Gradients are taken w.r.t. the local network's variables but
            # paired with (and applied to) the global network's variables.
            self.gradients = tf.gradients(self.loss, self.local_network.get_variables())
            grad_var_list = list(
                zip(self.gradients, self.global_network.get_variables()))
            # Count processed states into the shared global step.
            global_step_inc = self.global_step.assign_add(tf.shape(self.state)[0])

            # Op that copies the current global parameters into the local network.
            self.assign_global_to_local = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(self.local_network.get_variables(),
                                  self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)

            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)
            else:
                # BUGFIX: self.optimizer_args / self.optimizer_kwargs are never
                # assigned in __init__ (those assignments are commented out), so
                # referencing them directly raised AttributeError whenever a
                # custom optimizer name was configured. Fall back to empty
                # args/kwargs when the attributes are absent or None.
                optimizer_cls = get_function(self.optimizer)
                optimizer_args = getattr(self, 'optimizer_args', None) or []
                optimizer_kwargs = getattr(self, 'optimizer_kwargs', None) or {}
                self.optimizer = optimizer_cls(self.alpha, *optimizer_args,
                                               **optimizer_kwargs)

            # Single op: apply gradients to global vars and bump the global step.
            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)
def __init__(self, config, scope, network_builder=None):
    """
    Policy-gradient model with (batch, time, ...) shaped placeholders:
    builds the state/action inputs, the policy network and its action
    distribution, and a linear baseline value function.

    :param config: Configuration parameters (batch size, action count,
        advantage-estimation settings, network layers, ...)
    :param scope: TensorFlow scope
    :param network_builder: Optional network definition function; defaults
        to a layered network built from ``config.network_layers``
    """
    super(PGModel, self).__init__(config, scope)
    self.continuous = self.config.continuous
    self.batch_size = self.config.batch_size
    # An episode longer than the batch size could never fit into one batch.
    self.max_episode_length = min(self.config.max_episode_length, self.batch_size)
    self.action_count = self.config.actions

    # advantage estimation
    self.gamma = self.config.gamma
    self.generalized_advantage_estimation = self.config.gae
    self.gae_lambda = self.config.gae_lambda
    self.normalize_advantage = self.config.normalize_advantage

    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    # Placeholders are shaped (batch, time, ...); both leading dims are dynamic.
    self.state_shape = tuple(self.config.state_shape)
    self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape,
                                name="state")
    self.actions = tf.placeholder(tf.float32, (None, None, self.action_count),
                                  name='actions')
    self.prev_action_means = tf.placeholder(
        tf.float32, (None, None, self.action_count), name='prev_actions')
    self.advantage = tf.placeholder(tf.float32, shape=(None, None, 1),
                                    name='advantage')

    if network_builder is None:
        network_builder = NeuralNetwork.layered_network(self.config.network_layers)
    # Optional prefix so multiple models can coexist in one graph
    if self.config.tf_scope is None:
        scope = ''
    else:
        scope = self.config.tf_scope + '-'
    # NOTE(review): self.episode_length is not assigned in this constructor;
    # presumably set by the superclass __init__ — verify.
    self.network = NeuralNetwork(network_builder,
                                 inputs=[self.state],
                                 episode_length=self.episode_length,
                                 scope=scope + 'value_function')
    self.internal_states = self.network.internal_state_inits

    # From an API perspective, continuous vs discrete might be easier than
    # requiring to set the concrete policy, at least currently
    if self.continuous:
        self.policy = GaussianPolicy(self.network, self.session, self.state,
                                     self.random, self.action_count,
                                     'gaussian_policy')
        self.prev_action_log_stds = tf.placeholder(
            tf.float32, (None, None, self.action_count))
        self.prev_dist = dict(policy_output=self.prev_action_means,
                              policy_log_std=self.prev_action_log_stds)
    else:
        self.policy = CategoricalOneHotPolicy(self.network, self.session,
                                              self.state, self.random,
                                              self.action_count,
                                              'categorical_policy')
        self.prev_dist = dict(policy_output=self.prev_action_means)

    # Probability distribution used in the current policy
    self.dist = self.policy.get_distribution()

    # BUGFIX: removed dead code — a flattened state size (`size`) was computed
    # from self.state_shape but never used afterwards.
    self.baseline_value_function = LinearValueFunction()