def __init__(self, config, scope):
    """
    :param config: Configuration parameters
    :param scope: TensorFlow scope
    """
    self.session = tf.Session()
    self.total_states = 0
    self.saver = None
    self.config = create_config(config, default=self.default_config)

    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(log_levels[config.get('loglevel', 'info')])

    # This is the scope used to prefix variable creation for distributed TensorFlow
    self.scope = scope
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.episode_length = tf.placeholder(tf.int32, (None,), name='episode_length')
    self.learning_rate = config.get('learning_rate', 0.001)

    if self.config.seed is not None:
        self.random = global_seed(self.config.seed)
        tf.set_random_seed(self.config.seed)
    else:
        self.random = np.random.RandomState()

    optimizer = config.get('optimizer')
    if not optimizer:
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
    else:
        args = config.get('optimizer_args', [])
        kwargs = config.get('optimizer_kwargs', {})
        optimizer_cls = get_function(optimizer)
        self.optimizer = optimizer_cls(self.learning_rate, *args, **kwargs)

    exploration = config.get('exploration')
    if not exploration:
        # Default to constant zero exploration, i.e. fully greedy behavior
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)
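# A minimal configuration sketch for the constructor above. The keys mirror
# the `config.get` calls in __init__; the optimizer path, the exploration key,
# and `MyModel` are illustrative assumptions, not names confirmed by this file.
example_config = {
    'loglevel': 'info',            # must be a key of `log_levels`
    'deterministic_mode': False,
    'learning_rate': 0.001,
    'seed': 42,                    # seeds both numpy sampling and TensorFlow
    'optimizer': 'tensorflow.python.training.rmsprop.RMSPropOptimizer',
    'optimizer_kwargs': {'decay': 0.9},
    'exploration': 'constant',     # must be a key of `exploration_mode`
    'exploration_args': [0.1],
}
# model = MyModel(example_config, scope='worker_0')  # `MyModel` is a hypothetical subclass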
def __init__(self, config, scope):
    """
    :param config: Configuration parameters
    :param scope: TensorFlow scope
    """
    # TODO move several default params up here
    self.session = tf.Session()
    self.total_states = 0
    self.saver = None
    self.config = create_config(config, default=self.default_config)

    # This is the scope used to prefix variable creation for distributed TensorFlow
    self.scope = scope
    self.batch_shape = [None]
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.alpha = config.get('alpha', 0.001)

    optimizer = config.get('optimizer')
    if not optimizer:
        self.optimizer = tf.train.AdamOptimizer(self.alpha)
    else:
        args = config.get('optimizer_args', [])
        kwargs = config.get('optimizer_kwargs', {})
        optimizer_cls = get_function(optimizer)
        self.optimizer = optimizer_cls(self.alpha, *args, **kwargs)

    exploration = config.get('exploration')
    if not exploration:
        # Default to constant zero exploration, i.e. fully greedy behavior
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)
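# Standalone sketch of the optimizer lookup above. `get_function` is not
# defined in this file; this assumes it resolves a dotted import path to a
# class, and mimics that assumed behavior with importlib for illustration.
import importlib

def resolve_class(dotted_path):
    # Split 'package.module.ClassName' into module path and class name.
    module_name, _, class_name = dotted_path.rpartition('.')
    return getattr(importlib.import_module(module_name), class_name)

# Same call shape as `optimizer_cls(self.alpha, *args, **kwargs)` above:
optimizer_cls = resolve_class('tensorflow.python.training.adam.AdamOptimizer')
optimizer = optimizer_cls(0.001)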
def create_training_operations(self):
    """
    Currently duplicates the pg agent logic; to be made generic later so that
    all models can be executed asynchronously/distributed seamlessly.
    """
    # TODO rewrite agent logic so core update logic can be composed into
    # TODO distributed logic
    with tf.device(self.worker_device):
        with tf.variable_scope("local"):
            self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape),
                                        name="state")
            self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count],
                                                    name='prev_actions')
            self.local_network = NeuralNetwork(self.define_network, [self.state])
            # TODO possibly problematic, check
            self.local_step = self.global_step

            if self.continuous:
                self.policy = GaussianPolicy(self.local_network, self.session, self.state,
                                             self.random, self.action_count, 'gaussian_policy')
                self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])
                self.prev_dist = dict(policy_output=self.prev_action_means,
                                      policy_log_std=self.prev_action_log_stds)
            else:
                self.policy = CategoricalOneHotPolicy(self.local_network, self.session, self.state,
                                                      self.random, self.action_count, 'categorical_policy')
                self.prev_dist = dict(policy_output=self.prev_action_means)

            self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
            self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')

            # Probability distribution used in the current policy
            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(self.policy.get_policy_variables(), self.actions)

            # Get the log-likelihood of the chosen actions, weight it by the
            # advantages, and take the gradient of that
            self.loss = -tf.reduce_mean(self.log_probabilities * self.advantage, name="loss_op")
            self.gradients = tf.gradients(self.loss, self.local_network.get_variables())

            # Gradients are computed on the local network but applied to the shared global network
            grad_var_list = list(zip(self.gradients, self.global_network.get_variables()))
            global_step_inc = self.global_step.assign_add(tf.shape(self.state)[0])

            # Op to copy the global parameters into this worker's local network
            self.assign_global_to_local = tf.group(*[
                v1.assign(v2) for v1, v2 in zip(self.local_network.get_variables(),
                                                self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)

            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)
            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.alpha, *self.optimizer_args, **self.optimizer_kwargs)

            self.optimize_op = tf.group(self.optimizer.apply_gradients(grad_var_list), global_step_inc)
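# Standalone numeric sketch of the loss defined above: for a categorical
# policy, minimizing -mean(log_prob(action) * advantage) is the REINFORCE
# (score-function) objective, which raises the probability of actions with
# positive advantage. The values below are illustrative only.
import numpy as np

logits = np.array([0.2, -0.1, 0.05])           # unnormalized policy scores
probs = np.exp(logits) / np.exp(logits).sum()  # softmax over 3 actions
actions = np.array([0, 2, 1])                  # sampled action indices
advantages = np.array([1.5, -0.5, 0.3])        # advantage estimates
log_probabilities = np.log(probs[actions])
loss = -np.mean(log_probabilities * advantages)
# Differentiating `loss` w.r.t. the policy parameters (here, the logits)
# yields the policy-gradient update that `optimize_op` applies above. In the
# distributed setup, a worker would typically run `assign_global_to_local`
# to pull fresh parameters, then `optimize_op` with a feed for `state`,
# `actions`, and `advantage`.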