def __init__(self, config, scope, network_builder=None):
    """
    Training logic for DQN.

    :param config: Configuration dict
    :param scope: TensorFlow scope
    :param network_builder: Optional custom network builder
    """
    super(DQNModel, self).__init__(config, scope)

    self.action_count = self.config.actions
    self.tau = self.config.tau
    self.gamma = self.config.gamma
    # self.batch_size = self.config.batch_size
    self.double_dqn = self.config.double_dqn

    self.clip_value = None
    if self.config.clip_gradients:
        self.clip_value = self.config.clip_value

    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    self.target_network_update = []

    # output layer
    output_layer_config = [{"type": "linear", "num_outputs": self.config.actions, "trainable": True}]

    # Input placeholders
    self.state_shape = tuple(self.config.state_shape)
    self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="state")
    self.next_states = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="next_states")
    self.terminals = tf.placeholder(tf.float32, (None, None), name='terminals')
    self.rewards = tf.placeholder(tf.float32, (None, None), name='rewards')

    if network_builder is None:
        network_builder = NeuralNetwork.layered_network(self.config.network_layers + output_layer_config)

    self.training_network = NeuralNetwork(network_builder, [self.state], episode_length=self.episode_length,
                                          scope=self.scope + 'training')
    self.target_network = NeuralNetwork(network_builder, [self.next_states], episode_length=self.episode_length,
                                        scope=self.scope + 'target')

    self.training_internal_states = self.training_network.internal_state_inits
    self.target_internal_states = self.target_network.internal_state_inits

    self.training_output = self.training_network.output
    self.target_output = self.target_network.output

    # Create training operations
    self.create_training_operations()

    self.init_op = tf.global_variables_initializer()
    self.saver = tf.train.Saver()
    self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())
    self.session.run(self.init_op)
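# For orientation, a minimal configuration sketch covering the keys this DQN constructor reads.
# The concrete values below are illustrative assumptions, not library defaults, and the
# 'network_layers' entry is a hypothetical layer spec.
dqn_config = {
    'actions': 4,                 # self.config.actions -> action count and output layer size
    'tau': 0.001,                 # target network update rate
    'gamma': 0.99,                # discount factor
    'double_dqn': False,          # whether to use double Q-learning targets
    'clip_gradients': True,       # if truthy, clip_value is read as well
    'clip_value': 10.0,
    'deterministic_mode': False,  # if True, global_seed() controls random number generation
    'state_shape': (84, 84, 4),   # used to build the state/next_states placeholders
    'network_layers': [{'type': 'dense', 'num_outputs': 64}]
}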
def __init__(self, config, scope, define_network=None):
    super(PGModel, self).__init__(config, scope)

    self.batch_size = self.config.batch_size
    self.action_count = self.config.actions
    self.use_gae = self.config.use_gae
    self.gae_lambda = self.config.gae_lambda

    self.gamma = self.config.gamma
    self.continuous = self.config.continuous
    self.normalize_advantage = self.config.normalise_advantage

    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
    self.episode = 0
    self.input_feed = None

    self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')

    self.policy = None

    scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

    if define_network is None:
        define_network = NeuralNetwork.layered_network(self.config.network_layers)

    self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')

    self.saver = tf.train.Saver()
    self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
    self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')

    # From an API perspective, continuous vs discrete might be easier than
    # requiring to set the concrete policy, at least currently
    if self.continuous:
        self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                     self.action_count, 'gaussian_policy')
        self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

        self.prev_dist = dict(policy_output=self.prev_action_means,
                              policy_log_std=self.prev_action_log_stds)
    else:
        self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                              self.action_count, 'categorical_policy')
        self.prev_dist = dict(policy_output=self.prev_action_means)

    # Probability distribution used in the current policy
    self.dist = self.policy.get_distribution()

    # TODO configurable value functions
    self.baseline_value_function = MLPValueFunction(self.session, 100, 64)
def __init__(self, memory_capacity, state_shape, action_shape, state_type=np.float32, action_type=np.int,
             reward_type=np.float32, deterministic_mode=False, *args, **kwargs):
    """
    Initializes a replay memory.

    :param memory_capacity: Memory size
    :param state_shape: Shape of state tensor
    :param state_type: Data type of state tensor
    :param action_shape: Shape of action tensor
    :param action_type: Data type of action tensor
    :param reward_type: Data type of reward function
    :param deterministic_mode: If true, global random number generation is controlled by passing
        the same seed to all generators; if false, no seed is used for sampling.
    """
    self.step_count = 0
    self.capacity = int(memory_capacity)
    self.size = 0

    # Explicitly set data types for every tensor to make for easier adjustments
    # if backend precision changes
    self.state_shape = state_shape
    self.state_type = state_type
    self.action_shape = action_shape
    self.action_type = action_type
    self.reward_type = reward_type

    self.states = np.zeros([self.capacity] + list(self.state_shape), dtype=self.state_type)
    self.actions = np.zeros([self.capacity] + list(self.action_shape), dtype=self.action_type)
    self.rewards = np.zeros([self.capacity], dtype=self.reward_type)
    self.terminals = np.zeros([self.capacity], dtype=bool)

    if deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    # Index to control sampling
    self.top = 0
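# A small usage sketch for the replay memory constructor above. The class name ReplayMemory
# is assumed here for illustration; only the constructor arguments shown above are taken from
# the code, the argument values are placeholders.
memory = ReplayMemory(
    memory_capacity=10000,
    state_shape=(84, 84, 4),
    action_shape=(),            # scalar discrete actions
    state_type=np.float32,
    action_type=np.int,
    reward_type=np.float32,
    deterministic_mode=False
)
# The preallocated buffers then have shapes (10000, 84, 84, 4) for states and
# (10000,) for actions, rewards and terminals.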
def __init__(self, config, scope, define_network=None): """ Training logic for NAFs. :param config: Configuration parameters """ super(NAFModel, self).__init__(config, scope) self.action_count = self.config.actions self.tau = self.config.tau self.epsilon = self.config.epsilon self.gamma = self.config.gamma self.batch_size = self.config.batch_size if self.config.deterministic_mode: self.random = global_seed() else: self.random = np.random.RandomState() self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state") self.next_states = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="next_states") self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions') self.terminals = tf.placeholder(tf.float32, [None], name='terminals') self.rewards = tf.placeholder(tf.float32, [None], name='rewards') self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets') self.target_network_update = [] self.episode = 0 # Get hidden layers from network generator, then add NAF outputs, same for target network scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-' if define_network is None: define_network = NeuralNetwork.layered_network(self.config.network_layers) self.training_model = NeuralNetwork(define_network, [self.state], scope=scope + 'training') self.target_model = NeuralNetwork(define_network, [self.next_states], scope=scope + 'target') # Create output fields self.training_v, self.mu, self.advantage, self.q, self.training_output_vars = self.create_outputs( self.training_model.get_output(), 'outputs_training') self.target_v, _, _, _, self.target_output_vars = self.create_outputs(self.target_model.get_output(), 'outputs_target') self.create_training_operations() self.saver = tf.train.Saver() self.session.run(tf.global_variables_initializer())
def __init__(self, config, scope): """ :param config: Configuration parameters :param scope: TensorFlow scope """ self.session = tf.Session() self.total_states = 0 self.saver = None self.config = create_config(config, default=self.default_config) self.logger = logging.getLogger(__name__) self.logger.setLevel(log_levels[config.get('loglevel', 'info')]) # This is the scope used to prefix variable creation for distributed TensorFlow self.scope = scope self.deterministic_mode = config.get('deterministic_mode', False) self.episode_length = tf.placeholder(tf.int32, (None, ), name='episode_length') self.learning_rate = config.get('learning_rate', 0.001) if self.config.seed is not None: self.random = global_seed(self.config.seed) tf.set_random_seed(self.config.seed) else: self.random = np.random.RandomState() optimizer = config.get('optimizer') if not optimizer: self.optimizer = tf.train.AdamOptimizer(self.learning_rate) else: args = config.get('optimizer_args', []) kwargs = config.get('optimizer_kwargs', {}) optimizer_cls = get_function(optimizer) self.optimizer = optimizer_cls(self.learning_rate, *args, **kwargs) exploration = config.get('exploration') if not exploration: self.exploration = exploration_mode['constant'](self, 0) else: args = config.get('exploration_args', []) kwargs = config.get('exploration_kwargs', {}) self.exploration = exploration_mode[exploration](self, *args, **kwargs)
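# The base constructor above resolves its optimizer and exploration strategy from the config.
# A hedged sketch of the relevant keys (values are illustrative only, not library defaults):
base_config = {
    'loglevel': 'info',        # looked up in log_levels
    'deterministic_mode': False,
    'learning_rate': 0.001,    # defaults to 0.001 when absent
    'seed': None,              # if set, global_seed(seed) and tf.set_random_seed(seed) are used
    'optimizer': None,         # falls back to tf.train.AdamOptimizer(learning_rate)
    'optimizer_args': [],
    'optimizer_kwargs': {},
    'exploration': None,       # falls back to exploration_mode['constant'](model, 0)
    'exploration_args': [],
    'exploration_kwargs': {}
}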
def __init__(self, config, scope, define_network=None): """ Training logic for DQN. :param config: Configuration dict """ super(DQNModel, self).__init__(config, scope) self.action_count = self.config.actions self.tau = self.config.tau self.gamma = self.config.gamma self.batch_size = self.config.batch_size self.double_dqn = self.config.double_dqn self.clip_value = None if self.config.clip_gradients: self.clip_value = self.config.clip_value if self.config.deterministic_mode: self.random = global_seed() else: self.random = np.random.RandomState() self.target_network_update = [] # output layer output_layer_config = [{ "type": "linear", "num_outputs": self.config.actions, "trainable": True }] self.device = self.config.tf_device if self.device == 'replica': self.device = tf.train.replica_device_setter( ps_tasks=1, worker_device=self.config.tf_worker_device) with tf.device(self.device): # Input placeholders self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state") self.next_states = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="next_states") self.terminals = tf.placeholder(tf.float32, self.batch_shape, name='terminals') self.rewards = tf.placeholder(tf.float32, self.batch_shape, name='rewards') if define_network is None: define_network = NeuralNetwork.layered_network( self.config.network_layers + output_layer_config) self.training_model = NeuralNetwork(define_network, [self.state], scope=self.scope + 'training') self.target_model = NeuralNetwork(define_network, [self.next_states], scope=self.scope + 'target') self.training_output = self.training_model.get_output() self.target_output = self.target_model.get_output() # Create training operations self.create_training_operations() self.optimizer = tf.train.RMSPropOptimizer(self.alpha, momentum=0.95, epsilon=0.01) self.training_output = self.training_model.get_output() self.target_output = self.target_model.get_output() self.init_op = tf.global_variables_initializer() self.saver = tf.train.Saver() self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())
def __init__(self, config, scope, task_index, cluster_spec, define_network=None):
    """
    A distributed agent must synchronise local and global parameters under different scopes.

    :param config: Configuration parameters
    :param scope: TensorFlow scope
    """
    self.session = None
    self.saver = None
    self.config = create_config(config, default=self.default_config)
    self.scope = scope
    self.task_index = task_index
    self.batch_size = self.config.batch_size
    self.action_count = self.config.actions
    self.use_gae = self.config.use_gae
    self.gae_lambda = self.config.gae_lambda

    self.gamma = self.config.gamma
    self.continuous = self.config.continuous
    self.normalize_advantage = self.config.normalise_advantage

    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    if define_network is None:
        self.define_network = NeuralNetwork.layered_network(self.config.network_layers)
    else:
        self.define_network = define_network

    self.batch_shape = [None]
    self.deterministic_mode = config.get('deterministic_mode', False)
    self.alpha = config.get('alpha', 0.001)
    self.optimizer = None

    self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

    with tf.device(tf.train.replica_device_setter(1, worker_device=self.worker_device, cluster=cluster_spec)):
        with tf.variable_scope("global"):
            self.global_state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape),
                                               name="global_state")
            self.global_network = NeuralNetwork(self.define_network, [self.global_state])
            self.global_step = tf.get_variable("global_step", [], tf.int32,
                                               initializer=tf.constant_initializer(0, dtype=tf.int32),
                                               trainable=False)
            self.global_prev_action_means = tf.placeholder(tf.float32, [None, self.action_count],
                                                           name='prev_actions')

            if self.continuous:
                self.global_policy = GaussianPolicy(self.global_network, self.session, self.global_state,
                                                    self.random, self.action_count, 'gaussian_policy')
                self.global_prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

                self.global_prev_dist = dict(policy_output=self.global_prev_action_means,
                                             policy_log_std=self.global_prev_action_log_stds)
            else:
                self.global_policy = CategoricalOneHotPolicy(self.global_network, self.session,
                                                             self.global_state, self.random,
                                                             self.action_count, 'categorical_policy')
                self.global_prev_dist = dict(policy_output=self.global_prev_action_means)

            self.global_baseline_value_function = LinearValueFunction()

    # self.optimizer = config.get('optimizer')
    # self.optimizer_args = config.get('optimizer_args', [])
    # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

    exploration = config.get('exploration')
    if not exploration:
        self.exploration = exploration_mode['constant'](self, 0)
    else:
        args = config.get('exploration_args', [])
        kwargs = config.get('exploration_kwargs', {})
        self.exploration = exploration_mode[exploration](self, *args, **kwargs)

    self.create_training_operations()
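# Sketch of how the cluster_spec and task_index arguments of the distributed constructor above
# might be produced with standard TensorFlow 1.x APIs; the host/port addresses are placeholder
# assumptions, not values from this codebase.
cluster_spec = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224']
})
# task_index selects this worker's device, matching
# "/job:worker/task:{}/cpu:0".format(task_index) in the constructor.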
def __init__(self, model=None):
    self.model = model

    if self.model.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()
def __init__(self, config, scope, network_builder=None):
    """
    Generic policy gradient model initialization.

    :param config: Configuration parameters
    :param scope: TensorFlow scope
    :param network_builder: Optional custom network builder
    """
    super(PGModel, self).__init__(config, scope)

    self.continuous = self.config.continuous
    self.batch_size = self.config.batch_size
    self.max_episode_length = min(self.config.max_episode_length, self.batch_size)
    self.action_count = self.config.actions

    # advantage estimation
    self.gamma = self.config.gamma
    self.generalized_advantage_estimation = self.config.gae
    self.gae_lambda = self.config.gae_lambda
    self.normalize_advantage = self.config.normalize_advantage

    if self.config.deterministic_mode:
        self.random = global_seed()
    else:
        self.random = np.random.RandomState()

    self.state_shape = tuple(self.config.state_shape)
    self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="state")
    self.actions = tf.placeholder(tf.float32, (None, None, self.action_count), name='actions')
    self.prev_action_means = tf.placeholder(tf.float32, (None, None, self.action_count), name='prev_actions')
    self.advantage = tf.placeholder(tf.float32, shape=(None, None, 1), name='advantage')

    if network_builder is None:
        network_builder = NeuralNetwork.layered_network(self.config.network_layers)

    if self.config.tf_scope is None:
        scope = ''
    else:
        scope = self.config.tf_scope + '-'

    self.network = NeuralNetwork(network_builder, inputs=[self.state], episode_length=self.episode_length,
                                 scope=scope + 'value_function')
    self.internal_states = self.network.internal_state_inits

    # From an API perspective, continuous vs discrete might be easier than
    # requiring to set the concrete policy, at least currently
    if self.continuous:
        self.policy = GaussianPolicy(self.network, self.session, self.state, self.random, self.action_count,
                                     'gaussian_policy')
        self.prev_action_log_stds = tf.placeholder(tf.float32, (None, None, self.action_count))

        self.prev_dist = dict(policy_output=self.prev_action_means,
                              policy_log_std=self.prev_action_log_stds)
    else:
        self.policy = CategoricalOneHotPolicy(self.network, self.session, self.state, self.random,
                                              self.action_count, 'categorical_policy')
        self.prev_dist = dict(policy_output=self.prev_action_means)

    # Probability distribution used in the current policy
    self.dist = self.policy.get_distribution()

    size = 1
    for dims in self.state_shape:
        size *= dims

    self.baseline_value_function = LinearValueFunction()
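# Configuration keys read by this policy gradient constructor, as a hedged sketch
# (values are illustrative assumptions, not defaults from the library; the
# 'network_layers' entry is a hypothetical layer spec):
pg_config = {
    'continuous': False,             # selects CategoricalOneHotPolicy vs GaussianPolicy
    'batch_size': 1000,
    'max_episode_length': 500,       # capped at batch_size
    'actions': 4,
    'gamma': 0.99,
    'gae': True,                     # generalized advantage estimation
    'gae_lambda': 0.97,
    'normalize_advantage': False,
    'deterministic_mode': False,
    'state_shape': (8,),
    'network_layers': [{'type': 'dense', 'num_outputs': 32}],
    'tf_scope': None                 # optional prefix for the network scope
}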