Example #1
    def __init__(self, config, scope, network_builder=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        # self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{"type": "linear", "num_outputs": self.config.actions, "trainable": True}]

        # Input placeholders
        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32, (None, None) + self.state_shape, name="state")
        self.next_states = tf.placeholder(tf.float32, (None, None) + self.state_shape,
                                          name="next_states")
        self.terminals = tf.placeholder(tf.float32, (None, None), name='terminals')
        self.rewards = tf.placeholder(tf.float32, (None, None), name='rewards')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(self.config.network_layers + output_layer_config)

        self.training_network = NeuralNetwork(network_builder, [self.state], episode_length=self.episode_length,
                                              scope=self.scope + 'training')
        self.target_network = NeuralNetwork(network_builder, [self.next_states], episode_length=self.episode_length,
                                            scope=self.scope + 'target')

        self.training_internal_states = self.training_network.internal_state_inits
        self.target_internal_states = self.target_network.internal_state_inits

        self.training_output = self.training_network.output
        self.target_output = self.target_network.output

        # Create training operations
        self.create_training_operations()

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs', graph=tf.get_default_graph())
        self.session.run(self.init_op)
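The constructor above reads everything from self.config. As a rough sketch, a configuration carrying the keys it accesses might look as follows; the key names mirror the attributes read above, while the concrete values, the layer specification and the commented-out instantiation are placeholders rather than anything taken from the library:

# Hypothetical configuration for the DQN constructor above.
dqn_config = {
    'actions': 4,                 # number of discrete actions (self.config.actions)
    'state_shape': (84, 84, 4),   # shape of a single state tensor
    'network_layers': [           # hidden layers; the linear output layer is appended above
        {'type': 'dense', 'num_outputs': 64},
    ],
    'tau': 0.001,                 # target network update coefficient
    'gamma': 0.99,                # discount factor
    'double_dqn': False,
    'clip_gradients': True,
    'clip_value': 10.0,
    'deterministic_mode': False,
}

# model = DQNModel(dqn_config, scope='dqn')  # assumes DQNModel is importable in your setup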
Example #2
    def __init__(self, config, scope, define_network=None):
        """
        Training logic for policy gradient models.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        :param define_network: Optional network-builder function
        """
        super(PGModel, self).__init__(config, scope)
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.episode = 0
        self.input_feed = None

        self.advantage = tf.placeholder(tf.float32, shape=[None, 1], name='advantage')
        self.policy = None

        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.hidden_layers = NeuralNetwork(define_network, [self.state], scope=scope + 'value_function')

        self.saver = tf.train.Saver()
        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.prev_action_means = tf.placeholder(tf.float32, [None, self.action_count], name='prev_actions')

        # From an API perspective, a continuous-vs-discrete flag might be easier
        # than requiring the concrete policy to be set explicitly, at least for now
        if self.continuous:
            self.policy = GaussianPolicy(self.hidden_layers, self.session, self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(tf.float32, [None, self.action_count])

            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.hidden_layers, self.session, self.state, self.random,
                                                  self.action_count, 'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        # TODO configurable value functions
        self.baseline_value_function = MLPValueFunction(self.session, 100, 64)
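The use_gae and gae_lambda settings above refer to generalized advantage estimation (GAE). As a self-contained illustration of the idea, not the library's actual implementation, the lambda-weighted advantages for one finished episode can be computed like this:

import numpy as np

def generalized_advantage_estimation(rewards, values, gamma, gae_lambda):
    """Generic GAE sketch: values holds one estimate per state plus a final bootstrap value."""
    rewards = np.asarray(rewards, dtype=np.float32)
    values = np.asarray(values, dtype=np.float32)
    advantages = np.zeros_like(rewards)
    gae = 0.0
    # Accumulate discounted temporal-difference errors backwards through the episode.
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * gae_lambda * gae
        advantages[t] = gae
    return advantages

# Three-step episode with a bootstrap value of 0 after the terminal state:
# generalized_advantage_estimation([1.0, 0.0, 1.0], [0.5, 0.4, 0.6, 0.0], gamma=0.99, gae_lambda=0.97)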
Example #3
    def __init__(self,
                 memory_capacity,
                 state_shape,
                 action_shape,
                 state_type=np.float32,
                 action_type=np.int_,
                 reward_type=np.float32,
                 deterministic_mode=False,
                 *args,
                 **kwargs):
        """
        Initializes a replay memory.

        :param memory_capacity: Memory size
        :param state_shape: Shape of state tensor
        :param state_type: Data type of state tensor
        :param action_shape: Shape of action tensor
        :param action_type: Data type of action tensor
        :param reward_type: Data type of reward function
        :param deterministic_mode: If true, random number generation is seeded
        globally so that sampling is reproducible; if false, an unseeded random
        state is used for sampling.
        """

        self.step_count = 0
        self.capacity = int(memory_capacity)
        self.size = 0

        # Explicitly set data types for every tensor to make for easier adjustments
        # if backend precision changes
        self.state_shape = state_shape
        self.state_type = state_type
        self.action_shape = action_shape
        self.action_type = action_type
        self.reward_type = reward_type

        # Pre-allocate buffers for states, actions, rewards and terminals
        self.states = np.zeros([self.capacity] + list(self.state_shape),
                               dtype=self.state_type)
        self.actions = np.zeros([self.capacity] + list(self.action_shape),
                                dtype=self.action_type)
        self.rewards = np.zeros([self.capacity], dtype=self.reward_type)
        self.terminals = np.zeros([self.capacity], dtype=bool)

        if deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        # Index to control sampling
        self.top = 0
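The methods that fill and sample this buffer are not shown in the snippet. A minimal standalone sketch of the same idea, circular insertion through a top index plus uniform sampling, could look like the following; names and signatures are illustrative only, not the library's API:

import numpy as np

class MinimalReplayMemory(object):
    """Toy circular replay buffer mirroring the pre-allocated arrays above."""

    def __init__(self, capacity, state_shape):
        self.capacity = int(capacity)
        self.states = np.zeros((self.capacity,) + tuple(state_shape), dtype=np.float32)
        self.rewards = np.zeros(self.capacity, dtype=np.float32)
        self.terminals = np.zeros(self.capacity, dtype=bool)
        self.size = 0
        self.top = 0  # next write position

    def add(self, state, reward, terminal):
        self.states[self.top] = state
        self.rewards[self.top] = reward
        self.terminals[self.top] = terminal
        self.top = (self.top + 1) % self.capacity  # wrap around once the buffer is full
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size, random_state=np.random):
        # Uniform sampling over the entries stored so far (assumes at least one entry).
        indices = random_state.randint(0, self.size, size=batch_size)
        return self.states[indices], self.rewards[indices], self.terminals[indices]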
Example #4
    def __init__(self, config, scope, define_network=None):
        """
        Training logic for NAFs.

        :param config: Configuration parameters
        """
        super(NAFModel, self).__init__(config, scope)
        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.epsilon = self.config.epsilon
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape), name="state")
        self.next_states = tf.placeholder(tf.float32, self.batch_shape + list(self.config.state_shape),
                                          name="next_states")

        self.actions = tf.placeholder(tf.float32, [None, self.action_count], name='actions')
        self.terminals = tf.placeholder(tf.float32, [None], name='terminals')
        self.rewards = tf.placeholder(tf.float32, [None], name='rewards')
        self.q_targets = tf.placeholder(tf.float32, [None], name='q_targets')
        self.target_network_update = []
        self.episode = 0

        # Get hidden layers from network generator, then add NAF outputs, same for target network
        scope = '' if self.config.tf_scope is None else self.config.tf_scope + '-'

        if define_network is None:
            define_network = NeuralNetwork.layered_network(self.config.network_layers)

        self.training_model = NeuralNetwork(define_network, [self.state], scope=scope + 'training')
        self.target_model = NeuralNetwork(define_network, [self.next_states], scope=scope + 'target')

        # Create output fields
        self.training_v, self.mu, self.advantage, self.q, self.training_output_vars = self.create_outputs(
            self.training_model.get_output(), 'outputs_training')
        self.target_v, _, _, _, self.target_output_vars = self.create_outputs(self.target_model.get_output(),
                                                                              'outputs_target')
        self.create_training_operations()
        self.saver = tf.train.Saver()
        self.session.run(tf.global_variables_initializer())
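create_outputs is not shown here, but in NAF the Q-value is decomposed as Q(s, a) = V(s) + A(s, a) with a quadratic advantage A(s, a) = -1/2 (a - mu(s))^T P(s) (a - mu(s)), where P = L L^T comes from a learned lower-triangular matrix L. A small NumPy illustration of that decomposition, independent of the model code above:

import numpy as np

def naf_q_value(value, mu, lower_triangular, action):
    """Q(s, a) = V(s) - 0.5 * (a - mu)^T L L^T (a - mu), following the NAF decomposition."""
    precision = lower_triangular @ lower_triangular.T  # positive semi-definite matrix P
    diff = action - mu
    advantage = -0.5 * diff @ precision @ diff
    return value + advantage

# Example with a two-dimensional action:
L = np.array([[1.0, 0.0],
              [0.2, 0.5]])
print(naf_q_value(value=1.0, mu=np.zeros(2), lower_triangular=L, action=np.array([0.3, -0.1])))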
Example #5
    def __init__(self, config, scope):
        """
        
        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """
        self.session = tf.Session()
        self.total_states = 0
        self.saver = None
        self.config = create_config(config, default=self.default_config)

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_levels[config.get('loglevel', 'info')])

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.scope = scope

        self.deterministic_mode = config.get('deterministic_mode', False)
        self.episode_length = tf.placeholder(tf.int32, (None, ),
                                             name='episode_length')

        self.learning_rate = config.get('learning_rate', 0.001)

        if self.config.seed is not None:
            self.random = global_seed(self.config.seed)
            tf.set_random_seed(self.config.seed)
        else:
            self.random = np.random.RandomState()

        optimizer = config.get('optimizer')
        if not optimizer:
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        else:
            args = config.get('optimizer_args', [])
            kwargs = config.get('optimizer_kwargs', {})
            optimizer_cls = get_function(optimizer)
            self.optimizer = optimizer_cls(self.learning_rate, *args, **kwargs)

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)
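get_function(optimizer) resolves the optimizer class from the configuration string; its implementation is not part of this snippet. One plausible way to achieve the same effect, offered here only as an assumption about what such a helper does, is a dotted-path import:

import importlib

def resolve_class(dotted_path):
    """Illustrative stand-in for get_function: maps 'package.module.ClassName' to the class object."""
    module_name, _, attribute = dotted_path.rpartition('.')
    module = importlib.import_module(module_name)
    return getattr(module, attribute)

# optimizer_cls = resolve_class('tensorflow.train.RMSPropOptimizer')
# optimizer = optimizer_cls(0.001, momentum=0.95, epsilon=0.01)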
Example #6
    def __init__(self, config, scope, define_network=None):
        """
        Training logic for DQN.

        :param config: Configuration dict
        """
        super(DQNModel, self).__init__(config, scope)

        self.action_count = self.config.actions
        self.tau = self.config.tau
        self.gamma = self.config.gamma
        self.batch_size = self.config.batch_size

        self.double_dqn = self.config.double_dqn

        self.clip_value = None
        if self.config.clip_gradients:
            self.clip_value = self.config.clip_value

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.target_network_update = []

        # output layer
        output_layer_config = [{
            "type": "linear",
            "num_outputs": self.config.actions,
            "trainable": True
        }]

        self.device = self.config.tf_device
        if self.device == 'replica':
            self.device = tf.train.replica_device_setter(
                ps_tasks=1, worker_device=self.config.tf_worker_device)

        with tf.device(self.device):
            # Input placeholders
            self.state = tf.placeholder(tf.float32,
                                        self.batch_shape +
                                        list(self.config.state_shape),
                                        name="state")
            self.next_states = tf.placeholder(tf.float32,
                                              self.batch_shape +
                                              list(self.config.state_shape),
                                              name="next_states")
            self.terminals = tf.placeholder(tf.float32,
                                            self.batch_shape,
                                            name='terminals')
            self.rewards = tf.placeholder(tf.float32,
                                          self.batch_shape,
                                          name='rewards')

            if define_network is None:
                define_network = NeuralNetwork.layered_network(
                    self.config.network_layers + output_layer_config)

            self.training_model = NeuralNetwork(define_network, [self.state],
                                                scope=self.scope + 'training')
            self.target_model = NeuralNetwork(define_network,
                                              [self.next_states],
                                              scope=self.scope + 'target')

            self.training_output = self.training_model.get_output()
            self.target_output = self.target_model.get_output()

            # Create training operations
            self.create_training_operations()
            self.optimizer = tf.train.RMSPropOptimizer(self.alpha,
                                                       momentum=0.95,
                                                       epsilon=0.01)

        self.init_op = tf.global_variables_initializer()

        self.saver = tf.train.Saver()
        self.writer = tf.summary.FileWriter('logs',
                                            graph=tf.get_default_graph())
Example #7
    def __init__(self,
                 config,
                 scope,
                 task_index,
                 cluster_spec,
                 define_network=None):
        """

        A distributed agent must synchronise local and global parameters under different
        scopes.

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        self.session = None
        self.saver = None
        self.config = create_config(config, default=self.default_config)
        self.scope = scope
        self.task_index = task_index
        self.batch_size = self.config.batch_size
        self.action_count = self.config.actions
        self.use_gae = self.config.use_gae
        self.gae_lambda = self.config.gae_lambda

        self.gamma = self.config.gamma
        self.continuous = self.config.continuous
        self.normalize_advantage = self.config.normalise_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        if define_network is None:
            self.define_network = NeuralNetwork.layered_network(
                self.config.network_layers)
        else:
            self.define_network = define_network

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.batch_shape = [None]
        self.deterministic_mode = config.get('deterministic_mode', False)
        self.alpha = config.get('alpha', 0.001)
        self.optimizer = None

        self.worker_device = "/job:worker/task:{}/cpu:0".format(task_index)

        with tf.device(
                tf.train.replica_device_setter(
                    1, worker_device=self.worker_device,
                    cluster=cluster_spec)):
            with tf.variable_scope("global"):
                self.global_state = tf.placeholder(
                    tf.float32,
                    self.batch_shape + list(self.config.state_shape),
                    name="global_state")

                self.global_network = NeuralNetwork(self.define_network,
                                                    [self.global_state])
                self.global_step = tf.get_variable(
                    "global_step", [],
                    tf.int32,
                    initializer=tf.constant_initializer(0, dtype=tf.int32),
                    trainable=False)

                self.global_prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                if self.continuous:
                    self.global_policy = GaussianPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'gaussian_policy')
                    self.global_prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means,
                        policy_log_std=self.global_prev_action_log_stds)

                else:
                    self.global_policy = CategoricalOneHotPolicy(
                        self.global_network, self.session, self.global_state,
                        self.random, self.action_count, 'categorical_policy')
                    self.global_prev_dist = dict(
                        policy_output=self.global_prev_action_means)

                # Baseline value function used for advantage estimation
                self.global_baseline_value_function = LinearValueFunction()

            # self.optimizer = config.get('optimizer')
            # self.optimizer_args = config.get('optimizer_args', [])
            # self.optimizer_kwargs = config.get('optimizer_kwargs', {})

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)

        self.create_training_operations()
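The constructor above expects a cluster_spec describing parameter servers and workers. A minimal sketch of how such a spec could be assembled with TensorFlow 1.x follows; the host addresses are placeholders, and the class name in the commented-out line is assumed since the snippet does not show it:

import tensorflow as tf

# Placeholder addresses: one parameter server and two workers.
cluster_spec = tf.train.ClusterSpec({
    'ps': ['localhost:2222'],
    'worker': ['localhost:2223', 'localhost:2224'],
})

# Each worker process would start its own server and build the model, e.g.:
# server = tf.train.Server(cluster_spec, job_name='worker', task_index=0)
# model = DistributedPGModel(config, scope='pg', task_index=0, cluster_spec=cluster_spec)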
Example #8
    def __init__(self, model=None):
        self.model = model
        # Reuse the model's deterministic-mode setting to decide how sampling is seeded
        if self.model.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()
Example #9
    def __init__(self, config, scope, network_builder=None):
        super(PGModel, self).__init__(config, scope)

        self.continuous = self.config.continuous
        self.batch_size = self.config.batch_size
        self.max_episode_length = min(self.config.max_episode_length,
                                      self.batch_size)
        self.action_count = self.config.actions

        # advantage estimation
        self.gamma = self.config.gamma
        self.generalized_advantage_estimation = self.config.gae
        self.gae_lambda = self.config.gae_lambda
        self.normalize_advantage = self.config.normalize_advantage

        if self.config.deterministic_mode:
            self.random = global_seed()
        else:
            self.random = np.random.RandomState()

        self.state_shape = tuple(self.config.state_shape)
        self.state = tf.placeholder(tf.float32,
                                    (None, None) + self.state_shape,
                                    name="state")
        self.actions = tf.placeholder(tf.float32,
                                      (None, None, self.action_count),
                                      name='actions')
        self.prev_action_means = tf.placeholder(
            tf.float32, (None, None, self.action_count), name='prev_actions')
        self.advantage = tf.placeholder(tf.float32,
                                        shape=(None, None, 1),
                                        name='advantage')

        if network_builder is None:
            network_builder = NeuralNetwork.layered_network(
                self.config.network_layers)
        if self.config.tf_scope is None:
            scope = ''
        else:
            scope = self.config.tf_scope + '-'
        self.network = NeuralNetwork(network_builder,
                                     inputs=[self.state],
                                     episode_length=self.episode_length,
                                     scope=scope + 'value_function')
        self.internal_states = self.network.internal_state_inits

        # From an API perspective, a continuous-vs-discrete flag might be easier
        # than requiring the concrete policy to be set explicitly, at least for now
        if self.continuous:
            self.policy = GaussianPolicy(self.network, self.session,
                                         self.state, self.random,
                                         self.action_count, 'gaussian_policy')
            self.prev_action_log_stds = tf.placeholder(
                tf.float32, (None, None, self.action_count))
            self.prev_dist = dict(policy_output=self.prev_action_means,
                                  policy_log_std=self.prev_action_log_stds)

        else:
            self.policy = CategoricalOneHotPolicy(self.network, self.session,
                                                  self.state, self.random,
                                                  self.action_count,
                                                  'categorical_policy')
            self.prev_dist = dict(policy_output=self.prev_action_means)

        # Probability distribution used in the current policy
        self.dist = self.policy.get_distribution()

        size = 1
        for dims in self.state_shape:
            size *= dims
        self.baseline_value_function = LinearValueFunction()
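LinearValueFunction itself is not shown in these snippets. Baselines of this kind are commonly fit by linear regression from flattened state features to empirical returns; the following self-contained sketch illustrates that idea and is not the library's actual class:

import numpy as np

class SimpleLinearBaseline(object):
    """Least-squares baseline: predicts returns from flattened state features plus a bias."""

    def __init__(self):
        self.coefficients = None

    def _features(self, states):
        flat = np.reshape(states, (len(states), -1))
        return np.hstack([flat, np.ones((len(states), 1))])  # append bias column

    def fit(self, states, returns):
        features = self._features(states)
        self.coefficients, _, _, _ = np.linalg.lstsq(features, returns, rcond=None)

    def predict(self, states):
        if self.coefficients is None:
            return np.zeros(len(states))
        return self._features(states).dot(self.coefficients)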