Example #1
    def __init__(self, config, scope):
        """
        
        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """
        self.session = tf.Session()
        self.total_states = 0
        self.saver = None
        self.config = create_config(config, default=self.default_config)

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(log_levels[config.get('loglevel', 'info')])

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.scope = scope

        self.deterministic_mode = config.get('deterministic_mode', False)
        # Placeholder for the lengths of the episodes in a batch
        self.episode_length = tf.placeholder(tf.int32, (None, ),
                                             name='episode_length')

        self.learning_rate = config.get('learning_rate', 0.001)

        # Seed NumPy and TensorFlow for reproducible runs when a seed is configured
        if self.config.seed is not None:
            self.random = global_seed(self.config.seed)
            tf.set_random_seed(self.config.seed)
        else:
            self.random = np.random.RandomState()

        # Default to Adam; otherwise resolve the configured optimizer class by name
        optimizer = config.get('optimizer')
        if not optimizer:
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        else:
            args = config.get('optimizer_args', [])
            kwargs = config.get('optimizer_kwargs', {})
            optimizer_cls = get_function(optimizer)
            self.optimizer = optimizer_cls(self.learning_rate, *args, **kwargs)

        # Default to constant zero exploration; otherwise look up the configured mode
        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)
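For reference, below is a minimal sketch of a configuration dict that would exercise the optimizer and exploration branches above. The key names mirror the config.get() calls in the constructor; the concrete values, the dotted optimizer path, and the 'epsilon_decay' exploration name are illustrative assumptions, not settings taken from the library.

    # Hypothetical configuration; keys mirror the config.get() calls above,
    # values and mode names are illustrative only.
    example_config = {
        'loglevel': 'debug',
        'deterministic_mode': False,
        'learning_rate': 0.0005,
        'seed': 42,
        # Assumed to be a dotted import path resolvable by get_function()
        'optimizer': 'tensorflow.python.training.rmsprop.RMSPropOptimizer',
        'optimizer_args': [],
        'optimizer_kwargs': {'decay': 0.9},
        # Assumed to be a key in the exploration_mode registry
        'exploration': 'epsilon_decay',
        'exploration_args': [],
        'exploration_kwargs': {'epsilon': 1.0, 'epsilon_final': 0.1},
    }

With such a dict the constructor takes the else branches: get_function() resolves the optimizer class, and the named entry in exploration_mode is instantiated with the extra args and kwargs.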
Example #2
    def __init__(self, config, scope):
        """

        :param config: Configuration parameters
        :param scope: TensorFlow scope
        """

        # TODO move several default params up here
        self.session = tf.Session()
        self.total_states = 0
        self.saver = None
        self.config = create_config(config, default=self.default_config)

        # This is the scope used to prefix variable creation for distributed TensorFlow
        self.scope = scope
        self.batch_shape = [None]

        self.deterministic_mode = config.get('deterministic_mode', False)

        self.alpha = config.get('alpha', 0.001)

        optimizer = config.get('optimizer')
        if not optimizer:
            self.optimizer = tf.train.AdamOptimizer(self.alpha)
        else:
            args = config.get('optimizer_args', [])
            kwargs = config.get('optimizer_kwargs', {})
            optimizer_cls = get_function(optimizer)
            self.optimizer = optimizer_cls(self.alpha, *args, **kwargs)

        exploration = config.get('exploration')
        if not exploration:
            self.exploration = exploration_mode['constant'](self, 0)
        else:
            args = config.get('exploration_args', [])
            kwargs = config.get('exploration_kwargs', {})
            self.exploration = exploration_mode[exploration](self, *args,
                                                             **kwargs)
    def create_training_operations(self):
        """
        Currently a duplicate of the pg agent logic, to be made generic later to allow
        all models to be executed asynchronously/distributed seamlessly.

        """
        # TODO rewrite agent logic so core update logic can be composed into
        # TODO distributed logic

        with tf.device(self.worker_device):
            # Per-worker (local) copy of the network and policy
            with tf.variable_scope("local"):
                self.state = tf.placeholder(tf.float32,
                                            self.batch_shape +
                                            list(self.config.state_shape),
                                            name="state")
                self.prev_action_means = tf.placeholder(
                    tf.float32, [None, self.action_count], name='prev_actions')

                self.local_network = NeuralNetwork(self.define_network,
                                                   [self.state])
                # TODO possibly problematic, check
                self.local_step = self.global_step

                if self.continuous:
                    self.policy = GaussianPolicy(self.local_network,
                                                 self.session, self.state,
                                                 self.random,
                                                 self.action_count,
                                                 'gaussian_policy')
                    self.prev_action_log_stds = tf.placeholder(
                        tf.float32, [None, self.action_count])

                    self.prev_dist = dict(
                        policy_output=self.prev_action_means,
                        policy_log_std=self.prev_action_log_stds)

                else:
                    self.policy = CategoricalOneHotPolicy(
                        self.local_network, self.session, self.state,
                        self.random, self.action_count, 'categorical_policy')
                    self.prev_dist = dict(policy_output=self.prev_action_means)

                # Linear baseline value function used for advantage estimation
                self.baseline_value_function = LinearValueFunction()

            self.actions = tf.placeholder(tf.float32,
                                          [None, self.action_count],
                                          name='actions')
            self.advantage = tf.placeholder(tf.float32,
                                            shape=[None, 1],
                                            name='advantage')

            # Probability distribution used by the current policy
            self.dist = self.policy.get_distribution()
            self.log_probabilities = self.dist.log_prob(
                self.policy.get_policy_variables(), self.actions)

            # Policy gradient loss: negative mean of the action log-likelihoods
            # weighted by their advantages; gradients are computed on this scalar
            self.loss = -tf.reduce_mean(
                self.log_probabilities * self.advantage, name="loss_op")

            self.gradients = tf.gradients(self.loss,
                                          self.local_network.get_variables())

            # Pair gradients computed on the local network with the global
            # (shared) variables they will be applied to
            grad_var_list = list(
                zip(self.gradients, self.global_network.get_variables()))

            # Advance the global step by the number of states in the batch
            global_step_inc = self.global_step.assign_add(
                tf.shape(self.state)[0])

            # Copy the global network parameters into the local copy
            self.assign_global_to_local = tf.group(*[
                v1.assign(v2)
                for v1, v2 in zip(self.local_network.get_variables(),
                                  self.global_network.get_variables())
            ])

            # TODO write summaries
            # self.summary_writer = tf.summary.FileWriter('log' + "_%d" % self.task_index)
            if not self.optimizer:
                self.optimizer = tf.train.AdamOptimizer(self.alpha)

            else:
                optimizer_cls = get_function(self.optimizer)
                self.optimizer = optimizer_cls(self.alpha,
                                               *self.optimizer_args,
                                               **self.optimizer_kwargs)

            self.optimize_op = tf.group(
                self.optimizer.apply_gradients(grad_var_list), global_step_inc)
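To make the core update explicit: the loss above is the score-function (REINFORCE-style) estimator, i.e. the log-likelihood of the actions actually taken, weighted by their advantages and averaged into a scalar, with the resulting gradients applied to a separate variable list. The following self-contained sketch reproduces that pattern for a toy categorical policy under TensorFlow 1.x; the network, shapes, and names are illustrative assumptions and not taken from the classes above.

    import numpy as np
    import tensorflow as tf  # assumes TensorFlow 1.x, as in the examples above

    state_dim, action_count = 4, 3

    state = tf.placeholder(tf.float32, [None, state_dim], name='state')
    actions = tf.placeholder(tf.float32, [None, action_count], name='actions')  # one-hot
    advantage = tf.placeholder(tf.float32, [None, 1], name='advantage')

    # Toy policy network: a single dense layer producing action logits
    logits = tf.layers.dense(state, action_count, name='policy')
    log_probs = tf.nn.log_softmax(logits)

    # Log-likelihood of the actions that were actually taken
    action_log_prob = tf.reduce_sum(log_probs * actions, axis=1, keepdims=True)

    # Score-function loss: maximize advantage-weighted log-likelihood
    loss = -tf.reduce_mean(action_log_prob * advantage, name='loss_op')

    # Compute gradients explicitly, then apply them, mirroring the
    # tf.gradients / apply_gradients split used above
    variables = tf.trainable_variables()
    gradients = tf.gradients(loss, variables)
    optimize_op = tf.train.AdamOptimizer(0.001).apply_gradients(
        list(zip(gradients, variables)))

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        batch = {
            state: np.random.randn(8, state_dim),
            actions: np.eye(action_count)[np.random.randint(action_count, size=8)],
            advantage: np.random.randn(8, 1),
        }
        print(session.run(loss, batch))
        session.run(optimize_op, batch)

In the distributed version above, the only structural difference is that the gradients are taken with respect to the local network but paired with the global network's variables, and assign_global_to_local copies the shared parameters back before the next rollout.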