def test_baseline(self):
        config = Configuration(discount=0.75,
                               batch_size=8,
                               learning_rate=0.001,
                               states=dict(shape=(1, )),
                               actions=dict(continuous=True),
                               network=layered_network_builder(()))
        agent = VPGAgent(config=config)

        # Two episodes: steps 0-4 and steps 5-8 (terminals at indices 4 and 8).
        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        # Discounted cumulative returns (discount 0.75), reset at terminals.
        discounted_rewards = np.array([
            0.75 + 0.75**4, 1.0 + 0.75**3, 0.75**2, 0.75, 1.0, 1.0 + 0.75**2,
            0.75, 1.0, 0.0
        ])
        # Fixed baseline values, patched into the model for the single state 'state'.
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        agent.model.baseline = dict(state=Baseline())
        agent.model.baseline['state'].predict = lambda states: baseline

        result, _ = agent.model.reward_estimation(states=dict(state=states),
                                                  rewards=rewards,
                                                  terminals=terminals)
        expected = discounted_rewards - baseline
        print(result)
        print(expected)
        self.assertTrue((result == expected).all())
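
The expected values above are the discounted cumulative returns (discount 0.75), restarted at each episode boundary, minus the patched baseline predictions. A minimal sketch that reproduces them, using a hypothetical helper that is not part of the test or the library:

import numpy as np

def discounted_returns(rewards, terminals, discount):
    # Accumulate backwards, restarting the running return at every terminal step.
    returns = np.zeros(len(rewards))
    cumulative = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            cumulative = 0.0
        cumulative = rewards[t] + discount * cumulative
        returns[t] = cumulative
    return returns

# With the rollout above, discounted_returns(rewards, terminals, 0.75) - baseline
# equals the `expected` array checked by the test.
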
    def __init__(self, config):
        config.default(PolicyGradientModel.default_config)

        # distribution
        self.distribution = dict()
        for name, action in config.actions:
            if 'distribution' in action:
                kwargs = dict(action)
                self.distribution[name] = Distribution.from_config(
                    config=action.distribution, kwargs=kwargs)
            elif action.continuous:
                if action.min_value is None:
                    assert action.max_value is None
                    self.distribution[name] = Gaussian(shape=action.shape)
                else:
                    assert action.max_value is not None
                    self.distribution[name] = Beta(min_value=action.min_value,
                                                   max_value=action.max_value,
                                                   shape=action.shape)
            else:
                self.distribution[name] = Categorical(
                    shape=action.shape, num_actions=action.num_actions)

        # baseline
        if config.baseline is None:
            self.baseline = None
        else:
            self.baseline = Baseline.from_config(config=config.baseline)

        # advantage estimation
        self.gae_rewards = config.gae_rewards
        self.gae_lambda = config.gae_lambda
        self.normalize_rewards = config.normalize_rewards

        super(PolicyGradientModel, self).__init__(config)
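
For reference, a hedged sketch of which distribution each branch above selects. The action names below are hypothetical; the keys (continuous, shape, num_actions, min_value/max_value) mirror the attributes accessed in the loop, assuming absent bounds read as None via the config defaults:

actions = dict(
    force=dict(continuous=True, shape=()),                                 # -> Gaussian
    steer=dict(continuous=True, shape=(), min_value=-1.0, max_value=1.0),  # -> Beta
    gear=dict(continuous=False, shape=(), num_actions=5),                  # -> Categorical
)
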
Example #3
    def initialize(self, custom_getter):
        super(PGModel, self).initialize(custom_getter)

        # Baseline
        if self.baseline_spec is None:
            assert self.baseline_mode is None

        elif all(name in self.states_spec for name in self.baseline_spec):
            # Implies AggregatedBaseline.
            assert self.baseline_mode == 'states'
            self.baseline = AggregatedBaseline(baselines=self.baseline_spec)

        else:
            assert self.baseline_mode is not None
            self.baseline = Baseline.from_spec(
                spec=self.baseline_spec,
                kwargs=dict(summary_labels=self.summary_labels))

        # Baseline optimizer
        if self.baseline_optimizer_spec is not None:
            assert self.baseline_mode is not None
            self.baseline_optimizer = Optimizer.from_spec(
                spec=self.baseline_optimizer_spec)

        # TODO: Baseline internal states !!! (see target_network q_model)

        # Reward estimation
        self.fn_reward_estimation = tf.make_template(
            name_='reward-estimation',
            func_=self.tf_reward_estimation,
            custom_getter_=custom_getter)
        # Baseline loss
        self.fn_baseline_loss = tf.make_template(name_='baseline-loss',
                                                 func_=self.tf_baseline_loss,
                                                 custom_getter_=custom_getter)
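
The two tf.make_template calls wrap tf_reward_estimation and tf_baseline_loss so that repeated calls share the variables created on the first call. A minimal, self-contained illustration of that sharing behavior (TF 1.x; not part of the snippet above):

import tensorflow as tf

def scale(x):
    # Created on the first template call, reused on every later call.
    w = tf.get_variable(name='w', shape=(), initializer=tf.ones_initializer())
    return w * x

scale_template = tf.make_template(name_='scale', func_=scale)
y1 = scale_template(tf.constant(2.0))
y2 = scale_template(tf.constant(3.0))  # reuses the same variable 'w'
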
    def __init__(self, config):
        config.default(PolicyGradientModel.default_config)

        # distribution
        self.distribution = dict()
        for name, action in config.actions:
            if 'distribution' in action:
                if not action.continuous:
                    kwargs = dict(num_actions=action.num_actions)
                elif 'min_value' in action:
                    kwargs = dict(min_value=action.min_value, max_value=action.max_value)
                else:
                    kwargs = dict()
                self.distribution[name] = Distribution.from_config(config=action.distribution, kwargs=kwargs)
            # elif 'min_value' in action:
            #     ...
            elif action.continuous:
                self.distribution[name] = Gaussian()
            else:
                self.distribution[name] = Categorical(num_actions=action.num_actions)

        # baseline
        if config.baseline is None:
            self.baseline = None
        else:
            self.baseline = Baseline.from_config(config=config.baseline)

        super(PolicyGradientModel, self).__init__(config)

        # advantage estimation
        self.generalized_advantage_estimation = config.generalized_advantage_estimation
        if self.generalized_advantage_estimation:
            self.gae_lambda = config.gae_lambda
        self.normalize_advantage = config.normalize_advantage

    def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
        """
        Aggregated baseline.

        Args:
            baselines: Dict of per-state baseline specification dicts
        """

        self.baselines = dict()
        for name in sorted(baselines):
            self.baselines[name] = Baseline.from_spec(
                spec=baselines[name],
                kwargs=dict(summary_labels=summary_labels))

        self.linear = Linear(size=1, bias=0.0, scope='prediction', summary_labels=summary_labels)

        super(AggregatedBaseline, self).__init__(scope, summary_labels)

    def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
        """
        Aggregated baseline.

        Args:
            baselines: Dict of per-state baseline specification dicts
        """

        with tf.name_scope(name=scope):
            self.baselines = dict()
            for name, baseline_spec in baselines.items():
                with tf.name_scope(name=(name + '-baseline')):
                    self.baselines[name] = Baseline.from_spec(
                        spec=baseline_spec,
                        kwargs=dict(summary_labels=summary_labels)
                    )

            self.linear = Linear(size=1, bias=0.0, scope='prediction')

        super(AggregatedBaseline, self).__init__(scope, summary_labels)
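
Both variants build one sub-baseline per named state plus a shared Linear(size=1) output layer. A hedged sketch of how the aggregation could combine the per-state outputs into a single value estimate; this is not the library's actual tf_predict, and it assumes each sub-baseline exposes predict(states=...) returning a 1-D tensor and that the Linear layer is applied via apply(x=...):

import tensorflow as tf

def aggregated_predict(baselines, linear, states):
    # One column per named state ...
    predictions = [
        tf.expand_dims(input=baselines[name].predict(states=states[name]), axis=1)
        for name in sorted(baselines)
    ]
    # ... concatenated and mapped through the shared Linear(size=1) layer
    # to one value per batch example.
    prediction = linear.apply(x=tf.concat(values=predictions, axis=1))
    return tf.squeeze(input=prediction, axis=1)
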
Example #7
    def __init__(self, states_spec, actions_spec, network_spec, config):
        # Baseline mode
        assert config.baseline_mode is None or config.baseline_mode in (
            'states', 'network')
        self.baseline_mode = config.baseline_mode

        with tf.name_scope(name=config.scope):
            # Baseline
            if config.baseline is None:
                assert self.baseline_mode is None
                self.baseline = None

            elif all(name in states_spec for name in config.baseline):
                # Implies AggregatedBaseline
                assert self.baseline_mode == 'states'
                self.baseline = AggregatedBaseline(baselines=config.baseline)

            else:
                assert self.baseline_mode is not None
                self.baseline = Baseline.from_spec(
                    spec=config.baseline,
                    kwargs=dict(summary_labels=config.summary_labels))

            # Baseline optimizer
            if config.baseline_optimizer is None:
                self.baseline_optimizer = None
            else:
                assert self.baseline_mode is not None
                self.baseline_optimizer = Optimizer.from_spec(
                    spec=config.baseline_optimizer)

        # Generalized advantage function
        assert config.gae_lambda is None or (
            0.0 <= config.gae_lambda <= 1.0 and self.baseline_mode is not None)
        self.gae_lambda = config.gae_lambda

        super(PGModel, self).__init__(states_spec=states_spec,
                                      actions_spec=actions_spec,
                                      network_spec=network_spec,
                                      config=config)
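
The assertion above ties GAE to the baseline: gae_lambda must be None, or a value in [0, 1] used together with a baseline_mode. A hypothetical configuration fragment that would satisfy these checks (the 'mlp' baseline and 'adam' optimizer specs are assumptions about available types, not taken from this snippet):

config_fragment = dict(
    baseline_mode='states',
    baseline=dict(type='mlp', sizes=[32, 32]),
    baseline_optimizer=dict(type='adam', learning_rate=1e-3),
    gae_lambda=0.97,
)
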
    def test_gae(self):
        config = Configuration(discount=0.75,
                               batch_size=8,
                               learning_rate=0.001,
                               gae_rewards=True,
                               gae_lambda=0.5,
                               states=dict(shape=(1, )),
                               actions=dict(continuous=True),
                               network=layered_network_builder(()))
        agent = VPGAgent(config=config)

        # Same two-episode rollout as in test_baseline.
        states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
        rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
        terminals = [
            False, False, False, False, True, False, False, False, True
        ]
        # Fixed baseline values, patched into the model for the single state 'state'.
        baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
        agent.model.baseline = dict(state=Baseline())
        agent.model.baseline['state'].predict = lambda states: baseline
        # TD residuals r_t + 0.75 * V(s_{t+1}) - V(s_t); the raw reward at terminal steps.
        td_residuals = np.array([
            0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.75 * 0.25, 0.75 * 0.5 - 0.25, 1.0,
            1.0 + 0.75 * 0.25 - 0.5, 0.75 * 0.5 - 0.25, 1.0 - 0.5, 0.0
        ])

        result, _ = agent.model.reward_estimation(states=dict(state=states),
                                                  rewards=rewards,
                                                  terminals=terminals)
        # GAE advantages: decayed sums of the TD residuals (decay 0.5 * 0.75), per episode.
        expected = np.array([
            np.sum(
                ((0.5 * 0.75)**np.array([0, 1, 2, 3, 4])) * td_residuals[:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2, 3])) * td_residuals[1:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2])) * td_residuals[2:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1])) * td_residuals[3:5]),
            np.sum(((0.5 * 0.75)**np.array([0])) * td_residuals[4:5]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2, 3])) * td_residuals[5:]),
            np.sum(((0.5 * 0.75)**np.array([0, 1, 2])) * td_residuals[6:]),
            np.sum(((0.5 * 0.75)**np.array([0, 1])) * td_residuals[7:]),
            np.sum(((0.5 * 0.75)**np.array([0])) * td_residuals[8:])
        ])
        self.assertTrue((result == expected).all())
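
The expected values are the TD residuals accumulated backwards with decay gae_lambda * discount = 0.5 * 0.75, restarted at each episode boundary. A minimal sketch with a hypothetical helper (not the library's implementation); note the terminal-step convention taken from the test, where the residual at a terminal step is the raw reward:

import numpy as np

def gae_advantages(rewards, terminals, values, discount, gae_lambda):
    # TD residuals; at terminal steps the test uses the raw reward
    # (no bootstrap, no baseline subtraction).
    deltas = np.zeros(len(rewards))
    for t in range(len(rewards)):
        if terminals[t]:
            deltas[t] = rewards[t]
        else:
            deltas[t] = rewards[t] + discount * values[t + 1] - values[t]
    # Decayed backward sums of the residuals, reset at episode boundaries.
    advantages = np.zeros(len(rewards))
    cumulative = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            cumulative = 0.0
        cumulative = deltas[t] + discount * gae_lambda * cumulative
        advantages[t] = cumulative
    return advantages

# gae_advantages(rewards, terminals, baseline, 0.75, 0.5) reproduces `expected`
# (this sketch assumes the batch ends at a terminal step, as it does here).
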
Example #9
    def initialize(self, custom_getter):
        super(PGModel, self).initialize(custom_getter)

        # Baseline
        if self.baseline is None:
            assert self.baseline_mode is None
            self.baseline = None

        elif all(name in self.states_spec for name in self.baseline):
            # Implies AggregatedBaseline.
            assert self.baseline_mode == 'states'
            self.baseline = AggregatedBaseline(baselines=self.baseline)

        else:
            assert self.baseline_mode is not None
            self.baseline = Baseline.from_spec(
                spec=self.baseline,
                kwargs=dict(summary_labels=self.summary_labels))

        # Baseline optimizer
        if self.baseline_optimizer is None:
            self.baseline_optimizer = None
        else:
            assert self.baseline_mode is not None
            self.baseline_optimizer = Optimizer.from_spec(
                spec=self.baseline_optimizer)

        # TODO: Baseline internal states !!! (see target_network q_model)

        # Reward estimation
        self.fn_reward_estimation = tf.make_template(
            name_=(self.scope + '/reward-estimation'),
            func_=self.tf_reward_estimation,
            custom_getter_=custom_getter)
        # PG loss per instance function
        self.fn_pg_loss_per_instance = tf.make_template(
            name_=(self.scope + '/pg-loss-per-instance'),
            func_=self.tf_pg_loss_per_instance,
            custom_getter_=custom_getter)