def test_baseline(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001,
        states=dict(shape=(1,)),
        actions=dict(continuous=True),
        network=layered_network_builder(())
    )
    agent = VPGAgent(config=config)

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]

    discounted_rewards = np.array([
        0.75 + 0.75 ** 4,
        1.0 + 0.75 ** 3,
        0.75 ** 2,
        0.75,
        1.0,
        1.0 + 0.75 ** 2,
        0.75,
        1.0,
        0.0
    ])
    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])

    agent.model.baseline = dict(state=Baseline())
    agent.model.baseline['state'].predict = lambda states: baseline

    result, _ = agent.model.reward_estimation(
        states=dict(state=states),
        rewards=rewards,
        terminals=terminals
    )
    expected = discounted_rewards - baseline

    print(result)
    print(expected)
    self.assertTrue((result == expected).all())
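# Reference sketch (not part of the library): a plain-NumPy computation that
# reproduces the `expected` values above -- discounted returns accumulated
# backwards within each episode, minus the baseline prediction. The helper
# name `discounted_returns` is illustrative only.
import numpy as np

def discounted_returns(rewards, terminals, discount):
    # Accumulate discounted returns backwards, resetting at episode ends.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            running = 0.0
        running = rewards[t] + discount * running
        returns[t] = running
    return returns

# discounted_returns(rewards, terminals, discount=0.75) - baseline
# yields the same values as `expected` in test_baseline.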
def __init__(self, config):
    config.default(PolicyGradientModel.default_config)

    # distribution
    self.distribution = dict()
    for name, action in config.actions:
        if 'distribution' in action:
            kwargs = dict(action)
            self.distribution[name] = Distribution.from_config(
                config=action.distribution,
                kwargs=kwargs
            )
        elif action.continuous:
            if action.min_value is None:
                assert action.max_value is None
                self.distribution[name] = Gaussian(shape=action.shape)
            else:
                assert action.max_value is not None
                self.distribution[name] = Beta(
                    min_value=action.min_value,
                    max_value=action.max_value,
                    shape=action.shape
                )
        else:
            self.distribution[name] = Categorical(
                shape=action.shape,
                num_actions=action.num_actions
            )

    # baseline
    if config.baseline is None:
        self.baseline = None
    else:
        self.baseline = Baseline.from_config(config=config.baseline)

    # advantage estimation
    self.gae_rewards = config.gae_rewards
    self.gae_lambda = config.gae_lambda
    self.normalize_rewards = config.normalize_rewards

    super(PolicyGradientModel, self).__init__(config)
def initialize(self, custom_getter):
    super(PGModel, self).initialize(custom_getter)

    # Baseline
    if self.baseline_spec is None:
        assert self.baseline_mode is None
    elif all(name in self.states_spec for name in self.baseline_spec):
        # Implies AggregatedBaseline.
        assert self.baseline_mode == 'states'
        self.baseline = AggregatedBaseline(baselines=self.baseline_spec)
    else:
        assert self.baseline_mode is not None
        self.baseline = Baseline.from_spec(
            spec=self.baseline_spec,
            kwargs=dict(summary_labels=self.summary_labels)
        )

    # Baseline optimizer
    if self.baseline_optimizer_spec is not None:
        assert self.baseline_mode is not None
        self.baseline_optimizer = Optimizer.from_spec(spec=self.baseline_optimizer_spec)

    # TODO: Baseline internal states !!! (see target_network q_model)

    # Reward estimation
    self.fn_reward_estimation = tf.make_template(
        name_='reward-estimation',
        func_=self.tf_reward_estimation,
        custom_getter_=custom_getter
    )

    # Baseline loss
    self.fn_baseline_loss = tf.make_template(
        name_='baseline-loss',
        func_=self.tf_baseline_loss,
        custom_getter_=custom_getter
    )
def __init__(self, config):
    config.default(PolicyGradientModel.default_config)

    # distribution
    self.distribution = dict()
    for name, action in config.actions:
        if 'distribution' in action:
            if not action.continuous:
                kwargs = dict(num_actions=action.num_actions)
            elif 'min_value' in action:
                kwargs = dict(min_value=action.min_value, max_value=action.max_value)
            else:
                kwargs = dict()
            self.distribution[name] = Distribution.from_config(
                config=action.distribution,
                kwargs=kwargs
            )
        # elif 'min_value' in action:
        #     ...
        elif action.continuous:
            self.distribution[name] = Gaussian()
        else:
            self.distribution[name] = Categorical(num_actions=action.num_actions)

    # baseline
    if config.baseline is None:
        self.baseline = None
    else:
        self.baseline = Baseline.from_config(config=config.baseline)

    super(PolicyGradientModel, self).__init__(config)

    # advantage estimation
    self.generalized_advantage_estimation = config.generalized_advantage_estimation
    if self.generalized_advantage_estimation:
        self.gae_lambda = config.gae_lambda
    self.normalize_advantage = config.normalize_advantage
def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
    """
    Aggregated baseline.

    Args:
        baselines: Dict of per-state baseline specification dicts
    """
    self.baselines = dict()
    for name in sorted(baselines):
        self.baselines[name] = Baseline.from_spec(
            spec=baselines[name],
            kwargs=dict(summary_labels=summary_labels)
        )

    self.linear = Linear(size=1, bias=0.0, scope='prediction', summary_labels=summary_labels)

    super(AggregatedBaseline, self).__init__(scope, summary_labels)
def __init__(self, baselines, scope='aggregated-baseline', summary_labels=()):
    """
    Aggregated baseline.

    Args:
        baselines: Dict of per-state baseline specification dicts
    """
    with tf.name_scope(name=scope):
        self.baselines = dict()
        for name, baseline_spec in baselines.items():
            with tf.name_scope(name=(name + '-baseline')):
                self.baselines[name] = Baseline.from_spec(
                    spec=baseline_spec,
                    kwargs=dict(summary_labels=summary_labels)
                )

        self.linear = Linear(size=1, bias=0.0, scope='prediction')

    super(AggregatedBaseline, self).__init__(scope, summary_labels)
def __init__(self, states_spec, actions_spec, network_spec, config):
    # Baseline mode
    assert config.baseline_mode is None or config.baseline_mode in ('states', 'network')
    self.baseline_mode = config.baseline_mode

    with tf.name_scope(name=config.scope):
        # Baseline
        if config.baseline is None:
            assert self.baseline_mode is None
            self.baseline = None
        elif all(name in states_spec for name in config.baseline):
            # Implies AggregatedBaseline
            assert self.baseline_mode == 'states'
            self.baseline = AggregatedBaseline(baselines=config.baseline)
        else:
            assert self.baseline_mode is not None
            self.baseline = Baseline.from_spec(
                spec=config.baseline,
                kwargs=dict(summary_labels=config.summary_labels)
            )

        # Baseline optimizer
        if config.baseline_optimizer is None:
            self.baseline_optimizer = None
        else:
            assert self.baseline_mode is not None
            self.baseline_optimizer = Optimizer.from_spec(spec=config.baseline_optimizer)

    # Generalized advantage function
    assert config.gae_lambda is None or (0.0 <= config.gae_lambda <= 1.0 and self.baseline_mode is not None)
    self.gae_lambda = config.gae_lambda

    super(PGModel, self).__init__(
        states_spec=states_spec,
        actions_spec=actions_spec,
        network_spec=network_spec,
        config=config
    )
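# Illustrative only (assumption, not taken from this snippet): example spec values
# that satisfy the assertions in the PGModel constructor above. The 'mlp' baseline
# type with its 'sizes' kwarg and the 'adam' optimizer spec are assumed registered
# names; substitute whatever specs the library actually provides.
baseline_mode = 'states'                                    # or 'network', or None
baseline = dict(type='mlp', sizes=[32, 32])                 # single baseline spec
baseline_optimizer = dict(type='adam', learning_rate=1e-3)  # optional separate optimizer
gae_lambda = 0.97                                           # requires a baseline_mode, otherwise must be None
# A dict keyed by state names instead (e.g. dict(state1=..., state2=...)) would
# take the AggregatedBaseline branch when baseline_mode == 'states'.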
def test_gae(self):
    config = Configuration(
        discount=0.75,
        batch_size=8,
        learning_rate=0.001,
        gae_rewards=True,
        gae_lambda=0.5,
        states=dict(shape=(1,)),
        actions=dict(continuous=True),
        network=layered_network_builder(())
    )
    agent = VPGAgent(config=config)

    states = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    rewards = [0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]
    terminals = [False, False, False, False, True, False, False, False, True]

    baseline = np.array([0.25, 0.5, 0.0, 0.25, 0.5, 0.5, 0.25, 0.5, 0.0])
    agent.model.baseline = dict(state=Baseline())
    agent.model.baseline['state'].predict = lambda states: baseline

    td_residuals = np.array([
        0.75 * 0.5 - 0.25,
        1.0 - 0.5,
        0.75 * 0.25,
        0.75 * 0.5 - 0.25,
        1.0,
        1.0 + 0.75 * 0.25 - 0.5,
        0.75 * 0.5 - 0.25,
        1.0 - 0.5,
        0.0
    ])

    result, _ = agent.model.reward_estimation(
        states=dict(state=states),
        rewards=rewards,
        terminals=terminals
    )
    expected = np.array([
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3, 4])) * td_residuals[:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[1:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[2:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[3:5]),
        np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[4:5]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2, 3])) * td_residuals[5:]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1, 2])) * td_residuals[6:]),
        np.sum(((0.5 * 0.75) ** np.array([0, 1])) * td_residuals[7:]),
        np.sum(((0.5 * 0.75) ** np.array([0])) * td_residuals[8:])
    ])
    self.assertTrue((result == expected).all())
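# Reference sketch (not part of the library): a plain-NumPy GAE computation that
# reproduces `expected` above. TD residuals delta_t = r_t + discount * V(s_{t+1}) - V(s_t),
# except at terminal steps where the residual is just the reward (matching the test's
# td_residuals), are discounted backwards by (discount * gae_lambda) within each
# episode. The helper name `gae_advantages` is illustrative only.
import numpy as np

def gae_advantages(rewards, terminals, values, discount, gae_lambda):
    advantages = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if terminals[t]:
            delta = rewards[t]  # no bootstrap and no baseline term at episode end
            running = 0.0       # reset the accumulator across episode boundaries
        else:
            delta = rewards[t] + discount * values[t + 1] - values[t]
        running = delta + discount * gae_lambda * running
        advantages[t] = running
    return advantages

# gae_advantages(rewards, terminals, baseline, discount=0.75, gae_lambda=0.5)
# yields the same values as `expected` in test_gae.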
def initialize(self, custom_getter):
    super(PGModel, self).initialize(custom_getter)

    # Baseline
    if self.baseline is None:
        assert self.baseline_mode is None
        self.baseline = None
    elif all(name in self.states_spec for name in self.baseline):
        # Implies AggregatedBaseline.
        assert self.baseline_mode == 'states'
        self.baseline = AggregatedBaseline(baselines=self.baseline)
    else:
        assert self.baseline_mode is not None
        self.baseline = Baseline.from_spec(
            spec=self.baseline,
            kwargs=dict(summary_labels=self.summary_labels)
        )

    # Baseline optimizer
    if self.baseline_optimizer is None:
        self.baseline_optimizer = None
    else:
        assert self.baseline_mode is not None
        self.baseline_optimizer = Optimizer.from_spec(spec=self.baseline_optimizer)

    # TODO: Baseline internal states !!! (see target_network q_model)

    # Reward estimation
    self.fn_reward_estimation = tf.make_template(
        name_=(self.scope + '/reward-estimation'),
        func_=self.tf_reward_estimation,
        custom_getter_=custom_getter
    )

    # PG loss per instance function
    self.fn_pg_loss_per_instance = tf.make_template(
        name_=(self.scope + '/pg-loss-per-instance'),
        func_=self.tf_pg_loss_per_instance,
        custom_getter_=custom_getter
    )