Example #1
    def __init__(
        self,
        # Model
        name, device, parallel_interactions, buffer_observe, seed, execution, saver, summarizer,
        config, states, actions, preprocessing, exploration, variable_noise,
        l2_regularization,
        # TensorforceModel
        policy, memory, update, optimizer, objective, reward_estimation, baseline_policy,
        baseline_optimizer, baseline_objective, entropy_regularization, max_episode_timesteps
    ):
        preprocessed_states = OrderedDict(states)
        for state_name, state_spec in states.items():
            if preprocessing is None:
                layers = None
            elif state_name in preprocessing:
                layers = preprocessing[state_name]
            elif state_spec['type'] in preprocessing:
                layers = preprocessing[state_spec['type']]
            else:
                layers = None
            if layers is not None:
                preprocessed_states[state_name] = Preprocessor.output_spec(
                    input_spec=state_spec, layers=layers
                )

        # Policy internals specification
        policy_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
            name='policy', module=policy, modules=policy_modules, states_spec=preprocessed_states,
            actions_spec=actions
        )
        if first_arg is None:
            internals = policy_cls.internals_spec(name='policy', **kwargs)
        else:
            internals = policy_cls.internals_spec(first_arg, name='policy', **kwargs)
        if any(internal.startswith('baseline-') for internal in internals):
            raise TensorforceError.value(
                name='model', argument='internals', value=list(internals),
                hint='starts with baseline-'
            )

        # Baseline internals specification
        if baseline_policy is not None:
            baseline_cls, first_arg, kwargs = Module.get_module_class_and_kwargs(
                name='baseline', module=baseline_policy, modules=policy_modules,
                states_spec=preprocessed_states, actions_spec=actions
            )
            if first_arg is None:
                baseline_internals = baseline_cls.internals_spec(name='baseline', **kwargs)
            else:
                baseline_internals = baseline_cls.internals_spec(
                    first_arg, name='baseline', **kwargs
                )
            for internal, spec in baseline_internals.items():
                if internal in internals:
                    raise TensorforceError.collision(
                        name='model', value='internals', group1='policy', group2='baseline'
                    )
                internals[internal] = spec

        super().__init__(
            # Model
            name=name, device=device, parallel_interactions=parallel_interactions,
            buffer_observe=buffer_observe, seed=seed, execution=execution, saver=saver,
            summarizer=summarizer, config=config, states=states, internals=internals,
            actions=actions, preprocessing=preprocessing, exploration=exploration,
            variable_noise=variable_noise, l2_regularization=l2_regularization
        )

        # Policy
        self.policy = self.add_module(
            name='policy', module=policy, modules=policy_modules, states_spec=self.states_spec,
            actions_spec=self.actions_spec
        )

        # Update mode
        if not all(key in ('batch_size', 'frequency', 'start', 'unit') for key in update):
            raise TensorforceError.value(
                name='agent', argument='update', value=list(update),
                hint='not from {batch_size,frequency,start,unit}'
            )
        # update: unit
        elif 'unit' not in update:
            raise TensorforceError.required(name='agent', argument='update[unit]')
        elif update['unit'] not in ('timesteps', 'episodes'):
            raise TensorforceError.value(
                name='agent', argument='update[unit]', value=update['unit'],
                hint='not in {timesteps,episodes}'
            )
        # update: batch_size
        elif 'batch_size' not in update:
            raise TensorforceError.required(name='agent', argument='update[batch_size]')

        self.update_unit = update['unit']
        self.update_batch_size = self.add_module(
            name='update-batch-size', module=update['batch_size'], modules=parameter_modules,
            is_trainable=False, dtype='long', min_value=1
        )
        if 'frequency' in update and update['frequency'] == 'never':
            self.update_frequency = None
        else:
            self.update_frequency = self.add_module(
                name='update-frequency', module=update.get('frequency', update['batch_size']),
                modules=parameter_modules, is_trainable=False, dtype='long', min_value=1,
                max_value=max(2, self.update_batch_size.max_value())
            )
            self.update_start = self.add_module(
                name='update-start', module=update.get('start', 0), modules=parameter_modules,
                is_trainable=False, dtype='long', min_value=0
            )

        # Optimizer
        self.optimizer = self.add_module(
            name='optimizer', module=optimizer, modules=optimizer_modules, is_trainable=False
        )

        # Objective
        self.objective = self.add_module(
            name='objective', module=objective, modules=objective_modules, is_trainable=False
        )

        # Baseline optimization overview:
        # Policy    Objective   Optimizer   Config
        #   n         n           n           estimate_horizon=False
        #   n         n           f           invalid!!!
        #   n         n           y           invalid!!!
        #   n         y           n           bl trainable, weighted 1.0
        #   n         y           f           bl trainable, weighted
        #   n         y           y           separate, use main policy
        #   y         n           n           bl trainable, estimate_advantage=True, equal horizon
        #   y         n           f           invalid!!!
        #   y         n           y           separate, use main objective
        #   y         y           n           bl trainable, weighted 1.0, equal horizon
        #   y         y           f           bl trainable, weighted, equal horizon
        #   y         y           y           separate

        # Baseline objective
        if baseline_objective is None:
            self.baseline_objective = None
        else:
            self.baseline_objective = self.add_module(
                name='baseline-objective', module=baseline_objective, modules=objective_modules,
                is_trainable=False, is_subscope=True
            )

        # Baseline optimizer
        if baseline_optimizer is None:
            self.baseline_optimizer = None
            if self.baseline_objective is None:
                self.baseline_loss_weight = None
            else:
                self.baseline_loss_weight = 1.0
        elif isinstance(baseline_optimizer, float):
            assert self.baseline_objective is not None
            self.baseline_optimizer = None
            self.baseline_loss_weight = baseline_optimizer
        else:
            assert self.baseline_objective is not None or baseline_policy is not None
            self.baseline_optimizer = self.add_module(
                name='baseline-optimizer', module=baseline_optimizer, modules=optimizer_modules,
                is_trainable=False, is_subscope=True
            )
            self.baseline_loss_weight = None

        # Baseline
        if (baseline_policy is not None or self.baseline_objective is not None) and \
                self.baseline_optimizer is None:
            # since otherwise not part of training
            assert self.baseline_objective is not None or \
                reward_estimation.get('estimate_advantage', True)
            is_trainable = True
        else:
            is_trainable = False
        if baseline_policy is None:
            self.baseline_policy = self.policy
            self.separate_baseline_policy = False
        else:
            self.baseline_policy = self.add_module(
                name='baseline', module=baseline_policy, modules=policy_modules,
                is_trainable=is_trainable, is_subscope=True, states_spec=self.states_spec,
                actions_spec=self.actions_spec
            )
            self.separate_baseline_policy = True

        # Estimator
        if not all(key in (
            'discount', 'estimate_actions', 'estimate_advantage', 'estimate_horizon',
            'estimate_terminal', 'horizon'
        ) for key in reward_estimation):
            raise TensorforceError.value(
                name='agent', argument='reward_estimation', value=reward_estimation,
                hint='not from {discount,estimate_actions,estimate_advantage,estimate_horizon,'
                     'estimate_terminal,horizon}'
            )
        if not self.separate_baseline_policy and self.baseline_optimizer is None and \
                self.baseline_objective is None:
            estimate_horizon = False
        else:
            estimate_horizon = 'late'
        if self.separate_baseline_policy and self.baseline_objective is None and \
                self.baseline_optimizer is None:
            estimate_advantage = True
        else:
            estimate_advantage = False
        self.estimator = self.add_module(
            name='estimator', module=Estimator, is_trainable=False, is_saved=False,
            values_spec=self.values_spec, horizon=reward_estimation['horizon'],
            discount=reward_estimation.get('discount', 1.0),
            estimate_horizon=reward_estimation.get('estimate_horizon', estimate_horizon),
            estimate_actions=reward_estimation.get('estimate_actions', False),
            estimate_terminal=reward_estimation.get('estimate_terminal', False),
            estimate_advantage=reward_estimation.get('estimate_advantage', estimate_advantage),
            # capacity=reward_estimation['capacity']
            min_capacity=self.buffer_observe,
            max_past_horizon=self.baseline_policy.max_past_horizon(is_optimization=False)
        )

        # Memory
        if self.update_unit == 'timesteps':
            policy_horizon = self.policy.max_past_horizon(is_optimization=True)
            baseline_horizon = self.baseline_policy.max_past_horizon(is_optimization=True) - \
                self.estimator.min_future_horizon()
            min_capacity = self.update_batch_size.max_value() + 1 + \
                self.estimator.max_future_horizon() + max(policy_horizon, baseline_horizon)
        elif self.update_unit == 'episodes':
            if max_episode_timesteps is None:
                min_capacity = 0
            else:
                min_capacity = (self.update_batch_size.max_value() + 1) * max_episode_timesteps
        else:
            assert False
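        # Worked example of the capacity bound above (hypothetical numbers, not from the source):
        # with update = dict(unit='timesteps', batch_size=10), max_past_horizon(...) == 0 for both
        # policy and baseline, min_future_horizon() == 0 and max_future_horizon() == 5, the memory
        # needs min_capacity = 10 + 1 + 5 + max(0, 0 - 0) = 16 timesteps.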

        self.memory = self.add_module(
            name='memory', module=memory, modules=memory_modules, is_trainable=False,
            values_spec=self.values_spec, min_capacity=min_capacity
        )

        # Entropy regularization
        entropy_regularization = 0.0 if entropy_regularization is None else entropy_regularization
        self.entropy_regularization = self.add_module(
            name='entropy-regularization', module=entropy_regularization,
            modules=parameter_modules, is_trainable=False, dtype='float', min_value=0.0
        )

        # Internals initialization
        self.internals_init.update(self.policy.internals_init())
        self.internals_init.update(self.baseline_policy.internals_init())
        if any(internal_init is None for internal_init in self.internals_init.values()):
            raise TensorforceError.required(name='model', argument='internals_init')

        # Register global tensors
        Module.register_tensor(name='update', spec=dict(type='long', shape=()), batched=False)
        Module.register_tensor(
            name='dependency_starts', spec=dict(type='long', shape=()), batched=True
        )
        Module.register_tensor(
            name='dependency_lengths', spec=dict(type='long', shape=()), batched=True
        )
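
To make the constructor arguments above concrete, here is a minimal sketch of update and reward_estimation dictionaries shaped the way the validation code expects; the key names come straight from the checks above, while the numeric values are purely illustrative assumptions.

    # Hypothetical arguments for the constructor above (values are illustrative only)
    update = dict(
        unit='timesteps',  # must be 'timesteps' or 'episodes'
        batch_size=10,     # required
        frequency=10,      # optional; defaults to batch_size, and 'never' disables periodic updates
        start=0            # optional; defaults to 0
    )
    reward_estimation = dict(
        horizon=20,               # required
        discount=0.99,            # optional; defaults to 1.0
        estimate_terminal=False,  # optional; defaults to False
        estimate_actions=False,   # optional; defaults to False
        estimate_advantage=False  # optional; when omitted, derived from the baseline configuration
    )
    # 'estimate_horizon' is also accepted; when omitted, the code above derives False or 'late'.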
Example #2
    def __init__(
        # Environment
        self, states, actions, max_episode_timesteps=None,
        # TensorFlow etc
        parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
    ):
        assert hasattr(self, 'spec')

        if seed is not None:
            assert isinstance(seed, int)
            random.seed(a=seed)
            np.random.seed(seed=seed)

        # States/actions specification
        self.states_spec = util.valid_values_spec(
            values_spec=states, value_type='state', return_normalized=True
        )
        self.actions_spec = util.valid_values_spec(
            values_spec=actions, value_type='action', return_normalized=True
        )
        self.max_episode_timesteps = max_episode_timesteps

        # Check for name overlap
        for name in self.states_spec:
            if name in self.actions_spec:
                raise TensorforceError.collision(
                    name='name', value=name, group1='states', group2='actions'
                )

        # Parallel episodes
        if isinstance(parallel_interactions, int):
            if parallel_interactions <= 0:
                raise TensorforceError.value(
                    name='parallel_interactions', value=parallel_interactions
                )
            self.parallel_interactions = parallel_interactions
        else:
            raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

        # Buffer observe
        if isinstance(buffer_observe, bool):
            if not buffer_observe and self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if self.max_episode_timesteps is None and self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if not buffer_observe:
                self.buffer_observe = 1
            elif self.max_episode_timesteps is None:
                self.buffer_observe = 100
            else:
                self.buffer_observe = self.max_episode_timesteps
        elif isinstance(buffer_observe, int):
            if buffer_observe <= 0:
                raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
            if self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if self.max_episode_timesteps is None:
                self.buffer_observe = buffer_observe
            else:
                self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
        else:
            raise TensorforceError.type(name='buffer_observe', value=buffer_observe)
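        # Illustrative outcomes of the resolution above (hypothetical inputs, parallel_interactions == 1):
        #   buffer_observe=True,  max_episode_timesteps=500  -> self.buffer_observe = 500
        #   buffer_observe=True,  max_episode_timesteps=None -> self.buffer_observe = 100
        #   buffer_observe=1000,  max_episode_timesteps=500  -> self.buffer_observe = 500
        #   buffer_observe=False, max_episode_timesteps=500  -> self.buffer_observe = 1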

        # Parallel terminal/reward buffers
        self.terminal_buffers = np.ndarray(
            shape=(self.parallel_interactions, self.buffer_observe),
            dtype=util.np_dtype(dtype='long')
        )
        self.reward_buffers = np.ndarray(
            shape=(self.parallel_interactions, self.buffer_observe),
            dtype=util.np_dtype(dtype='float')
        )

        # Parallel buffer indices
        self.buffer_indices = np.zeros(
            shape=(self.parallel_interactions,), dtype=util.np_dtype(dtype='int')
        )

        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Recorder
        if recorder is None:
            pass
        elif not all(key in ('directory', 'frequency', 'max-traces') for key in recorder):
            raise TensorforceError.value(name='recorder', value=list(recorder))
        self.recorder_spec = recorder
        if self.recorder_spec is not None:
            self.record_states = OrderedDict(((name, list()) for name in self.states_spec))
            for name, spec in self.actions_spec.items():
                if spec['type'] == 'int':
                    self.record_states[name + '_mask'] = list()
            self.record_actions = OrderedDict(((name, list()) for name in self.actions_spec))
            self.record_terminal = list()
            self.record_reward = list()
            self.num_episodes = 0
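
When given, the recorder argument in the constructor above must be a dictionary restricted to the keys checked there ('directory', 'frequency', 'max-traces'). A minimal sketch of such a specification follows; the directory name and numbers are hypothetical, and the per-key semantics are inferred rather than stated in this excerpt.

    # Hypothetical recorder specification (keys taken from the validation above)
    recorder = {
        'directory': 'traces',  # hypothetical output directory for recorded traces
        'frequency': 1,         # presumably: record every episode
        'max-traces': 100,      # presumably: cap on the number of stored traces
    }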
Example #3
    def __init__(
        # Environment
        self, states, actions, max_episode_timesteps=None,
        # TensorFlow etc
        parallel_interactions=1, buffer_observe=True, seed=None, recorder=None
    ):
        assert hasattr(self, 'spec')

        if seed is not None:
            assert isinstance(seed, int)
            random.seed(a=seed)
            np.random.seed(seed=seed)

        # States/actions specification
        self.states_spec = util.valid_values_spec(
            values_spec=states, value_type='state', return_normalized=True
        )
        self.actions_spec = util.valid_values_spec(
            values_spec=actions, value_type='action', return_normalized=True
        )
        self.max_episode_timesteps = max_episode_timesteps

        # Check for name overlap
        for name in self.states_spec:
            if name in self.actions_spec:
                raise TensorforceError.collision(
                    name='name', value=name, group1='states', group2='actions'
                )

        # Parallel episodes
        if isinstance(parallel_interactions, int):
            if parallel_interactions <= 0:
                raise TensorforceError.value(
                    name='parallel_interactions', value=parallel_interactions
                )
            self.parallel_interactions = parallel_interactions
        else:
            raise TensorforceError.type(name='parallel_interactions', value=parallel_interactions)

        # Buffer observe
        if isinstance(buffer_observe, bool):
            if not buffer_observe and self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if self.max_episode_timesteps is None and self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if not buffer_observe:
                self.buffer_observe = 1
            elif self.max_episode_timesteps is None:
                self.buffer_observe = 100
            else:
                self.buffer_observe = self.max_episode_timesteps
        elif isinstance(buffer_observe, int):
            if buffer_observe <= 0:
                raise TensorforceError.value(name='buffer_observe', value=buffer_observe)
            if self.parallel_interactions > 1:
                raise TensorforceError.unexpected()
            if self.max_episode_timesteps is None:
                self.buffer_observe = buffer_observe
            else:
                self.buffer_observe = min(buffer_observe, self.max_episode_timesteps)
        else:
            raise TensorforceError.type(name='buffer_observe', value=buffer_observe)

        # Recorder
        if recorder is None:
            pass
        elif not all(key in ('directory', 'frequency', 'max-traces', 'start') for key in recorder):
            raise TensorforceError.value(name='recorder', value=list(recorder))
        self.recorder_spec = recorder if recorder is None else dict(recorder)

        self.is_initialized = False
    def __init__(self,
                 states,
                 actions,
                 parallel_interactions=1,
                 buffer_observe=1000,
                 seed=None):
        """
        Agent constructor.

        Args:
            states (specification): States specification, arbitrarily nested dictionary of state
                descriptions with the following attributes:
                - type ('bool' | 'int' | 'float'): state data type (default: 'float').
                - shape (int | iter[int]): state shape (required).
                - num_states (int > 0): number of discrete state values (required for type 'int').
                - min_value/max_value (float): minimum/maximum state value (optional for type
                'float').
            actions (specification): Actions specification, arbitrarily nested dictionary of action
                descriptions with the following attributes:
                - type ('bool' | 'int' | 'float'): action data type (required).
                - shape (int > 0 | iter[int > 0]): action shape (default: []).
                - num_actions (int > 0): number of discrete action values (required for type
                'int').
                - min_value/max_value (float): minimum/maximum action value (optional for type
                'float').
            parallel_interactions (int > 0): Maximum number of parallel interactions to support,
                for instance, to enable multiple parallel episodes, environments or (centrally
                controlled) agents within an environment.
            buffer_observe (int > 0): Maximum number of timesteps within an episode to buffer
                before executing internal observe operations, to reduce calls to TensorFlow for
                improved performance.
        """
        if seed is not None:
            assert isinstance(seed, int)
            random.seed(a=seed)
            np.random.seed(seed=seed)
            tf.random.set_random_seed(seed=seed)

        # States/actions specification
        self.states_spec = util.valid_values_spec(values_spec=states,
                                                  value_type='state',
                                                  return_normalized=True)
        self.actions_spec = util.valid_values_spec(values_spec=actions,
                                                   value_type='action',
                                                   return_normalized=True)

        # Check for name overlap
        for name in self.states_spec:
            if name in self.actions_spec:
                raise TensorforceError.collision(name='name',
                                                 value=name,
                                                 group1='states',
                                                 group2='actions')

        # Parallel episodes
        if isinstance(parallel_interactions, int):
            if parallel_interactions <= 0:
                raise TensorforceError.value(name='parallel_interactions',
                                             value=parallel_interactions)
            self.parallel_interactions = parallel_interactions
        else:
            raise TensorforceError.type(name='parallel_interactions',
                                        value=parallel_interactions)

        # Buffer observe
        if isinstance(buffer_observe, bool):
            # if update_mode['unit'] == 'episodes':
            #     self.buffer_observe = 1000 if buffer_observe else 1
            # else:
            #     self.buffer_observe = update_mode['batch_size']
            self.buffer_observe = 1000 if buffer_observe else 1
        elif isinstance(buffer_observe, int):
            if buffer_observe <= 0:
                raise TensorforceError.value(name='buffer_observe',
                                             value=buffer_observe)
            self.buffer_observe = buffer_observe
        else:
            raise TensorforceError.type(name='buffer_observe',
                                        value=buffer_observe)

        # Parallel terminal/reward buffers
        self.terminal_buffers = np.ndarray(shape=(self.parallel_interactions,
                                                  self.buffer_observe),
                                           dtype=util.np_dtype(dtype='bool'))
        self.reward_buffers = np.ndarray(shape=(self.parallel_interactions,
                                                self.buffer_observe),
                                         dtype=util.np_dtype(dtype='float'))

        # Parallel buffer indices
        self.buffer_indices = np.zeros(shape=(self.parallel_interactions, ),
                                       dtype=util.np_dtype(dtype='int'))

        self.timestep = 0
        self.episode = 0
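
Following the attribute lists in the docstring above, here is a minimal sketch of matching states and actions specifications; the shapes, bounds and value counts are illustrative assumptions, not defaults from the source.

    # Hypothetical specifications shaped as described in the docstring above
    states = dict(type='float', shape=(8,), min_value=-1.0, max_value=1.0)
    actions = dict(type='int', shape=(), num_actions=4)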