Example no. 1
    def __init__(self,
                 name,
                 tensors,
                 aggregation='concat',
                 axis=0,
                 input_spec=None,
                 summary_labels=None):
        if not isinstance(tensors, str) and not util.is_iterable(x=tensors):
            raise TensorforceError.type(name='retrieve',
                                        argument='tensors',
                                        dtype=type(tensors))
        elif util.is_iterable(x=tensors) and len(tensors) == 0:
            raise TensorforceError.value(name='retrieve',
                                         argument='tensors',
                                         value=tensors,
                                         hint='zero length')
        if aggregation not in ('concat', 'product', 'stack', 'sum'):
            raise TensorforceError.value(
                name='retrieve',
                argument='aggregation',
                value=aggregation,
                hint='not in {concat,product,stack,sum}')

        self.tensors = (tensors, ) if isinstance(tensors,
                                                 str) else tuple(tensors)
        self.aggregation = aggregation
        self.axis = axis

        super().__init__(name=name,
                         input_spec=input_spec,
                         summary_labels=summary_labels,
                         l2_regularization=0.0)

        self.input_spec = None
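
The constructor above accepts either a single tensor name or a non-empty iterable of names and normalizes it to a tuple. A minimal standalone sketch of that normalization, assuming nothing from Tensorforce (the helper name is illustrative):

def normalize_tensor_names(tensors):
    # Accept a single string or a non-empty iterable of strings.
    if isinstance(tensors, str):
        return (tensors,)
    try:
        names = tuple(tensors)
    except TypeError:
        raise TypeError('tensors must be a string or an iterable of strings')
    if len(names) == 0:
        raise ValueError('tensors must not be empty')
    return names

assert normalize_tensor_names('observation') == ('observation',)
assert normalize_tensor_names(['observation', 'reward']) == ('observation', 'reward')
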
Example no. 2
    def __init__(
        self, *, size, window=3, stride=1, padding='same', dilation=1, bias=True, activation='relu',
        dropout=0.0, initialization_scale=1.0, vars_trainable=True, l2_regularization=None,
        name=None, input_spec=None
    ):
        if isinstance(window, int):
            self.window = (window, window)
        elif util.is_iterable(x=window) and len(window) == 2:
            self.window = tuple(window)
        else:
            raise TensorforceError.type(name='Conv2d', argument='window', dtype=type(window))

        if isinstance(stride, int):
            self.stride = (1, stride, stride, 1)
        elif util.is_iterable(x=stride) and len(stride) == 2:
            self.stride = (1, stride[0], stride[1], 1)
        else:
            raise TensorforceError.type(name='Conv2d', argument='stride', dtype=type(stride))

        self.padding = padding

        if isinstance(dilation, int):
            self.dilation = (1, dilation, dilation, 1)
        elif util.is_iterable(x=dilation) and len(dilation) == 2:
            self.dilation = (1, dilation[0], dilation[1], 1)
        else:
            raise TensorforceError.type(name='Conv2d', argument='dilation', dtype=type(dilation))

        super().__init__(
            name=name, size=size, bias=bias, activation=activation, dropout=dropout,
            vars_trainable=vars_trainable, input_spec=input_spec,
            l2_regularization=l2_regularization
        )

        self.initialization_scale = initialization_scale
Example no. 3
    def __init__(
        self, *, size, window=3, stride=1, padding='same', dilation=1, bias=True, activation='relu',
        dropout=0.0, initialization_scale=1.0, vars_trainable=True, l2_regularization=None,
        name=None, input_spec=None
    ):
        if isinstance(window, int):
            self.window = (window, window)
        elif util.is_iterable(x=window) and len(window) == 2:
            self.window = tuple(window)
        else:
            raise TensorforceError.type(name='Conv2d', argument='window', dtype=type(window))

        if isinstance(stride, int):
            self.stride = (1, stride, stride, 1)
        elif util.is_iterable(x=stride) and len(stride) == 2:
            self.stride = (1, stride[0], stride[1], 1)
        else:
            raise TensorforceError.type(name='Conv2d', argument='stride', dtype=type(stride))

        self.padding = padding

        if isinstance(dilation, int):
            self.dilation = (1, dilation, dilation, 1)
        elif util.is_iterable(x=dilation) and len(dilation) == 2:
            self.dilation = (1, dilation[0], dilation[1], 1)
        else:
            raise TensorforceError.type(name='Conv2d', argument='dilation', dtype=type(dilation))

        super().__init__(
            name=name, size=size, bias=bias, activation=activation, dropout=dropout,
            vars_trainable=vars_trainable, input_spec=input_spec,
            l2_regularization=l2_regularization
        )

        self.initialization_scale = initialization_scale

        self.architecture_kwargs['size'] = str(size)
        self.architecture_kwargs['window'] = str(window)
        self.architecture_kwargs['padding'] = str(padding)
        if stride != 1:
            self.architecture_kwargs['stride'] = str(stride)
        if dilation != 1:
            self.architecture_kwargs['dilation'] = str(dilation)
        self.architecture_kwargs['bias'] = str(bias)
        if activation is not None:
            self.architecture_kwargs['activation'] = str(activation)
        if dropout != 0.0:
            self.architecture_kwargs['dropout'] = str(dropout)
        if initialization_scale != 1.0:
            self.architecture_kwargs['initialization_scale'] = str(initialization_scale)
        if not vars_trainable:
            self.architecture_kwargs['trainable'] = str(vars_trainable)
        if l2_regularization is not None:
            self.architecture_kwargs['l2_regularization'] = str(l2_regularization)
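
The scalar-or-pair handling of window, stride and dilation above follows TensorFlow's NHWC convention, where stride and dilation are passed as 4-tuples with singleton batch and channel entries. A standalone sketch of that expansion (the helper name is hypothetical, not part of Tensorforce):

def to_nhwc_tuple(value, name):
    # Expand an int or a pair of ints into an NHWC-style (1, h, w, 1) tuple.
    if isinstance(value, int):
        return (1, value, value, 1)
    value = tuple(value)
    if len(value) != 2 or not all(isinstance(v, int) for v in value):
        raise TypeError('{} must be an int or a pair of ints'.format(name))
    return (1, value[0], value[1], 1)

assert to_nhwc_tuple(2, 'stride') == (1, 2, 2, 1)
assert to_nhwc_tuple((2, 3), 'dilation') == (1, 2, 3, 1)
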
Example no. 4
    def __init__(self, agent, environments):
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(name='parallel-runner',
                                        argument='environments',
                                        value=environments)
        elif len(environments) == 0:
            raise TensorforceError.value(name='parallel-runner',
                                         argument='environments',
                                         value=environments)

        if not isinstance(agent, Agent):
            agent = Agent.from_spec(spec=agent,
                                    states=environments[0].states(),
                                    actions=environments[0].actions(),
                                    parallel_interactions=len(environments))

        if len(environments) > agent.parallel_interactions:
            raise TensorforceError(message="Too many environments.")

        self.agent = agent
        self.environments = tuple(environments)

        self.agent.initialize()
        self.global_episode = self.agent.episode
        self.global_timestep = self.agent.timestep
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_times = list()
Example no. 5
    def is_summary_logged(self, label):
        # Check whether any summaries are logged
        if self.summary_labels is None:
            return False

        # Check whether not in while loop
        if Module.while_counter > 0:
            return False
        # Check whether not in nested condition
        if Module.cond_counter > 1:
            return False

        # Temporary
        if label == 'variables' or label == 'variables-histogram':
            return False

        # Check whether given label is logged
        if util.is_iterable(x=label):
            assert all(not x.endswith('-histogram') for x in label)
            if self.summary_labels != 'all' and all(x not in self.summary_labels for x in label):
                return False
        else:
            if (self.summary_labels != 'all' or label.endswith('-histogram')) and \
                    label not in self.summary_labels:
                return False

        return True
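
A simplified standalone sketch of the check above, ignoring the while/cond counters and the histogram special case; it keeps only the core rule that a label is logged when summary_labels is 'all' or contains at least one of the requested labels (the function name is illustrative):

def is_logged(label, summary_labels):
    # No labels configured: nothing is logged.
    if summary_labels is None:
        return False
    labels = (label,) if isinstance(label, str) else tuple(label)
    return summary_labels == 'all' or any(x in summary_labels for x in labels)

assert is_logged('loss', summary_labels=('loss', 'reward'))
assert not is_logged('entropy', summary_labels=('loss', 'reward'))
assert is_logged(('entropy', 'loss'), summary_labels='all')
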
Example no. 6
    def __init__(self,
                 *,
                 tensors,
                 l2_regularization=None,
                 name=None,
                 input_spec=None):
        super(Layer, self).__init__(l2_regularization=l2_regularization,
                                    name=name)

        Layer._REGISTERED_LAYERS[self.name] = self

        if isinstance(tensors, str):
            pass
        elif not util.is_iterable(x=tensors):
            raise TensorforceError.type(name='MultiInputLayer',
                                        argument='tensors',
                                        dtype=type(tensors))
        elif len(tensors) == 0:
            raise TensorforceError.value(name='MultiInputLayer',
                                         argument='tensors',
                                         value=tensors,
                                         hint='zero length')

        if isinstance(tensors, str):
            self.tensors = (tensors, )
        else:
            self.tensors = tuple(tensors)

        self.input_spec = self.default_input_spec()
        if not isinstance(self.input_spec, TensorsSpec):
            raise TensorforceError.unexpected()

        self.input_spec = self.input_spec.unify(other=input_spec)
Example no. 7
        def fn(query=None, **kwargs):
            # Feed_dict dictionary
            feed_dict = dict()
            for key, arg in kwargs.items():
                if arg is None:
                    continue
                elif isinstance(arg, dict):
                    # Support single nesting (for states, internals, actions)
                    for inner_key, inner_arg in arg.items():
                        feed_dict[util.join_scopes(self.name, inner_key) + '-input:0'] = inner_arg
                else:
                    feed_dict[util.join_scopes(self.name, key) + '-input:0'] = arg
            if not all(isinstance(x, str) and x.endswith('-input:0') for x in feed_dict):
                raise TensorforceError.unexpected()

            # Fetches value/tuple
            fetches = util.fmap(function=(lambda x: x.name), xs=results)
            if query is not None:
                # If additional tensors are to be fetched
                query = util.fmap(
                    function=(lambda x: util.join_scopes(name, x) + '-output:0'), xs=query
                )
                if util.is_iterable(x=fetches):
                    fetches = tuple(fetches) + (query,)
                else:
                    fetches = (fetches, query)
            if not util.reduce_all(
                predicate=(lambda x: isinstance(x, str) and x.endswith('-output:0')), xs=fetches
            ):
                raise TensorforceError.unexpected()

            # TensorFlow session call
            fetched = self.monitored_session.run(fetches=fetches, feed_dict=feed_dict)

            return fetched
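
The feed_dict keys above follow a '<scope>/<key>-input:0' naming convention, with one level of nesting for dictionaries such as states, internals or actions. A self-contained sketch of that mapping (join_scopes and build_feed_dict are illustrative stand-ins, not the Tensorforce utilities):

def join_scopes(*scopes):
    return '/'.join(scopes)

def build_feed_dict(scope, **kwargs):
    feed_dict = dict()
    for key, arg in kwargs.items():
        if arg is None:
            continue
        elif isinstance(arg, dict):
            # Single level of nesting (e.g. states, internals, actions)
            for inner_key, inner_arg in arg.items():
                feed_dict[join_scopes(scope, inner_key) + '-input:0'] = inner_arg
        else:
            feed_dict[join_scopes(scope, key) + '-input:0'] = arg
    return feed_dict

fd = build_feed_dict('agent', states=dict(observation=[0.0, 1.0]), deterministic=True)
assert set(fd) == {'agent/observation-input:0', 'agent/deterministic-input:0'}
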
Example no. 8
    def __init__(self,
                 agent,
                 environments,
                 evaluation_environment=None,
                 save_best_agent=False):
        # save_best overwrites saver...
        if not util.is_iterable(x=environments):
            raise TensorforceError.type(name='parallel-runner',
                                        argument='environments',
                                        value=environments)
        elif len(environments) == 0:
            raise TensorforceError.value(name='parallel-runner',
                                         argument='environments',
                                         value=environments)

        self.is_environment_external = tuple(
            isinstance(environment, Environment)
            for environment in environments)
        self.environments = tuple(
            Environment.create(environment=environment)
            for environment in environments)

        self.is_eval_environment_external = isinstance(evaluation_environment,
                                                       Environment)
        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment)

        self.save_best_agent = save_best_agent
        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict(parallel_interactions=len(environments))
        # warning: save_best_agent
        if not self.is_agent_external and self.save_best_agent:
            # Disable periodic saving
            kwargs = dict(saver=dict(seconds=None, steps=None))
        self.agent = Agent.create(agent=agent,
                                  environment=self.environments[0],
                                  **kwargs)
        if not self.agent.model.is_initialized:
            self.agent.initialize()

        # self.global_episodes = self.agent.episodes
        # self.global_timesteps = self.agent.timesteps
        # self.global_updates = self.agent.updates
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        self.evaluation_rewards = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
Example no. 9
    def __init__(self,
                 name,
                 tensors,
                 aggregation='concat',
                 axis=0,
                 input_spec=None):
        """
        Retrieve constructor.

        Args:
            tensors (iter[string]): Global names of tensors to retrieve.
            aggregation ('concat' | 'product' | 'stack' | 'sum'): Aggregation type.
            axis (int >= 0): Aggregation axis (excluding batch axis).

        """
        if not isinstance(tensors, str) and not util.is_iterable(x=tensors):
            raise TensorforceError.type(name='retrieve',
                                        argument='tensors',
                                        value=tensors)
        elif util.is_iterable(x=tensors) and len(tensors) == 0:
            raise TensorforceError.value(name='retrieve',
                                         argument='tensors',
                                         value=tensors)
        if aggregation not in ('concat', 'product', 'stack', 'sum'):
            raise TensorforceError.value(name='retrieve',
                                         argument='aggregation',
                                         value=aggregation)

        self.tensors = (tensors, ) if isinstance(tensors,
                                                 str) else tuple(tensors)
        self.aggregation = aggregation
        self.axis = axis

        super().__init__(name=name,
                         input_spec=input_spec,
                         l2_regularization=0.0)

        self.input_spec = None
Example no. 10
    def add_placeholder(self, name, dtype, shape, batched, default=None):
        # name
        name = name + '-input'
        if not util.is_valid_name(name=name):
            raise TensorforceError.value(name='placeholder',
                                         argument='name',
                                         value=name)
        # dtype
        if not util.is_valid_type(dtype=dtype):
            raise TensorforceError.value(name='placeholder',
                                         argument='dtype',
                                         value=dtype)
        # shape
        if not util.is_iterable(x=shape) or \
                not all(isinstance(num_dims, int) for num_dims in shape):
            raise TensorforceError.type(name='placeholder',
                                        argument='shape',
                                        value=shape)
        elif not all(num_dims > 0 for num_dims in shape):
            raise TensorforceError.value(name='placeholder',
                                         argument='shape',
                                         value=shape)
        # batched
        if not isinstance(batched, bool):
            raise TensorforceError.type(name='placeholder',
                                        argument='batched',
                                        value=batched)
        # default
        if default is not None:
            if batched:
                raise TensorforceError.unexpected()
            elif not isinstance(default, tf.Tensor):
                raise TensorforceError.unexpected()
            elif util.dtype(x=default) != dtype:
                raise TensorforceError.unexpected()

        # Placeholder
        if batched:
            shape = (None, ) + shape
        if default is None:
            dtype = util.tf_dtype(dtype=dtype)
            placeholder = tf.placeholder(dtype=dtype, shape=shape, name=name)
        else:
            # check dtype and shape !!!
            placeholder = tf.placeholder_with_default(input=default,
                                                      shape=shape,
                                                      name=name)

        return placeholder
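
The placeholder logic above targets TF1-style graph mode. A minimal sketch of the same pattern using tf.compat.v1 (requires TensorFlow with eager execution disabled; the helper name is illustrative): a batched placeholder gets a leading None dimension, and a given default tensor switches to placeholder_with_default.

import tensorflow as tf

tf.compat.v1.disable_eager_execution()  # placeholders only exist in graph mode

def make_placeholder(name, dtype, shape, batched, default=None):
    if batched:
        shape = (None,) + tuple(shape)
    if default is None:
        return tf.compat.v1.placeholder(dtype=dtype, shape=shape, name=name + '-input')
    return tf.compat.v1.placeholder_with_default(input=default, shape=shape, name=name + '-input')

states = make_placeholder(name='states', dtype=tf.float32, shape=(4,), batched=True)
assert states.shape.as_list() == [None, 4]
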
Example no. 11
    def observe(self, reward=0.0, terminal=False, parallel=0):
        # Check whether inputs are batched
        if util.is_iterable(x=reward) or (isinstance(reward, np.ndarray)
                                          and reward.ndim > 0):
            reward = np.asarray(reward)
            num_parallel = reward.shape[0]
            if not isinstance(terminal, np.ndarray) and terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)
            if not isinstance(parallel, np.ndarray) and parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=terminal) or \
                (isinstance(terminal, np.ndarray) and terminal.ndim > 0):
            terminal = np.asarray(terminal, dtype=util.np_dtype(dtype='int'))
            num_parallel = terminal.shape[0]
            if not isinstance(reward, np.ndarray) and reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if not isinstance(parallel, np.ndarray) and parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=parallel) or \
                (isinstance(parallel, np.ndarray) and parallel.ndim > 0):
            parallel = np.asarray(parallel)
            num_parallel = parallel.shape[0]
            if not isinstance(reward, np.ndarray) and reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if not isinstance(terminal, np.ndarray) and terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)

        else:
            reward = np.asarray([float(reward)])
            terminal = np.asarray([int(terminal)])
            parallel = np.asarray([int(parallel)])
            num_parallel = 1

        # Check whether shapes/lengths are consistent
        if parallel.shape[0] == 0:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(parallel)',
                                         value=parallel.shape[0],
                                         hint='= 0')
        if reward.shape != parallel.shape:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(reward)',
                                         value=reward.shape,
                                         hint='!= parallel length')
        if terminal.shape != parallel.shape:
            raise TensorforceError.value(name='Agent.observe',
                                         argument='len(terminal)',
                                         value=terminal.shape,
                                         hint='!= parallel length')

        # Convert terminal to int if necessary
        if terminal.dtype is util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
            terminal = np.where(terminal, ones, zeros)

        # Check whether current timesteps are not completed
        if self.timestep_completed[parallel].any():
            raise TensorforceError(
                message="Calling agent.observe must be preceded by agent.act.")
        self.timestep_completed[parallel] = True

        # Check whether episode is too long
        self.timestep_counter[parallel] += 1
        if self.max_episode_timesteps is not None and np.logical_and(
                terminal == 0, self.timestep_counter[parallel] >
                self.max_episode_timesteps).any():
            raise TensorforceError(
                message="Episode longer than max_episode_timesteps.")
        self.timestep_counter[parallel] = np.where(
            terminal > 0, 0, self.timestep_counter[parallel])

        if self.recorder is None:
            pass

        elif self.num_episodes < self.recorder.get('start', 0):
            # Increment num_episodes
            for t in terminal.tolist():
                if t > 0:
                    self.num_episodes += 1

        else:
            # Store values per parallel interaction
            for p, t, r in zip(parallel.tolist(), terminal.tolist(),
                               reward.tolist()):

                # Buffer inputs
                self.buffers['terminal'][p].append(t)
                self.buffers['reward'][p].append(r)

                # Continue if not terminal
                if t == 0:
                    continue
                self.num_episodes += 1

                # Buffered terminal/reward inputs
                for name in self.states_spec:
                    self.recorded['states'][name].append(
                        np.stack(self.buffers['states'][name][p], axis=0))
                    self.buffers['states'][name][p].clear()
                for name, spec in self.actions_spec.items():
                    self.recorded['actions'][name].append(
                        np.stack(self.buffers['actions'][name][p], axis=0))
                    self.buffers['actions'][name][p].clear()
                self.recorded['terminal'].append(
                    np.array(self.buffers['terminal'][p],
                             dtype=self.terminal_spec.np_type()))
                self.buffers['terminal'][p].clear()
                self.recorded['reward'].append(
                    np.array(self.buffers['reward'][p],
                             dtype=self.reward_spec.np_type()))
                self.buffers['reward'][p].clear()

                # Check whether recording step
                if (self.num_episodes - self.recorder.get('start', 0)) \
                        % self.recorder.get('frequency', 1) != 0:
                    continue

                # Manage recorder directory
                directory = self.recorder['directory']
                if os.path.isdir(directory):
                    files = sorted(
                        f for f in os.listdir(directory)
                        if os.path.isfile(os.path.join(directory, f))
                        and os.path.splitext(f)[1] == '.npz')
                else:
                    os.makedirs(directory)
                    files = list()
                max_traces = self.recorder.get('max-traces')
                if max_traces is not None and len(files) > max_traces - 1:
                    for filename in files[:-max_traces + 1]:
                        filename = os.path.join(directory, filename)
                        os.remove(filename)

                # Write recording file
                filename = os.path.join(
                    directory,
                    'trace-{:09d}.npz'.format(self.num_episodes - 1))
                # time.strftime('%Y%m%d-%H%M%S')
                kwargs = self.recorded.fmap(function=np.concatenate,
                                            cls=ArrayDict).items()
                np.savez_compressed(file=filename, **dict(kwargs))

                # Clear recorded values
                for recorded in self.recorded.values():
                    recorded.clear()

        if self._is_agent:
            return reward, terminal, parallel
        else:
            return 0
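
The first part of observe() above normalizes scalar or batched reward/terminal/parallel inputs into aligned arrays. A simplified numpy-only sketch of that normalization (length-1 defaults broadcast to the batch size, and a default parallel index expands to 0..n-1; names are illustrative):

import numpy as np

def normalize_observe(reward=0.0, terminal=False, parallel=0):
    reward = np.atleast_1d(np.asarray(reward, dtype=np.float32))
    terminal = np.atleast_1d(np.asarray(terminal)).astype(np.int64)
    parallel = np.atleast_1d(np.asarray(parallel, dtype=np.int64))
    num_parallel = max(reward.shape[0], terminal.shape[0], parallel.shape[0])
    if reward.shape[0] == 1 and num_parallel > 1:
        reward = np.repeat(reward, num_parallel)
    if terminal.shape[0] == 1 and num_parallel > 1:
        terminal = np.repeat(terminal, num_parallel)
    if parallel.shape[0] == 1 and num_parallel > 1:
        parallel = np.arange(num_parallel, dtype=np.int64)
    assert reward.shape == terminal.shape == parallel.shape
    return reward, terminal, parallel

r, t, p = normalize_observe(reward=[1.0, 0.5])
assert p.tolist() == [0, 1] and t.tolist() == [0, 0] and r.tolist() == [1.0, 0.5]
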
Example no. 12
    def add_variable(self,
                     name,
                     dtype,
                     shape,
                     is_trainable,
                     initializer='zeros',
                     summarize=None,
                     shared=None):
        # name
        if not util.is_valid_name(name=name):
            raise TensorforceError.value(name='variable',
                                         argument='name',
                                         value=name)
        elif name in self.variables:
            raise TensorforceError.exists(name='variable', value=name)
        # dtype
        if not util.is_valid_type(dtype=dtype):
            raise TensorforceError.value(name='variable',
                                         argument='dtype',
                                         value=dtype)
        # shape
        if not util.is_iterable(x=shape) or \
                not all(isinstance(num_dims, int) for num_dims in shape):
            raise TensorforceError.type(name='variable',
                                        argument='shape',
                                        value=shape)
        elif not all(num_dims > 0 for num_dims in shape):
            raise TensorforceError.value(name='variable',
                                         argument='shape',
                                         value=shape)
        # is_trainable
        if not isinstance(is_trainable, bool):
            raise TensorforceError.type(name='variable',
                                        argument='is_trainable',
                                        value=is_trainable)
        # initializer
        if not isinstance(initializer, (util.py_dtype(dtype=dtype), np.ndarray, tf.Tensor)) and \
                initializer not in ('random', 'zeros', 'ones'):
            raise TensorforceError.value(name='variable',
                                         argument='initializer',
                                         value=initializer)
        elif isinstance(initializer, np.ndarray) and \
                initializer.dtype != util.np_dtype(dtype=dtype):
            raise TensorforceError.type(name='variable',
                                        argument='initializer',
                                        value=initializer)
        elif isinstance(initializer,
                        tf.Tensor) and util.dtype(x=initializer) != dtype:
            raise TensorforceError.type(name='variable',
                                        argument='initializer',
                                        value=initializer)
        elif isinstance(initializer,
                        str) and initializer == 'random' and dtype != 'float':
            raise TensorforceError(
                message=
                "Invalid variable initializer value for non-float variable: {}."
                .format(initializer))
        # summarize
        if summarize is not None and not isinstance(summarize, bool):
            raise TensorforceError.type(name='variable',
                                        argument='summarize',
                                        value=summarize)
        # shared
        if shared is not None and not isinstance(shared, str):
            raise TensorforceError.type(name='variable',
                                        argument='shared',
                                        value=shared)

        variable = None

        if shared is not None and len(tf.get_collection(key=shared)) > 0:
            # Retrieve shared variable from TensorFlow
            collection = tf.get_collection(key=shared)
            if len(collection) > 1:
                raise TensorforceError.unexpected()
            variable = collection[0]

        else:
            tf_dtype = util.tf_dtype(dtype=dtype)

            # Variable initializer
            if isinstance(initializer, util.py_dtype(dtype=dtype)):
                initializer = tf.constant(value=initializer,
                                          dtype=tf_dtype,
                                          shape=shape)
            elif isinstance(initializer, np.ndarray):
                if initializer.shape != shape:
                    raise TensorforceError(
                        "Invalid variable initializer shape: {}.".format(
                            initializer.shape))
                initializer = initializer
            elif isinstance(initializer, tf.Tensor):
                if util.shape(x=initializer) != shape:
                    raise TensorforceError(
                        "Invalid variable initializer shape: {}.".format(
                            util.shape(x=initializer)))
                initializer = initializer
            elif not isinstance(initializer, str):
                raise TensorforceError(
                    "Invalid variable initializer: {}".format(initializer))
            elif initializer == 'random':
                stddev = min(
                    0.1, sqrt(2.0 / (util.product(xs=shape[:-1]) + shape[-1])))
                initializer = tf.random_normal(
                    shape=shape,
                    mean=0.0,
                    stddev=stddev,
                    dtype=util.tf_dtype(dtype=dtype))
            elif initializer == 'zeros':
                initializer = tf.zeros(shape=shape, dtype=tf_dtype)
            elif initializer == 'ones':
                initializer = tf.ones(shape=shape, dtype=tf_dtype)

            # Variable
            variable = tf.Variable(initial_value=initializer,
                                   trainable=is_trainable,
                                   validate_shape=True,
                                   name=name,
                                   dtype=tf_dtype,
                                   expected_shape=shape)  # collections=

            # Register shared variable with TensorFlow
            if shared is not None:
                tf.add_to_collection(name=shared, value=variable)

        # Register variable
        self.variables[name] = variable
        if is_trainable:
            self.trainable_variables[name] = variable

        # Add summary
        if (summarize is None and is_trainable) or summarize:
            variable = tf.identity(input=variable)
            variable = self.add_summary(label='variables',
                                        name=name,
                                        tensor=variable,
                                        mean_variance=True)

        return variable
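
A numpy-only sketch of the string-initializer branch above ('zeros', 'ones', or 'random' with a fan-based stddev capped at 0.1); the helper is illustrative and not the module API:

import numpy as np

def make_initial_value(shape, initializer='zeros', dtype=np.float32):
    if initializer == 'zeros':
        return np.zeros(shape, dtype=dtype)
    elif initializer == 'ones':
        return np.ones(shape, dtype=dtype)
    elif initializer == 'random':
        fan_in = int(np.prod(shape[:-1], dtype=np.int64)) if len(shape) > 1 else 1
        stddev = min(0.1, np.sqrt(2.0 / (fan_in + shape[-1])))
        return np.random.normal(loc=0.0, scale=stddev, size=shape).astype(dtype)
    else:
        raise ValueError('unknown initializer: {}'.format(initializer))

weights = make_initial_value(shape=(64, 32), initializer='random')
assert weights.shape == (64, 32) and weights.dtype == np.float32
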
Example no. 13
    def run(
        self,
        # General
        num_episodes=None, num_timesteps=None, num_updates=None, num_repeat_actions=1,
        # Callback
        callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
        # Tqdm
        use_tqdm=True, mean_horizon=1,
        # Evaluation
        evaluation=False, evaluation_callback=None, evaluation_frequency=None,
        num_evaluation_iterations=1
    ):
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if num_updates is None:
            self.num_updates = float('inf')
        else:
            self.num_updates = num_updates
        self.num_repeat_actions = num_repeat_actions

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r: True)
        elif util.is_iterable(x=callback):
            def sequential_callback(runner):
                result = True
                for fn in callback:
                    x = fn(runner)
                    if isinstance(result, bool):
                        result = result and x
                return result
            self.callback = sequential_callback
        else:
            def boolean_callback(runner):
                result = callback(runner)
                if isinstance(result, bool):
                    return result
                else:
                    return True
            self.callback = boolean_callback

        # Timestep/episode/update counter
        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Tqdm
        if use_tqdm:
            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                bar_format = (
                    '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep='
                    '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                    '{postfix[4]:.1f}%]'
                )
                postfix = [0.0, 0, 0.0, 0.0, 0.0]
                self.tqdm = tqdm(
                    desc='Episodes', total=self.num_episodes, bar_format=bar_format,
                    initial=self.episodes, postfix=postfix
                )
                self.tqdm_last_update = self.episodes

                def tqdm_callback(runner):
                    mean_reward = float(np.mean(runner.episode_rewards[-mean_horizon:]))
                    mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:]))
                    mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:]))
                    mean_agent_sec = float(np.mean(runner.episode_agent_seconds[-mean_horizon:]))
                    mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                    mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                    runner.tqdm.postfix[0] = mean_reward
                    runner.tqdm.postfix[1] = mean_ts_per_ep
                    runner.tqdm.postfix[2] = mean_sec_per_ep
                    runner.tqdm.postfix[3] = mean_ms_per_ts
                    runner.tqdm.postfix[4] = mean_rel_agent
                    runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.episodes
                    return inner_callback(runner)

            else:
                # Timestep-based tqdm
                assert self.num_timesteps != float('inf')
                self.tqdm = tqdm(
                    desc='Timesteps', total=self.num_timesteps, initial=self.timesteps,
                    postfix=dict(mean_reward='n/a')
                )
                self.tqdm_last_update = self.timesteps

                def tqdm_callback(runner):
                    # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                    # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                    # mean_reward = sum_timesteps_reward / num_episodes
                    runner.tqdm.set_postfix(mean_reward='n/a')
                    runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.timesteps
                    return inner_callback(runner)

            self.callback = tqdm_callback

        # Evaluation
        self.evaluation = evaluation
        if evaluation_callback is None:
            self.evaluation_callback = (lambda r: None)
        else:
            assert not self.evaluation
            self.evaluation_callback = evaluation_callback
        if self.evaluation:
            assert evaluation_frequency is None
        self.evaluation_frequency = evaluation_frequency
        self.num_evaluation_iterations = num_evaluation_iterations
        if self.save_best_agent is not None:
            assert not self.evaluation
            inner_evaluation_callback = self.evaluation_callback

            def mean_reward_callback(runner):
                result = inner_evaluation_callback(runner)
                if result is None:
                    return float(np.mean(runner.evaluation_rewards))
                else:
                    return result

            self.evaluation_callback = mean_reward_callback
            self.best_evaluation_score = None

        # Required if agent was previously stopped mid-episode
        self.agent.reset()

        # Episode loop
        while True:
            # Run episode
            if not self.run_episode(environment=self.environment, evaluation=self.evaluation):
                return

            # Increment episode counter (after calling callback)
            self.episodes += 1

            # Update experiment statistics
            self.episode_rewards.append(self.episode_reward)
            self.episode_timesteps.append(self.episode_timestep)
            self.episode_seconds.append(self.episode_second)
            self.episode_agent_seconds.append(self.episode_agent_second)

            # Run evaluation
            if self.evaluation_frequency is None:
                is_evaluation = False
            elif self.evaluation_frequency == 'update':
                is_evaluation = self.episode_updated
            else:
                is_evaluation = (self.episodes % self.evaluation_frequency == 0)
            if is_evaluation:
                if self.evaluation_environment is None:
                    environment = self.environment
                else:
                    environment = self.evaluation_environment

                self.evaluation_rewards = list()
                self.evaluation_timesteps = list()
                self.evaluation_seconds = list()
                self.evaluation_agent_seconds = list()

                # Evaluation loop
                for _ in range(self.num_evaluation_iterations):
                    self.run_episode(environment=environment, evaluation=True)

                    self.evaluation_rewards.append(self.episode_reward)
                    self.evaluation_timesteps.append(self.episode_timestep)
                    self.evaluation_seconds.append(self.episode_second)
                    self.evaluation_agent_seconds.append(self.episode_agent_second)

                # Evaluation callback
                if self.save_best_agent is not None:
                    evaluation_score = self.evaluation_callback(self)
                    assert isinstance(evaluation_score, float)
                    if self.best_evaluation_score is None:
                        self.best_evaluation_score = evaluation_score
                    elif evaluation_score > self.best_evaluation_score:
                        self.best_evaluation_score = evaluation_score
                        self.agent.save(
                            directory=self.save_best_agent, filename='best-model',
                            append_timestep=False
                        )
                else:
                    self.evaluation_callback(self)

            # # Update global timestep/episode/update
            # self.global_timesteps = self.agent.timesteps
            # self.global_episodes = self.agent.episodes
            # self.global_updates = self.agent.updates

            # Callback
            if self.episodes % self.callback_episode_frequency == 0 and not self.callback(self):
                return

            # Terminate experiment if too long
            if self.timesteps >= self.num_timesteps:
                return
            # elif self.evaluation and self.timesteps >= self.num_timesteps:
            #     return
            elif self.episodes >= self.num_episodes:
                return
            # elif self.evaluation and self.episodes >= self.num_episodes:
            #     return
            elif self.updates >= self.num_updates:
                return
            # elif self.evaluation and self.updates >= self.num_updates:
            #     return
            elif self.agent.should_stop():
                return
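
The tqdm handling in run() above wraps the user callback so that the progress bar advances and the original callback still decides whether to continue. A minimal sketch of that wrapping pattern with illustrative names (not the Runner API):

from tqdm import tqdm

def make_progress_callback(inner_callback, total_episodes):
    bar = tqdm(desc='Episodes', total=total_episodes)
    state = dict(last=0)

    def callback(runner):
        # Advance by the number of episodes finished since the last call.
        bar.update(runner.episodes - state['last'])
        state['last'] = runner.episodes
        return inner_callback(runner)

    return callback

# Usage sketch: runner.callback = make_progress_callback(runner.callback, total_episodes=100)
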
Example no. 14
    def add_summary(
        self, label, name, tensor, pass_tensors=None, return_summaries=False, mean_variance=False,
        enumerate_last_rank=False
    ):
        # should be "labels" !!!
        # label
        if util.is_iterable(x=label):
            if not all(isinstance(x, str) for x in label):
                raise TensorforceError.value(
                    name='Module.add_summary', argument='label', value=label
                )
        else:
            if not isinstance(label, str):
                raise TensorforceError.type(
                    name='Module.add_summary', argument='label', dtype=type(label)
                )
        # name
        if not isinstance(name, str):
            raise TensorforceError.type(
                name='Module.add_summary', argument='name', dtype=type(name)
            )
        # tensor
        if not isinstance(tensor, (tf.Tensor, tf.Variable)):
            raise TensorforceError.type(
                name='Module.add_summary', argument='tensor', dtype=type(tensor)
            )
        # pass_tensors
        if util.is_iterable(x=pass_tensors):
            if not all(isinstance(x, (tf.Tensor, tf.IndexedSlices)) for x in pass_tensors):
                raise TensorforceError.value(
                    name='Module.add_summary', argument='pass_tensors', value=pass_tensors
                )
        elif pass_tensors is not None:
            if not isinstance(pass_tensors, tf.Tensor):
                raise TensorforceError.type(
                    name='Module.add_summary', argument='pass_tensors', dtype=type(pass_tensors)
                )
        # enumerate_last_rank
        if not isinstance(enumerate_last_rank, bool):
            raise TensorforceError.type(
                name='Module.add_summary', argument='enumerate_last_rank',
                dtype=type(enumerate_last_rank)
            )

        if pass_tensors is None:
            pass_tensors = tensor

        # Check whether summary is logged
        if not self.is_summary_logged(label=label):
            return pass_tensors

        # Add to available summaries
        if util.is_iterable(x=label):
            self.available_summaries.update(label)
        else:
            self.available_summaries.add(label)

        # Handle enumerate_last_rank
        if enumerate_last_rank:
            dims = util.shape(x=tensor)[-1]
            tensors = OrderedDict([(name + str(n), tensor[..., n]) for n in range(dims)])
        else:
            tensors = OrderedDict([(name, tensor)])

        if mean_variance:
            for name in list(tensors):
                tensor = tensors.pop(name)
                mean, variance = tf.nn.moments(x=tensor, axes=tuple(range(util.rank(x=tensor))))
                tensors[name + '-mean'] = mean
                tensors[name + '-variance'] = variance

        # Scope handling
        if Module.scope_stack is not None:
            for scope in reversed(Module.scope_stack[1:]):
                scope.__exit__(None, None, None)
            if len(Module.global_scope) > 0:
                temp_scope = tf.name_scope(name='/'.join(Module.global_scope))
                temp_scope.__enter__()
            tensors = util.fmap(function=util.identity_operation, xs=tensors)

        # TensorFlow summaries
        assert Module.global_summary_step is not None
        step = Module.retrieve_tensor(name=Module.global_summary_step)
        summaries = list()
        for name, tensor in tensors.items():
            shape = util.shape(x=tensor)
            if shape == ():
                summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
            elif shape == (-1,):
                tensor = tf.math.reduce_sum(input_tensor=tensor, axis=0)
                summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
            elif shape == (1,):
                tensor = tf.squeeze(input=tensor, axis=-1)
                summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
            elif shape == (-1, 1):
                tensor = tf.math.reduce_sum(input_tensor=tf.squeeze(input=tensor, axis=-1), axis=0)
                summaries.append(tf.summary.scalar(name=name, data=tensor, step=step))
            else:
                # General tensor as histogram
                assert not util.is_iterable(x=label) and label.endswith('-histogram')
                summaries.append(tf.summary.histogram(name=name, data=tensor, step=step))

        # Scope handling
        if Module.scope_stack is not None:
            if len(Module.global_scope) > 0:
                temp_scope.__exit__(None, None, None)
            for scope in Module.scope_stack[1:]:
                scope.__enter__()

        with tf.control_dependencies(control_inputs=summaries):
            return util.fmap(function=util.identity_operation, xs=pass_tensors)
Example no. 15
    def add_summary(self,
                    label,
                    name,
                    tensor,
                    pass_tensors=None,
                    return_summaries=False,
                    mean_variance=False,
                    enumerate_last_rank=False):
        # should be "labels" !!!
        # label
        if util.is_iterable(x=label):
            if not all(isinstance(x, str) for x in label):
                raise TensorforceError.type(name='summary',
                                            argument='label',
                                            value=label)
        else:
            if not isinstance(label, str):
                raise TensorforceError.type(name='summary',
                                            argument='label',
                                            value=label)
        # name
        if not isinstance(name, str):
            raise TensorforceError.type(name='summary',
                                        argument='name',
                                        value=name)
        # tensor
        if not isinstance(tensor, tf.Tensor):
            raise TensorforceError.type(name='summary',
                                        argument='tensor',
                                        value=tensor)
        # pass_tensors
        if util.is_iterable(x=pass_tensors):
            if not all(isinstance(x, tf.Tensor) for x in pass_tensors):
                raise TensorforceError.type(name='summary',
                                            argument='pass_tensors',
                                            value=pass_tensors)
        elif pass_tensors is not None:
            if not isinstance(pass_tensors, tf.Tensor):
                raise TensorforceError.type(name='summary',
                                            argument='pass_tensors',
                                            value=pass_tensors)
        # enumerate_last_rank
        if not isinstance(enumerate_last_rank, bool):
            raise TensorforceError.type(name='summary',
                                        argument='enumerate_last_rank',
                                        value=enumerate_last_rank)

        if pass_tensors is None:
            pass_tensors = tensor

        # Check whether summaries are logged
        if self.summary_labels is None:
            return pass_tensors

        # Check whether not in while loop
        if 'while' in Module.global_scope:  # 'cond' in Module.global_scope
            return pass_tensors

        # Check whether given label is logged
        if util.is_iterable(x=label):
            if all(x not in self.summary_labels for x in label):
                return pass_tensors
        else:
            if label not in self.summary_labels:
                return pass_tensors

        # Handle enumerate_last_rank
        if enumerate_last_rank:
            num_dims = util.shape(x=tensor)[-1]
            tensors = OrderedDict([(name + str(n), tensor[..., n])
                                   for n in range(num_dims)])
        else:
            tensors = OrderedDict([(name, tensor)])

        if mean_variance:
            for name in list(tensors):
                tensor = tensors.pop(name)
                mean, variance = tf.nn.moments(x=tensor,
                                               axes=tuple(
                                                   range(util.rank(x=tensor))))
                tensors[name + '-mean'] = mean
                tensors[name + '-variance'] = variance

        # TensorFlow summaries
        summaries = list()
        for name, tensor in tensors.items():
            shape = util.shape(x=tensor)
            if shape == () or shape == (-1, ):
                # Scalar
                summaries.append(
                    tf.contrib.summary.scalar(name=name, tensor=tensor))
            elif shape == (1, ) or shape == (-1, 1):
                # Single-value tensor as scalar
                tensor = tf.squeeze(input=tensor, axis=-1)
                summaries.append(
                    tf.contrib.summary.scalar(name=name, tensor=tensor))
            else:
                # General tensor as histogram
                summaries.append(
                    tf.contrib.summary.histogram(name=name, tensor=tensor))

        with tf.control_dependencies(control_inputs=summaries):
            if util.is_iterable(x=pass_tensors):
                return tuple(
                    util.identity_operation(x=x) for x in pass_tensors)
            else:
                return util.identity_operation(x=pass_tensors)
Example no. 16
    def run(
            self,
            # General
            num_episodes=None,
            num_timesteps=None,
            max_episode_timesteps=None,
            deterministic=False,
            num_sleep_secs=0.01,
            sync_timesteps=False,
            sync_episodes=False,
            # Callback
            callback=None,
            callback_episode_frequency=None,
            callback_timestep_frequency=None,
            # Tqdm
            use_tqdm=True,
            num_mean_reward=100):
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if max_episode_timesteps is None:
            self.max_episode_timesteps = float('inf')
        else:
            self.max_episode_timesteps = max_episode_timesteps
        self.deterministic = deterministic
        self.num_sleep_secs = num_sleep_secs
        self.sync_timesteps = sync_timesteps
        self.sync_episodes = sync_episodes

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r, p: True)
        elif util.is_iterable(x=callback):

            def sequential_callback(runner, parallel):
                result = True
                for fn in callback:
                    x = fn(runner, parallel)
                    if isinstance(result, bool):
                        result = result and x
                return result

            self.callback = sequential_callback
        else:

            def boolean_callback(runner, parallel):
                result = callback(runner, parallel)
                if isinstance(result, bool):
                    return result
                else:
                    return True

            self.callback = boolean_callback

        # Tqdm
        if use_tqdm:
            from tqdm import tqdm

            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float(
                'inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                self.tqdm = tqdm(
                    desc='Episodes',
                    total=self.num_episodes,
                    initial=self.global_episode,
                    postfix=dict(mean_reward='{:.2f}'.format(0.0)))
                self.tqdm_last_update = self.global_episode

                def tqdm_callback(runner, parallel):
                    mean_reward = float(
                        np.mean(runner.episode_rewards[-num_mean_reward:]))
                    runner.tqdm.set_postfix(
                        mean_reward='{:.2f}'.format(mean_reward))
                    runner.tqdm.update(n=(runner.global_episode -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.global_episode
                    return inner_callback(runner, parallel)

            else:
                # Timestep-based tqdm
                self.tqdm = tqdm(desc='Timesteps',
                                 total=self.num_timesteps,
                                 initial=self.global_timestep,
                                 postfix=dict(mean_reward='n/a'))
                self.tqdm_last_update = self.global_timestep

                def tqdm_callback(runner, parallel):
                    # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                    # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                    # mean_reward = sum_timesteps_reward / num_episodes
                    runner.tqdm.set_postfix(mean_reward='n/a')
                    runner.tqdm.update(n=(runner.global_timestep -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.global_timestep
                    return inner_callback(runner, parallel)

            self.callback = tqdm_callback

        # Reset agent
        self.agent.reset()

        # Episode counter
        self.episode = 1

        # Reset environments and episode statistics
        for environment in self.environments:
            environment.start_reset()
        self.episode_reward = [0 for _ in self.environments]
        self.episode_timestep = [0 for _ in self.environments]
        episode_start = [time.time() for _ in self.environments]

        if self.sync_episodes:
            terminated = [False for _ in self.environments]

        # Runner loop
        while True:

            if not self.sync_timesteps:
                no_environment_ready = True

            # Parallel environments loop
            for parallel, environment in enumerate(self.environments):

                if self.sync_episodes and terminated[parallel]:
                    # Continue if episode terminated
                    continue

                if self.sync_timesteps:
                    # Wait until environment is ready
                    while True:
                        observation = environment.retrieve_execute()
                        if observation is not None:
                            break
                        time.sleep(num_sleep_secs)

                else:
                    # Check whether environment is ready
                    observation = environment.retrieve_execute()
                    if observation is None:
                        continue
                    no_environment_ready = False

                states, terminal, reward = observation

                if terminal is None:
                    # Retrieve actions from agent
                    actions = self.agent.act(states=states,
                                             deterministic=deterministic,
                                             parallel=parallel)
                    self.episode_timestep[parallel] += 1

                    # Execute actions in environment
                    environment.start_execute(actions=actions)
                    continue

                # Terminate episode if too long
                if self.episode_timestep[
                        parallel] >= self.max_episode_timesteps:
                    terminal = True

                # Observe unless episode just started
                assert (terminal is
                        None) == (self.episode_timestep[parallel] == 0)
                if terminal is not None:
                    self.agent.observe(terminal=terminal,
                                       reward=reward,
                                       parallel=parallel)
                    self.episode_reward[parallel] += reward

                # Update global timestep/episode
                self.global_timestep = self.agent.timestep
                self.global_episode = self.agent.episode

                # Callback plus experiment termination check
                if self.episode_timestep[parallel] % self.callback_timestep_frequency == 0 and \
                        not self.callback(self, parallel):
                    return

                if terminal:
                    # Update experiment statistics
                    self.episode_rewards.append(self.episode_reward[parallel])
                    self.episode_timesteps.append(
                        self.episode_timestep[parallel])
                    self.episode_times.append(time.time() -
                                              episode_start[parallel])

                    # Callback
                    if self.episode % self.callback_episode_frequency == 0 and \
                            not self.callback(self, parallel):
                        return

                # Terminate experiment if too long
                if self.global_timestep >= self.num_timesteps:
                    return
                elif self.global_episode >= self.num_episodes:
                    return
                elif self.agent.should_stop():
                    return

                # Check whether episode terminated
                if terminal:
                    # Increment episode counter (after calling callback)
                    self.episode += 1

                    # Reset environment and episode statistics
                    environment.start_reset()
                    self.episode_reward[parallel] = 0
                    self.episode_timestep[parallel] = 0
                    episode_start[parallel] = time.time()

                    if self.sync_episodes:
                        terminated[parallel] = True

                else:
                    # Retrieve actions from agent
                    actions = self.agent.act(states=states,
                                             deterministic=deterministic,
                                             parallel=parallel)
                    self.episode_timestep[parallel] += 1

                    # Execute actions in environment
                    environment.start_execute(actions=actions)

            if not self.sync_timesteps and no_environment_ready:
                # Sleep if no environment was ready
                time.sleep(num_sleep_secs)

            if self.sync_episodes and all(terminated):
                # Reset if all episodes terminated
                terminated = [False for _ in self.environments]
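
The loop above invokes self.callback(self, parallel) at the configured timestep and episode frequencies and returns as soon as the callback yields a falsy value. A minimal sketch of a compatible callback follows; the 100-episode window and the reward threshold are illustrative assumptions, not values taken from the source.

import numpy as np

def early_stopping_callback(runner, parallel):
    # runner.episode_rewards is the list of finished-episode rewards maintained above
    if len(runner.episode_rewards) < 100:
        return True  # keep running until enough episodes have finished
    mean_reward = float(np.mean(runner.episode_rewards[-100:]))
    # Returning False makes the runner loop above terminate the experiment
    return mean_reward < 195.0  # threshold chosen for illustration only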
Esempio n. 17
0
    def run(
            self,
            # General
            num_episodes=None,
            num_timesteps=None,
            max_episode_timesteps=None,
            deterministic=False,
            num_repeat_actions=1,
            # Callback
            callback=None,
            callback_episode_frequency=None,
            callback_timestep_frequency=None,
            # Tqdm
            use_tqdm=True,
            num_mean_reward=100,
            # Evaluation
            evaluation_callback=None,
            evaluation_frequency=None,
            update_as_evaluation_frequency=False,
            max_evaluation_timesteps=None,
            num_evaluation_iterations=1,
            save_best_agent=False):
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if max_episode_timesteps is None:
            self.max_episode_timesteps = float('inf')
        else:
            self.max_episode_timesteps = max_episode_timesteps
        self.deterministic = deterministic
        self.num_repeat_actions = num_repeat_actions

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r: True)
        elif util.is_iterable(x=callback):

            def sequential_callback(runner):
                result = True
                for fn in callback:
                    x = fn(runner)
                    if isinstance(result, bool):
                        result = result and x
                return result

            self.callback = sequential_callback
        else:

            def boolean_callback(runner):
                result = callback(runner)
                if isinstance(result, bool):
                    return result
                else:
                    return True

            self.callback = boolean_callback

        # Tqdm
        if use_tqdm:
            from tqdm import tqdm

            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float(
                'inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                self.tqdm = tqdm(
                    desc='Episodes',
                    total=self.num_episodes,
                    initial=self.global_episode,
                    postfix=dict(mean_reward='{:.2f}'.format(0.0)))
                self.tqdm_last_update = self.global_episode

                def tqdm_callback(runner):
                    mean_reward = float(
                        np.mean(runner.episode_rewards[-num_mean_reward:]))
                    runner.tqdm.set_postfix(
                        mean_reward='{:.2f}'.format(mean_reward))
                    runner.tqdm.update(n=(runner.global_episode -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.global_episode
                    return inner_callback(runner)

            else:
                # Timestep-based tqdm
                assert self.num_timesteps != float('inf')
                self.tqdm = tqdm(desc='Timesteps',
                                 total=self.num_timesteps,
                                 initial=self.global_timestep,
                                 postfix=dict(mean_reward='n/a'))
                self.tqdm_last_update = self.global_timestep

                def tqdm_callback(runner):
                    # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                    # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                    # mean_reward = sum_timesteps_reward / num_episodes
                    runner.tqdm.set_postfix(mean_reward='n/a')
                    runner.tqdm.update(n=(runner.global_timestep -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.global_timestep
                    return inner_callback(runner)

            self.callback = tqdm_callback

        # Evaluation
        if evaluation_callback is None:
            self.evaluation_callback = (lambda r: None)
        else:
            self.evaluation_callback = evaluation_callback
        if evaluation_frequency is None:
            if update_as_evaluation_frequency:
                assert evaluation_frequency is None
                batch_size = self.agent.model.update_mode['batch_size']
                frequency = self.agent.model.update_mode.get(
                    'frequency', batch_size)
                if self.agent.model.update_unit == 'episodes':
                    self.evaluation_frequency = frequency
                else:
                    # Approximates maximum number of episodes for an update to happen
                    self.evaluation_frequency = frequency // self.max_episode_timesteps
            else:
                self.evaluation_frequency = float('inf')
        else:
            assert not update_as_evaluation_frequency
            self.evaluation_frequency = evaluation_frequency
        if max_evaluation_timesteps is None:
            self.max_evaluation_timesteps = float('inf')
        else:
            self.max_evaluation_timesteps = max_evaluation_timesteps
        self.num_evaluation_iterations = num_evaluation_iterations
        self.save_best_agent = save_best_agent
        if self.save_best_agent:
            inner_evaluation_callback = self.evaluation_callback

            def mean_reward_callback(runner):
                result = inner_evaluation_callback(runner)
                if result is None:
                    return float(np.mean(runner.evaluation_rewards))
                else:
                    return result

            self.evaluation_callback = mean_reward_callback
            self.best_evaluation_score = None

        # Reset agent
        self.agent.reset()

        # Episode counter
        self.episode = 1

        # Episode loop
        while True:
            # Run episode
            if not self.run_episode(environment=self.environment,
                                    max_timesteps=self.max_episode_timesteps,
                                    evaluation=False):
                return

            # Update experiment statistics
            self.episode_rewards.append(self.episode_reward)
            self.episode_timesteps.append(self.episode_timestep)
            self.episode_times.append(self.episode_time)

            # Run evaluation
            if self.episode % self.evaluation_frequency == 0:
                if self.evaluation_environment is None:
                    environment = self.environment
                else:
                    environment = self.evaluation_environment

                self.evaluation_rewards = list()
                self.evaluation_timesteps = list()
                self.evaluation_times = list()

                # Evaluation loop
                for _ in range(self.num_evaluation_iterations):
                    self.run_episode(
                        environment=environment,
                        max_timesteps=self.max_evaluation_timesteps,
                        evaluation=True)

                    self.evaluation_rewards.append(self.episode_reward)
                    self.evaluation_timesteps.append(self.episode_timestep)
                    self.evaluation_times.append(self.episode_time)

                # Update global timestep/episode
                self.global_timestep = self.agent.timestep
                self.global_episode = self.agent.episode

                # Evaluation callback
                if self.save_best_agent:
                    evaluation_score = self.evaluation_callback(self)
                    assert isinstance(evaluation_score, float)
                    if self.best_evaluation_score is None:
                        self.best_evaluation_score = evaluation_score
                    elif evaluation_score > self.best_evaluation_score:
                        self.best_evaluation_score = evaluation_score
                        self.agent.save(filename='best-model',
                                        append_timestep=False)
                else:
                    self.evaluation_callback(self)

            # Update global timestep/episode
            self.global_timestep = self.agent.timestep
            self.global_episode = self.agent.episode

            # Callback
            if self.episode % self.callback_episode_frequency == 0 and not self.callback(
                    self):
                return

            # Terminate experiment if too long
            if self.global_timestep >= self.num_timesteps:
                return
            elif self.global_episode >= self.num_episodes:
                return
            elif self.agent.should_stop():
                return

            # Increment episode counter (after calling callback)
            self.episode += 1
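
When save_best_agent is enabled, the evaluation callback above must return a float score, which is compared against best_evaluation_score before the agent is saved as 'best-model'; by default the mean of runner.evaluation_rewards is used. A hedged sketch of a custom score, assuming runner is an instance exposing this run() method, could look as follows (the worst-case criterion is an illustrative choice, not part of the source).

import numpy as np

def worst_case_evaluation_callback(runner):
    # runner.evaluation_rewards is filled by the evaluation loop above
    return float(np.min(runner.evaluation_rewards))

runner.run(
    num_episodes=1000,
    evaluation_frequency=10,
    num_evaluation_iterations=5,
    save_best_agent=True,
    evaluation_callback=worst_case_evaluation_callback
)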
Esempio n. 18
0
    def run(
        self,
        # General
        num_episodes=None, num_timesteps=None, num_updates=None,
        # Parallel
        batch_agent_calls=False, sync_timesteps=False, sync_episodes=False, num_sleep_secs=0.001,
        # Callback
        callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
        # Tqdm
        use_tqdm=True, mean_horizon=1,
        # Evaluation
        evaluation=False, save_best_agent=None, evaluation_callback=None
    ):
        """
        Run experiment.

        Args:
            num_episodes (int > 0): Number of episodes to run experiment
                (<span style="color:#00C000"><b>default</b></span>: no episode limit).
            num_timesteps (int > 0): Number of timesteps to run experiment
                (<span style="color:#00C000"><b>default</b></span>: no timestep limit).
            num_updates (int > 0): Number of agent updates to run experiment
                (<span style="color:#00C000"><b>default</b></span>: no update limit).
            batch_agent_calls (bool): Whether to batch agent calls for parallel environment
                execution
                (<span style="color:#00C000"><b>default</b></span>: false, separate call per
                environment).
            sync_timesteps (bool): Whether to synchronize parallel environment execution on
                timestep-level, implied by batch_agent_calls
                (<span style="color:#00C000"><b>default</b></span>: false, unless
                batch_agent_calls is true).
            sync_episodes (bool): Whether to synchronize parallel environment execution on
                episode-level
                (<span style="color:#00C000"><b>default</b></span>: false).
            num_sleep_secs (float): Sleep duration if no environment is ready
                (<span style="color:#00C000"><b>default</b></span>: one millisecond).
            callback ((Runner, parallel) -> bool): Callback function taking the runner instance
                plus parallel index and returning a boolean value indicating whether execution
                should continue
                (<span style="color:#00C000"><b>default</b></span>: callback always true).
            callback_episode_frequency (int): Episode interval between callbacks
                (<span style="color:#00C000"><b>default</b></span>: every episode).
            callback_timestep_frequency (int): Timestep interval between callbacks
                (<span style="color:#00C000"><b>default</b></span>: not specified).
            use_tqdm (bool): Whether to display a tqdm progress bar for the experiment run
                (<span style="color:#00C000"><b>default</b></span>: true), with the following
                additional information (averaged over number of episodes given via mean_horizon):
                <ul>
                <li>reward &ndash; cumulative episode reward</li>
                <li>ts/ep &ndash; timesteps per episode</li>
                <li>sec/ep &ndash; seconds per episode</li>
                <li>ms/ts &ndash; milliseconds per timestep</li>
                <li>agent &ndash; percentage of time spent on agent computation</li>
                <li>comm &ndash; if remote environment execution, percentage of time spent on
                communication</li>
                </ul>
            mean_horizon (int): Number of episodes over which progress bar values and the
                evaluation score are averaged (<span style="color:#00C000"><b>default</b></span>: not averaged).
            evaluation (bool): Whether to run in evaluation mode, only valid if single environment
                (<span style="color:#00C000"><b>default</b></span>: no evaluation).
            save_best_agent (string): Directory to save the best version of the agent according to
                the evaluation score
                (<span style="color:#00C000"><b>default</b></span>: best agent is not saved).
            evaluation_callback (int | Runner -> float): Callback function taking the runner
                instance and returning an evaluation score
                (<span style="color:#00C000"><b>default</b></span>: cumulative evaluation reward
                averaged over mean_horizon episodes).
        """
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if num_updates is None:
            self.num_updates = float('inf')
        else:
            self.num_updates = num_updates

        # Parallel
        if len(self.environments) > 1:
            pass
        elif batch_agent_calls:
            raise TensorforceError.invalid(
                name='Runner.run', argument='batch_agent_calls', condition='single environment'
            )
        elif sync_timesteps:
            raise TensorforceError.invalid(
                name='Runner.run', argument='sync_timesteps', condition='single environment'
            )
        elif sync_episodes:
            raise TensorforceError.invalid(
                name='Runner.run', argument='sync_episodes', condition='single environment'
            )
        self.batch_agent_calls = batch_agent_calls
        self.sync_timesteps = sync_timesteps or self.batch_agent_calls
        self.sync_episodes = sync_episodes
        self.num_sleep_secs = num_sleep_secs

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r, p: True)
        elif util.is_iterable(x=callback):
            def sequential_callback(runner, parallel):
                result = True
                for fn in callback:
                    x = fn(runner, parallel)
                    if isinstance(result, bool):
                        result = result and x
                return result
            self.callback = sequential_callback
        else:
            def boolean_callback(runner, parallel):
                result = callback(runner, parallel)
                if isinstance(result, bool):
                    return result
                else:
                    return True
            self.callback = boolean_callback

        # Experiment statistics
        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        if self.is_environment_remote:
            self.episode_env_seconds = list()
        if self.evaluation or evaluation:
            self.evaluation_rewards = list()
            self.evaluation_timesteps = list()
            self.evaluation_seconds = list()
            self.evaluation_agent_seconds = list()
            if self.is_environment_remote:
                self.evaluation_env_seconds = list()
            if len(self.environments) == 1:
                # for tqdm
                self.episode_rewards = self.evaluation_rewards
                self.episode_timesteps = self.evaluation_timesteps
                self.episode_seconds = self.evaluation_seconds
                self.episode_agent_seconds = self.evaluation_agent_seconds
                if self.is_environment_remote:
                    self.episode_env_seconds = self.evaluation_env_seconds
        else:
            # for tqdm
            self.evaluation_rewards = self.episode_rewards
            self.evaluation_timesteps = self.episode_timesteps
            self.evaluation_seconds = self.episode_seconds
            self.evaluation_agent_seconds = self.episode_agent_seconds
            if self.is_environment_remote:
                self.evaluation_env_seconds = self.episode_env_seconds

        # Timestep/episode/update counter
        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Tqdm
        if use_tqdm:
            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                bar_format = (
                    '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep='
                    '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                    '{postfix[4]:.1f}%]'
                )
                postfix = [0.0, 0, 0.0, 0.0, 0.0]
                if self.is_environment_remote:
                    bar_format = bar_format[:-1] + ', comm={postfix[5]:.1f}%]'
                    postfix.append(0.0)

                self.tqdm = tqdm(
                    desc='Episodes', total=self.num_episodes, bar_format=bar_format,
                    initial=self.episodes, postfix=postfix
                )
                self.tqdm_last_update = self.episodes

                def tqdm_callback(runner, parallel):
                    if len(runner.evaluation_rewards) > 0:
                        mean_reward = float(np.mean(runner.evaluation_rewards[-mean_horizon:]))
                        runner.tqdm.postfix[0] = mean_reward
                    if len(runner.episode_timesteps) > 0:
                        mean_ts_per_ep = int(np.mean(runner.episode_timesteps[-mean_horizon:]))
                        mean_sec_per_ep = float(np.mean(runner.episode_seconds[-mean_horizon:]))
                        mean_agent_sec = float(
                            np.mean(runner.episode_agent_seconds[-mean_horizon:])
                        )
                        mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                        mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                        runner.tqdm.postfix[1] = mean_ts_per_ep
                        runner.tqdm.postfix[2] = mean_sec_per_ep
                        runner.tqdm.postfix[3] = mean_ms_per_ts
                        runner.tqdm.postfix[4] = mean_rel_agent
                    if runner.is_environment_remote and len(runner.episode_env_seconds) > 0:
                        mean_env_sec = float(np.mean(runner.episode_env_seconds[-mean_horizon:]))
                        mean_rel_comm = (mean_agent_sec + mean_env_sec) * 100.0 / mean_sec_per_ep
                        mean_rel_comm = 100.0 - mean_rel_comm
                        runner.tqdm.postfix[5] = mean_rel_comm
                    runner.tqdm.update(n=(runner.episodes - runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.episodes
                    return inner_callback(runner, parallel)

            else:
                # Timestep-based tqdm
                self.tqdm = tqdm(
                    desc='Timesteps', total=self.num_timesteps, initial=self.timesteps,
                    postfix=dict(mean_reward='n/a')
                )
                self.tqdm_last_update = self.timesteps

                def tqdm_callback(runner, parallel):
                    # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                    # num_timesteps = min(num_mean_reward, runner.evaluation_timestep)
                    # mean_reward = sum_timesteps_reward / num_episodes
                    runner.tqdm.set_postfix(mean_reward='n/a')
                    runner.tqdm.update(n=(runner.timesteps - runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.timesteps
                    return inner_callback(runner, parallel)

            self.callback = tqdm_callback

        # Evaluation
        if evaluation and len(self.environments) > 1:
            raise TensorforceError.invalid(
                name='Runner.run', argument='evaluation', condition='multiple environments'
            )
        self.evaluation_run = self.evaluation or evaluation
        self.save_best_agent = save_best_agent
        if evaluation_callback is None:
            self.evaluation_callback = (lambda r: None)
        else:
            self.evaluation_callback = evaluation_callback
        if self.save_best_agent is not None:
            inner_evaluation_callback = self.evaluation_callback

            def mean_reward_callback(runner):
                result = inner_evaluation_callback(runner)
                if result is None:
                    return float(np.mean(runner.evaluation_rewards[-mean_horizon:]))
                else:
                    return result

            self.evaluation_callback = mean_reward_callback
            self.best_evaluation_score = None

        # Episode statistics
        self.episode_reward = [0.0 for _ in self.environments]
        self.episode_timestep = [0 for _ in self.environments]
        # if self.batch_agent_calls:
        #     self.episode_agent_second = 0.0
        #     self.episode_start = time.time()
        if self.evaluation_run:
            self.episode_agent_second = [0.0 for _ in self.environments[:-1]]
            self.episode_start = [time.time() for _ in self.environments[:-1]]
        else:
            self.episode_agent_second = [0.0 for _ in self.environments]
            self.episode_start = [time.time() for _ in self.environments]
        self.evaluation_agent_second = 0.0
        self.evaluation_start = time.time()

        # Values
        self.terminate = 0
        self.prev_terminals = [-1 for _ in self.environments]
        self.states = [None for _ in self.environments]
        self.terminals = [None for _ in self.environments]
        self.rewards = [None for _ in self.environments]
        if self.evaluation_run:
            self.evaluation_internals = self.agent.initial_internals()

        # Required if agent was previously stopped mid-episode
        self.agent.reset()

        # Reset environments
        for environment in self.environments:
            environment.start_reset()

        # Runner loop
        while any(terminal <= 0 for terminal in self.prev_terminals):
            self.terminals = [None for _ in self.terminals]

            if self.batch_agent_calls:
                # Retrieve observations (only if not already terminated)
                while any(terminal is None for terminal in self.terminals):
                    for n in range(len(self.environments)):
                        if self.terminals[n] is not None:
                            # Already received
                            continue
                        elif self.prev_terminals[n] <= 0:
                            # Receive if not terminal
                            observation = self.environments[n].receive_execute()
                            if observation is None:
                                continue
                            self.states[n], self.terminals[n], self.rewards[n] = observation
                        else:
                            # Terminal
                            self.states[n] = None
                            self.terminals[n] = self.prev_terminals[n]
                            self.rewards[n] = None

                self.handle_observe_joint()
                self.handle_act_joint()

            # Parallel environments loop
            no_environment_ready = True
            for n in range(len(self.environments)):

                if self.prev_terminals[n] > 0:
                    # Continue if episode terminated (either sync_episodes or finished)
                    self.terminals[n] = self.prev_terminals[n]
                    continue

                elif self.batch_agent_calls:
                    # Handled before parallel environments loop
                    pass

                elif self.sync_timesteps:
                    # Wait until environment is ready
                    while True:
                        observation = self.environments[n].receive_execute()
                        if observation is not None:
                            break

                else:
                    # Check whether environment is ready, otherwise continue
                    observation = self.environments[n].receive_execute()
                    if observation is None:
                        self.terminals[n] = self.prev_terminals[n]
                        continue

                no_environment_ready = False
                if not self.batch_agent_calls:
                    self.states[n], self.terminals[n], self.rewards[n] = observation

                # Check whether evaluation environment
                if self.evaluation_run and n == (len(self.environments) - 1):
                    if self.terminals[n] == -1:
                        # Initial act
                        self.handle_act_evaluation()
                    else:
                        # Observe
                        self.handle_observe_evaluation()
                        if self.terminals[n] == 0:
                            # Act
                            self.handle_act_evaluation()
                        else:
                            # Terminal
                            self.handle_terminal_evaluation()

                else:
                    if self.terminals[n] == -1:
                        # Initial act
                        self.handle_act(parallel=n)
                    else:
                        # Observe
                        self.handle_observe(parallel=n)
                        if self.terminals[n] == 0:
                            # Act
                            self.handle_act(parallel=n)
                        else:
                            # Terminal
                            self.handle_terminal(parallel=n)

            self.prev_terminals = list(self.terminals)

            # Sync_episodes: Reset if all episodes terminated
            if self.sync_episodes and all(terminal > 0 for terminal in self.terminals):
                num_episodes_left = self.num_episodes - self.episodes
                num_noneval_environments = len(self.environments) - int(self.evaluation_run)
                for n in range(min(num_noneval_environments, num_episodes_left)):
                    self.prev_terminals[n] = -1
                    self.environments[n].start_reset()
                if self.evaluation_run and num_episodes_left > 0:
                    self.prev_terminals[-1] = -1
                    self.environments[-1].start_reset()

            # Sleep if no environment was ready
            if no_environment_ready:
                time.sleep(self.num_sleep_secs)
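
A usage sketch for the run() method above, assuming a parallel runner constructed roughly along the lines of Esempio n. 20; the import path, agent spec, and environment spec are assumptions and may need adjusting to the installed Tensorforce version.

from tensorforce.execution import Runner  # import path assumed

runner = Runner(
    agent=dict(agent='ppo', batch_size=10),                     # illustrative agent spec
    environment=dict(environment='gym', level='CartPole-v1'),   # illustrative environment spec
    num_parallel=4,
    max_episode_timesteps=500
)
runner.run(
    num_episodes=1000,
    batch_agent_calls=True,   # implies sync_timesteps, see docstring above
    sync_episodes=True,
    use_tqdm=True,
    mean_horizon=10
)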
Esempio n. 19
0
    def __setattr__(self, name, value):
        if not self.overwrite:
            raise NotImplementedError

        if name == 'type':
            if value is None:
                # Type: None
                pass
            elif util.is_iterable(x=value):
                # Type: tuple(*types)
                if any(_normalize_type(dtype=x) is None for x in value):
                    raise TensorforceError.value(name='TensorSpec', argument=name, value=value)
                value = tuple(_normalize_type(dtype=x) for x in value)
            else:
                # Type: 'bool' | 'int' | 'float'
                if _normalize_type(dtype=value) is None:
                    raise TensorforceError.value(name='TensorSpec', argument=name, value=value)
                value = _normalize_type(dtype=value)

            # Delete attributes not required anymore
            if self.type is not None and self.type != 'bool' and value == 'bool':
                super().__delattr__('min_value')
                super().__delattr__('max_value')
            if self.type is not None and (
                self.type == 'int' or (isinstance(self.type, tuple) and 'int' in self.type)
            ) and value != 'int' and (not isinstance(value, tuple) or 'int' not in value):
                super().__delattr__('num_values')

            # Set type attribute
            super().__setattr__(name, value)

            # Reset attributes
            if self.type == 'int' or (isinstance(self.type, tuple) and 'int' in self.type):
                self.min_value = None
                self.max_value = None
                self.num_values = None
            elif self.type != 'bool':
                self.min_value = None
                self.max_value = None

        elif name == 'shape':
            if value is None:
                # Shape: None
                pass
            elif util.is_iterable(x=value):
                if len(value) > 0 and value[0] is None:
                    # Shape: tuple(None, *ints >= -1)
                    try:
                        value = (None,) + tuple(int(x) for x in value[1:])
                        if any(x < -1 for x in value[1:]):
                            raise TensorforceError.value(
                                name='TensorSpec', argument=name, value=value
                            )
                    except BaseException:
                        raise TensorforceError.type(
                            name='TensorSpec', argument=name, value=type(value)
                        )
                else:
                    # Shape: tuple(*ints >= -1)
                    try:
                        value = tuple(int(x) for x in value)
                        if any(x < -1 for x in value):
                            raise TensorforceError.value(
                                name='TensorSpec', argument=name, value=value
                            )
                    except BaseException:
                        raise TensorforceError.value(name='TensorSpec', argument=name, value=value)
            else:
                # Shape: (int >= -1,)
                try:
                    value = (int(value),)
                    if value[0] < -1:
                        raise TensorforceError.value(name='TensorSpec', argument=name, value=value)
                except BaseException:
                    raise TensorforceError.type(name='TensorSpec', argument=name, value=type(value))

            # TODO: check min/max_value shape if np.ndarray

            # Set shape attribute
            super().__setattr__(name, value)

        elif name == 'min_value' or name == 'max_value':
            # Invalid for type == 'bool', or type == 'int' and num_values != None
            if self.type == 'bool':
                raise TensorforceError.invalid(
                    name='TensorSpec', argument=name, condition='type is bool'
                )

            if value is None:
                # Min/max value: None
                pass
            else:
                # Min/max value: int/float
                try:
                    value = self.py_type()(value)
                    if self.type == 'int' and self.num_values is not None:
                        if name == 'min_value':
                            assert value == 0
                        elif name == 'max_value':
                            assert value == self.num_values - 1
                except BaseException:
                    try:
                        value = np.asarray(value, dtype=self.np_type())
                        if self.type == 'int':
                            assert self.num_values is None
                    except BaseException:
                        raise TensorforceError.type(
                            name='TensorSpec', argument=name, value=type(value)
                        )

                if isinstance(value, np.ndarray):
                    if self.shape is not None and (
                        value.ndim > len(self.shape) or value.shape != self.shape[:value.ndim]
                    ):
                        raise TensorforceError.value(
                            name='TensorSpec', argument=(name + ' shape'), value=value.shape,
                            hint='incompatible with {}'.format(self.shape)
                        )
                    if name == 'min_value' and self.max_value is not None and \
                            (value > self.max_value - util.epsilon).any():
                        raise TensorforceError.value(
                            name='TensorSpec', argument=name, value=value,
                            condition='max_value = {}'.format(self.max_value)
                        )
                    elif name == 'max_value' and self.min_value is not None and \
                            (value < self.min_value + util.epsilon).any():
                        raise TensorforceError.value(
                            name='TensorSpec', argument=name, value=value,
                            condition='min_value = {}'.format(self.min_value)
                        )
                else:
                    if name == 'min_value' and self.max_value is not None:
                        if isinstance(self.max_value, np.ndarray):
                            if (value > self.max_value - util.epsilon).any():
                                raise TensorforceError.value(
                                    name='TensorSpec', argument=name, value=value,
                                    condition='max_value = {}'.format(self.max_value)
                                )
                        elif value > self.max_value - util.epsilon:
                            raise TensorforceError.value(
                                name='TensorSpec', argument=name, value=value,
                                condition='max_value = {}'.format(self.max_value)
                            )
                    elif name == 'max_value' and self.min_value is not None:
                        if isinstance(self.min_value, np.ndarray):
                            if (value < self.min_value + util.epsilon).any():
                                raise TensorforceError.value(
                                    name='TensorSpec', argument=name, value=value,
                                    condition='min_value = {}'.format(self.min_value)
                                )
                        elif value < self.min_value + util.epsilon:
                            raise TensorforceError.value(
                                name='TensorSpec', argument=name, value=value,
                                condition='min_value = {}'.format(self.min_value)
                            )

            # Set min/max_value attribute
            super().__setattr__(name, value)

        elif name == 'num_values':
            # Invalid for type != 'int'
            if self.type != 'int' and (not isinstance(self.type, tuple) or 'int' not in self.type):
                raise TensorforceError.invalid(
                    name='TensorSpec', argument=name, condition='type is {}'.format(self.type)
                )

            if value is None:
                # Num values: None
                pass
            else:
                # Num values: int >= 0
                try:
                    value = int(value)
                except BaseException:
                    raise TensorforceError.type(name='TensorSpec', argument=name, value=type(value))
                if value < 0:
                    raise TensorforceError.value(name='TensorSpec', argument=name, value=value)

            # Set num_values attribute and min/max_value accordingly
            super().__setattr__(name, value)
            if value is not None and value > 0:
                super().__setattr__('min_value', 0)
                super().__setattr__('max_value', value - 1)
            else:
                super().__setattr__('min_value', None)
                super().__setattr__('max_value', None)

        else:
            raise TensorforceError.invalid(name='TensorSpec', argument=name)
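
The shape branch above normalizes an int, an iterable of ints, or an iterable with a leading None into a tuple and rejects entries below -1. A standalone sketch of the same rules, independent of the TensorSpec class and its error types, is:

def normalize_shape(value):
    # Mirrors the shape handling above: None stays None, ints become 1-tuples,
    # iterables become tuples, and any dimension below -1 is rejected.
    if value is None:
        return None
    if isinstance(value, (tuple, list)):
        if len(value) > 0 and value[0] is None:
            dims = (None,) + tuple(int(x) for x in value[1:])
            check = dims[1:]
        else:
            dims = tuple(int(x) for x in value)
            check = dims
    else:
        dims = (int(value),)
        check = dims
    if any(x < -1 for x in check):
        raise ValueError('shape dimensions must be >= -1: {}'.format(dims))
    return dims

# normalize_shape(3) == (3,)
# normalize_shape([None, 84, 84]) == (None, 84, 84)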
Esempio n. 20
0
    def __init__(self,
                 agent,
                 environment=None,
                 num_parallel=None,
                 environments=None,
                 max_episode_timesteps=None,
                 evaluation_environment=None,
                 save_best_agent=None):
        self.environments = list()
        if environment is None:
            assert num_parallel is None and environments is not None
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(name='parallel-runner',
                                            argument='environments',
                                            value=environments)
            elif len(environments) == 0:
                raise TensorforceError.value(name='parallel-runner',
                                             argument='environments',
                                             value=environments)
            num_parallel = len(environments)
            environment = environments[0]
            self.is_environment_external = isinstance(environment, Environment)
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps)
            states = environment.states()
            actions = environment.actions()
            self.environments.append(environment)
            for environment in environments[1:]:
                assert isinstance(environment,
                                  Environment) == self.is_environment_external
                environment = Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps)
                assert environment.states() == states
                assert environment.actions() == actions
                self.environments.append(environment)

        else:
            assert num_parallel is not None and environments is None
            assert not isinstance(environment, Environment)
            self.is_environment_external = False
            for _ in range(num_parallel):
                environment = Environment.create(
                    environment=environment,
                    max_episode_timesteps=max_episode_timesteps)
                self.environments.append(environment)

        if evaluation_environment is None:
            self.evaluation_environment = None
        else:
            self.is_eval_environment_external = isinstance(
                evaluation_environment, Environment)
            self.evaluation_environment = Environment.create(
                environment=evaluation_environment,
                max_episode_timesteps=max_episode_timesteps)
            assert self.evaluation_environment.states() == environment.states()
            assert self.evaluation_environment.actions() == environment.actions()

        self.is_agent_external = isinstance(agent, Agent)
        kwargs = dict(parallel_interactions=num_parallel)
        self.agent = Agent.create(agent=agent,
                                  environment=environment,
                                  **kwargs)
        self.save_best_agent = save_best_agent

        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()
        self.episode_agent_seconds = list()
        self.evaluation_rewards = list()
        self.evaluation_timesteps = list()
        self.evaluation_seconds = list()
        self.evaluation_agent_seconds = list()
Esempio n. 21
0
    def __init__(self, agent, learner, environment, p_network, global_dict, report_frequency,
                 algorithm, callback=None, callback_episode_frequency=None, callback_timestep_frequency=None,
                 parallel_interactions=1, num_episodes=None, **kwargs
                 ):
        if isinstance(environment, BaseEnvironment):
            fruit_environment = environment
            self.tf_environment = TensorForcePlugin.convert(environment)
        else:
            environment = Environment.create(environment=environment)
            fruit_environment = TensorForcePlugin.convert(environment)
            self.tf_environment = environment

        super().__init__(agent=agent, name=learner, environment=fruit_environment, network=p_network,
                         global_dict=global_dict,
                         report_frequency=report_frequency)
        self.algorithm = algorithm
        self.tf_agent = Agent.create(
            algorithm, self.tf_environment, **kwargs
        )
        if not self.tf_agent.model.is_initialized:
            self.tf_agent.initialize()

        self.episode_rewards = list()
        self.episode_timesteps = list()
        self.episode_seconds = list()

        self.parallel_interactions = parallel_interactions
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes

        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r: True)
        elif util.is_iterable(x=callback):
            def sequential_callback(runner):
                result = True
                for fn in callback:
                    x = fn(runner)
                    if isinstance(result, bool):
                        result = result and x
                return result

            self.callback = sequential_callback
        else:
            def boolean_callback(runner):
                result = callback(runner)
                if isinstance(result, bool):
                    return result
                else:
                    return True

            self.callback = boolean_callback
Esempio n. 22
0
    def _process_states_input(self, states, function_name):
        if self.states_spec.is_singleton() and not isinstance(
                states, dict) and not (util.is_iterable(x=states)
                                       and isinstance(states[0], dict)):
            # Single state
            states = np.asarray(states)
            if states.shape == self.states_spec.value().shape:
                # Single state is not batched
                states = ArrayDict(singleton=np.expand_dims(states, axis=0))
                batched = False
                num_instances = 1
                is_iter_of_dicts = None

            else:
                # Single state is batched, iter[state]
                assert states.shape[1:] == self.states_spec.value().shape
                assert type(states) in (tuple, list, np.ndarray)
                num_instances = states.shape[0]
                states = ArrayDict(singleton=states)
                batched = True
                is_iter_of_dicts = True  # Default

        elif util.is_iterable(x=states):
            # States is batched, iter[dict[state]]
            batched = True
            num_instances = len(states)
            is_iter_of_dicts = True
            assert type(states) in (tuple, list)
            if num_instances == 0:
                raise TensorforceError.value(name=function_name,
                                             argument='len(states)',
                                             value=num_instances,
                                             hint='= 0')
            for n, state in enumerate(states):
                if not isinstance(state, dict):
                    raise TensorforceError.type(
                        name=function_name,
                        argument='states[{}]'.format(n),
                        dtype=type(state),
                        hint='is not dict')
            # Turn iter of dicts into dict of arrays
            # (Doesn't use self.states_spec since states also contains auxiliaries)
            states = [ArrayDict(state) for state in states]
            states = states[0].fmap(
                function=(lambda *xs: np.stack(xs, axis=0)),
                zip_values=states[1:])

        elif isinstance(states, dict):
            # States is dict, turn into arrays
            states = ArrayDict(states)
            name, spec = self.states_spec.item()
            if name is None:
                name = 'state'

            if states[name].shape == spec.shape:
                # States is not batched, dict[state]
                states = states.fmap(
                    function=(lambda state: np.expand_dims(state, axis=0)))
                batched = False
                num_instances = 1
                is_iter_of_dicts = None

            else:
                # States is batched, dict[iter[state]]
                assert states[name].shape[1:] == spec.shape
                assert type(states[name]) in (tuple, list, np.ndarray)
                batched = True
                num_instances = states[name].shape[0]
                is_iter_of_dicts = False
                if num_instances == 0:
                    raise TensorforceError.value(name=function_name,
                                                 argument='len(states)',
                                                 value=num_instances,
                                                 hint='= 0')

        else:
            raise TensorforceError.type(name=function_name,
                                        argument='states',
                                        dtype=type(states),
                                        hint='is not array/tuple/list/dict')

        # Check number of inputs
        if any(state.shape[0] != num_instances for state in states.values()):
            raise TensorforceError.value(
                name=function_name,
                argument='len(states)',
                value=[state.shape[0] for state in states.values()],
                hint='inconsistent')

        return states, batched, num_instances, is_iter_of_dicts
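
For reference, the branching above accepts the following input layouts for a singleton float state of shape (4,); the arrays and the 'state' key are assumptions for the example (the method resolves the actual key via self.states_spec.item()).

import numpy as np

single = np.zeros(4)                                   # single state, not batched
batched = np.zeros((8, 4))                             # batched array of states
as_dict = {'state': np.zeros(4)}                       # dict[state], not batched
as_dict_batched = {'state': np.zeros((8, 4))}          # dict[iter[state]], batched
as_list_of_dicts = [{'state': np.zeros(4)} for _ in range(8)]  # iter[dict[state]]
# In each case the method returns (states, batched, num_instances, is_iter_of_dicts),
# with states normalized to a dict of arrays whose leading dimension is the batch size.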
Esempio n. 23
0
    def add_variable(
        self, name, dtype, shape, is_trainable, initializer='zeros', is_saved=True, summarize=None,
        shared=None
    ):
        # name
        if not util.is_valid_name(name=name):
            raise TensorforceError.value(name='Module.add_variable', argument='name', value=name)
        elif name in self.variables:
            raise TensorforceError.exists(name='variable', value=name)
        # dtype
        if not util.is_valid_type(dtype=dtype):
            raise TensorforceError.value(name='Module.add_variable', argument='dtype', value=dtype)
        # shape
        if not util.is_iterable(x=shape) or not all(isinstance(dims, int) for dims in shape):
            raise TensorforceError.value(name='Module.add_variable', argument='shape', value=shape)
        elif not all(dims > 0 for dims in shape):
            raise TensorforceError.value(name='Module.add_variable', argument='shape', value=shape)
        # is_trainable
        if not isinstance(is_trainable, bool):
            raise TensorforceError.type(
                name='Module.add_variable', argument='is_trainable', dtype=type(is_trainable)
            )
        elif is_trainable and dtype != 'float':
            raise TensorforceError.value(
                name='Module.add_variable', argument='is_trainable', value=is_trainable,
                condition='dtype != float'
            )
        # initializer
        initializer_names = (
            'normal', 'normal-relu', 'orthogonal', 'orthogonal-relu', 'zeros', 'ones'
        )
        if not isinstance(initializer, (util.py_dtype(dtype=dtype), np.ndarray, tf.Tensor)) and \
                initializer not in initializer_names:
            raise TensorforceError.value(
                name='Module.add_variable', argument='initializer', value=initializer
            )
        elif isinstance(initializer, np.ndarray) and \
                initializer.dtype != util.np_dtype(dtype=dtype):
            raise TensorforceError.type(
                name='Module.add_variable', argument='initializer', dtype=type(initializer)
            )
        elif isinstance(initializer, tf.Tensor) and util.dtype(x=initializer) != dtype:
            raise TensorforceError.type(
                name='Module.add_variable', argument='initializer', dtype=type(initializer)
            )
        # is_saved
        if not isinstance(is_saved, bool):
            raise TensorforceError.type(
                name='Module.add_variable', argument='is_saved', dtype=type(is_saved)
            )
        # summarize
        if summarize is not None and not isinstance(summarize, bool):
            raise TensorforceError.type(
                name='Module.add_variable', argument='summarize', dtype=type(summarize)
            )
        # shared
        if shared is not None and not isinstance(shared, str):
            raise TensorforceError.type(
                name='Module.add_variable', argument='shared', dtype=type(shared)
            )

        variable = None

        if shared is not None and len(self.graph.get_collection(name=shared)) > 0:
            # Retrieve shared variable from TensorFlow
            collection = self.graph.get_collection(name=shared)
            if len(collection) > 1:
                raise TensorforceError.unexpected()
            variable = collection[0]

        else:
            tf_dtype = util.tf_dtype(dtype=dtype)

            # Variable initializer
            if isinstance(initializer, util.py_dtype(dtype=dtype)):
                initializer = tf.constant(value=initializer, dtype=tf_dtype, shape=shape)
            elif isinstance(initializer, np.ndarray):
                if initializer.shape != shape:
                    raise TensorforceError.mismatch(
                        name='Module.add_variable', value1='shape', value2='initializer'
                    )
                initializer = tf.constant(value=initializer, dtype=tf_dtype)
            elif isinstance(initializer, tf.Tensor):
                if util.shape(x=initializer) != shape:
                    raise TensorforceError.mismatch(
                        name='Module.add_variable', value1='shape', value2='initializer'
                    )
                initializer = initializer
            elif not isinstance(initializer, str):
                raise TensorforceError("Invalid variable initializer: {}".format(initializer))
            elif initializer[:6] == 'normal':
                if dtype != 'float':
                    raise TensorforceError(
                        message="Invalid variable initializer value for non-float variable: {}.".format(
                            initializer
                        )
                    )
                if initializer[6:] == '-relu':
                    stddev = min(0.1, sqrt(2.0 / util.product(xs=shape[:-1])))
                else:
                    stddev = min(0.1, sqrt(2.0 / (util.product(xs=shape[:-1]) + shape[-1])))
                initializer = tf.random.normal(shape=shape, stddev=stddev, dtype=tf_dtype)
            elif initializer[:10] == 'orthogonal':
                if dtype != 'float':
                    raise TensorforceError(
                        message="Invalid variable initializer value for non-float variable: {}.".format(
                            initializer
                        )
                    )
                if len(shape) < 2:
                    raise TensorforceError(
                        message="Invalid variable initializer value for 0/1-rank variable: {}.".format(
                            initializer
                        )
                    )
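                # Orthogonal initializer: take the SVD of a random normal matrix of shape
                # (prod(shape[:-1]), shape[-1]) and keep the factor that has shape[-1] columns.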
                normal = np.random.normal(size=(util.product(xs=shape[:-1]), shape[-1]))
                u, _, v = np.linalg.svd(a=normal, full_matrices=False)
                orthogonal = u if u.shape[1] == shape[-1] else v
                if initializer[10:] == '-relu':
                    orthogonal = orthogonal * sqrt(2.0)
                initializer = tf.constant(value=orthogonal.reshape(shape), dtype=tf_dtype)
            elif initializer == 'zeros':
                initializer = tf.zeros(shape=shape, dtype=tf_dtype)
            elif initializer == 'ones':
                initializer = tf.ones(shape=shape, dtype=tf_dtype)

            # Variable
            variable = tf.Variable(
                initial_value=initializer, trainable=is_trainable, validate_shape=True, name=name,
                dtype=tf_dtype, shape=shape
            )

            # Register shared variable with TensorFlow
            if shared is not None:
                self.graph.add_to_collection(name=shared, value=variable)

        # Register variable
        self.variables[name] = variable
        if is_trainable:
            self.trainable_variables[name] = variable
        if is_saved:
            self.saved_variables[name] = variable

        # Add summary
        if (summarize is None and is_trainable) or summarize:
            variable = self.add_summary(
                label='variables', name=name, tensor=variable, mean_variance=True
            )
            variable = self.add_summary(label='variables-histogram', name=name, tensor=variable)

        return variable
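
A hypothetical usage sketch for the variable-creation method above (the `module` instance, the chosen values, and the assumption that the remaining arguments have defaults are illustrative, not documented API):

# Hypothetical call; assumes `module` exposes add_variable as validated above.
# Creates a trainable float matrix with ReLU-scaled orthogonal initialization.
kernel = module.add_variable(
    name='kernel', dtype='float', shape=(64, 32), is_trainable=True,
    initializer='orthogonal-relu', is_saved=True
)
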
Example n. 24
0
    def unify(self, *, other, name='TensorSpec.unify'):
        # Unify type
        if self.type is None:
            dtype = other.type
        elif other.type is None:
            dtype = self.type
        elif util.is_iterable(x=self.type):
            if util.is_iterable(x=other.type):
                if set(self.type) <= set(other.type):
                    dtype = self.type
                elif set(other.type) <= set(self.type):
                    dtype = other.type
                else:
                    raise TensorforceError.mismatch(
                        name=name, argument='type', value1=self.type, value2=other.type
                    )
            elif other.type in self.type:
                dtype = other.type
            else:
                raise TensorforceError.mismatch(
                    name=name, argument='type', value1=self.type, value2=other.type
                )
        elif util.is_iterable(x=other.type):
            if self.type in other.type:
                dtype = self.type
            else:
                raise TensorforceError.mismatch(
                    name=name, argument='type', value1=self.type, value2=other.type
                )
        elif self.type == other.type:
            dtype = self.type
        else:
            raise TensorforceError.mismatch(
                name=name, argument='type', value1=self.type, value2=other.type
            )

        # Unify shape
        if self.shape is None:
            shape = other.shape
        elif other.shape is None:
            shape = self.shape
        else:
            reverse_shape = list()
            start = len(self.shape) - 1
            if self.shape[-1] is None:
                reverse_shape.extend(other.shape[len(self.shape) - 1:])
                start = len(self.shape) - 2
            elif other.shape[-1] is None:
                reverse_shape.extend(self.shape[len(other.shape) - 1:])
                start = len(other.shape) - 2
            elif len(self.shape) != len(other.shape):
                raise TensorforceError.mismatch(
                    name=name, argument='rank', value1=self.rank, value2=other.rank
                )
            for n in range(start, -1, -1):
                if self.shape[n] == 0:
                    reverse_shape.append(other.shape[n])
                elif other.shape[n] == 0:
                    reverse_shape.append(self.shape[n])
                elif self.shape[n] == -1 and other.shape[n] > 0:
                    reverse_shape.append(other.shape[n])
                elif other.shape[n] == -1 and self.shape[n] > 0:
                    reverse_shape.append(self.shape[n])
                elif self.shape[n] == other.shape[n]:
                    reverse_shape.append(self.shape[n])
                else:
                    raise TensorforceError.mismatch(
                        name=name, argument='shape', value1=self.shape, value2=other.shape
                    )
            shape = tuple(reversed(reverse_shape))

        # Unify min_value
        if dtype == 'bool':
            min_value = None
        elif self.type != 'bool' and self.min_value is not None:
            if other.type != 'bool' and other.min_value is not None:
                if isinstance(self.min_value, np.ndarray) or \
                        isinstance(other.min_value, np.ndarray):
                    min_value = np.minimum(self.min_value, other.min_value)
                elif self.min_value < other.min_value:
                    min_value = other.min_value
                else:
                    min_value = self.min_value
            else:
                min_value = self.min_value
        elif other.type != 'bool' and other.min_value is not None:
            min_value = other.min_value
        else:
            min_value = None

        # Unify max_value
        if dtype == 'bool':
            max_value = None
        elif self.type != 'bool' and self.max_value is not None:
            if other.type != 'bool' and other.max_value is not None:
                if isinstance(self.max_value, np.ndarray) or \
                        isinstance(other.max_value, np.ndarray):
                    max_value = np.maximum(self.max_value, other.max_value)
                elif self.max_value < other.max_value:
                    max_value = other.max_value
                else:
                    max_value = self.max_value
            else:
                max_value = self.max_value
        elif other.type != 'bool' and other.max_value is not None:
            max_value = other.max_value
        else:
            max_value = None
        if min_value is not None and max_value is not None:
            if isinstance(min_value, np.ndarray) or isinstance(max_value, np.ndarray):
                if (min_value > max_value).any():
                    raise TensorforceError.mismatch(
                        name=name, argument='min/max_value', value1=min_value, value2=max_value
                    )
            else:
                if min_value > max_value:
                    raise TensorforceError.mismatch(
                        name=name, argument='min/max_value', value1=min_value, value2=max_value
                    )

        # Unify num_values
        if dtype != 'int' and (not isinstance(dtype, tuple) or 'int' not in dtype):
            num_values = None
        elif self.type == 'int' and self.num_values is not None:
            if other.type == 'int' and other.num_values is not None:
                if self.num_values == 0:
                    num_values = other.num_values
                elif other.num_values == 0:
                    num_values = self.num_values
                elif self.num_values == other.num_values:
                    num_values = self.num_values
                else:
                    raise TensorforceError.mismatch(
                        name=name, argument='num_values', value1=self.num_values,
                        value2=other.num_values
                    )
            else:
                num_values = self.num_values
        elif other.type == 'int' and other.num_values is not None:
            num_values = other.num_values
        else:
            num_values = None
        if num_values is not None:
            min_value = None
            max_value = None

        # Unified tensor spec
        return TensorSpec(
            type=dtype, shape=shape, min_value=min_value, max_value=max_value, num_values=num_values
        )
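
A small sketch of how the unification above behaves for two compatible specs (the values are hypothetical, and it assumes the omitted TensorSpec arguments default to None as in the return statement above):

spec1 = TensorSpec(type='float', shape=(-1, 4))   # -1 marks a dimension that is not yet fixed
spec2 = TensorSpec(type='float', shape=(10, 4))
unified = spec1.unify(other=spec2)
# unified.type == 'float', unified.shape == (10, 4);
# min_value, max_value and num_values remain None since neither spec sets them.
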
Example n. 25
0
    def run(
        self,
        # General
        num_episodes=None,
        num_timesteps=None,
        num_updates=None,
        join_agent_calls=False,
        sync_timesteps=False,
        sync_episodes=False,
        num_sleep_secs=0.01,
        # Callback
        callback=None,
        callback_episode_frequency=None,
        callback_timestep_frequency=None,
        # Tqdm
        use_tqdm=True,
        mean_horizon=1,
        # Evaluation
        evaluation_callback=None,
    ):
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if num_updates is None:
            self.num_updates = float('inf')
        else:
            self.num_updates = num_updates
        self.join_agent_calls = join_agent_calls
        if self.join_agent_calls:
            sync_timesteps = True
        self.sync_timesteps = sync_timesteps
        self.sync_episodes = sync_episodes
        self.num_sleep_secs = num_sleep_secs

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r, p: True)
        elif util.is_iterable(x=callback):

            def sequential_callback(runner, parallel):
                result = True
                for fn in callback:
                    x = fn(runner, parallel)
                    if isinstance(x, bool):
                        result = result and x
                return result

            self.callback = sequential_callback
        else:

            def boolean_callback(runner, parallel):
                result = callback(runner, parallel)
                if isinstance(result, bool):
                    return result
                else:
                    return True

            self.callback = boolean_callback

        # Timestep/episode/update counter
        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Tqdm
        if use_tqdm:
            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                bar_format = (
                    '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep='
                    '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                    '{postfix[4]:.1f}%]')
                postfix = [0.0, 0, 0.0, 0.0, 0.0]
                self.tqdm = tqdm(desc='Episodes',
                                 total=self.num_episodes,
                                 bar_format=bar_format,
                                 initial=self.episodes,
                                 postfix=postfix)
                self.tqdm_last_update = self.episodes

                def tqdm_callback(runner, parallel):
                    mean_reward = float(
                        np.mean(runner.episode_rewards[-mean_horizon:]))
                    mean_ts_per_ep = int(
                        np.mean(runner.episode_timesteps[-mean_horizon:]))
                    mean_sec_per_ep = float(
                        np.mean(runner.episode_seconds[-mean_horizon:]))
                    mean_agent_sec = float(
                        np.mean(runner.episode_agent_seconds[-mean_horizon:]))
                    mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                    mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                    runner.tqdm.postfix[0] = mean_reward
                    runner.tqdm.postfix[1] = mean_ts_per_ep
                    runner.tqdm.postfix[2] = mean_sec_per_ep
                    runner.tqdm.postfix[3] = mean_ms_per_ts
                    runner.tqdm.postfix[4] = mean_rel_agent
                    runner.tqdm.update(n=(runner.episodes -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.episodes
                    return inner_callback(runner, parallel)

            else:
                # Timestep-based tqdm
                self.tqdm = tqdm(desc='Timesteps',
                                 total=self.num_timesteps,
                                 initial=self.timesteps,
                                 postfix=dict(mean_reward='n/a'))
                self.tqdm_last_update = self.timesteps

                def tqdm_callback(runner, parallel):
                    # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                    # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                    # mean_reward = sum_timesteps_reward / num_episodes
                    runner.tqdm.set_postfix(mean_reward='n/a')
                    runner.tqdm.update(n=(runner.timesteps -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.timesteps
                    return inner_callback(runner, parallel)

            self.callback = tqdm_callback

        # Evaluation
        if self.evaluation_environment is None:
            assert evaluation_callback is None
            assert self.save_best_agent is None
        else:
            if evaluation_callback is None:
                self.evaluation_callback = (lambda r: None)
            else:
                self.evaluation_callback = evaluation_callback
            if self.save_best_agent is not None:
                inner_evaluation_callback = self.evaluation_callback

                def mean_reward_callback(runner):
                    result = inner_evaluation_callback(runner)
                    if result is None:
                        return runner.evaluation_reward
                    else:
                        return result

                self.evaluation_callback = mean_reward_callback
                self.best_evaluation_score = None

        # Required if agent was previously stopped mid-episode
        self.agent.reset()

        # Reset environments and episode statistics
        for environment in self.environments:
            environment.start_reset()
        self.episode_reward = [0.0 for _ in self.environments]
        self.episode_timestep = [0 for _ in self.environments]
        if self.join_agent_calls:
            self.episode_agent_second = 0.0
            self.episode_start = time.time()
        else:
            self.episode_agent_second = [0.0 for _ in self.environments]
            self.episode_start = [time.time() for _ in self.environments]
        environments = list(self.environments)

        if self.evaluation_environment is not None:
            self.evaluation_environment.start_reset()
            self.evaluation_reward = 0.0
            self.evaluation_timestep = 0
            if not self.join_agent_calls:
                self.evaluation_agent_second = 0.0
            environments.append(self.evaluation_environment)

        self.finished = False
        self.prev_terminals = [0 for _ in environments]
        self.states = [None for _ in environments]
        self.terminals = [None for _ in environments]
        self.rewards = [None for _ in environments]

        if self.join_agent_calls:
            self.joint

        # Runner loop
        while not self.finished:

            if self.join_agent_calls:
                # Retrieve observations (only if not already terminated)
                self.observations = [None for _ in environments]
                while any(observation is None
                          for observation in self.observations):
                    for n, (environment, terminal) in enumerate(
                            zip(environments, self.prev_terminals)):
                        if self.observations[n] is not None:
                            continue
                        if terminal == 0:
                            self.observations[n] = environment.receive_execute()
                        else:
                            self.observations[n] = (None, terminal, None)
                self.states, self.terminals, self.rewards = map(
                    list, zip(*self.observations))
                self.terminals = [
                    terminal if terminal is None else int(terminal)
                    for terminal in self.terminals
                ]

                self.handle_observe_joint()
                self.handle_act_joint()
                # if not self.join_agent_calls:  # !!!!!!
                #     self.episode_seconds.append(time.time() - episode_start[parallel])
                #     self.episode_agent_seconds.append(self.episode_agent_second[parallel])

            else:
                self.terminals = list(self.prev_terminals)

            if not self.sync_timesteps:
                no_environment_ready = True

            # Parallel environments loop
            for parallel, environment in enumerate(environments):

                # Is evaluation environment?
                evaluation = (parallel == len(self.environments))

                if self.sync_episodes and self.prev_terminals[parallel] > 0:
                    # Continue if episode already terminated
                    continue

                elif self.join_agent_calls:
                    pass

                elif self.sync_timesteps:
                    # Wait until environment is ready
                    while True:
                        observation = environment.receive_execute()
                        if observation is not None:
                            break

                else:
                    # Check whether environment is ready, otherwise continue
                    observation = environment.receive_execute()
                    if observation is None:
                        continue
                    no_environment_ready = False

                if not self.join_agent_calls:
                    (self.states[parallel], self.terminals[parallel],
                     self.rewards[parallel]) = observation
                    if self.terminals[parallel] is not None:
                        self.terminals[parallel] = int(self.terminals[parallel])

                if self.terminals[parallel] is None:
                    # Initial act
                    if evaluation:
                        self.handle_act_evaluation()
                    else:
                        self.handle_act(parallel=parallel)

                else:
                    # Observe
                    if evaluation:
                        self.handle_observe_evaluation()
                    else:
                        self.handle_observe(parallel=parallel)

                    if self.terminals[parallel] == 0:
                        # Act
                        if evaluation:
                            self.handle_act_evaluation()
                        else:
                            self.handle_act(parallel=parallel)

                    else:
                        # Terminal
                        if evaluation:
                            self.handle_terminal_evaluation()
                        else:
                            self.handle_terminal(parallel=parallel)

                # # Update global timesteps/episodes/updates
                # self.global_timesteps = self.agent.timesteps
                # self.global_episodes = self.agent.episodes
                # self.global_updates = self.agent.updates

            if self.sync_episodes and all(terminal > 0
                                          for terminal in self.terminals):
                # Reset if all episodes terminated
                self.prev_terminals = [0 for _ in environments]
                for environment in environments:
                    environment.start_reset()
            else:
                self.prev_terminals = list(self.terminals)

            if not self.sync_timesteps and no_environment_ready:
                # Sleep if no environment was ready
                time.sleep(self.num_sleep_secs)
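
A hedged invocation sketch for this run() variant (it assumes a runner instance constructed elsewhere; the callback follows the (runner, parallel) signature used by the callbacks defined above):

def log_progress(runner, parallel):
    # Returning False would stop the run early.
    print('episodes:', runner.episodes, 'timesteps:', runner.timesteps, 'updates:', runner.updates)
    return True

runner.run(num_episodes=100, callback=log_progress, callback_episode_frequency=10)
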
Example n. 26
0
    def run(
            self,
            # General
            num_episodes=None,
            num_timesteps=None,
            num_updates=None,
            num_sleep_secs=0.01,
            sync_timesteps=False,
            sync_episodes=False,
            # Callback
            callback=None,
            callback_episode_frequency=None,
            callback_timestep_frequency=None,
            # Tqdm
            use_tqdm=True,
            mean_horizon=1,
            # Evaluation
            evaluation_callback=None):
        # General
        if num_episodes is None:
            self.num_episodes = float('inf')
        else:
            self.num_episodes = num_episodes
        if num_timesteps is None:
            self.num_timesteps = float('inf')
        else:
            self.num_timesteps = num_timesteps
        if num_updates is None:
            self.num_updates = float('inf')
        else:
            self.num_updates = num_updates
        self.num_sleep_secs = num_sleep_secs
        self.sync_timesteps = sync_timesteps
        self.sync_episodes = sync_episodes

        # Callback
        assert callback_episode_frequency is None or callback_timestep_frequency is None
        if callback_episode_frequency is None and callback_timestep_frequency is None:
            callback_episode_frequency = 1
        if callback_episode_frequency is None:
            self.callback_episode_frequency = float('inf')
        else:
            self.callback_episode_frequency = callback_episode_frequency
        if callback_timestep_frequency is None:
            self.callback_timestep_frequency = float('inf')
        else:
            self.callback_timestep_frequency = callback_timestep_frequency
        if callback is None:
            self.callback = (lambda r, p: True)
        elif util.is_iterable(x=callback):

            def sequential_callback(runner, parallel):
                result = True
                for fn in callback:
                    x = fn(runner, parallel)
                    if isinstance(x, bool):
                        result = result and x
                return result

            self.callback = sequential_callback
        else:

            def boolean_callback(runner, parallel):
                result = callback(runner, parallel)
                if isinstance(result, bool):
                    return result
                else:
                    return True

            self.callback = boolean_callback

        # Timestep/episode/update counter
        self.timesteps = 0
        self.episodes = 0
        self.updates = 0

        # Tqdm
        if use_tqdm:
            if hasattr(self, 'tqdm'):
                self.tqdm.close()

            assert self.num_episodes != float('inf') or self.num_timesteps != float('inf')
            inner_callback = self.callback

            if self.num_episodes != float('inf'):
                # Episode-based tqdm (default option if both num_episodes and num_timesteps set)
                assert self.num_episodes != float('inf')
                bar_format = (
                    '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}, reward={postfix[0]:.2f}, ts/ep='
                    '{postfix[1]}, sec/ep={postfix[2]:.2f}, ms/ts={postfix[3]:.1f}, agent='
                    '{postfix[4]:.1f}%]')
                postfix = [0.0, 0, 0.0, 0.0, 0.0]
                self.tqdm = tqdm(desc='Episodes',
                                 total=self.num_episodes,
                                 bar_format=bar_format,
                                 initial=self.episodes,
                                 postfix=postfix)
                self.tqdm_last_update = self.episodes

                def tqdm_callback(runner, parallel):
                    mean_reward = float(
                        np.mean(runner.episode_rewards[-mean_horizon:]))
                    mean_ts_per_ep = int(
                        np.mean(runner.episode_timesteps[-mean_horizon:]))
                    mean_sec_per_ep = float(
                        np.mean(runner.episode_seconds[-mean_horizon:]))
                    mean_agent_sec = float(
                        np.mean(runner.episode_agent_seconds[-mean_horizon:]))
                    mean_ms_per_ts = mean_sec_per_ep * 1000.0 / mean_ts_per_ep
                    mean_rel_agent = mean_agent_sec * 100.0 / mean_sec_per_ep
                    runner.tqdm.postfix[0] = mean_reward
                    runner.tqdm.postfix[1] = mean_ts_per_ep
                    runner.tqdm.postfix[2] = mean_sec_per_ep
                    runner.tqdm.postfix[3] = mean_ms_per_ts
                    runner.tqdm.postfix[4] = mean_rel_agent
                    runner.tqdm.update(n=(runner.episodes -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.episodes
                    return inner_callback(runner, parallel)

            else:
                # Timestep-based tqdm
                self.tqdm = tqdm(desc='Timesteps',
                                 total=self.num_timesteps,
                                 initial=self.timesteps,
                                 postfix=dict(mean_reward='n/a'))
                self.tqdm_last_update = self.timesteps

                def tqdm_callback(runner, parallel):
                    # sum_timesteps_reward = sum(runner.timestep_rewards[num_mean_reward:])
                    # num_timesteps = min(num_mean_reward, runner.episode_timestep)
                    # mean_reward = sum_timesteps_reward / num_episodes
                    runner.tqdm.set_postfix(mean_reward='n/a')
                    runner.tqdm.update(n=(runner.timesteps -
                                          runner.tqdm_last_update))
                    runner.tqdm_last_update = runner.timesteps
                    return inner_callback(runner, parallel)

            self.callback = tqdm_callback

        # Evaluation
        if self.evaluation_environment is None:
            assert evaluation_callback is None
            assert self.save_best_agent is False
        else:
            if evaluation_callback is None:
                self.evaluation_callback = (lambda r: None)
            else:
                self.evaluation_callback = evaluation_callback
            if self.save_best_agent is not False:
                inner_evaluation_callback = self.evaluation_callback

                def mean_reward_callback(runner):
                    result = inner_evaluation_callback(runner)
                    if result is None:
                        return runner.evaluation_reward
                    else:
                        return result

                self.evaluation_callback = mean_reward_callback
                self.best_evaluation_score = None

        # Reset agent
        self.agent.reset()

        # Reset environments and episode statistics
        for environment in self.environments:
            environment.start_reset()
        self.episode_reward = [0.0 for _ in self.environments]
        self.episode_timestep = [0 for _ in self.environments]
        self.episode_agent_second = [0.0 for _ in self.environments]
        episode_start = [time.time() for _ in self.environments]
        environments = list(self.environments)

        if self.evaluation_environment is not None:
            self.evaluation_environment.start_reset()
            self.evaluation_reward = 0.0
            self.evaluation_timestep = 0
            self.evaluation_agent_second = 0.0
            evaluation_start = time.time()
            environments.append(self.evaluation_environment)

        if self.sync_episodes:
            terminated = [False for _ in environments]

        # Runner loop
        while True:

            if not self.sync_timesteps:
                no_environment_ready = True

            # Parallel environments loop
            for parallel, environment in enumerate(environments):

                # Is evaluation environment?
                evaluation = (parallel == len(self.environments))

                if self.sync_episodes and terminated[parallel]:
                    # Continue if episode terminated
                    continue

                if self.sync_timesteps:
                    # Wait until environment is ready
                    while True:
                        observation = environment.retrieve_execute()
                        if observation is not None:
                            break
                        time.sleep(num_sleep_secs)

                else:
                    # Check whether environment is ready
                    observation = environment.retrieve_execute()
                    if observation is None:
                        continue
                    no_environment_ready = False

                states, terminal, reward = observation

                # Episode start or evaluation
                if terminal is None:
                    # Retrieve actions from agent
                    agent_start = time.time()
                    actions = self.agent.act(states=states,
                                             parallel=(parallel -
                                                       int(evaluation)),
                                             evaluation=evaluation)

                    if evaluation:
                        self.evaluation_agent_second += time.time() - agent_start
                        self.evaluation_timestep += 1
                    else:
                        self.timesteps += 1
                        self.episode_agent_second[parallel] += time.time() - agent_start
                        self.episode_timestep[parallel] += 1

                    # Execute actions in environment
                    environment.start_execute(actions=actions)

                    continue

                elif isinstance(terminal, bool):
                    terminal = int(terminal)

                # Observe unless episode just started or evaluation
                # assert (terminal is None) == (self.episode_timestep[parallel] == 0)
                # if terminal is not None and not evaluation:
                if evaluation:
                    self.evaluation_reward += reward
                else:
                    agent_start = time.time()
                    updated = self.agent.observe(terminal=terminal,
                                                 reward=reward,
                                                 parallel=parallel)
                    self.updates += int(updated)
                    self.episode_agent_second[parallel] += time.time() - agent_start
                    self.episode_reward[parallel] += reward

                # # Update global timesteps/episodes/updates
                # self.global_timesteps = self.agent.timesteps
                # self.global_episodes = self.agent.episodes
                # self.global_updates = self.agent.updates

                # Callback plus experiment termination check
                if not evaluation and \
                        self.episode_timestep[parallel] % self.callback_timestep_frequency == 0 and \
                        not self.callback(self, parallel):
                    return

                if terminal > 0:
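                    # terminal follows the agent convention: 1 = terminal, 2 = episode aborted
                    # (0 = not terminal).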
                    if evaluation:
                        # Update experiment statistics
                        self.evaluation_rewards.append(self.evaluation_reward)
                        self.evaluation_timesteps.append(
                            self.evaluation_timestep)
                        self.evaluation_seconds.append(time.time() -
                                                       evaluation_start)
                        self.evaluation_agent_seconds.append(
                            self.evaluation_agent_second)

                        # Evaluation callback
                        if self.save_best_agent is not False:
                            evaluation_score = self.evaluation_callback(self)
                            assert isinstance(evaluation_score, float)
                            if self.best_evaluation_score is None:
                                self.best_evaluation_score = evaluation_score
                            elif evaluation_score > self.best_evaluation_score:
                                self.best_evaluation_score = evaluation_score
                                if self.save_best_agent is True:
                                    self.agent.save(filename='best-model',
                                                    append_timestep=False)
                                else:
                                    self.agent.save(
                                        directory=self.save_best_agent,
                                        filename='best-model',
                                        append_timestep=False)
                        else:
                            self.evaluation_callback(self)

                    else:
                        # Increment episode counter (after calling callback)
                        self.episodes += 1

                        # Update experiment statistics
                        self.episode_rewards.append(
                            self.episode_reward[parallel])
                        self.episode_timesteps.append(
                            self.episode_timestep[parallel])
                        self.episode_seconds.append(time.time() -
                                                    episode_start[parallel])
                        self.episode_agent_seconds.append(
                            self.episode_agent_second[parallel])

                        # Callback
                        if self.episodes % self.callback_episode_frequency == 0 and \
                                not self.callback(self, parallel):
                            return

                # Terminate experiment if too long
                if self.timesteps >= self.num_timesteps:
                    return
                elif self.episodes >= self.num_episodes:
                    return
                elif self.updates >= self.num_updates:
                    return
                elif self.agent.should_stop():
                    return

                # Check whether episode terminated
                if terminal > 0:

                    if self.sync_episodes:
                        terminated[parallel] = True

                    if evaluation:
                        # Reset environment and episode statistics
                        environment.start_reset()
                        self.evaluation_reward = 0.0
                        self.evaluation_timestep = 0
                        self.evaluation_agent_second = 0.0
                        evaluation_start = time.time()

                    else:
                        # Reset environment and episode statistics
                        environment.start_reset()
                        self.episode_reward[parallel] = 0.0
                        self.episode_timestep[parallel] = 0
                        self.episode_agent_second[parallel] = 0.0
                        episode_start[parallel] = time.time()

                else:
                    # Retrieve actions from agent
                    agent_start = time.time()
                    actions = self.agent.act(states=states,
                                             parallel=(parallel -
                                                       int(evaluation)),
                                             evaluation=evaluation)

                    if evaluation:
                        self.evaluation_agent_second += time.time() - agent_start
                        self.evaluation_timestep += 1
                    else:
                        self.timesteps += 1
                        self.episode_agent_second[parallel] += time.time() - agent_start
                        self.episode_timestep[parallel] += 1

                    # Execute actions in environment
                    environment.start_execute(actions=actions)

            if not self.sync_timesteps and no_environment_ready:
                # Sleep if no environment was ready
                time.sleep(num_sleep_secs)

            if self.sync_episodes and all(terminated):
                # Reset if all episodes terminated
                terminated = [False for _ in environments]
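
As above, a hypothetical call for this variant, here stopping on a timestep budget while keeping parallel episodes in lockstep (the runner instance is assumed to exist):

runner.run(num_timesteps=50000, sync_episodes=True)
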
Example n. 27
0
    def __init__(self,
                 agent,
                 environment=None,
                 max_episode_timesteps=None,
                 evaluation=False,
                 num_parallel=None,
                 environments=None,
                 remote=None,
                 blocking=False,
                 host=None,
                 port=None):
        if environment is None and environments is None:
            assert num_parallel is not None and remote == 'socket-client'
            environments = [None for _ in range(num_parallel)]

        elif environment is None:
            assert environments is not None
            assert num_parallel is None or num_parallel == len(environments)
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(name='parallel-runner',
                                            argument='environments',
                                            value=environments)
            elif len(environments) == 0:
                raise TensorforceError.value(name='parallel-runner',
                                             argument='environments',
                                             value=environments)
            num_parallel = len(environments)
            environments = list(environments)

        elif num_parallel is None:
            assert environments is None
            num_parallel = 1
            environments = [environment]

        else:
            assert environments is None
            assert not isinstance(environment, Environment)
            environments = [environment for _ in range(num_parallel)]

        if port is None or isinstance(port, int):
            if isinstance(host, str):
                port = [port + n for n in range(num_parallel)]
            else:
                port = [port for _ in range(num_parallel)]
        else:
            assert len(port) == num_parallel
        if host is None or isinstance(host, str):
            host = [host for _ in range(num_parallel)]
        else:
            assert len(host) == num_parallel

        self.environments = list()
        self.is_environment_external = isinstance(environments[0], Environment)
        environment = Environment.create(
            environment=environments[0],
            max_episode_timesteps=max_episode_timesteps,
            remote=remote,
            blocking=blocking,
            host=host[0],
            port=port[0])
        self.is_environment_remote = isinstance(environment, RemoteEnvironment)
        states = environment.states()
        actions = environment.actions()
        self.environments.append(environment)

        for n, environment in enumerate(environments[1:], start=1):
            assert isinstance(environment,
                              Environment) == self.is_environment_external
            environment = Environment.create(
                environment=environment,
                max_episode_timesteps=max_episode_timesteps,
                remote=remote,
                blocking=blocking,
                host=host[n],
                port=port[n])
            assert isinstance(environment,
                              RemoteEnvironment) == self.is_environment_remote
            assert environment.states() == states
            assert environment.actions() == actions
            self.environments.append(environment)

        self.evaluation = evaluation

        self.is_agent_external = isinstance(agent, Agent)
        if num_parallel - int(self.evaluation) > 1:
            self.agent = Agent.create(
                agent=agent,
                environment=environment,
                parallel_interactions=(num_parallel - int(self.evaluation)))
        else:
            self.agent = Agent.create(agent=agent, environment=environment)
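
A hypothetical construction for this initializer (the class name ParallelRunner is only inferred from the 'parallel-runner' error labels above; agent_spec and environment_spec are placeholder specifications):

runner = ParallelRunner(
    agent=agent_spec,                                    # placeholder agent specification
    environments=[environment_spec, environment_spec],   # two parallel environment specifications
    max_episode_timesteps=500
)
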
Example n. 28
0
    def __init__(
        self, agent, environment=None, max_episode_timesteps=None, num_parallel=None,
        environments=None, evaluation=False, remote=None, blocking=False, host=None, port=None
    ):
        if environment is None and environments is None:
            if remote != 'socket-client':
                raise TensorforceError.required(
                    name='Runner', argument='environment or environments'
                )
            if num_parallel is None:
                raise TensorforceError.required(
                    name='Runner', argument='num_parallel', condition='socket-client remote mode'
                )
            environments = [None for _ in range(num_parallel)]

        elif environment is None:
            if environments is None:
                raise TensorforceError.required(
                    name='Runner', argument='environment or environments'
                )
            if not util.is_iterable(x=environments):
                raise TensorforceError.type(
                    name='Runner', argument='environments', value=environments
                )
            if len(environments) <= 1:
                raise TensorforceError.value(
                    name='Runner', argument='len(environments)', value=len(environments)
                )
            if num_parallel is not None and num_parallel != len(environments):
                raise TensorforceError.value(
                    name='Runner', argument='num_parallel', value=num_parallel,
                    hint='!= len(environments)'
                )
            num_parallel = len(environments)
            environments = list(environments)

        elif num_parallel is None:
            if environments is not None:
                raise TensorforceError.invalid(
                    name='Runner', argument='environments', condition='environment is specified'
                )
            if evaluation:
                raise TensorforceError.invalid(
                    name='Runner', argument='evaluation', condition='single environment'
                )
            num_parallel = 1
            environments = [environment]

        else:
            if not isinstance(num_parallel, int):
                raise TensorforceError.value(
                    name='Runner', argument='num_parallel', dtype=type(num_parallel)
                )
            elif num_parallel < 2:
                raise TensorforceError.value(
                    name='Runner', argument='num_parallel', value=num_parallel, hint='< 2'
                )
            if environments is not None:
                raise TensorforceError.invalid(
                    name='Runner', argument='environments', condition='environment is specified'
                )
            if isinstance(environment, Environment):
                raise TensorforceError.type(
                    name='Runner', argument='environment', dtype=type(environment),
                    condition='num_parallel', hint='is not specification'
                )
            environments = [environment for _ in range(num_parallel)]

        if port is None or isinstance(port, int):
            if isinstance(host, str):
                port = [port + n for n in range(num_parallel)]
            else:
                port = [port for _ in range(num_parallel)]
        else:
            if len(port) != num_parallel:
                raise TensorforceError.value(
                    name='Runner', argument='len(port)', value=len(port), hint='!= num_parallel'
                )
        if host is None or isinstance(host, str):
            host = [host for _ in range(num_parallel)]
        else:
            if len(host) != num_parallel:
                raise TensorforceError.value(
                    name='Runner', argument='len(host)', value=len(host), hint='!= num_parallel'
                )

        self.environments = list()
        self.is_environment_external = isinstance(environments[0], Environment)
        environment = Environment.create(
            environment=environments[0], max_episode_timesteps=max_episode_timesteps,
            remote=remote, blocking=blocking, host=host[0], port=port[0]
        )
        self.is_environment_remote = isinstance(environment, RemoteEnvironment)
        states = environment.states()
        actions = environment.actions()
        self.environments.append(environment)

        for n, environment in enumerate(environments[1:], start=1):
            assert isinstance(environment, Environment) == self.is_environment_external
            environment = Environment.create(
                environment=environment, max_episode_timesteps=max_episode_timesteps,
                remote=remote, blocking=blocking, host=host[n], port=port[n]
            )
            assert isinstance(environment, RemoteEnvironment) == self.is_environment_remote
            assert util.is_equal(x=environment.states(), y=states)
            assert util.is_equal(x=environment.actions(), y=actions)
            self.environments.append(environment)

        self.evaluation = evaluation

        self.is_agent_external = isinstance(agent, Agent)
        if num_parallel - int(self.evaluation) > 1:
            self.agent = Agent.create(
                agent=agent, environment=environment,
                parallel_interactions=(num_parallel - int(self.evaluation))
            )
        else:
            self.agent = Agent.create(agent=agent, environment=environment)
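
For this later Runner variant, a sketch that replicates a single environment specification across parallel copies (agent_spec and environment_spec are again placeholders; per the checks above, the specification must not be an already-instantiated Environment when num_parallel is given):

runner = Runner(agent=agent_spec, environment=environment_spec, num_parallel=4)
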
Example n. 29
0
    def observe(self, reward=0.0, terminal=False, parallel=0):
        """
        Observes reward and whether a terminal state is reached, needs to be preceded by `act()`.

        Args:
            reward (float | iter[float]): Reward
                (<span style="color:#00C000"><b>default</b></span>: 0.0).
            terminal (bool | 0 | 1 | 2 | iter[...]): Whether a terminal state is reached, or 2 if
                the episode was aborted
                (<span style="color:#00C000"><b>default</b></span>: false).
            parallel (int, iter[int]): Parallel execution index
                (<span style="color:#00C000"><b>default</b></span>: 0).

        Returns:
            int: Number of performed updates.
        """
        # Check whether inputs are batched
        if util.is_iterable(x=reward):
            reward = np.asarray(reward)
            num_parallel = reward.shape[0]
            if terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)
            if parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=terminal):
            terminal = np.asarray([int(t) for t in terminal])
            num_parallel = terminal.shape[0]
            if reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if parallel == 0:
                assert num_parallel == self.parallel_interactions
                parallel = np.asarray(list(range(num_parallel)))
            else:
                parallel = np.asarray(parallel)

        elif util.is_iterable(x=parallel):
            parallel = np.asarray(parallel)
            num_parallel = parallel.shape[0]
            if reward == 0.0:
                reward = np.asarray([0.0 for _ in range(num_parallel)])
            else:
                reward = np.asarray(reward)
            if terminal is False:
                terminal = np.asarray([0 for _ in range(num_parallel)])
            else:
                terminal = np.asarray(terminal)

        else:
            reward = np.asarray([float(reward)])
            terminal = np.asarray([int(terminal)])
            parallel = np.asarray([int(parallel)])
            num_parallel = 1

        # Check whether shapes/lengths are consistent
        if parallel.shape[0] == 0:
            raise TensorforceError.value(
                name='Agent.observe', argument='len(parallel)', value=parallel.shape[0], hint='= 0'
            )
        if reward.shape != parallel.shape:
            raise TensorforceError.value(
                name='Agent.observe', argument='len(reward)', value=reward.shape,
                hint='!= parallel length'
            )
        if terminal.shape != parallel.shape:
            raise TensorforceError.value(
                name='Agent.observe', argument='len(terminal)', value=terminal.shape,
                hint='!= parallel length'
            )

        # Convert terminal to int if necessary
        if terminal.dtype is util.np_dtype(dtype='bool'):
            zeros = np.zeros_like(terminal, dtype=util.np_dtype(dtype='int'))
            ones = np.ones_like(terminal, dtype=util.np_dtype(dtype='int'))
            terminal = np.where(terminal, ones, zeros)

        # Check whether current timesteps are not completed
        if self.timestep_completed[parallel].any():
            raise TensorforceError(message="Calling agent.observe must be preceded by agent.act.")
        self.timestep_completed[parallel] = True

        # Process per parallel interaction
        num_updates = 0
        for n in range(num_parallel):

            # Buffer inputs
            p = parallel[n]
            self.buffers['terminal'][p].append(terminal[n])
            self.buffers['reward'][p].append(reward[n])

            # Check whether episode is too long
            if self.max_episode_timesteps is not None and \
                    len(self.buffers['terminal'][p]) > self.max_episode_timesteps:
                raise TensorforceError(message="Episode longer than max_episode_timesteps.")

            # Continue if not terminal and buffer_observe
            if terminal[n].item() == 0 and (
                self.config.buffer_observe == 'episode' or
                len(self.buffers['terminal'][p]) < self.config.buffer_observe
            ):
                continue

            # Buffered terminal/reward inputs
            t = np.asarray(self.buffers['terminal'][p], dtype=self.terminal_spec.np_type())
            r = np.asarray(self.buffers['reward'][p], dtype=self.reward_spec.np_type())
            self.buffers['terminal'][p].clear()
            self.buffers['reward'][p].clear()

            # Recorder
            if self.recorder_spec is not None and \
                    self.episodes >= self.recorder_spec.get('start', 0):

                # Store buffered values
                for name in self.states_spec:
                    self.recorded['states'][name].append(
                        np.stack(self.buffers['states'][name][p], axis=0)
                    )
                    self.buffers['states'][name][p].clear()
                for name in self.auxiliaries_spec:
                    self.recorded['auxiliaries'][name].append(
                        np.stack(self.buffers['auxiliaries'][name][p], axis=0)
                    )
                    self.buffers['auxiliaries'][name][p].clear()
                for name, spec in self.actions_spec.items():
                    self.recorded['actions'][name].append(
                        np.stack(self.buffers['actions'][name][p], axis=0)
                    )
                    self.buffers['actions'][name][p].clear()
                self.recorded['terminal'].append(t.copy())
                self.recorded['reward'].append(r.copy())

                # If terminal
                if t[-1] > 0:
                    self.num_episodes += 1

                    # Check whether recording step
                    if self.num_episodes == self.recorder_spec.get('frequency', 1):
                        self.num_episodes = 0

                        # Manage recorder directory
                        directory = self.recorder_spec['directory']
                        if os.path.isdir(directory):
                            files = sorted(
                                f for f in os.listdir(directory)
                                if os.path.isfile(os.path.join(directory, f))
                                and os.path.splitext(f)[1] == '.npz'
                            )
                        else:
                            os.makedirs(directory)
                            files = list()
                        max_traces = self.recorder_spec.get('max-traces')
                        if max_traces is not None and len(files) > max_traces - 1:
                            for filename in files[:-max_traces + 1]:
                                filename = os.path.join(directory, filename)
                                os.remove(filename)

                        # Write recording file
                        filename = os.path.join(directory, 'trace-{:09d}.npz'.format(self.episodes))
                        # time.strftime('%Y%m%d-%H%M%S')
                        kwargs = self.recorded.fmap(function=np.concatenate, cls=ArrayDict).items()
                        np.savez_compressed(file=filename, **dict(kwargs))

                        # Clear recorded values
                        for recorded in self.recorded.values():
                            recorded.clear()

            # Inputs to tensors
            terminal_tensor = self.terminal_spec.to_tensor(value=t, batched=True)
            reward_tensor = self.reward_spec.to_tensor(value=r, batched=True)
            parallel_tensor = self.parallel_spec.to_tensor(value=p, batched=False)

            # Model.observe()
            updated, episodes, updates = self.model.observe(
                terminal=terminal_tensor, reward=reward_tensor, parallel=parallel_tensor
            )
            num_updates += int(updated.numpy().item())
            self.episodes = episodes.numpy().item()
            self.updates = updates.numpy().item()

        if self.model.saver is not None:
            self.model.save()

        return num_updates
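
A hedged act/observe loop sketch matching the docstring above (agent and environment are assumed to be created elsewhere, and environment.execute is assumed to return the usual states/terminal/reward triple):

states = environment.reset()
terminal = False
while not terminal:
    actions = agent.act(states=states)
    states, terminal, reward = environment.execute(actions=actions)
    num_updates = agent.observe(terminal=terminal, reward=reward)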