    def test_enqueue_dequeue(self):
        """
        Simply tests insert op without checking internal logic.
        """
        fifo_queue = FIFOQueue(capacity=self.capacity,
                               record_space=self.record_space)
        test = ComponentTest(component=fifo_queue,
                             input_spaces=self.input_spaces)

        first_record = self.record_space.sample(size=1)
        test.test(("insert_records", first_record), expected_outputs=None)
        test.test("get_size", expected_outputs=1)

        further_records = self.record_space.sample(size=5)
        test.test(("insert_records", further_records), expected_outputs=None)
        test.test("get_size", expected_outputs=6)

        expected = dict()
        for (k1, v1), (k2, v2) in zip(
                flatten_op(first_record).items(),
                flatten_op(further_records).items()):
            expected[k1] = np.concatenate((v1, v2[:4]))
        expected = unflatten_op(expected)

        test.test(("get_records", 5), expected_outputs=expected)
        test.test("get_size", expected_outputs=1)
Example #2
    def flatten_input_ops(self, *ops, **kwarg_ops):
        """
        Flattens all DataOps in ops into FlattenedDataOp with auto-key generation.
        Ops whose Sockets are not in self.flatten_ops (if it is a set)
        will be ignored.

        Args:
            *ops (op): The primitive ops to flatten.
            **kwarg_ops (op): More primitive ops to flatten (but by named key).

        Returns:
            Tuple[DataOp]: A new tuple with all ops (or those specified by `flatten_ops`) as FlattenedDataOp.
        """
        assert all(op is not None for op in ops)  # just make sure

        # The returned sequence of output ops.
        ret = []
        for i, op in enumerate(ops):
            if self.flatten_ops is True or (isinstance(self.flatten_ops, set) and i in self.flatten_ops):
                ret.append(flatten_op(op))
            else:
                ret.append(op)

        # Process kwargs, if given.
        kwarg_ret = {}
        if len(kwarg_ops) > 0:
            for key, op in kwarg_ops.items():
                if self.flatten_ops is True or (isinstance(self.flatten_ops, set) and key in self.flatten_ops):
                    kwarg_ret[key] = flatten_op(op)
                else:
                    kwarg_ret[key] = op

        # Always return a tuple for indexing into the return values.
        return tuple(ret), kwarg_ret
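
For illustration only, a simplified stand-in for the selection logic above (not the RLGraph API): when `flatten_ops` is a set, only ops whose positional index or kwarg name is in the set get flattened; everything else passes through unchanged.

def select_and_flatten(flatten_ops, *ops, **kwarg_ops):
    # Toy "flatten": wrap the op under the default key "".
    flat = lambda op: {"": op}
    ret = tuple(
        flat(op) if flatten_ops is True or i in flatten_ops else op
        for i, op in enumerate(ops)
    )
    kwarg_ret = {
        key: flat(op) if flatten_ops is True or key in flatten_ops else op
        for key, op in kwarg_ops.items()
    }
    return ret, kwarg_ret

print(select_and_flatten({0, "b"}, 1.0, 2.0, a=3.0, b=4.0))
# -> (({'': 1.0}, 2.0), {'a': 3.0, 'b': {'': 4.0}})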
Example #3
    def flatten_input_ops(self, *ops, **kwarg_ops):
        """
        Flattens all DataOps in ops into FlattenedDataOp with auto-key generation.
        Ops whose Sockets are not in self.flatten_ops (if it is a set)
        will be ignored.

        Args:
            *ops (op): The primitive ops to flatten.
            **kwarg_ops (op): More primitive ops to flatten (but by named key).

        Returns:
            Tuple[DataOp]: A new tuple with all ops (or those specified by `flatten_ops`) as FlattenedDataOp.
        """
        assert all(op is not None for op in ops)  # just make sure

        flatten_alongside = None
        if isinstance(self.flatten_ops, str):
            flatten_alongside = self.component.__getattribute__(self.flatten_ops)

        # The returned sequence of output ops.
        ret = []
        for i, op in enumerate(ops):
            if self.flatten_ops is True or isinstance(self.flatten_ops, str) or \
                    (isinstance(self.flatten_ops, (set, dict)) and i in self.flatten_ops):
                fa = flatten_alongside
                if isinstance(self.flatten_ops, dict):
                    fa = self.component.__getattribute__(self.flatten_ops[i])
                if fa is not None:
                    assert isinstance(fa, dict), \
                        "ERROR: Given `flatten_alongside` property ('{}') is not a dict!".format(fa)
                ret.append(flatten_op(op, flatten_alongside=fa))
            else:
                ret.append(op)

        # Process kwargs, if given.
        kwarg_ret = {}
        if len(kwarg_ops) > 0:
            for key, op in kwarg_ops.items():
                if self.flatten_ops is True or isinstance(self.flatten_ops, str) or \
                        (isinstance(self.flatten_ops, (set, dict)) and key in self.flatten_ops):
                    fa = flatten_alongside
                    if isinstance(self.flatten_ops, dict):
                        fa = self.component.__getattribute__(self.flatten_ops[key])
                    if fa is not None:
                        assert isinstance(fa, dict), \
                            "ERROR: Given `flatten_alongside` property ('{}') is not a dict!".format(fa)
                    kwarg_ret[key] = flatten_op(op, flatten_alongside=fa)
                else:
                    kwarg_ret[key] = op

        # Always return a tuple for indexing into the return values.
        return tuple(ret), kwarg_ret
Example #4
    def _graph_fn_setup(self):
        enqueue_ops = list()

        if get_backend() == "tf":
            for data_producing_component in self.data_producing_components:
                record = getattr(data_producing_component, self.api_method_name)()
                if self.return_slot != -1:
                    # Only care about one slot of the return values.
                    record = record[self.return_slot]

                # TODO: specific for IMPALA problem: needs to be generalized.
                if self.internal_states_slicer is not None:
                    outs = self.env_output_splitter.split(record)

                    # Assume that internal_states are the last item coming from the env-stepper.
                    initial_internal_states = self.internal_states_slicer.slice(outs[-1], 0)
                    record = self.fifo_input_merger.merge(*(outs[:-1] + (initial_internal_states,)))
                else:
                    terminals, states, actions, rewards, action_log_probs = self.env_output_splitter.split(record)
                    record = self.fifo_input_merger.merge(
                        terminals, states, actions, rewards, action_log_probs
                    )

                # Create enqueue_op from api_return.
                # TODO: This is kind of cheating, as we are producing an op from a component that's not ours.
                enqueue_op = self.queue.queue.enqueue(flatten_op(record))
                enqueue_ops.append(enqueue_op)

            self.queue_runner = tf.train.QueueRunner(self.queue.queue, enqueue_ops)
            # Add to standard collection, so all queue-runners will be started after session creation.
            tf.train.add_queue_runner(self.queue_runner)

            return tf.no_op()
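
A minimal, self-contained sketch of the QueueRunner lifecycle this graph_fn relies on, assuming a TF1-style graph session via tf.compat.v1; the queue dtype and shape here are made up for illustration.

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

queue = tf.compat.v1.FIFOQueue(capacity=10, dtypes=[tf.float32], shapes=[()])
enqueue_op = queue.enqueue(tf.random.uniform(shape=()))

# Register the runner so start_queue_runners() spawns its enqueue threads.
qr = tf.compat.v1.train.QueueRunner(queue, [enqueue_op])
tf.compat.v1.train.add_queue_runner(qr)

with tf.compat.v1.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.compat.v1.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run(queue.dequeue()))  # a value produced by a runner thread
    coord.request_stop()
    coord.join(threads)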
    def test_capacity(self):
        """
        Tests if insert correctly blocks when capacity is reached.
        """
        fifo_queue = FIFOQueue(capacity=self.capacity,
                               record_space=self.record_space)
        test = ComponentTest(component=fifo_queue,
                             input_spaces=self.input_spaces)

        def run(expected_):
            # Wait n seconds.
            time.sleep(2)
            # Pull something out of the queue again to continue.
            test.test(("get_records", 2), expected_outputs=expected_)

        # Insert one more element than capacity
        records = self.record_space.sample(size=self.capacity + 1)

        expected = dict()
        for key, value in flatten_op(records).items():
            expected[key] = value[:2]
        expected = unflatten_op(expected)

        # Start thread to save this one from getting stuck due to capacity overflow.
        thread = threading.Thread(target=run, args=(expected, ))
        thread.start()

        print("Going over capacity: blocking ...")
        test.test(("insert_records", records), expected_outputs=None)
        print("Dequeued some items in another thread. Unblocked.")

        thread.join()
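
The same unblock-by-consuming pattern, shown with the standard library as an analogy (queue.Queue.put also blocks once maxsize is reached); this is not the FIFOQueue component itself.

import queue
import threading
import time

q = queue.Queue(maxsize=2)
q.put(1)
q.put(2)  # queue is now full

def consumer():
    time.sleep(2)
    q.get()  # frees one slot, unblocking the producer below

t = threading.Thread(target=consumer)
t.start()

q.put(3)  # blocks until the consumer dequeues
print("Unblocked. Queue size:", q.qsize())
t.join()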
Example #6
    def create_variables(self, input_spaces, action_space=None):
        self.in_space = input_spaces["preprocessing_inputs"]  # type: Space

        # Store the mapped output Spaces (per flat key).
        self.output_spaces = flatten_op(self.get_preprocessed_space(self.in_space))
        # Store time_major settings of incoming spaces.
        self.in_space_time_majors = self.in_space.flatten(mapping=lambda key, space: space.time_major)

        # Check whether we have to flatten the incoming categories of an IntBox into a FloatBox with additional
        # rank (categories rank). Store the dimension of this additional rank in the `self.num_categories` dict.
        if self.flatten is True:
            if self.flatten_categories is True:
                def mapping_func(key, space):
                    if isinstance(space, IntBox):
                        # Must have global bounds (bounds valid for all axes).
                        if space.num_categories is False:
                            raise RLGraphError("ERROR: Cannot flatten categories if one of the IntBox spaces ({}={}) does "
                                               "not have global bounds (its `num_categories` is False)!".format(key, space))
                        return space.num_categories
                    # No categories. Keep as is.
                    return 1
                self.num_categories = self.in_space.flatten(mapping=mapping_func)
            elif self.flatten_categories is not False:
                # TODO: adjust for input ContainerSpaces. For now only support single space (flat-key=="")
                self.num_categories = {"": self.flatten_categories}
Example #7
    def _graph_fn_insert_records(self, records):
        flattened_records = flatten_op(records)
        flattened_stopped_records = {key: tf.stop_gradient(op) for key, op in flattened_records.items()}
        # `records` is just a single record (no batch rank).
        if self.only_insert_single_records is True:
            return self.queue.enqueue(flattened_stopped_records)
        # Insert many records (with batch rank).
        else:
            return self.queue.enqueue_many(flattened_stopped_records)
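
The single-record vs. batch branch above maps onto the two TF queue methods; a tiny sketch with made-up dtypes and shapes, again assuming a TF1-style graph session.

import tensorflow as tf

tf.compat.v1.disable_eager_execution()

q = tf.compat.v1.FIFOQueue(capacity=8, dtypes=[tf.float32], shapes=[()])
single = q.enqueue(tf.constant(1.0))                  # one record
batch = q.enqueue_many(tf.constant([2.0, 3.0, 4.0]))  # leading axis = batch rank

with tf.compat.v1.Session() as sess:
    sess.run([single, batch])
    print(sess.run(q.size()))  # -> 4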
Example #8
    def _graph_fn_get_q_values(self, states, actions, target=False):
        backend = get_backend()

        #tf.one_hot(tf.cast(x=tensor, dtype=tf.int32), depth=5)

        flat_actions = flatten_op(actions)
        state_actions = [states]
        for flat_key, action_component in self._policy.action_space.flatten(
        ).items():
            state_actions.append(flat_actions[flat_key])

        if backend == "tf":
            state_actions = tf.concat(state_actions, axis=-1)
        elif backend == "pytorch":
            state_actions = torch.cat(state_actions, dim=-1)

        q_funcs = self._q_functions if target is False else self._target_q_functions
        return tuple(q.value_output(state_actions) for q in q_funcs)
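
To make the shape handling above concrete, a hypothetical numpy example (component names and dimensions are made up): each flattened action component is appended after the states, then everything is concatenated on the last axis.

import numpy as np

states = np.zeros((8, 5))  # (batch, state_dim)
flat_actions = {"/move": np.zeros((8, 2)), "/turn": np.zeros((8, 1))}

state_actions = [states] + [flat_actions[key] for key in flat_actions]
print(np.concatenate(state_actions, axis=-1).shape)  # (8, 8)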
    def _graph_fn_reduce_over_sub_distributions(self, log_probs):
        params_space = next(iter(flatten_op(self.api_method_inputs["parameters"]).values()))
        num_ranks_to_keep = (1 if params_space.has_batch_rank else 0) + (1 if params_space.has_time_rank else 0)
        log_probs_list = []
        if get_backend() == "tf":
            for log_prob in log_probs.values():
                # Reduce sum over all ranks to get the joint log llh.
                log_prob = tf.reduce_sum(log_prob, axis=list(range(len(log_prob.shape) - 1, num_ranks_to_keep - 1, -1)))
                log_probs_list.append(log_prob)
            return tf.reduce_sum(tf.stack(log_probs_list, axis=0), axis=0)

        elif get_backend() == "pytorch":
            for log_prob in log_probs.values():
                # Reduce sum over all ranks to get the joint log llh.
                log_prob = torch.sum(log_prob, dim=list(range(len(log_prob.shape) - 1, num_ranks_to_keep - 1, -1)))
                log_probs_list.append(log_prob)

            return torch.sum(torch.stack(log_probs_list, dim=0), dim=0)
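
The axis list built above sums out every rank beyond the batch/time ranks; a small numpy check of that arithmetic (shapes are illustrative only).

import numpy as np

log_prob = np.log(np.random.rand(4, 3, 2))  # (batch, event_dim1, event_dim2)
num_ranks_to_keep = 1                       # keep only the batch rank

# Same construction as in the graph_fn: trailing ranks, from the last one down
# to the first one that should be reduced.
axes = list(range(log_prob.ndim - 1, num_ranks_to_keep - 1, -1))  # [2, 1]
joint = log_prob.sum(axis=tuple(axes))
print(joint.shape)  # (4,)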
Example #10
    def _graph_fn_stage(self, *inputs):
        """
        Stages all incoming ops (after flattening them).

        Args:
            inputs (DataOp): The incoming ops to be (flattened and) staged.

        Returns:
            DataOp: The staging op.
        """
        # Flatten inputs and stage them.
        # TODO: Build equivalent to nest.flatten ()
        flattened_ops = list()
        for input_ in inputs:
            flat_list = list(flatten_op(input_).values())
            flattened_ops.extend(flat_list)
        stage_op = self.area.put(flattened_ops)
        return stage_op
    def __init__(self, input_network_specs, post_network_spec=None, **kwargs):
        """
        Args:
            input_network_specs (Union[Dict[str,dict],Tuple[dict]]): A specification dict or tuple with values being
                the spec dicts for the single streams. The `call` method expects a dict input or a single tuple input
                (not as *args) in its first parameter.

            post_network_spec (Optional[]): The specification dict of the post-concat network or the post-concat
                network object itself.
        """
        super(MultiInputStreamNeuralNetwork,
              self).__init__(scope="multi-input-stream-nn", **kwargs)

        # Create all streams' networks.
        if isinstance(input_network_specs, dict):
            self.input_stream_nns = {}
            for i, (flat_key, nn_spec) in enumerate(
                    flatten_op(input_network_specs).items()):
                self.input_stream_nns[flat_key] = NeuralNetwork.from_spec(
                    nn_spec, scope="input-stream-nn-{}".format(i))
            # Create the concat layer to merge all streams.
            self.concat_layer = ConcatLayer(dict_keys=list(
                self.input_stream_nns.keys()),
                                            axis=-1)
        else:
            assert isinstance(input_network_specs, (list, tuple)),\
                "ERROR: `input_network_specs` must be dict or tuple/list!"
            self.input_stream_nns = []
            for i, nn_spec in enumerate(input_network_specs):
                self.input_stream_nns.append(
                    NeuralNetwork.from_spec(
                        nn_spec, scope="input-stream-nn-{}".format(i)))
            # Create the concat layer to merge all streams.
            self.concat_layer = ConcatLayer(axis=-1)

        # Create the post-network (after the concat).
        self.post_nn = NeuralNetwork.from_spec(
            post_network_spec, scope="post-concat-nn")  # type: NeuralNetwork

        # Add all sub-Components.
        self.add_components(
            self.post_nn, self.concat_layer,
            *list(self.input_stream_nns.values() if isinstance(
                input_network_specs, dict) else self.input_stream_nns))
Example #12
    def _graph_fn_setup(self):
        enqueue_ops = list()

        if get_backend() == "tf":
            for data_producing_component in self.data_producing_components:
                record = getattr(data_producing_component,
                                 self.api_method_name)()
                if self.return_slot != -1:
                    # Only care about one slot of the return values.
                    record = record[self.return_slot]
                # Create dict record from tuple return.
                #record = self.input_merger.merge(*record)

                # TODO: specific for IMPALA problem: needs to be generalized.
                preprocessed_s, actions, rewards, returns, terminals, next_states, action_log_probs, \
                    internal_states = self.env_output_splitter.split(record)

                last_next_state = self.next_states_slicer.slice(
                    next_states, -1)
                initial_internal_states = self.internal_states_slicer.slice(
                    internal_states, 0)
                #current_internal_states = self.internal_states_slicer.slice(internal_states, -1)

                record = self.fifo_input_merger.merge(preprocessed_s, actions,
                                                      rewards, terminals,
                                                      last_next_state,
                                                      action_log_probs,
                                                      initial_internal_states)

                # Insert results into the FIFOQueue.
                #insert_op = fifo_queue.insert_records(record)
                #return step_op, insert_op, current_internal_states, returns, terminals

                # Create enqueue_op from api_return.
                # TODO: This is kind of cheating, as we are producing an op from a component that's not ours.
                enqueue_op = self.queue.queue.enqueue(flatten_op(record))
                enqueue_ops.append(enqueue_op)

            self.queue_runner = tf.train.QueueRunner(self.queue.queue,
                                                     enqueue_ops)
            # Add to standard collection, so all queue-runners will be started after session creation.
            tf.train.add_queue_runner(self.queue_runner)

            return tf.no_op()
Example #13
    def __init__(self, preprocessors, **kwargs):
        """
        Args:
            preprocessors (dict):

        Raises:
            RLGraphError: If a sub-component is not a PreprocessLayer object.
        """
        # Create one separate PreprocessorStack per given key.
        # All other keys in an input will be passed through un-preprocessed.
        self.flattened_preprocessors = flatten_op(preprocessors)
        for i, (flat_key, spec) in enumerate(self.flattened_preprocessors.items()):
            self.flattened_preprocessors[flat_key] = PreprocessorStack.from_spec(
                spec, scope="preprocessor-stack-{}".format(i)
            )

        # NOTE: No automatic API-methods. Define them all ourselves.
        kwargs["api_methods"] = {}
        default_dict(kwargs, dict(scope=kwargs.pop("scope", "dict-preprocessor-stack")))
        super(DictPreprocessorStack, self).__init__(*list(self.flattened_preprocessors.values()), **kwargs)
    def _graph_fn_entropy(self, distribution):
        params_space = next(iter(flatten_op(self.api_method_inputs["parameters"]).values()))
        num_ranks_to_keep = (1 if params_space.has_batch_rank else 0) + (1 if params_space.has_time_rank else 0)
        all_entropies = []
        if get_backend() == "tf":
            for key, distr in distribution.items():
                entropy = distr.entropy()
                # Reduce sum over all ranks to get the joint entropy.
                entropy = tf.reduce_sum(entropy, axis=list(range(len(entropy.shape) - 1, num_ranks_to_keep - 1, -1)))
                all_entropies.append(entropy)
            return tf.reduce_sum(tf.stack(all_entropies, axis=0), axis=0)

        elif get_backend() == "pytorch":
            for key, distr in distribution.items():
                entropy = distr.entropy()
                # Reduce sum over all ranks to get the joint log llh.
                entropy = torch.sum(entropy, dim=list(range(len(entropy.shape) - 1, num_ranks_to_keep - 1, -1)))
                all_entropies.append(entropy)

            # TODO: flatten all all_log_probs (or expand in last dim) so we can concat, then reduce_sum to get the joint probs.
            return torch.sum(torch.stack(all_entropies, dim=0), dim=0)
Example #15
    def _graph_fn_get_q_values(self,
                               preprocessed_states,
                               actions,
                               target=False):
        backend = get_backend()

        flat_actions = flatten_op(actions)
        actions = []
        for flat_key, action_component in self._policy.action_space.flatten(
        ).items():
            actions.append(flat_actions[flat_key])

        if backend == "tf":
            actions = tf.concat(actions, axis=-1)
        elif backend == "pytorch":
            actions = torch.cat(actions, dim=-1)

        q_funcs = self._q_functions if target is False else self._target_q_functions

        # We do not concat states yet because we might pass states through a conv stack before merging it
        # with actions.
        return tuple(
            q.state_action_value(preprocessed_states, actions)
            for q in q_funcs)
Example #16
    def _graph_fn_step(self):
        if get_backend() == "tf":

            def scan_func(accum, time_delta):
                # Not needed: preprocessed-previous-states (tuple!)
                # `state` is a tuple as well. See comment in ctor for why tf cannot use ContainerSpaces here.
                internal_states = None
                state = accum[1]
                if self.has_rnn:
                    internal_states = accum[-1]

                state = tuple(tf.convert_to_tensor(value=s) for s in state)

                flat_state = OrderedDict()
                for i, flat_key in enumerate(
                        self.state_space_actor_flattened.keys()):
                    # Add a simple (size 1) batch rank to the state so it'll pass through the NN.
                    # - Also have to add a time-rank for RNN processing.
                    expanded = state[i]
                    for _ in range(1 if self.has_rnn is False else 2):
                        expanded = tf.expand_dims(input=expanded, axis=0)
                    # Make None so it'll be recognized as batch-rank by the auto-Space detector.
                    flat_state[flat_key] = tf.placeholder_with_default(
                        input=expanded,
                        shape=(None, ) + ((None, ) if self.has_rnn is True else
                                          ()) +
                        self.state_space_actor_list[i].shape)

                # Recreate state as the original Space to pass it into the actor-component.
                state = unflatten_op(flat_state)

                # Get action and preprocessed state (as batch-size 1).
                out = (self.actor_component.get_preprocessed_state_and_action
                       if self.add_action_probs is False else
                       self.actor_component.
                       get_preprocessed_state_action_and_action_probs)(
                           state,
                           # Add simple batch rank to internal_states.
                           None if internal_states is None else DataOpTuple(
                               internal_states),  # <- None for non-RNN systems
                           time_step=self.time_step + time_delta,
                           return_ops=True)

                # Get output depending on whether it contains internal_states or not.
                a = out["action"]
                action_probs = out.get("action_probs")
                current_internal_states = out.get("last_internal_states")

                # Strip the batch (and maybe time) ranks again from the action in case the Env doesn't like it.
                a_no_extra_ranks = a[0, 0] if self.has_rnn is True else a[0]
                # Step through the Env and collect next state (tuple!), reward and terminal as single values
                # (not batched).
                out = self.environment_server.step_for_env_stepper(
                    a_no_extra_ranks)
                s_, r, t_ = out[:-2], out[-2], out[-1]
                r = tf.cast(r, dtype="float32")

                # Add a and/or r to next_state?
                if self.add_previous_action_to_state is True:
                    assert isinstance(
                        s_, tuple
                    ), "ERROR: Cannot add previous action to non tuple!"
                    s_ = s_ + (a_no_extra_ranks, )
                if self.add_previous_reward_to_state is True:
                    assert isinstance(
                        s_, tuple
                    ), "ERROR: Cannot add previous reward to non tuple!"
                    s_ = s_ + (r, )

                # Note: s_ is packed as tuple.
                ret = [t_, s_] + \
                    ([a_no_extra_ranks] if self.add_action else []) + \
                    ([r] if self.add_reward else []) + \
                    ([(action_probs[0][0] if self.has_rnn is True else action_probs[0])] if
                     self.add_action_probs is True else []) + \
                    ([tuple(current_internal_states)] if self.has_rnn is True else [])

                return tuple(ret)

            # Initialize the tf.scan run.
            initializer = [
                self.current_terminal.read_value(
                ),  # whether the current state is terminal
                # current (raw) state (flattened components if ContainerSpace).
                tuple(
                    map(lambda x: x.read_value(), self.current_state.values()))
            ]
            # Append actions and rewards if needed.
            if self.add_action:
                initializer.append(self.current_action.read_value())
            if self.add_reward:
                initializer.append(self.current_reward.read_value())
            # Append action probs if needed.
            if self.add_action_probs is True:
                initializer.append(self.current_action_probs.read_value())
            # Append internal states if needed.
            if self.current_internal_states is not None:
                initializer.append(
                    tuple(
                        tf.placeholder_with_default(
                            internal_s.read_value(),
                            shape=(None, ) +
                            tuple(internal_s.shape.as_list()[1:])) for
                        internal_s in self.current_internal_states.values()))

            # Scan over n time-steps (tf.range produces the time_delta with respect to the current time_step).
            # NOTE: Changed parallel to 1, to resolve parallel issues.
            step_results = list(
                tf.scan(fn=scan_func,
                        elems=tf.range(self.num_steps, dtype="int32"),
                        initializer=tuple(initializer),
                        back_prop=False))

            # Store the time-step increment, return so far, current terminal and current state.
            assigns = [
                tf.assign_add(self.time_step, self.num_steps),
                self.assign_variable(self.current_terminal,
                                     step_results[0][-1])
            ]

            # Concatenate first and rest.
            full_results = []
            for first_values, rest_values in zip(initializer, step_results):
                full_results.append(
                    nest.map_structure(
                        lambda first, rest: tf.concat([[first], rest], axis=0),
                        first_values, rest_values))

            # Re-build DataOpDicts from preprocessed-states and states (from tuple right now).
            rebuild_s = DataOpDict()
            for flat_key, var_ref, s_comp in zip(
                    self.state_space_actor_flattened.keys(),
                    self.current_state.values(), full_results[1]):
                assigns.append(self.assign_variable(
                    var_ref, s_comp[-1]))  # -1: current state (last observed)
                rebuild_s[flat_key] = s_comp
            rebuild_s = unflatten_op(rebuild_s)
            full_results[1] = rebuild_s

            # Remove batch rank from internal states again.
            if self.current_internal_states is not None:
                # TODO: What if internal states is not the last item in the list anymore due to some change.
                slot = -1  # if self.add_action_probs is True else 2
                # TODO: What if internal states is a dict? Right now assume some tuple.
                internal_states_wo_batch = list()
                for i in range(len(full_results[slot])):
                    # 1=batch axis (which is 1); 0=time axis.
                    internal_states_wo_batch.append(
                        tf.squeeze(full_results[-1][i], axis=1))
                full_results[slot] = DataOpTuple(internal_states_wo_batch)

            with tf.control_dependencies(control_inputs=assigns):
                # Let the auto-infer system know, what time rank we have.
                full_results = DataOpTuple(full_results)
                for o in flatten_op(full_results).values():
                    o._time_rank = 0  # which position in the shape is the time-rank?
                step_op = tf.no_op()

            return step_op, full_results
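
The core control flow above is a tf.scan over time steps whose carry holds (terminal, state, ...); a stripped-down sketch of that pattern, run in TF2 eager here while the original runs in graph mode with back_prop=False.

import tensorflow as tf

def scan_func(accum, time_delta):
    terminal, state = accum
    new_state = state + tf.cast(time_delta, tf.float32)  # stand-in for an env step
    return tf.equal(time_delta, 4), new_state

initializer = (tf.constant(False), tf.constant(0.0))
terminals, states = tf.scan(scan_func, elems=tf.range(5), initializer=initializer)

# "Concatenate first and rest": prepend the initial value to the scanned outputs.
full_states = tf.concat([[0.0], states], axis=0)
print(full_states.numpy())  # [ 0.  0.  1.  3.  6. 10.]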
Example #17
    def observe(self,
                preprocessed_states,
                actions,
                internals,
                rewards,
                next_states,
                terminals,
                env_id=None,
                batched=False):
        """
        Observes an experience tuple or a batch of experience tuples. Note: If configured,
        first uses buffers and then internally calls _observe_graph() to actually run the computation graph.
        If buffering is disabled, this just routes the call to the respective `_observe_graph()` method of the
        child Agent.

        Args:
            preprocessed_states (Union[dict,ndarray]): Preprocessed states dict or array.
            actions (Union[dict,ndarray]): Actions dict or array containing actions performed for the given state(s).

            internals (Optional[list]): Internal state(s) returned by the agent for the given states. Must be
                an empty list if no internals are available.

            rewards (Union[float,List[float]]): Scalar reward(s) observed.
            terminals (Union[bool,List[bool]]): Boolean indicating terminal.
            next_states (Union[dict,ndarray]): Preprocessed next states dict or array.

            env_id (Optional[str]): Environment id to observe for. When using vectorized execution and
                buffering, using environment ids is necessary to ensure correct trajectories are inserted.
                See `SingleThreadedWorker` for example usage.

            batched (bool): Whether the given data (states, actions, etc.) is already batched or not.
        """
        # Check for illegal internals.
        if internals is None:
            internals = []

        if self.observe_spec["buffer_enabled"] is True:
            if env_id is None:
                env_id = self.default_env

            # If data is already batched, just have to extend our buffer lists.
            if batched:
                if self.flat_state_space is not None:
                    for i, flat_key in enumerate(self.flat_state_space.keys()):
                        self.states_buffer[env_id][i].extend(
                            preprocessed_states[flat_key])
                        self.next_states_buffer[env_id][i].extend(
                            next_states[flat_key])
                else:
                    self.states_buffer[env_id].extend(preprocessed_states)
                    self.next_states_buffer[env_id].extend(next_states)
                if self.flat_action_space is not None:
                    flat_action = flatten_op(actions)
                    for i, flat_key in enumerate(
                            self.flat_action_space.keys()):
                        self.actions_buffer[env_id][i].append(
                            flat_action[flat_key])
                else:
                    self.actions_buffer[env_id].extend(actions)
                self.internals_buffer[env_id].extend(internals)
                self.rewards_buffer[env_id].extend(rewards)
                self.terminals_buffer[env_id].extend(terminals)
            # Data is not batched, append single items (without creating new lists first!) to buffer lists.
            else:
                if self.flat_state_space is not None:
                    for i, flat_key in enumerate(self.flat_state_space.keys()):
                        self.states_buffer[env_id][i].append(
                            preprocessed_states[flat_key])
                        self.next_states_buffer[env_id][i].append(
                            next_states[flat_key])
                else:
                    self.states_buffer[env_id].append(preprocessed_states)
                    self.next_states_buffer[env_id].append(next_states)
                if self.flat_action_space is not None:
                    flat_action = flatten_op(actions)
                    for i, flat_key in enumerate(
                            self.flat_action_space.keys()):
                        self.actions_buffer[env_id][i].append(
                            flat_action[flat_key])
                else:
                    self.actions_buffer[env_id].append(actions)
                self.internals_buffer[env_id].append(internals)
                self.rewards_buffer[env_id].append(rewards)
                self.terminals_buffer[env_id].append(terminals)

            buffer_is_full = len(self.rewards_buffer[env_id]
                                 ) >= self.observe_spec["buffer_size"]

            # If the buffer (per environment) is full OR the episode was aborted:
            # Change terminal of last record artificially to True (also give warning "buffer too small"),
            # insert and flush the buffer.
            if buffer_is_full or self.terminals_buffer[env_id][-1]:
                # Warn if full and last terminal is False.
                if buffer_is_full and not self.terminals_buffer[env_id][-1]:
                    self.logger.warning(
                        "Buffer of size {} of Agent '{}' may be too small! Had to add artificial terminal=True "
                        "to end.".format(self.observe_spec["buffer_size"],
                                         self))
                    self.terminals_buffer[env_id][-1] = True

                # TODO: Apply n-step post-processing if necessary.
                if self.flat_state_space is not None:
                    states_ = {}
                    next_states_ = {}
                    for i, key in enumerate(self.flat_state_space.keys()):
                        states_[key] = np.asarray(
                            self.states_buffer[env_id][i])
                        next_states_[key] = np.asarray(
                            self.next_states_buffer[env_id][i])
                        # Squeeze, but do not squeeze (1,) to ().
                        if len(states_[key]) > 1:
                            states_[key] = np.squeeze(states_[key])
                            next_states_[key] = np.squeeze(next_states_[key])
                        #else:
                        #    states_[key] = np.reshape(states_[key], (1,))
                        #    next_states_[key] = np.reshape(next_states_[key], (1,))
                else:
                    states_ = np.asarray(self.states_buffer[env_id])
                    next_states_ = np.asarray(self.next_states_buffer[env_id])

                if self.flat_action_space is not None:
                    actions_ = {}
                    for i, key in enumerate(self.flat_action_space.keys()):
                        actions_[key] = np.asarray(
                            self.actions_buffer[env_id][i])
                        # Squeeze, but do not squeeze (1,) to ().
                        if len(actions_[key]) > 1:
                            actions_[key] = np.squeeze(actions_[key])
                        else:
                            actions_[key] = np.reshape(actions_[key], (1, ))
                else:
                    actions_ = np.asarray(self.actions_buffer[env_id])

                self._write_rewards_summary(
                    rewards=self.
                    rewards_buffer[env_id],  # No need to be converted to np
                    terminals=self.terminals_buffer[env_id],
                    env_id=env_id)

                self._observe_graph(
                    preprocessed_states=states_,
                    actions=actions_,
                    internals=np.asarray(self.internals_buffer[env_id]),
                    rewards=np.asarray(self.rewards_buffer[env_id]),
                    next_states=next_states_,
                    terminals=np.asarray(self.terminals_buffer[env_id]))
                self.reset_env_buffers(env_id)
        else:
            if not batched:
                preprocessed_states, _ = self.preprocessed_state_space.force_batch(
                    preprocessed_states)
                next_states, _ = self.preprocessed_state_space.force_batch(
                    next_states)
                actions, _ = self.action_space.force_batch(actions)
                rewards = [rewards]
                terminals = [terminals]

            self._write_rewards_summary(
                rewards=rewards,  # No need to be converted to np
                terminals=terminals,
                env_id=env_id)

            self._observe_graph(preprocessed_states, actions, internals,
                                rewards, next_states, terminals)
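
Stripped of the agent specifics, the buffering scheme above is "append per env_id, flush when the buffer is full or the episode terminates". A minimal, hypothetical sketch of that pattern:

from collections import defaultdict

BUFFER_SIZE = 4
rewards_buffer = defaultdict(list)
terminals_buffer = defaultdict(list)

def observe(env_id, reward, terminal):
    rewards_buffer[env_id].append(reward)
    terminals_buffer[env_id].append(terminal)
    if len(rewards_buffer[env_id]) >= BUFFER_SIZE or terminals_buffer[env_id][-1]:
        print("flushing env", env_id, rewards_buffer[env_id])
        rewards_buffer[env_id].clear()    # stand-in for reset_env_buffers()
        terminals_buffer[env_id].clear()

for step in range(6):
    observe("env-0", reward=1.0, terminal=(step == 5))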
Example #18
    def create_variables(self, input_spaces, action_space=None):
        in_space = input_spaces["inputs"]
        self.output_spaces = flatten_op(self.get_preprocessed_space(in_space))
Example #19
        def _graph_fn_update_from_external_batch(
                root, preprocessed_states, actions, rewards, terminals, sequence_indices, apply_postprocessing=True,
                time_percentage=None
        ):
            """
            Calls iterative optimization by repeatedly sub-sampling.
            """
            multi_gpu_sync_optimizer = root.sub_components.get("multi-gpu-synchronizer")

            # Return values.
            loss, loss_per_item, vf_loss, vf_loss_per_item = None, None, None, None

            policy = root.get_sub_component_by_name(agent.policy.scope)
            value_function = root.get_sub_component_by_name(agent.value_function.scope)
            optimizer = root.get_sub_component_by_name(agent.optimizer.scope)
            loss_function = root.get_sub_component_by_name(agent.loss_function.scope)
            value_function_optimizer = root.get_sub_component_by_name(agent.value_function_optimizer.scope)
            vars_merger = root.get_sub_component_by_name(agent.vars_merger.scope)
            gae_function = root.get_sub_component_by_name(agent.gae_function.scope)

            prev_log_probs = policy.get_log_likelihood(preprocessed_states, actions)["log_likelihood"]
            prev_state_values = value_function.value_output(preprocessed_states)

            if get_backend() == "tf":
                batch_size = tf.shape(list(flatten_op(preprocessed_states).values())[0])[0]

                # Log probs before update (stop-gradient as these are used in target term).
                prev_log_probs = tf.stop_gradient(prev_log_probs)
                #prev_log_probs = tf.Print(prev_log_probs, [prev_log_probs], "prev-log-probs: ", summarize=1000)
                # State values before update (stop-gradient as these are used in target term).
                prev_state_values = tf.stop_gradient(prev_state_values)
                #prev_state_values = tf.Print(prev_state_values, [prev_state_values], "prev-state-values: ", summarize=1000)

                # Advantages are based on previous state values.
                advantages = tf.cond(
                    pred=apply_postprocessing,
                    true_fn=lambda: gae_function.calc_gae_values(
                        prev_state_values, rewards, terminals, sequence_indices
                    ),
                    false_fn=lambda: rewards
                )
                #advantages = tf.Print(advantages, [advantages], "advantages before standardizing: ", summarize=1000)
                if self.standardize_advantages:
                    mean, std = tf.nn.moments(x=advantages, axes=[0])
                    advantages = (advantages - mean) / std
                #advantages = tf.Print(advantages, [advantages], "advantages after standardizing: ", summarize=1000)

                def opt_body(index_, loss_, loss_per_item_, vf_loss_, vf_loss_per_item_):
                    start = tf.random_uniform(shape=(), minval=0, maxval=batch_size, dtype=tf.int32)
                    indices = tf.range(start=start, limit=start + agent.sample_size) % batch_size

                    # Use `map` here in case we have container states/actions.
                    sample_states = preprocessed_states.map(lambda k, v: tf.gather(v, indices))
                    sample_actions = actions.map(lambda k, v: tf.gather(v, indices))
                    #sample_actions["direction"] = tf.Print(sample_actions["direction"], [sample_actions["direction"]], "sample-actions['direction']: ", summarize=1000)
                    #sample_actions["jump"] = tf.Print(sample_actions["jump"], [sample_actions["jump"]], "sample-actions['jump']: ", summarize=1000)
                    #sample_actions["crouch"] = tf.Print(sample_actions["crouch"], [sample_actions["crouch"]], "sample-actions['crouch']: ", summarize=1000)

                    sample_prev_log_probs = tf.gather(params=prev_log_probs, indices=indices)
                    sample_rewards = tf.gather(params=rewards, indices=indices)
                    sample_terminals = tf.gather(params=terminals, indices=indices)
                    sample_sequence_indices = tf.gather(params=sequence_indices, indices=indices)
                    sample_advantages = tf.gather(params=advantages, indices=indices)
                    sample_advantages.set_shape((agent.sample_size,))

                    sample_state_values = value_function.value_output(sample_states)
                    sample_prev_state_values = tf.gather(params=prev_state_values, indices=indices)

                    # If we are a multi-GPU root:
                    # Simply feeds everything into the multi-GPU sync optimizer's method and return.
                    if multi_gpu_sync_optimizer is not None:
                        main_policy_vars = agent.policy.variables()
                        main_vf_vars = agent.value_function.variables()
                        all_vars = agent.vars_merger.merge(main_policy_vars, main_vf_vars)
                        # grads_and_vars, loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        out = multi_gpu_sync_optimizer.calculate_update_from_external_batch(
                            all_vars,
                            sample_states, sample_actions, sample_rewards, sample_terminals, sample_sequence_indices,
                            apply_postprocessing=apply_postprocessing
                        )
                        avg_grads_and_vars_policy, avg_grads_and_vars_vf = agent.vars_splitter.call(
                            out["avg_grads_and_vars_by_component"]
                        )
                        policy_step_op = agent.optimizer.apply_gradients(avg_grads_and_vars_policy)
                        vf_step_op = agent.value_function_optimizer.apply_gradients(avg_grads_and_vars_vf)
                        step_op = root._graph_fn_group(policy_step_op, vf_step_op)
                        step_and_sync_op = multi_gpu_sync_optimizer.sync_variables_to_towers(
                            step_op, all_vars
                        )
                        loss_vf, loss_per_item_vf = out["additional_return_0"], out["additional_return_1"]

                        # Have to set all shapes here due to strict loop-var shape requirements.
                        out["loss"].set_shape(())
                        loss_vf.set_shape(())
                        loss_per_item_vf.set_shape((agent.sample_size,))
                        out["loss_per_item"].set_shape((agent.sample_size,))

                        with tf.control_dependencies([step_and_sync_op]):
                            if index_ == 0:
                                # Increase the global training step counter.
                                out["loss"] = root._graph_fn_training_step(out["loss"])
                            return index_ + 1, out["loss"], out["loss_per_item"], loss_vf, loss_per_item_vf

                    sample_log_probs = policy.get_log_likelihood(sample_states, sample_actions)["log_likelihood"]
                    #sample_log_probs = tf.Print(sample_log_probs, [sample_log_probs], "sample-log-probs:", summarize=1000)

                    entropy = policy.get_entropy(sample_states)["entropy"]
                    #entropy["direction"] = tf.Print(entropy["direction"], [entropy["direction"]], "entropy['dir']: ", summarize=1000)

                    loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        loss_function.loss(
                            sample_log_probs, sample_prev_log_probs,
                            sample_state_values, sample_prev_state_values, sample_advantages, entropy, time_percentage
                        )

                    if hasattr(root, "is_multi_gpu_tower") and root.is_multi_gpu_tower is True:
                        policy_grads_and_vars = optimizer.calculate_gradients(policy.variables(), loss, time_percentage)
                        vf_grads_and_vars = value_function_optimizer.calculate_gradients(
                            value_function.variables(), vf_loss, time_percentage
                        )
                        grads_and_vars_by_component = vars_merger.merge(policy_grads_and_vars, vf_grads_and_vars)
                        return grads_and_vars_by_component, loss, loss_per_item, vf_loss, vf_loss_per_item
                    else:
                        step_op = optimizer.step(policy.variables(), loss, loss_per_item, time_percentage)
                        loss.set_shape(())
                        loss_per_item.set_shape((agent.sample_size,))

                        vf_step_op = value_function_optimizer.step(
                            value_function.variables(), vf_loss, vf_loss_per_item, time_percentage
                        )
                        vf_loss.set_shape(())
                        vf_loss_per_item.set_shape((agent.sample_size,))

                        with tf.control_dependencies([step_op, vf_step_op]):
                            return index_ + 1, loss, loss_per_item, vf_loss, vf_loss_per_item

                def cond(index_, loss_, loss_per_item_, v_loss_, v_loss_per_item_):
                    return index_ < agent.iterations

                init_loop_vars = [
                    0,
                    tf.zeros(shape=(), dtype=tf.float32),
                    tf.zeros(shape=(agent.sample_size,)),
                    tf.zeros(shape=(), dtype=tf.float32),
                    tf.zeros(shape=(agent.sample_size,))
                ]

                if hasattr(root, "is_multi_gpu_tower") and root.is_multi_gpu_tower is True:
                    return opt_body(*init_loop_vars)
                else:
                    index, loss, loss_per_item, vf_loss, vf_loss_per_item = tf.while_loop(
                        cond=cond,
                        body=opt_body,
                        loop_vars=init_loop_vars,
                        parallel_iterations=1
                    )
                    # Increase the global training step counter.
                    loss = root._graph_fn_training_step(loss)
                    return loss, loss_per_item, vf_loss, vf_loss_per_item

            elif get_backend() == "pytorch":
                batch_size = list(flatten_op(preprocessed_states).values())[0].shape[0]
                sample_size = min(batch_size, agent.sample_size)

                if isinstance(prev_log_probs, dict):
                    for name in actions.keys():
                        prev_log_probs[name] = prev_log_probs[name].detach()
                else:
                    prev_log_probs = prev_log_probs.detach()
                prev_state_values = value_function.value_output(preprocessed_states).detach()
                if apply_postprocessing:
                    advantages = gae_function.calc_gae_values(prev_state_values, rewards, terminals, sequence_indices)
                else:
                    advantages = rewards
                if self.standardize_advantages:
                    std = torch.std(advantages)
                    if not np.isnan(std):
                        advantages = (advantages - torch.mean(advantages)) / std

                for _ in range(agent.iterations):
                    start = int(torch.rand(1) * (batch_size - 1))
                    indices = torch.arange(start=start, end=start + sample_size, dtype=torch.long) % batch_size
                    sample_states = torch.index_select(preprocessed_states, 0, indices)

                    if isinstance(actions, dict):
                        sample_actions = DataOpDict()
                        sample_prev_log_probs = DataOpDict()
                        for name, action in define_by_run_flatten(actions, scope_separator_at_start=False).items():
                            sample_actions[name] = torch.index_select(action, 0, indices)
                            sample_prev_log_probs[name] = torch.index_select(prev_log_probs[name], 0, indices)
                    else:
                        sample_actions = torch.index_select(actions, 0, indices)
                        sample_prev_log_probs = torch.index_select(prev_log_probs, 0, indices)

                    sample_advantages = torch.index_select(advantages, 0, indices)
                    sample_prev_state_values = torch.index_select(prev_state_values, 0, indices)

                    sample_log_probs = policy.get_log_likelihood(sample_states, sample_actions)["log_likelihood"]
                    sample_state_values = value_function.value_output(sample_states)

                    entropy = policy.get_entropy(sample_states)["entropy"]
                    loss, loss_per_item, vf_loss, vf_loss_per_item = loss_function.loss(
                        sample_log_probs, sample_prev_log_probs,
                        sample_state_values, sample_prev_state_values, sample_advantages, entropy, time_percentage
                    )

                    # Do not need step op.
                    optimizer.step(policy.variables(), loss, loss_per_item, time_percentage)
                    value_function_optimizer.step(value_function.variables(), vf_loss, vf_loss_per_item, time_percentage)
                return loss, loss_per_item, vf_loss, vf_loss_per_item
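
Both backends sub-sample the same way: pick a random start and take a contiguous window of sample_size indices, wrapping around the batch with a modulo. A quick numpy illustration (batch and sample sizes are made up):

import numpy as np

batch_size, sample_size = 10, 4
start = np.random.randint(0, batch_size)
indices = (start + np.arange(sample_size)) % batch_size  # wraps past the end
print(start, indices)  # e.g. 8 -> [8 9 0 1]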
Example #20
                def opt_body(index_, loss_, loss_per_item_, vf_loss_,
                             vf_loss_per_item_):
                    start = tf.random_uniform(shape=(),
                                              minval=0,
                                              maxval=batch_size - 1,
                                              dtype=tf.int32)
                    indices = tf.range(
                        start=start,
                        limit=start + agent.sample_size) % batch_size
                    sample_states = tf.gather(params=preprocessed_states,
                                              indices=indices)
                    if isinstance(actions, ContainerDataOp):
                        sample_actions = FlattenedDataOp()
                        for name, action in flatten_op(actions).items():
                            sample_actions[name] = tf.gather(params=action,
                                                             indices=indices)
                        sample_actions = unflatten_op(sample_actions)
                    else:
                        sample_actions = tf.gather(params=actions,
                                                   indices=indices)

                    sample_prior_log_probs = tf.gather(params=prev_log_probs,
                                                       indices=indices)
                    sample_rewards = tf.gather(params=rewards, indices=indices)
                    sample_terminals = tf.gather(params=terminals,
                                                 indices=indices)
                    sample_sequence_indices = tf.gather(
                        params=sequence_indices, indices=indices)
                    sample_advantages = tf.gather(params=advantages,
                                                  indices=indices)
                    sample_advantages.set_shape((self.sample_size, ))

                    sample_baseline_values = value_function.value_output(
                        sample_states)
                    sample_prior_baseline_values = tf.gather(
                        params=prior_baseline_values, indices=indices)

                    # If we are a multi-GPU root:
                    # Simply feeds everything into the multi-GPU sync optimizer's method and return.
                    if multi_gpu_sync_optimizer is not None:
                        main_policy_vars = agent.policy.variables()
                        main_vf_vars = agent.value_function.variables()
                        all_vars = agent.vars_merger.merge(
                            main_policy_vars, main_vf_vars)
                        # grads_and_vars, loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        out = multi_gpu_sync_optimizer.calculate_update_from_external_batch(
                            all_vars,
                            sample_states,
                            sample_actions,
                            sample_rewards,
                            sample_terminals,
                            sample_sequence_indices,
                            apply_postprocessing=apply_postprocessing)
                        avg_grads_and_vars_policy, avg_grads_and_vars_vf = agent.vars_splitter.call(
                            out["avg_grads_and_vars_by_component"])
                        policy_step_op = agent.optimizer.apply_gradients(
                            avg_grads_and_vars_policy)
                        vf_step_op = agent.value_function_optimizer.apply_gradients(
                            avg_grads_and_vars_vf)
                        step_op = root._graph_fn_group(policy_step_op,
                                                       vf_step_op)
                        step_and_sync_op = multi_gpu_sync_optimizer.sync_variables_to_towers(
                            step_op, all_vars)
                        loss_vf, loss_per_item_vf = out[
                            "additional_return_0"], out["additional_return_1"]

                        # Have to set all shapes here due to strict loop-var shape requirements.
                        out["loss"].set_shape(())
                        loss_vf.set_shape(())
                        loss_per_item_vf.set_shape((agent.sample_size, ))
                        out["loss_per_item"].set_shape((agent.sample_size, ))

                        with tf.control_dependencies([step_and_sync_op]):
                            if index_ == 0:
                                # Increase the global training step counter.
                                out["loss"] = root._graph_fn_training_step(
                                    out["loss"])
                            return index_ + 1, out["loss"], out[
                                "loss_per_item"], loss_vf, loss_per_item_vf

                    policy_probs = policy.get_log_likelihood(
                        sample_states, sample_actions)["log_likelihood"]
                    baseline_values = value_function.value_output(
                        tf.stop_gradient(sample_states))
                    sample_rewards = tf.cond(
                        pred=apply_postprocessing,
                        true_fn=lambda: gae_function.calc_gae_values(
                            baseline_values, sample_rewards, sample_terminals,
                            sample_sequence_indices),
                        false_fn=lambda: sample_rewards)
                    sample_rewards.set_shape((agent.sample_size, ))
                    entropy = policy.get_entropy(sample_states)["entropy"]

                    loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        loss_function.loss(
                            policy_probs, sample_prior_log_probs,
                            sample_baseline_values, sample_prior_baseline_values, sample_advantages, entropy
                        )

                    if hasattr(root, "is_multi_gpu_tower"
                               ) and root.is_multi_gpu_tower is True:
                        policy_grads_and_vars = optimizer.calculate_gradients(
                            policy.variables(), loss)
                        vf_grads_and_vars = value_function_optimizer.calculate_gradients(
                            value_function.variables(), vf_loss)
                        grads_and_vars_by_component = vars_merger.merge(
                            policy_grads_and_vars, vf_grads_and_vars)
                        return grads_and_vars_by_component, loss, loss_per_item, vf_loss, vf_loss_per_item
                    else:
                        step_op, loss, loss_per_item = optimizer.step(
                            policy.variables(), loss, loss_per_item)
                        loss.set_shape(())
                        loss_per_item.set_shape((agent.sample_size, ))

                        vf_step_op, vf_loss, vf_loss_per_item = value_function_optimizer.step(
                            value_function.variables(), vf_loss,
                            vf_loss_per_item)
                        vf_loss.set_shape(())
                        vf_loss_per_item.set_shape((agent.sample_size, ))

                        with tf.control_dependencies([step_op, vf_step_op]):
                            return index_ + 1, loss, loss_per_item, vf_loss, vf_loss_per_item