Example #1
    def _graph_fn_get_action_adapter_logits_parameters_log_probs(self, nn_output, nn_input):
        """
        Pushes the given nn_output through all our action adapters' `get_logits_parameters_log_probs` APIs and
        returns DataOpDicts with keys corresponding to our `action_space`.

        Args:
            nn_output (DataOp): The output of our neural network.
            nn_input (DataOp): The original input(s) of the NN (that produced `nn_output`).

        Returns:
            tuple:
                - FlattenedDataOp: A DataOpDict with the different action adapters' logits outputs.
                - FlattenedDataOp: A DataOpDict with the different action adapters' parameters outputs.
                - FlattenedDataOp: A DataOpDict with the different action adapters' log_probs outputs.
            Note: Keys always correspond to structure of `self.action_space`.
        """
        logits = FlattenedDataOp()
        parameters = FlattenedDataOp()
        log_probs = FlattenedDataOp()

        if isinstance(nn_input, dict):
            nn_input = next(iter(nn_input.values()))

        for flat_key, action_adapter in self.action_adapters.items():
            out = action_adapter.get_logits_parameters_log_probs(nn_output, nn_input)
            logits[flat_key], parameters[flat_key], log_probs[flat_key] = \
                out["logits"], out["parameters"], out["log_probs"]

        return logits, parameters, log_probs
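This per-flat-key routing is the pattern behind most of the examples on this page: one action adapter per flat key of the (possibly nested) action space, with results collected under the same keys. A minimal sketch, with plain dicts standing in for FlattenedDataOp and hypothetical lambda stubs standing in for the adapters:

# Plain dicts stand in for FlattenedDataOp; the adapter stubs and their keys are made up.
action_adapters = {
    "/move": lambda nn_out: {"logits": nn_out, "parameters": nn_out * 0.5, "log_probs": nn_out - 1.0},
    "/turn": lambda nn_out: {"logits": -nn_out, "parameters": nn_out * 2.0, "log_probs": nn_out + 1.0},
}

def get_logits_parameters_log_probs(nn_output):
    logits, parameters, log_probs = {}, {}, {}
    for flat_key, adapter in action_adapters.items():
        out = adapter(nn_output)
        logits[flat_key], parameters[flat_key], log_probs[flat_key] = \
            out["logits"], out["parameters"], out["log_probs"]
    return logits, parameters, log_probs

logits, _, _ = get_logits_parameters_log_probs(1.0)
assert logits == {"/move": 1.0, "/turn": -1.0}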
Example #2
    def _graph_fn_get_action_and_log_prob(self, parameters, deterministic):
        action = FlattenedDataOp()
        log_prob = FlattenedDataOp()
        for flat_key, action_space_component in self.action_space.flatten().items():
            # Skip our distribution iff discrete action-space and deterministic acting (greedy).
            # In that case, one does not need to create a distribution in the graph on each act: the argmax
            # over the logits is the same as the argmax over the probabilities (or log-probabilities).
            if flat_key == "":
                if isinstance(parameters, FlattenedDataOp):
                    params = parameters[""]
                else:
                    params = parameters
            else:
                params = parameters.flat_key_lookup(flat_key)

            if isinstance(action_space_component, IntBox) and \
                    (deterministic is True or (isinstance(deterministic, np.ndarray) and deterministic)):
                action[flat_key] = self._graph_fn_get_deterministic_action_wo_distribution(params)
                log_prob[flat_key] = tf.reduce_max(params, axis=-1)
            elif isinstance(action_space_component, BoolBox) and \
                    (deterministic is True or (isinstance(deterministic, np.ndarray) and deterministic)):
                action[flat_key] = tf.greater(params, 0.5)
                log_prob[flat_key] = params
            else:
                action[flat_key], log_prob[flat_key] = self.distributions[flat_key].sample_and_log_prob(
                    params, deterministic
                )

        if len(action) == 1 and "" in action:
            return action[""], log_prob[""]
        else:
            return action, log_prob
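The IntBox shortcut above relies on softmax being monotonic: the argmax over the logits equals the argmax over the probabilities (and log-probabilities), so greedy acting never needs a distribution op. A quick numpy check of that equivalence:

import numpy as np

logits = np.array([[1.0, 3.0, 2.0]])
probs = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)
# Softmax preserves ordering, so all three argmaxes agree.
greedy = np.argmax(logits, axis=-1)
assert np.array_equal(greedy, np.argmax(probs, axis=-1))
assert np.array_equal(greedy, np.argmax(np.log(probs), axis=-1))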
Example #3
    def _graph_fn_get_adapter_outputs_and_parameters(self, nn_outputs):
        """
        Pushes the given nn_output through all our action adapters' get_logits_parameters_log_probs API's and
        returns a DataOpDict with the keys corresponding to our `action_space`.

        Args:
            nn_outputs (DataOp): The output of our neural network.
            #nn_inputs (DataOp): The original inputs of the NN (that produced the `nn_outputs`).

        Returns:
            tuple:
                - FlattenedDataOp: A DataOpDict with the different action adapters' logits outputs.
                - FlattenedDataOp: A DataOpDict with the different action adapters' parameters outputs.
                - FlattenedDataOp: A DataOpDict with the different action adapters' log_probs outputs.
            Note: Keys always correspond to structure of `self.action_space`.
        """
        adapter_outputs = FlattenedDataOp()
        parameters = FlattenedDataOp()
        log_probs = FlattenedDataOp()

        #if isinstance(nn_inputs, dict):
        #    nn_inputs = next(iter(nn_inputs.values()))

        for flat_key, action_adapter in self.action_adapters.items():
            adapter_outs = action_adapter.call(nn_outputs)
            params = action_adapter.get_parameters_from_adapter_outputs(
                adapter_outs)
            #out = action_adapter.get_adapter_outputs_and_parameters(nn_outputs, nn_inputs)
            adapter_outputs[flat_key], parameters[flat_key], log_probs[flat_key] = \
                adapter_outs, params["parameters"], params.get("log_probs")

        return adapter_outputs, parameters, log_probs
Example #4
    def _graph_fn_split_batch(self, *inputs):
        """
        Splits all DataOps in *inputs along their batch dimension into n equally sized shards. The number of shards
        is determined by `self.num_shards` (int) and the size of each shard depends on the incoming batch size with
        possibly a few superfluous items in the batch being discarded
        (effective batch size = num_shards x shard_size).

        Args:
            *inputs (FlattenedDataOp): Input tensors which must all have the same batch dimension.

        Returns:
            tuple: `self.num_shards` DataOpTuples, one per shard. Each DataOpTuple has len = number of
                input args; each of its items is the (re-nested) shard of the batch data for that input
                piece (e.g. "/states1").

                E.g. for 2 shards:
                (DataOpTuple(input1_shard0, input2_shard0, ...), DataOpTuple(input1_shard1, input2_shard1, ...))
        """
        if get_backend() == "tf":
            #batch_size = tf.shape(next(iter(inputs[0].values())))[0]
            #shard_size = tf.cast(batch_size / self.num_shards, dtype=tf.int32)

            # Must be evenly divisible so we slice out an evenly divisible tensor.
            # E.g. 203 items in batch with 4 shards -> Only 4 x 50 = 200 are usable.
            usable_size = self.shard_size * self.num_shards

            # List (one item for each input arg). Each item in the list looks like:
            # A FlattenedDataOp with (flat) keys (describing the input-piece (e.g. "/states1")) and values being
            # lists of len n for the n shards' data.
            inputs_flattened_and_split = list()

            for input_arg_data in inputs:
                shard_dict = FlattenedDataOp()
                for flat_key, data in input_arg_data.items():
                    usable_input_tensor = data[:usable_size]
                    shard_dict[flat_key] = tf.split(
                        value=usable_input_tensor,
                        num_or_size_splits=self.num_shards)
                inputs_flattened_and_split.append(shard_dict)

            # Flip the list to generate a new list where each item represents one shard.
            shard_list = list()
            for shard_idx in range(self.num_shards):
                # To be converted into FlattenedDataOps over the input-arg-pieces once complete.
                input_arg_list = list()
                for input_elem in range(len(inputs)):
                    sharded_data_dict = FlattenedDataOp()
                    for flat_key, shards in inputs_flattened_and_split[
                            input_elem].items():
                        sharded_data_dict[flat_key] = shards[shard_idx]
                    input_arg_list.append(unflatten_op(sharded_data_dict))
                # Must store everything as FlattenedDataOp otherwise the re-nesting will not work.
                shard_list.append(DataOpTuple(input_arg_list))

            # Return n values (n = number of batch shards).
            return tuple(shard_list)
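The even-divisibility step is the core of the sharding: a few trailing items are dropped so the split can produce equal shards. A numpy sketch of the docstring's own example (203 items, 4 shards -> only 4 x 50 = 200 usable):

import numpy as np

num_shards = 4
batch = np.arange(203)
shard_size = len(batch) // num_shards            # 50
usable_size = shard_size * num_shards            # 200 -> 3 superfluous items discarded
shards = np.split(batch[:usable_size], num_shards)
assert all(len(shard) == shard_size for shard in shards)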
Example #5
    def _graph_fn_log_prob(self, parameters, values):
        ret = {}
        for key in parameters:
            #d = self.flattened_sub_distributions[key].get_distribution(parameters[key])
            #return self.flattened_sub_distributions[key]._graph_fn_log_prob(distribution, values)
            ret[key] = self.flattened_sub_distributions[key].log_prob(parameters[key], values[key])
        return FlattenedDataOp(ret)
Example #6
    def _graph_fn_get_adapter_outputs(self, flat_key, nn_outputs):  #, nn_inputs):
        """
        Pushes the given nn_output through all our action adapters and returns a DataOpDict with the keys corresponding
        to our `action_space`.

        Args:
            nn_outputs (DataOp): The output of our neural network.
            #nn_inputs (DataOp): The original inputs of the NN (that produced the `nn_outputs`).

        Returns:
            FlattenedDataOp: A DataOpDict with the different action adapter outputs (keys correspond to
                structure of `self.action_space`).
        """
        # NN outputs are already split -> Feed flat-key NN output directly into its corresponding action_adapter.
        if flat_key in self.action_adapters:
            return self.action_adapters[flat_key].call(nn_outputs)
        # Many NN outputs, but no action adapter specified for this one -> return nn_outputs as-is.
        elif flat_key != "":
            return nn_outputs

        ret = FlattenedDataOp()
        for aa_flat_key, action_adapter in self.action_adapters.items():
            ret[aa_flat_key] = action_adapter.call(nn_outputs)

        return ret
Example #7
    def _graph_fn_get_action_components(self, logits, parameters, deterministic):
        ret = FlattenedDataOp()

        # TODO Clean up the checks in here wrt define-by-run processing.
        for flat_key, action_space_component in self.action_space.flatten().items():
            # Skip our distribution iff discrete action-space and deterministic acting (greedy).
            # In that case, one does not need to create a distribution in the graph on each act: the argmax
            # over the logits is the same as the argmax over the probabilities (or log-probabilities).
            if isinstance(action_space_component, IntBox) and \
                    (deterministic is True or (isinstance(deterministic, np.ndarray) and deterministic)):
                if flat_key == "":
                    return self._graph_fn_get_deterministic_action_wo_distribution(logits)
                else:
                    ret[flat_key] = self._graph_fn_get_deterministic_action_wo_distribution(
                        logits.flat_key_lookup(flat_key)
                    )
            elif isinstance(action_space_component, BoolBox) and \
                    (deterministic is True or (isinstance(deterministic, np.ndarray) and deterministic)):
                if flat_key == "":
                    return tf.greater(logits, 0.5)
                else:
                    ret[flat_key] = tf.greater(logits.flat_key_lookup(flat_key), 0.5)
            else:
                if flat_key == "":
                    # Still wrapped as FlattenedDataOp.
                    if isinstance(parameters, FlattenedDataOp):
                        return self.distributions[flat_key].draw(parameters[flat_key], deterministic)
                    else:
                        return self.distributions[flat_key].draw(parameters, deterministic)

                if isinstance(parameters, ContainerDataOp):
                    ret[flat_key] = self.distributions[flat_key].draw(parameters.flat_key_lookup(flat_key), deterministic)
                else:
                    ret[flat_key] = self.distributions[flat_key].draw(parameters[flat_key], deterministic)
        return ret
Example #8
    def _graph_fn_get_distribution_log_probs(self, parameters, actions):
        """
        Pushes the given `parameters` and `actions` through all our distributions' `log_prob` API-methods and
        returns a DataOpDict with the keys corresponding to our `action_space`.

        Args:
            parameters (DataOp): The parameters to define a distribution.
            actions (DataOp): The actions for which to return the log-probs.

        Returns:
            FlattenedDataOp: A DataOpDict with the different distributions' `log_prob` outputs. Keys always correspond
                to structure of `self.action_space`.
        """
        ret = FlattenedDataOp()
        for flat_key, action_space_component in self.action_space.flatten().items():
            if flat_key == "":
                if isinstance(parameters, FlattenedDataOp):
                    return self.distributions[flat_key].log_prob(parameters[flat_key], actions)
                else:
                    return self.distributions[flat_key].log_prob(parameters, actions)
            else:
                ret[flat_key] = self.distributions[flat_key].log_prob(
                    parameters.flat_key_lookup(flat_key), actions.flat_key_lookup(flat_key)
                )
        return ret
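For a single discrete component, the `log_prob` lookup boils down to indexing the chosen action's (log-)probability. A numpy sketch with made-up parameters, assuming the distribution is parameterized by probabilities:

import numpy as np

probs = np.array([0.1, 0.7, 0.2])     # parameters of one IntBox component's distribution
action = 1
log_prob = np.log(probs[action])      # log-probability of the chosen action
assert np.isclose(log_prob, np.log(0.7))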
Example #9
    def split_flattened_input_ops(self, *ops, **kwarg_ops):
        """
        Splits any FlattenedDataOp in *ops and **kwarg_ops into its SingleDataOps and collects them to be passed
        one by one through some graph_fn. If more than one FlattenedDataOp exists in *ops and **kwarg_ops,
        these must have the exact same keys.
        If `add_auto_key_as_first_param` is True: Add auto-key as very first parameter in each
        returned parameter tuple.

        Args:
            *ops (op): The primitive ops to split.
            **kwarg_ops (op): More primitive ops to split (but by named key).

        Returns:
            Union[FlattenedDataOp,Tuple[DataOp]]: The sorted parameter tuples (by flat-key) to use as inputs
                in the calls to the graph_fn.
                If no FlattenedDataOp is in ops, returns ops as-is.

        Raises:
            RLGraphError: If there is more than one flattened op in `ops` and their keys don't match exactly.
        """
        assert all(op is not None for op in ops)  # just make sure

        # Collect FlattenedDataOp for checking their keys (must match).
        flattened = [op.items() for op in ops if len(op) > 1 or "" not in op]
        # If there is more than one, make sure their keys match; if not, raise an error.
        if len(flattened) > 1:
            # Loop through the non-first ones and make sure all keys match vs the first one.
            for other in flattened[1:]:
                other_arg_iter = iter(other)
                for key, value in flattened[0]:
                    k_other, v_other = next(other_arg_iter)
                    if key != k_other:  # or get_shape(v_other) != get_shape(value):
                        raise RLGraphError("ERROR: Flattened ops have a key mismatch ({} vs {})!".format(key, k_other))

        # We have one or many (matching) ContainerDataOps: Split the calls.
        if len(flattened) > 0:
            # The first op that is a FlattenedDataOp.
            guide_op = next(op for op in ops if len(op) > 1 or "" not in op)
            # Re-create our iterators.
            collected_call_params = FlattenedDataOp()
            # Do the single split calls to our computation func.
            for key in guide_op.keys():
                # Prep input params for a single call.
                params = [key] if self.add_auto_key_as_first_param is True else []
                for op in ops:
                    params.append(op[key] if key in op else op[""])
                # Add kwarg_ops
                for kwarg_key, kwarg_op in kwarg_ops.items():
                    params.append(tuple([
                        kwarg_key,
                        kwarg_ops[kwarg_key][key] if key in kwarg_ops[kwarg_key] else kwarg_ops[kwarg_key][""]
                    ]))
                # Now do the single call.
                collected_call_params[key] = params
            return collected_call_params
        # We don't have any container ops: No splitting possible. Return args and kwargs as is.
        else:
            return tuple(([""] if self.add_auto_key_as_first_param is True else []) + [op[""] for op in ops]),\
                   {key: value[""] for key, value in kwarg_ops.items()}
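The broadcast rule in the loop (`op[key] if key in op else op[""]`) lets a non-flattened op, stored under the single "" key, be reused for every flat key of the guide op. A plain-dict sketch with made-up keys and values:

# Plain dicts stand in for FlattenedDataOps.
guide_op = {"/a": 1, "/b": 2}    # a genuinely flattened op
plain_op = {"": 10}              # a non-flattened op: single "" key

collected = {
    key: [op[key] if key in op else op[""] for op in (guide_op, plain_op)]
    for key in guide_op
}
assert collected == {"/a": [1, 10], "/b": [2, 10]}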
Example #10
    def _graph_fn_2_to_3(self, input1, input2):
        """
        NOTE: Both input1 and input2 are flattened dicts.

        Returns:
            Tuple:
                - in1 + in2
                - in1 - in2
                - in2

        """
        ret = FlattenedDataOp()
        ret2 = FlattenedDataOp()
        for key, value in input1.items():
            ret[key] = value + input2[""]
            ret2[key] = value - input2[""]
        return ret, ret2, input2
Example #11
    def _graph_fn_get_adapter_outputs_and_parameters(self, flat_key, nn_outputs):
        """
        Pushes the given nn_output through all our action adapters' get_logits_parameters_log_probs API's and
        returns a DataOpDict with the keys corresponding to our `action_space`.

        Args:
            nn_outputs (DataOp): The output of our neural network.
            #nn_inputs (DataOp): The original inputs of the NN (that produced the `nn_outputs`).

        Returns:
            tuple:
                - FlattenedDataOp: A DataOpDict with the different action adapters' logits outputs.
                - FlattenedDataOp: A DataOpDict with the different action adapters' parameters outputs.
                - FlattenedDataOp: A DataOpDict with the different action adapters' log_probs outputs.
            Note: Keys always correspond to structure of `self.action_space`.
        """
        # NN outputs are already split -> Feed flat-key NN output directly into its corresponding action_adapter.
        if flat_key in self.action_adapters:
            adapter_outs = self.action_adapters[flat_key].call(nn_outputs)
            params = self.action_adapters[flat_key].get_parameters_from_adapter_outputs(adapter_outs)
            return adapter_outs, params["parameters"], params.get("probabilities"), params.get("log_probs")
        # Many NN outputs, but no action adapters specified for this one -> return nn_outputs as is.
        elif flat_key != "":
            return nn_outputs, nn_outputs, None, None

        # There is only a single NN-output, but many action adapters.
        adapter_outputs = FlattenedDataOp()
        parameters = FlattenedDataOp()
        probs = FlattenedDataOp()
        log_probs = FlattenedDataOp()
        for aa_flat_key, action_adapter in self.action_adapters.items():
            adapter_outs = action_adapter.call(nn_outputs)
            params = action_adapter.get_parameters_from_adapter_outputs(adapter_outs)
            #out = action_adapter.get_adapter_outputs_and_parameters(nn_outputs, nn_inputs)
            adapter_outputs[aa_flat_key], parameters[aa_flat_key], probs[aa_flat_key], log_probs[aa_flat_key] = \
                adapter_outs, params["parameters"], params.get("probabilities"), params.get("log_probs")

        return adapter_outputs, parameters, probs, log_probs
Example #12
    def _graph_fn_get_records(self, num_records=1):
        # Get the records as dict.
        record_dict = self.queue.dequeue_many(num_records)
        # Return a FlattenedDataOp.
        flattened_records = FlattenedDataOp(record_dict)
        # Tag batch (and possibly time) rank onto the output ops for the auto-Space-inference.
        flat_record_space = self.record_space.flatten()
        for flat_key, op in record_dict.items():
            op._batch_rank = 0
            if flat_record_space[flat_key].has_time_rank:
                op._time_rank = 1
            flattened_records[flat_key] = op
        return flattened_records
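`_batch_rank` and `_time_rank` are ordinary Python attributes tagged onto the tf ops; the auto-Space-inference mentioned in the comment reads them to learn which ranks are batch and time. A stand-in sketch of the tagging (the class here is hypothetical; the attribute names are the ones used above):

class FakeOp:
    pass  # stand-in for a tf op, only to show the attribute tagging

op = FakeOp()
op._batch_rank = 0   # rank 0 is the batch dimension
op._time_rank = 1    # rank 1 is the time dimension (set only if the record space has one)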
Example #13
    def _graph_fn_get_action_layer_outputs(self, nn_output, nn_input):
        """
        Pushes the given nn_output through all our action adapters and returns a DataOpDict with the keys corresponding
        to our `action_space`.

        Args:
            nn_output (DataOp): The output of our neural network.
            nn_input (DataOp): The original input(s) of the NN (that produced `nn_output`).

        Returns:
            FlattenedDataOp: A DataOpDict with the different action adapter outputs (keys correspond to
                structure of `self.action_space`).
        """
        nn_input = next(iter(nn_input.values()))

        ret = FlattenedDataOp()
        for flat_key, action_adapter in self.action_adapters.items():
            ret[flat_key] = action_adapter.get_logits(nn_output, nn_input)

        return ret
Example #14
    def _graph_fn_unstage(self):
        """
        Unstages (and unflattens) all staged data.

        Returns:
            Tuple[DataOp]: All previously staged ops.
        """
        unstaged_data = self.area.get()
        unflattened_data = list()
        idx = 0
        # Unflatten all data and return.
        for flat_key_list in self.flat_keys:
            flat_dict = FlattenedDataOp({
                flat_key: item
                for flat_key, item in zip(
                    flat_key_list, unstaged_data[idx:idx + len(flat_key_list)])
            })
            unflattened_data.append(unflatten_op(flat_dict))
            idx += len(flat_key_list)

        return tuple(unflattened_data)
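The unflattening is driven purely by the recorded key lists: the flat list of unstaged tensors is cut back into one dict per original input arg. A plain-Python sketch with made-up keys:

flat_keys = [["/states", "/rewards"], [""]]   # key lists of two original input args
unstaged_data = ["t0", "t1", "t2"]            # flat list, in staging order

unflattened, idx = [], 0
for flat_key_list in flat_keys:
    unflattened.append(dict(zip(flat_key_list, unstaged_data[idx:idx + len(flat_key_list)])))
    idx += len(flat_key_list)
assert unflattened == [{"/states": "t0", "/rewards": "t1"}, {"": "t2"}]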
Example #15
    def _graph_fn_get_adapter_outputs(self, nn_outputs):  #, nn_inputs):
        """
        Pushes the given `nn_outputs` through all our action adapters and returns a DataOpDict with the keys
        corresponding to our `action_space`.

        Args:
            nn_outputs (DataOp): The output of our neural network.

        Returns:
            FlattenedDataOp: A DataOpDict with the different action adapter outputs (keys correspond to
                structure of `self.action_space`).
        """
        #if isinstance(nn_inputs, FlattenedDataOp):
        #    nn_inputs = next(iter(nn_inputs.values()))

        ret = FlattenedDataOp()
        for flat_key, action_adapter in self.action_adapters.items():
            ret[flat_key] = action_adapter.call(nn_outputs)

        return ret
Example #16
    def _graph_fn_sample(self, sample_size, inputs):
        """
        Takes a set of input tensors and uniformly samples (with replacement) a subset of the
        specified size from them.

        Args:
            sample_size (SingleDataOp[int]): Subsample size.
            inputs (FlattenedDataOp): Input tensors (in a FlattenedDataOp) to sample from.
                All values (tensors) must have the same batch size.

        Returns:
            FlattenedDataOp: The sub-sampled inputs (will be unflattened automatically).
        """
        batch_size = get_batch_size(next(iter(inputs.values())))

        if get_backend() == "tf":
            sample_indices = tf.random_uniform(shape=(sample_size, ),
                                               maxval=batch_size,
                                               dtype=tf.int32)
            sample = FlattenedDataOp()
            for key, tensor in inputs.items():
                sample[key] = tf.gather(params=tensor, indices=sample_indices)
            return sample
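Because the indices come from a uniform random draw, this subsampling is with replacement: the same row may be gathered more than once. The numpy equivalent of the index-draw plus gather, with made-up input:

import numpy as np

inputs = {"/states": np.arange(100).reshape(50, 2)}           # 50 rows of made-up data
sample_indices = np.random.randint(0, 50, size=8)             # with replacement
sample = {key: tensor[sample_indices] for key, tensor in inputs.items()}
assert sample["/states"].shape == (8, 2)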
Example #17
    def _graph_fn_get_distribution_entropies(self, parameters):
        """
        Pushes the given `parameters` through all our distributions' `entropy` API-methods and returns a
        DataOpDict with the keys corresponding to our `action_space`.

        Args:
            parameters (DataOp): The parameters to define a distribution. This could be a ContainerDataOp, which
                contains the parameter pieces for each action component.

        Returns:
            FlattenedDataOp: A DataOpDict with the different distributions' `entropy` outputs. Keys always correspond to
                structure of `self.action_space`.
        """
        ret = FlattenedDataOp()
        for flat_key, d in self.distributions.items():
            if flat_key == "":
                if isinstance(parameters, FlattenedDataOp):
                    return d.entropy(parameters[flat_key])
                else:
                    return d.entropy(parameters)
            else:
                ret[flat_key] = d.entropy(parameters.flat_key_lookup(flat_key))
        return ret
Example #18
    def _graph_fn_sample_stochastic(self, parameters):
        ret = {}
        for key in parameters:
            ret[key] = self.flattened_sub_distributions[key].sample_stochastic(parameters[key])
        return FlattenedDataOp(ret)
Example #19
                def opt_body(index_, loss_, loss_per_item_, vf_loss_,
                             vf_loss_per_item_):
                    start = tf.random_uniform(shape=(),
                                              minval=0,
                                              maxval=batch_size - 1,
                                              dtype=tf.int32)
                    indices = tf.range(
                        start=start,
                        limit=start + agent.sample_size) % batch_size
                    sample_states = tf.gather(params=preprocessed_states,
                                              indices=indices)
                    if isinstance(actions, ContainerDataOp):
                        sample_actions = FlattenedDataOp()
                        for name, action in flatten_op(actions).items():
                            sample_actions[name] = tf.gather(params=action,
                                                             indices=indices)
                        sample_actions = unflatten_op(sample_actions)
                    else:
                        sample_actions = tf.gather(params=actions,
                                                   indices=indices)

                    sample_prior_log_probs = tf.gather(params=prev_log_probs,
                                                       indices=indices)
                    sample_rewards = tf.gather(params=rewards, indices=indices)
                    sample_terminals = tf.gather(params=terminals,
                                                 indices=indices)
                    sample_sequence_indices = tf.gather(
                        params=sequence_indices, indices=indices)
                    sample_advantages = tf.gather(params=advantages,
                                                  indices=indices)
                    sample_advantages.set_shape((self.sample_size, ))

                    sample_baseline_values = value_function.value_output(
                        sample_states)
                    sample_prior_baseline_values = tf.gather(
                        params=prior_baseline_values, indices=indices)

                    # If we are a multi-GPU root:
                    # Simply feeds everything into the multi-GPU sync optimizer's method and return.
                    if multi_gpu_sync_optimizer is not None:
                        main_policy_vars = agent.policy.variables()
                        main_vf_vars = agent.value_function.variables()
                        all_vars = agent.vars_merger.merge(
                            main_policy_vars, main_vf_vars)
                        # grads_and_vars, loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        out = multi_gpu_sync_optimizer.calculate_update_from_external_batch(
                            all_vars,
                            sample_states,
                            sample_actions,
                            sample_rewards,
                            sample_terminals,
                            sample_sequence_indices,
                            apply_postprocessing=apply_postprocessing)
                        avg_grads_and_vars_policy, avg_grads_and_vars_vf = agent.vars_splitter.call(
                            out["avg_grads_and_vars_by_component"])
                        policy_step_op = agent.optimizer.apply_gradients(
                            avg_grads_and_vars_policy)
                        vf_step_op = agent.value_function_optimizer.apply_gradients(
                            avg_grads_and_vars_vf)
                        step_op = root._graph_fn_group(policy_step_op,
                                                       vf_step_op)
                        step_and_sync_op = multi_gpu_sync_optimizer.sync_variables_to_towers(
                            step_op, all_vars)
                        loss_vf, loss_per_item_vf = out[
                            "additional_return_0"], out["additional_return_1"]

                        # Have to set all shapes here due to strict loop-var shape requirements.
                        out["loss"].set_shape(())
                        loss_vf.set_shape(())
                        loss_per_item_vf.set_shape((agent.sample_size, ))
                        out["loss_per_item"].set_shape((agent.sample_size, ))

                        with tf.control_dependencies([step_and_sync_op]):
                            if index_ == 0:
                                # Increase the global training step counter.
                                out["loss"] = root._graph_fn_training_step(
                                    out["loss"])
                            return index_ + 1, out["loss"], out[
                                "loss_per_item"], loss_vf, loss_per_item_vf

                    policy_probs = policy.get_log_likelihood(
                        sample_states, sample_actions)["log_likelihood"]
                    baseline_values = value_function.value_output(
                        tf.stop_gradient(sample_states))
                    sample_rewards = tf.cond(
                        pred=apply_postprocessing,
                        true_fn=lambda: gae_function.calc_gae_values(
                            baseline_values, sample_rewards, sample_terminals,
                            sample_sequence_indices),
                        false_fn=lambda: sample_rewards)
                    sample_rewards.set_shape((agent.sample_size, ))
                    entropy = policy.get_entropy(sample_states)["entropy"]

                    loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        loss_function.loss(
                            policy_probs, sample_prior_log_probs,
                            sample_baseline_values, sample_prior_baseline_values, sample_advantages, entropy
                        )

                    if hasattr(root, "is_multi_gpu_tower"
                               ) and root.is_multi_gpu_tower is True:
                        policy_grads_and_vars = optimizer.calculate_gradients(
                            policy.variables(), loss)
                        vf_grads_and_vars = value_function_optimizer.calculate_gradients(
                            value_function.variables(), vf_loss)
                        grads_and_vars_by_component = vars_merger.merge(
                            policy_grads_and_vars, vf_grads_and_vars)
                        return grads_and_vars_by_component, loss, loss_per_item, vf_loss, vf_loss_per_item
                    else:
                        step_op, loss, loss_per_item = optimizer.step(
                            policy.variables(), loss, loss_per_item)
                        loss.set_shape(())
                        loss_per_item.set_shape((agent.sample_size, ))

                        vf_step_op, vf_loss, vf_loss_per_item = value_function_optimizer.step(
                            value_function.variables(), vf_loss,
                            vf_loss_per_item)
                        vf_loss.set_shape(())
                        vf_loss_per_item.set_shape((agent.sample_size, ))

                        with tf.control_dependencies([step_op, vf_step_op]):
                            return index_ + 1, loss, loss_per_item, vf_loss, vf_loss_per_item
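The index construction at the top of `opt_body` samples a contiguous window of `sample_size` rows at a random offset and wraps it around the batch via the modulo. In numpy:

import numpy as np

batch_size, sample_size = 8, 5
start = np.random.randint(0, batch_size - 1)            # mirrors tf.random_uniform(..., maxval=batch_size - 1)
indices = (start + np.arange(sample_size)) % batch_size
# e.g. start=6 -> indices [6, 7, 0, 1, 2]: contiguous, wrapping at the batch boundary.
assert len(set(indices.tolist())) == sample_size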
Example #20
    def _graph_fn_apply(self, preprocessing_inputs):
        """
        Sequences (stitches) together the incoming inputs by using our buffer (with stored older records).
        Sequencing happens within the last rank if `self.add_rank` is False, otherwise a new rank is added at the end
        for the sequencing.

        Args:
            preprocessing_inputs (FlattenedDataOp): The FlattenedDataOp to be sequenced.
                One sequence is generated separately for each SingleDataOp in `preprocessing_inputs`.

        Returns:
            FlattenedDataOp: The FlattenedDataOp holding the sequenced SingleDataOps as values.
        """
        # A normal (index != -1) assign op.
        if self.backend == "python" or get_backend() == "python":
            if self.index == -1:
                for _ in range_(self.sequence_length):
                    self.deque.append(preprocessing_inputs)
            else:
                self.deque.append(preprocessing_inputs)
            self.index = (self.index + 1) % self.sequence_length

            if self.add_rank:
                sequence = np.stack(self.deque, axis=-1)
            # Concat the sequence items in the last rank.
            else:
                sequence = np.concatenate(self.deque, axis=-1)

            # TODO move into transpose component.
            if self.in_data_format == "channels_last" and self.out_data_format == "channels_first":
                sequence = sequence.transpose((0, 3, 2, 1))

            return sequence
        elif get_backend() == "pytorch":
            if self.index == -1:
                for _ in range_(self.sequence_length):
                    if isinstance(preprocessing_inputs, dict):
                        for key, value in preprocessing_inputs.items():
                            self.deque.append(value)
                    else:
                        self.deque.append(preprocessing_inputs)
            else:
                if isinstance(preprocessing_inputs, dict):
                    for key, value in preprocessing_inputs.items():
                        self.deque.append(value)
                        self.index = (self.index + 1) % self.sequence_length
                else:
                    self.deque.append(preprocessing_inputs)
                    self.index = (self.index + 1) % self.sequence_length

            if self.add_rank:
                # `torch.stack` expects a sequence of tensors -> convert any raw items first.
                sequence = torch.stack(
                    [t if isinstance(t, torch.Tensor) else torch.tensor(t) for t in self.deque], dim=-1
                )
            # Concat the sequence items in the last rank.
            else:
                data = []
                for t in self.deque:
                    if isinstance(t, torch.Tensor):
                        data.append(t)
                    else:
                        data.append(torch.tensor(t))
                sequence = torch.cat(data, dim=-1)

            # TODO remove when transpose component implemented.
            if self.in_data_format == "channels_last" and self.out_data_format == "channels_first":
                # Problem: PyTorch does not have data format options in conv layers ->
                # only channels first supported.
                # -> Confusingly have to transpose.
                # B W H C -> B C W H
                # e.g. atari: [4 84 84 4] -> [4 4 84 84]
                sequence = sequence.permute(0, 3, 2, 1)

            return sequence
        elif get_backend() == "tf":
            # Assigns the input_ into the buffer at the current time index.
            def normal_assign():
                assigns = list()
                for key_, value in preprocessing_inputs.items():
                    assign_op = self.assign_variable(ref=self.buffer[key_][self.index], value=value)
                    assigns.append(assign_op)
                return assigns

            # After a reset (time index is -1), fill the entire buffer with `self.sequence_length` x input_.
            def after_reset_assign():
                assigns = list()
                for key_, value in preprocessing_inputs.items():
                    multiples = (self.sequence_length,) + tuple([1] * get_rank(value))
                    input_ = tf.expand_dims(input=value, axis=0)
                    assign_op = self.assign_variable(
                        ref=self.buffer[key_], value=tf.tile(input=input_, multiples=multiples)
                    )
                    assigns.append(assign_op)
                return assigns

            # Insert the input at the correct index or fill empty buffer entirely with input.
            insert_inputs = tf.cond(pred=(self.index >= 0), true_fn=normal_assign, false_fn=after_reset_assign)

            # Make sure the input has been inserted.
            with tf.control_dependencies(control_inputs=force_list(insert_inputs)):
                # Then increase index by 1.
                index_plus_1 = self.assign_variable(ref=self.index, value=((self.index + 1) % self.sequence_length))

            # Then gather the output.
            with tf.control_dependencies(control_inputs=[index_plus_1]):
                sequences = FlattenedDataOp()
                # Collect the correct previous inputs from the buffer to form the output sequence.
                for key in preprocessing_inputs.keys():
                    n_in = [self.buffer[key][(self.index + n) % self.sequence_length]
                            for n in range_(self.sequence_length)]

                    # Add the sequence-rank to the end of our inputs.
                    if self.add_rank:
                        sequence = tf.stack(values=n_in, axis=-1)
                    # Concat the sequence items in the last rank.
                    else:
                        sequence = tf.concat(values=n_in, axis=-1)

                    # Must pass the sequence through a placeholder_with_default dummy to set back the
                    # batch rank to '?', instead of 1 (1 would confuse the auto Space inference).
                    sequences[key] = tf.placeholder_with_default(
                        sequence, shape=(None,) + tuple(get_shape(sequence)[1:])
                    )
                # TODO: implement transpose.
                return sequences
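In the tf branch, the buffer read starts right after the just-incremented write index, so the gathered frames run oldest -> newest before being stacked or concatenated. A numpy sketch of that ring-buffer read:

import numpy as np

sequence_length = 4
write_index = 1                                     # slot that was just written
index = (write_index + 1) % sequence_length         # index after the increment
buffer = [np.full((1, 2), i) for i in range(sequence_length)]
n_in = [buffer[(index + n) % sequence_length] for n in range(sequence_length)]   # oldest -> newest
sequence = np.concatenate(n_in, axis=-1)            # concat in the last rank (add_rank=False case)
assert sequence.shape == (1, 8)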
Example #21
    def _graph_fn_draw(self, parameters, deterministic):
        ret = {}
        for key in parameters:
            ret[key] = self.flattened_sub_distributions[key].draw(parameters[key], deterministic)
        return FlattenedDataOp(ret)