Example #1
    def clean_dict(tensor_dict):
        """
        Detach tensor values in nested dict.
        Args:
            tensor_dict (dict): Dict containing torch tensors.

        Returns:
            dict: Dict containing numpy arrays.
        """
        # Un-nest.
        param = define_by_run_flatten(tensor_dict)
        ret = {}

        # Detach tensor values.
        for key, value in param.items():
            if isinstance(value, torch.Tensor):
                ret[key] = value.detach().numpy()

        # Pack again.
        return define_by_run_unflatten(ret)
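
A minimal usage sketch, assuming clean_dict and the define_by_run_* helpers it relies on are in scope (they belong to the library module this excerpt comes from): pass in a nested dict of torch tensors, and the same nesting comes back with detached numpy arrays.

import torch

# Assumes clean_dict (and the define_by_run_* helpers it uses) is in scope,
# e.g. defined in or imported from the library module shown above.
params = {
    "policy": {"weights": torch.randn(4, 2, requires_grad=True)},
    "value_function": {"bias": torch.zeros(2, requires_grad=True)},
}

# Same nested structure, but every tensor detached and converted to a numpy array.
numpy_params = clean_dict(params)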
Example #2
def force_torch_tensors(params, requires_grad=False):
    """
    Converts input params to torch tensors.
    Args:
        params (list): Input args.
        requires_grad (bool): Whether gradients need to be computed for these arguments.

    Returns:
        list: List of Torch tensors.
    """
    if get_backend() == "pytorch":
        tensor_params = []
        for param in params:
            if isinstance(param, dict):
                # Flatten dict.
                param = define_by_run_flatten(param)
                ret = {}
                for key, value in param.items():
                    ret[key] = convert_param(value, requires_grad)
                tensor_params.append(ret)
            else:
                tensor_params.append(convert_param(param, requires_grad))
        return tensor_params
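
A similarly hedged usage sketch: convert_param is a sibling helper not shown in the excerpt, and the call below assumes the backend has been configured as "pytorch" (otherwise the function falls through and returns None).

import numpy as np

# Assumes force_torch_tensors is in scope and get_backend() reports "pytorch".
args = [
    np.ones((2, 3), dtype=np.float32),
    {"mask": np.array([1.0, 0.0, 1.0], dtype=np.float32)},
]

# Each positional arg is converted via convert_param; dict args are flattened
# first, so the second element comes back as a dict of converted values.
tensor_args = force_torch_tensors(args, requires_grad=False)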
Example #3
        def _graph_fn_update_from_external_batch(root, preprocessed_states,
                                                 actions, rewards, terminals,
                                                 sequence_indices,
                                                 apply_postprocessing):
            """
            Calls iterative optimization by repeatedly sub-sampling.
            """
            multi_gpu_sync_optimizer = root.sub_components.get(
                "multi-gpu-synchronizer")

            # Return values.
            loss, loss_per_item, vf_loss, vf_loss_per_item = None, None, None, None

            policy = root.get_sub_component_by_name(agent.policy.scope)
            value_function = root.get_sub_component_by_name(
                agent.value_function.scope)
            optimizer = root.get_sub_component_by_name(agent.optimizer.scope)
            loss_function = root.get_sub_component_by_name(
                agent.loss_function.scope)
            value_function_optimizer = root.get_sub_component_by_name(
                agent.value_function_optimizer.scope)
            vars_merger = root.get_sub_component_by_name(
                agent.vars_merger.scope)
            gae_function = root.get_sub_component_by_name(
                agent.gae_function.scope)
            prev_log_probs = policy.get_log_likelihood(
                preprocessed_states, actions)["log_likelihood"]

            if get_backend() == "tf":
                # Log probs before update.
                prev_log_probs = tf.stop_gradient(prev_log_probs)
                batch_size = tf.shape(preprocessed_states)[0]
                prior_baseline_values = tf.stop_gradient(
                    value_function.value_output(preprocessed_states))

                # Advantages are based on prior baseline values.
                advantages = tf.cond(
                    pred=apply_postprocessing,
                    true_fn=lambda: gae_function.calc_gae_values(
                        prior_baseline_values, rewards, terminals,
                        sequence_indices),
                    false_fn=lambda: rewards)

                if self.standardize_advantages:
                    # Note: tf.nn.moments returns mean and variance (not std).
                    mean, variance = tf.nn.moments(x=advantages, axes=[0])
                    advantages = (advantages - mean) / tf.sqrt(variance)

                def opt_body(index_, loss_, loss_per_item_, vf_loss_,
                             vf_loss_per_item_):
                    start = tf.random_uniform(shape=(),
                                              minval=0,
                                              maxval=batch_size - 1,
                                              dtype=tf.int32)
                    indices = tf.range(
                        start=start,
                        limit=start + agent.sample_size) % batch_size
                    sample_states = tf.gather(params=preprocessed_states,
                                              indices=indices)
                    if isinstance(actions, ContainerDataOp):
                        sample_actions = FlattenedDataOp()
                        for name, action in flatten_op(actions).items():
                            sample_actions[name] = tf.gather(params=action,
                                                             indices=indices)
                        sample_actions = unflatten_op(sample_actions)
                    else:
                        sample_actions = tf.gather(params=actions,
                                                   indices=indices)

                    sample_prior_log_probs = tf.gather(params=prev_log_probs,
                                                       indices=indices)
                    sample_rewards = tf.gather(params=rewards, indices=indices)
                    sample_terminals = tf.gather(params=terminals,
                                                 indices=indices)
                    sample_sequence_indices = tf.gather(
                        params=sequence_indices, indices=indices)
                    sample_advantages = tf.gather(params=advantages,
                                                  indices=indices)
                    sample_advantages.set_shape((self.sample_size, ))

                    sample_baseline_values = value_function.value_output(
                        sample_states)
                    sample_prior_baseline_values = tf.gather(
                        params=prior_baseline_values, indices=indices)

                    # If we are a multi-GPU root, simply feed everything into the
                    # multi-GPU sync optimizer's method and return.
                    if multi_gpu_sync_optimizer is not None:
                        main_policy_vars = agent.policy.variables()
                        main_vf_vars = agent.value_function.variables()
                        all_vars = agent.vars_merger.merge(
                            main_policy_vars, main_vf_vars)
                        # grads_and_vars, loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        out = multi_gpu_sync_optimizer.calculate_update_from_external_batch(
                            all_vars,
                            sample_states,
                            sample_actions,
                            sample_rewards,
                            sample_terminals,
                            sample_sequence_indices,
                            apply_postprocessing=apply_postprocessing)
                        avg_grads_and_vars_policy, avg_grads_and_vars_vf = agent.vars_splitter.call(
                            out["avg_grads_and_vars_by_component"])
                        policy_step_op = agent.optimizer.apply_gradients(
                            avg_grads_and_vars_policy)
                        vf_step_op = agent.value_function_optimizer.apply_gradients(
                            avg_grads_and_vars_vf)
                        step_op = root._graph_fn_group(policy_step_op,
                                                       vf_step_op)
                        step_and_sync_op = multi_gpu_sync_optimizer.sync_variables_to_towers(
                            step_op, all_vars)
                        loss_vf, loss_per_item_vf = out[
                            "additional_return_0"], out["additional_return_1"]

                        # Have to set all shapes here due to strict loop-var shape requirements.
                        out["loss"].set_shape(())
                        loss_vf.set_shape(())
                        loss_per_item_vf.set_shape((agent.sample_size, ))
                        out["loss_per_item"].set_shape((agent.sample_size, ))

                        with tf.control_dependencies([step_and_sync_op]):
                            if index_ == 0:
                                # Increase the global training step counter.
                                out["loss"] = root._graph_fn_training_step(
                                    out["loss"])
                            return index_ + 1, out["loss"], out[
                                "loss_per_item"], loss_vf, loss_per_item_vf

                    policy_probs = policy.get_log_likelihood(
                        sample_states, sample_actions)["log_likelihood"]
                    baseline_values = value_function.value_output(
                        tf.stop_gradient(sample_states))
                    sample_rewards = tf.cond(
                        pred=apply_postprocessing,
                        true_fn=lambda: gae_function.calc_gae_values(
                            baseline_values, sample_rewards, sample_terminals,
                            sample_sequence_indices),
                        false_fn=lambda: sample_rewards)
                    sample_rewards.set_shape((agent.sample_size, ))
                    entropy = policy.get_entropy(sample_states)["entropy"]

                    loss, loss_per_item, vf_loss, vf_loss_per_item = \
                        loss_function.loss(
                            policy_probs, sample_prior_log_probs,
                            sample_baseline_values, sample_prior_baseline_values, sample_advantages, entropy
                        )

                    if getattr(root, "is_multi_gpu_tower", False) is True:
                        policy_grads_and_vars = optimizer.calculate_gradients(
                            policy.variables(), loss)
                        vf_grads_and_vars = value_function_optimizer.calculate_gradients(
                            value_function.variables(), vf_loss)
                        grads_and_vars_by_component = vars_merger.merge(
                            policy_grads_and_vars, vf_grads_and_vars)
                        return grads_and_vars_by_component, loss, loss_per_item, vf_loss, vf_loss_per_item
                    else:
                        step_op, loss, loss_per_item = optimizer.step(
                            policy.variables(), loss, loss_per_item)
                        loss.set_shape(())
                        loss_per_item.set_shape((agent.sample_size, ))

                        vf_step_op, vf_loss, vf_loss_per_item = value_function_optimizer.step(
                            value_function.variables(), vf_loss,
                            vf_loss_per_item)
                        vf_loss.set_shape(())
                        vf_loss_per_item.set_shape((agent.sample_size, ))

                        with tf.control_dependencies([step_op, vf_step_op]):
                            return index_ + 1, loss, loss_per_item, vf_loss, vf_loss_per_item

                def cond(index_, loss_, loss_per_item_, v_loss_,
                         v_loss_per_item_):
                    return index_ < agent.iterations

                init_loop_vars = [
                    0,
                    tf.zeros(shape=(), dtype=tf.float32),
                    tf.zeros(shape=(agent.sample_size, )),
                    tf.zeros(shape=(), dtype=tf.float32),
                    tf.zeros(shape=(agent.sample_size, ))
                ]

                if getattr(root, "is_multi_gpu_tower", False) is True:
                    return opt_body(*init_loop_vars)
                else:
                    index, loss, loss_per_item, vf_loss, vf_loss_per_item = tf.while_loop(
                        cond=cond,
                        body=opt_body,
                        loop_vars=init_loop_vars,
                        parallel_iterations=1)
                    # Increase the global training step counter.
                    loss = root._graph_fn_training_step(loss)
                    return loss, loss_per_item, vf_loss, vf_loss_per_item

            elif get_backend() == "pytorch":
                if isinstance(prev_log_probs, dict):
                    for name in actions.keys():
                        prev_log_probs[name] = prev_log_probs[name].detach()
                else:
                    prev_log_probs = prev_log_probs.detach()
                batch_size = preprocessed_states.shape[0]
                sample_size = min(batch_size, agent.sample_size)
                prior_baseline_values = value_function.value_output(
                    preprocessed_states).detach()
                if apply_postprocessing:
                    advantages = gae_function.calc_gae_values(
                        prior_baseline_values, rewards, terminals,
                        sequence_indices)
                else:
                    advantages = rewards
                if self.standardize_advantages:
                    advantages = (advantages - torch.mean(advantages)
                                  ) / torch.std(advantages)

                for _ in range(agent.iterations):
                    start = int(torch.rand(1) * (batch_size - 1))
                    indices = torch.arange(start=start,
                                           end=start + sample_size,
                                           dtype=torch.long) % batch_size
                    sample_states = torch.index_select(preprocessed_states, 0,
                                                       indices)

                    if isinstance(actions, dict):
                        sample_actions = DataOpDict()
                        sample_prior_log_probs = DataOpDict()
                        for name, action in define_by_run_flatten(
                                actions,
                                scope_separator_at_start=False).items():
                            sample_actions[name] = torch.index_select(
                                action, 0, indices)
                            sample_prior_log_probs[name] = torch.index_select(
                                prev_log_probs[name], 0, indices)
                    else:
                        sample_actions = torch.index_select(
                            actions, 0, indices)
                        sample_prior_log_probs = torch.index_select(
                            prev_log_probs, 0, indices)

                    sample_advantages = torch.index_select(
                        advantages, 0, indices)
                    sample_prior_baseline_values = torch.index_select(
                        prior_baseline_values, 0, indices)

                    policy_probs = policy.get_log_likelihood(
                        sample_states, sample_actions)["log_likelihood"]
                    sample_baseline_values = value_function.value_output(
                        sample_states)

                    entropy = policy.get_entropy(sample_states)["entropy"]
                    loss, loss_per_item, vf_loss, vf_loss_per_item = loss_function.loss(
                        policy_probs, sample_prior_log_probs,
                        sample_baseline_values, sample_prior_baseline_values,
                        sample_advantages, entropy)

                    # No explicit step op is needed in define-by-run (PyTorch) mode.
                    _, loss, loss_per_item = optimizer.step(
                        policy.variables(), loss, loss_per_item)
                    _, vf_loss, vf_loss_per_item = \
                        value_function_optimizer.step(value_function.variables(), vf_loss, vf_loss_per_item)
                return loss, loss_per_item, vf_loss, vf_loss_per_item
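
Stripped of the framework components, the PyTorch branch above follows a generic pattern: detach the pre-update log-probs and baseline values, then repeatedly draw a contiguous wrap-around index window from the batch and take one optimizer step on it. Below is a simplified sketch of just that loop, with hypothetical policy, optimizer, and compute_loss stand-ins in place of the agent's sub-components; it illustrates the sampling pattern, not the library's implementation.

import torch

def iterative_update(states, actions, advantages, prev_log_probs,
                     policy, optimizer, compute_loss,
                     iterations=10, sample_size=32):
    # Freeze quantities computed before the update, as the graph function does.
    prev_log_probs = prev_log_probs.detach()
    advantages = advantages.detach()
    batch_size = states.shape[0]
    sample_size = min(batch_size, sample_size)

    loss = None
    for _ in range(iterations):
        # Contiguous window with wrap-around, mirroring the start/arange/modulo
        # logic in the excerpt above.
        start = int(torch.rand(1) * (batch_size - 1))
        indices = torch.arange(start, start + sample_size,
                               dtype=torch.long) % batch_size

        s = states.index_select(0, indices)
        a = actions.index_select(0, indices)
        adv = advantages.index_select(0, indices)
        old_lp = prev_log_probs.index_select(0, indices)

        # compute_loss is a hypothetical stand-in, e.g. a PPO clipped-surrogate loss.
        loss = compute_loss(policy, s, a, adv, old_lp)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss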