Example #1
def simulate_policy(args):
    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print(
                'Using the deterministic version of the UNintentional policy '
                '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    # MultiPolicySelector(data['u_policy'], args.un))
                    WeightedMultiPolicySelector(data['policy'], args.un))
            else:
                policy = MakeDeterministic(
                    WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            policy = MakeDeterministic(data['policy'])
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
            else:
                # policy = data['u_policies'][args.un]
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    with open('variant.json') as json_data:
        env_params = json.load(json_data)['env_params']
    env = NormalizedBoxEnv(Navigation2dGoalCompoEnv(**env_params))
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, PyTorchModule):
        policy.train(False)
    while True:
        path = rollout(
            env,
            policy,
            max_path_length=args.H,
            animated=True,
            # deterministic=args.deterministic,
        )
        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])
        logger.dump_tabular()
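
A minimal driver for this example, as a sketch only: the original argument parser is not part of the snippet, so the flag names and defaults below are inferred from the attributes the function reads (args.file, args.deterministic, args.un, args.gpu, args.H) and should be treated as assumptions.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Flag names inferred from the attributes used above; defaults are hypothetical.
    parser.add_argument('file', type=str,
                        help='Path to the snapshot file loaded with joblib.load')
    parser.add_argument('--deterministic', action='store_true',
                        help='Wrap the policy with MakeDeterministic')
    parser.add_argument('--un', type=int, default=-1,
                        help='Index of the UNintentional policy; -1 = intentional')
    parser.add_argument('--gpu', action='store_true')
    parser.add_argument('--H', type=int, default=100,
                        help='Maximum path length per rollout')
    simulate_policy(parser.parse_args())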
Example #2
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print('Using the deterministic version of the UNintentional policy '
                  '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                    # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un)
                    )
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])

    env_params.pop('goal', None)
    env_params['is_render'] = True

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un

    env = NormalizedBoxEnv(
        Pusher2D3DofGoalCompoEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('pusher_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        # plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
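
This example reads a variant.json stored next to the snapshot file (the first example reads it from the working directory). The real file is written by the training script, which is not shown, so the following is only a sketch of the two keys this loader actually accesses ('env_params' and 'path_length'); the env_params values are illustrative placeholders.

import json
import os

variant = {
    'env_params': {
        'is_render': False,   # forced to True by the script before building the env
        'goal': None,         # popped by the script with env_params.pop('goal', None)
    },
    'path_length': 500,
}
run_dir = 'path/to/run_dir'  # hypothetical; normally created by the training logger
with open(os.path.join(run_dir, 'variant.json'), 'w') as f:
    json.dump(variant, f, indent=2)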
Example #3
    def __init__(
            self,
            env,
            policy,
            explo_policy,
            u_qf,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            i_qf=None,
            action_prior='uniform',
            policy_lr=3e-4,
            qf_lr=1e-4,
            i_policy_pre_activation_weight=0.,
            i_policy_mixing_coeff_weight=1e-3,
            u_policy_pre_activation_weight=None,
            policy_weight_decay=0.,
            qf_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            i_soft_target_tau=1e-2,
            u_soft_target_tau=1e-2,
            i_target_update_interval=1,
            u_target_update_interval=1,
            reward_scale=1.,
            u_reward_scales=None,
            min_q_value=-np.inf,
            max_q_value=np.inf,
            residual_gradient_weight=0,
            eval_with_target_policy=False,
            save_replay_buffer=False,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Deterministic Policies
        self._policy = policy
        self._target_policy = policy.copy()

        # Exploration Policy
        self._exploration_policy = explo_policy

        # Evaluation Policy
        if eval_with_target_policy:
            eval_policy = self._target_policy
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=env,
                             explo_policy=self._exploration_policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Number of Unintentional Tasks (Composable Tasks)
        self._n_unintentional = self._policy.n_heads

        # Evaluation Sampler (One for each unintentional)
        self.eval_u_samplers = [
            InPlacePathSampler(
                env=env,
                policy=WeightedMultiPolicySelector(eval_policy, idx),
                total_samples=self.num_steps_per_eval,
                max_path_length=self.max_path_length,
                deterministic=None,
            ) for idx in range(self._n_unintentional)
        ]

        # Important algorithm hyperparameters
        self._action_prior = action_prior

        # Intentional (Main Task) Q-function
        self._i_qf = i_qf
        self._i_target_qf = i_qf.copy()

        # Unintentional (Composable Tasks) Q-functions
        self._u_qf = u_qf
        self._u_target_qf = u_qf.copy()

        self._min_q_value = min_q_value
        self._max_q_value = max_q_value
        self._residual_gradient_weight = residual_gradient_weight

        # Soft-update rates and update intervals for the target networks
        self._i_soft_target_tau = i_soft_target_tau
        self._u_soft_target_tau = u_soft_target_tau
        self._i_target_update_interval = i_target_update_interval
        self._u_target_update_interval = u_target_update_interval

        # Reward Scales
        self.reward_scale = reward_scale
        if u_reward_scales is None:
            # reward_scale is bound to the named parameter above and is never
            # forwarded through **kwargs, so use it directly.
            u_reward_scales = [
                reward_scale for _ in range(self._n_unintentional)
            ]
        self._u_reward_scales = ptu.FloatTensor(u_reward_scales)

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Unsupported optimizer: %s' % optimizer)
        self._qf_lr = qf_lr
        self._policy_lr = policy_lr

        # Q-function and V-function Optimization Criteria
        self._u_qf_criterion = nn.MSELoss()
        self._i_qf_criterion = nn.MSELoss()

        # Q-function(s) optimizers(s)
        self._u_qf_optimizer = optimizer_class(self._u_qf.parameters(),
                                               lr=qf_lr,
                                               weight_decay=qf_weight_decay,
                                               **optimizer_kwargs)
        self._i_qf_optimizer = optimizer_class(self._i_qf.parameters(),
                                               lr=qf_lr,
                                               weight_decay=qf_weight_decay,
                                               **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Policy regularization coefficients (weights)
        self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
        self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

        if u_policy_pre_activation_weight is None:
            u_policy_pre_activation_weight = [
                i_policy_pre_activation_weight
                for _ in range(self._n_unintentional)
            ]
        self._u_policy_pre_activ_weight = \
            ptu.FloatTensor(u_policy_pre_activation_weight)

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Raw Pol Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Pol Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Rewards'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Action'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Mixing Weights'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional,
            self.explo_env.action_dim,
        ))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
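
The constructor keeps target copies of the policy and of both Q-functions and exposes i_soft_target_tau / u_soft_target_tau together with the *_target_update_interval parameters. The training step itself is not part of this snippet; the Polyak-style soft update those parameters conventionally control looks roughly like the sketch below, which is an illustration rather than the repository's exact code.

def soft_update_from_to(source, target, tau):
    # target <- (1 - tau) * target + tau * source, parameter by parameter
    for target_param, source_param in zip(target.parameters(),
                                          source.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - tau) + source_param.data * tau
        )

# e.g. every self._i_target_update_interval gradient steps:
# soft_update_from_to(self._i_qf, self._i_target_qf, self._i_soft_target_tau)
# soft_update_from_to(self._policy, self._target_policy, self._i_soft_target_tau)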
Example #4
def simulate_policy(args):

    np.random.seed(SEED)
    ptu.seed(SEED)

    data = joblib.load(args.file)
    if args.deterministic:
        if args.un > -1:
            print(
                'Using the deterministic version of the UNintentional policy '
                '%02d.' % args.un)
            if 'u_policy' in data:
                policy = MakeDeterministic(
                    MultiPolicySelector(data['u_policy'], args.un))
                # WeightedMultiPolicySelector(data['u_policy'], args.un))
            else:
                # policy = MakeDeterministic(data['u_policies'][args.un])
                if isinstance(data['policy'], TanhGaussianPolicy):
                    policy = MakeDeterministic(data['policy'])
                else:
                    policy = MakeDeterministic(
                        WeightedMultiPolicySelector(data['policy'], args.un))
        else:
            print('Using the deterministic version of the Intentional policy.')
            if isinstance(data['policy'], ExplorationPolicy):
                policy = MakeDeterministic(data['policy'])
            else:
                policy = data['policy']
    else:
        if args.un > -1:
            print('Using the UNintentional stochastic policy %02d' % args.un)
            if 'u_policy' in data:
                # policy = MultiPolicySelector(data['u_policy'], args.un)
                policy = WeightedMultiPolicySelector(data['u_policy'], args.un)
            else:
                policy = WeightedMultiPolicySelector(data['policy'], args.un)
                # policy = data['policy'][args.un]
        else:
            print('Using the Intentional stochastic policy.')
            # policy = data['exploration_policy']
            policy = data['policy']

    print("Policy loaded!!")

    # Load environment
    dirname = os.path.dirname(args.file)
    with open(os.path.join(dirname, 'variant.json')) as json_data:
        log_data = json.load(json_data)
        env_params = log_data['env_params']
        H = int(log_data['path_length'])
    env_params['is_render'] = True

    if 'obs_mean' in data.keys():
        obs_mean = data['obs_mean']
        print('OBS_MEAN')
        print(repr(obs_mean))
    else:
        obs_mean = None
        # obs_mean = np.array([ 0.07010766,  0.37585765,  0.21402615,  0.24426296,  0.5789634 ,
        #                       0.88510203,  1.6878743 ,  0.02656335,  0.03794186, -1.0241051 ,
        #                       -0.5226027 ,  0.6198239 ,  0.49062446,  0.01197532,  0.7888951 ,
        #                       -0.4857273 ,  0.69160587, -0.00617676,  0.08966777, -0.14694819,
        #                       0.9559917 ,  1.0450271 , -0.40958315,  0.86435956,  0.00609685,
        #                       -0.01115279, -0.21607827,  0.9762933 ,  0.80748135, -0.48661205,
        #                       0.7473679 ,  0.01649722,  0.15451911, -0.17285274,  0.89978695])

    if 'obs_var' in data.keys():
        obs_var = data['obs_var']
        print('OBS_VAR')
        print(repr(obs_var))
    else:
        obs_var = None
        # obs_var = np.array([0.10795759, 0.12807205, 0.9586606 , 0.46407   , 0.8994803 ,
        #                     0.35167143, 0.30286264, 0.34667444, 0.35105848, 1.9919134 ,
        #                     0.9462659 , 2.245269  , 0.84190637, 1.5407104 , 0.1       ,
        #                     0.10330457, 0.1       , 0.1       , 0.1       , 0.1528581 ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.1       ,
        #                     0.1       , 0.1       , 0.1       , 0.1       , 0.12320185,
        #                     0.1       , 0.18369523, 0.200373  , 0.11895574, 0.15118493])
    print(env_params)

    if args.subtask and args.un != -1:
        env_params['subtask'] = args.un
    # else:
    #     env_params['subtask'] = None

    env = NormalizedBoxEnv(
        CentauroTrayEnv(**env_params),
        # normalize_obs=True,
        normalize_obs=False,
        online_normalization=False,
        obs_mean=None,
        obs_var=None,
        obs_alpha=0.001,
    )
    print("Environment loaded!!")

    if args.gpu:
        set_gpu_mode(True)
        policy.cuda()
    if isinstance(policy, MakeDeterministic):
        if isinstance(policy.stochastic_policy, PyTorchModule):
            policy.stochastic_policy.train(False)
    else:
        if isinstance(policy, PyTorchModule):
            policy.train(False)

    while True:
        if args.record:
            rollout_start_fcn = lambda: \
                env.start_recording_video('centauro_video.mp4')
            rollout_end_fcn = lambda: \
                env.stop_recording_video()
        else:
            rollout_start_fcn = None
            rollout_end_fcn = None

        obs_normalizer = data.get('obs_normalizer')

        if args.H != -1:
            H = args.H

        path = rollout(
            env,
            policy,
            max_path_length=H,
            animated=True,
            obs_normalizer=obs_normalizer,
            rollout_start_fcn=rollout_start_fcn,
            rollout_end_fcn=rollout_end_fcn,
        )
        plot_rollout_reward(path)

        if hasattr(env, "log_diagnostics"):
            env.log_diagnostics([path])

        logger.dump_tabular()

        if args.record:
            break
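
Note that the snapshot's obs_mean / obs_var are only printed here; the environment is built with normalize_obs=False and obs_mean=None, obs_var=None. If you wanted the evaluation environment to reuse the stored statistics, a variation of the call above along these lines could work; it assumes NormalizedBoxEnv applies the given mean/variance when normalize_obs=True, which is not shown in this snippet.

env = NormalizedBoxEnv(
    CentauroTrayEnv(**env_params),
    normalize_obs=(obs_mean is not None and obs_var is not None),
    online_normalization=False,
    obs_mean=obs_mean,
    obs_var=obs_var,
    obs_alpha=0.001,
)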
Example #5
    def __init__(
            self,
            env,
            policy,
            u_qf1,
            replay_buffer,
            batch_size=1024,
            normalize_obs=False,
            eval_env=None,
            i_qf1=None,
            u_qf2=None,
            i_qf2=None,
            i_vf=None,
            u_vf=None,
            action_prior='uniform',
            i_entropy_scale=1.,
            u_entropy_scale=None,
            auto_alpha=True,
            i_tgt_entro=None,
            u_tgt_entros=None,
            policy_lr=3e-4,
            qf_lr=3e-4,
            i_policy_mean_regu_weight=1e-3,
            i_policy_std_regu_weight=1e-3,
            i_policy_pre_activation_weight=0.,
            i_policy_mixing_coeff_weight=1e-3,
            u_policy_mean_regu_weight=None,
            u_policy_std_regu_weight=None,
            u_policy_pre_activation_weight=None,
            policy_weight_decay=0.,
            q_weight_decay=0.,
            optimizer='adam',
            # optimizer='rmsprop',
            # optimizer='sgd',
            optimizer_kwargs=None,
            i_soft_target_tau=5e-3,
            u_soft_target_tau=5e-3,
            i_target_update_interval=1,
            u_target_update_interval=1,
            reward_scale=1.,
            u_reward_scales=None,
            save_replay_buffer=False,
            eval_deterministic=True,
            log_tensorboard=False,
            **kwargs):

        # ###### #
        # Models #
        # ###### #

        # Exploration Policy
        self._policy = policy

        # Evaluation Policy
        if eval_deterministic:
            eval_policy = MakeDeterministic(self._policy)
        else:
            eval_policy = self._policy

        # Observation Normalizer
        if normalize_obs:
            self._obs_normalizer = RunningNormalizer(shape=env.obs_dim)
        else:
            self._obs_normalizer = None

        RLAlgorithm.__init__(self,
                             explo_env=env,
                             explo_policy=self._policy,
                             eval_env=eval_env,
                             eval_policy=eval_policy,
                             obs_normalizer=self._obs_normalizer,
                             **kwargs)

        # Number of Unintentional Tasks (Composable Tasks)
        self._n_unintentional = self._policy.n_heads

        # Evaluation Sampler (One for each unintentional)
        self.eval_u_samplers = [
            InPlacePathSampler(
                env=env,
                policy=WeightedMultiPolicySelector(self._policy, idx),
                total_samples=self.num_steps_per_eval,
                max_path_length=self.max_path_length,
                deterministic=True,
            ) for idx in range(self._n_unintentional)
        ]

        # Intentional (Main Task) Q-functions
        self._i_qf1 = i_qf1
        self._i_qf2 = i_qf2
        if i_vf is None:
            self._i_vf = None
            self._i_target_vf = None
            self._i_target_qf1 = self._i_qf1.copy()
            self._i_target_qf2 = \
                None if self._i_qf2 is None else self._i_qf2.copy()
        else:
            self._i_vf = i_vf
            self._i_target_vf = self._i_vf.copy()
            self._i_target_qf1 = None
            self._i_target_qf2 = None

        # Unintentional (Composable Tasks) Q-functions
        self._u_qf1 = u_qf1
        self._u_qf2 = u_qf2
        if u_vf is None:
            self._u_vf = None
            self._u_target_vf = None
            self._u_target_qf1 = self._u_qf1.copy()
            self._u_target_qf2 = \
                None if self._u_qf2 is None else self._u_qf2.copy()
        else:
            self._u_vf = u_vf
            self._u_target_vf = self._u_vf.copy()
            self._u_target_qf1 = None
            self._u_target_qf2 = None

        # Replay Buffer
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.save_replay_buffer = save_replay_buffer

        # Soft-update rates and update intervals for the target networks
        self._i_soft_target_tau = i_soft_target_tau
        self._u_soft_target_tau = u_soft_target_tau
        self._i_target_update_interval = i_target_update_interval
        self._u_target_update_interval = u_target_update_interval

        # Important algorithm hyperparameters
        self._action_prior = action_prior
        self._i_entropy_scale = i_entropy_scale
        if u_entropy_scale is None:
            u_entropy_scale = [
                i_entropy_scale for _ in range(self._n_unintentional)
            ]
        self._u_entropy_scale = torch.tensor(u_entropy_scale,
                                             dtype=torch.float32,
                                             device=ptu.device)

        # Desired Alphas
        self._auto_alphas = auto_alpha
        if i_tgt_entro is None:
            i_tgt_entro = -env.action_dim
        self._i_tgt_entro = torch.tensor([i_tgt_entro],
                                         dtype=torch.float32,
                                         device=ptu.device)
        if u_tgt_entros is None:
            u_tgt_entros = [i_tgt_entro for _ in range(self._n_unintentional)]
        self._u_tgt_entros = torch.tensor(u_tgt_entros,
                                          dtype=torch.float32,
                                          device=ptu.device)
        self._u_log_alphas = torch.zeros(self._n_unintentional,
                                         device=ptu.device,
                                         requires_grad=True)
        self._i_log_alpha = torch.zeros(1,
                                        device=ptu.device,
                                        requires_grad=True)

        # Reward Scales
        self.reward_scale = reward_scale
        if u_reward_scales is None:
            # reward_scale is bound to the named parameter above and is never
            # forwarded through **kwargs, so use it directly.
            u_reward_scales = [
                reward_scale for _ in range(self._n_unintentional)
            ]
        self._u_reward_scales = torch.tensor(u_reward_scales,
                                             dtype=torch.float32,
                                             device=ptu.device)

        # ########## #
        # Optimizers #
        # ########## #
        if optimizer.lower() == 'adam':
            optimizer_class = optim.Adam
            if optimizer_kwargs is None:
                optimizer_kwargs = dict(amsgrad=True,
                                        # amsgrad=False,
                                        )
        elif optimizer.lower() == 'rmsprop':
            optimizer_class = optim.RMSprop
            if optimizer_kwargs is None:
                optimizer_kwargs = dict()
        else:
            raise ValueError('Unsupported optimizer: %s' % optimizer)

        # Values optimizer
        vals_params_list = [self._u_qf1.parameters(), self._i_qf1.parameters()]
        if self._u_qf2 is not None:
            vals_params_list.append(self._u_qf2.parameters())
        if self._i_qf2 is not None:
            vals_params_list.append(self._i_qf2.parameters())
        if self._u_vf is not None:
            vals_params_list.append(self._u_vf.parameters())
        if self._i_vf is not None:
            vals_params_list.append(self._i_vf.parameters())
        vals_params = chain(*vals_params_list)

        self._values_optimizer = optimizer_class(vals_params,
                                                 lr=qf_lr,
                                                 weight_decay=q_weight_decay,
                                                 **optimizer_kwargs)

        # Policy optimizer
        self._policy_optimizer = optimizer_class(
            self._policy.parameters(),
            lr=policy_lr,
            weight_decay=policy_weight_decay,
            **optimizer_kwargs)

        # Alpha optimizers
        self._alphas_optimizer = optimizer_class(
            [self._u_log_alphas, self._i_log_alpha],
            lr=policy_lr,
            **optimizer_kwargs)

        # Weights for policy regularization coefficients
        self._i_pol_mean_regu_weight = i_policy_mean_regu_weight
        self._i_pol_std_regu_weight = i_policy_std_regu_weight
        self._i_pol_pre_activ_weight = i_policy_pre_activation_weight
        self._i_pol_mixing_coeff_weight = i_policy_mixing_coeff_weight

        if u_policy_mean_regu_weight is None:
            u_policy_mean_regu_weight = [
                i_policy_mean_regu_weight for _ in range(self._n_unintentional)
            ]
        self._u_policy_mean_regu_weight = \
            torch.tensor(u_policy_mean_regu_weight, dtype=torch.float32,
                         device=ptu.device)
        if u_policy_std_regu_weight is None:
            u_policy_std_regu_weight = [
                i_policy_std_regu_weight for _ in range(self._n_unintentional)
            ]
        self._u_policy_std_regu_weight = \
            torch.tensor(u_policy_std_regu_weight, dtype=torch.float32,
                         device=ptu.device)
        if u_policy_pre_activation_weight is None:
            u_policy_pre_activation_weight = [
                i_policy_pre_activation_weight
                for _ in range(self._n_unintentional)
            ]
        self._u_policy_pre_activ_weight = \
            torch.tensor(u_policy_pre_activation_weight, dtype=torch.float32,
                         device=ptu.device)

        # Useful Variables for logging
        self.log_data = dict()
        self.log_data['Pol KL Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Qf2 Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Vf Loss'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Rewards'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Entropy'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))
        self.log_data['Policy Mean'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Pol Log Std'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
            self.explo_env.action_dim,
        ))
        self.log_data['Mixing Weights'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional,
            self.explo_env.action_dim,
        ))
        self.log_data['Alphas'] = np.zeros((
            self.num_train_steps_per_epoch,
            self._n_unintentional + 1,
        ))

        # Tensorboard-like Logging
        self._log_tensorboard = log_tensorboard
        if log_tensorboard:
            self._summary_writer = \
                tensorboardX.SummaryWriter(log_dir=logger.get_snapshot_dir())
        else:
            self._summary_writer = None
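
The log-alpha tensors, the target entropies and the _alphas_optimizer set up above are the usual ingredients of SAC-style automatic temperature tuning. The update rule lives in the training step, which is not part of this snippet; the standard form it would take is sketched below, where new_log_pi is assumed to be the log-probability of actions freshly sampled from the policy on the current batch.

# Intentional (main task) temperature update; the unintentional heads would use
# self._u_log_alphas and self._u_tgt_entros analogously.
alpha_loss = -(self._i_log_alpha *
               (new_log_pi + self._i_tgt_entro).detach()).mean()
self._alphas_optimizer.zero_grad()
alpha_loss.backward()
self._alphas_optimizer.step()
alpha = self._i_log_alpha.exp() * self._i_entropy_scale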