Example #1
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config

        self.env = env_selector(
            self.hparams
        )  # TODO: normalization is not required but will it be needed?
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim  # includes the skill if the env is option-wrapped
        self.qf = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        # Constructs a value-function MLP with ReLU hidden non-linearities, no output
        # non-linearity, Xavier init for the weights and zero init for the biases.
        self.vf = ValueFunction(self.Do,
                                [config.layer_size, config.layer_size])
        self.vf_target = ValueFunction(self.Do,
                                       [config.layer_size, config.layer_size])
        self.vf_target.load_state_dict(self.vf.state_dict())

        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            qf=self.qf,
            reg=config.reg,
            device=self.hparams.device
        )  # GMM policy with K mixtures, no reparametrization trick, regularization
        self.modules = [
            "Policy", self.policy, "QF", self.qf, "VF", self.vf, "VF_Target",
            self.vf_target
        ]

        # TODO: add assertion to test qf of policy and qf of model.

        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with leading underscores
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        # Runs on CPU here: models are transferred to the GPU only by the trainer, which
        # happens after the LightningModule __init__, so initial sampling was moved to
        # on_train_start (this avoids a bug in DIAYN and lets the GPU be used without any
        # device logic). It is also the reason the wandb logger is not yet available here.
        # TODO: remove the device logic in Policy
        self.batch_idx = None
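
# The ValueFunction comment above describes an MLP with ReLU hidden layers, a linear
# output, Xavier-initialized weights and zero-initialized biases. A minimal sketch of
# such a module (hypothetical; not the repository's actual ValueFunction class):
import torch.nn as nn

class MLPValueFunction(nn.Module):
    def __init__(self, input_dim: int, hidden_sizes: list) -> None:
        super().__init__()
        layers, in_dim = [], input_dim
        for h in hidden_sizes:
            layers += [nn.Linear(in_dim, h), nn.ReLU()]
            in_dim = h
        layers.append(nn.Linear(in_dim, 1))  # scalar value, no output non-linearity
        self.net = nn.Sequential(*layers)
        for m in self.net:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.net(x)
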
    def __init__(self, config: Config) -> None:

        super().__init__(config)
        self.z = 0
        self._num_skills = self.hparams.num_skills
        self.env.set_reward_fn(HERF())
        self.eval_env.set_reward_fn(HERF())
        self.env.reset(state=None, skill=self.z)
        self.eval_env.reset(state=None, skill=self.z)
        self.batch_return = 0
        self.single_skill = config.single_skill
        self.double_skill = config.double_skill
        self.skilldata_val = [[] for i in range(self._num_skills)]
        if self.single_skill is None:
            self.batch_env = [
                env_selector(self.hparams, config.seed + 1)
                for i in range(self._num_skills)
            ]
            for i in range(self._num_skills):
                self.batch_env[i].set_reward_fn(HERF())
                self.batch_env[i].reset(skill=i)
        # TODO: HERF only supports up to 25 skills; it uses modulo beyond that.
        self.discriminator = Discriminator(
            self.Do - self._num_skills, [config.layer_size, config.layer_size],
            self._num_skills)
        self.distiller = [
            Discriminator(
                self.Do - self._num_skills,
                [self.hparams.disc_size[i], self.hparams.disc_size[i]],
                self._num_skills) for i in range(len(self.hparams.disc_size))
        ]

        self.sampler.reset()
        self._p_z = torch.FloatTensor(
            np.full(self._num_skills, 1.0 / self._num_skills))
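
# self._p_z above is a uniform categorical prior over skills. A minimal sketch of how a
# skill could be drawn from it and appended to an observation as a one-hot vector (an
# assumption based on the DIAYN setup; helper names are illustrative):
import numpy as np
import torch

def sample_skill(p_z: torch.Tensor) -> int:
    # z ~ Categorical(p_z); with a uniform prior this matches np.random.randint(num_skills).
    return int(torch.multinomial(p_z, num_samples=1).item())

def augment_observation(obs: np.ndarray, z: int, num_skills: int) -> np.ndarray:
    one_hot = np.zeros(num_skills, dtype=obs.dtype)
    one_hot[z] = 1.0
    return np.concatenate([obs, one_hot])
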
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config

        self.env = env_selector(self.hparams)
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim
        self.q1 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.q2 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        # Constructs a value-function MLP with ReLU hidden non-linearities, no output
        # non-linearity, Xavier init for the weights and zero init for the biases.
        self.q1_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])
        self.q2_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])
        self.stage = None

        self.pool_train = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.pool_val = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            # TODO: pass both Q-functions so the policy can also be used in deterministic mode
            qf=self.q1_target,
            reg=config.reg,
            device=self.hparams.device,
            reparametrization=True
        )  # GMM policy with K mixtures, reparametrization trick enabled, regularization

        # TODO: add assertion to test qf of policy and qf of model.

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with leading underscores
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        self.modules = [
            "Policy", self.policy, "Q1", self.q1, "Q2", self.q2, "Q1_target",
            self.q1_target, "Q2_target", self.q2_target
        ]
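
# self._tau above parameterizes the soft (Polyak) update of the target networks. A
# minimal sketch of that update, assuming target <- tau * source + (1 - tau) * target:
import torch

@torch.no_grad()
def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float) -> None:
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.mul_(1.0 - tau).add_(s_param, alpha=tau)
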
Example #4
    def __init__(self, config: Config) -> None:
        self.hparams = config
        self.env = env_selector(self.hparams)  # TODO: ensure normalization is not required
        self.eval_env = env_selector(self.hparams, config.seed + 1)  # TODO: add functionality to optionwrap for DIAYN
        # TODO: check all config.names to ensure they are in dict
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim
        self.qf = ValueFunction(self.Do + self.Da, [config.layer_size, config.layer_size])
        # Constructs a value-function MLP with ReLU hidden non-linearities, no output
        # non-linearity, Xavier init for the weights and zero init for the biases.
        self.vf = ValueFunction(self.Do, [config.layer_size, config.layer_size])
        self.vf_target = ValueFunction(self.Do, [config.layer_size, config.layer_size])
        self.vf_target.load_state_dict(self.vf.state_dict())

        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            qf=self.qf,
            reg=config.reg,
            device="cpu"
        )  # GMM policy with K mixtures, no reparametrization trick, regularization

        # self.policy.cuda(config.device)
        # self.vf.cuda(config.device)
        # self.qf.cuda(config.device)
        # self.vf_target.cuda(config.device)

        # TODO: add assertion to test qf of policy and qf of model.

        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with leading underscores
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
        # self.env.reset(None,self.z)

        # Runs on CPU: models are transferred to the GPU only by the trainer, which happens
        # after the LightningModule __init__; this is also why the wandb logger is not yet
        # available here.
        self.pool.add_samples(self.sampler.sample(config.min_pool_size, self.policy))
        # self.optimizers = []
        # TODO: when combining vf and policy, figure out a more elegant way to keep their
        # learning rates unlinked than a multiplicative factor in the loss sum. Also figure
        # out why keeping them in separate optimizers doesn't increase compute time by the
        # expected amount.
        self.optimizer_policy = optim.Adam(
            list(self.policy.parameters()),  # + list(self.vf.parameters())
            lr=self._policy_lr)
        self.optimizer_vf = optim.Adam(self.vf.parameters(), lr=self._vf_lr)
        self.optimizer_qf = optim.Adam(self.qf.parameters(), lr=self._qf_lr)
        self.optimizer = optim.Adam(
            list(self.policy.parameters()) +
            list(self.vf.parameters()) +
            list(self.qf.parameters()),
            lr=self._policy_lr)
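
# The TODO above mentions emulating unlinked learning rates through a multiplicative
# factor in the loss sum. A sketch of an alternative that keeps a single optimizer but
# genuinely separate rates via torch.optim parameter groups (function name is illustrative):
import torch.optim as optim

def build_combined_optimizer(policy, vf, qf, policy_lr, vf_lr, qf_lr):
    return optim.Adam([
        {"params": policy.parameters(), "lr": policy_lr},
        {"params": vf.parameters(), "lr": vf_lr},
        {"params": qf.parameters(), "lr": qf_lr},
    ])
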
Example #5
    def __init__(self, config: Config) -> None:
        super().__init__()
        self.hparams = config

        self.env = env_selector(
            self.hparams
        )  # TODO: normalization is not required but will it be needed?
        self.eval_env = env_selector(self.hparams, config.seed + 1)
        # TODO: check all config.names to ensure they are in dict
        self.Da = self.env.action_space.flat_dim
        self.Do = self.env.observation_space.flat_dim
        self.q1 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        self.q2 = ValueFunction(self.Do + self.Da,
                                [config.layer_size, config.layer_size])
        # Constructs a value-function MLP with ReLU hidden non-linearities, no output
        # non-linearity, Xavier init for the weights and zero init for the biases.
        self.q1_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])
        self.q2_target = ValueFunction(self.Do + self.Da,
                                       [config.layer_size, config.layer_size])

        self.q1_target.load_state_dict(self.q1.state_dict())
        self.q2_target.load_state_dict(self.q2.state_dict())

        self.pool = SimpleReplayBuffer(
            env_spec=self.env.spec,
            max_replay_buffer_size=config.max_pool_size,
        )  # create a replay buffer for state+skill and action.

        self.policy = GMMPolicy(
            env_spec=self.env.spec,
            K=config.K,
            hidden_layer_sizes=[config.layer_size, config.layer_size],
            # TODO: pass both Q-functions so the policy can also be used in deterministic mode
            qf=self.q1_target,
            reg=config.reg,
            device=self.hparams.device,
            reparametrization=True
        )  # GMM policy with K mixtures, reparametrization trick enabled, regularization

        # TODO: add assertion to test qf of policy and qf of model.

        self.sampler = Sampler(self.env, config.max_path_length)

        self._policy_lr = config.lr
        self._qf_lr = config.lr
        self._vf_lr = config.lr
        # TODO: fix variable naming with leading underscores
        self._scale_reward = config.scale_reward
        self._discount = config.discount
        self._tau = config.tau
        self.max_path_return = -np.inf
        self.last_path_return = 0
        self.val_path_return = 0
        self._scale_entropy = config.scale_entropy

        self._save_full_state = config.save_full_state
        self.modules = [
            "Policy", self.policy, "Q1", self.q1, "Q2", self.q2, "Q1_target",
            self.q1_target, "Q2_target", self.q2_target
        ]
        # self.z = self.get_best_skill(self.policy, self.env, self.config.num_skills, self.config.max_path_length)
        # self.env.reset(None,self.z)

        # Runs on CPU: models are transferred to the GPU only by the trainer, which happens
        # after the LightningModule __init__; this is also why the wandb logger is not yet
        # available here.
        self.batch_idx = None
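
    # The comment above notes that the module is still on CPU during __init__ and is moved
    # to the GPU by the trainer afterwards, which is why initial sampling happens later. A
    # minimal sketch of such a hook (assuming the Sampler/pool interfaces shown above; not
    # the repository's actual implementation):
    def on_train_start(self) -> None:
        # By now the trainer has moved the policy to its target device, so sampling with
        # it needs no CPU-only device logic.
        self.pool.add_samples(
            self.sampler.sample(self.hparams.min_pool_size, self.policy))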