Example #1
    def create_model(self) -> None:
        """
        Initialize the model.
        Initializes the optimizer and replay buffers as well.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)

        self.q1 = (get_model("v", self.network_type)(
            state_dim, action_dim, "Qsa", self.layers).to(self.device).float())

        self.q2 = (get_model("v", self.network_type)(
            state_dim, action_dim, "Qsa", self.layers).to(self.device).float())

        self.policy = (get_model(
            "p", self.network_type)(state_dim,
                                    action_dim,
                                    self.layers,
                                    discrete,
                                    False,
                                    sac=True).to(self.device).float())

        self.q1_targ = deepcopy(self.q1).to(self.device).float()
        self.q2_targ = deepcopy(self.q2).to(self.device).float()

        # freeze target parameters
        for param in self.q1_targ.parameters():
            param.requires_grad = False
        for param in self.q2_targ.parameters():
            param.requires_grad = False

        # optimizers
        self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
        self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
        self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)

        # set action scales
        if self.env.action_space is None:
            self.action_scale = torch.tensor(1.0).to(self.device)
            self.action_bias = torch.tensor(0.0).to(self.device)
        else:
            self.action_scale = torch.FloatTensor(
                (self.env.action_space.high - self.env.action_space.low) /
                2.0).to(self.device)
            self.action_bias = torch.FloatTensor(
                (self.env.action_space.high + self.env.action_space.low) /
                2.0).to(self.device)
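The action_scale and action_bias tensors computed at the end map the policy's tanh-squashed output from (-1, 1) onto the environment's action bounds. A minimal sketch of how they are typically applied when sampling (the function name here is hypothetical, not part of the snippet):

import torch

def rescale_action(raw_action: torch.Tensor,
                   action_scale: torch.Tensor,
                   action_bias: torch.Tensor) -> torch.Tensor:
    """Map a tanh-squashed action in (-1, 1) to the environment's [low, high] range."""
    squashed = torch.tanh(raw_action)              # in (-1, 1)
    return action_scale * squashed + action_bias   # in [low, high]

With scale = (high - low) / 2 and bias = (high + low) / 2, a squashed output of -1 lands on low and +1 lands on high.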
Example #2
    def create_model(self):
        # Instantiate networks and optimizers
        (state_dim, action_dim, disc,
         action_lim) = self.get_env_properties(self.env)

        self.policy_new, self.policy_old = (
            get_model("p", self.network_type)(state_dim,
                                              action_dim,
                                              self.layers,
                                              disc=disc,
                                              action_lim=action_lim),
            get_model("p", self.network_type)(state_dim,
                                              action_dim,
                                              self.layers,
                                              disc=disc,
                                              action_lim=action_lim),
        )
        self.policy_new = self.policy_new.to(self.device)
        self.policy_old = self.policy_old.to(self.device)

        self.value_fn = get_model("v", self.network_type)(
            state_dim, action_dim).to(self.device)

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.policy_new.load_state_dict(self.checkpoint["policy_weights"])
            self.value_fn.load_state_dict(self.checkpoint["value_weights"])
            for key, item in self.checkpoint.items():
                if key not in [
                        "policy_weights", "value_weights", "save_model"
                ]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.policy_old.load_state_dict(self.policy_new.state_dict())

        self.optimizer_policy = opt.Adam(self.policy_new.parameters(),
                                         lr=self.lr_policy)
        self.optimizer_value = opt.Adam(self.value_fn.parameters(),
                                        lr=self.lr_value)

        self.traj_reward = []
        self.policy_old.policy_hist = Variable(torch.Tensor()).to(self.device)
        self.policy_new.policy_hist = Variable(torch.Tensor()).to(self.device)
        self.value_fn.value_hist = Variable(torch.Tensor()).to(self.device)

        self.policy_new.loss_hist = Variable(torch.Tensor()).to(self.device)
        self.value_fn.loss_hist = Variable(torch.Tensor()).to(self.device)
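The frozen policy_old copy exists so the update step can form the importance ratio between the current policy and the snapshot; the hard sync via load_state_dict above refreshes that snapshot. A minimal sketch of the ratio, assuming both networks can produce per-sample log-probabilities (these inputs are assumptions, not taken from the snippet):

import torch

def importance_ratio(new_log_probs: torch.Tensor,
                     old_log_probs: torch.Tensor) -> torch.Tensor:
    """pi_new(a|s) / pi_old(a|s), computed in log space for numerical stability."""
    # The snapshot's log-probs are detached so no gradient flows into policy_old.
    return torch.exp(new_log_probs - old_log_probs.detach())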
Example #3
    def create_model(self):
        state_dim = self.env.observation_space.shape[0]
        action_dim = self.env.action_space.shape[0]
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.ac.load_state_dict(self.checkpoint["weights"])
            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_p)
        self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)
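The target network's parameters are frozen because it is updated by a Polyak (soft) average of the online network rather than by backpropagation. A sketch of the usual soft-update step, with tau as an assumed hyperparameter not defined in the snippet:

import torch

def soft_update(net: torch.nn.Module,
                target_net: torch.nn.Module,
                tau: float = 0.005) -> None:
    """target <- tau * online + (1 - tau) * target, without tracking gradients."""
    with torch.no_grad():
        for param, target_param in zip(net.parameters(), target_net.parameters()):
            target_param.mul_(1.0 - tau)
            target_param.add_(tau * param)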
Example #4
    def create_model(self) -> None:
        """
        Initialize the model.
        Initializes the optimizer and replay buffers as well.
        """
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)
        if discrete:
            raise Exception(
                "Discrete Environments not supported for {}.".format(
                    __class__.__name__))
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_p)
        self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)
Example #5
    def create_model(self) -> None:
        """
        Creates the actor-critic model and initialises the optimizers
        """
        input_dim, action_dim, discrete, action_lim = get_env_properties(
            self.env, self.network_type)

        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(input_dim,
                                                     action_dim,
                                                     self.layers,
                                                     "V",
                                                     discrete,
                                                     action_lim=action_lim).to(
                                                         self.device)

        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_policy)
        self.optimizer_value = opt.Adam(self.ac.critic.parameters(),
                                        lr=self.lr_value)

        self.rollout = RolloutBuffer(self.rollout_size, self.env)
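Unlike the replay buffers in the off-policy examples, the RolloutBuffer here stores a fixed window of transitions gathered by the current policy and discards them after each update. An illustrative stand-in for that idea (class, field, and method names are hypothetical, not the library's RolloutBuffer API):

class MiniRollout:
    """Toy on-policy storage: fill, consume once, then reset."""

    def __init__(self, capacity: int):
        self.capacity = capacity
        self.reset()

    def add(self, state, action, reward, done) -> None:
        # Keep at most `capacity` transitions from the current policy.
        if len(self.states) < self.capacity:
            self.states.append(state)
            self.actions.append(action)
            self.rewards.append(reward)
            self.dones.append(done)

    def reset(self) -> None:
        # On-policy data is thrown away once the update has consumed it.
        self.states, self.actions, self.rewards, self.dones = [], [], [], []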
Example #6
    def create_model(self):
        state_dim, action_dim, disc = self.get_env_properties()
        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim, action_dim,
                                                     self.layers, "Qsa",
                                                     False).to(self.device)

        self.ac.qf1 = self.ac.critic
        self.ac.qf2 = get_model("v", self.network_type)(state_dim,
                                                        action_dim,
                                                        hidden=self.layers,
                                                        val_type="Qsa")

        self.ac.qf1.to(self.device)
        self.ac.qf2.to(self.device)

        if self.pretrained is not None:
            self.load(self)
            self.ac.actor.load_state_dict(self.checkpoint["policy_weights"])
            self.ac.qf1.load_state_dict(self.checkpoint["q1_weights"])
            self.ac.qf2.load_state_dict(self.checkpoint["q2_weights"])

            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size)
        self.q_params = (list(self.ac.qf1.parameters()) +
                         list(self.ac.qf2.parameters()))
        self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)

        self.optimizer_policy = torch.optim.Adam(self.ac.actor.parameters(),
                                                 lr=self.lr_p)
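Keeping two critics (qf1 and qf2) enables the clipped double-Q target: the Bellman backup uses the minimum of the two target critics to curb overestimation. A sketch of that target, assuming the target Q-values for the next states have already been computed (gamma is an assumed discount factor):

import torch

def clipped_double_q_target(next_q1: torch.Tensor,
                            next_q2: torch.Tensor,
                            rewards: torch.Tensor,
                            dones: torch.Tensor,
                            gamma: float = 0.99) -> torch.Tensor:
    """Bellman target built from the pessimistic (minimum) of two target critics."""
    next_q = torch.min(next_q1, next_q2)
    return rewards + gamma * (1.0 - dones) * next_q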
Example #7
    def create_model(self) -> None:
        """
        Initialize the model and target model for various variants of DQN.
        Initializes the optimizer and replay buffers as well.
        """
        state_dim, action_dim, _, _ = get_env_properties(self.env)
        if self.network_type == "mlp":
            if self.dueling_dqn:
                self.model = DuelingDQNValueMlp(state_dim, action_dim)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValue(state_dim, action_dim,
                                                 self.num_atoms)
            elif self.noisy_dqn:
                self.model = NoisyDQNValue(state_dim, action_dim)
            else:
                self.model = get_model("v",
                                       self.network_type)(state_dim,
                                                          action_dim, "Qs")

        elif self.network_type == "cnn":
            self.framestack = self.env.framestack

            if self.dueling_dqn:
                self.model = DuelingDQNValueCNN(action_dim, self.framestack)
            elif self.noisy_dqn:
                self.model = NoisyDQNValueCNN(action_dim, self.framestack)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValueCNN(action_dim, self.num_atoms,
                                                    self.framestack)
            else:
                self.model = get_model("v", self.network_type)(action_dim,
                                                               self.framestack,
                                                               "Qs")

        self.target_model = deepcopy(self.model)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedBuffer(
                self.replay_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.replay_size, self.env)

        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)
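The deep-copied target_model is the network the TD target is evaluated against. A sketch of the standard DQN target it serves, assuming the value network maps a batch of states to per-action Q-values and gamma is an assumed discount factor:

import torch

@torch.no_grad()
def dqn_target(target_model: torch.nn.Module,
               next_states: torch.Tensor,
               rewards: torch.Tensor,
               dones: torch.Tensor,
               gamma: float = 0.99) -> torch.Tensor:
    """r + gamma * max_a Q_target(s', a), masked at episode ends."""
    next_q = target_model(next_states).max(dim=1).values
    return rewards + gamma * (1.0 - dones) * next_q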
Example #8
    def create_model(self):
        '''
        Initialize the actor and critic networks.
        '''
        state_dim = self.env.observation_space.shape[0]

        action_lim = None
        if isinstance(self.env.action_space, gym.spaces.Discrete):
            action_dim = self.env.action_space.n
            discrete = True
        elif isinstance(self.env.action_space, gym.spaces.Box):
            action_dim = self.env.action_space.shape[0]
            action_lim = self.env.action_space.high[0]
            discrete = False
        else:
            raise NotImplementedError

        # Instantiate networks and optimizers
        self.ac = get_model("ac", self.network_type)(state_dim,
                                                     action_dim,
                                                     self.layers,
                                                     "V",
                                                     discrete,
                                                     action_lim=action_lim).to(
                                                         self.device)

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.ac.actor.load_state_dict(self.checkpoint["policy_weights"])
            self.ac.critic.load_state_dict(self.checkpoint["value_weights"])

            for key, item in self.checkpoint.items():
                if key not in [
                        "policy_weights", "value_weights", "save_model"
                ]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_policy)
        self.optimizer_value = opt.Adam(self.ac.critic.parameters(),
                                        lr=self.lr_value)

        self.policy_hist = Variable(torch.Tensor()).to(self.device)
        self.value_hist = Variable(torch.Tensor()).to(self.device)
        self.traj_reward = []
        self.policy_loss_hist = Variable(torch.Tensor()).to(self.device)
        self.value_loss_hist = Variable(torch.Tensor()).to(self.device)
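The traj_reward list and the history tensors are the raw material for a Monte Carlo policy-gradient update. A sketch of how such a reward list is usually converted into discounted returns (gamma is an assumed discount factor):

import torch

def discounted_returns(traj_reward, gamma: float = 0.99) -> torch.Tensor:
    """Compute G_t = r_t + gamma * G_{t+1} for every step of one trajectory."""
    returns = []
    running = 0.0
    for reward in reversed(traj_reward):
        running = reward + gamma * running
        returns.insert(0, running)
    return torch.tensor(returns, dtype=torch.float32)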
Example #9
    def create_model(self) -> None:
        state_dim, action_dim, discrete, _ = get_env_properties(self.env)
        if discrete:
            raise Exception(
                "Discrete Environments not supported for {}.".format(__class__.__name__)
            )
        if self.noise is not None:
            self.noise = self.noise(
                np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
            )

        self.ac = get_model("ac", self.network_type)(
            state_dim, action_dim, self.layers, "Qsa", False
        ).to(self.device)

        self.ac.qf1 = self.ac.critic
        self.ac.qf2 = get_model("v", self.network_type)(
            state_dim, action_dim, hidden=self.layers, val_type="Qsa"
        )

        self.ac.qf1.to(self.device)
        self.ac.qf2.to(self.device)

        self.ac_target = deepcopy(self.ac).to(self.device)

        # freeze target network params
        for param in self.ac_target.parameters():
            param.requires_grad = False

        self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
        self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters())
        self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)

        self.optimizer_policy = torch.optim.Adam(
            self.ac.actor.parameters(), lr=self.lr_p
        )
Example #10
    def create_model(self):
        """
        Initialize the actor and critic networks
        """
        input_dim, action_dim, discrete, action_lim = get_env_properties(
            self.env, self.network_type
        )

        # Instantiate networks and optimizers
        self.actor = get_model("p", self.network_type)(
            input_dim, action_dim, self.layers, "V", discrete, action_lim=action_lim
        ).to(self.device)

        self.optimizer_policy = opt.Adam(self.actor.parameters(), lr=self.lr_policy)

        self.rollout = RolloutBuffer(self.rollout_size, self.env)
Example #11
    def create_model(self) -> None:
        """
        Creates the actor-critic model and initialises the optimizers
        """
        (state_dim, action_dim, discrete,
         action_lim) = self.get_env_properties()

        if self.noise is not None:
            self.noise = self.noise(np.zeros_like(action_dim),
                                    self.noise_std * np.ones_like(action_dim))

        self.ac = get_model("ac", self.network_type)(state_dim,
                                                     action_dim,
                                                     self.layers,
                                                     "V",
                                                     discrete,
                                                     action_lim=action_lim).to(
                                                         self.device)

        self.actor_optimizer = opt.Adam(self.ac.actor.parameters(),
                                        lr=self.lr_actor)

        self.critic_optimizer = opt.Adam(self.ac.critic.parameters(),
                                         lr=self.lr_critic)

        self.traj_reward = []
        self.actor_hist = torch.Tensor().to(self.device)
        self.critic_hist = torch.Tensor().to(self.device)

        self.actor_loss_hist = torch.Tensor().to(self.device)
        self.critic_loss_hist = torch.Tensor().to(self.device)

        # load parameters if already trained
        if self.run_num is not None:
            self.load(self)
            self.ac.actor.load_state_dict(self.checkpoint["actor_weights"])
            self.ac.critic.load_state_dict(self.checkpoint["critic_weights"])
            for key, item in self.checkpoint.items():
                if key not in ["actor_weights", "critic_weights"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")
Example #12
    def create_model(self):
        # Instantiate networks and optimizers
        input_dim, action_dim, discrete, action_lim = get_env_properties(
            self.env, self.network_type)

        self.ac = get_model("ac", self.network_type)(
            input_dim,
            action_dim,
            self.layers,
            "V",
            discrete,
            action_lim=action_lim,
            activation=self.activation,
        ).to(self.device)

        self.optimizer_policy = opt.Adam(self.ac.actor.parameters(),
                                         lr=self.lr_policy)
        self.optimizer_value = opt.Adam(self.ac.critic.parameters(),
                                        lr=self.lr_value)

        self.rollout = RolloutBuffer(self.rollout_size,
                                     self.env,
                                     gae_lambda=0.95)
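The gae_lambda=0.95 argument parameterizes generalized advantage estimation over the collected rollout. A sketch of the GAE recursion it refers to (gamma is an assumed discount factor; inputs are per-step arrays from one rollout):

import numpy as np

def compute_gae(rewards, values, dones, last_value,
                gamma: float = 0.99, gae_lambda: float = 0.95) -> np.ndarray:
    """Advantages as the lambda-weighted sum of TD errors delta_t."""
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        non_terminal = 1.0 - float(dones[t])
        delta = rewards[t] + gamma * next_value * non_terminal - values[t]
        gae = delta + gamma * gae_lambda * non_terminal * gae
        advantages[t] = gae
        next_value = values[t]
    return advantages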
Example #13
    def create_model(self):
        state_dim = self.env.observation_space.shape[0]

        # initialize models
        if isinstance(self.env.action_space, gym.spaces.Discrete):
            action_dim = self.env.action_space.n
            disc = True
        elif isinstance(self.env.action_space, gym.spaces.Box):
            action_dim = self.env.action_space.shape[0]
            disc = False
        else:
            raise NotImplementedError

        self.q1 = get_model("v",
                            self.network_type)(state_dim, action_dim, "Qsa",
                                               self.layers).to(self.device)
        self.q2 = get_model("v",
                            self.network_type)(state_dim, action_dim, "Qsa",
                                               self.layers).to(self.device)

        self.policy = get_model("p",
                                self.network_type)(state_dim,
                                                   action_dim,
                                                   self.layers,
                                                   disc,
                                                   False,
                                                   sac=True).to(self.device)

        if self.pretrained is not None:
            self.load(self)
            self.q1.load_state_dict(self.checkpoint["q1_weights"])
            self.q2.load_state_dict(self.checkpoint["q2_weights"])
            self.policy.load_state_dict(self.checkpoint["policy_weights"])

            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.q1_targ = deepcopy(self.q1).to(self.device)
        self.q2_targ = deepcopy(self.q2).to(self.device)

        # freeze target parameters
        for p in self.q1_targ.parameters():
            p.requires_grad = False
        for p in self.q2_targ.parameters():
            p.requires_grad = False

        # optimizers
        self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
        self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
        self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

        if self.entropy_tuning:
            self.target_entropy = -torch.prod(
                torch.Tensor(self.env.action_space.shape).to(
                    self.device)).item()
            self.log_alpha = torch.zeros(1,
                                         requires_grad=True,
                                         device=self.device)
            self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

        self.replay_buffer = ReplayBuffer(self.replay_size)

        # set action scales
        if self.env.action_space is None:
            self.action_scale = torch.tensor(1.0).to(self.device)
            self.action_bias = torch.tensor(0.0).to(self.device)
        else:
            self.action_scale = torch.FloatTensor(
                (self.env.action_space.high - self.env.action_space.low) /
                2.0).to(self.device)
            self.action_bias = torch.FloatTensor(
                (self.env.action_space.high + self.env.action_space.low) /
                2.0).to(self.device)
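When entropy_tuning is enabled, log_alpha is learned by gradient descent so that the policy's entropy tracks target_entropy. A sketch of the temperature loss this setup is used for, assuming log-probabilities of freshly sampled actions are available:

import torch

def temperature_loss(log_alpha: torch.Tensor,
                     log_probs: torch.Tensor,
                     target_entropy: float) -> torch.Tensor:
    """Loss minimized by alpha_optim; only log_alpha receives gradient."""
    return -(log_alpha * (log_probs + target_entropy).detach()).mean()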
Example #14
    def create_model(self):
        '''
        Initialize the model and target model for various variants of DQN.
        Initializes the optimizer and replay buffers as well.
        '''
        state_dim, action_dim, disc = self.get_env_properties()
        if self.network_type == "mlp":
            if self.dueling_dqn:
                self.model = DuelingDQNValueMlp(state_dim, action_dim)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValue(
                    state_dim,
                    action_dim,
                    self.num_atoms,
                )
            elif self.noisy_dqn:
                self.model = NoisyDQNValue(state_dim, action_dim)
            else:
                self.model = get_model("v",
                                       self.network_type)(state_dim,
                                                          action_dim, "Qs")

        elif self.network_type == "cnn":
            if self.history_length is None:
                self.history_length = 4

            if self.transform is None:
                self.transform = transforms.Compose([
                    transforms.ToPILImage(),
                    transforms.Grayscale(),
                    transforms.Resize((110, 84)),
                    transforms.CenterCrop(84),
                    transforms.ToTensor()
                ])

            self.state_history = deque([
                self.transform(self.env.observation_space.sample()).reshape(
                    -1, 84, 84) for _ in range(self.history_length)
            ],
                                       maxlen=self.history_length)

            if self.dueling_dqn:
                self.model = DuelingDQNValueCNN(self.env.action_space.n,
                                                self.history_length)
            elif self.noisy_dqn:
                self.model = NoisyDQNValueCNN(self.env.action_space.n,
                                              self.history_length)
            elif self.categorical_dqn:
                self.model = CategoricalDQNValueCNN(self.env.action_space.n,
                                                    self.num_atoms,
                                                    self.history_length)
            else:
                self.model = get_model("v", self.network_type)(
                    self.env.action_space.n, self.history_length, "Qs")

        # load parameters if already trained
        if self.pretrained is not None:
            self.load(self)
            self.model.load_state_dict(self.checkpoint["weights"])
            for key, item in self.checkpoint.items():
                if key not in ["weights", "save_model"]:
                    setattr(self, key, item)
            print("Loaded pretrained model")

        self.target_model = deepcopy(self.model)

        if self.prioritized_replay:
            self.replay_buffer = PrioritizedBuffer(
                self.replay_size, self.prioritized_replay_alpha)
        else:
            self.replay_buffer = ReplayBuffer(self.replay_size)

        self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)
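The transform pipeline and the fixed-length deque implement the usual grayscale-resize-crop preprocessing and frame stacking for image observations. An illustrative helper showing how they would be used together on a new frame (assumes raw observations are H x W x C uint8 arrays, as sampled above; the function name is hypothetical):

import torch

def push_frame(state_history, transform, observation) -> torch.Tensor:
    """Preprocess one raw frame, append it to the deque, and return the stacked state."""
    frame = transform(observation).reshape(-1, 84, 84)   # 1 x 84 x 84 tensor
    state_history.append(frame)                          # deque evicts the oldest frame
    return torch.cat(list(state_history), dim=0).unsqueeze(0)  # 1 x history_length x 84 x 84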