def create_model(self) -> None: """ Initialize the model Initializes optimizer and replay buffers as well. """ state_dim, action_dim, discrete, _ = get_env_properties(self.env) self.q1 = (get_model("v", self.network_type)( state_dim, action_dim, "Qsa", self.layers).to(self.device).float()) self.q2 = (get_model("v", self.network_type)( state_dim, action_dim, "Qsa", self.layers).to(self.device).float()) self.policy = (get_model( "p", self.network_type)(state_dim, action_dim, self.layers, discrete, False, sac=True).to(self.device).float()) self.q1_targ = deepcopy(self.q1).to(self.device).float() self.q2_targ = deepcopy(self.q2).to(self.device).float() # freeze target parameters for param in self.q1_targ.parameters(): param.requires_grad = False for param in self.q2_targ.parameters(): param.requires_grad = False # optimizers self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr) self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr) self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr) if self.entropy_tuning: self.target_entropy = -torch.prod( torch.Tensor(self.env.action_space.shape).to( self.device)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device) self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr) self.replay_buffer = ReplayBuffer(self.replay_size, self.env) # set action scales if self.env.action_space is None: self.action_scale = torch.tensor(1.0).to(self.device) self.action_bias = torch.tensor(0.0).to(self.device) else: self.action_scale = torch.FloatTensor( (self.env.action_space.high - self.env.action_space.low) / 2.0).to(self.device) self.action_bias = torch.FloatTensor( (self.env.action_space.high + self.env.action_space.low) / 2.0).to(self.device)
def create_model(self):
    # Instantiate networks and optimizers
    state_dim, action_dim, disc, action_lim = self.get_env_properties(self.env)

    self.policy_new = get_model("p", self.network_type)(
        state_dim, action_dim, self.layers, disc=disc, action_lim=action_lim
    )
    self.policy_old = get_model("p", self.network_type)(
        state_dim, action_dim, self.layers, disc=disc, action_lim=action_lim
    )
    self.policy_new = self.policy_new.to(self.device)
    self.policy_old = self.policy_old.to(self.device)

    self.value_fn = get_model("v", self.network_type)(state_dim, action_dim).to(
        self.device
    )

    # load parameters if already trained
    if self.pretrained is not None:
        self.load(self)
        self.policy_new.load_state_dict(self.checkpoint["policy_weights"])
        self.value_fn.load_state_dict(self.checkpoint["value_weights"])
        for key, item in self.checkpoint.items():
            if key not in ["policy_weights", "value_weights", "save_model"]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.policy_old.load_state_dict(self.policy_new.state_dict())

    self.optimizer_policy = opt.Adam(self.policy_new.parameters(), lr=self.lr_policy)
    self.optimizer_value = opt.Adam(self.value_fn.parameters(), lr=self.lr_value)

    self.traj_reward = []
    self.policy_old.policy_hist = Variable(torch.Tensor()).to(self.device)
    self.policy_new.policy_hist = Variable(torch.Tensor()).to(self.device)
    self.value_fn.value_hist = Variable(torch.Tensor()).to(self.device)
    self.policy_new.loss_hist = Variable(torch.Tensor()).to(self.device)
    self.value_fn.loss_hist = Variable(torch.Tensor()).to(self.device)

def create_model(self):
    state_dim = self.env.observation_space.shape[0]
    action_dim = self.env.action_space.shape[0]

    if self.noise is not None:
        self.noise = self.noise(
            np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
        )

    self.ac = get_model("ac", self.network_type)(
        state_dim, action_dim, self.layers, "Qsa", False
    ).to(self.device)

    # load parameters if already trained
    if self.pretrained is not None:
        self.load(self)
        self.ac.load_state_dict(self.checkpoint["weights"])
        for key, item in self.checkpoint.items():
            if key not in ["weights", "save_model"]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.ac_target = deepcopy(self.ac).to(self.device)

    # freeze target network params
    for param in self.ac_target.parameters():
        param.requires_grad = False

    self.replay_buffer = ReplayBuffer(self.replay_size)
    self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_p)
    self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)

def create_model(self) -> None: """ Initialize the model Initializes optimizer and replay buffers as well. """ state_dim, action_dim, discrete, _ = get_env_properties(self.env) if discrete: raise Exception( "Discrete Environments not supported for {}.".format( __class__.__name__)) if self.noise is not None: self.noise = self.noise(np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)) self.ac = get_model("ac", self.network_type)(state_dim, action_dim, self.layers, "Qsa", False).to(self.device) self.ac_target = deepcopy(self.ac).to(self.device) # freeze target network params for param in self.ac_target.parameters(): param.requires_grad = False self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_p) self.optimizer_q = opt.Adam(self.ac.critic.parameters(), lr=self.lr_q)
def create_model(self) -> None: """ Creates actor critic model and initialises optimizers """ input_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network_type) if self.noise is not None: self.noise = self.noise(np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)) self.ac = get_model("ac", self.network_type)(input_dim, action_dim, self.layers, "V", discrete, action_lim=action_lim).to( self.device) self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy) self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value) self.rollout = RolloutBuffer(self.rollout_size, self.env)
def create_model(self):
    state_dim, action_dim, disc = self.get_env_properties()

    if self.noise is not None:
        self.noise = self.noise(
            np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
        )

    self.ac = get_model("ac", self.network_type)(
        state_dim, action_dim, self.layers, "Qsa", False
    ).to(self.device)
    self.ac.qf1 = self.ac.critic
    self.ac.qf2 = get_model("v", self.network_type)(
        state_dim, action_dim, hidden=self.layers, val_type="Qsa"
    )
    self.ac.qf1.to(self.device)
    self.ac.qf2.to(self.device)

    # load parameters if already trained
    if self.pretrained is not None:
        self.load(self)
        self.ac.actor.load_state_dict(self.checkpoint["policy_weights"])
        self.ac.qf1.load_state_dict(self.checkpoint["q1_weights"])
        self.ac.qf2.load_state_dict(self.checkpoint["q2_weights"])
        for key, item in self.checkpoint.items():
            if key not in [
                "policy_weights",
                "q1_weights",
                "q2_weights",
                "save_model",
            ]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.ac_target = deepcopy(self.ac).to(self.device)

    # freeze target network params
    for param in self.ac_target.parameters():
        param.requires_grad = False

    self.replay_buffer = ReplayBuffer(self.replay_size)
    self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters())
    self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)
    self.optimizer_policy = torch.optim.Adam(self.ac.actor.parameters(), lr=self.lr_p)

def create_model(self) -> None: """ Initialize the model and target model for various variants of DQN. Initializes optimizer and replay buffers as well. """ state_dim, action_dim, _, _ = get_env_properties(self.env) if self.network_type == "mlp": if self.dueling_dqn: self.model = DuelingDQNValueMlp(state_dim, action_dim) elif self.categorical_dqn: self.model = CategoricalDQNValue(state_dim, action_dim, self.num_atoms) elif self.noisy_dqn: self.model = NoisyDQNValue(state_dim, action_dim) else: self.model = get_model("v", self.network_type)(state_dim, action_dim, "Qs") elif self.network_type == "cnn": self.framestack = self.env.framestack if self.dueling_dqn: self.model = DuelingDQNValueCNN(action_dim, self.framestack) elif self.noisy_dqn: self.model = NoisyDQNValueCNN(action_dim, self.framestack) elif self.categorical_dqn: self.model = CategoricalDQNValueCNN(action_dim, self.num_atoms, self.framestack) else: self.model = get_model("v", self.network_type)(action_dim, self.framestack, "Qs") self.target_model = deepcopy(self.model) if self.prioritized_replay: self.replay_buffer = PrioritizedBuffer( self.replay_size, self.prioritized_replay_alpha) else: self.replay_buffer = ReplayBuffer(self.replay_size, self.env) self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)
def create_model(self):
    """
    Initialize the actor and critic networks
    """
    state_dim = self.env.observation_space.shape[0]
    action_lim = None

    if isinstance(self.env.action_space, gym.spaces.Discrete):
        action_dim = self.env.action_space.n
        discrete = True
    elif isinstance(self.env.action_space, gym.spaces.Box):
        action_dim = self.env.action_space.shape[0]
        action_lim = self.env.action_space.high[0]
        discrete = False
    else:
        raise NotImplementedError

    # Instantiate networks and optimizers
    self.ac = get_model("ac", self.network_type)(
        state_dim, action_dim, self.layers, "V", discrete, action_lim=action_lim
    ).to(self.device)

    # load parameters if already trained
    if self.pretrained is not None:
        self.load(self)
        self.ac.actor.load_state_dict(self.checkpoint["policy_weights"])
        self.ac.critic.load_state_dict(self.checkpoint["value_weights"])
        for key, item in self.checkpoint.items():
            if key not in ["policy_weights", "value_weights", "save_model"]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy)
    self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value)

    self.policy_hist = Variable(torch.Tensor()).to(self.device)
    self.value_hist = Variable(torch.Tensor()).to(self.device)
    self.traj_reward = []
    self.policy_loss_hist = Variable(torch.Tensor()).to(self.device)
    self.value_loss_hist = Variable(torch.Tensor()).to(self.device)

def create_model(self) -> None:
    """
    Initialize the model

    Initializes optimizer and replay buffers as well.
    """
    state_dim, action_dim, discrete, _ = get_env_properties(self.env)
    if discrete:
        raise Exception(
            "Discrete Environments not supported for {}.".format(__class__.__name__)
        )

    if self.noise is not None:
        self.noise = self.noise(
            np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)
        )

    self.ac = get_model("ac", self.network_type)(
        state_dim, action_dim, self.layers, "Qsa", False
    ).to(self.device)
    self.ac.qf1 = self.ac.critic
    self.ac.qf2 = get_model("v", self.network_type)(
        state_dim, action_dim, hidden=self.layers, val_type="Qsa"
    )
    self.ac.qf1.to(self.device)
    self.ac.qf2.to(self.device)

    self.ac_target = deepcopy(self.ac).to(self.device)

    # freeze target network params
    for param in self.ac_target.parameters():
        param.requires_grad = False

    self.replay_buffer = ReplayBuffer(self.replay_size, self.env)
    self.q_params = list(self.ac.qf1.parameters()) + list(self.ac.qf2.parameters())
    self.optimizer_q = torch.optim.Adam(self.q_params, lr=self.lr_q)
    self.optimizer_policy = torch.optim.Adam(self.ac.actor.parameters(), lr=self.lr_p)

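# Hedged sketch of why qf1 and qf2 share one optimizer (q_params): TD3-style
# agents form the critic target from the minimum of the twin target critics,
# which curbs overestimation bias. All tensor names below are hypothetical
# batch quantities, not taken from the source.
import torch


def td3_q_target(reward, done, q1_targ_val, q2_targ_val, gamma=0.99):
    # bootstrap from the pessimistic (minimum) estimate of the two critics
    return reward + gamma * (1.0 - done) * torch.min(q1_targ_val, q2_targ_val)
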
def create_model(self): """ Initialize the actor and critic networks """ input_dim, action_dim, discrete, action_lim = get_env_properties( self.env, self.network_type ) # Instantiate networks and optimizers self.actor = get_model("p", self.network_type)( input_dim, action_dim, self.layers, "V", discrete, action_lim=action_lim ).to(self.device) self.optimizer_policy = opt.Adam(self.actor.parameters(), lr=self.lr_policy) self.rollout = RolloutBuffer(self.rollout_size, self.env,)
def create_model(self) -> None: """ Creates actor critic model and initialises optimizers """ (state_dim, action_dim, discrete, action_lim) = self.get_env_properties() if self.noise is not None: self.noise = self.noise(np.zeros_like(action_dim), self.noise_std * np.ones_like(action_dim)) self.ac = get_model("ac", self.network_type)(state_dim, action_dim, self.layers, "V", discrete, action_lim=action_lim).to( self.device) self.actor_optimizer = opt.Adam(self.ac.actor.parameters(), lr=self.lr_actor) self.critic_optimizer = opt.Adam(self.ac.critic.parameters(), lr=self.lr_critic) self.traj_reward = [] self.actor_hist = torch.Tensor().to(self.device) self.critic_hist = torch.Tensor().to(self.device) self.actor_loss_hist = torch.Tensor().to(self.device) self.critic_loss_hist = torch.Tensor().to(self.device) # load paramaters if already trained if self.run_num is not None: self.load(self) self.ac.actor.load_state_dict(self.checkpoint["actor_weights"]) self.ac.critic.load_state_dict(self.checkpoint["critic_weights"]) for key, item in self.checkpoint.items(): if key not in ["actor_weights", "critic_weights"]: setattr(self, key, item) print("Loaded pretrained model")
def create_model(self):
    # Instantiate networks and optimizers
    input_dim, action_dim, discrete, action_lim = get_env_properties(
        self.env, self.network_type
    )

    self.ac = get_model("ac", self.network_type)(
        input_dim,
        action_dim,
        self.layers,
        "V",
        discrete,
        action_lim=action_lim,
        activation=self.activation,
    ).to(self.device)

    self.optimizer_policy = opt.Adam(self.ac.actor.parameters(), lr=self.lr_policy)
    self.optimizer_value = opt.Adam(self.ac.critic.parameters(), lr=self.lr_value)

    self.rollout = RolloutBuffer(self.rollout_size, self.env, gae_lambda=0.95)

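# Hedged sketch of generalized advantage estimation (GAE), which the
# gae_lambda argument above suggests RolloutBuffer computes internally.
# Inputs are hypothetical per-step arrays for a single trajectory.
import numpy as np


def compute_gae(rewards, values, dones, last_value, gamma=0.99, gae_lambda=0.95):
    # backward recursion: A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
    advantages = np.zeros_like(rewards)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_value * (1 - dones[t]) - values[t]
        gae = delta + gamma * gae_lambda * (1 - dones[t]) * gae
        advantages[t] = gae
        next_value = values[t]
    return advantages
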
def create_model(self):
    state_dim = self.env.observation_space.shape[0]

    # initialize models
    if isinstance(self.env.action_space, gym.spaces.Discrete):
        action_dim = self.env.action_space.n
        disc = True
    elif isinstance(self.env.action_space, gym.spaces.Box):
        action_dim = self.env.action_space.shape[0]
        disc = False
    else:
        raise NotImplementedError

    self.q1 = get_model("v", self.network_type)(
        state_dim, action_dim, "Qsa", self.layers
    ).to(self.device)
    self.q2 = get_model("v", self.network_type)(
        state_dim, action_dim, "Qsa", self.layers
    ).to(self.device)
    self.policy = get_model("p", self.network_type)(
        state_dim, action_dim, self.layers, disc, False, sac=True
    ).to(self.device)

    # load parameters if already trained
    if self.pretrained is not None:
        self.load(self)
        self.q1.load_state_dict(self.checkpoint["q1_weights"])
        self.q2.load_state_dict(self.checkpoint["q2_weights"])
        self.policy.load_state_dict(self.checkpoint["policy_weights"])
        for key, item in self.checkpoint.items():
            if key not in [
                "q1_weights",
                "q2_weights",
                "policy_weights",
                "save_model",
            ]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.q1_targ = deepcopy(self.q1).to(self.device)
    self.q2_targ = deepcopy(self.q2).to(self.device)

    # freeze target parameters
    for p in self.q1_targ.parameters():
        p.requires_grad = False
    for p in self.q2_targ.parameters():
        p.requires_grad = False

    # optimizers
    self.q1_optimizer = opt.Adam(self.q1.parameters(), self.lr)
    self.q2_optimizer = opt.Adam(self.q2.parameters(), self.lr)
    self.policy_optimizer = opt.Adam(self.policy.parameters(), self.lr)

    if self.entropy_tuning:
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)
        ).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = opt.Adam([self.log_alpha], lr=self.lr)

    self.replay_buffer = ReplayBuffer(self.replay_size)

    # set action scales
    if self.env.action_space is None:
        self.action_scale = torch.tensor(1.0).to(self.device)
        self.action_bias = torch.tensor(0.0).to(self.device)
    else:
        self.action_scale = torch.FloatTensor(
            (self.env.action_space.high - self.env.action_space.low) / 2.0
        ).to(self.device)
        self.action_bias = torch.FloatTensor(
            (self.env.action_space.high + self.env.action_space.low) / 2.0
        ).to(self.device)

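# Hedged sketch of how the log_alpha and target_entropy set up above are
# commonly used for automatic entropy tuning in SAC; `log_probs` is a
# hypothetical batch of policy log-probabilities, not from the source.
import torch

log_alpha = torch.zeros(1, requires_grad=True)
target_entropy = -2.0        # e.g. -|A| for a 2-dimensional action space
log_probs = torch.randn(32)  # stand-in for log pi(a|s) over a sampled batch

# alpha is pushed up when the policy's entropy falls below target_entropy
alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
alpha_loss.backward()        # gradient reaches only log_alpha
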
def create_model(self):
    """
    Initialize the model and target model for various variants of DQN.

    Initializes optimizer and replay buffers as well.
    """
    state_dim, action_dim, disc = self.get_env_properties()

    if self.network_type == "mlp":
        if self.dueling_dqn:
            self.model = DuelingDQNValueMlp(state_dim, action_dim)
        elif self.categorical_dqn:
            self.model = CategoricalDQNValue(state_dim, action_dim, self.num_atoms)
        elif self.noisy_dqn:
            self.model = NoisyDQNValue(state_dim, action_dim)
        else:
            self.model = get_model("v", self.network_type)(
                state_dim, action_dim, "Qs"
            )
    elif self.network_type == "cnn":
        if self.history_length is None:
            self.history_length = 4

        if self.transform is None:
            self.transform = transforms.Compose(
                [
                    transforms.ToPILImage(),
                    transforms.Grayscale(),
                    transforms.Resize((110, 84)),
                    transforms.CenterCrop(84),
                    transforms.ToTensor(),
                ]
            )

        self.state_history = deque(
            [
                self.transform(self.env.observation_space.sample()).reshape(
                    -1, 84, 84
                )
                for _ in range(self.history_length)
            ],
            maxlen=self.history_length,
        )

        if self.dueling_dqn:
            self.model = DuelingDQNValueCNN(
                self.env.action_space.n, self.history_length
            )
        elif self.noisy_dqn:
            self.model = NoisyDQNValueCNN(
                self.env.action_space.n, self.history_length
            )
        elif self.categorical_dqn:
            self.model = CategoricalDQNValueCNN(
                self.env.action_space.n, self.num_atoms, self.history_length
            )
        else:
            self.model = get_model("v", self.network_type)(
                self.env.action_space.n, self.history_length, "Qs"
            )

    # load parameters if already trained
    if self.pretrained is not None:
        self.load(self)
        self.model.load_state_dict(self.checkpoint["weights"])
        for key, item in self.checkpoint.items():
            if key not in ["weights", "save_model"]:
                setattr(self, key, item)
        print("Loaded pretrained model")

    self.target_model = deepcopy(self.model)

    if self.prioritized_replay:
        self.replay_buffer = PrioritizedBuffer(
            self.replay_size, self.prioritized_replay_alpha
        )
    else:
        self.replay_buffer = ReplayBuffer(self.replay_size)

    self.optimizer = opt.Adam(self.model.parameters(), lr=self.lr)

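# Hedged sketch (not from the source): how the transform pipeline and frame
# history above typically turn a raw HWC uint8 frame into a stacked 84x84
# grayscale state. `frame` and the stacking step are illustrative; the actual
# agent code may assemble states differently.
from collections import deque

import numpy as np
import torch
from torchvision import transforms

transform = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Grayscale(),
        transforms.Resize((110, 84)),
        transforms.CenterCrop(84),
        transforms.ToTensor(),
    ]
)

frame = np.random.randint(0, 255, (210, 160, 3), dtype=np.uint8)  # Atari-sized frame
history = deque([transform(frame).reshape(-1, 84, 84) for _ in range(4)], maxlen=4)
state = torch.cat(list(history)).unsqueeze(0)  # shape: (1, 4, 84, 84)
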