def initial_phase_training(self, max_epochs=1000, sup_batch_size=64):
    # Switch the action-representation optimiser to Adam for the
    # self-supervised pre-training phase.
    self.action_rep.optim = torch.optim.Adam(self.action_rep.parameters(), lr=1e-3)
    initial_losses = []
    print("Initial training phase started...")
    for counter in range(max_epochs):
        losses = []
        states, actions, rewards, next_states, terminals = self.replay_memory.sample(
            sup_batch_size, random_machine=self.np_random)
        states = torch.from_numpy(states).to(device)
        actions_combined = torch.from_numpy(actions).to(device)
        # Separate the discrete action index (column 0) from the
        # action-parameters (the columns after the num_actions-wide action block).
        action = actions_combined[:, 0].long()
        action_para = actions_combined[:, self.num_actions + 1:]
        next_states = torch.from_numpy(next_states).to(device)
        loss = self.self_supervised_update(states, action, action_para, next_states)
        losses.append(loss)
        initial_losses.append(np.mean(losses))
        # Log the running mean over the last 10 epochs, every epoch.
        print("Epoch {} loss: {}".format(counter, np.mean(initial_losses[-10:])))
        # Terminate the initial phase once the action representations have converged.
        # if len(initial_losses) >= 20 and np.mean(initial_losses[-10:]) + 1e-5 >= np.mean(initial_losses[-20:]):
        #     print("Converged...")
        #     break
    print('... Initial training phase terminated!')
    self.initial_phase = False
    hard_update_target_network(self.action_rep, self.target_action_rep)
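# Hedged usage sketch, not part of the original file: initial_phase_training
# only samples from the replay buffer, so the buffer must already hold
# transitions (e.g. from a random policy) before it is called. `env`, `agent`,
# and the buffer-filling loop below are illustrative assumptions, not the
# repo's confirmed API.
#
#   agent = PADDPGAgent(env.observation_space, env.action_space)
#   state = env.reset()
#   for _ in range(10000):                      # warm up the buffer
#       action = env.action_space.sample()
#       next_state, reward, terminal, _ = env.step(action)
#       # store the flattened (index, action block, parameters) vector
#       agent.replay_memory.append(state, action, reward, next_state, terminal)
#       state = env.reset() if terminal else next_state
#   agent.initial_phase_training(max_epochs=agent.initial_phase_epochs)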
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.actor = MultiPassQActor(self.observation_space.shape[0],
                                 self.num_actions,
                                 self.action_parameter_sizes,
                                 **kwargs['actor_kwargs']).to(device)
    self.actor_target = MultiPassQActor(self.observation_space.shape[0],
                                        self.num_actions,
                                        self.action_parameter_sizes,
                                        **kwargs['actor_kwargs']).to(device)
    hard_update_target_network(self.actor, self.actor_target)
    self.actor_target.eval()
    self.actor_optimiser = optim.Adam(self.actor.parameters(),
                                      lr=self.learning_rate_actor)
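# Hedged note and sketch (assumption): this subclass first runs the parent
# PDQNAgent.__init__ (which builds a plain QActor), then replaces the actor
# and its target with MultiPassQActor, which takes the per-action parameter
# sizes rather than the flat total size. Construction might look like the
# following; the enclosing class name and the actor_kwargs keys are
# illustrative, not confirmed by the source:
#
#   agent = MultiPassPDQNAgent(
#       env.observation_space, env.action_space,
#       actor_kwargs={'hidden_layers': (128,)})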
def set_action_parameter_passthrough_weights(self, initial_weights, initial_bias=None):
    passthrough_layer = self.actor_param.action_parameters_passthrough_layer
    assert initial_weights.shape == passthrough_layer.weight.data.size()
    passthrough_layer.weight.data = torch.Tensor(initial_weights).float().to(self.device)
    if initial_bias is not None:
        assert initial_bias.shape == passthrough_layer.bias.data.size()
        passthrough_layer.bias.data = torch.Tensor(initial_bias).float().to(self.device)
    # Freeze the passthrough layer so the hand-set weights are never updated
    # by the optimiser.
    passthrough_layer.requires_grad = False
    passthrough_layer.weight.requires_grad = False
    passthrough_layer.bias.requires_grad = False
    hard_update_target_network(self.actor_param, self.actor_param_target)
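# Hedged example (assumption, not from the original repo): one common way to
# use the passthrough layer is to make the ParamActor initially output the
# midpoint of each action-parameter range, with zero state-dependent
# contribution. `agent` is assumed to be an already-constructed PDQNAgent.
#
#   initial_weights = np.zeros((agent.action_parameter_size,
#                               agent.observation_space.shape[0]))
#   initial_bias = (agent.action_parameter_max_numpy +
#                   agent.action_parameter_min_numpy) / 2.0
#   agent.set_action_parameter_passthrough_weights(initial_weights, initial_bias)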
def __init__(
        self,
        observation_space,
        action_space,
        actor_class=QActor,
        actor_kwargs={},
        actor_param_class=ParamActor,
        actor_param_kwargs={},
        epsilon_initial=1.0,
        epsilon_final=0.05,
        epsilon_steps=10000,
        batch_size=64,
        gamma=0.99,
        tau_actor=0.01,  # Polyak averaging factor for copying target weights
        tau_actor_param=0.001,
        replay_memory_size=1000000,
        learning_rate_actor=0.0001,
        learning_rate_actor_param=0.00001,
        initial_memory_threshold=0,
        use_ornstein_noise=False,  # if False, uses epsilon-greedy with uniform-random action-parameter exploration
        loss_func=F.mse_loss,
        clip_grad=10,
        inverting_gradients=False,
        zero_index_gradients=False,
        indexed=False,
        weighted=False,
        average=False,
        random_weighted=False,
        device="cuda" if torch.cuda.is_available() else "cpu",
        seed=None):
    super(PDQNAgent, self).__init__(observation_space, action_space)
    self.device = torch.device(device)
    self.num_actions = self.action_space.spaces[0].n
    self.action_parameter_sizes = np.array([
        self.action_space.spaces[i].shape[0]
        for i in range(1, self.num_actions + 1)
    ])
    self.action_parameter_size = int(self.action_parameter_sizes.sum())
    self.action_max = torch.from_numpy(np.ones((self.num_actions,))).float().to(device)
    self.action_min = -self.action_max.detach()
    self.action_range = (self.action_max - self.action_min).detach()
    print([self.action_space.spaces[i].high
           for i in range(1, self.num_actions + 1)])  # debug: per-action parameter bounds
    self.action_parameter_max_numpy = np.concatenate([
        self.action_space.spaces[i].high
        for i in range(1, self.num_actions + 1)
    ]).ravel()
    self.action_parameter_min_numpy = np.concatenate([
        self.action_space.spaces[i].low
        for i in range(1, self.num_actions + 1)
    ]).ravel()
    self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                         self.action_parameter_min_numpy)
    self.action_parameter_max = torch.from_numpy(
        self.action_parameter_max_numpy).float().to(device)
    self.action_parameter_min = torch.from_numpy(
        self.action_parameter_min_numpy).float().to(device)
    self.action_parameter_range = torch.from_numpy(
        self.action_parameter_range_numpy).float().to(device)
    self.epsilon = epsilon_initial
    self.epsilon_initial = epsilon_initial
    self.epsilon_final = epsilon_final
    self.epsilon_steps = epsilon_steps
    self.indexed = indexed
    self.weighted = weighted
    self.average = average
    self.random_weighted = random_weighted
    # Only one of the weighted / average / random_weighted Q-combination
    # modes should be enabled at a time.
    assert (weighted ^ average ^ random_weighted) or not (weighted or average or random_weighted)
    self.action_parameter_offsets = self.action_parameter_sizes.cumsum()
    self.action_parameter_offsets = np.insert(self.action_parameter_offsets, 0, 0)
    self.batch_size = batch_size
    self.gamma = gamma
    self.replay_memory_size = replay_memory_size
    self.initial_memory_threshold = initial_memory_threshold
    self.learning_rate_actor = learning_rate_actor
    self.learning_rate_actor_param = learning_rate_actor_param
    self.inverting_gradients = inverting_gradients
    self.tau_actor = tau_actor
    self.tau_actor_param = tau_actor_param
    self._step = 0
    self._episode = 0
    self.updates = 0
    self.clip_grad = clip_grad
    self.zero_index_gradients = zero_index_gradients
    self.np_random = None
    self.seed = seed
    self._seed(seed)
    self.use_ornstein_noise = use_ornstein_noise
    self.noise = OrnsteinUhlenbeckActionNoise(
        self.action_parameter_size,
        random_machine=self.np_random,
        mu=0.,
        theta=0.15,
        sigma=0.0001)  # alternative: theta=0.01, sigma=0.01
    print(self.num_actions + self.action_parameter_size)  # debug: stored action-vector size
    self.replay_memory = Memory(replay_memory_size,
                                observation_space.shape,
                                (1 + self.action_parameter_size, ),
                                next_actions=False)
    self.actor = actor_class(self.observation_space.shape[0],
                             self.num_actions,
                             self.action_parameter_size,
                             **actor_kwargs).to(device)
    self.actor_target = actor_class(self.observation_space.shape[0],
                                    self.num_actions,
                                    self.action_parameter_size,
                                    **actor_kwargs).to(device)
    hard_update_target_network(self.actor, self.actor_target)
    self.actor_target.eval()
    self.actor_param = actor_param_class(self.observation_space.shape[0],
                                         self.num_actions,
                                         self.action_parameter_size,
                                         **actor_param_kwargs).to(device)
    self.actor_param_target = actor_param_class(self.observation_space.shape[0],
                                                self.num_actions,
                                                self.action_parameter_size,
                                                **actor_param_kwargs).to(device)
    hard_update_target_network(self.actor_param, self.actor_param_target)
    self.actor_param_target.eval()
    self.loss_func = loss_func  # l1_smooth_loss performs better, but the original paper used MSE
    # The original DDPG paper [Lillicrap et al. 2016] used a weight decay of 0.01
    # for Q (the critic), but weight_decay=0.01 on the critic optimiser seems to
    # perform worse here; AMSGrad (the "fixed" Adam, amsgrad=True) does not seem
    # to help either.
    self.actor_optimiser = optim.Adam(self.actor.parameters(),
                                      lr=self.learning_rate_actor)  # betas=(0.95, 0.999)
    self.actor_param_optimiser = optim.Adam(
        self.actor_param.parameters(),
        lr=self.learning_rate_actor_param)  # betas=(0.95, 0.999), weight_decay=critic_l2_reg
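# For reference, the soft (Polyak) update that tau_actor and tau_actor_param
# parameterise is conventionally implemented as below. This is a sketch of the
# standard technique, assuming the repo's soft-update helper has the usual
# semantics; it is not copied from the original source.
#
#   def soft_update_target_network(source, target, tau):
#       for src, tgt in zip(source.parameters(), target.parameters()):
#           # target <- tau * source + (1 - tau) * target
#           tgt.data.copy_(tau * src.data + (1.0 - tau) * tgt.data)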
def __init__(
        self,
        observation_space,
        action_space,
        actor_class=Actor,
        reduced_action_dim=3,
        parameter_action_dim=4,
        actor_kwargs={},
        critic_class=Critic,
        critic_kwargs={},
        epsilon_initial=1.0,
        epsilon_final=0.01,
        epsilon_steps=10000,
        batch_size=64,
        gamma=0.99,
        beta=0.5,  # averaging factor between off-policy and on-policy targets during n-step updates
        tau_actor=0.001,  # Polyak averaging factor for updating target weights
        tau_critic=0.001,
        replay_memory=None,  # memory buffer object
        replay_memory_size=1000000,
        learning_rate_actor=0.00001,
        learning_rate_critic=0.001,
        initial_memory_threshold=0,
        clip_grad=10,
        adam_betas=(0.95, 0.999),
        use_ornstein_noise=False,  # if False, uses epsilon-greedy with uniform-random action-parameter exploration
        loss_func=F.mse_loss,  # F.smooth_l1_loss is an alternative
        inverting_gradients=False,
        n_step_returns=False,
        initial_phase=True,
        embed_lr=1e-4,
        initial_phase_epochs=2000,
        seed=None):
    super(PADDPGAgent, self).__init__(observation_space, action_space)
    self.num_actions = self.action_space.spaces[0].n
    self.action_parameter_sizes = np.array([
        self.action_space.spaces[i].shape[0]
        for i in range(1, self.num_actions + 1)
    ])
    self.action_parameter_size = int(self.action_parameter_sizes.sum())
    self.action_max = torch.from_numpy(np.ones((self.num_actions,))).float().to(device)
    self.action_min = -self.action_max.detach()
    self.action_range = (self.action_max - self.action_min).detach()
    self.action_parameter_max_numpy = np.concatenate([
        self.action_space.spaces[i].high
        for i in range(1, self.num_actions + 1)
    ]).ravel()
    self.action_parameter_min_numpy = np.concatenate([
        self.action_space.spaces[i].low
        for i in range(1, self.num_actions + 1)
    ]).ravel()
    self.action_parameter_range_numpy = (self.action_parameter_max_numpy -
                                         self.action_parameter_min_numpy)
    self.action_parameter_max = torch.from_numpy(
        self.action_parameter_max_numpy).float().to(device)
    self.action_parameter_min = torch.from_numpy(
        self.action_parameter_min_numpy).float().to(device)
    self.action_parameter_range = torch.from_numpy(
        self.action_parameter_range_numpy).float().to(device)
    self.epsilon = epsilon_initial
    self.epsilon_initial = epsilon_initial
    self.epsilon_final = epsilon_final
    self.epsilon_steps = epsilon_steps
    self.clip_grad = clip_grad
    self.batch_size = batch_size
    self.gamma = gamma
    self.beta = beta
    self.replay_memory_size = replay_memory_size
    self.initial_memory_threshold = initial_memory_threshold
    self.learning_rate_actor = learning_rate_actor
    self.learning_rate_critic = learning_rate_critic
    self.inverting_gradients = inverting_gradients
    self.tau_actor = tau_actor
    self.tau_critic = tau_critic
    self._step = 0
    self._episode = 0
    self.updates = 0
    self.np_random = None
    self.seed = seed
    self._seed(seed)

    # Action-representation (embedding) initialisation.
    self.action_rep = ActionRepresentation.Action_representation(
        state_dim=self.observation_space.shape[0],
        action_dim=self.num_actions,
        reduced_action_dim=self.num_actions,
        parameter_action_dim=self.action_parameter_size)
    self.target_action_rep = ActionRepresentation.Action_representation(
        state_dim=self.observation_space.shape[0],
        action_dim=self.num_actions,
        reduced_action_dim=self.num_actions,
        parameter_action_dim=self.action_parameter_size)
    hard_update_target_network(self.action_rep, self.target_action_rep)
    self.initial_phase = initial_phase
    self.reduced_action_dim = reduced_action_dim
    self.parameter_action_dim = parameter_action_dim
    self.embed_lr = embed_lr
    self.initial_phase_epochs = initial_phase_epochs
    self.use_ornstein_noise = use_ornstein_noise
    self.noise = OrnsteinUhlenbeckActionNoise(
        self.action_parameter_size,
        random_machine=self.np_random,
        mu=0.,
        theta=0.15,
        sigma=0.0001)
    self.noise1 = OrnsteinUhlenbeckActionNoise(self.num_actions)
    print(self.num_actions + self.action_parameter_size)  # debug: stored action-vector size
    self.n_step_returns = n_step_returns
    if replay_memory is None:
        # Stored action vector: discrete action index, a num_actions-wide
        # action block, then the concatenated action-parameters.
        self.replay_memory = MemoryNStepReturns(
            replay_memory_size,
            observation_space.shape,
            (1 + self.num_actions + self.action_parameter_size, ),
            next_actions=False,
            n_step_returns=self.n_step_returns)
    else:
        self.replay_memory = replay_memory
    self.actor = actor_class(self.observation_space.shape[0],
                             self.num_actions,
                             self.action_parameter_size,
                             **actor_kwargs).to(device)
    self.actor_target = actor_class(self.observation_space.shape[0],
                                    self.num_actions,
                                    self.action_parameter_size,
                                    **actor_kwargs).to(device)
    hard_update_target_network(self.actor, self.actor_target)
    self.actor_target.eval()
    self.critic = critic_class(self.observation_space.shape[0],
                               self.num_actions,
                               self.action_parameter_size,
                               **critic_kwargs).to(device)
    self.critic_target = critic_class(self.observation_space.shape[0],
                                      self.num_actions,
                                      self.action_parameter_size,
                                      **critic_kwargs).to(device)
    hard_update_target_network(self.critic, self.critic_target)
    self.critic_target.eval()
    self.loss_func = loss_func  # l1_smooth_loss performs better, but the original paper used MSE
    self.actor_optimiser = optim.Adam(self.actor.parameters(),
                                      lr=self.learning_rate_actor,
                                      betas=adam_betas)
    self.critic_optimiser = optim.Adam(self.critic.parameters(),
                                       lr=self.learning_rate_critic,
                                       betas=adam_betas)
    self.action_rep_optimiser = optim.SGD(self.action_rep.parameters(),
                                          lr=self.embed_lr)
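# Sketch (assumption, following the beta parameter's description above): with
# n-step returns enabled, the critic target is conventionally a beta-weighted
# mix of the one-step (off-policy) bootstrap and the n-step (on-policy)
# return. Tensor names here are illustrative, not the repo's own:
#
#   q_next = critic_target(next_states, actor_target(next_states))
#   off_policy_target = rewards + gamma * (1 - terminals) * q_next
#   target = (1.0 - beta) * off_policy_target + beta * n_step_returns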