def __init__(self, task):
    """Build a DDPG agent for *task*: actor/critic pairs, OU noise, replay memory."""
    self.task = task
    # Cache the task's interface dimensions and action bounds.
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Policy (actor) networks: a local net that is trained and a target
    # net that supplies stable training targets.
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Value (critic) networks, same local/target split.
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Start both target networks as exact copies of the local ones.
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Ornstein-Uhlenbeck exploration noise.
    self.exploration_mu = 0.3
    self.exploration_theta = 2.0
    self.exploration_sigma = 20
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Experience replay memory.
    self.buffer_size = 100000
    self.batch_size = 10
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Learning hyper-parameters.
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # soft-update rate for the target networks
def __init__(self, lr_actor=0.0003, lr_critic=0.0003, state_dim=8,
             discount=0.99, action_dim=1, replay_buffer_capacity=1000000,
             tau=0.005, batch_size=256, reward_scaling=1,
             rbc_controller=RBCAgent, safe_exploration=None,
             hidden_dim=None):
    """Construct the SAC agent: replay memory, actor, twin critics, value nets."""
    # Core hyper-parameters.
    self.gamma = discount
    self.tau = tau
    self.batch_size = batch_size
    self.n_actions = action_dim
    self.scale = reward_scaling

    # Rule-based controller used to bootstrap safe exploration.
    self.rbc_controller = rbc_controller
    self.safe_exploration = safe_exploration
    self.hidden_size = hidden_dim

    # Experience replay.
    self.memory = ReplayBuffer(input_shape=state_dim, n_actions=action_dim,
                               max_mem_size=replay_buffer_capacity)

    # Actor, twin critics, and the state-value network with its target copy.
    self.actor = ActorNetwork(learning_rate=lr_actor, input_size=state_dim,
                              max_action=1, n_actions=action_dim,
                              name='actor', hidden_size=self.hidden_size)
    self.critic_1 = CriticNetwork(learning_rate=lr_critic,
                                  input_size=state_dim, n_actions=action_dim,
                                  name='critic_1', hidden_size=self.hidden_size)
    self.critic_2 = CriticNetwork(learning_rate=lr_critic,
                                  input_size=state_dim, n_actions=action_dim,
                                  name='critic_2', hidden_size=self.hidden_size)
    self.value = ValueNetwork(learning_rate=lr_critic, input_size=state_dim,
                              name='value', hidden_size=self.hidden_size)
    self.target_value = ValueNetwork(learning_rate=lr_critic,
                                     input_size=state_dim,
                                     name='target_value',
                                     hidden_size=self.hidden_size)

    # tau=1 performs a hard copy of the value weights into the target net.
    self.update_network_parameters(tau=1)
def __init__(self, task, expl_mu, expl_th, expl_sigma, gamma, tau, batch=64):
    """DDPG agent wired to *task*, with tunable OU-noise and learning params."""
    self.task = task
    # Task interface dimensions and action bounds.
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Local/target actor pair.
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Local/target critic pair.
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Targets start as exact copies of the local networks.
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Ornstein-Uhlenbeck exploration noise, fully caller-configurable.
    self.exploration_mu = expl_mu
    self.exploration_theta = expl_th
    self.exploration_sigma = expl_sigma
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Experience replay memory.
    self.buffer_size = 200000
    self.batch_size = batch
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Learning hyper-parameters.
    self.gamma = gamma  # discount factor
    self.tau = tau      # soft-update rate for the target networks
def __init__(self, task):
    """DDPG agent for *task* with fixed hyper-parameters and score tracking."""
    self.task = task
    # Task interface dimensions and action bounds.
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Local/target actor pair.
    self.actor_local = Actor(self.state_size, self.action_size,
                             self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size,
                              self.action_low, self.action_high)

    # Local/target critic pair.
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Targets start as exact copies of the local networks.
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Ornstein-Uhlenbeck exploration noise.
    self.exploration_mu = 0
    self.exploration_theta = 0.10
    self.exploration_sigma = 0.15
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Experience replay memory.
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Learning hyper-parameters.
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # soft-update rate for the target networks

    # Best score seen so far and whether the task goal was reached.
    self.best_score = -np.inf
    self.achievement = False

    # Initialise per-episode state (noise, last_state).
    self.reset_episode()
def train():
    """Train an SFDQN and a baseline DQN, then plot averaged test returns.

    Relies on module-level configuration (``sfdqn_params``, ``dqn_params``,
    ``agent_params``, ``n_samples`` and the two model lambdas).  The
    comparison figure is written to ``figures/sfdqn_return.png``.
    """
    import os  # local import: only needed for the output directory below

    # build SFDQN
    print('building SFDQN')
    deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params)
    sfdqn = SFDQN(deep_sf=deep_sf,
                  buffer=ReplayBuffer(sfdqn_params['buffer_params']),
                  **sfdqn_params, **agent_params)

    # train SFDQN
    print('training SFDQN')
    train_tasks, test_tasks = generate_tasks(False)
    sfdqn_perf = sfdqn.train(train_tasks, n_samples, test_tasks=test_tasks,
                             n_test_ev=agent_params['n_test_ev'])

    # build DQN
    print('building DQN')
    dqn = DQN(model_lambda=dqn_model_lambda,
              buffer=ReplayBuffer(dqn_params['buffer_params']),
              **dqn_params, **agent_params)

    # training DQN
    print('training DQN')
    train_tasks, test_tasks = generate_tasks(True)
    dqn_perf = dqn.train(train_tasks, n_samples, test_tasks=test_tasks,
                         n_test_ev=agent_params['n_test_ev'])

    # Moving-average smoothing; mode='same' distorts the edges, which is
    # why the trailing samples are trimmed below.
    def smooth(y, box_pts):
        return np.convolve(y, np.ones(box_pts) / box_pts, mode='same')

    sfdqn_perf = smooth(sfdqn_perf, 10)[:-5]
    dqn_perf = smooth(dqn_perf, 10)[:-5]
    x = np.linspace(0, 4, sfdqn_perf.size)

    # reporting progress
    ticksize = 14
    textsize = 18
    plt.rc('font', size=textsize)        # controls default text sizes
    plt.rc('axes', titlesize=textsize)   # fontsize of the axes title
    plt.rc('axes', labelsize=textsize)   # fontsize of the x and y labels
    plt.rc('xtick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('legend', fontsize=ticksize)  # legend fontsize

    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    ax.plot(x, sfdqn_perf, label='SFDQN')
    ax.plot(x, dqn_perf, label='DQN')
    plt.xlabel('training task index')
    plt.ylabel('averaged test episode reward')
    plt.title('Testing Reward Averaged over all Test Tasks')
    plt.tight_layout()
    plt.legend(frameon=False)
    # Fix: create the output directory on demand so savefig cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs('figures', exist_ok=True)
    plt.savefig('figures/sfdqn_return.png')
def __init__(self, observation_space=None, action_space=None,
             hidden_dim=None, discount=0.99, tau=0.005, lr=None,
             batch_size=256, replay_buffer_capacity=1e5,
             start_training=None, exploration_period=None,
             action_scaling_coef=1., reward_scaling=1., update_per_step=1,
             iterations_as=2, seed=0, deterministic=None,
             rbc_controller=None, safe_exploration=None):
    """Set up the SAC agent: replay memory, twin soft-Q nets, targets, policy."""
    if hidden_dim is None:
        hidden_dim = [256, 256]

    # Hyper-parameter bookkeeping.
    self.start_training = start_training
    self.discount = discount
    self.batch_size = batch_size
    self.tau = tau
    self.action_scaling_coef = action_scaling_coef
    self.reward_scaling = reward_scaling

    # Make the run reproducible across torch and numpy.
    t.manual_seed(seed)
    np.random.seed(seed)

    self.deterministic = deterministic
    self.update_per_step = update_per_step
    self.iterations_as = iterations_as
    self.exploration_period = exploration_period
    self.action_list_ = []
    self.action_list2_ = []
    self.hidden_dim = hidden_dim
    self.rbc_controller = rbc_controller
    self.safe_exploration = safe_exploration
    self.reset_action_tracker()
    self.reset_reward_tracker()
    self.time_step = 0
    self.action_space = action_space
    self.observation_space = observation_space

    # Huber loss for the soft-Q regression.
    self.soft_q_criterion = nn.SmoothL1Loss()

    # Run on the GPU when one is available.
    self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

    state_dim = self.observation_space.shape[0]
    action_dim = self.action_space.shape[0]
    # Fixed entropy temperature.
    self.alpha = 0.05
    self.memory = ReplayBuffer(input_shape=int(state_dim),
                               n_actions=int(action_dim),
                               max_mem_size=int(replay_buffer_capacity))

    # Twin soft-Q networks plus their target copies.
    self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
    self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
    self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
    self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)

    # Hard-copy the online weights into both target networks.
    for online_net, target_net in ((self.soft_q_net1, self.target_soft_q_net1),
                                   (self.soft_q_net2, self.target_soft_q_net2)):
        for target_param, param in zip(target_net.parameters(),
                                       online_net.parameters()):
            target_param.data.copy_(param.data)

    # Gaussian policy network and the three optimizers.
    self.policy_net = PolicyNetwork(state_dim, action_dim, self.action_space,
                                    self.action_scaling_coef,
                                    hidden_dim).to(self.device)
    self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=lr)
    self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)

    # Entropy-temperature machinery (alpha itself stays fixed at 0.05 above).
    self.target_entropy = -np.prod(self.action_space.shape).item()
    self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)
class SAC2Agent():
    """Soft Actor-Critic agent for a continuous action space.

    Twin soft-Q networks with target copies, a Gaussian policy, and a fixed
    entropy temperature (``alpha``).  For the first ``safe_exploration``
    simulation steps, actions come from a rule-based controller instead of
    the learned policy.
    """

    def __init__(self, observation_space=None, action_space=None,
                 hidden_dim=None, discount=0.99, tau=0.005, lr=None,
                 batch_size=256, replay_buffer_capacity=1e5,
                 start_training=None, exploration_period=None,
                 action_scaling_coef=1., reward_scaling=1.,
                 update_per_step=1, iterations_as=2, seed=0,
                 deterministic=None, rbc_controller=None,
                 safe_exploration=None):
        # Default hidden layout shared by all networks.
        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.start_training = start_training
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.action_scaling_coef = action_scaling_coef
        self.reward_scaling = reward_scaling
        # Seed torch and numpy for reproducibility.
        t.manual_seed(seed)
        np.random.seed(seed)
        self.deterministic = deterministic
        self.update_per_step = update_per_step
        self.iterations_as = iterations_as
        self.exploration_period = exploration_period
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.reset_action_tracker()
        self.reset_reward_tracker()
        self.time_step = 0
        self.action_space = action_space
        self.observation_space = observation_space
        # Optimizers/Loss using the Huber loss
        self.soft_q_criterion = nn.SmoothL1Loss()
        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")
        state_dim = self.observation_space.shape[0]
        action_dim = self.action_space.shape[0]
        # Fixed entropy temperature; learn() re-sets it to 0.05 each update.
        self.alpha = 0.05
        self.memory = ReplayBuffer(input_shape=int(state_dim),
                                   n_actions=int(action_dim),
                                   max_mem_size=int(replay_buffer_capacity))
        # init networks
        self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
        # Hard-copy the online weights into both target networks.
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)
        # Policy
        self.policy_net = PolicyNetwork(state_dim, action_dim,
                                        self.action_space,
                                        self.action_scaling_coef,
                                        hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=lr)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
        # Entropy-tuning machinery (unused while alpha stays fixed).
        self.target_entropy = -np.prod(self.action_space.shape).item()
        self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)

    def reset_action_tracker(self):
        # Clear the per-run action log.
        self.action_tracker = []

    def reset_reward_tracker(self):
        # Clear the per-run reward log.
        self.reward_tracker = []

    def choose_action(self, simulation_step, electricity_price, storage_soc,
                      observation):
        """Return an action as a numpy array: rule-based during the safe
        exploration phase, sampled from the SAC policy afterwards."""
        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price,
                storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.device)
            # print(action)
        else:
            if self.device.type == "cuda":
                state = t.cuda.FloatTensor([observation]).to(self.device)
            else:
                state = t.FloatTensor([observation]).to(self.device)
            actions, _, _ = self.policy_net.sample(state)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        # Push one transition into the replay buffer.
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        """One SAC update: twin soft-Q regression, policy step, soft target sync."""
        # Wait until the buffer can fill one batch.
        if self.memory.mem_ctr < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample_buffer(
            self.batch_size)
        if self.device.type == "cuda":
            state = t.cuda.FloatTensor(state).to(self.device)
            next_state = t.cuda.FloatTensor(next_state).to(self.device)
            action = t.cuda.FloatTensor(action).to(self.device)
            reward = t.cuda.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.cuda.FloatTensor(done).unsqueeze(1).to(self.device)
        else:
            state = t.FloatTensor(state).to(self.device)
            next_state = t.FloatTensor(next_state).to(self.device)
            action = t.FloatTensor(action).to(self.device)
            reward = t.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.FloatTensor(done).unsqueeze(1).to(self.device)
        with t.no_grad():
            # Update Q-values. First, sample an action from the Gaussian
            # policy/distribution for the current (next) state and its
            # associated log probability of occurrence.
            new_next_actions, new_log_pi, _ = self.policy_net.sample(
                next_state)
            # Clipped double-Q target with the entropy bonus.
            target_q_values = t.min(
                self.target_soft_q_net1(next_state, new_next_actions),
                self.target_soft_q_net2(next_state, new_next_actions),
            ) - self.alpha * new_log_pi
            q_target = reward + (1 - done) * self.discount * target_q_values
            # self.q_tracker.append(q_target.mean())
        # Update Soft Q-Networks
        q1_pred = self.soft_q_net1(state, action)
        q2_pred = self.soft_q_net2(state, action)
        q1_loss = self.soft_q_criterion(q1_pred, q_target)
        q2_loss = self.soft_q_criterion(q2_pred, q_target)
        self.soft_q_optimizer1.zero_grad()
        q1_loss.backward()
        self.soft_q_optimizer1.step()
        self.soft_q_optimizer2.zero_grad()
        q2_loss.backward()
        self.soft_q_optimizer2.step()
        # Update Policy
        new_actions, log_pi, _ = self.policy_net.sample(state)
        q_new_actions = t.min(self.soft_q_net1(state, new_actions),
                              self.soft_q_net2(state, new_actions))
        policy_loss = (self.alpha * log_pi - q_new_actions).mean()
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
        # Temperature is kept constant instead of being learned.
        self.alpha = 0.05  # self.log_alpha.exp()
        # Soft Updates
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)
        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)
class SACAgent():
    """Soft Actor-Critic agent for a discrete action space.

    Twin soft-Q networks with target copies and a categorical policy.
    The entropy temperature ``alpha`` is either fixed or learned
    (``automatic_entropy_tuning``).  For the first ``safe_exploration``
    steps, actions come from a rule-based controller.
    """

    def __init__(self, state_dim=None, action_dim=None, hidden_dim=None,
                 discount=0.99, tau=0.005, lr_actor=None, lr_critic=None,
                 batch_size=256, replay_buffer_capacity=1e5,
                 learning_start=None, reward_scaling=1., seed=0,
                 rbc_controller=None, safe_exploration=None,
                 automatic_entropy_tuning=False, alpha=1):
        if hidden_dim is None:
            hidden_dim = [256, 256]
        self.learning_start = learning_start
        self.discount = discount
        self.batch_size = batch_size
        self.tau = tau
        self.reward_scaling = reward_scaling
        # Seed torch and numpy for reproducibility.
        t.manual_seed(seed)
        np.random.seed(seed)
        self.action_list_ = []
        self.action_list2_ = []
        self.hidden_dim = hidden_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.automatic_entropy_tuning = automatic_entropy_tuning
        self.time_step = 0
        # Optimizers/Loss using the Huber loss
        # self.soft_q_criterion = f.mse_loss
        # device
        self.device = t.device("cuda" if t.cuda.is_available() else "cpu")
        # A single integer action index is stored per transition.
        self.memory = ReplayBuffer(input_shape=int(state_dim),
                                   n_actions=int(1),
                                   max_mem_size=int(replay_buffer_capacity))
        # init networks: twin soft-Q nets and their target copies
        self.soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.target_soft_q_net1 = SoftQNetworkDiscrete(
            state_dim, action_dim, hidden_dim).to(self.device)
        self.target_soft_q_net2 = SoftQNetworkDiscrete(
            state_dim, action_dim, hidden_dim).to(self.device)
        # Hard-copy the online weights into both target networks.
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(param.data)
        # Policy
        self.policy_net = PolicyNetworkDiscrete(state_dim, action_dim,
                                                hidden_dim).to(self.device)
        self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                            lr=lr_critic)
        self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                            lr=lr_critic)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                           lr=lr_actor)
        if self.automatic_entropy_tuning:
            # we set the max possible entropy as the target entropy
            self.target_entropy = -np.log((1.0 / action_dim)) * 0.98
            self.log_alpha = t.zeros(1, requires_grad=True,
                                     device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_critic,
                                              eps=1e-4)
        else:
            self.alpha = alpha

    def choose_action(self, simulation_step, electricity_price, storage_soc,
                      observation):
        """Return an action: rule-based during the safe-exploration phase,
        sampled from the categorical policy afterwards."""
        if simulation_step < self.safe_exploration:
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price,
                storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.device)
            # print(action)
        else:
            if self.device.type == "cuda":
                state = t.cuda.FloatTensor([observation]).to(self.device)
            else:
                state = t.FloatTensor([observation]).to(self.device)
            actions, _, _ = self.policy_net.sample(state)
        return actions.cpu().detach().numpy()[0]

    def get_actions_probabilities(self, observation):
        """Return the policy's action-probability vector for *observation*."""
        if self.device.type == "cuda":
            state = t.cuda.FloatTensor([observation]).to(self.device)
        else:
            state = t.FloatTensor([observation]).to(self.device)
        _, (actions_probabilities, _), _ = self.policy_net.sample(state)
        return actions_probabilities.cpu().detach().numpy()[0]

    def get_q_values(self, observation):
        """Return both critics' Q-value vectors for *observation*."""
        if self.device.type == "cuda":
            state = t.cuda.FloatTensor([observation]).to(self.device)
        else:
            state = t.FloatTensor([observation]).to(self.device)
        q_1 = self.soft_q_net1(state)
        q_2 = self.soft_q_net2(state)
        q_1 = q_1.cpu().detach().numpy()[0]
        q_2 = q_2.cpu().detach().numpy()[0]
        return q_1, q_2

    def remember(self, state, action, reward, new_state, done):
        # Push one transition into the replay buffer.
        self.memory.store_transition(state, action, reward, new_state, done)

    def learn(self):
        """One discrete-SAC update: soft-Q regression, policy step, optional
        temperature step, then soft target sync."""
        # Wait until the buffer can fill one batch.
        if self.memory.mem_ctr < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample_buffer(
            self.batch_size)
        if self.device.type == "cuda":
            state = t.cuda.FloatTensor(state).to(self.device)
            next_state = t.cuda.FloatTensor(next_state).to(self.device)
            action = t.cuda.LongTensor(action).to(self.device)
            reward = t.cuda.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.cuda.FloatTensor(done).unsqueeze(1).to(self.device)
        else:
            state = t.FloatTensor(state).to(self.device)
            next_state = t.FloatTensor(next_state).to(self.device)
            # Bug fix: actions are used as gather() indices below and must
            # be int64.  The CUDA branch already used LongTensor, but this
            # branch created a FloatTensor, which makes gather() raise on CPU.
            action = t.LongTensor(action).to(self.device)
            reward = t.FloatTensor(reward).unsqueeze(1).to(self.device)
            done = t.FloatTensor(done).unsqueeze(1).to(self.device)
        with t.no_grad():
            # Update Q-values. First, sample an action from the policy for
            # the next state together with its probability distribution.
            new_next_actions, (action_probabilities,
                               log_action_probabilities
                               ), _ = self.policy_net.sample(next_state)
            qf1_next_target = self.target_soft_q_net1(next_state)
            qf2_next_target = self.target_soft_q_net2(next_state)
            # Expectation over the categorical policy of the clipped
            # double-Q value plus the entropy bonus.
            min_qf_next_target = action_probabilities * (
                t.min(qf1_next_target, qf2_next_target) -
                self.alpha * log_action_probabilities)
            min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
            q_target = reward + (1 - done) * self.discount * min_qf_next_target
            # self.q_tracker.append(q_target.mean())
        # Update Soft Q-Networks: select the Q-value of the taken action.
        q1_pred = self.soft_q_net1(state)
        q2_pred = self.soft_q_net2(state)
        q1_pred = q1_pred.gather(1, action.reshape([self.batch_size, 1]))
        q2_pred = q2_pred.gather(1, action.reshape([self.batch_size, 1]))
        q1_loss = f.mse_loss(q1_pred, q_target)
        q2_loss = f.mse_loss(q2_pred, q_target)
        self.soft_q_optimizer1.zero_grad()
        q1_loss.backward()
        self.soft_q_optimizer1.step()
        self.soft_q_optimizer2.zero_grad()
        q2_loss.backward()
        self.soft_q_optimizer2.step()
        # Update Policy
        new_actions, (
            action_probabilities,
            log_action_probabilities), _ = self.policy_net.sample(state)
        min_qf_pi = t.min(self.soft_q_net1(state), self.soft_q_net2(state))
        inside_term = self.alpha * log_action_probabilities - min_qf_pi
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        # Policy entropy, reused by the temperature update below.
        log_action_probabilities = t.sum(log_action_probabilities *
                                         action_probabilities, dim=1)
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
        if self.automatic_entropy_tuning:
            alpha_loss = -(self.log_alpha *
                           (log_action_probabilities +
                            self.target_entropy).detach()).mean()
        else:
            alpha_loss = None
        if alpha_loss is not None:
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()
            self.alpha = self.log_alpha.exp()
        # Soft Updates
        for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                       self.soft_q_net1.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)
        for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                       self.soft_q_net2.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                    param.data * self.tau)

    def save_models(self, path):
        """Serialize both critics and the policy under *path*.

        NOTE(review): the '\\' separators are Windows-specific — consider
        os.path.join for portability (kept as-is to match existing
        checkpoints).
        """
        print('...saving models...')
        t.save(self.soft_q_net1, path + '\\critic_1.pth')
        t.save(self.soft_q_net2, path + '\\critic_2.pth')
        t.save(self.policy_net, path + '\\actor.pth')

    def load_models(self, path):
        """Load both critics and the policy from *path* onto self.device."""
        print('...loading models...')
        dev = self.device
        self.soft_q_net1 = t.load(path + '\\critic_1.pth', map_location=dev)
        self.soft_q_net2 = t.load(path + '\\critic_2.pth', map_location=dev)
        self.policy_net = t.load(path + '\\actor.pth', map_location=dev)
def __init__(self, state_dim=None, action_dim=None, hidden_dim=None,
             discount=0.99, tau=0.005, lr_actor=None, lr_critic=None,
             batch_size=256, replay_buffer_capacity=1e5, learning_start=None,
             reward_scaling=1., seed=0, rbc_controller=None,
             safe_exploration=None, automatic_entropy_tuning=False, alpha=1):
    """Discrete-action SAC setup: twin soft-Q nets, targets, policy, optimizers."""
    if hidden_dim is None:
        hidden_dim = [256, 256]

    # Hyper-parameter bookkeeping.
    self.learning_start = learning_start
    self.discount = discount
    self.batch_size = batch_size
    self.tau = tau
    self.reward_scaling = reward_scaling

    # Make the run reproducible across torch and numpy.
    t.manual_seed(seed)
    np.random.seed(seed)

    self.action_list_ = []
    self.action_list2_ = []
    self.hidden_dim = hidden_dim
    self.rbc_controller = rbc_controller
    self.safe_exploration = safe_exploration
    self.automatic_entropy_tuning = automatic_entropy_tuning
    self.time_step = 0

    # Optimizers/Loss using the Huber loss
    # self.soft_q_criterion = f.mse_loss
    # Prefer the GPU when one is available.
    self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

    # Replay buffer stores a single integer action index per transition.
    self.memory = ReplayBuffer(input_shape=int(state_dim), n_actions=int(1),
                               max_mem_size=int(replay_buffer_capacity))

    # Twin soft-Q networks plus their target copies.
    self.soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim,
                                            hidden_dim).to(self.device)
    self.soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim,
                                            hidden_dim).to(self.device)
    self.target_soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                   hidden_dim).to(self.device)
    self.target_soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim,
                                                   hidden_dim).to(self.device)

    # Hard-copy the online weights into both target networks.
    for online_net, target_net in ((self.soft_q_net1, self.target_soft_q_net1),
                                   (self.soft_q_net2, self.target_soft_q_net2)):
        for target_param, param in zip(target_net.parameters(),
                                       online_net.parameters()):
            target_param.data.copy_(param.data)

    # Categorical policy and the optimizers.
    self.policy_net = PolicyNetworkDiscrete(state_dim, action_dim,
                                            hidden_dim).to(self.device)
    self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(),
                                        lr=lr_critic)
    self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(),
                                        lr=lr_critic)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(),
                                       lr=lr_actor)

    if self.automatic_entropy_tuning:
        # we set the max possible entropy as the target entropy
        self.target_entropy = -np.log((1.0 / action_dim)) * 0.98
        self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_critic,
                                          eps=1e-4)
    else:
        self.alpha = alpha
class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

    This variant supports saving the actor weights to ``qd_weights.h5``
    during soft updates and reloading them at evaluation time.
    """

    def __init__(self, task, expl_mu, expl_th, expl_sigma, gamma, tau,
                 batch=64):
        self.task = task
        # Task interface dimensions and action bounds.
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        # Noise process (Ornstein-Uhlenbeck), fully caller-configurable.
        self.exploration_mu = expl_mu
        self.exploration_theta = expl_th
        self.exploration_sigma = expl_sigma
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
        # Replay memory
        self.buffer_size = 200000
        self.batch_size = batch
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        # Algorithm parameters
        self.gamma = gamma  # discount factor
        self.tau = tau  # for soft update of target parameters

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, save):
        """Record one transition, learn when a batch is available, advance state."""
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, save)
        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, learn):
        """Returns actions for given state(s) as per current policy."""
        if learn == False:
            # Evaluation mode: restore the best saved actor weights and act
            # greedily (no exploration noise).
            # min_noise = 1e-12
            self.actor_local.model.load_weights('qd_weights.h5')
            state = np.reshape(state, [-1, self.state_size])
            action = self.actor_local.model.predict(state)[0]
            # return list(action + [min_noise]*4)
            return list(action)
        else:
            state = np.reshape(state, [-1, self.state_size])
            action = self.actor_local.model.predict(state)[0]
            return list(action +
                        self.noise.sample())  # add some noise for exploration

    def learn(self, experiences, save):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)
        # Train actor model (local) along the critic's action-gradient
        # direction (deterministic policy gradient).
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
        # Soft-update target models; the actor update may also persist
        # the local weights to disk (save flag).
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model,
                         save)

    def soft_update(self, local_model, target_model, save=False):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        assert len(local_weights) == len(
            target_weights
        ), "Local and target model parameters must have the same size"
        # Blend local into target: target <- tau*local + (1-tau)*target.
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
        if save:
            local_model.save_weights('qd_weights.h5')
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        # Task interface dimensions and action bounds.
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)
        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())
        # Noise process (Ornstein-Uhlenbeck).
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002  # for soft update of target parameters

    def reset_episode(self):
        """Reset the noise process and the task; return the initial state."""
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        """Record one transition, learn when a batch is available, advance state."""
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        # Roll over last state and action
        self.last_state = next_state

    def act(self, states):
        """Return the policy action for *states*, perturbed by OU noise."""
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action +
                    self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update actor and critic from a batch of experience tuples."""
        # Convert experience tuples to separate arrays per element.
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None
                            ]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])
        # One-step TD targets; terminal transitions drop the bootstrap term.
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)
        # Train the actor along the critic's action-gradient direction
        # (deterministic policy gradient).
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())
        # Blend local into target: target <- tau*local + (1-tau)*target.
        new_weights = self.tau * local_weights + (1 -
                                                  self.tau) * target_weights
        target_model.set_weights(new_weights)
class SACAgent:
    """Soft Actor-Critic agent with twin critics and a separate value /
    target-value network pair.

    Optionally follows a rule-based controller (``rbc_controller``) for the
    first ``safe_exploration`` simulation steps before switching to the
    learned stochastic policy.
    """

    def __init__(self, lr_actor=0.0003, lr_critic=0.0003, state_dim=8,
                 discount=0.99, action_dim=1, replay_buffer_capacity=1000000,
                 tau=0.005, batch_size=256, reward_scaling=1,
                 rbc_controller=RBCAgent, safe_exploration=None,
                 hidden_dim=None):
        """Construct networks, replay buffer, and hyper-parameters.

        Args:
            lr_actor / lr_critic: learning rates for actor and critic/value.
            state_dim / action_dim: observation and action dimensionality.
            discount: reward discount factor (gamma).
            replay_buffer_capacity: max transitions kept in memory.
            tau: Polyak averaging coefficient for the target value network.
            batch_size: minibatch size for ``learn``.
            reward_scaling: multiplier applied to rewards in the critic target.
            rbc_controller: rule-based fallback controller used during the
                safe-exploration phase.
            safe_exploration: number of initial steps driven by the RBC
                controller; ``None`` disables the safe-exploration phase.
            hidden_dim: hidden layer width passed to all networks.
        """
        self.gamma = discount
        self.tau = tau
        self.memory = ReplayBuffer(input_shape=state_dim,
                                   n_actions=action_dim,
                                   max_mem_size=replay_buffer_capacity)
        self.batch_size = batch_size
        self.n_actions = action_dim
        self.rbc_controller = rbc_controller
        self.safe_exploration = safe_exploration
        self.hidden_size = hidden_dim

        self.actor = ActorNetwork(learning_rate=lr_actor,
                                  input_size=state_dim, max_action=1,
                                  n_actions=action_dim, name='actor',
                                  hidden_size=self.hidden_size)
        self.critic_1 = CriticNetwork(learning_rate=lr_critic,
                                      input_size=state_dim,
                                      n_actions=action_dim, name='critic_1',
                                      hidden_size=self.hidden_size)
        self.critic_2 = CriticNetwork(learning_rate=lr_critic,
                                      input_size=state_dim,
                                      n_actions=action_dim, name='critic_2',
                                      hidden_size=self.hidden_size)
        self.value = ValueNetwork(learning_rate=lr_critic,
                                  input_size=state_dim, name='value',
                                  hidden_size=self.hidden_size)
        self.target_value = ValueNetwork(learning_rate=lr_critic,
                                         input_size=state_dim,
                                         name='target_value',
                                         hidden_size=self.hidden_size)

        self.scale = reward_scaling
        # tau=1 makes the first update a hard copy: target_value <- value
        self.update_network_parameters(tau=1)

    def choose_action(self, simulation_step, electricity_price, storage_soc,
                      observation):
        """Return an action as a NumPy array.

        During the safe-exploration phase the rule-based controller decides;
        afterwards the learned policy samples an action.
        """
        # BUGFIX: guard against safe_exploration=None (the default), which
        # previously raised TypeError on the '<' comparison.
        if (self.safe_exploration is not None
                and simulation_step < self.safe_exploration):
            action = self.rbc_controller.choose_action(
                electricity_price=electricity_price, storage_soc=storage_soc)
            actions = t.tensor([action], dtype=t.float).to(self.actor.device)
        else:
            state = t.tensor([observation],
                             dtype=t.float).to(self.actor.device)
            actions, _ = self.actor.sample_normal(state, rep=False)
        return actions.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def update_network_parameters(self, tau=None):
        """Polyak-average the value network into the target value network:
        ``target = tau * value + (1 - tau) * target``.

        With ``tau=1`` this is a hard copy (used at initialization).
        """
        if tau is None:
            tau = self.tau
        target_value_params = self.target_value.named_parameters()
        value_params = self.value.named_parameters()
        target_value_state_dict = dict(target_value_params)
        value_state_dict = dict(value_params)
        for name in value_state_dict:
            value_state_dict[name] = tau * value_state_dict[name].clone() + \
                (1 - tau) * target_value_state_dict[name].clone()
        self.target_value.load_state_dict(value_state_dict)

    def save_models(self, path):
        """Serialize all networks under *path*."""
        import os  # local import: module import block is managed elsewhere
        print('...saving models...')
        # BUGFIX: use os.path.join instead of hard-coded '\\' separators,
        # which produced broken paths on non-Windows systems.
        t.save(self.actor, os.path.join(path, 'actor.pth'))
        t.save(self.value, os.path.join(path, 'value.pth'))
        t.save(self.target_value, os.path.join(path, 'target_value.pth'))
        t.save(self.critic_1, os.path.join(path, 'critic_1.pth'))
        t.save(self.critic_2, os.path.join(path, 'critic_2.pth'))

    def load_models(self, path):
        """Load all networks from *path* onto the actor's current device."""
        import os  # local import: module import block is managed elsewhere
        print('...loading models...')
        dev = self.actor.device
        # BUGFIX: portable path joining (see save_models)
        self.actor = t.load(os.path.join(path, 'actor.pth'), map_location=dev)
        self.value = t.load(os.path.join(path, 'value.pth'), map_location=dev)
        self.target_value = t.load(os.path.join(path, 'target_value.pth'),
                                   map_location=dev)
        self.critic_1 = t.load(os.path.join(path, 'critic_1.pth'),
                               map_location=dev)
        self.critic_2 = t.load(os.path.join(path, 'critic_2.pth'),
                               map_location=dev)

    def learn(self):
        """One SAC update: value net, actor, twin critics, then a soft
        target update. No-op until the buffer holds a full batch."""
        if self.memory.mem_ctr < self.batch_size:
            return

        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        reward = t.tensor(reward, dtype=t.float).to(self.actor.device)
        done = t.tensor(done).to(self.actor.device)
        new_state = t.tensor(new_state, dtype=t.float).to(self.actor.device)
        state = t.tensor(state, dtype=t.float).to(self.actor.device)
        action = t.tensor(action, dtype=t.float).to(self.actor.device)

        value = self.value(state).view(-1)
        value_ = self.target_value(new_state).view(-1)
        value_[done] = 0.0  # terminal states contribute no future value

        # --- value network update ---
        actions, log_prob = self.actor.sample_normal(state, rep=False)
        log_prob = log_prob.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        # min of twin critics mitigates Q overestimation
        critic_value = t.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        self.value.opt.zero_grad()
        value_target = critic_value - log_prob
        value_loss = 0.5 * f.mse_loss(value, value_target)
        value_loss.backward(retain_graph=True)
        self.value.opt.step()

        # --- actor update (reparameterized sample) ---
        actions, log_prob = self.actor.sample_normal(state, rep=True)
        log_prob = log_prob.view(-1)
        q1_new_policy = self.critic_1.forward(state, actions)
        q2_new_policy = self.critic_2.forward(state, actions)
        critic_value = t.min(q1_new_policy, q2_new_policy)
        critic_value = critic_value.view(-1)

        actor_loss = log_prob - critic_value
        actor_loss = t.mean(actor_loss)
        self.actor.opt.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor.opt.step()

        # --- twin critic update ---
        self.critic_1.opt.zero_grad()
        self.critic_2.opt.zero_grad()
        q_hat = self.scale * reward + self.gamma * value_
        q1_old_policy = self.critic_1.forward(state, action).view(-1)
        q2_old_policy = self.critic_2.forward(state, action).view(-1)
        critic_1_loss = 0.5 * f.mse_loss(q1_old_policy, q_hat)
        critic_2_loss = 0.5 * f.mse_loss(q2_old_policy, q_hat)
        critic_loss = critic_1_loss + critic_2_loss
        critic_loss.backward()
        self.critic_1.opt.step()
        self.critic_2.opt.step()

        self.update_network_parameters()

    def learn_actor(self, updates: int, batch_size):
        """Run *updates* actor-only gradient steps against frozen critics,
        sampling states from the replay buffer."""
        for _ in range(updates):  # removed stray debug print of the counter
            state, _, _, _, _ = self.memory.sample_buffer(batch_size)
            state = t.tensor(state, dtype=t.float).to(self.actor.device)
            actions, log_prob = self.actor.sample_normal(state, rep=True)
            log_prob = log_prob.view(-1)
            q1_new_policy = self.critic_1.forward(state, actions)
            q2_new_policy = self.critic_2.forward(state, actions)
            critic_value = t.min(q1_new_policy, q2_new_policy)
            critic_value = critic_value.view(-1)

            actor_loss = log_prob - critic_value
            actor_loss = t.mean(actor_loss)
            self.actor.opt.zero_grad()
            actor_loss.backward(retain_graph=True)
            self.actor.opt.step()
class DDPG():
    """Reinforcement Learning agent that learns using DDPG.

    This variant additionally tracks a rolling score over the last three
    rewards, the best score seen so far, and a task-defined achievement
    flag passed in via ``step``.
    """

    def __init__(self, task):
        """Build networks, noise process, replay memory, and score
        trackers for *task*.

        Args:
            task: environment wrapper exposing ``state_size``,
                ``action_size``, ``action_low``, ``action_high`` and a
                ``reset()`` method.
        """
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Ornstein-Uhlenbeck noise process for exploration
        self.exploration_mu = 0
        self.exploration_theta = 0.10
        self.exploration_sigma = 0.15
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        # score tracker
        self.best_score = -np.inf
        self.achievement = False

        # Episode variables
        self.reset_episode()

    def reset_episode(self):
        """Reset per-episode counters, noise, and the task; return the
        initial state."""
        self.count = 0
        self.noise.reset()
        self.achievement = False
        # rolling window of the last three rewards (newest first)
        self.total_reward = [0.0, 0.0, 0.0]
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done, achievement):
        """Record one transition, learn when possible, and update the
        rolling score / best-score trackers."""
        self.count += 1
        # shift the reward window: drop the oldest, prepend the newest
        self.total_reward = self.total_reward[:-1]
        self.total_reward = np.concatenate([[int(reward)],
                                            self.total_reward])

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        self.score = np.mean(self.total_reward)
        self.achievement = achievement
        if self.score > self.best_score:
            self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of
        experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array(
            [e.action for e in experiences if e is not None]
        ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array(
            [e.reward for e in experiences if e is not None]
        ).astype(np.float32).reshape(-1, 1)
        dones = np.array(
            [e.done for e in experiences if e is not None]
        ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        # Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions],
                                               y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update target model parameters:
        ``target = tau * local + (1 - tau) * target``.
        """
        local_weights = local_model.get_weights()
        target_weights = target_model.get_weights()

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        # BUGFIX: blend per layer. The original wrapped the ragged per-layer
        # weight lists in np.array(...), which raises ValueError on
        # NumPy >= 1.24 (object-array creation from ragged sequences was
        # removed per NEP 34).
        new_weights = [self.tau * lw + (1 - self.tau) * tw
                       for lw, tw in zip(local_weights, target_weights)]
        target_model.set_weights(new_weights)