def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0.3
    self.exploration_theta = 2.0
    self.exploration_sigma = 20
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 10
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
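# Hedged sketch, not from the source: a soft-update helper of the kind self.tau is
# intended for, blending the local model weights into the target model after each
# learning step. The function name and the list-of-arrays weight format (as returned
# by Keras model.get_weights()) are assumptions for illustration only.
def soft_update(local_weights, target_weights, tau):
    """Return target weights moved a fraction tau toward the local weights."""
    return [tau * local + (1.0 - tau) * target
            for local, target in zip(local_weights, target_weights)]

# Illustrative usage under those assumptions:
#   self.critic_target.model.set_weights(
#       soft_update(self.critic_local.model.get_weights(),
#                   self.critic_target.model.get_weights(), self.tau))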
def __init__(self, lr_actor=0.0003, lr_critic=0.0003, state_dim=8, discount=0.99, action_dim=1,
             replay_buffer_capacity=1000000, tau=0.005, batch_size=256, reward_scaling=1,
             rbc_controller=RBCAgent, safe_exploration=None, hidden_dim=None):
    self.gamma = discount
    self.tau = tau
    self.memory = ReplayBuffer(input_shape=state_dim, n_actions=action_dim,
                               max_mem_size=replay_buffer_capacity)
    self.batch_size = batch_size
    self.n_actions = action_dim
    self.rbc_controller = rbc_controller
    self.safe_exploration = safe_exploration
    self.hidden_size = hidden_dim

    self.actor = ActorNetwork(learning_rate=lr_actor, input_size=state_dim, max_action=1,
                              n_actions=action_dim, name='actor', hidden_size=self.hidden_size)
    self.critic_1 = CriticNetwork(learning_rate=lr_critic, input_size=state_dim,
                                  n_actions=action_dim, name='critic_1', hidden_size=self.hidden_size)
    self.critic_2 = CriticNetwork(learning_rate=lr_critic, input_size=state_dim,
                                  n_actions=action_dim, name='critic_2', hidden_size=self.hidden_size)
    self.value = ValueNetwork(learning_rate=lr_critic, input_size=state_dim, name='value',
                              hidden_size=self.hidden_size)
    self.target_value = ValueNetwork(learning_rate=lr_critic, input_size=state_dim,
                                     name='target_value', hidden_size=self.hidden_size)

    self.scale = reward_scaling
    # tau=1 makes the soft update a full copy, so target_value starts identical to value
    self.update_network_parameters(tau=1)
def __init__(self, task, expl_mu, expl_th, expl_sigma, gamma, tau, batch=64):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = expl_mu
    self.exploration_theta = expl_th
    self.exploration_sigma = expl_sigma
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 200000
    self.batch_size = batch
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = gamma  # discount factor
    self.tau = tau      # for soft update of target parameters
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.10
    self.exploration_sigma = 0.15
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters

    # score tracker
    self.best_score = -np.inf
    self.achievement = False

    # Episode variables
    self.reset_episode()
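# Hedged sketch, not the repo's OUNoise class: a minimal Ornstein-Uhlenbeck process
# matching the OUNoise(size, mu, theta, sigma) constructor used above. It drifts
# toward mu at rate theta and adds Gaussian noise scaled by sigma, producing the
# temporally correlated exploration typically used with DDPG-style agents.
import numpy as np

class OUNoiseSketch:
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the internal state at the long-run mean.
        self.state = self.mu.copy()

    def sample(self):
        # Mean-reverting step plus a Gaussian perturbation (unit time step).
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state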
def train():
    # build SFDQN
    print('building SFDQN')
    deep_sf = DeepSF(keras_model_handle=sf_model_lambda, **sfdqn_params)
    sfdqn = SFDQN(deep_sf=deep_sf, buffer=ReplayBuffer(sfdqn_params['buffer_params']),
                  **sfdqn_params, **agent_params)

    # train SFDQN
    print('training SFDQN')
    train_tasks, test_tasks = generate_tasks(False)
    sfdqn_perf = sfdqn.train(train_tasks, n_samples, test_tasks=test_tasks,
                             n_test_ev=agent_params['n_test_ev'])

    # build DQN
    print('building DQN')
    dqn = DQN(model_lambda=dqn_model_lambda, buffer=ReplayBuffer(dqn_params['buffer_params']),
              **dqn_params, **agent_params)

    # train DQN
    print('training DQN')
    train_tasks, test_tasks = generate_tasks(True)
    dqn_perf = dqn.train(train_tasks, n_samples, test_tasks=test_tasks,
                         n_test_ev=agent_params['n_test_ev'])

    # smooth data
    def smooth(y, box_pts):
        return np.convolve(y, np.ones(box_pts) / box_pts, mode='same')

    sfdqn_perf = smooth(sfdqn_perf, 10)[:-5]
    dqn_perf = smooth(dqn_perf, 10)[:-5]
    x = np.linspace(0, 4, sfdqn_perf.size)

    # reporting progress
    ticksize = 14
    textsize = 18
    plt.rc('font', size=textsize)        # controls default text sizes
    plt.rc('axes', titlesize=textsize)   # fontsize of the axes title
    plt.rc('axes', labelsize=textsize)   # fontsize of the x and y labels
    plt.rc('xtick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=ticksize)  # fontsize of the tick labels
    plt.rc('legend', fontsize=ticksize)  # legend fontsize

    plt.figure(figsize=(8, 6))
    ax = plt.gca()
    ax.plot(x, sfdqn_perf, label='SFDQN')
    ax.plot(x, dqn_perf, label='DQN')
    plt.xlabel('training task index')
    plt.ylabel('averaged test episode reward')
    plt.title('Testing Reward Averaged over all Test Tasks')
    plt.tight_layout()
    plt.legend(frameon=False)
    plt.savefig('figures/sfdqn_return.png')
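# Hedged illustration with made-up data: smooth() uses np.convolve in 'same' mode,
# which zero-pads beyond the data, so points near both ends of the smoothed curve
# are biased low; trimming with [:-5] (half the box width of 10) cuts the worst of
# that tail artifact before plotting.
import numpy as np

y = np.ones(20)                                    # a flat reward curve of 1.0
smoothed = np.convolve(y, np.ones(10) / 10, mode='same')
print(smoothed[:3], smoothed[-3:])                 # interior stays at 1.0, both ends sag below it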
def __init__(self, observation_space=None, action_space=None, hidden_dim=None, discount=0.99,
             tau=0.005, lr=None, batch_size=256, replay_buffer_capacity=1e5, start_training=None,
             exploration_period=None, action_scaling_coef=1., reward_scaling=1., update_per_step=1,
             iterations_as=2, seed=0, deterministic=None, rbc_controller=None, safe_exploration=None):
    if hidden_dim is None:
        hidden_dim = [256, 256]
    self.start_training = start_training
    self.discount = discount
    self.batch_size = batch_size
    self.tau = tau
    self.action_scaling_coef = action_scaling_coef
    self.reward_scaling = reward_scaling
    t.manual_seed(seed)
    np.random.seed(seed)
    self.deterministic = deterministic
    self.update_per_step = update_per_step
    self.iterations_as = iterations_as
    self.exploration_period = exploration_period
    self.action_list_ = []
    self.action_list2_ = []
    self.hidden_dim = hidden_dim
    self.rbc_controller = rbc_controller
    self.safe_exploration = safe_exploration
    self.reset_action_tracker()
    self.reset_reward_tracker()
    self.time_step = 0
    self.action_space = action_space
    self.observation_space = observation_space

    # Optimizers/Loss using the Huber loss
    self.soft_q_criterion = nn.SmoothL1Loss()

    # device
    self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

    state_dim = self.observation_space.shape[0]
    action_dim = self.action_space.shape[0]
    self.alpha = 0.05
    self.memory = ReplayBuffer(input_shape=int(state_dim), n_actions=int(action_dim),
                               max_mem_size=int(replay_buffer_capacity))

    # init networks
    self.soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
    self.soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
    self.target_soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)
    self.target_soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim).to(self.device)

    for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                   self.soft_q_net1.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                   self.soft_q_net2.parameters()):
        target_param.data.copy_(param.data)

    # Policy
    self.policy_net = PolicyNetwork(state_dim, action_dim, self.action_space,
                                    self.action_scaling_coef, hidden_dim).to(self.device)
    self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=lr)
    self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=lr)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr)
    self.target_entropy = -np.prod(self.action_space.shape).item()
    self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
    self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr)
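# Hedged sketch, not from the source: the soft (Polyak) update that self.tau is meant
# for on the target Q-networks above. It is the same zip-and-copy loop used at
# initialization, except that only a fraction tau of the online weights is mixed in
# on each call.
import torch as t


def polyak_update(target_net: t.nn.Module, online_net: t.nn.Module, tau: float) -> None:
    """In place: target <- tau * online + (1 - tau) * target."""
    with t.no_grad():
        for target_param, param in zip(target_net.parameters(), online_net.parameters()):
            target_param.data.mul_(1.0 - tau).add_(tau * param.data)

# Illustrative usage under those assumptions:
#   polyak_update(self.target_soft_q_net1, self.soft_q_net1, self.tau)
#   polyak_update(self.target_soft_q_net2, self.soft_q_net2, self.tau)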
def __init__(self, state_dim=None, action_dim=None, hidden_dim=None, discount=0.99, tau=0.005,
             lr_actor=None, lr_critic=None, batch_size=256, replay_buffer_capacity=1e5,
             learning_start=None, reward_scaling=1., seed=0, rbc_controller=None,
             safe_exploration=None, automatic_entropy_tuning=False, alpha=1):
    if hidden_dim is None:
        hidden_dim = [256, 256]
    self.learning_start = learning_start
    self.discount = discount
    self.batch_size = batch_size
    self.tau = tau
    self.reward_scaling = reward_scaling
    t.manual_seed(seed)
    np.random.seed(seed)
    self.action_list_ = []
    self.action_list2_ = []
    self.hidden_dim = hidden_dim
    self.rbc_controller = rbc_controller
    self.safe_exploration = safe_exploration
    self.automatic_entropy_tuning = automatic_entropy_tuning
    self.time_step = 0

    # Optimizers/Loss using the Huber loss
    # self.soft_q_criterion = f.mse_loss

    # device
    self.device = t.device("cuda" if t.cuda.is_available() else "cpu")

    self.memory = ReplayBuffer(input_shape=int(state_dim), n_actions=int(1),
                               max_mem_size=int(replay_buffer_capacity))

    # init networks
    self.soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
    self.soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
    self.target_soft_q_net1 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
    self.target_soft_q_net2 = SoftQNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)

    for target_param, param in zip(self.target_soft_q_net1.parameters(),
                                   self.soft_q_net1.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.target_soft_q_net2.parameters(),
                                   self.soft_q_net2.parameters()):
        target_param.data.copy_(param.data)

    # Policy
    self.policy_net = PolicyNetworkDiscrete(state_dim, action_dim, hidden_dim).to(self.device)
    self.soft_q_optimizer1 = optim.Adam(self.soft_q_net1.parameters(), lr=lr_critic)
    self.soft_q_optimizer2 = optim.Adam(self.soft_q_net2.parameters(), lr=lr_critic)
    self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=lr_actor)

    if self.automatic_entropy_tuning:
        # target entropy is 98% of the maximum entropy, log(action_dim)
        self.target_entropy = -np.log((1.0 / action_dim)) * 0.98
        self.log_alpha = t.zeros(1, requires_grad=True, device=self.device)
        self.alpha = self.log_alpha.exp()
        self.alpha_optimizer = optim.Adam([self.log_alpha], lr=lr_critic, eps=1e-4)
    else:
        self.alpha = alpha
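# Hedged sketch, not from the source: the temperature update that the
# automatic_entropy_tuning branch above is typically paired with. One gradient step
# moves log_alpha so that the policy's entropy is pushed toward target_entropy.
# The log_probs argument (per-sample log pi(a|s) from the policy) is an assumed input.
import torch as t


def update_temperature(log_alpha, alpha_optimizer, log_probs, target_entropy):
    """One Adam step on the dual variable; returns the new alpha = exp(log_alpha)."""
    alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    return log_alpha.exp()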