def __init__(self, env, n1, n2, sigma=0.16, act_lr=6e-8, crit_lr=1e-4):
    '''
    TODO: agent needs error node for learning
    env - the environment to act in
    n1  - number of place cells in x direction
    n2  - number of place cells in y direction
    The total number of place cells will be n1 x n2.
    '''
    n_place_cells = n1 * n2
    action_indices = list(range(len(env.actions)))
    self.env = env
    self.gamma = 0.95  # discount factor

    # create shared input ensemble encoding the 2D position
    # (the ensemble must be created inside the network context)
    self.net = nengo.Network()
    with self.net:
        self.net.input = nengo.Ensemble(
            n_neurons=n_place_cells,
            dimensions=2,
            radius=2,
        )

    # initialize neural net for the actor
    self.Actor = ActorNet(
        n_pc=n_place_cells,
        input_node=self.net.input,
        n_neuron_out=200,
        lr=act_lr,
    )

    # initialize neural net for the critic
    self.Critic = CriticNet(
        input_node=self.net.input,
        n_neuron_out=100,
        lr=crit_lr,
    )

    self.DecisionMaker = DecisionNode(action_indices)
    self.Error = ErrorNode(self.gamma)
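For illustration, a minimal sketch of how the n1 x n2 place-cell layout described in the docstring could be laid out as a grid of centers; the helper name and the unit-square bounds are assumptions, not part of the agent code above:

import numpy as np

def place_cell_centers(n1, n2, x_range=(-1.0, 1.0), y_range=(-1.0, 1.0)):
    # evenly spaced centers: n1 along x, n2 along y -> (n1 * n2, 2) array
    xs = np.linspace(x_range[0], x_range[1], n1)
    ys = np.linspace(y_range[0], y_range[1], n2)
    xx, yy = np.meshgrid(xs, ys, indexing='ij')
    return np.stack([xx.ravel(), yy.ravel()], axis=1)

# e.g. place_cell_centers(3, 2).shape == (6, 2)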
import nengo

from Environment import TestEnv2D
# CriticNet, ErrorNode, Switch are defined in the project's own modules (imports not shown here)

'''
Note: if reward delay in combination with resetting leads to no learning,
try staying at the goal for multiple steps before resetting.
'''

env = TestEnv2D()

BACKEND = 'CPU'
dt = 0.001
duration = 400
# chosen so there is a reward gradient over the entire episode
discount = 0.9995

with nengo.Network() as net:
    envnode = nengo.Node(lambda t: env.step(), size_out=4)
    in_ens = nengo.Ensemble(n_neurons=1000, radius=2, dimensions=2)  # encodes position
    critic = CriticNet(in_ens, n_neuron_out=1000, lr=1e-5)
    error = ErrorNode(discount=discount)
    # needed for compatibility with the error implementation
    switch = Switch(state=1, switch_off=False, switchtime=duration / 2)

    nengo.Connection(envnode[:2], in_ens)

    # error node input layout:
    # reward = input[0], value = input[1], switch = input[2],
    # state = input[3], reset = input[4] (cast to int)
    nengo.Connection(envnode[1], error.net.errornode[0], synapse=0)         # reward
    nengo.Connection(critic.net.output, error.net.errornode[1], synapse=0)  # value prediction
    nengo.Connection(switch.net.switch, error.net.errornode[2], synapse=0)  # switch signal
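The script above only constructs the network; a minimal sketch of how it could be run with the standard nengo.Simulator API (the BACKEND constant is assumed to simply select the default CPU simulator):

with nengo.Simulator(net, dt=dt) as sim:
    sim.run(duration)  # simulate `duration` seconds of the task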
import numpy as np
import torch as T
import torch.nn.functional as F
# ReplayBuffer, OUActionNoise, ActorNet, CriticNet are defined in the project's own modules (imports not shown here)


class DDpgAgent():
    def __init__(self, alpha, beta, input_dims, tau, n_actions, gamma=0.99,
                 fc1Dms=400, fc2Dms=300, max_size=1000000, batch_size=64):
        self.alpha = alpha
        self.tau = tau
        self.beta = beta
        self.batch_size = batch_size
        self.gamma = gamma
        self.n_actions = n_actions
        print(batch_size, fc1Dms, fc2Dms)
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.noise = OUActionNoise(mu=np.zeros(n_actions))

        self.actor = ActorNet(alpha=alpha, input_dims=input_dims,
                              n_actions=n_actions, fc1Dms=fc1Dms,
                              fc2Dms=fc2Dms, name='actor')
        self.critic = CriticNet(beta=beta, input_dims=input_dims,
                                n_actions=n_actions, fc1Dms=fc1Dms,
                                fc2Dms=fc2Dms, name='critic')
        self.target_actor = ActorNet(alpha=alpha, input_dims=input_dims,
                                     n_actions=n_actions, fc1Dms=fc1Dms,
                                     fc2Dms=fc2Dms, name='target_actor')
        self.target_critic = CriticNet(beta=beta, input_dims=input_dims,
                                       n_actions=n_actions, fc1Dms=fc1Dms,
                                       fc2Dms=fc2Dms, name='target_critic')
        # set the target networks to exact copies of the online networks
        self.update_network_parameters(tau=1)

    def choose_action(self, state):
        # switch to evaluation mode (the networks use batch normalization)
        self.actor.eval()
        state = T.tensor([state], dtype=T.float).to(self.actor.device)
        # the actor output is deterministic, so exploration noise is added
        mu = self.actor.forward(state).to(self.actor.device)
        # add noise to the actor output (as described in the DDPG paper, p. 4)
        mu_prime = mu + T.tensor(self.noise(), dtype=T.float).to(self.actor.device)
        # back to train mode
        self.actor.train()
        # detach() removes the tensor from the computation graph; it is then
        # moved to the CPU and converted to numpy so it can be fed to the environment
        return mu_prime.cpu().detach().numpy()[0]

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.critic.save_checkpoint()
        self.target_critic.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.critic.load_checkpoint()
        self.target_critic.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, new_states, dones = \
            self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        target_actions = self.target_actor.forward(new_states)
        target_critic_value = self.target_critic.forward(new_states, target_actions)
        critic_value = self.critic.forward(states, actions)
        # the value of a terminal state is zero
        target_critic_value[dones] = 0.0
        # flatten to shape (batch_size,) so it broadcasts with rewards
        target_critic_value = target_critic_value.view(-1)
        # TODO: test
        target = rewards + self.gamma * target_critic_value
        # reshape to the same size as critic_value
        target = target.view(self.batch_size, 1)

        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.actor.optimizer.zero_grad()
        actor_loss = -self.critic.forward(states, self.actor.forward(states))
        actor_loss = T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_state_dict = dict(target_critic_params)
        target_actor_state_dict = dict(target_actor_params)

        # soft (Polyak) update: theta_target <- tau * theta + (1 - tau) * theta_target
        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_state_dict[name].clone()
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()

        self.target_critic.load_state_dict(critic_state_dict)
        self.target_actor.load_state_dict(actor_state_dict)
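A minimal sketch of how DDpgAgent might be driven in a training loop, assuming the classic gym step API; the environment name, learning rates, episode count, and the use of observation_space.shape as input_dims are illustrative assumptions, not taken from the code above:

import gym

env = gym.make('LunarLanderContinuous-v2')  # hypothetical choice of environment
agent = DDpgAgent(alpha=1e-4, beta=1e-3,
                  input_dims=env.observation_space.shape,  # assumed layout of input_dims
                  tau=0.001, n_actions=env.action_space.shape[0])

for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        new_state, reward, done, info = env.step(action)  # classic 4-tuple gym API
        agent.remember(state, action, reward, new_state, done)
        agent.learn()  # sample a batch and update actor/critic
        state = new_state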
import numpy as np
import torch as T
import torch.nn.functional as F
# ReplayBuffer, ActorNet, CriticNet are defined in the project's own modules (imports not shown here)


class TD3Agent():
    def __init__(self, alpha, beta, input_dims, tau, env, n_actions=2,
                 gamma=0.99, update_actor_interval=2, fc1Dms=400, fc2Dms=300,
                 max_size=1000000, batch_size=100, warmup=1000, noise=0.1):
        self.alpha = alpha
        self.beta = beta
        self.tau = tau
        self.batch_size = batch_size
        self.max_action = env.action_space.high
        self.min_action = env.action_space.low
        self.gamma = gamma
        self.n_actions = n_actions
        self.learn_step_cntr = 0
        self.time_step = 0
        self.warmup = warmup
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.update_actor_iter = update_actor_interval

        self.critic_1 = CriticNet(beta, input_dims, n_actions,
                                  fc1Dms, fc2Dms, name='Critic1')
        self.critic_2 = CriticNet(beta, input_dims, n_actions,
                                  fc1Dms, fc2Dms, name='Critic2')
        self.actor = ActorNet(alpha, input_dims, n_actions,
                              fc1Dms, fc2Dms, name='actor')

        # target networks
        self.target_critic_1 = CriticNet(beta, input_dims, n_actions,
                                         fc1Dms, fc2Dms, name='Target_critic1')
        self.target_critic_2 = CriticNet(beta, input_dims, n_actions,
                                         fc1Dms, fc2Dms, name='Target_critic2')
        self.target_actor = ActorNet(alpha, input_dims, n_actions,
                                     fc1Dms, fc2Dms, name='Target_actor')
        self.noise = noise
        # set the target networks to exact copies of the online networks
        self.update_network_parameters(tau=1)

    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        # during the warmup period, sample purely random exploratory actions
        if self.time_step < self.warmup:
            # scale is the standard deviation
            mu = T.tensor(np.random.normal(scale=self.noise,
                                           size=(self.n_actions,)),
                          dtype=T.float).to(self.actor.device)
        else:
            state = T.tensor(state, dtype=T.float).to(self.actor.device)
            mu = self.actor.forward(state).to(self.actor.device)
        # add exploration noise to the deterministic actor output
        mu_prime = mu + T.tensor(np.random.normal(scale=self.noise),
                                 dtype=T.float).to(self.actor.device)
        # make sure the action does not exceed the environment's action bounds
        mu_prime = T.clamp(mu_prime, self.min_action[0], self.max_action[0])
        self.time_step += 1
        # mu_prime has shape (n_actions,)
        return mu_prime.cpu().detach().numpy()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return
        states, actions, rewards, new_states, dones = \
            self.memory.sampling(self.batch_size)
        actions = T.tensor(actions, dtype=T.float).to(self.actor.device)
        rewards = T.tensor(rewards, dtype=T.float).to(self.actor.device)
        states = T.tensor(states, dtype=T.float).to(self.actor.device)
        dones = T.tensor(dones).to(self.actor.device)
        new_states = T.tensor(new_states, dtype=T.float).to(self.actor.device)

        # target policy smoothing: add clipped noise to the target action
        # note: clamping with element 0 may break if the elements of
        # min_action / max_action are not all equal
        target_action = self.target_actor.forward(new_states) + \
            T.clamp(T.tensor(np.random.normal(scale=0.2)), -0.5, 0.5)
        target_action = T.clamp(target_action, self.min_action[0],
                                self.max_action[0])

        target_critic1_q = self.target_critic_1.forward(new_states, target_action)
        target_critic2_q = self.target_critic_2.forward(new_states, target_action)
        target_critic1_q[dones] = 0  # the value of a terminal state is zero
        target_critic2_q[dones] = 0
        target_critic1_q = target_critic1_q.view(-1)
        target_critic2_q = target_critic2_q.view(-1)

        q1 = self.critic_1.forward(states, actions)
        q2 = self.critic_2.forward(states, actions)

        # clipped double-Q target: use the smaller of the two target critic values
        y = rewards + self.gamma * T.min(target_critic1_q, target_critic2_q)
        y = y.view(self.batch_size, 1)

        self.critic_1.optimizer.zero_grad()
        self.critic_2.optimizer.zero_grad()
        q1_loss = F.mse_loss(y, q1)
        q2_loss = F.mse_loss(y, q2)
        critic_loss = q1_loss + q2_loss
        critic_loss.backward()
        self.critic_1.optimizer.step()
        self.critic_2.optimizer.step()
        self.learn_step_cntr += 1

        # delayed policy update: only update the actor and the target
        # networks every `update_actor_iter` learning steps
        if self.learn_step_cntr % self.update_actor_iter != 0:
            return

        self.actor.optimizer.zero_grad()
        actor_loss = self.critic_1.forward(states, self.actor.forward(states))
        actor_loss = -T.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        actor_params = self.actor.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        critic1_params = self.critic_1.named_parameters()
        critic2_params = self.critic_2.named_parameters()
        target_critic1_params = self.target_critic_1.named_parameters()
        target_critic2_params = self.target_critic_2.named_parameters()

        critic1_state_dict = dict(critic1_params)
        critic2_state_dict = dict(critic2_params)
        target_critic1_state_dict = dict(target_critic1_params)
        target_critic2_state_dict = dict(target_critic2_params)
        actor_state_dict = dict(actor_params)
        target_actor_state_dict = dict(target_actor_params)

        # soft (Polyak) update: theta_target <- tau * theta + (1 - tau) * theta_target
        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_state_dict[name].clone()
        for name in critic1_state_dict:
            critic1_state_dict[name] = tau * critic1_state_dict[name].clone() + \
                (1 - tau) * target_critic1_state_dict[name].clone()
        for name in critic2_state_dict:
            critic2_state_dict[name] = tau * critic2_state_dict[name].clone() + \
                (1 - tau) * target_critic2_state_dict[name].clone()

        self.target_actor.load_state_dict(actor_state_dict)
        self.target_critic_1.load_state_dict(critic1_state_dict)
        self.target_critic_2.load_state_dict(critic2_state_dict)

    def save_models(self):
        self.actor.save_checkpoint()
        self.target_actor.save_checkpoint()
        self.target_critic_1.save_checkpoint()
        self.target_critic_2.save_checkpoint()
        self.critic_1.save_checkpoint()
        self.critic_2.save_checkpoint()

    def load_models(self):
        self.actor.load_checkpoint()
        self.critic_1.load_checkpoint()
        self.critic_2.load_checkpoint()
        self.target_actor.load_checkpoint()
        self.target_critic_1.load_checkpoint()
        self.target_critic_2.load_checkpoint()
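update_network_parameters above implements the soft (Polyak) target update theta_target <- tau * theta + (1 - tau) * theta_target via state dicts. For reference, a minimal equivalent sketch that operates directly on the parameter tensors (the helper name is illustrative and not part of the agent):

def soft_update(online_net, target_net, tau):
    # theta_target <- tau * theta_online + (1 - tau) * theta_target
    with T.no_grad():
        for p, p_targ in zip(online_net.parameters(), target_net.parameters()):
            p_targ.mul_(1.0 - tau)
            p_targ.add_(tau * p)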