def __init__(self, env):
    self.env = env
    # self.stateDim = obs2state(env.reset().observation).size()[1]
    # self.actionDim = env.action_spec().shape[0]
    self.stateDim = env.observation_space.shape[0]
    self.actionDim = env.action_space.shape[0]
    self.actor = Actor(self.env)
    self.critic = Critic(self.env)
    self.targetActor = deepcopy(Actor(self.env))
    self.targetCritic = deepcopy(Critic(self.env))
    self.actorOptim = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
    self.criticOptim = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
    self.criticLoss = nn.MSELoss()
    self.noise = OUNoise(mu=np.zeros(self.actionDim), sigma=SIGMA)
    self.replayBuffer = Buffer(BUFFER_SIZE)
    self.batchSize = MINIBATCH_SIZE
    self.checkpoint_dir = CHECKPOINT_DIR
    self.discount = DISCOUNT
    self.warmup = WARMUP
    self.epsilon = EPSILON
    self.epsilon_decay = EPSILON_DECAY
    self.rewardgraph = []
    self.stepgraph = []
    self.start = 0
    self.end = NUM_EPISODES
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = EPSILON

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    # Note: critic_local must be constructed here; the optimizer below references it.
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

# The step() function is excluded here, as it is implemented in the MADDPG wrapper.
# def step(self, state, action, reward, next_state, done, times):
#     """Save experience in replay memory, and use random sample from buffer to learn."""
def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = EPSILON

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def train(rank, args, shared_model, counter, lock, optimizer=None):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    env = gym.make("FetchPickAndPlace-v1")
    env2 = gym.wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])
    model = Actor()
    model2 = second()
    if args.use_cuda:
        model.cuda()
        model2.cuda()

    if os.path.isfile(args.save_path2):
        print('Loading second parameters ...')
        pretrained_dict = torch.load(args.save_path2)
        model_dict2 = model2.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict2}
        model_dict2.update(pretrained_dict)
        model2.load_state_dict(model_dict2)

    for p in model.fc1.parameters():
        p.requires_grad = False
    for p in model.fc2.parameters():
        p.requires_grad = False

    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)

    model.train()
    model2.eval()
    done = True

    for num_iter in count():
        with lock:
            counter.value += 1
        # print(num_iter, counter.value)
        lastObs = env.reset()
        goal = lastObs['desired_goal']
        objectPos = lastObs['observation'][3:6]
        object_rel_pos = lastObs['observation'][6:9]
        object_oriented_goal = object_rel_pos.copy()
        object_oriented_goal[2] += 0.03  # first make the gripper go slightly above the object
        timeStep = 0  # count the total number of timesteps

        if rank == 0:
            if num_iter % args.save_interval == 0 and num_iter > 0:
                # print("Saving model at :" + args.save_path)
                torch.save(shared_model.state_dict(), args.save_path1)
        if num_iter % (args.save_interval * 2.5) == 0 and num_iter > 0 and rank == 1:
            # Second saver in case the first process crashes
            # print("Saving model for process 1 at :" + args.save_path)
            torch.save(shared_model.state_dict(), args.save_path1)

        model.load_state_dict(shared_model.state_dict())
        values, log_probs, rewards, entropies = [], [], [], []

        if done:
            cx = Variable(torch.zeros(1, 32)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 32)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        state_inp = torch.from_numpy(env2.observation(lastObs)).type(FloatTensor)
        # criterion = nn.MSELoss()
        value, y, (hx, cx) = model(state_inp, hx, cx)
        prob = F.softmax(y, dim=-1)
        log_prob = F.log_softmax(y, dim=-1)
        act_model = prob.max(-1, keepdim=True)[1].data
        entropy = -(log_prob * prob).sum(-1, keepdim=True)
        log_prob = log_prob.gather(-1, Variable(act_model))
        action_out = act_model.to(torch.device("cpu"))
        # action_out = torch.tensor([[1]])
        entropies.append(entropy)
        log_probs.append(log_prob)
        values.append(value)
        # print(action_out)

        # Sub-task 1: move the gripper slightly above the object.
        while np.linalg.norm(object_oriented_goal) >= 0.015 and timeStep <= env._max_episode_steps:
            # env.render()
            action = [0, 0, 0, 0, 0, 0]
            act_tensor = act(state_inp, action_out, model2)
            # print(act_tensor)
            for i in range(len(object_oriented_goal)):
                action[i] = act_tensor[i].cpu().detach().numpy()
            object_oriented_goal = object_rel_pos.copy()
            object_oriented_goal[2] += 0.03
            action[3] = 0.05
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1
            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
            if timeStep >= env._max_episode_steps:
                reward = torch.Tensor([-1.0]).type(FloatTensor)
                break
        if timeStep < env._max_episode_steps:
            reward = torch.Tensor([1.0]).type(FloatTensor)
        rewards.append(reward)

        value, y, (hx, cx) = model(state_inp, hx, cx)
        prob = F.softmax(y, dim=-1)
        log_prob = F.log_softmax(y, dim=-1)
        act_model = prob.max(-1, keepdim=True)[1].data
        entropy = -(log_prob * prob).sum(-1, keepdim=True)
        log_prob = log_prob.gather(-1, Variable(act_model))
        action_out = act_model.to(torch.device("cpu"))
        entropies.append(entropy)
        log_probs.append(log_prob)
        values.append(value)
        # action_out = torch.tensor([[0]])

        # Sub-task 2: close in on the object and grasp it.
        while np.linalg.norm(object_rel_pos) >= 0.005 and timeStep <= env._max_episode_steps:
            # env.render()
            action = [0, 0, 0, 0, 0, 0]
            act_tensor = act(state_inp, action_out, model2)
            for i in range(len(object_oriented_goal)):
                action[i] = act_tensor[i].cpu().detach().numpy()
            action[3] = -0.01
            if action_out == 0:
                action[4] = act_tensor[3].cpu().detach().numpy()
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1
            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
            if timeStep >= env._max_episode_steps:
                reward = torch.Tensor([-1.0]).type(FloatTensor)
                break
        if timeStep < env._max_episode_steps:
            reward = torch.Tensor([1.0]).type(FloatTensor)
        rewards.append(reward)

        value, y, (hx, cx) = model(state_inp, hx, cx)
        prob = F.softmax(y, dim=-1)
        log_prob = F.log_softmax(y, dim=-1)
        act_model = prob.max(-1, keepdim=True)[1].data
        entropy = -(log_prob * prob).sum(-1, keepdim=True)
        log_prob = log_prob.gather(-1, Variable(act_model))
        action_out = act_model.to(torch.device("cpu"))
        entropies.append(entropy)
        log_probs.append(log_prob)
        values.append(value)
        # action_out = torch.tensor([[2]])

        # Sub-task 3: move the grasped object to the goal position.
        while np.linalg.norm(goal - objectPos) >= 0.01 and timeStep <= env._max_episode_steps:
            # env.render()
            action = [0, 0, 0, 0, 0, 0]
            act_tensor = act(state_inp, action_out, model2)
            for i in range(len(goal - objectPos)):
                action[i] = act_tensor[i].cpu().detach().numpy()
            action[3] = -0.01
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1
            state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            if timeStep >= env._max_episode_steps:
                break

        while True:  # limit the number of timesteps in the episode to a fixed duration
            # env.render()
            action = [0, 0, 0, 0, 0, 0]
            action[3] = -0.01  # keep the gripper closed
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1
            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            if timeStep >= env._max_episode_steps:
                break

        if info['is_success'] == 1.0:
            reward = torch.Tensor([1.0]).type(FloatTensor)
        else:
            reward = torch.Tensor([-1.0]).type(FloatTensor)
        rewards.append(reward)

        R = torch.zeros(1, 1)
        values.append(Variable(R).type(FloatTensor))
        policy_loss = 0
        value_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)
        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            delta_t = rewards[i] + args.gamma * values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - log_probs[i] * Variable(gae).type(FloatTensor)

        total_loss = policy_loss + args.value_loss_coef * value_loss
        optimizer.zero_grad()
        total_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
        ensure_shared_grads(model, shared_model)
        optimizer.step()
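To make the return and GAE bookkeeping in the final loop of train() easier to follow, here is a minimal, self-contained sketch of the same backward accumulation on a toy reward sequence. The gamma, tau, reward, and value numbers below are illustrative assumptions, not values taken from the training code.

import torch

# Toy illustration of the backward pass over rewards used in train():
# R accumulates the discounted return, gae accumulates the generalized
# advantage estimate via delta_t = r_t + gamma * V(s_{t+1}) - V(s_t).
gamma, tau = 0.9, 1.0  # illustrative values only
rewards = [torch.tensor([[1.0]]), torch.tensor([[1.0]]), torch.tensor([[-1.0]])]
values = [torch.tensor([[0.2]]), torch.tensor([[0.1]]), torch.tensor([[0.0]]),
          torch.tensor([[0.0]])]  # one extra bootstrap value appended, as in train()

R = torch.zeros(1, 1)
gae = torch.zeros(1, 1)
value_loss = 0
for i in reversed(range(len(rewards))):
    R = gamma * R + rewards[i]
    advantage = R - values[i]
    value_loss = value_loss + 0.5 * advantage.pow(2)
    delta_t = rewards[i] + gamma * values[i + 1] - values[i]
    gae = gae * gamma * tau + delta_t
    print("step {}: R={:.3f}, gae={:.3f}".format(i, R.item(), gae.item()))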
def test(rank, args, shared_model, counter):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    env = gym.make("FetchPickAndPlace-v1")
    env2 = gym.wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])
    model = Actor()
    model2 = second()
    if args.use_cuda:
        model.cuda()
        model2.cuda()
    done = True

    savefile = os.getcwd() + '/train/mario_curves.csv'
    title = ['No. episodes', 'No. of success']
    with open(savefile, 'a', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)

    if os.path.isfile(args.save_path2):
        print('Loading second parameters ...')
        pretrained_dict = torch.load(args.save_path2)
        model_dict2 = model2.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict2}
        model_dict2.update(pretrained_dict)
        model2.load_state_dict(model_dict2)
    model2.eval()
    model.eval()

    while True:
        model.load_state_dict(shared_model.state_dict())
        model.eval()
        ep_num = 0
        success = 0
        num_ep = counter.value

        while ep_num < 50:
            ep_num += 1
            lastObs = env.reset()
            goal = lastObs['desired_goal']
            objectPos = lastObs['observation'][3:6]
            object_rel_pos = lastObs['observation'][6:9]
            object_oriented_goal = object_rel_pos.copy()
            object_oriented_goal[2] += 0.03  # first make the gripper go slightly above the object
            timeStep = 0

            if done:
                cx = Variable(torch.zeros(1, 32)).type(FloatTensor)
                hx = Variable(torch.zeros(1, 32)).type(FloatTensor)
            else:
                cx = Variable(cx.data).type(FloatTensor)
                hx = Variable(hx.data).type(FloatTensor)

            state_inp = torch.from_numpy(env2.observation(lastObs)).type(FloatTensor)
            value, y, (hx, cx) = model(state_inp, hx, cx)
            prob = F.softmax(y, dim=-1)
            act_model = prob.max(-1, keepdim=True)[1].data
            action_out = act_model.to(torch.device("cpu"))
            # action_out = torch.tensor([[1]])

            while np.linalg.norm(object_oriented_goal) >= 0.015 and timeStep <= env._max_episode_steps:
                # env.render()
                action = [0, 0, 0, 0, 0, 0]
                act_tensor = act(state_inp, action_out, model2)
                # print(act_tensor)
                for i in range(len(object_oriented_goal)):
                    action[i] = act_tensor[i].cpu().detach().numpy()
                object_oriented_goal = object_rel_pos.copy()
                object_oriented_goal[2] += 0.03
                action[3] = 0.05
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1
                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
                if timeStep >= env._max_episode_steps:
                    break

            value, y, (hx, cx) = model(state_inp, hx, cx)
            prob = F.softmax(y, dim=-1)
            act_model = prob.max(-1, keepdim=True)[1].data
            action_out = act_model.to(torch.device("cpu"))
            # action_out = torch.tensor([[0]])

            while np.linalg.norm(object_rel_pos) >= 0.005 and timeStep <= env._max_episode_steps:
                # env.render()
                action = [0, 0, 0, 0, 0, 0]
                act_tensor = act(state_inp, action_out, model2)
                for i in range(len(object_oriented_goal)):
                    action[i] = act_tensor[i].cpu().detach().numpy()
                action[3] = -0.01
                if action_out == 0:
                    action[4] = act_tensor[3].cpu().detach().numpy()
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1
                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
                if timeStep >= env._max_episode_steps:
                    break

            value, y, (hx, cx) = model(state_inp, hx, cx)
            prob = F.softmax(y, dim=-1)
            act_model = prob.max(-1, keepdim=True)[1].data
            action_out = act_model.to(torch.device("cpu"))
            # action_out = torch.tensor([[2]])

            while np.linalg.norm(goal - objectPos) >= 0.01 and timeStep <= env._max_episode_steps:
                # env.render()
                action = [0, 0, 0, 0, 0, 0]
                act_tensor = act(state_inp, action_out, model2)
                for i in range(len(goal - objectPos)):
                    action[i] = act_tensor[i].cpu().detach().numpy()
                action[3] = -0.01
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1
                state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                if timeStep >= env._max_episode_steps:
                    break

            while True:  # limit the number of timesteps in the episode to a fixed duration
                # env.render()
                action = [0, 0, 0, 0, 0, 0]
                action[3] = -0.01  # keep the gripper closed
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1
                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                if timeStep >= env._max_episode_steps:
                    break

            if info['is_success'] == 1.0:
                success += 1
            if done:
                # lastObs = env.reset()
                if ep_num % 49 == 0:
                    print("num episodes {}, success {}".format(num_ep, success))
                    data = [counter.value, success]
                    with open(savefile, 'a', newline='') as sfile:
                        writer = csv.writer(sfile)
                        writer.writerows([data])
                    help='entropy term coefficient (default: 0.01)')
parser.add_argument('--value-loss-coef', type=float, default=0.5,
                    help='value loss coefficient (default: 0.5)')
parser.add_argument('--gamma', type=float, default=0.9,
                    help='discount factor for rewards (default: 0.9)')
parser.add_argument('--tau', type=float, default=1.00,
                    help='parameter for GAE (default: 1.00)')

args = parser.parse_args()

model = Actor()
model2 = second()
if args.use_cuda:
    model.cuda()
    model2.cuda()
    torch.cuda.manual_seed_all(21)

optimizer = optim.Adam(model.parameters(), lr=0.0001)

if os.path.isfile(args.save_path1):
    print('Loading A3C parameters ...')
    model.load_state_dict(torch.load(args.save_path1))

if os.path.isfile(args.save_path2):
    print('Loading second parameters ...')
    pretrained_dict = torch.load(args.save_path2)
    model_dict2 = model2.state_dict()
                    help='model save interval (default: 10)')
parser.add_argument('--lr', type=float, default=0.0001,
                    help='learning rate (default: 0.0001)')

args = parser.parse_args()
mp = _mp.get_context('spawn')
print("Cuda: " + str(torch.cuda.is_available()))

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    args = parser.parse_args()
    env = gym.make("FetchPickAndPlace-v1")

    shared_model = Actor()
    if args.use_cuda:
        shared_model.cuda()
        torch.cuda.manual_seed_all(30)
    shared_model.share_memory()

    if os.path.isfile(args.save_path1):
        print('Loading A3C parameters ...')
        pretrained_dict = torch.load(args.save_path1)
        model_dict = shared_model.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
        model_dict.update(pretrained_dict)
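The snippet above stops after the pretrained weights are merged into the shared model's state dict. A typical A3C-style launch would then load that state dict and spawn one test worker plus several train workers over the shared model. The sketch below is a hypothetical continuation under that assumption; the worker count and the way the counter/lock are created are not taken from the original.

    # Hypothetical continuation (assumption, not from the original source):
    # load the merged weights, then spawn test/train workers around shared_model.
    shared_model.load_state_dict(model_dict)

    counter = mp.Value('i', 0)
    lock = mp.Lock()
    processes = []

    p = mp.Process(target=test, args=(0, args, shared_model, counter))
    p.start()
    processes.append(p)

    num_processes = 4  # assumption: number of training workers
    for rank in range(1, num_processes + 1):
        p = mp.Process(target=train, args=(rank, args, shared_model, counter, lock))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()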
class DDPG:
    def __init__(self, env):
        self.env = env
        self.stateDim = obs2state(env.reset().observation).size()[1]
        self.actionDim = env.action_spec().shape[0]
        self.actor = Actor(self.env).cuda()
        self.critic = Critic(self.env).cuda()
        self.targetActor = deepcopy(Actor(self.env)).cuda()
        self.targetCritic = deepcopy(Critic(self.env)).cuda()
        self.actorOptim = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.criticOptim = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
        self.criticLoss = nn.MSELoss()
        self.noise = OUNoise(mu=np.zeros(self.actionDim), sigma=SIGMA)
        self.replayBuffer = Buffer(BUFFER_SIZE)
        self.batchSize = MINIBATCH_SIZE
        self.checkpoint_dir = CHECKPOINT_DIR
        self.discount = DISCOUNT
        self.warmup = WARMUP
        self.epsilon = EPSILON
        self.epsilon_decay = EPSILON_DECAY
        self.rewardgraph = []
        self.start = 0
        self.end = NUM_EPISODES

    def getQTarget(self, nextStateBatch, rewardBatch, terminalBatch):
        """Inputs: batch of next states, rewards and terminal flags of size self.batchSize.
        Calculates the target Q-value from the reward and the bootstrapped Q-value of the
        next state, using the target actor and target critic.
        Outputs: batch of Q-value targets."""
        targetBatch = torch.FloatTensor(rewardBatch).cuda()
        nonFinalMask = torch.ByteTensor(tuple(map(lambda s: s != True, terminalBatch)))
        nextStateBatch = torch.cat(nextStateBatch)
        nextActionBatch = self.targetActor(nextStateBatch)
        nextActionBatch.volatile = True
        qNext = self.targetCritic(nextStateBatch, nextActionBatch)
        nonFinalMask = self.discount * nonFinalMask.type(torch.cuda.FloatTensor)
        targetBatch += nonFinalMask * qNext.squeeze().data
        return Variable(targetBatch, volatile=False)

    def updateTargets(self, target, original):
        """Weighted-average update of the target network towards the original network.
        Inputs: target actor (critic) and original actor (critic)."""
        for targetParam, orgParam in zip(target.parameters(), original.parameters()):
            targetParam.data.copy_((1 - TAU) * targetParam.data + TAU * orgParam.data)

    def getMaxAction(self, curState):
        """Inputs: current state of the episode.
        Returns the action which maximizes the Q-value of the current state-action pair."""
        spec = self.env.action_spec()
        minAct = Variable(torch.cuda.FloatTensor(spec.minimum), requires_grad=False)
        maxAct = Variable(torch.cuda.FloatTensor(spec.maximum), requires_grad=False)
        noise = self.epsilon * Variable(torch.FloatTensor(self.noise()), volatile=True).cuda()
        action = self.actor(curState)
        actionNoise = action + noise
        return actionNoise

    def train(self):
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)
        print('Training started...')

        for i in range(self.start, self.end):
            time_step = self.env.reset()
            ep_reward = 0

            while not time_step.last():
                # Visualize training
                display.clear_output(wait=True)
                plt.imshow(self.env.physics.render())
                plt.show()

                # Get maximizing action
                curState = Variable(obs2state(time_step.observation), volatile=True).cuda()
                self.actor.eval()
                action = self.getMaxAction(curState)
                curState.volatile = False
                action.volatile = False
                self.actor.train()

                # Step episode
                time_step = self.env.step(action.data)
                nextState = Variable(obs2state(time_step.observation), volatile=True).cuda()
                reward = time_step.reward
                ep_reward += reward
                terminal = time_step.last()

                # Update replay buffer
                self.replayBuffer.append((curState, action, nextState, reward, terminal))

                # Training loop
                if len(self.replayBuffer) >= self.warmup:
                    curStateBatch, actionBatch, nextStateBatch, \
                        rewardBatch, terminalBatch = self.replayBuffer.sample_batch(self.batchSize)
                    curStateBatch = torch.cat(curStateBatch)
                    actionBatch = torch.cat(actionBatch)

                    qPredBatch = self.critic(curStateBatch, actionBatch)
                    qTargetBatch = self.getQTarget(nextStateBatch, rewardBatch, terminalBatch)

                    # Critic update
                    self.criticOptim.zero_grad()
                    criticLoss = self.criticLoss(qPredBatch, qTargetBatch)
                    print('Critic Loss: {}'.format(criticLoss))
                    criticLoss.backward()
                    self.criticOptim.step()

                    # Actor update
                    self.actorOptim.zero_grad()
                    actorLoss = -torch.mean(self.critic(curStateBatch, self.actor(curStateBatch)))
                    print('Actor Loss: {}'.format(actorLoss))
                    actorLoss.backward()
                    self.actorOptim.step()

                    # Update targets
                    self.updateTargets(self.targetActor, self.actor)
                    self.updateTargets(self.targetCritic, self.critic)
                    self.epsilon -= self.epsilon_decay

            if i % 20 == 0:
                self.save_checkpoint(i)
            self.rewardgraph.append(ep_reward)

    def save_checkpoint(self, episode_num):
        checkpointName = self.checkpoint_dir + 'ep{}.pth.tar'.format(episode_num)
        checkpoint = {
            'episode': episode_num,
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'targetActor': self.targetActor.state_dict(),
            'targetCritic': self.targetCritic.state_dict(),
            'actorOpt': self.actorOptim.state_dict(),
            'criticOpt': self.criticOptim.state_dict(),
            'replayBuffer': self.replayBuffer,
            'rewardgraph': self.rewardgraph,
            'epsilon': self.epsilon
        }
        torch.save(checkpoint, checkpointName)

    def loadCheckpoint(self, checkpointName):
        if os.path.isfile(checkpointName):
            print("Loading checkpoint...")
            checkpoint = torch.load(checkpointName)
            self.start = checkpoint['episode'] + 1
            self.actor.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.targetActor.load_state_dict(checkpoint['targetActor'])
            self.targetCritic.load_state_dict(checkpoint['targetCritic'])
            self.actorOptim.load_state_dict(checkpoint['actorOpt'])
            self.criticOptim.load_state_dict(checkpoint['criticOpt'])
            self.replayBuffer = checkpoint['replayBuffer']
            self.rewardgraph = checkpoint['rewardgraph']
            self.epsilon = checkpoint['epsilon']
            print('Checkpoint loaded')
        else:
            raise OSError('Checkpoint not found')
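For context, here is a minimal sketch of how this DDPG class might be driven. It assumes a dm_control-style environment (the class relies on env.action_spec(), time_step.last(), and env.physics.render()) and that the hyperparameter constants it references (ACTOR_LR, BUFFER_SIZE, CHECKPOINT_DIR, ...) are defined at module level; the domain/task names are illustrative assumptions, not from the original.

# Minimal usage sketch, assuming the dm_control suite is installed and the
# constants used by DDPG are defined. Domain/task choice is illustrative only.
from dm_control import suite

env = suite.load(domain_name="cartpole", task_name="swingup")
agent = DDPG(env)

# Optionally resume from a saved checkpoint before training:
# agent.loadCheckpoint(CHECKPOINT_DIR + 'ep20.pth.tar')

agent.train()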
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done, times):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and times % LEARN_EVERY == 0:
            for _ in range(LEARN_TIMES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states.to(device))
        Q_targets_next = self.critic_target(next_states.to(device), actions_next.to(device))
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss: regress the local critic onto the TD target with MSE
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_local.parameters(), 1)  # clip the gradient when updating the critic network
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: maximize Q from the local critic, i.e. minimize -Q
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ----------------------- update epsilon and noise ----------------------- #
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
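To show how this Agent class fits into an interaction loop, here is a minimal sketch against a generic Gym continuous-control task. It assumes the module-level constants (EPSILON, GAMMA, TAU, device, ...) and the Actor/Critic/OUNoise/ReplayBuffer classes are defined; the environment name and episode count are illustrative assumptions, not from the original.

# Minimal interaction-loop sketch (illustrative assumptions: env name, episode count).
# Note: act() clips actions to [-1, 1], so they may need rescaling to the env's bounds.
import gym

env = gym.make("Pendulum-v0")
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=2)

for episode in range(200):
    state = env.reset()
    agent.reset()                      # reset the OU noise process each episode
    episode_reward, t = 0.0, 0
    done = False
    while not done:
        action = agent.act(state)      # noisy action from the local actor
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, t)
        state = next_state
        episode_reward += reward
        t += 1
    print("episode {}: reward {:.1f}".format(episode, episode_reward))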