Example #1
    def __init__(self, env):
        self.env = env
        # self.stateDim = obs2state(env.reset().observation).size()[1]
        # self.actionDim = env.action_spec().shape[0]
        self.stateDim = env.observation_space.shape[0]
        self.actionDim = env.action_space.shape[0]
        self.actor = Actor(self.env)
        self.critic = Critic(self.env)
        self.targetActor = deepcopy(Actor(self.env))
        self.targetCritic = deepcopy(Critic(self.env))
        self.actorOptim = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.criticOptim = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
        self.criticLoss = nn.MSELoss()
        self.noise = OUNoise(mu=np.zeros(self.actionDim), sigma=SIGMA)
        self.replayBuffer = Buffer(BUFFER_SIZE)
        self.batchSize = MINIBATCH_SIZE
        self.checkpoint_dir = CHECKPOINT_DIR
        self.discount = DISCOUNT
        self.warmup = WARMUP
        self.epsilon = EPSILON
        self.epsilon_decay = EPSILON_DECAY
        self.rewardgraph = []
        self.stepgraph = []
        self.start = 0
        self.end = NUM_EPISODES
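
The OUNoise and Buffer classes used by this constructor are not shown in the example. A minimal Ornstein-Uhlenbeck noise process that matches how it is constructed and sampled here (built from a mu array and a sigma, and called like a function) could look like the following sketch; the theta and dt defaults are assumptions, not values taken from the original code.

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) * dt + sigma * sqrt(dt) * dW."""

    def __init__(self, mu, sigma, theta=0.15, dt=1e-2):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x_prev = np.zeros_like(mu)

    def __call__(self):
        # One Euler-Maruyama step of the OU stochastic differential equation.
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x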
Example #2
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

        # The step() method is omitted here, since it is implemented in the MADDPG wrapper:
        # def step(self, state, action, reward, next_state, done, times):
        #     """Save experience in replay memory, and use a random sample from the buffer to learn."""
Example #3
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)
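
The ReplayBuffer class is not included in these examples. A common implementation compatible with the calls the agent makes (add, sample, and len) stores experience namedtuples in a deque and returns batched torch tensors; the sketch below follows that pattern, and the device line is an assumption meant to mirror whatever device the agent itself uses.

import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # assumed to match the agent's device


class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples random minibatches."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward",
                                                  "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)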
Example #4
def train(rank, args, shared_model, counter, lock, optimizer=None):
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    
    env = gym.make("FetchPickAndPlace-v1")
    env2 = gym.wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])

    model = Actor()
    model2 = second()

    if args.use_cuda:
        model.cuda()
        model2.cuda()

    if os.path.isfile(args.save_path2):
        print('Loading second parameters ...')
        pretrained_dict = torch.load(args.save_path2)
        model_dict2 = model2.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict2}
        model_dict2.update(pretrained_dict) 
        model2.load_state_dict(model_dict2)
    
    for p in model.fc1.parameters():
        p.requires_grad = False
    for p in model.fc2.parameters():
        p.requires_grad = False
        
    if optimizer is None:
        optimizer = optim.Adam(shared_model.parameters(), lr=args.lr)
    
    model.train()
    model2.eval()
    done = True       
    for num_iter in count():
        with lock:
            counter.value += 1
        #print(num_iter, counter.value)
        lastObs = env.reset()
        goal = lastObs['desired_goal']
        objectPos = lastObs['observation'][3:6]
        object_rel_pos = lastObs['observation'][6:9]
        object_oriented_goal = object_rel_pos.copy()
        object_oriented_goal[2] += 0.03 # first make the gripper go slightly above the object    
        timeStep = 0 #count the total number of timesteps
        if rank == 0:

            if num_iter % args.save_interval == 0 and num_iter > 0:
                #print ("Saving model at :" + args.save_path)            
                torch.save(shared_model.state_dict(), args.save_path1)

        if num_iter % (args.save_interval * 2.5) == 0 and num_iter > 0 and rank == 1:    # second saver, in case the first process crashes
            #print ("Saving model for process 1 at :" + args.save_path)            
            torch.save(shared_model.state_dict(), args.save_path1)
        
        model.load_state_dict(shared_model.state_dict())
        values, log_probs, rewards, entropies = [], [], [], []
        if done:
            cx = Variable(torch.zeros(1, 32)).type(FloatTensor)
            hx = Variable(torch.zeros(1, 32)).type(FloatTensor)
        else:
            cx = Variable(cx.data).type(FloatTensor)
            hx = Variable(hx.data).type(FloatTensor)

        state_inp = torch.from_numpy(env2.observation(lastObs)).type(FloatTensor)
        #criterion = nn.MSELoss()
        value, y, (hx, cx) = model(state_inp, hx, cx)
        prob = F.softmax(y, dim=-1)
        log_prob = F.log_softmax(y, dim=-1)
        act_model = prob.max(-1, keepdim=True)[1].data
        entropy = -(log_prob * prob).sum(-1, keepdim=True)
        log_prob = log_prob.gather(-1, Variable(act_model))
        action_out = act_model.to(torch.device("cpu"))
        #action_out = torch.tensor([[1]])
        entropies.append(entropy)
        log_probs.append(log_prob)
        values.append(value)
        #print(action_out)
        while np.linalg.norm(object_oriented_goal) >= 0.015 and timeStep <= env._max_episode_steps:
            #env.render()
            action = [0, 0, 0, 0, 0, 0]
            act_tensor= act(state_inp, action_out, model2)      
            #print(act_tensor)     
            for i in range(len(object_oriented_goal)):
                action[i] = act_tensor[i].cpu().detach().numpy()

            object_oriented_goal = object_rel_pos.copy()            
            object_oriented_goal[2] += 0.03
            
            action[3] = 0.05
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1
            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
            if timeStep >= env._max_episode_steps: 
                reward = torch.Tensor([-1.0]).type(FloatTensor)
                break
        
        if timeStep < env._max_episode_steps: 
            reward = torch.Tensor([1.0]).type(FloatTensor)
        rewards.append(reward)
        
        value, y, (hx, cx) = model(state_inp, hx, cx)
        prob = F.softmax(y, dim=-1)
        log_prob = F.log_softmax(y, dim=-1)
        act_model = prob.max(-1, keepdim=True)[1].data
        entropy = -(log_prob * prob).sum(-1, keepdim=True)
        log_prob = log_prob.gather(-1, Variable(act_model))
        action_out = act_model.to(torch.device("cpu"))
        entropies.append(entropy)
        log_probs.append(log_prob)
        values.append(value)
        #action_out = torch.tensor([[0]])
        while np.linalg.norm(object_rel_pos) >= 0.005 and timeStep <= env._max_episode_steps :
            #env.render()
            action = [0, 0, 0, 0, 0, 0]
            act_tensor= act(state_inp, action_out, model2)   

            for i in range(len(object_oriented_goal)):
                action[i] = act_tensor[i].cpu().detach().numpy()
            
            action[3]= -0.01 
            if action_out == 0:
                action[4] = act_tensor[3].cpu().detach().numpy()
            
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1

            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
            if timeStep >= env._max_episode_steps: 
                reward = torch.Tensor([-1.0]).type(FloatTensor)
                break
        
        if timeStep < env._max_episode_steps: 
            reward = torch.Tensor([1.0]).type(FloatTensor)
        rewards.append(reward)

        value, y, (hx, cx) = model(state_inp, hx, cx)
        prob = F.softmax(y, dim=-1)
        log_prob = F.log_softmax(y, dim=-1)
        act_model = prob.max(-1, keepdim=True)[1].data
        entropy = -(log_prob * prob).sum(-1, keepdim=True)
        log_prob = log_prob.gather(-1, Variable(act_model))
        action_out = act_model.to(torch.device("cpu"))
        entropies.append(entropy)
        log_probs.append(log_prob)
        values.append(value)
        #action_out = torch.tensor([[2]])
        while np.linalg.norm(goal - objectPos) >= 0.01 and timeStep <= env._max_episode_steps :
            
            #env.render()
            action = [0, 0, 0, 0, 0, 0]
            act_tensor= act(state_inp, action_out, model2)

            for i in range(len(goal - objectPos)):
                action[i] = act_tensor[i].cpu().detach().numpy()
            
            action[3] = -0.01
            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1
            state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]
            if timeStep >= env._max_episode_steps: 
                break

        while True: #limit the number of timesteps in the episode to a fixed duration
            #env.render()
            action = [0, 0, 0, 0, 0, 0]
            action[3] = -0.01 # keep the gripper closed

            obsDataNew, reward, done, info = env.step(action)
            timeStep += 1

            objectPos = obsDataNew['observation'][3:6]
            object_rel_pos = obsDataNew['observation'][6:9]

            if timeStep >= env._max_episode_steps: break
        
        if info['is_success'] == 1.0:
            reward = torch.Tensor([1.0]).type(FloatTensor)
        else:
            reward = torch.Tensor([-1.0]).type(FloatTensor)
        rewards.append(reward)
        
        R = torch.zeros(1, 1)
        values.append(Variable(R).type(FloatTensor))
        policy_loss = 0
        value_loss = 0
        R = Variable(R).type(FloatTensor)
        gae = torch.zeros(1, 1).type(FloatTensor)

        for i in reversed(range(len(rewards))):
            R = args.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            delta_t = rewards[i] + args.gamma * \
                values[i + 1].data - values[i].data
            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                log_probs[i] * Variable(gae).type(FloatTensor)

        total_loss = policy_loss + args.value_loss_coef * value_loss
        optimizer.zero_grad()

        total_loss.backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        ensure_shared_grads(model, shared_model)
        optimizer.step()
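
ensure_shared_grads is called at the end of train but is not defined in this snippet. In the usual A3C pattern this helper copies the worker's gradients onto the shared model before the shared optimizer steps; a minimal sketch under that assumption:

def ensure_shared_grads(model, shared_model):
    # Copy the worker's gradients into the shared model so the shared optimizer can apply them.
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None:
            # The shared gradients were already populated for this step.
            return
        shared_param._grad = param.grad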
Example #5
def test(rank, args, shared_model, counter):
    
    FloatTensor = torch.cuda.FloatTensor if args.use_cuda else torch.FloatTensor
    env = gym.make("FetchPickAndPlace-v1")
    env2 = gym.wrappers.FlattenDictWrapper(env, dict_keys=['observation', 'desired_goal'])

    model = Actor()
    model2 = second()
    if args.use_cuda:
        model.cuda()
        model2.cuda()

    done = True       
    

    savefile = os.getcwd() + '/train/mario_curves.csv'
    title = ['No. episodes', 'No. of success']
    with open(savefile, 'a', newline='') as sfile:
        writer = csv.writer(sfile)
        writer.writerow(title)   

    if os.path.isfile(args.save_path2):
        print('Loading second parameters ...')
        pretrained_dict = torch.load(args.save_path2)
        model_dict2 = model2.state_dict()
        pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict2}
        model_dict2.update(pretrained_dict) 
        model2.load_state_dict(model_dict2)

    model2.eval()
    model.eval()
    while True:
        model.load_state_dict(shared_model.state_dict())
        model.eval()
        ep_num = 0
        success = 0
        num_ep = counter.value
        while ep_num < 50:
            ep_num +=1            
            lastObs = env.reset()
            goal = lastObs['desired_goal']
            objectPos = lastObs['observation'][3:6]
            object_rel_pos = lastObs['observation'][6:9]
            object_oriented_goal = object_rel_pos.copy()
            object_oriented_goal[2] += 0.03 # first make the gripper go slightly above the object    
            timeStep = 0
            if done:
                cx = Variable(torch.zeros(1, 32)).type(FloatTensor)
                hx = Variable(torch.zeros(1, 32)).type(FloatTensor)
            else:
                cx = Variable(cx.data).type(FloatTensor)
                hx = Variable(hx.data).type(FloatTensor)

            state_inp = torch.from_numpy(env2.observation(lastObs)).type(FloatTensor)
            value, y, (hx, cx) = model(state_inp, hx, cx)
            prob = F.softmax(y, dim=-1)
            act_model = prob.max(-1, keepdim=True)[1].data
            action_out = act_model.to(torch.device("cpu"))
            ##action_out = torch.tensor([[1]])
            while np.linalg.norm(object_oriented_goal) >= 0.015 and timeStep <= env._max_episode_steps:
                #env.render()
                action = [0, 0, 0, 0, 0, 0]
                act_tensor= act(state_inp, action_out, model2)      
                #print(act_tensor)     
                for i in range(len(object_oriented_goal)):
                    action[i] = act_tensor[i].cpu().detach().numpy()

                object_oriented_goal = object_rel_pos.copy()            
                object_oriented_goal[2] += 0.03
                
                action[3] = 0.05
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1
                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
                if timeStep >= env._max_episode_steps: 
                    break
    
            value, y, (hx, cx) = model(state_inp, hx, cx)
            prob = F.softmax(y, dim=-1)
            act_model = prob.max(-1, keepdim=True)[1].data
            action_out = act_model.to(torch.device("cpu"))
            #action_out = torch.tensor([[0]])
            while np.linalg.norm(object_rel_pos) >= 0.005 and timeStep <= env._max_episode_steps :
                #env.render()
                action = [0, 0, 0, 0, 0, 0]
                act_tensor= act(state_inp, action_out, model2)

                for i in range(len(object_oriented_goal)):
                    action[i] = act_tensor[i].cpu().detach().numpy()
                
                action[3]= -0.01 
                if action_out ==0:
                    action[4] = act_tensor[3].cpu().detach().numpy()
                
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1

                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
                if timeStep >= env._max_episode_steps: 
                    break
            
            value, y, (hx, cx) = model(state_inp, hx, cx)
            prob = F.softmax(y, dim=-1)
            act_model = prob.max(-1, keepdim=True)[1].data            
            action_out = act_model.to(torch.device("cpu"))
            #action_out = torch.tensor([[2]])
            while np.linalg.norm(goal - objectPos) >= 0.01 and timeStep <= env._max_episode_steps :
            
                #env.render()
                action = [0, 0, 0, 0, 0, 0]
                act_tensor= act(state_inp, action_out, model2)

                for i in range(len(goal - objectPos)):
                    action[i] = act_tensor[i].cpu().detach().numpy()
                
                action[3] = -0.01
                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1
                state_inp = torch.from_numpy(env2.observation(obsDataNew)).type(FloatTensor)
                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                if timeStep >= env._max_episode_steps: 
                    break
            
            while True: #limit the number of timesteps in the episode to a fixed duration
                #env.render()
                action = [0, 0, 0, 0, 0, 0]
                action[3] = -0.01 # keep the gripper closed

                obsDataNew, reward, done, info = env.step(action)
                timeStep += 1

                objectPos = obsDataNew['observation'][3:6]
                object_rel_pos = obsDataNew['observation'][6:9]
                if timeStep >= env._max_episode_steps: break
                
            if info['is_success'] == 1.0:
                success +=1
            if done:
                #lastObs = env.reset()
                if ep_num % 49==0:            
                    print("num episodes {}, success {}".format(num_ep, success))
                    data = [counter.value, success]
                    with open(savefile, 'a', newline='') as sfile:
                        writer = csv.writer(sfile)
                        writer.writerows([data])
Example #6
                    help='entropy term coefficient (default: 0.01)')
parser.add_argument('--value-loss-coef',
                    type=float,
                    default=0.5,
                    help='value loss coefficient (default: 0.5)')
parser.add_argument('--gamma',
                    type=float,
                    default=0.9,
                    help='discount factor for rewards (default: 0.9)')
parser.add_argument('--tau',
                    type=float,
                    default=1.00,
                    help='parameter for GAE (default: 1.00)')
args = parser.parse_args()

model = Actor()
model2 = second()
if args.use_cuda:
    model.cuda()
    model2.cuda()
torch.cuda.manual_seed_all(21)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

if os.path.isfile(args.save_path1):
    print('Loading A3C parameters ...')
    model.load_state_dict(torch.load(args.save_path1))

if os.path.isfile(args.save_path2):
    print('Loading second parameters ...')
    pretrained_dict = torch.load(args.save_path2)
    model_dict2 = model2.state_dict()
Example #7
                    help='model save interval (default: 10)')
parser.add_argument('--lr',
                    type=float,
                    default=0.0001,
                    help='learning rate (default: 0.0001)')
args = parser.parse_args()

mp = _mp.get_context('spawn')
print("Cuda: " + str(torch.cuda.is_available()))

if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'

    args = parser.parse_args()
    env = gym.make("FetchPickAndPlace-v1")
    shared_model = Actor()
    if args.use_cuda:
        shared_model.cuda()
    torch.cuda.manual_seed_all(30)

    shared_model.share_memory()

    if os.path.isfile(args.save_path1):
        print('Loading A3C parameters ...')
        pretrained_dict = torch.load(args.save_path1)
        model_dict = shared_model.state_dict()
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if k in model_dict
        }
        model_dict.update(pretrained_dict)
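
The excerpt ends part-way through the partial state-dict load. A sketch of how such a launch script typically continues, assuming the filtered-load pattern from Example #4 and a hypothetical args.num_processes flag that is not shown in the excerpt:

    # Inside the same `if __name__ == '__main__':` block, after model_dict.update(pretrained_dict):
    shared_model.load_state_dict(model_dict)

    counter = mp.Value('i', 0)   # shared update counter
    lock = mp.Lock()             # guards counter increments across workers

    processes = []
    p = mp.Process(target=test, args=(args.num_processes, args, shared_model, counter))
    p.start()
    processes.append(p)
    for rank in range(args.num_processes):
        p = mp.Process(target=train, args=(rank, args, shared_model, counter, lock))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()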
Example #8
class DDPG:
    def __init__(self, env):
        self.env = env
        self.stateDim = obs2state(env.reset().observation).size()[1]
        self.actionDim = env.action_spec().shape[0]
        self.actor = Actor(self.env).cuda()
        self.critic = Critic(self.env).cuda()
        self.targetActor = deepcopy(Actor(self.env)).cuda()
        self.targetCritic = deepcopy(Critic(self.env)).cuda()
        self.actorOptim = optim.Adam(self.actor.parameters(), lr=ACTOR_LR)
        self.criticOptim = optim.Adam(self.critic.parameters(), lr=CRITIC_LR)
        self.criticLoss = nn.MSELoss()
        self.noise = OUNoise(mu=np.zeros(self.actionDim), sigma=SIGMA)
        self.replayBuffer = Buffer(BUFFER_SIZE)
        self.batchSize = MINIBATCH_SIZE
        self.checkpoint_dir = CHECKPOINT_DIR
        self.discount = DISCOUNT
        self.warmup = WARMUP
        self.epsilon = EPSILON
        self.epsilon_decay = EPSILON_DECAY
        self.rewardgraph = []
        self.start = 0
        self.end = NUM_EPISODES

    def getQTarget(self, nextStateBatch, rewardBatch, terminalBatch):
        """Inputs: Batch of next states, rewards and terminal flags of size self.batchSize
            Calculates the target Q-value from reward and bootstraped Q-value of next state
            using the target actor and target critic
           Outputs: Batch of Q-value targets"""

        targetBatch = torch.FloatTensor(rewardBatch).cuda()
        nonFinalMask = torch.ByteTensor(
            tuple(map(lambda s: s != True, terminalBatch)))
        nextStateBatch = torch.cat(nextStateBatch)
        nextActionBatch = self.targetActor(nextStateBatch)
        nextActionBatch.volatile = True
        qNext = self.targetCritic(nextStateBatch, nextActionBatch)

        nonFinalMask = self.discount * nonFinalMask.type(
            torch.cuda.FloatTensor)
        targetBatch += nonFinalMask * qNext.squeeze().data

        return Variable(targetBatch, volatile=False)

    def updateTargets(self, target, original):
        """Weighted average update of the target network and original network
            Inputs: target actor(critic) and original actor(critic)"""

        for targetParam, orgParam in zip(target.parameters(),
                                         original.parameters()):
            targetParam.data.copy_((1 - TAU)*targetParam.data + \
                                          TAU*orgParam.data)

    def getMaxAction(self, curState):
        """Inputs: Current state of the episode
            Returns the action which maximizes the Q-value of the current state-action pair"""

        spec = self.env.action_spec()
        minAct = Variable(torch.cuda.FloatTensor(spec.minimum),
                          requires_grad=False)
        maxAct = Variable(torch.cuda.FloatTensor(spec.maximum),
                          requires_grad=False)
        noise = self.epsilon * Variable(torch.FloatTensor(self.noise()),
                                        volatile=True).cuda()
        action = self.actor(curState)
        actionNoise = action + noise
        return actionNoise

    def train(self):
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        print('Training started...')

        for i in range(self.start, self.end):
            time_step = self.env.reset()
            ep_reward = 0

            while not time_step.last():

                #Visualize Training
                display.clear_output(wait=True)
                plt.imshow(self.env.physics.render())
                plt.show()

                # Get maximizing action
                curState = Variable(obs2state(time_step.observation),
                                    volatile=True).cuda()
                self.actor.eval()
                action = self.getMaxAction(curState)
                curState.volatile = False
                action.volatile = False
                self.actor.train()

                # Step episode
                time_step = self.env.step(action.data)
                nextState = Variable(obs2state(time_step.observation),
                                     volatile=True).cuda()
                reward = time_step.reward
                ep_reward += reward
                terminal = time_step.last()

                # Update replay buffer
                self.replayBuffer.append(
                    (curState, action, nextState, reward, terminal))

                # Training loop
                if len(self.replayBuffer) >= self.warmup:

                    curStateBatch, actionBatch, nextStateBatch, \
                    rewardBatch, terminalBatch = self.replayBuffer.sample_batch(self.batchSize)
                    curStateBatch = torch.cat(curStateBatch)
                    actionBatch = torch.cat(actionBatch)

                    qPredBatch = self.critic(curStateBatch, actionBatch)
                    qTargetBatch = self.getQTarget(nextStateBatch, rewardBatch,
                                                   terminalBatch)

                    # Critic update
                    self.criticOptim.zero_grad()
                    criticLoss = self.criticLoss(qPredBatch, qTargetBatch)
                    print('Critic Loss: {}'.format(criticLoss))
                    criticLoss.backward()
                    self.criticOptim.step()

                    # Actor update
                    self.actorOptim.zero_grad()
                    actorLoss = -torch.mean(
                        self.critic(curStateBatch, self.actor(curStateBatch)))
                    print('Actor Loss: {}'.format(actorLoss))
                    actorLoss.backward()
                    self.actorOptim.step()

                    # Update Targets
                    self.updateTargets(self.targetActor, self.actor)
                    self.updateTargets(self.targetCritic, self.critic)
                    self.epsilon -= self.epsilon_decay

            if i % 20 == 0:
                self.save_checkpoint(i)
            self.rewardgraph.append(ep_reward)

    def save_checkpoint(self, episode_num):
        checkpointName = self.checkpoint_dir + 'ep{}.pth.tar'.format(
            episode_num)
        checkpoint = {
            'episode': episode_num,
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'targetActor': self.targetActor.state_dict(),
            'targetCritic': self.targetCritic.state_dict(),
            'actorOpt': self.actorOptim.state_dict(),
            'criticOpt': self.criticOptim.state_dict(),
            'replayBuffer': self.replayBuffer,
            'rewardgraph': self.rewardgraph,
            'epsilon': self.epsilon
        }
        torch.save(checkpoint, checkpointName)

    def loadCheckpoint(self, checkpointName):
        if os.path.isfile(checkpointName):
            print("Loading checkpoint...")
            checkpoint = torch.load(checkpointName)
            self.start = checkpoint['episode'] + 1
            self.actor.load_state_dict(checkpoint['actor'])
            self.critic.load_state_dict(checkpoint['critic'])
            self.targetActor.load_state_dict(checkpoint['targetActor'])
            self.targetCritic.load_state_dict(checkpoint['targetCritic'])
            self.actorOptim.load_state_dict(checkpoint['actorOpt'])
            self.criticOptim.load_state_dict(checkpoint['criticOpt'])
            self.replayBuffer = checkpoint['replayBuffer']
            self.rewardgraph = checkpoint['rewardgraph']
            self.epsilon = checkpoint['epsilon']
            print('Checkpoint loaded')
        else:
            raise OSError('Checkpoint not found')
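
Example #8 is written against a dm_control-style environment (it relies on env.action_spec(), env.physics.render(), and TimeStep objects). A hypothetical usage sketch, assuming the obs2state helper and the hyperparameter constants referenced above are defined in the same module; the domain and task names are placeholders:

from dm_control import suite

env = suite.load(domain_name="cartpole", task_name="swingup")  # placeholder task
agent = DDPG(env)
# agent.loadCheckpoint(CHECKPOINT_DIR + 'ep20.pth.tar')  # optionally resume from a saved episode
agent.train()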
Example #9
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, times):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE and times % LEARN_EVERY == 0:
            for _ in range(LEARN_TIMES):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states.to(device))
        Q_targets_next = self.critic_target(next_states.to(device),
                                            actions_next.to(device))
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(
            Q_expected, Q_targets
        )  # TD-style critic loss (as in DQN): MSE between predicted Q and the bootstrapped target
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_local.parameters(),
                        1)  # Clip the gradient when update critic network
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        # Actor loss: maximize Q from the local critic by minimizing its negative
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ----------------------- update epsilon and noise ----------------------- #
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
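
A minimal training-loop sketch for the Agent above, assuming a Gym-style continuous-control environment with the classic (obs, reward, done, info) step API; the environment name, episode count, and step limit are placeholders rather than values from the original code.

import gym

env = gym.make("Pendulum-v0")  # placeholder continuous-action environment
agent = Agent(state_size=env.observation_space.shape[0],
              action_size=env.action_space.shape[0],
              random_seed=2)

for episode in range(1, 201):
    state = env.reset()
    agent.reset()  # reset the OU noise process each episode
    score = 0.0
    for t in range(1000):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done, t)
        state = next_state
        score += reward
        if done:
            break
    print("Episode {:4d}  score {:.2f}".format(episode, score))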