Example #1
    def _optimize(self):
        """Sample batch from experience replay pool and update the policy"""
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions, weights, idxes = self.memory.sample(
            self.BATCH_SIZE, self.beta.anneal())
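        # transpose the batch: a list of Transitions becomes a Transition of per-field tuples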
        batch = Transition(*zip(*transitions))

        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)
        weights = torch.tensor(weights,
                               dtype=torch.float32,
                               device=self.device)

        q = self._q(states, actions)
        expected_q = self._expected_q(batch.next_state, rewards)

        # update the priority of each transition
        td_error = expected_q - q
        new_priorities = torch.abs(td_error) + self.eps
        self.memory.update_priorities(idxes, new_priorities.flatten())

        # Compute Huber loss
        loss = F.smooth_l1_loss(q, expected_q, reduction='none')
        loss = (weights * loss).mean()

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #2
def optimize():
    if len(replaymemory) < BATCH_SIZE:
        return
    # sample tuples
    transitions = replaymemory.sample(BATCH_SIZE)

    batch = Transition(*zip(*transitions))
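    # wrap each scalar action/reward in a tensor so they can be concatenated into batches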
    actions = tuple(map(lambda a: torch.tensor([[a]], device=device), batch.action))
    rewards = tuple(map(lambda r: torch.tensor([r], device=device), batch.reward))

    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(actions)
    reward_batch = torch.cat(rewards)
    next_state_batch = batch.next_state

    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state_batch)),dtype=torch.bool, device=device)
    non_final_next_states = torch.cat([s for s in next_state_batch if s is not None]).to(device)

    #policy net output q value for cur state
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # target net output q value for cur state
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values)
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #3
    def train(self, ds):
        dataset = ds.get_dataset()
        loss = 0
        q_score = 0
        act = 0
        N = 10000
        l, s, n = 0, 0, 0
        for i in range(30_000_000):
            state, action, reward, next_state, raw = next(dataset)
            self.agent.push(Transition(state, action, reward, next_state), i)

            if i % 2 == 0 and i:
                l, s = self.agent.train(i)
                if l:
                    loss += l
                    q_score += s
                    n += 1

            if i % N == 0:
                act = self.agent.act(state, i)
                #print(time.time() - self.t)
                if n:
                    print(
                        'step %d, train: %d, act:%.2f, score:%.2f, loss:%.4f' %
                        (i, n, act, q_score / n, loss / n))
                self.t = time.time()
                loss, q_score, n = 0, 0, 0
                act = 0.0
                #print('update model file')
                self.save()
Example #4
    def train(self, env: gym.Env, n_steps):
        rewards = []
        steps = []
        episode_rewards = []
        state = np_to_unsq_tensor(env.reset())
        loop_range = tqdm.tqdm(range(n_steps))
        for step in loop_range:
            with torch.no_grad():
                z = self.z_net(state)
            if random.random() < self.epsilon:  # Random action
                action = torch.LongTensor([[env.action_space.sample()]])
            else:
                action = select_argmax_action(z, self.atoms)
            next_state, reward, done, info = env.step(squeeze_np(action))
            next_state = np_to_unsq_tensor(next_state) if not done else None
            self.replay_buffer.remember(
                Transition(state, action, torch.tensor([[reward]]),
                           next_state))
            state = next_state

            # Perform training step
            self._train_step(step)

            # Update episode stats
            episode_rewards.append(reward)
            if done:
                state = np_to_unsq_tensor(env.reset())
                rewards.append(sum(episode_rewards))
                steps.append(step)
                episode_rewards = []
                loop_range.set_description(f'Reward {rewards[-1]}')
        return Plot(steps, rewards, None)
Example #5
    def update(self):
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Compute Q(s', a') for all a'
        # TODO: Use a target network???
        next_qvals = self.network(batch.next_state, batch.next_acts)
        # Take the max over next q-values
        next_qvals = torch.tensor([vals.max() for vals in next_qvals],
                                  device=device)
        # Zero all the next_qvals that are done
        next_qvals = next_qvals * (
            1 - torch.tensor(batch.done, dtype=torch.float, device=device))
        targets = torch.tensor(batch.reward, dtype=torch.float,
                               device=device) + self.gamma * next_qvals

        # Next compute Q(s, a)
        # Nest each action in a list - so that it becomes the only admissible cmd
        nested_acts = tuple([[a] for a in batch.act])
        qvals = self.network(batch.state, nested_acts)
        # Combine the qvals: Maybe just do a greedy max for generality
        qvals = torch.cat(qvals)

        # Compute Huber loss
        loss = F.smooth_l1_loss(qvals, targets.detach())
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
        self.optimizer.step()
        return loss.item()
Example #6
    def fit_buffer(self):
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))

        # Update actor and critic according to the batch
        actor_loss, critic_loss = self.agent.update_params(batch)
        self.metrics['actor_loss'].append(actor_loss)
        self.metrics['critic_loss'].append(critic_loss)
Example #7
    def update(self):

        if len(self.memory) < self.BATCH_SIZE:
            return

        # get training batch

        transitions = self.memory.sample(self.BATCH_SIZE)

        batch = Transition(*zip(*transitions))
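
        # Deterministic actor-critic (DDPG-style) update: the value network (critic) is fit to a
        # TD target built from the target networks, the action network (actor) is trained to
        # maximize the critic's value, and both target networks are soft-updated at the end.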

        state_batch = torch.cat(batch.state)

        action_batch = torch.cat(batch.action)

        reward_batch = torch.cat(batch.reward).unsqueeze(1)

        next_state = torch.cat(batch.next_state)

        # update value network

        state_action = torch.cat((state_batch, action_batch), dim=1)
        state_action_value = self.value_network(state_action)

        next_action = self.action_target_network(next_state).detach()

        next_state_action = torch.cat((next_state, next_action), dim=1)
        next_state_action_value = self.value_target_network(
            next_state_action).detach()

        expected_state_action_value = (self.DISCOUNT *
                                       next_state_action_value) + reward_batch

        value_loss = self.criterion(state_action_value,
                                    expected_state_action_value)

        self.value_optimizer.zero_grad()

        value_loss.backward()
        self.value_optimizer.step()

        # update action network

        optim_action = self.action_network(state_batch)

        optim_state_action = torch.cat((state_batch, optim_action), dim=1)

        action_loss = -self.value_network(optim_state_action)
        action_loss = action_loss.mean()

        self.action_optimizer.zero_grad()

        action_loss.backward()
        self.action_optimizer.step()

        # update target network
        soft_update(self.value_target_network, self.value_network, 0.01)
        soft_update(self.action_target_network, self.action_network, 0.01)
Example #8
	def optimize(self, step):
		# print(len(self.memory))
		if len(self.memory) < self.batch_size * 10:
				return
		transitions = self.memory.sample(self.batch_size)
		# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
		# detailed explanation). This converts batch-array of Transitions
		# to Transition of batch-arrays.
		batch = Transition(*zip(*transitions))

		# Compute a mask of non-final states and concatenate the batch elements
		# (a final state would've been the one after which simulation ended)
		next_state = torch.FloatTensor(batch.next_state).to(device)
		non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, next_state)))
		non_final_next_states = torch.cat([s for s in next_state if s is not None])

		state_batch = torch.FloatTensor(batch.state).to(device)
		action_batch = torch.LongTensor(torch.add(torch.tensor(batch.action), torch.tensor(1))).to(device)
		reward_batch = torch.FloatTensor(batch.reward).to(device)

		# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
		# columns of actions taken. These are the actions which would've been taken
		# for each batch state according to policy_net
		l = self.policy_net(state_batch).size(0)
		state_action_values = self.policy_net(state_batch)[95:l:96].gather(1, action_batch.reshape((self.batch_size, 1)))
		state_action_values = state_action_values.squeeze(-1)

		# Compute V(s_{t+1}) for all next states.
		# Expected values of actions for non_final_next_states are computed based
		# on the "older" target_net; selecting their best reward with max(1)[0].
		# This is merged based on the mask, such that we'll have either the expected
		# state value or 0 in case the state was final.
		next_state_values = torch.zeros(self.batch_size, device=device)
		next_state_values[non_final_mask] = self.target_net(next_state)[95:l:96].max(1)[0].detach()
		# Compute the expected Q values
		expected_state_action_values = (next_state_values * self.gamma) + reward_batch

		# Compute the loss
		loss = torch.nn.MSELoss()(expected_state_action_values, state_action_values)

		# Optimize the model
		self.optimizer.zero_grad()
		loss.backward()
		for param in self.policy_net.parameters():
				param.grad.data.clamp_(-1, 1)
		
		self.optimizer.step()
		
		if step % self.T == 0:
			# print('soft_update')
			gamma = 0.001
			param_before = copy.deepcopy(self.target_net)
			target_update = copy.deepcopy(self.target_net.state_dict())
			for k in target_update.keys():
				target_update[k] = self.target_net.state_dict()[k] * (1 - gamma) + self.policy_net.state_dict()[k] * gamma
			self.target_net.load_state_dict(target_update)
Example #9
    def update(self):
        if len(self.memory) < self.batch_size:
            return

        batch_loss = None
        num_per_step = int(self.batch_size / self.accummulate_step)
        for _ in range(self.accummulate_step):

            transitions = self.memory.sample(num_per_step)
            batch = Transition(*zip(*transitions))

            # Compute Q(s', a') for all a'
            # TODO: Use a target network???
            next_history = []
            for act, history in zip(batch.act, batch.history):
                next_history.append(history + [act])
            next_qvals = self.network(batch.next_state, batch.next_acts,
                                      next_history)
            # Take the max over next q-values
            next_qvals = torch.tensor([vals.max() for vals in next_qvals],
                                      device=device)
            # Zero all the next_qvals that are done
            next_qvals = next_qvals * (
                1 - torch.tensor(batch.done, dtype=torch.float, device=device))
            targets = torch.tensor(batch.reward,
                                   dtype=torch.float,
                                   device=device) + self.gamma * next_qvals

            # Next compute Q(s, a)
            # Nest each action in a list - so that it becomes the only admissible cmd
            nested_acts = tuple([[a] for a in batch.act])
            qvals = self.network(batch.state, nested_acts, batch.history)
            # Combine the qvals: Maybe just do a greedy max for generality
            qvals = torch.cat(qvals)

            loss = F.smooth_l1_loss(qvals, targets.detach())

            # Compute Huber loss
            if batch_loss is None:
                batch_loss = loss
            else:
                batch_loss += loss

        batch_loss /= num_per_step

        self.optimizer.zero_grad()
        batch_loss.backward()
        nn.utils.clip_grad_norm_(self.network.parameters(), self.clip)
        self.optimizer.step()
        # self.scheduler.step()

        return batch_loss.item()
Example #10
    def optimize_model(self, config):
        #transitions = self.memory.sample(config.batch_size)
        # PrioritizedReplayMemory
        transitions, weights, indices = self.memory.sample(
            config.batch_size, config.beta)
        transitions = self.transition_to_tensor(transitions)
        batch = Transition(*zip(*transitions))
        loss, weights_loss = self.get_loss(batch, config, weights,
                                           config.gamma)
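
        # Combine the 1-step loss with an n-step loss computed from the same sampled
        # indices (discounted by gamma**n_step)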

        # N Step
        transitions_n, _, _ = self.memory_n.sample_from_indices(
            config.batch_size, config.beta, indices)
        transitions_n = self.transition_to_tensor(transitions_n)
        batch_n = Transition(*zip(*transitions_n))
        gamma_n = config.gamma**config.n_step
        loss_n, weights_loss_n = self.get_loss(batch_n, config, weights,
                                               gamma_n)
        weights_loss += weights_loss_n

        self.optimizer.zero_grad()
        #loss.backward()
        # PrioritizedReplayMemory
        weights_loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        # PrioritizedReplayMemory
        loss_for_prior = loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + config.prior_eps
        self.memory.update_priorities(indices, new_priorities)
        # N Step
        self.memory_n.update_priorities(indices, new_priorities)

        # Noisy Net
        self.policy_net.reset_noise()
        self.target_net.reset_noise()
Example #11
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see http://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #12
def optimize():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    states = torch.cat(batch.state)
    actions = torch.cat(batch.action)
    rewards = torch.cat(batch.reward)
    actual_q = policy_net(states).gather(1, actions)
    expected_q_value = expected_q(batch.next_state, rewards)
    loss = F.smooth_l1_loss(
        actual_q, expected_q_value)  # loss between actual q and expected q

    # optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #13
 def step(self):
     self.env.steps_done += 1
     x = self.env.input().reshape(self.n_agents, -1).cuda(self.device)
     phi = []
     for i in range(self.n_agents):
         with torch.no_grad():
             phi.append(self.Encoder(x[i]))
     # after encoding
     n = torch.cat(phi, dim=0).reshape(self.n_agents, -1)
     # per-agent state: own observation x[i] concatenated with the other agents' encodings
     s = []
     for i in range(self.n_agents):
         ni = torch.cat((n[0:i], n[i + 1:])).reshape(-1)
         s.append(torch.cat((x[i], ni)))
     s = torch.cat(s).reshape(self.n_agents, -1)
     # epsilon-greedy
     actions = self.select_action(s)
     # collect rewards
     rewards = self.env.step(actions)
     if rewards == -1:
         return "done"
     # state_{t+1}
     x_tp1 = self.env.input().reshape(self.n_agents, -1).cuda(self.device)
     phi_tp1 = []
     for i in range(self.n_agents):
         with torch.no_grad():
             phi_tp1.append(self.Encoder(x_tp1[i]))
     n_tp1 = torch.cat(phi_tp1, dim=0).reshape(self.n_agents, -1)
     s_tp1 = []
     for i in range(self.n_agents):
         ni = torch.cat((n_tp1[0:i], n_tp1[i + 1:])).reshape(-1)
         s_tp1.append(torch.cat((x_tp1[i], ni)))
     s_tp1 = torch.cat(s_tp1).reshape(self.n_agents, -1)
     # initial Transition tuple
     res = []
     for i in range(self.n_agents):
         res.append(
             Transition(state=s[i],
                        action=actions[i],
                        next_state=s_tp1[i],
                        reward=rewards[i]))
     return res
Example #14
    def _optimize(self):
        if len(self.memory) < self.BATCH_SIZE:
            return
        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        states = torch.cat(batch.state)
        actions = torch.cat(batch.action)
        rewards = torch.cat(batch.reward)

        # calculate q value and expected q value
        q = self._q(states, actions)
        expected_q = self._expected_q(batch.next_state, rewards)
        loss = F.smooth_l1_loss(q, expected_q)

        # optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()
Example #15
    def learn(self, mem):
        transitions = mem.sample(self.batch_size)
        batch = Transition(*zip(*transitions))  # Transpose the batch
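
        # NB: this snippet uses the legacy (pre-0.4) PyTorch Variable/volatile API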

        states = Variable(torch.stack(batch.state, 0))
        actions = Variable(torch.LongTensor(batch.action).unsqueeze(1))
        rewards = Variable(torch.Tensor(batch.reward))
        non_final_mask = torch.ByteTensor(
            tuple(map(
                lambda s: s is not None,
                batch.next_state)))  # Only process non-terminal next states
        next_states = Variable(
            torch.stack(tuple(s for s in batch.next_state if s is not None),
                        0),
            volatile=True
        )  # Prevent backpropagating through expected action values

        Qs = self.policy_net(states).gather(1, actions)  # Q(s_t, a_t; θpolicy)
        next_state_argmax_indices = self.policy_net(next_states).max(
            1, keepdim=True
        )[1]  # Perform argmax action selection using policy network: argmax_a[Q(s_t+1, a; θpolicy)]
        Qns = Variable(torch.zeros(
            self.batch_size))  # Q(s_t+1, a) = 0 if s_t+1 is terminal
        Qns[non_final_mask] = self.target_net(next_states).gather(
            1, next_state_argmax_indices
        )  # Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget)
        Qns.volatile = False  # Remove volatile flag to prevent propagating it through loss
        target = rewards + (
            self.discount * Qns
        )  # Double-Q target: Y = r + γ.Q(s_t+1, argmax_a[Q(s_t+1, a; θpolicy)]; θtarget)

        loss = F.smooth_l1_loss(
            Qs, target)  # Huber loss on TD-error δ: δ = Y - Q(s_t, a_t)
        # TODO: TD-error clipping?
        self.policy_net.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm(self.policy_net.parameters(),
                                self.max_gradient_norm)  # Clamp gradients
        self.optimiser.step()
Example #16
def optimize_model(policy_net, optimizer):
    # first sample a batch
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))
    # non_final_mask is the mask to tag all the item whose next_state is not None as True
    non_final_mask = tuple(map(lambda s: s is not None, batch.next_state))
    non_final_mask = torch.tensor(non_final_mask,
                                  device=device,
                                  dtype=torch.bool)
    non_final_next_states = torch.cat(
        [s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # policy_net(state_batch) is used to get all value among all actions
    # gather method is used to get the value corresponding to certain action
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)

    # compute the V(s_{t+1}) for $s_{t+1}$ which is final state, we set V(s_{t+1}) = 0
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(
        1)[0].detach()
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Huber loss
    loss = F.smooth_l1_loss(state_action_values,
                            expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #17
    def optimize(self, batch_size, global_step=None):
        if len(self.memory) < batch_size:
            return None

        self.memory.batch_size = batch_size
        for transitions_batch in self.memory:
            # transform list of tuples into a tuple of lists.
            # explanation here: https://stackoverflow.com/a/19343/3343043
            batch = Transition(*zip(*transitions_batch))

            state_batch = torch.cat(batch.state)
            next_state_batch = torch.cat(batch.next_state)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken. These are the actions which would've been taken
            # for each batch state according to policy_net
            state_action_values = self.model(state_batch).gather(
                1, action_batch.reshape(batch_size, 1))

            # Compute the expected Q values
            with torch.no_grad():
                next_state_values = self.model(next_state_batch).max(
                    1)[0].detach()
                expected_state_action_values = (next_state_values *
                                                self.gamma) + reward_batch

            # Compute Huber loss
            loss = F.smooth_l1_loss(state_action_values,
                                    expected_state_action_values.unsqueeze(1))

            # Optimize the model
            self.optimizer.zero_grad()
            loss.backward()
            self.writer.add_scalar('training/loss', loss.item(), global_step)
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=2)
            self.optimizer.step()
Example #18
    def update(self):
        if len(self.memory) < self.BATCH_SIZE:
            print("[Warning] Memory data less than batch sizes!")
            return

        transitions = self.memory.sample(self.BATCH_SIZE)
        batch = Transition(*zip(*transitions))

        final_mask = torch.cat(batch.done)
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        next_state_values = torch.zeros(self.BATCH_SIZE, 1, device=device)
        next_state_values[final_mask.bitwise_not()] = self.target_net(
            non_final_next_states).max(1, True)[0].detach()

        expected_state_action_values = (next_state_values *
                                        self.GAMMA) + reward_batch

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

        self.update_count += 1
        if self.update_count % self.TARGET_UPDATE == 0:
            self.update_target_net()
Example #19
def optimize_model(policy_net, target_net, optimizer, memory):
    if len(memory) < BATCH_SIZE:
        return
    
    transitions = memory.sample(BATCH_SIZE)
    batch = Transition(*zip(*transitions))

    # compute a mask of non-final states
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # model computes Q(s_t)
    # use this to compute Q(s_t, a)
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1})
    # all final states have 0 value
    # double Q-learning implemented
    policy_best_actions = policy_net(non_final_next_states).argmax(dim=1)
    i = torch.arange(len(policy_best_actions))
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states)[i, policy_best_actions].detach()

    # expected Q values
    expected_state_action_values = reward_batch + (next_state_values * GAMMA)
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(dim=1))
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()
Example #20
 def optimize(self):
     transitions = self.memory.sample(500)
     normalized_transitions = Transition(*zip(*transitions))
Example #21
def optimize_model(
    dqn_net,
    target_net,
    memory,
    learning_rate,
    batch_size,
    size_board,
    gamma,
    optimizer,
    device,
):

    # Sample batch
    tree_indexes, memory_batch, batch_ISWeights = memory.sample(batch_size)

    samples = Transition(*zip(*memory_batch))

    states_batch = samples.state
    actions_batch = samples.action
    rewards_batch = samples.reward
    next_states_batch = samples.next_state
    dones_batch = samples.done

    target_qs_batch = []

    torch_next_states_batch = (torch.from_numpy(
        np.asarray(next_states_batch)).float().to(device))

    # Get Q values for next state
    q_next_state = dqn_net(torch_next_states_batch, batch_size, size_board)

    # TODO: remove the detach later and test
    q_target_next_state = (target_net(torch_next_states_batch, batch_size,
                                      size_board).cpu().detach())

    for i in range(0, len(memory_batch)):
        terminal = dones_batch[i]

        # Get max action value index
        action = np.argmax(q_next_state[i].cpu().detach().numpy())

        # If we are in terminal state, only equals reward
        if terminal:
            target_qs_batch.append(rewards_batch[i])
        else:
            target = rewards_batch[i] + gamma * q_target_next_state[i][action]
            target_qs_batch.append(target)

    targets_batch = np.array([each for each in target_qs_batch])

    torch_states_batch = torch.from_numpy(
        np.asarray(states_batch)).float().to(device)

    output = dqn_net(torch_states_batch, batch_size, size_board)

    torch_actions_batch = torch.from_numpy(np.asarray(actions_batch))
    torch_actions_batch = torch_actions_batch.unsqueeze(0)
    torch_actions_batch = torch_actions_batch.view(batch_size, 1)

    # Q is our predicted Q value
    q_values = output.gather(1, torch_actions_batch.to(device))
    q_values = q_values.float()

    # Absolute error for update tree
    absolute_errors = (
        torch.abs(q_values - torch.from_numpy(targets_batch).view(
            batch_size, 1).float().to(device)).cpu().detach().numpy())

    torch_batch_ISWeights = torch.from_numpy(batch_ISWeights).to(device)
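
    # Weight the squared TD errors by the PER importance-sampling weights to
    # correct the bias introduced by prioritized sampling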

    # Mean squared error
    diff_target = q_values - torch.from_numpy(targets_batch).view(
        batch_size, 1).float().to(device)
    squared_diff = diff_target**2
    weighted_squared_diff = squared_diff * torch_batch_ISWeights

    # Loss
    loss = torch.mean(weighted_squared_diff)

    # Optimization
    optimizer.zero_grad()

    loss.backward()
    optimizer.step()

    # Squeeze absolute errors
    absolute_errors = np.squeeze(absolute_errors, 1)

    # Memory tree update
    memory.batch_update(tree_indexes, absolute_errors)

    return loss.cpu().detach().numpy()
Example #22
    def optimize_model(self):
        if (len(self.memory) < self.batch_size
                or self.train_step % self.update_every != 0):
            return

        transitions = self.memory.sample(self.batch_size)

        # transpose the batch so that we get a transition
        # of batch array
        batch = Transition(*zip(*transitions))

        # mask for non-final states
        # these are states where we will have another move to make
        # after the current move
        non_final_mask = torch.logical_not(
            torch.tensor(batch.done, dtype=torch.bool, device=self.device))

        # get the next states that are not final
        non_final_next_state_batch = torch.tensor(
            [s['obs'] for s in batch.next_state],
            dtype=torch.float,
            device=self.device)[non_final_mask]

        # get the legal actions for these non-final
        # next states
        non_final_next_state_legal_actions = torch.stack([
            torch.tensor(self.pad_actions(s['legal_actions']),
                         dtype=torch.long,
                         device=self.device) for s in batch.next_state
        ],
                                                         dim=0)[non_final_mask]

        # get the state, action, and reward batches

        state_batch = torch.tensor([s['obs'] for s in batch.state],
                                   dtype=torch.float,
                                   device=self.device)

        action_batch = torch.tensor(batch.action,
                                    dtype=torch.long,
                                    device=self.device).view(-1, 1)

        reward_batch = torch.tensor(batch.reward,
                                    dtype=torch.float,
                                    device=self.device)

        # compute Q(s_t, a)
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)
        # compute max a for Q(s_{t+1}, a)

        # if s_{t+1} is a final state, then Q(s_{t+1}, a) is 0
        next_state_values = torch.zeros(self.batch_size, device=self.device)

        if non_final_next_state_batch.size()[0] != 0:
            # get predicted rewards for all non-final next states
            next_state_all_values = \
                    self.target_net(non_final_next_state_batch).detach()

            # only select rewards for valid actions
            next_state_valid_values = next_state_all_values.gather(
                1, non_final_next_state_legal_actions)
            # get the max reward for a valid action
            next_state_values[non_final_mask] = (
                next_state_valid_values.max(1)[0])

        expected_state_action_values = ((next_state_values * self.gamma) +
                                        reward_batch)

        # minimize Q(s_t, a) - reward + (gamma * max a Q(s_{t+1}, a))
        # the predicted total reward for choosing action a at state s_t
        # should equal the actual reward for that action plus the
        # predicted total reward for choosing another action at state s_{t+1}
        loss = self.criterion(state_action_values,
                              expected_state_action_values.unsqueeze(1))

        self.loss_sum += loss.item()

        self.optimizer.zero_grad()
        loss.backward()

        self.weight_updates += 1

        if self.clip_grads:
            for param in self.policy_net.parameters():
                param.grad.data.clamp_(-1, 1)

        self.optimizer.step()

        if self.weight_updates % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
Example #23
 def update(self, state, action, reward, state_next, done):
     record = Transition(state, action, reward, state_next, done)
     self._memory.add(record)
     if len(self._memory) >= self._batch_size:
         self.train_step()
Example #24
        mask = np.array([not done])
        nextState = stateToTensor(nextState, desiredGoal)
        nextStateNumpy = nextState.cpu().numpy()
        reward = calcReward(state[0, -3:], desiredGoal, orginalDistance)
        episodeReward = reward
        shortMemory.push(state, action, mask, nextStateNumpy, reward)

        if done:
            break
        else:
            state = nextState.to(device)

        if len(memory) > batchSize:
            for _ in range(updatesPerStep):
                transition = memory.sample(batchSize)
                batch = Transition(*zip(*transition))
                valueLoss = agent.updateParameters(batch, device)
                valueLossEp += valueLoss

    memory.append(shortMemory)
    rewards.append(episodeReward)

    if episode % checkEvery == 0:
        testRewards = []
        for _ in range(numberOfTests):
            state = env.reset()
            startingPositionPuck = state["achieved_goal"]
            orginalDistance = np.linalg.norm(startingPositionPuck -
                                             desiredGoal)
            while True:
                state = stateToTensor(state, desiredGoal).to(device=device)
Example #25
    def compute_bellman_residual(self, batch, target_state_action_value=None):
        # Concatenate the batch elements and cast them to tensors
        if not isinstance(batch.current_state, torch.Tensor):
            # logger.info("Casting the batch to torch.tensor")
            current_state = torch.cat(
                tuple(torch.tensor([batch.current_state],
                                   dtype=torch.float))).to(self.device)
            current_future_pos = torch.cat(
                tuple(
                    torch.tensor([batch.current_future_pos],
                                 dtype=torch.float))).to(self.device)
            current_past_pos = torch.cat(
                tuple(torch.tensor([batch.current_past_pos],
                                   dtype=torch.float))).to(self.device)

            action = torch.tensor(batch.action,
                                  dtype=torch.long).to(self.device)
            reward = torch.tensor(batch.reward,
                                  dtype=torch.float).to(self.device)

            next_state = torch.cat(
                tuple(torch.tensor([batch.next_state],
                                   dtype=torch.float))).to(self.device)
            next_future_pos = torch.cat(
                tuple(torch.tensor([batch.next_future_pos],
                                   dtype=torch.float))).to(self.device)
            next_past_pos = torch.cat(
                tuple(torch.tensor([batch.next_past_pos],
                                   dtype=torch.float))).to(self.device)

            terminal = torch.tensor(batch.terminal,
                                    dtype=torch.bool).to(self.device)
            batch = Transition(current_state, current_future_pos, current_past_pos,
                               action, reward,
                               next_state, next_future_pos, next_past_pos,
                               terminal, batch.info)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        current_state_action_values, current_trajectory = self.value_net(
            batch.current_state, batch.current_past_pos, batch.action)
        state_action_values = current_state_action_values.gather(
            1, batch.action.unsqueeze(1)).squeeze(1)

        if target_state_action_value is None:
            with torch.no_grad():
                # Compute V(s_{t+1}) for all next states.
                next_state_values = torch.zeros(batch.reward.shape).to(
                    self.device)
                if self.config["double"]:
                    # Double Q-learning: pick best actions from policy network
                    next_state_action_values, next_trajectory = self.value_net(
                        batch.next_state, batch.next_past_pos)
                    _, best_actions = next_state_action_values.max(1)
                    # Double Q-learning: estimate action values from target network
                    next_target_state_action_values, next_target_trajectory = self.target_net(
                        batch.next_state, batch.next_past_pos)
                    best_values = next_target_state_action_values.gather(
                        1, best_actions.unsqueeze(1)).squeeze(1)
                else:
                    next_state_action_values, next_trajectory = self.target_net(
                        batch.next_state, batch.next_past_pos)
                    best_values, _ = next_state_action_values.max(1)
                next_state_values[~batch.terminal] = best_values[~batch.terminal]
                # Compute the expected Q values
                target_state_action_value = batch.reward + self.config[
                    "gamma"] * next_state_values

        # Compute loss
        rl_loss = self.rl_lossFunction(state_action_values,
                                       target_state_action_value)
        predict_loss = self.predict_lossfunction(current_trajectory,
                                                 batch.current_future_pos)

        self.writer.add_scalar('step/rl_loss', rl_loss, self.step)
        self.writer.add_scalar('step/predict_loss', predict_loss, self.step)

        return rl_loss + predict_loss, target_state_action_value, batch
Example #26
 def sample_minibatch(self):
     if len(self.memory) < self.config["batch_size"]:
         return None
     transitions = self.memory.sample(self.config["batch_size"])
     return Transition(*zip(*transitions))
Example #27
                duration.append(lifespan[i][-1])
                lifespan[i].append(0)

            if lifespan[i][-1] > 0:  # discard when it reaches 500 (episode step cap)
                memory.push(s[i], a[i], r[i], s_gotten[i], done[i])

        if frame_count > initial_exploration:
            eps -= 0.00005
            eps = max(eps, 0.1)
            batch = memory.sample(batch_size)
            s = torch.FloatTensor([*batch.s]).to(device)
            a = torch.LongTensor([*batch.a]).unsqueeze(-1).to(device)
            r = torch.FloatTensor([*batch.r]).unsqueeze(-1).to(device)
            ns = torch.FloatTensor([*batch.ns]).to(device)
            nt = torch.BoolTensor(np.array(batch.nt).tolist()).unsqueeze(-1).to(device)
            agent.train_model(Transition(s, a, r, ns, nt), solver, gamma, F.mse_loss)
            if frame_count % update_target == 0:
                agent.update()
                if len(duration) > 100:
                    score = np.array(duration)[-100:].mean()
                    print('score:', score)
                    if score > 498:
                        break
        frame_count += 1
    envs.close()

    env = gym.make(env_name)
    s = env.reset()
    while True:
        preprocessed_s = torch.FloatTensor(s).unsqueeze(0).to(device)
        a = agent.response(preprocessed_s)
Example #28
def train_dqn(settings):
    required_settings = [
        "batch_size",
        "checkpoint_frequency",
        "device",
        "eps_start",
        "eps_end",
        "eps_cliff",
        "eps_decay",
        "gamma",
        "log_freq",
        "logs_dir",
        "lr",
        "max_steps",
        "memory_size",
        "model_name",
        "num_episodes",
        "out_dir",
        "target_net_update_freq",
    ]
    if not settings_is_valid(settings, required_settings):
        raise Exception(
            f"Settings object {settings} missing some required settings.")

    batch_size = settings["batch_size"]
    checkpoint_frequency = settings["checkpoint_frequency"]
    device = settings["device"]
    eps_start = settings["eps_start"]
    eps_end = settings["eps_end"]
    eps_cliff = settings["eps_cliff"]
    # eps_decay = settings["eps_decay"]
    gamma = settings["gamma"]
    logs_dir = settings["logs_dir"]
    log_freq = settings["log_freq"]
    lr = settings["lr"]
    max_steps = settings["max_steps"]
    memory_size = settings["memory_size"]
    model_name = settings["model_name"]
    num_episodes = settings["num_episodes"]
    out_dir = settings["out_dir"]
    target_net_update_freq = settings["target_net_update_freq"]

    # Initialize environment
    env = gym.make("StarGunner-v0")

    # Initialize model
    num_actions = env.action_space.n
    settings["num_actions"] = num_actions
    policy_net = DQN(settings).to(device)
    target_net = DQN(settings).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    # Initialize memory
    logging.info("Initializing memory.")
    memory = ReplayMemory(memory_size)
    memory.init_with_random((1, 3, 84, 84), num_actions)
    logging.info("Finished initializing memory.")

    # Initialize other model ingredients
    optimizer = optim.Adam(policy_net.parameters(), lr=lr)

    # Initialize tensorboard
    writer = SummaryWriter(logs_dir)

    # Loop over episodes
    policy_net.train()
    steps_done = 0
    log_reward_acc = 0.0
    log_steps_acc = 0
    for episode in tqdm(range(num_episodes)):
        state = process_state(env.reset()).to(device)
        reward_acc = 0.0
        loss_acc = 0.0

        # Loop over steps in episode
        for t in range(max_steps):
            with torch.no_grad():
                Q = policy_net.forward(state.type(torch.float))

            # Get best predicted action and perform it
            if steps_done < eps_cliff:
                epsilon = -(eps_start -
                            eps_end) / eps_cliff * steps_done + eps_start
            else:
                epsilon = eps_end

            if random.random() < epsilon:
                predicted_action = torch.tensor([env.action_space.sample()
                                                 ]).to(device)
            else:
                predicted_action = torch.argmax(Q, dim=1)
            next_state, raw_reward, done, info = env.step(
                predicted_action.item())
            # Note that next state could also be a difference
            next_state = process_state(next_state)
            reward = torch.tensor([clamp_reward(raw_reward)])

            # Save to memory
            memory.push(state.to("cpu"), predicted_action.to("cpu"),
                        next_state, reward)

            # Move to next state
            state = next_state.to(device)

            # Sample from memory
            batch = Transition(*zip(*memory.sample(batch_size)))

            # Mask marking non-final next states: True where next_state is not None
            # (adapted from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)
            final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_state)),
                device=device,
                dtype=torch.bool,
            )
            # print("FINAL_MASK", final_mask.shape)
            state_batch = torch.cat(batch.state).type(torch.float).to(device)
            next_state_batch = torch.cat(batch.next_state).type(
                torch.float).to(device)
            action_batch = torch.cat(batch.action).to(device)
            reward_batch = torch.cat(batch.reward).to(device)

            # print("STATE_BATCH SHAPE", state_batch.shape)
            # print("STATE_BATCH", state_batch[4, :, 100])
            # print("ACTION_BATCH SHAPE", action_batch.shape)
            # print("ACTION_BATCH", action_batch)
            # print("REWARD_BATCH SHAPE", reward_batch.shape)

            # Compute Q
            # Q_next = torch.zeros((batch_size, num_actions))
            # print("MODEL STATE BATCH SHAPE", model(state_batch).shape)
            Q_actual = policy_net(state_batch).gather(
                1, action_batch.view(action_batch.shape[0], 1))
            Q_next_pred = target_net(next_state_batch)
            Q_max = torch.max(Q_next_pred, dim=1)[0].detach()
            # print("Q_MAX shape", Q_max.shape)
            target = reward_batch + gamma * Q_max * final_mask.to(Q_max.dtype)
            # print("TARGET SIZE", target.shape)

            # Calculate loss
            loss = F.smooth_l1_loss(Q_actual, target.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Clamp gradient to avoid gradient explosion
            for param in policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            optimizer.step()

            # Store stats
            loss_acc += loss.item()
            reward_acc += raw_reward
            steps_done += 1

            if steps_done % target_net_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())

            # Exit if in terminal state
            if done:
                logging.debug(
                    f"Episode {episode} finished after {t} timesteps with reward {reward_acc}."
                )
                break

        logging.debug(f"Loss: {loss_acc / t}")

        # Save model checkpoint
        if (episode != 0) and (episode % checkpoint_frequency == 0):
            save_model_checkpoint(
                policy_net,
                optimizer,
                episode,
                loss,
                f"{out_dir}/checkpoints/{model_name}_{episode}",
            )

        # Log to tensorboard
        log_reward_acc += reward_acc
        log_steps_acc += t
        writer.add_scalar("Loss / Timestep", loss_acc / t, episode)
        if episode % log_freq == 0:
            writer.add_scalar("Reward", log_reward_acc / log_freq, episode)
            writer.add_scalar("Reward / Timestep",
                              log_reward_acc / log_steps_acc, episode)
            writer.add_scalar("Duration", log_steps_acc / log_freq, episode)
            writer.add_scalar("Steps", log_reward_acc / log_steps_acc,
                              steps_done)
            log_reward_acc = 0.0
            log_steps_acc = 0

    # Save model
    save_model(policy_net, f"{out_dir}/{model_name}.model")

    # Report final stats
    logging.info(f"Steps Done: {steps_done}")

    env.close()
    return policy_net