Example #1
 def __init__(
     self,
     action_str,
     model,
     gpu_id=-1,
     hidden_state_sz=512
 ):
     self.actions = action_str
     self.gpu_id = gpu_id
     self.model = model
     #self.learned_input = None
     self.hidden_state_sz = hidden_state_sz
     self.done = False  # has the agent signalled Done? Stays False whenever 'Done' is not among the actions
     if self.gpu_id >= 0:
         with torch.cuda.device(self.gpu_id):
             self.model = self.model.cuda()
     
     ###################################################
     self.hidden = [
         gpuify(torch.zeros(1, self.hidden_state_sz), gpu_id),
         gpuify(torch.zeros(1, self.hidden_state_sz), gpu_id)
         ]
     self.probs = gpuify(torch.zeros((1, len(self.actions))), gpu_id)
     self.log_pi_batch = []
     self.v_batch = []
     self.entropies = []
     self.embedding = None
     self.i_act = None
     self.learned_input = None
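
Every snippet on this page calls a gpuify helper that is not shown in the examples. A minimal sketch of what such a helper could look like, inferred from how it is called above (an assumption, not the original implementation):

import torch

def gpuify(tensor, gpu_id):
    # Move `tensor` onto CUDA device `gpu_id`; leave it on the CPU when gpu_id < 0.
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            tensor = tensor.cuda()
    return tensor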
Example #2
 def reset_hidden(self):
     self.hidden_batch[0][-1] = gpuify(torch.zeros(1, self.hidden_state_sz),
                                       self.gpu_id)
     self.hidden_batch[1][-1] = gpuify(torch.zeros(1, self.hidden_state_sz),
                                       self.gpu_id)
     self.probs_batch[-1] = gpuify(torch.zeros((1, len(self.actions))),
                                   self.gpu_id)
Example #3
def value_loss(batch_out,
               last_v,
               exps,
               gpu_id=-1,
               gamma=0.99,
               nsteps=100,
               on_off=1):
    """接受batch经验的带mask的简单loss计算,没有熵,没有gae"""
    value_loss = 0

    v_batch = batch_out['value']
    td_target = list()

    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)

    threads = len(exps['action_idxs'][0])

    index_ = len(exps['rewards']) - 1
    if on_off:
        v2c = v_batch.gather(1, a_batch)
        last_a = torch.tensor(exps['last_action']).reshape(-1, 1)
        last_v = torch.tensor(last_v).reshape(threads, -1)
        R = last_v.gather(1, last_a).numpy().reshape(-1)
    else:
        v2c = v_batch.max(1)[0]
        R = last_v.reshape(threads, -1).max(1)

    v_batch = v_batch.gather(1, a_batch)

    v2c = v2c.detach().cpu().numpy().reshape(index_ + 1, -1)

    # Build TD targets walking the rollout backwards: the most recent `nsteps`
    # transitions bootstrap from R (derived from last_v); older transitions
    # restart from the critic's own value estimate `nsteps` steps ahead (v2c)
    # and accumulate `nsteps` discounted, masked rewards.
    flag = 0
    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        index_ -= 1
        if flag < nsteps:
            R = r + gamma * R * mask
        else:
            tau = index_ + nsteps
            R = v2c[tau]
            for i in range(nsteps):
                R = exps['rewards'][tau - i - 1] + gamma * R * mask

        td_target.append(R)
        flag += 1

    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)

    value_loss = F.smooth_l1_loss(v_batch, td_target.detach(), reduction='sum')
    total_loss = value_loss

    return dict(total_loss=total_loss, value_loss=value_loss)
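
A hedged sketch of how value_loss might be called, with toy shapes inferred from the indexing above (T timesteps, N threads, A actions). It assumes value_loss, F = torch.nn.functional, and the gpuify sketch above are in scope; the names and shapes below are illustrative assumptions, not taken from the original project:

import torch

T, N, A = 5, 2, 4                                                   # timesteps, threads, actions (toy values)
batch_out = {'value': torch.randn(T * N, A, requires_grad=True)}    # per-action values, flattened over time and threads
exps = {
    'action_idxs': [[0] * N for _ in range(T)],                     # chosen action per thread per step
    'last_action': [0] * N,                                         # action taken at the final step
    'rewards': [[1.0] * N for _ in range(T)],
    'masks': [[1.0] * N for _ in range(T)],                         # 0.0 where an episode ended
}
last_v = [[0.0] * A for _ in range(N)]                              # bootstrap values for the final state
losses = value_loss(batch_out, last_v, exps, gpu_id=-1)
losses['total_loss'].backward()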
Example #4
def a2c_loss(
    batch_out,
    last_v,
    exps,
    gpu_id=-1,
    gamma=0.99,  #discount factor for exps['rewards']
    nsteps=1,
):
    """接受batch经验的带mask的简单loss计算,没有熵,没有gae"""
    policy_loss = 0
    value_loss = 0

    pi_batch, v_batch = batch_out['policy'], batch_out['value']

    R = last_v
    td_target = list()

    index_ = len(exps['rewards']) - 1
    flag = 0

    v2c = v_batch.detach().cpu().numpy()

    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        index_ -= 1
        if flag < nsteps:
            R = r + gamma * R * mask
        else:
            tau = index_ + nsteps
            R = v2c[tau]
            for i in range(nsteps):
                R = exps['rewards'][tau - i - 1] + gamma * R * mask
        td_target.append(R)
        flag += 1

    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)

    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)

    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)

    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss

    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
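
The nsteps argument in a2c_loss (and in value_loss above) controls how many rewards are accumulated before bootstrapping from a value estimate. A standalone numeric sketch of that n-step recursion, with toy values assumed purely for illustration:

gamma, n = 0.99, 2
rewards = [1.0, 0.0, 1.0, 0.0]       # r_0 .. r_3 (toy values)
bootstrap_value = 0.5                # value estimate of the state reached after r_{n-1}

# R_0 = r_0 + gamma * r_1 + ... + gamma**(n-1) * r_{n-1} + gamma**n * bootstrap_value
R = bootstrap_value
for j in reversed(range(n)):
    R = rewards[j] + gamma * R
print(R)                             # 1.49005 = 1.0 + 0.99 * 0.0 + 0.99**2 * 0.5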
Example #5
def loss_with_entro(
        batch_out,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  #discount factor for exps['rewards']
        beta=1e-2,  #entropy regularization term
):
    """接受batch经验的带mask的简单loss计算,有动作熵"""
    policy_loss = 0
    value_loss = 0

    R = last_v
    td_target = list()
    pi_batch, v_batch = batch_out['policy'], batch_out['value']

    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        R = r + gamma * R * mask
        td_target.append(R)

    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)

    a_batch = torch.tensor(exps['action_idxs'])
    threads = a_batch.shape[1]
    a_batch = gpuify(a_batch.reshape(-1, 1), gpu_id)

    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)

    prob = F.softmax(pi_batch, dim=1)
    log_prob = torch.log(prob)  #F.log_softmax(pi_batch, dim = 1)
    log_pi_a = log_prob.gather(1, a_batch)
    entropies = -(log_prob * prob)

    policy_loss = (-log_pi_a *
                   advantage.detach()).sum() - beta * entropies.sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss

    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
Example #6
def basic_loss(
        #v_batch,
        #pi_batch,
        batch_out,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  #discount factor for exps['rewards']
):
    """接受batch经验的带mask的简单loss计算,没有熵,没有gae"""
    policy_loss = 0
    value_loss = 0

    pi_batch, v_batch = batch_out['policy'], batch_out['value']

    R = last_v
    td_target = list()

    for r, mask in zip(exps['rewards'][::-1], exps['masks'][::-1]):
        R = r + gamma * R * mask
        td_target.append(R)

    td_target = torch.FloatTensor(td_target[::-1]).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)

    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)

    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)

    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss

    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
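
A hedged sketch of how basic_loss might be called (loss_with_entro above accepts the same inputs). The shapes are assumptions inferred from the reshaping in the code; basic_loss, F = torch.nn.functional, and the gpuify sketch are assumed to be in scope:

import numpy as np
import torch

T, N, A = 5, 2, 4                                          # timesteps, threads, actions (toy values)
batch_out = {
    'policy': torch.randn(T * N, A, requires_grad=True),   # logits per flattened step
    'value': torch.randn(T * N, 1, requires_grad=True),    # state values per flattened step
}
exps = {
    'action_idxs': [[0] * N for _ in range(T)],
    'rewards': [[1.0] * N for _ in range(T)],
    'masks': [[1.0] * N for _ in range(T)],                 # 0.0 where an episode ended
}
last_v = np.zeros(N)                                        # bootstrap value per thread
losses = basic_loss(batch_out, last_v, exps, gpu_id=-1)
losses['total_loss'].backward()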
Example #7
def a3c_loss(
    done,
    v_batch,
    pi_batch,
    last_v,
    exps,
    gpu_id=-1,
    gamma=0.99,  #discount factor for exps['rewards']
    #tau = 1.00,#parameter for GAE
    #beta = 1e-2,#entropy regularization term
):

    R = 0.0 if done else last_v
    policy_loss = 0
    value_loss = 0
    td_target_lst = []
    for reward in exps['rewards'][::-1]:
        R = gamma * R + reward
        td_target_lst.append([R])
    td_target_lst.reverse()

    a_batch = torch.tensor(exps['action_idxs'])
    a_batch = gpuify(a_batch, gpu_id)

    td_target = torch.FloatTensor(td_target_lst)
    td_target = gpuify(td_target, gpu_id)

    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)

    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    # Sum both terms so total_loss is a scalar, matching the other loss functions on this page.
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss

    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
Example #8
def basic_loss_no_mask(
        v_batch,
        pi_batch,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  #discount factor for exps['rewards']
):
    """接受batch经验的不带mask的简单loss计算,没有熵,没有gae"""
    R = last_v
    policy_loss = 0
    value_loss = 0
    td_target_lst = []
    for reward in exps['rewards'][::-1]:
        R = gamma * R + reward
        td_target_lst.append([R])
    td_target_lst.reverse()

    a_batch = torch.tensor(exps['action_idxs']).reshape(-1, 1)
    a_batch = gpuify(a_batch, gpu_id)

    td_target = torch.FloatTensor(td_target_lst).reshape(-1, 1)
    td_target = gpuify(td_target, gpu_id)

    advantage = td_target - v_batch.detach()
    advantage = gpuify(advantage, gpu_id)

    pi_a = F.softmax(pi_batch, dim=1).gather(1, a_batch)
    policy_loss = (-torch.log(pi_a) * advantage.detach()).sum()
    value_loss = 0.5 * F.smooth_l1_loss(
        v_batch, td_target.detach(), reduction='sum')
    total_loss = policy_loss + value_loss

    return dict(total_loss=total_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
Example #9
def savn_loss(
        v_batch,
        log_pi_batch,
        entropies,
        last_v,
        exps,
        gpu_id=-1,
        gamma=0.99,  #discount factor for exps['rewards']
        tau=1.00,  #parameter for GAE
        beta=1e-2,  #entropy regularization term
):
    """savn使用的loss函数,经验是以list形式计算的,带熵和gae"""
    R = last_v
    v_batch.append(gpuify(torch.tensor(R), gpu_id))
    policy_loss = 0
    value_loss = 0
    gae = torch.zeros(1, 1)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            gae = gae.cuda()
    for i in reversed(range(len(exps['rewards']))):
        R = gamma * R + exps['rewards'][i]
        advantage = R - v_batch[i]
        value_loss = value_loss + 0.5 * advantage.pow(2)

        delta_t = (exps['rewards'][i] + gamma * v_batch[i + 1].detach() -
                   v_batch[i].detach())

        gae = gae * gamma * tau + delta_t

        policy_loss = (policy_loss - log_pi_batch[i] * gae -
                       beta * entropies[i])

    return dict(total_loss=policy_loss + 0.5 * value_loss,
                policy_loss=policy_loss,
                value_loss=value_loss)
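
savn_loss uses Generalized Advantage Estimation: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and gae = gamma * tau * gae + delta_t, accumulated backwards over the rollout. A standalone numeric sketch of that recursion, with toy values assumed purely for illustration:

import torch

rewards = [1.0, 0.0, 1.0]             # r_0 .. r_2 (toy values)
values = [0.5, 0.4, 0.3, 0.2]         # V(s_0) .. V(s_3); the last entry plays the role of last_v
gamma, tau = 0.99, 1.00

gae = torch.zeros(1, 1)
for i in reversed(range(len(rewards))):
    delta_t = rewards[i] + gamma * values[i + 1] - values[i]
    gae = gae * gamma * tau + delta_t
print(gae)                            # GAE advantage estimate for the first timestep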
Example #10
 def reset_hidden(self):
     self.hidden = [
         gpuify(torch.zeros(1, self.hidden_state_sz), self.gpu_id),
         gpuify(torch.zeros(1, self.hidden_state_sz), self.gpu_id)
         ]
     self.probs = gpuify(torch.zeros((1, len(self.actions))), self.gpu_id)