Example #1
def vinPredict(status, place, vin):
    # Epsilon-greedy: with probability e return a random action.
    if np.random.random() < e:
        action = np.random.randint(8)
        return action
    # Current position as 1x1 tensors, observation maps as a 1xCxHxW batch.
    S1 = torch.Tensor([place[0]]).expand(1, 1)
    S2 = torch.Tensor([place[1]]).expand(1, 1)
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                    status[0].shape[1])
    config = cnn.Config()
    q1, q2 = vin(X, S1, S2, config)
    # Return the raw network output; the caller is responsible for picking an action.
    return q1
Example #2
def vinPolicy(status, place):
    # Epsilon-greedy: with probability e return a random action.
    if np.random.random() < e:
        action = np.random.randint(8)
        return action
    # Current position as tensors, observation maps as a 1xCxHxW batch.
    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0],
                                    status[0].shape[1])
    config = cnn.Config()
    q1, q2 = CNN(X, S1, S2, config)
    # Greedy action: index of the largest Q-value.
    _, action = torch.max(q2, dim=1)
    action = int(action)
    assert 0 <= action < 9
    return action
Example #3
def update(experience, vin, oldvin, p=False):
    # Each experience entry is a tuple:
    # (action, state, place, next_state, next_place, reward, over)
    X = []
    S1 = []
    S2 = []
    oldS1 = []  # position in the next state
    oldS2 = []  # position in the next state
    oldX = []
    action = []
    Y = []  # rewards
    for j in range(cnn.Config().batch_size):  # sample a minibatch from the replay buffer
        x = np.random.randint(len(experience))
        while experience[x][6]:  # skip terminal transitions
            x = np.random.randint(len(experience))
        Y.append(experience[x][5])
        action.append(experience[x][0])
        X.append(experience[x][1])
        oldX.append(experience[x][3])
        S1.append(experience[x][2][0])
        S2.append(experience[x][2][1])
        oldS1.append(experience[x][4][0])
        oldS2.append(experience[x][4][1])

    X = torch.from_numpy(
        np.array(X)).float()  # do not change it to torch.Tensor(X).float()
    S1 = torch.from_numpy(np.array(S1)).float()
    S2 = torch.from_numpy(np.array(S2)).float()

    oldX = torch.from_numpy(np.array(oldX)).float()
    oldS1 = torch.from_numpy(np.array(oldS1)).float()
    oldS2 = torch.from_numpy(np.array(oldS2)).float()

    action = torch.from_numpy(np.array(action)).unsqueeze(dim=1).long()
    Y = torch.from_numpy(np.array(Y)).float()

    # Target network: max Q-value of the next state, detached from the graph.
    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, cnn.Config())
    oldoutputs = oldoutputs.detach()
    Qmax = (torch.max(oldoutputs, dim=1)[0]).squeeze()

    # Online network: Q-value of the action actually taken.
    outputs, _ = vin(X, S1, S2, cnn.Config())
    Qvalue = outputs.gather(index=action, dim=1).squeeze()

    TDtarget = Y + gamma * Qmax
    bellman_error = -(TDtarget - Qvalue)

    optimizer = optim.RMSprop(vin.parameters(), lr=cnn.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    # Gradient-injection trick: feed the TD error in as the incoming gradient of Qvalue.
    Qvalue.backward(bellman_error.data)
    optimizer.step()

    loss = bellman_error.detach().abs().mean()  # mean absolute TD error
    if p:
        print(outputs[0], Qvalue[0], TDtarget[0], Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(), int(S1[0].item()), int(S2[0].item()))
    return loss
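
The backward call above relies on PyTorch's gradient-injection form: passing the TD error as the incoming gradient of Qvalue produces the same parameter update as minimizing half the squared Bellman error. A minimal equivalent sketch, assuming the same tensors as in update() (this explicit-loss variant is not the author's original code):

# Explicit-loss version of the same step: d(loss)/d(Qvalue) = -(TDtarget - Qvalue),
# which is exactly the bellman_error fed to Qvalue.backward() above.
loss = 0.5 * (TDtarget.detach() - Qvalue).pow(2).sum()
optimizer.zero_grad()
loss.backward()
optimizer.step()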
Example #4
                env.plot()

            status, place, reward, over = env.step(action)

            t += 1
        total_reward += env.total_reward + 0.0
        if env.total_reward > Tmax * env.step_reward:
            success += 1
        if i % 100 == 0:
            print(i)
    return total_reward / iters, success / iters, time.time() - time2


device = 0

CNN = cnn.CNN(cnn.Config())
#CNN.load_state_dict(torch.load("vin_8x8.pth",map_location='cpu'))
print(CNN)
oldCNN = cnn.CNN(cnn.Config())
oldCNN.load_state_dict(CNN.state_dict())
grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=False)

#print(evaluate(grid,vinPolicy,1000))
#print(evaluate(grid,randomWalk))
maxStep = 5000000
episodes = 20000
gamma = 0.99
Tmax = 100
replay = []
max_exp = 100000
learning_begin = 10000
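
The globals above set up DQN-style training: transitions go into the replay buffer, update() samples minibatches from it, and oldCNN acts as the frozen target network that is periodically resynchronized from CNN. A minimal training-loop sketch under those assumptions (sync_every is a hypothetical sync interval, not taken from the original code):

step = 0
sync_every = 1000  # hypothetical target-network sync interval
for episode in range(episodes):
    status, place, reward, over = grid.reset()
    t = 0
    while not over and t < Tmax:
        action = vinPolicy(status, place)
        next_status, next_place, reward, over = grid.step(action)
        # store (action, state, place, next_state, next_place, reward, over)
        replay.append((action, status, place, next_status, next_place, reward, over))
        if len(replay) > max_exp:
            replay.pop(0)
        status, place = next_status, next_place
        t += 1
        step += 1
        if step > learning_begin:
            update(replay, CNN, oldCNN)
        if step % sync_every == 0:
            oldCNN.load_state_dict(CNN.state_dict())  # refresh the target network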
Example #5
def evaluate(env, policy, iters, show=False):
    total_reward = 0
    success = 0.0
    time2 = time.time()
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        Tmax = 100
        while not over and t < Tmax:
            action = policy(status, place)
            if iters % 100 == 0 and show:
                print(action)
                env.plot()

            status, place, reward, over = env.step(action)

            t += 1
        total_reward += env.total_reward + 0.0
        if env.total_reward > Tmax * env.step_reward:
            success += 1
        if i % 100 == 0:
            print(i)
    return total_reward / iters, success / iters, time.time() - time2


model = sys.argv[1]
CNN = cnn.CNN(cnn.Config())
CNN.load_state_dict(torch.load(model, map_location='cpu'))
grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True)
e = 0
print(CNN)
print(evaluate(grid, cnnPolicy, iters=1000))