Example #1
def vinPredict(status, place, vin):
    # Epsilon-greedy exploration: with probability e return a random action in [0, 8).
    if np.random.random() < e:
        action = np.random.randint(8)
        return action
    # Build a batch of size 1: the agent's position coordinates and the stacked observation maps.
    S1 = torch.Tensor([place[0]]).expand(1, 1)
    S2 = torch.Tensor([place[1]]).expand(1, 1)
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1])
    config = myvindyn.Config()
    q1, q2 = vin(X, S1, S2, config)
    return q1
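
A minimal usage sketch for the function above. It assumes the module-level names the snippet relies on (np, torch, myvindyn, the gridworld module aliased as gw, and the exploration rate e) live in the same module, as in the later examples; the gridworld import line and the value of e are guesses.

import numpy as np
import torch

import myvindyn
import gridworld as gw   # assumed import; the snippets only ever use the alias gw

e = 0.1                                             # exploration rate read by vinPredict
env = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=False)
status, place, reward, over = env.reset()

vin = myvindyn.VINinfer(myvindyn.Config())          # untrained VIN, just for illustration
q1 = vinPredict(status, place, vin)                 # first VIN output, or a random action index with prob. e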
Example #2
def vinPolicy(status, place):
    # Epsilon-greedy exploration: with probability e return a random action in [0, 8).
    if np.random.random() < e:
        action = np.random.randint(8)
        return action
    # Build a batch of size 1 for the global VINinfer model.
    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    X = torch.Tensor(status).expand(1, len(status), status[0].shape[0], status[0].shape[1])
    config = myvindyn.Config()
    q1, q2 = VINinfer(X, S1, S2, config)
    # Greedy action: index of the largest entry of q2 along the action dimension.
    _, action = torch.max(q2, dim=1)
    action = int(action)
    assert 0 <= action < 9
    return action
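
For reference, torch.max(q2, dim=1) returns a (values, indices) pair, which is why only the second element is kept above; an equivalent greedy step using argmax would be:

# Equivalent to `_, action = torch.max(q2, dim=1)` followed by int(action):
action = int(q2.argmax(dim=1))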
Example #3
def update(experience, vin, oldvin, p=False):
    # Each experience entry is (action, state, place, next_state, next_place, reward, over).
    X = []       # current observation maps
    S1 = []      # current position, first coordinate
    S2 = []      # current position, second coordinate
    oldS1 = []   # next position, first coordinate
    oldS2 = []   # next position, second coordinate
    oldX = []    # next observation maps (fed to the target network)
    action = []
    Y = []       # immediate rewards
    for j in range(myvindyn.Config().batch_size):  # sample a minibatch from the replay buffer
        x = np.random.randint(len(experience))
        while experience[x][6]:  # skip terminal transitions
            x = np.random.randint(len(experience))
        Y.append(experience[x][5])
        action.append(experience[x][0])
        X.append(experience[x][1])
        oldX.append(experience[x][3])
        S1.append(experience[x][2][0])
        S2.append(experience[x][2][1])
        oldS1.append(experience[x][4][0])
        oldS2.append(experience[x][4][1])

    X = torch.from_numpy(np.array(X)).float()  # do not change this to torch.Tensor(X).float()
    S1 = torch.from_numpy(np.array(S1)).float()
    S2 = torch.from_numpy(np.array(S2)).float()

    oldX = torch.from_numpy(np.array(oldX)).float()
    oldS1 = torch.from_numpy(np.array(oldS1)).float()
    oldS2 = torch.from_numpy(np.array(oldS2)).float()

    action = torch.from_numpy(np.array(action)).unsqueeze(dim=1).long()
    Y = torch.from_numpy(np.array(Y)).float()

    # Target network: max_a' Q_old(s', a'), detached so no gradient flows into oldvin.
    oldoutputs, _ = oldvin(oldX, oldS1, oldS2, myvindyn.Config())
    oldoutputs = oldoutputs.detach()
    Qmax = torch.max(oldoutputs, dim=1)[0].squeeze()

    # Online network: Q(s, a) for the actions that were actually taken.
    outputs, _ = vin(X, S1, S2, myvindyn.Config())
    Qvalue = outputs.gather(index=action, dim=1).squeeze()

    # One-step TD target; the gradient of the squared TD error is injected via backward().
    TDtarget = Y + gamma * Qmax
    bellman_error = -(TDtarget - Qvalue)

    optimizer = optim.RMSprop(vin.parameters(), lr=myvindyn.Config().lr, eps=1e-6)
    optimizer.zero_grad()
    Qvalue.backward(bellman_error.data)
    optimizer.step()

    loss = bellman_error.abs().mean().item()  # mean absolute TD error, returned for monitoring
    if p:
        print(outputs[0], Qvalue[0], TDtarget[0], Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(), int(S1[0].item()), int(S2[0].item()))
    return loss
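
The function above is a DQN-style update: the target is y = r + gamma * max_a' Q_old(s', a'), and the gradient of the squared TD error is injected directly through Qvalue.backward(bellman_error.data). Below is a sketch of an equivalent version with an explicit loss; the function name and the Huber loss choice are assumptions, not the original training code.

import torch
import torch.nn.functional as F

def td_loss(vin, oldvin, X, S1, S2, oldX, oldS1, oldS2, action, Y, gamma, config):
    # Explicit-loss variant of update(); a sketch, not the original code.
    with torch.no_grad():                       # target network: no gradients flow into oldvin
        old_q, _ = oldvin(oldX, oldS1, oldS2, config)
        q_max = old_q.max(dim=1)[0].squeeze()
    q, _ = vin(X, S1, S2, config)
    q_taken = q.gather(1, action).squeeze()     # Q(s, a) for the sampled actions
    target = Y + gamma * q_max                  # one-step TD target
    return F.smooth_l1_loss(q_taken, target)    # Huber loss instead of raw TD-error injection

Calling Qvalue.backward(-(TDtarget - Qvalue)) as in the original produces exactly the gradient of 0.5 * (TDtarget - Qvalue)**2 summed over the batch, with the target treated as a constant; the explicit-loss form just makes that objective visible and lets a standard loss such as Huber be swapped in.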
Example #4
def evaluate(env, policy, iters=1000, show=False):  # signature inferred from the call sites below
    total_reward = 0
    success = 0.0
    time2 = time.time()
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        Tmax = 100
        while over == False and t < Tmax:
            action = policy(status, place)
            if iters % 100 == 0 and show:
                print(action)
                env.plot()

            status, place, reward, over = env.step(action)

            t += 1
        total_reward += env.total_reward + 0.0
        if env.total_reward > Tmax * env.step_reward:
            success += 1
        if i % 100 == 0:
            print(i)
    return total_reward / iters, success / iters, time.time() - time2
device = 0


VINinfer = myvindyn.VINinfer(myvindyn.Config())
#VINinfer.load_state_dict(torch.load("vin_8x8.pth", map_location='cpu'))
print(VINinfer)
oldVINinfer = myvindyn.VINinfer(myvindyn.Config())    # target network, periodically synced
oldVINinfer.load_state_dict(VINinfer.state_dict())
grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=False)

#print(evaluate(grid, vinPolicy, 1000))
#print(evaluate(grid, randomWalk))
maxStep = 5000000
episodes = 20000
gamma = 0.99            # discount factor for the TD target
Tmax = 100              # maximum steps per episode
replay = []             # replay buffer of experience tuples
max_exp = 100000        # maximum number of stored transitions
learning_begin = 10000  # transitions collected before learning starts
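
The names above (replay, max_exp, learning_begin, episodes, gamma, Tmax) suggest a replay-buffer training loop built around update() and vinPolicy(). A minimal sketch under those assumptions; the experience tuple layout follows the comment in Example #3, while the initial exploration rate, its decay, and the target-network sync interval are guesses rather than the original training script.

e = 1.0                                                       # assumed initial exploration rate
for episode in range(episodes):
    status, place, reward, over = grid.reset()
    t = 0
    while not over and t < Tmax:
        action = vinPolicy(status, place)                     # epsilon-greedy action from Example #2
        next_status, next_place, reward, over = grid.step(action)
        replay.append((action, status, place, next_status, next_place, reward, over))
        if len(replay) > max_exp:                             # keep the buffer bounded
            replay.pop(0)
        if len(replay) > learning_begin:
            update(replay, VINinfer, oldVINinfer)             # one gradient step, as in Example #3
        status, place = next_status, next_place
        t += 1
    if episode % 100 == 0:                                    # assumed target-network sync interval
        oldVINinfer.load_state_dict(VINinfer.state_dict())
    e = max(0.05, e * 0.999)                                  # assumed epsilon decay schedule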
Example #5
def evaluate(env, policy, iters=1000, show=False):  # signature inferred from the call sites below
    total_reward = 0
    success = 0.0
    time2 = time.time()
    for i in range(iters):
        status, place, reward, over = env.reset()
        t = 0
        Tmax = 100
        while over == False and t < Tmax:
            action = policy(status, place)
            if iters % 100 == 0 and show:
                print(action)
                env.plot()

            status, place, reward, over = env.step(action)

            t += 1
        total_reward += env.total_reward + 0.0
        if env.total_reward > Tmax * env.step_reward:
            success += 1
        if i % 100 == 0:
            print(i)
    return total_reward / iters, success / iters, time.time() - time2


model = sys.argv[1]
VINdyn = myvindyn.VINdyn(myvindyn.Config())
VINdyn.load_state_dict(torch.load(model, map_location='cpu'))
grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True)
e = 0
print(VINdyn)
print(evaluate(grid, vinPolicy, iters=1000))
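
Here e = 0 disables the epsilon-greedy branch in vinPolicy, so the run reports the purely greedy policy's average reward, success rate, and wall-clock time on an 8x8 grid with moving obstacles.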