def vinPredict(status, place, vin):
    """Run ``vin`` on a single observation and return its first output.

    With probability ``e`` (the module-level exploration rate) the network
    is skipped and a random integer in ``[0, 8)`` is returned instead.

    NOTE(review): the two paths return different kinds of value (an int on
    the exploration path, the network's first output otherwise). This is
    preserved from the original; callers presumably run with ``e == 0`` --
    confirm.

    Args:
        status: Sequence of equally shaped arrays; ``status[0].shape[0]`` and
            ``status[0].shape[1]`` give the two trailing dimensions of the
            batched network input.
        place: Indexable pair; ``place[0]`` and ``place[1]`` are fed to the
            network as two separate ``(1, 1)`` tensors.
        vin: Callable invoked as ``vin(X, S1, S2, config)`` that returns a
            pair of outputs.

    Returns:
        The first element of the pair returned by ``vin``, or a random int
        in ``[0, 8)`` on the exploration path.
    """
    if np.random.random() < e:
        return np.random.randint(8)

    S1 = torch.Tensor([place[0]]).expand(1, 1)
    S2 = torch.Tensor([place[1]]).expand(1, 1)
    # Prepend a batch dimension of 1 in front of the stacked status planes.
    X = torch.Tensor(status).expand(
        1, len(status), status[0].shape[0], status[0].shape[1])

    # Only the first output is needed; the second one is discarded.
    q1, _ = vin(X, S1, S2, cnn.Config())
    return q1
def vinPolicy(status, place):
    """Pick an action epsilon-greedily using the module-level ``CNN``.

    With probability ``e`` (the module-level exploration rate) a random
    action in ``[0, 8)`` is returned. Otherwise the index of the largest
    entry (along dim 1) of the network's second output is returned.

    Args:
        status: Sequence of equally shaped arrays; ``status[0].shape[0]`` and
            ``status[0].shape[1]`` give the two trailing dimensions of the
            batched network input.
        place: Indexable pair; ``place[0]`` and ``place[1]`` are fed to the
            network as two separate one-element tensors.

    Returns:
        The chosen action as a Python ``int``.

    Raises:
        AssertionError: If the greedy action falls outside ``[0, 9)``.
    """
    if np.random.random() < e:
        return np.random.randint(8)

    S1 = torch.Tensor([place[0]])
    S2 = torch.Tensor([place[1]])
    # Prepend a batch dimension of 1 in front of the stacked status planes.
    X = torch.Tensor(status).expand(
        1, len(status), status[0].shape[0], status[0].shape[1])

    # Action selection never back-propagates, so skip building the autograd
    # graph; the selected index is unaffected.
    with torch.no_grad():
        _, q2 = CNN(X, S1, S2, cnn.Config())
        _, best = torch.max(q2, dim=1)

    action = int(best)
    # Sanity bound kept from the original (note it is 9, while the random
    # branch above draws from 8 actions).
    assert 0 <= action < 9
    return action
def _sample_transitions(experience, batch_size):
    """Draw ``batch_size`` non-terminal transitions uniformly with replacement."""
    if len(experience) == 0:
        raise ValueError("cannot sample from an empty replay buffer")
    # Rejection sampling below would never terminate if every stored
    # transition is terminal, so fail loudly instead. ``all`` stops at the
    # first non-terminal entry, so this is cheap in the normal case.
    if all(entry[6] == True for entry in experience):  # noqa: E712
        raise ValueError("replay buffer holds only terminal transitions")

    batch = []
    for _ in range(batch_size):
        idx = np.random.randint(len(experience))
        # Terminal transitions (``over`` flag at index 6) are skipped.
        while experience[idx][6] == True:  # noqa: E712
            idx = np.random.randint(len(experience))
        batch.append(experience[idx])
    return batch


def update(experience, vin, oldvin, p=False, optimizer=None):
    """Perform one Q-learning gradient step on ``vin`` from replayed data.

    Each entry of ``experience`` is indexed as
    ``(action, state, place, next_state, next_place, reward, over)``.
    A batch of ``cnn.Config().batch_size`` non-terminal entries is sampled,
    the TD target ``reward + gamma * max_a Q_old(next_state, a)`` is computed
    with ``oldvin``, and ``vin`` is stepped along the gradient of the squared
    Bellman error.

    Args:
        experience: Replay buffer (sequence of transitions as described
            above).
        vin: Network being trained; called as ``vin(X, S1, S2, config)`` and
            expected to return a pair whose first element is a 2-D tensor of
            per-action values (it is indexed with ``gather`` along dim 1).
        oldvin: Target network, called the same way on the next-state batch.
        p: If true, print diagnostics for the first sample and plot it via
            the module-level ``grid``.
        optimizer: Optional optimizer to reuse across calls. When ``None`` a
            fresh ``RMSprop`` over ``vin.parameters()`` is created, as the
            original code did on every call. A fresh optimizer has no
            accumulated state, so pass a persistent one to keep RMSprop's
            running statistics between updates.

    Returns:
        A detached 0-dim tensor holding ``0.5 * mean(bellman_error ** 2)``
        for the sampled batch.
        NOTE(review): the original returned an undefined name ``loss``;
        confirm this is the quantity callers expect.

    Raises:
        ValueError: If ``experience`` is empty or contains only terminal
            transitions.
    """
    config = cnn.Config()
    batch = _sample_transitions(experience, config.batch_size)

    def to_float_tensor(values):
        # Go through numpy on purpose: the original warns not to replace
        # this with torch.Tensor(values).float().
        return torch.from_numpy(np.array(values)).float()

    X = to_float_tensor([entry[1] for entry in batch])
    S1 = to_float_tensor([entry[2][0] for entry in batch])
    S2 = to_float_tensor([entry[2][1] for entry in batch])
    next_X = to_float_tensor([entry[3] for entry in batch])
    next_S1 = to_float_tensor([entry[4][0] for entry in batch])
    next_S2 = to_float_tensor([entry[4][1] for entry in batch])
    Y = to_float_tensor([entry[5] for entry in batch])
    # Column vector of action indices, as required by ``gather`` below.
    action = torch.from_numpy(
        np.array([entry[0] for entry in batch])).unsqueeze(dim=1).long()

    # The target network only supplies constants for the TD target, so no
    # autograd graph is needed for it.
    with torch.no_grad():
        next_outputs, _ = oldvin(next_X, next_S1, next_S2, config)
        Qmax = torch.max(next_outputs, dim=1)[0]

    outputs, _ = vin(X, S1, S2, config)
    # Squeeze only the gathered column so a batch of size 1 keeps its
    # batch dimension.
    Qvalue = outputs.gather(index=action, dim=1).squeeze(dim=1)

    TDtarget = Y + gamma * Qmax
    # d/dQ of 0.5 * (Q - target)^2; detached so it acts as a fixed upstream
    # gradient for Qvalue.backward().
    bellman_error = (Qvalue - TDtarget).detach()

    if optimizer is None:
        # Optimize the network that was passed in, not a module-level one.
        optimizer = optim.RMSprop(vin.parameters(), lr=config.lr, eps=1e-6)
    optimizer.zero_grad()
    Qvalue.backward(bellman_error)
    optimizer.step()

    if p:
        print(outputs[0], Qvalue[0], TDtarget[0], Y[0].cpu().numpy())
        grid.plot2(X[0].cpu().numpy(), int(S1[0].item()), int(S2[0].item()))

    return 0.5 * bellman_error.pow(2).mean()
env.plot() status, place, reward, over = env.step(action) t += 1 total_reward += env.total_reward + 0.0 if env.total_reward > Tmax * env.step_reward: success += 1 if i % 100 == 0: print(i) return total_reward / iters, success / iters, time.time() - time2 device = 0 CNN = cnn.CNN(cnn.Config()) #CNN.load_state_dict(torch.load("vin_8x8.pth",map_location='cpu')) print(CNN) oldCNN = cnn.CNN(cnn.Config()) oldCNN.load_state_dict(CNN.state_dict()) grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=False) #print(evaluate(grid,vinPolicy,1000)) #print(evaluate(grid,randomWalk)) maxStep = 5000000 episodes = 20000 gamma = 0.99 Tmax = 100 replay = [] max_exp = 100000 learning_begin = 10000
total_reward = 0 success = 0.0 time2 = time.time() for i in range(iters): status, place, reward, over = env.reset() t = 0 Tmax = 100 while over == False and t < Tmax: action = policy(status, place) if iters % 100 == 0 and show: print(action) env.plot() status, place, reward, over = env.step(action) t += 1 total_reward += env.total_reward + 0.0 if env.total_reward > Tmax * env.step_reward: success += 1 if i % 100 == 0: print(i) return total_reward / iters, success / iters, time.time() - time2 model = sys.argv[1] CNN = cnn.CNN(cnn.Config()) CNN.load_state_dict(torch.load(model, map_location='cpu')) grid = gw.GridWorld2_8dir(8, 8, nobstacle=4, moving=True) e = 0 print(CNN) print(evaluate(grid, cnnPolicy, iters=1000))