# NOTE(review): this line is a multi-line DQN training-loop body collapsed onto one
# physical line — original newlines/indentation were lost, so everything after the
# first '#' below is currently dead comment text. The code must be re-split before it
# can run; nesting of the `if (len(memory) < buffer):` branch and of the
# `break`/`continue` statements cannot be recovered from this view, so only the
# intended flow is documented here.
#
# Intended per-step flow (as far as the visible tokens show):
#   1. action_spa(action) expands the scalar action into a spatial map, which is
#      concatenated with the state along dim 2 (v_sa) — presumably state is
#      11x11x5 given the "batchx(11x11x5=605)" comment; TODO confirm shapes.
#   2. getReward(new_state) is, per the inline comment, used only to detect episode
#      end here (reward != -1 terminates), not for learning in this section.
#   3. A 6-tuple (state, spatial action, state-action, raw action, next state,
#      reward) is pushed into the replay memory.
#   4. Episodes are also capped at 32 steps (`if step > 31: break`).
#   5. Learning only starts once the replay buffer holds at least `buffer` items;
#      until then the loop `continue`s after advancing `state = new_state`.
#   6. Once filled, BATCH_SIZE transitions are sampled and stacked into
#      state_batch (batch x 11 x 11 x 5); the loop body continues past the
#      visible end of this chunk (action_batch line is commented out).
# Uses the deprecated torch.autograd.Variable API — this is legacy pre-0.4 PyTorch.
step += 1 #spatialize the action and then concatenate with state action_spatio = action_spa(action) v_new_state = Variable( torch.from_numpy(new_state)) # variable of new state v_action_spatio = Variable(torch.from_numpy( action_spatio)) # variable of spatilized action v_sa = torch.cat((v_state, v_action_spatio), 2) # variable of state-action reward = getReward( new_state ) # reward is not used in section 3.1, here is only used to end an episod memory.push(v_state.data, v_action_spatio.data, v_sa.data, action, v_new_state.data, reward) if (len(memory) < buffer): #if buffer not filled, add to it state = new_state if reward != -1: #if reached terminal state, update game status break if step > 31: break else: continue #print('************** starting here ****************') transitions = memory.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) # tuple batchx(11x11x5=605) state_batch = Variable(torch.stack(batch.state)) #batchx11x11x5 # action_batch =batch.action
# NOTE(review): a hardware-in-the-loop episode collapsed onto one physical line —
# original newlines/indentation were lost, so everything after the first '#' below
# is currently dead comment text. The boundary between the scheduled `iterate` job
# body and the post-`sched.start()` teardown code cannot be recovered from this
# view; intended flow only, reconstructed from the visible tokens:
#   1. Inside the periodic job: write `input_voltage` to an NI-DAQ-style analog
#      output task (`amplifier.WriteAnalogScalarF64`, 10 s timeout per the 10.0
#      arg — TODO confirm against NI-DAQmx signature), record the laser distance,
#      log one line per 0.1 s tick, and after `n` ticks shut the scheduler down
#      with wait=False so `sched.start()` (blocking) returns.
#   2. After the BlockingScheduler returns: push (state, action, next_state,
#      reward) transitions for every adjacent pair in state_list into the replay
#      memory, then stop both DAQ tasks.
#   3. Plot measured vs. desired trajectory and save to result/epoch_%d.png.
#      The error metric compares `desired` against laser[10:-10] — presumably
#      `desired` trims 10 samples off each end of `desired_traj`; verify lengths
#      match, otherwise the subtraction will broadcast-fail.
# NOTE(review): `sched.shutdown(wait=False)` inside the job plus reuse of `i` as
# both the tick counter and the for-loop index looks fragile — confirm intent.
amplifier.WriteAnalogScalarF64(1, 10.0, input_voltage, None) laser.append(dist) print('second %.1f -- voltage %.1f -- distance %.1f -- reward %.2f\n' % ((i + 1) * timeout, input_voltage, dist, reward)) i = i + 1 if i == n: sched.shutdown(wait=False) sched = BlockingScheduler() sched.add_job(iterate, 'interval', seconds=0.1) sched.start() for i in range(len(state_list) - 1): memory.push(state_list[i], action_list[i], state_list[i + 1], torch.Tensor([reward_list[i]])) amplifier.StopTask() laser_sensor.StopTask() # ======================================================================================== # plot and save readings laser = np.asarray(laser, dtype='float') plt.plot(range(n), laser) plt.plot(range(n), desired_traj) plt.legend(['laser output', 'desired output']) error = np.abs(desired - laser[10:-10]).sum() / len(desired) plt.title('error: %f' % error) plt.savefig('result/epoch_%d.png' % epoch) plt.close()
# NOTE(review): a gridworld Q-learning episode loop collapsed onto one physical
# line — original newlines/indentation were lost, so everything after the first
# '#' below is currently dead comment text, and the nesting of the
# `if (len(memory) < buffer):` / `break` / `continue` branch cannot be recovered
# from this view. The chunk is also cut off mid-loop (ends at
# `qval_batch = model(state_batch)`). Intended per-step flow from the visible
# tokens:
#   1. Flatten the state to (1, -1), run the Q-network, and pick an action
#      epsilon-greedily over 4 discrete actions (np.random vs argmax of qval).
#   2. Apply it via makeMove, observe reward via getReward; reward == -1 appears
#      to mean "non-terminal" (episode breaks when reward != -1 — TODO confirm
#      against getReward's contract).
#   3. Push the (state, action, next_state, reward) transition into replay memory.
#   4. Until the memory holds `buffer` items, just advance `state = new_state`
#      and continue — no learning yet.
#   5. Once filled, sample BATCH_SIZE transitions, zip them into a Transition of
#      batched fields, build state/action/new_state/reward batch Variables, and
#      compute non_final_mask = (reward_batch == -1); the learning step continues
#      beyond the visible end of this chunk.
# Uses the deprecated torch.autograd.Variable API — legacy pre-0.4 PyTorch.
step = 0 #while game still in progress while (status == 1): v_state = Variable(torch.from_numpy(state)).view(1, -1) qval = model(v_state) if (np.random.random() < epsilon): #choose random action action = np.random.randint(0, 4) else: #choose best action from Q(s,a) values action = np.argmax(qval.data) #Take action, observe new state S' new_state = makeMove(state, action) step += 1 v_new_state = Variable(torch.from_numpy(new_state)).view(1, -1) #Observe reward reward = getReward(new_state) memory.push(v_state.data, action, v_new_state.data, reward) if (len(memory) < buffer): #if buffer not filled, add to it state = new_state if reward != -1: #if reached terminal state, update game status break else: continue transitions = memory.sample(BATCH_SIZE) batch = Transition(*zip(*transitions)) state_batch = Variable(torch.cat(batch.state)) action_batch = Variable(torch.LongTensor(batch.action)).view(-1, 1) new_state_batch = Variable(torch.cat(batch.new_state)) reward_batch = Variable(torch.FloatTensor(batch.reward)) non_final_mask = (reward_batch == -1) #Let's run our Q function on S to get Q values for all possible actions qval_batch = model(state_batch)