def _observation_to_state(ob):
    """Flatten a TORCS observation into the 1-D state vector fed to the networks.

    Concatenates, in order: angle, track, trackPos, speedX, speedY, speedZ,
    wheelSpinVel scaled by 1/100, and rpm.
    """
    return np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                      ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))


def main():
    """Train a DDPG agent on TORCS, or replay a saved actor when training is off.

    Builds the environment, replay buffer, actor/critic networks with their
    target copies, exploration noise processes, optimizers and a TensorBoard
    writer. With ``train_indicator`` set to False the saved actor is loaded and
    driven for one episode; otherwise the agent is trained for ``max_episode``
    episodes and the actor is saved to ``modelPATH``. The total step count and
    the wall-clock runtime are printed at the end of training.

    Reads the module-level settings ``state_dim``, ``action_dim``, ``lr_mu``,
    ``lr_q``, ``max_episode``, ``max_step`` and ``train_start_size`` and
    increments the module-level counter ``iteration``.

    Returns:
        0 after an evaluation run; None after a training run.
    """
    global iteration
    s_time = timeit.default_timer()

    env = TorcsEnv(vision=False, throttle=True, gear_change=False)
    memory = ReplayBuffer()
    train_indicator = True
    modelPATH = os.path.join('.', "models", 'E0011.pt')

    q, q_target = QNet(state_dim, action_dim), QNet(state_dim, action_dim)
    q_target.load_state_dict(q.state_dict())
    mu, mu_target = MuNet(state_dim), MuNet(state_dim)
    mu_target.load_state_dict(mu.state_dict())

    steer_noise = OUN(np.zeros(1), theta=0.6)
    accel_noise = OUN(np.zeros(1), theta=0.6)

    mu_optimizer = optim.Adam(mu.parameters(), lr=lr_mu)
    q_optimizer = optim.Adam(q.parameters(), lr=lr_q)

    # tensorboard writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = os.path.join("logs", "ddpg_torch", current_time + 'E0011t')
    writer = SummaryWriter(log_dir)

    try:
        # Log the critic graph once, traced with random sample inputs.
        samplestate = torch.rand(1, 29)
        sampleaction = torch.rand(1, 2)
        writer.add_graph(q, (samplestate, sampleaction))

        if not train_indicator:
            # Evaluation: load the saved actor and drive one episode without noise.
            mu = torch.load(modelPATH)
            mu.eval()
            ob = env.reset()
            score = 0
            for n_step in range(100000):
                s_t = _observation_to_state(ob)
                a_t = mu(torch.from_numpy(s_t.reshape(1, -1)).float()).detach().numpy()
                ob, r_t, done, _ = env.step(a_t[0])
                score += r_t
                if done:
                    print("score:", score)
                    break
            env.end()
            return 0

        # torch.save does not create missing directories.
        os.makedirs(os.path.dirname(modelPATH), exist_ok=True)

        for n_epi in range(max_episode):
            print("Episode : " + str(n_epi) + " Replay Buffer " + str(memory.size()))

            if np.mod(n_epi, 3) == 0:
                # relaunch TORCS every 3 episodes because of the memory leak error
                ob = env.reset(relaunch=True)
            else:
                ob = env.reset()

            a_t = np.zeros([1, action_dim])
            s_t = _observation_to_state(ob)
            score = 0

            q_value_writer(q, mu, s_t, writer, 'Episode Start Q value')
            q_value_writer(q_target, mu_target, s_t, writer, 'Episode Start target Q value')

            for n_step in range(max_step):
                a_origin = mu(torch.from_numpy(s_t.reshape(1, -1)).float())
                if train_indicator:
                    # Add exploration noise while training. The noise is applied
                    # unscaled (no epsilon decay factor).
                    sn = steer_noise()
                    an = accel_noise()
                    a_raw = a_origin.detach().numpy()
                    a_t[0][0] = np.clip(a_raw[0][0] + sn, -1, 1)  # steer range
                    a_t[0][1] = np.clip(a_raw[0][1] + an, 0, 1)   # accel range
                    # record noise movement
                    if iteration % 10 == 0:
                        writer.add_scalar('Steer noise', sn, iteration)
                        writer.add_scalar('Accel_noise', an, iteration)
                else:
                    # Not reachable while the evaluation path returns early above;
                    # kept so the loop stays correct if that changes.
                    a_t = a_origin.detach().numpy()

                ob, r_t, done, _ = env.step(a_t[0])
                score += r_t
                s_t1 = _observation_to_state(ob)

                # a_t is overwritten in place on every step, and a_t[0] is only a
                # view of it. Store a copy so each buffered transition keeps its
                # own action even if ReplayBuffer holds on to the object it is
                # given (its implementation is not visible here).
                memory.put((s_t, a_t[0].copy(), r_t, s_t1, done))

                s_temp = copy.deepcopy(s_t)  # for end q value log
                s_t = s_t1

                if train_indicator and memory.size() > train_start_size:
                    train(mu, mu_target, q, q_target, memory, q_optimizer, mu_optimizer, writer)
                    soft_update(mu, mu_target)
                    soft_update(q, q_target)

                iteration += 1

                if done:
                    q_value_writer(q, mu, s_temp, writer, 'Episode End Q value')
                    q_value_writer(q_target, mu_target, s_temp, writer, 'Episode End target Q value')
                    break

            print("TOTAL REWARD @ " + str(n_epi) + "-th Episode : Reward " + str(score))
            print("Total Step: " + str(n_step))
            print("")

            # NOTE(review): the original indentation of this save was lost; saving
            # after every episode leaves the same final file as a single save at
            # the end and also keeps a checkpoint if a later episode crashes.
            torch.save(mu, modelPATH)

        env.end()
        e_time = timeit.default_timer()
        print("Total step {} and time spent {}".format(iteration, e_time - s_time))
    finally:
        # The original had a bare `writer.close` (no call), so the writer was
        # never closed; close it on every exit path.
        writer.close()
# Smoke test: drive the environment with a constant action, fill the replay
# buffer with the resulting transitions, then sample a small batch and print it.
max_step = 500


def _as_state_vector(obs):
    """Concatenate the observation fields into one flat state array."""
    fields = (obs.angle, obs.track, obs.trackPos, obs.speedX, obs.speedY,
              obs.speedZ, obs.wheelSpinVel / 100.0, obs.rpm)
    return np.hstack(fields)


ob = env.reset()
print("ob: ", ob)

s_t = _as_state_vector(ob)
print("s_t: ", s_t)
print("s_t size: ", s_t.size)

# Constant action, passed to env.step() unchanged on every step.
a = [[0, 1]]

for i in range(max_step):
    ob, r_t, done, info = env.step(a[0])
    if done:
        # The transition that ends the episode is not stored.
        break
    s_t1 = _as_state_vector(ob)
    memory.put((s_t, a[0], r_t, s_t1, done))
    s_t = s_t1

# Last state that was advanced to before the loop stopped.
s_done = s_t
print('done?: ', s_done)

env.end()

# Sample three transitions and print each component of the batch.
s, a, r, sp, d = memory.sample(3)
for label, batch in (('s: ', s), ('a: ', a), ('r: ', r), ('sp: ', sp), ('d: ', d)):
    print(label, batch)

#
# --testing the noise--
# noise = OrnsteinUhlenbeckNoise(mu = np.zeros(1),theta=0.1,dt=0.2,sigma = 0.1, x0 = np.array([0.5]))