def train():
    """Run the PPO training loop on ``ContinuousCartPoleEnv``.

    For each of ``EP_MAX`` episodes the agent is stepped ``EP_LEN`` times.
    The policy output is turned into a control input by ``gene_u`` (using the
    module-level ``model_1`` / ``model_2``) and clipped to ``[-1, 1]`` before
    it is applied to the environment.

    The reward returned by the environment is ignored; a shaped reward is
    used instead:

    * ``+5`` for every step,
    * minus ``WEIGHT * |u[0]|`` as a control-effort penalty,
    * minus ``50`` when the environment reports ``done`` on any step other
      than step index 199.

    Side effects:
        * Rebinds the global ``all_ep_r`` to a fresh list and appends one
          exponentially smoothed episode reward (0.9 / 0.1) per episode.
        * Calls ``update_plot.set()`` after each episode and
          ``stop_plot.set()`` at the end when ``PLOT_RESULT`` is truthy.
        * Saves the model every 500 episodes once ``ep >= 3000``.
        * Prints a progress line per episode and closes the environment.
    """
    global all_ep_r, update_plot, stop_plot

    env = ContinuousCartPoleEnv()
    state_dim = 4
    action_dim = 2

    # Seed the RNGs for reproducibility (env seeding is disabled).
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    all_ep_r = []

    for episode in range(EP_MAX):
        state = env.reset()
        episode_reward = 0
        start_time = time.time()

        # Note: there is no early exit on `done` (the `if done: break` was
        # commented out), so every episode runs the full EP_LEN steps.
        for step in range(EP_LEN):
            if RENDER:
                env.render()

            action = ppo.choose_action(state)
            control = np.clip(gene_u(state, action, model_1, model_2), -1, 1)
            next_state, _, done, _ = env.step(control)

            # Shaped reward: constant bonus minus a control-effort penalty.
            reward = 5 - WEIGHT * abs(control[0])
            # Alternative state penalty, currently disabled:
            # reward -= 1 / WEIGHT * (abs(next_state[0]) + abs(next_state[1]))

            # NOTE(review): 199 presumably stands for EP_LEN - 1 -- confirm.
            terminated_early = done and step != 199
            if terminated_early:
                reward -= 50

            ppo.store_transition(state, action, reward)
            state = next_state
            episode_reward += reward

            # Run a PPO update as soon as a full batch has been collected.
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(next_state, done)
                ppo.update()

        # Close out whatever transitions remain from this episode.
        ppo.finish_path(next_state, done)

        print(
            'Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
                episode + 1, EP_MAX, episode_reward, time.time() - start_time
            )
        )

        # Exponentially smoothed reward curve; the first episode seeds it.
        smoothed_reward = (
            episode_reward
            if episode == 0
            else all_ep_r[-1] * 0.9 + episode_reward * 0.1
        )
        all_ep_r.append(smoothed_reward)

        if PLOT_RESULT:
            update_plot.set()

        # Periodic checkpoints, only late in training.
        if (episode + 1) % 500 == 0 and episode >= 3000:
            ppo.save_model(path='ppo', ep=episode, weight=WEIGHT)

    if PLOT_RESULT:
        stop_plot.set()
    env.close()
# Evaluation script: roll out the loaded actor without exploration noise and
# report (a) the fraction of episodes that reach the step cap and (b) the mean
# accumulated |action| ("fuel") over those episodes.

# scores = ddpg()  # presumably the training entry point; left disabled here

EVAL_EPISODES = 500   # number of evaluation rollouts
EVAL_MAX_STEPS = 200  # step cap per rollout

# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
agent.actor_local.load_state_dict(torch.load('actor4850_1.pth'))
# agent.critic_local.load_state_dict(torch.load('critic1.pth'))

# Stored initial states. They are only consumed by the fixed-start reset that
# is commented out inside the loop below; the load itself is kept as-is.
state_list = np.load('init_state.npy')

fuel_list = []
for ep in range(EVAL_EPISODES):
    total_reward = 0
    fuel = 0
    # state = state_list[ep]
    # state = env.reset(state=state, set_state=True)
    state = env.reset()
    for t in range(EVAL_MAX_STEPS):
        action = agent.act(state, add_noise=False)
        # A leftover debug `print(action, type(action))` / `assert False`
        # used to abort the run here on the very first step, which made all
        # of the bookkeeping below unreachable; it has been removed.
        fuel += abs(action)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print(t, total_reward)
    # An episode is counted only when its last executed step is the final
    # step index, i.e. it was not cut short by `done` before the cap.
    if t == EVAL_MAX_STEPS - 1:
        fuel_list.append(fuel)

# np.save('init_state.npy', np.array(state_list))

# np.mean([]) returns nan but emits a RuntimeWarning; produce the same nan
# explicitly when no episode reached the cap.
mean_fuel = np.mean(fuel_list) if fuel_list else float('nan')
print(len(fuel_list) / EVAL_EPISODES, mean_fuel)
env.close()