def train():
    """Train a PPO agent on ``ContinuousCartPoleEnv`` using a shaped reward.

    For each of ``EP_MAX`` episodes the agent is stepped ``EP_LEN`` times; the
    inner loop does not stop early when the environment reports ``done``.
    The PPO action is converted to a control signal by ``gene_u`` (together
    with ``model_1`` / ``model_2``), clipped to ``[-1, 1]`` and applied to
    the environment.  The environment's own reward is discarded and replaced
    by ``5 - WEIGHT * |u[0]|``, with a further ``-50`` when ``done`` is
    reported on any step other than index 199.

    Side effects:
        * rebinds the module-level ``all_ep_r`` to a list holding the
          exponentially smoothed (0.9 / 0.1) episode return;
        * calls ``update_plot.set()`` after every episode and
          ``stop_plot.set()`` once at the end when ``PLOT_RESULT`` is truthy;
        * saves the model every 500 episodes once the episode index has
          reached 3000;
        * prints one progress line per episode.
    """
    global all_ep_r, update_plot, stop_plot

    env = ContinuousCartPoleEnv()
    state_dim, action_dim = 4, 2

    # Seed NumPy and PyTorch for reproducibility.  The environment itself is
    # not seeded here.
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    all_ep_r = []

    for episode in range(EP_MAX):
        state = env.reset()
        episode_return = 0
        started = time.time()

        for step in range(EP_LEN):
            if RENDER:
                env.render()

            action = ppo.choose_action(state)
            control = np.clip(gene_u(state, action, model_1, model_2), -1, 1)
            next_state, _, done, _ = env.step(control)

            # Shaped reward: constant bonus per step, minus a penalty
            # proportional to the magnitude of the applied control.
            reward = 5 - WEIGHT * abs(control[0])
            if done and step != 199:
                # `done` on any step other than index 199 is penalised.
                reward -= 50

            ppo.store_transition(state, action, reward)
            state = next_state
            episode_return += reward

            # Run a PPO update whenever the buffer holds exactly one batch.
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(next_state, done)
                ppo.update()

        # Close out whatever part of the trajectory is still buffered.
        ppo.finish_path(next_state, done)

        elapsed = time.time() - started
        print(
            'Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.
            format(episode + 1, EP_MAX, episode_return, elapsed))

        # The first episode seeds the running average; later ones are blended in.
        smoothed = (episode_return if episode == 0
                    else all_ep_r[-1] * 0.9 + episode_return * 0.1)
        all_ep_r.append(smoothed)

        if PLOT_RESULT:
            update_plot.set()

        if episode >= 3000 and (episode + 1) % 500 == 0:
            ppo.save_model(path='ppo', ep=episode, weight=WEIGHT)

    if PLOT_RESULT:
        stop_plot.set()
    env.close()
# scores = ddpg()
# assert False

# ---------------------------------------------------------------------------
# Evaluation: load a trained actor and roll it out without exploration noise,
# recording the accumulated |action| ("fuel") of every episode that reaches
# the final step of the rollout.
# ---------------------------------------------------------------------------
EVAL_EPISODES = 500  # number of evaluation rollouts
EVAL_STEPS = 200     # maximum number of steps per rollout

agent.actor_local.load_state_dict(torch.load('actor4850_1.pth'))
# agent.critic_local.load_state_dict(torch.load('critic1.pth'))

# Saved initial states.  They are only loaded here; replaying them is
# currently disabled and every episode starts from a plain env.reset().
# To replay them: state = env.reset(state=state_list[ep], set_state=True)
state_list = np.load('init_state.npy')
fuel_list = []
for ep in range(EVAL_EPISODES):
    total_reward = 0
    fuel = 0
    state = env.reset()
    # NOTE: a leftover debug probe (print of the action followed by
    # `assert False`) used to abort the rollout on its very first step, which
    # made everything below unreachable.  It has been removed.
    for t in range(EVAL_STEPS):
        action = agent.act(state, add_noise=False)
        fuel += abs(action)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print(t, total_reward)
    # Only rollouts that reached the last step count toward the statistics.
    if t == EVAL_STEPS - 1:
        fuel_list.append(fuel)
# np.save('init_state.npy', np.array(state_list))

# Fraction of rollouts that reached the last step, and their mean fuel.
# With no such rollout the mean is reported as nan, without NumPy's
# "mean of empty slice" warning.
mean_fuel = np.mean(fuel_list) if fuel_list else float('nan')
print(len(fuel_list) / EVAL_EPISODES, mean_fuel)
env.close()