def collect_data():
    """Roll out the PPO-blended controller and record state/action samples.

    Runs ``EP_NUM`` episodes of at most 200 steps each in a fresh
    ``ContinuousCartPoleEnv``.  At every step the two base controllers
    ``model_1`` and ``model_2`` are evaluated on the current state, their
    outputs are weighted by the first two entries of the action returned by
    ``ppo.choose_action`` and summed, and the result is clipped to
    ``[-1, 1]`` before being applied to the environment.

    The number of the last executed step index is printed after every
    episode.

    Returns:
        ``np.ndarray`` built from one row per executed step; each row holds
        state components 0..3 followed by the first component of the applied
        control action.
    """
    env = ContinuousCartPoleEnv()
    EP_NUM = 1000
    samples = []
    for _ in range(EP_NUM):
        observation = env.reset()
        for t in range(200):
            state = torch.from_numpy(observation).float().to(device)
            weights = ppo.choose_action(state.cpu().data.numpy(), True)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
            blended = ca1 * weights[0] + ca2 * weights[1]
            control_action = np.clip(blended.cpu().data.numpy(), -1, 1)
            observation, _reward, done, _ = env.step(control_action)
            # Record the state the action was computed from (not the
            # successor state), converted to numpy once for the whole row.
            features = state.cpu().data.numpy()
            samples.append([
                features[0],
                features[1],
                features[2],
                features[3],
                control_action[0],
            ])
            if done:
                break
        print(t)
    return np.array(samples)
# Example no. 2
# 0
def train():
    """Train the PPO policy that mixes the two base controllers.

    A ``PPO`` agent (state size 4, action size 2) is trained for ``EP_MAX``
    episodes of ``EP_LEN`` steps each; the inner loop has no early exit, so
    stepping continues even after the environment reports ``done``.  Each
    PPO action is handed to ``gene_u`` together with ``model_1`` and
    ``model_2``; the result, clipped to ``[-1, 1]``, is the control applied
    to the environment.

    The environment's own reward is discarded.  The stored reward is
    ``5 - WEIGHT * |u[0]|`` with an additional penalty of 50 whenever the
    environment reports ``done`` on a step whose index is not 199.

    Side effects:
        * seeds NumPy and PyTorch with ``RANDOMSEED``;
        * rebinds the module-level ``all_ep_r`` to the list of exponentially
          smoothed episode returns (0.9 / 0.1 weights);
        * calls ``update_plot.set()`` after each episode and
          ``stop_plot.set()`` at the end when ``PLOT_RESULT`` is truthy;
        * prints one progress line per episode;
        * saves the model every 500th episode once ``ep >= 3000``.
    """
    global all_ep_r, update_plot, stop_plot

    env = ContinuousCartPoleEnv()

    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(4, 2, method=METHOD)
    all_ep_r = []
    for ep in range(EP_MAX):
        obs = env.reset()
        episode_return = 0
        started = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            mix = ppo.choose_action(obs)
            u = np.clip(gene_u(obs, mix, model_1, model_2), -1, 1)
            next_obs, _, done, _ = env.step(u)

            # Shaped reward: constant bonus minus a control-effort cost,
            # plus a large penalty for terminating before step index 199.
            reward = 5 - WEIGHT * abs(u[0])
            if done and t != 199:
                reward -= 50

            ppo.store_transition(obs, mix, reward)
            obs = next_obs
            episode_return += reward

            # Run a PPO update as soon as a full batch has been collected.
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(next_obs, done)
                ppo.update()
        ppo.finish_path(next_obs, done)

        elapsed = time.time() - started
        print(
            'Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.
            format(ep + 1, EP_MAX, episode_return, elapsed))

        # Exponential moving average of the episode return, seeded with the
        # first episode's raw value.
        smoothed = (episode_return if ep == 0 else
                    all_ep_r[-1] * 0.9 + episode_return * 0.1)
        all_ep_r.append(smoothed)

        if PLOT_RESULT:
            update_plot.set()
        if ep >= 3000 and (ep + 1) % 500 == 0:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)

    if PLOT_RESULT:
        stop_plot.set()
    env.close()
def train_switcher_DDQN():
    """Train the DQN switcher that picks one of two controllers per step.

    For ``EP_NUM`` episodes of at most 200 steps, the online ``DQN`` chooses
    action 0 or 1 (epsilon taken from ``epsilon_by_frame``); action 0 applies
    the output of ``model_1`` to the environment and action 1 the output of
    ``model_2``.  The environment's reward is discarded and replaced by
    ``5 - weight * |control_action|``, with an extra penalty of 50 when the
    environment reports ``done`` on a step whose index is not 199.

    Each transition is pushed to the module-level ``replay_buffer``.  Once
    the buffer holds more than ``batch_size`` entries ``compute_td_loss`` is
    called every step, and ``update_target`` is called every 100 frames.

    Side effects:
        * calls ``mkdir('./switch')``;
        * prints one progress line per episode (reward, mean of the last
          100 episode rewards, accumulated ``|control_action|`` of the
          episode, last epsilon, replay-buffer length);
        * saves the online network's ``state_dict`` under ``./switch`` every
          100th episode starting at episode 100.

    Raises:
        ValueError: if ``model.act`` returns something other than 0 or 1.
    """
    mkdir('./switch')
    env = ContinuousCartPoleEnv()
    model = DQN(4, 2).to(device)
    target_model = DQN(4, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    ep_reward = deque(maxlen=100)

    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        # Per-episode control effort; replaces an ever-growing global list
        # that was only ever summed over the current episode's tail.
        fuel = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)

            # Explicit error instead of `assert False`: asserts vanish under
            # `python -O`, which would silently apply a zero control action.
            if action == 0:
                controller = model_1
            elif action == 1:
                controller = model_2
            else:
                raise ValueError(
                    'unexpected switcher action: {!r}'.format(action))
            with torch.no_grad():
                control_action = controller(state).cpu().data.numpy()

            next_state, _, done, _ = env.step(control_action)
            reward = 5
            reward -= weight * abs(control_action)
            if done and t != 199:
                reward -= 50
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state,
                               done)
            fuel += abs(control_action)
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                compute_td_loss(model, target_model, batch_size, optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:',
              np.mean(ep_reward), 'fuel cost:', fuel,
              'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(model.state_dict(),
                       './switch/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
# Example no. 4
# 0
def continuos_cartpole_train(n_episodes=2000, max_t=700):
    """Train a DDPG agent on ``ContinuousCartPoleEnv``.

    Args:
        n_episodes: maximum number of training episodes.
        max_t: maximum number of environment steps per episode.

    Returns:
        List with the undiscounted return of every episode that was run.

    Side effects:
        Prints the rolling average every 100 episodes.  As soon as the mean
        over the most recent (up to 100) episode returns exceeds 200, the
        actor and critic ``state_dict``s are written to
        ``cart_pole_actor.pth`` / ``cart_pole_critic.pth`` and training
        stops early.
    """
    env = ContinuousCartPoleEnv()

    # Rolling window used for both progress reporting and the stop criterion.
    scores_deque = deque(maxlen=100)
    scores = []

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)

    alg = DDPG(model,
               target_model,
               gamma=0.99,
               tau=1e-3,
               actor_lr=1e-4,
               critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        # agent.reset()
        score = 0

        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_deque.append(score)
        scores.append(score)

        # Computed once per episode and shared by the report and the
        # early-stop check below.
        mean_score = np.mean(scores_deque)
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, mean_score))
        if mean_score > 200:
            torch.save(agent.alg.model.actor_model.state_dict(),
                       'cart_pole_actor.pth')
            torch.save(agent.alg.model.critic_model.state_dict(),
                       'cart_pole_critic.pth')
            break
    return scores
# Example no. 5
# 0
def evaluate(render=True):
    """Run the saved DDPG agent for 10 episodes and return its mean return.

    The actor and critic weights are loaded from ``cart_pole_actor.pth`` and
    ``cart_pole_critic.pth`` in the current working directory.

    Args:
        render: when truthy, ``env.render()`` is called after every step.

    Returns:
        ``np.mean`` of the 10 undiscounted episode returns.
    """
    env = ContinuousCartPoleEnv()
    try:
        obs_size = env.observation_space.shape[0]
        action_size = env.action_space.shape[0]
        model = Model(state_size=obs_size, action_size=action_size)
        target_model = Model(state_size=obs_size, action_size=action_size)

        alg = DDPG(model,
                   target_model,
                   gamma=0.99,
                   tau=1e-3,
                   actor_lr=1e-4,
                   critic_lr=3e-4)
        agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
        agent.alg.model.actor_model.load_state_dict(
            torch.load("cart_pole_actor.pth"))
        agent.alg.model.critic_model.load_state_dict(
            torch.load("cart_pole_critic.pth"))

        eval_reward = []
        for _ in range(10):
            obs = env.reset()
            total_reward = 0
            # NOTE(review): there is no step cap here; the loop relies on
            # the environment eventually reporting `done`.
            while True:
                action = agent.act(obs)
                obs, reward, done, _ = env.step(action)
                total_reward += reward

                if render:
                    env.render()
                if done:
                    break
            eval_reward.append(total_reward)
        return np.mean(eval_reward)
    finally:
        # Release the environment (and any render window) even when loading
        # the checkpoints or stepping fails.
        env.close()
# Example no. 6
# 0

# Top-level evaluation script: loads a saved actor into `agent` and rolls it
# out for 500 episodes of at most 200 steps, accumulating |action| as "fuel".
# `agent` and `env` are created outside this excerpt.

# scores = ddpg()
# assert False

agent.actor_local.load_state_dict(torch.load('actor4850_1.pth'))
# agent.critic_local.load_state_dict(torch.load('critic1.pth'))

# NOTE(review): `state_list` is loaded but only referenced by the
# commented-out lines below; the file must still exist for the script to run.
state_list = np.load('init_state.npy')
fuel_list = []  # fuel totals of the episodes that lasted all 200 steps
for ep in range(500):
    total_reward = 0
    fuel = 0
    # state = state_list[ep]
    # state = env.reset(state=state, set_state=True)
    state = env.reset()
    for t in range(200):
        action = agent.act(state, add_noise=False)
        print(action, type(action))
        # NOTE(review): debugging halt. Unless Python runs with -O, this
        # raises AssertionError on the very first step, so nothing below it
        # executes. Looks like a leftover from inspecting the action type;
        # confirm before removing.
        assert False
        fuel += abs(action)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    print(t, total_reward)
    # Only episodes whose last step index is 199 contribute to the fuel
    # statistics.
    if t == 199:
        fuel_list.append(fuel)
# np.save('init_state.npy', np.array(state_list))
# Fraction of episodes that reached step index 199, and their mean fuel.
# NOTE(review): np.mean of an empty fuel_list would presumably yield nan with
# a warning; verify if no episode survives.
print(len(fuel_list) / 500, np.mean(fuel_list))
env.close()
def test(adapter_name=None,
         state_list=None,
         renew=False,
         mode='switch',
         INDI_NAME=None):
    """Evaluate one control strategy over 500 CartPole episodes.

    Every episode runs for at most 200 steps.  The control action is chosen
    according to ``mode``:

    * ``'switch'``: a ``DQN`` loaded from ``adapter_name`` picks, greedily
      (``epsilon=0``), between the outputs of ``model_1`` and ``model_2``;
    * ``'ppo'``: the two controller outputs are mixed with the first two
      entries of the action returned by ``ppo.choose_action``;
    * ``'d1'`` / ``'d2'``: ``model_1`` / ``model_2`` alone;
    * ``'individual'``: the module-level ``Individual`` network, with
      weights loaded from ``INDI_NAME``.  When ``ATTACK`` is truthy, a
      perturbation from ``fgsm`` is recomputed every 15 steps and added to
      the state fed to the network.

    In all modes the action is clipped to ``[-1, 1]`` before it is applied.

    Args:
        adapter_name: checkpoint path for the switcher (``'switch'`` mode).
        state_list: initial states, one per episode; required unless
            ``renew`` is truthy.
        renew: when truthy, episodes start from ``env.reset()`` and the
            sampled initial states are collected into a new ``state_list``.
        mode: one of ``'switch'``, ``'ppo'``, ``'d1'``, ``'d2'``,
            ``'individual'``.
        INDI_NAME: checkpoint path for ``Individual`` (``'individual'``
            mode).

    Returns:
        Tuple ``(ep_reward, fuel, state_list, control_actions)``:
        the list of per-episode environment returns, an array with the
        accumulated ``|control_action|`` of every episode whose last step
        index was 199, the initial states used, and an array with the
        control actions applied during the episode with index 99.

    Side effects:
        Prints ``mode`` and the initial state of every episode that ended
        before step index 199; writes the initial states of the
        full-length / shortened episodes to ``./plot/<mode>_safe.npy`` and
        ``./plot/<mode>_unsafe.npy`` (the directory is created if missing).

    Raises:
        ValueError: on an unknown ``mode``, on a ``state_list`` that is
            missing or does not hold exactly 500 entries while ``renew`` is
            falsy, or if the switcher returns an action other than 0 or 1.
    """
    import os  # local import: only needed to create the output directory

    print(mode)
    # Fail fast: an unknown mode used to surface only as an unbound
    # `control_action` on the first environment step.
    if mode not in ('switch', 'ppo', 'd1', 'd2', 'individual'):
        raise ValueError('unknown mode: {!r}'.format(mode))

    env = ContinuousCartPoleEnv()
    EP_NUM = 500
    max_steps = 200
    if mode == 'switch':
        model = DQN(4, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))

    if renew:
        state_list = []
    elif state_list is None or len(state_list) != EP_NUM:
        # Explicit check instead of a per-episode `assert`, which is
        # stripped under `python -O` and gave a TypeError for None.
        raise ValueError(
            'state_list must contain exactly {} initial states when renew '
            'is False'.format(EP_NUM))

    fuel_list = []
    ep_reward = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            state = env.reset(state=state_list[ep], set_state=True)
        ep_r = 0
        fuel = 0
        for t in range(max_steps):
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                if action == 0:
                    controller = model_1
                elif action == 1:
                    controller = model_2
                else:
                    raise ValueError(
                        'unexpected switcher action: {!r}'.format(action))
                with torch.no_grad():
                    control_action = controller(state).cpu().data.numpy()

            elif mode == 'ppo':
                action = ppo.choose_action(state.cpu().data.numpy(), True)
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = np.array([action[0] * ca1 + action[1] * ca2])
                control_action = np.clip(control_action, -1, 1)
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)

            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()

            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()

            else:  # mode == 'individual'
                if ATTACK:
                    # The perturbation is refreshed every 15 steps (first at
                    # t == 0) and reused in between.
                    if t % 15 == 0:
                        delta = fgsm(Individual, state)
                    control_action = Individual(state +
                                                delta).cpu().data.numpy()
                else:
                    control_action = Individual(state).cpu().data.numpy()

            control_action = np.clip(control_action, -1, 1)
            next_state, reward, done, _ = env.step(control_action)
            fuel += abs(control_action)
            state = next_state
            if ep == 99:
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break

        ep_reward.append(ep_r)
        # An episode counts as "safe" when its last step index is the final
        # one, i.e. it was not cut short by `done`.
        if t == max_steps - 1:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])

    # Make sure the output directory exists so the whole evaluation run is
    # not lost to a FileNotFoundError at the very end.
    os.makedirs('./plot', exist_ok=True)
    np.save('./plot/' + mode + '_safe.npy', np.array(safe))
    np.save('./plot/' + mode + '_unsafe.npy', np.array(unsafe))
    return ep_reward, np.array(fuel_list), state_list, np.array(
        control_action_list)