def collect_data():
    env = ContinuousCartPoleEnv()
    EP_NUM = 1000
    data_set = []
    for ep in range(EP_NUM):
        state = env.reset()
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            action = ppo.choose_action(state.cpu().data.numpy(), True)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
            control_action = ca1 * action[0] + ca2 * action[1]
            control_action = np.clip(control_action.cpu().data.numpy(), -1, 1)
            next_state, reward, done, _ = env.step(control_action)
            s_np = state.cpu().data.numpy()
            data_set.append(
                [s_np[0], s_np[1], s_np[2], s_np[3], control_action[0]])
            state = next_state
            if done:
                break
        print(t)  # number of steps survived in this episode
    return np.array(data_set)
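The array returned above has one row per time step: the four state variables followed by the applied control action. A minimal usage sketch (not part of the original snippet; file names are illustrative only):

# split the collected rows into state features and control labels and save them
data = collect_data()                       # shape: (N, 5)
states, controls = data[:, :4], data[:, 4]  # 4 state dims + 1 control action
np.save('./cartpole_states.npy', states)
np.save('./cartpole_controls.npy', controls)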
def train_switcher_DDQN():
    mkdir('./switch')
    env = ContinuousCartPoleEnv()
    model = DQN(4, 2).to(device)
    target_model = DQN(4, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    fuel_list = []
    ep_reward = deque(maxlen=100)

    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state).cpu().data.numpy()
                elif action == 1:
                    control_action = model_2(state).cpu().data.numpy()
                else:
                    raise ValueError(
                        'unexpected discrete action: {}'.format(action))
            next_state, _, done, _ = env.step(control_action)
            reward = 5
            reward -= weight * abs(float(control_action[0]))
            if done and t != 199:
                reward -= 50
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state,
                               done)
            fuel_list.append(abs(float(control_action[0])))
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                loss = compute_td_loss(model, target_model, batch_size,
                                       optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:',
              np.mean(ep_reward), 'fuel cost:', sum(fuel_list[-t - 1:]),
              'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(model.state_dict(),
                       './switch/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
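train_switcher_DDQN relies on three helpers that are not shown in this listing: epsilon_by_frame, update_target and compute_td_loss. Below is a hedged sketch of what they typically look like for a Double DQN; the actual implementations may differ, and replay_buffer.sample plus the hyper-parameters are assumptions.

import math

import torch
import torch.nn.functional as F


def epsilon_by_frame(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=5000):
    # exponentially annealed exploration rate
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)


def update_target(model, target_model):
    # hard copy of the online weights into the target network
    target_model.load_state_dict(model.state_dict())


def compute_td_loss(model, target_model, batch_size, optimizer, gamma=0.99):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    state = torch.as_tensor(np.float32(state)).to(device)
    next_state = torch.as_tensor(np.float32(next_state)).to(device)
    action = torch.as_tensor(action, dtype=torch.int64).to(device)
    reward = torch.as_tensor(reward, dtype=torch.float32).to(device)
    done = torch.as_tensor(done, dtype=torch.float32).to(device)

    q_values = model(state).gather(1, action.unsqueeze(1)).squeeze(1)
    # Double DQN: the online net picks the next action, the target net scores it
    next_action = model(next_state).argmax(dim=1, keepdim=True)
    next_q = target_model(next_state).gather(1, next_action).squeeze(1)
    expected_q = reward + gamma * next_q * (1.0 - done)

    loss = F.mse_loss(q_values, expected_q.detach())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss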
Example #3
def train():
    env = ContinuousCartPoleEnv()
    state_dim = 4
    action_dim = 2

    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    global all_ep_r, update_plot, stop_plot
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        t0 = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            a = ppo.choose_action(s)
            u = np.clip(gene_u(s, a, model_1, model_2), -1, 1)
            s_, _, done, _ = env.step(u)
            # print(s, a, s_, r, done)
            # assert False
            r = 5
            r -= WEIGHT * abs(u[0])
            # r -= 1 / WEIGHT * (abs(s_[0]) + abs(s_[1]))
            if done and t != 199:
                r -= 50
            # as in the pendulum example: the nets are very small, so reward
            # normalization makes it easier to learn
            ppo.store_transition(s, a, r)
            s = s_
            ep_r += r

            # update ppo
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(s_, done)
                ppo.update()
            # if done:
            #     break
        ppo.finish_path(s_, done)
        print(
            'Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.
            format(ep + 1, EP_MAX, ep_r,
                   time.time() - t0))
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        if PLOT_RESULT:
            update_plot.set()
        if (ep + 1) % 500 == 0 and ep >= 3000:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)
    if PLOT_RESULT:
        stop_plot.set()
    env.close()
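gene_u is not defined in this listing; judging from collect_data and the 'ppo' branch of test further below, it blends the two base controllers' outputs with the PPO action as mixing weights. A hedged sketch (the real helper may differ):

def gene_u(state, action, model_1, model_2):
    # blend the two base controllers, weighted by the two PPO action components
    s = torch.from_numpy(state).float().to(device)
    with torch.no_grad():
        ca1 = model_1(s).cpu().data.numpy()[0]
        ca2 = model_2(s).cpu().data.numpy()[0]
    return np.array([action[0] * ca1 + action[1] * ca2])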
Example #4
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = Model(act_dim)
    algorithm = parl.algorithms.DDPG(model,
                                     gamma=GAMMA,
                                     tau=TAU,
                                     actor_lr=ACTOR_LR,
                                     critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    while rpm.size() < MEMORY_WARMUP_SIZE:
        run_train_episode(env, agent, rpm)

    episode = 0
    while episode < 30000:
        for i in range(50):
            train_reward = run_train_episode(env, agent, rpm)
            episode += 1
            # logger.info('Episode: {} Reward: {}'.format(episode, train_reward))

        evaluate_reward = run_evaluate_episode(env, agent, False)
        logger.info('Episode {}, Evaluate reward: {}'.format(
            episode, evaluate_reward))
        if evaluate_reward >= 200:  # solved: a full-length episode was reached
            break
    agent.save('./model_dir')
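run_train_episode and run_evaluate_episode are not shown here. A rough sketch of the training episode, assuming the Agent exposes predict()/learn() and the ReplayMemory exposes append()/sample_batch()/size(), with BATCH_SIZE as an assumed constant (the real interfaces may differ):

def run_train_episode(env, agent, rpm):
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.predict(obs)
        # add exploration noise and clip to the valid action range
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)

        if rpm.size() > MEMORY_WARMUP_SIZE:
            batch = rpm.sample_batch(BATCH_SIZE)
            agent.learn(*batch)

        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward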
Example #5
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # create the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)
    # pre-fill the replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    is_render = False
    while episode < TRAIN_EPISODE:
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        # eval_reward = evaluate(env, agent, render=False)
        eval_reward = evaluate(env, agent, is_render)
        logger.info('episode:{}    Test reward:{}'.format(
            episode, eval_reward))
Example #6
def continuos_cartpole_train(n_episodes=2000, max_t=700):
    env = ContinuousCartPoleEnv()

    scores_deque = deque(maxlen=100)
    scores = []

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)

    alg = DDPG(model,
               target_model,
               gamma=0.99,
               tau=1e-3,
               actor_lr=1e-4,
               critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        #agent.reset()
        score = 0

        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_deque.append(score)
        scores.append(score)
        #print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) > 200:
            torch.save(agent.alg.model.actor_model.state_dict(),
                       'cart_pole_actor.pth')
            torch.save(agent.alg.model.critic_model.state_dict(),
                       'cart_pole_critic.pth')
            break
    return scores
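A possible follow-up (not in the original snippet) that plots the per-episode scores returned above:

import matplotlib.pyplot as plt

scores = continuos_cartpole_train()
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.xlabel('Episode')
plt.ylabel('Score')
plt.show()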
Example #7
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    #obs_dim += 1  # add 1 to obs dim for time step feature

    logger.info('observation_dim {}, action_dim {}'.format(obs_dim, act_dim))

    scaler = Scaler(obs_dim)

    model = Model(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model,
        act_dim=act_dim,
        policy_lr=model.policy_lr,
        value_lr=model.value_lr)
    agent = Agent(alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions,
                                             train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'
            .format(total_steps, total_train_rewards / args.episodes_per_batch,
                    policy_loss, kl, value_loss))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler, render=True)
            logger.info('Steps {}, Evaluate reward: {}'.format(
                total_steps, eval_reward))
Example #8
def test():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = Model(act_dim)
    algorithm = parl.algorithms.DDPG(model,
                                     gamma=GAMMA,
                                     tau=TAU,
                                     actor_lr=ACTOR_LR,
                                     critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    eval_reward = run_evaluate_episode(env, agent, True)
    logger.info('test_reward:{}'.format(eval_reward))
Example #9
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # create the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # load a saved model if one exists
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')
        eval_reward = evaluate(env, agent, render=True)
        print("eval_reward=", eval_reward)
        exit()

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)
    # pre-fill the replay memory
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    while episode < TRAIN_EPISODE:
        print("start training, episode=", episode)
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
            print("episode=", episode, "total_reward=", total_reward)

        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{}    Test reward:{}'.format(
            episode, eval_reward))

    agent.save('./model.ckpt')
Example #10
def evaluate(render=True):
    env = ContinuousCartPoleEnv()

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)

    alg = DDPG(model,
               target_model,
               gamma=0.99,
               tau=1e-3,
               actor_lr=1e-4,
               critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
    agent.alg.model.actor_model.load_state_dict(
        torch.load("cart_pole_actor.pth"))
    agent.alg.model.critic_model.load_state_dict(
        torch.load("cart_pole_critic.pth"))

    eval_reward = []
    for i in range(10):
        obs = env.reset()
        total_reward = 0
        steps = 0
        while True:
            action = agent.act(obs)

            steps += 1
            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward

            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #11
import os
import sys

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from Agent import Agent

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print("Device: ", device)

random_seed = int(sys.argv[2])
print(random_seed)

# env = gym.make('MountainCarContinuous-v0')
env = ContinuousCartPoleEnv()
env.seed(random_seed)

# size of each action
action_size = env.action_space.shape[0]
print('Size of each action:', action_size)

# examine the state space
state_size = env.observation_space.shape[0]
print('Size of state:', state_size)

action_low = env.action_space.low
print('Action low:', action_low)

action_high = env.action_space.high
print('Action high: ', action_high)
Example #12
def test(adapter_name=None,
         state_list=None,
         renew=False,
         mode='switch',
         INDI_NAME=None):
    print(mode)
    env = ContinuousCartPoleEnv()
    EP_NUM = 500
    if mode == 'switch':
        model = DQN(4, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state=state_list[ep], set_state=True)
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()
                    else:
                        raise ValueError(
                            'unexpected discrete action: {}'.format(action))

            elif mode == 'ppo':
                action = ppo.choose_action(state.cpu().data.numpy(), True)
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = np.array([action[0] * ca1 + action[1] * ca2])
                control_action = np.clip(control_action, -1, 1)
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)

            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()

            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()

            elif mode == 'individual':
                if ATTACK:
                    if t % 15 == 0:
                        delta = fgsm(Individual, state)
                        # ele1 = np.random.uniform(low=-SCALE1, high=SCALE1, size=1)[0]
                        # ele2 = np.random.uniform(low=-SCALE2, high=SCALE2, size=1)[0]
                        # delta = torch.from_numpy(np.array([ele1, 0, ele2, 0])).float().to(device)
                    control_action = Individual(state +
                                                delta).cpu().data.numpy()
                else:
                    control_action = Individual(state).cpu().data.numpy()

            control_action = np.clip(control_action, -1, 1)
            next_state, reward, done, _ = env.step(control_action)
            fuel += abs(control_action)
            state = next_state
            if ep == 99:
                trajectory.append(state)
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break

        ep_reward.append(ep_r)
        if t == 199:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
    safe = np.array(safe)
    unsafe = np.array(unsafe)
    np.save('./plot/' + mode + '_safe.npy', safe)
    np.save('./plot/' + mode + '_unsafe.npy', unsafe)
    return ep_reward, np.array(fuel_list), state_list, np.array(
        control_action_list)
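When ATTACK is set, the 'individual' branch perturbs the state with an fgsm helper that is not shown in this listing. A hedged FGSM-style sketch; the actual helper may use a different surrogate loss or step size, and epsilon is an assumed parameter:

def fgsm(model, state, epsilon=0.1):
    # one-step fast gradient sign perturbation of the input state
    state = state.clone().detach().requires_grad_(True)
    # surrogate loss: push the controller toward larger control outputs
    loss = model(state).abs().sum()
    loss.backward()
    return epsilon * state.grad.sign()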
Example #13
if __name__ == '__main__':

    # if args.train:
    #     thread = threading.Thread(target=train)
    #     thread.daemon = True
    #     thread.start()
    #     if PLOT_RESULT:
    #         drawer = Drawer()
    #         drawer.plot()
    #         drawer.save()
    #     thread.join()
    train()
    assert False  # stop here after training; the evaluation code below is skipped
    # test
    env = ContinuousCartPoleEnv()
    state_dim = 4
    action_dim = 2
    ppo = PPO(state_dim, action_dim, method=METHOD)
    ppo.load_model()
    mean_epoch_reward = 0
    for _ in range(TEST_EP):
        state = env.reset()
        for i in range(EP_LEN):
            if RENDER:
                env.render()
            action = ppo.choose_action(state, True)
            u = np.clip(gene_u(state, action, model_1, model_2), -1, 1)
            next_state, reward, done, _ = env.step(u)
            mean_epoch_reward += reward
            state = next_state