Example #1
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Create the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)
    # Pre-fill the replay memory with warm-up data
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    is_render = False
    while episode < TRAIN_EPISODE:
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        # eval_reward = evaluate(env, agent, render=False)
        eval_reward = evaluate(env, agent, is_render)
        logger.info('episode:{}    Test reward:{}'.format(
            episode, eval_reward))
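The run_episode helper used in Examples #1 and #3 is not shown. Below is a minimal sketch of what such a helper typically looks like, assuming a PARL-style agent exposing predict/learn, a ReplayMemory with append/sample_batch, and constants MEMORY_WARMUP_SIZE, BATCH_SIZE and REWARD_SCALE defined elsewhere; these names and the exploration noise are assumptions, not part of the original example.

import numpy as np


def run_episode(agent, env, rpm):
    # Roll out one episode, store transitions, and train once the warm-up is done.
    obs = env.reset()
    total_reward = 0
    while True:
        action = agent.predict(obs)  # actor's action (assumed noise-free)
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)  # Gaussian exploration noise, clipped to the action range
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)

        if len(rpm) > MEMORY_WARMUP_SIZE:
            batch = rpm.sample_batch(BATCH_SIZE)  # (obs, act, reward, next_obs, done) arrays
            agent.learn(*batch)

        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward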
Example #2
def train(n_episodes=5000, max_t=700):
    env = gym.make('BipedalWalker-v2')
    env.seed(10)

    scores_deque = deque(maxlen=100)
    scores = []
    max_score = -np.Inf

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)

    alg = DDPG(model,
               target_model,
               gamma=0.99,
               tau=1e-3,
               actor_lr=1e-4,
               critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)

    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        #agent.reset()
        score = 0

        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward * 0.01, next_state, done)
            state = next_state
            score += reward
            if done:
                break

        scores_deque.append(score)
        scores.append(score)
        #print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(
                i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) > 200:
            torch.save(agent.alg.model.actor_model.state_dict(),
                       'walker_actor.pth')
            torch.save(agent.alg.model.critic_model.state_dict(),
                       'walker_critic.pth')
            break

    return scores
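The Agent in Example #2 is driven only through act and step. A rough sketch of that interface, assuming a hypothetical ReplayBuffer with add/sample and a DDPG algorithm object exposing predict and learn; the class below is an illustration, not the implementation actually used in the example.

import numpy as np


class Agent:
    def __init__(self, alg, buffer_size, batch_size, seed=0):
        self.alg = alg
        self.batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size, seed)  # hypothetical replay buffer with add/sample

    def act(self, state, noise_scale=0.1):
        # Query the actor, then add exploration noise and clip to the action range.
        action = self.alg.predict(state)
        return np.clip(action + noise_scale * np.random.randn(*np.shape(action)), -1.0, 1.0)

    def step(self, state, action, reward, next_state, done):
        # Store the transition and learn from a random batch once enough data has been collected.
        self.buffer.add(state, action, reward, next_state, done)
        if len(self.buffer) >= self.batch_size:
            self.alg.learn(*self.buffer.sample(self.batch_size))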
Example #3
def main():
    env = ContinuousCartPoleEnv()

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Create the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # Load the model
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')
        eval_reward = evaluate(env, agent, render=True)
        print("eval_reward=", eval_reward)
        exit()

    # Create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)
    # Pre-fill the replay memory with warm-up data
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    while episode < TRAIN_EPISODE:
        print("start training, episode=", episode)
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
            print("episode=", episode, "total_reward=", total_reward)

        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{}    Test reward:{}'.format(
            episode, eval_reward))

    agent.save('./model.ckpt')
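The evaluate(env, agent, render) helper called in Examples #1 and #3 is likewise not shown. A plausible sketch, assuming the agent exposes a noise-free predict method (an assumption):

import numpy as np


def evaluate(env, agent, render=False):
    # Run a few greedy episodes (no exploration noise) and return the mean episode return.
    rewards = []
    for _ in range(5):
        obs = env.reset()
        total_reward, done = 0, False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if render:
                env.render()
        rewards.append(total_reward)
    return np.mean(rewards)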
Example #4
def evaluate(render=True):
    env = ContinuousCartPoleEnv()

    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)

    alg = DDPG(model,
               target_model,
               gamma=0.99,
               tau=1e-3,
               actor_lr=1e-4,
               critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
    agent.alg.model.actor_model.load_state_dict(
        torch.load("cart_pole_actor.pth"))
    agent.alg.model.critic_model.load_state_dict(
        torch.load("cart_pole_critic.pth"))

    eval_reward = []
    for i in range(10):
        obs = env.reset()
        total_reward = 0
        steps = 0
        while True:
            action = agent.act(obs)

            steps += 1
            next_obs, reward, done, info = env.step(action)

            obs = next_obs
            total_reward += reward

            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
Example #5
def main():

    env = StockTradingEnv(df)
    env.reset()
    act_dim = env.action_space.shape[0]
    obs_dim = env.observation_space.shape[0]
    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    model = StockModel(act_dim=act_dim)
    algorithm = DDPG(model, gamma=dc, tau=TAU, actor_lr=al, critic_lr=cl)
    agent = Agent(algorithm, obs_dim, act_dim)

    total_steps, test_flag = 0, 0
    while total_steps < TRAIN_TOTAL_STEPS:
        # train part
        train_reward, steps, aloss, closs = run_episode(env, agent, rpm, bs)
        total_steps += steps
        logger.info("Step {}, Train Reward {}.".format(total_steps,
                                                       train_reward))
        tb_logger.add_scalar(tag="Train/Reward",
                             step=total_steps,
                             value=train_reward)
        tb_logger.add_scalar(tag="Train/Actor", step=total_steps, value=aloss)
        tb_logger.add_scalar(tag="Train/Critic", step=total_steps, value=closs)

        # test part
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1  # keep incrementing until the condition is violated
            eval_reward = evaluate(env, agent)
            logger.info('Step:{}, Test Reward:{}'.format(
                total_steps, eval_reward))
            tb_logger.add_scalar(tag="Test/Reward",
                                 step=total_steps,
                                 value=eval_reward)

            # save a model checkpoint after evaluation
            save_path = 'check_point/ddpg_%s_%s_%s_%s.ckpt' % (bs, dc, al, cl)
            agent.save(save_path)
Example #6
def test_ddpg():
    args = DDPGInput()
    DDPG(args)
Example #7
            break

    return total_reward, steps


env = make_env("Quadrotor", task="hovering_control")
env.reset()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
set_trace()  # debugger breakpoint (set_trace from pdb/ipdb, imported elsewhere)
model = Model(state_size=obs_dim, action_size=act_dim)
target_model = Model(state_size=obs_dim, action_size=act_dim)

alg = DDPG(model,
           target_model,
           gamma=GAMMA,
           tau=TAU,
           actor_lr=ACTOR_LR,
           critic_lr=CRITIC_LR)
agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)

total_steps = 0
while total_steps < TRAIN_TOTAL_STEPS:
    train_reward, steps = run_episode(env, agent)
    total_steps += steps
    print('Steps: {} Reward: {}'.format(total_steps,
                                        train_reward))  # print the training reward

    if total_steps % TEST_EVERY_STEPS == 0:  # every TEST_EVERY_STEPS steps, save a model checkpoint
        torch.save(agent.alg.model.actor_model.state_dict(),
                   f'flighter_actor_{total_steps}.pth')
        torch.save(agent.alg.model.critic_model.state_dict(),
                   f'flighter_critic_{total_steps}.pth')
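Example #7 also calls a run_episode(env, agent) helper whose body is cut off at the top of the snippet; only its final lines (the break and the return of total_reward and steps) survive. Below is a sketch consistent with the agent.act/agent.step interface used in Example #2; the body is an assumption, not the original code.

def run_episode(env, agent):
    # Roll out one episode, letting the agent store transitions and learn inside step().
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        action = agent.act(obs)
        next_obs, reward, done, _ = env.step(action)
        agent.step(obs, action, reward, next_obs, done)
        obs = next_obs
        total_reward += reward
        steps += 1
        if done:
            break
    return total_reward, steps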
Example #8
from algorithm import DDPG, Actor, Critic

import gym
import torch.optim as optim
import torch.nn as nn

env = gym.make("MountainCarContinuous-v0")

actor = Actor(env.observation_space.shape[0], 24, 24, 1)
t_actor = Actor(env.observation_space.shape[0], 24, 24, 1)
optimA = optim.Adam(actor.parameters(), lr=0.00001)
critic = Critic(env.observation_space.shape[0], 24, 24)
t_critic = Critic(env.observation_space.shape[0], 24, 24)
optimC = optim.Adam(critic.parameters(), lr=0.00005)
loss = nn.MSELoss()
agent = DDPG(env, actor, t_actor, optimA, critic, t_critic, optimC, loss,
             10000)
agent.run(50, 10, 0.5, 64, 0.99, 0.001)
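All of the DDPG variants above take a tau coefficient that controls the soft update of the target networks. A minimal, self-contained sketch of that update in PyTorch, independent of the Model/Agent classes used in the examples:

import torch


def soft_update(target_net: torch.nn.Module, source_net: torch.nn.Module, tau: float):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target, applied parameter-wise.
    with torch.no_grad():
        for target_param, source_param in zip(target_net.parameters(), source_net.parameters()):
            target_param.copy_(tau * source_param + (1.0 - tau) * target_param)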