Example #1
def main():
    # Create the quadrotor environment
    env = make_env("Quadrotor", task="no_collision", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0] + 1  # +1: extra model output, folded into the four rotor commands (see the run_episode sketch below)
    max_action = float(env.action_space.high[0])

    model = QuadrotorModel(act_dim, max_action)
    algorithm = parl.algorithms.TD3(model,
                                    max_action=max_action,
                                    gamma=GAMMA,
                                    tau=TAU,
                                    actor_lr=ACTOR_LR,
                                    critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

    agent.restore_critic('model_dir/critic.ckpt')
    agent.restore_actor('model_dir/actor.ckpt')
    for episode in range(1, 5):
        evaluate_reward = run_evaluate_episode(env,
                                               agent,
                                               max_action,
                                               is_render=True)
        print("evaluate_reward: ", evaluate_reward)
Example #2
def main():
    # Create the quadrotor environment
    env = make_env("Quadrotor", task="velocity_control", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0] + 1
    model = QuadrotorModel(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
    ckpt = 'steps_490883_reward_-20.52.ckpt'
    agent.restore(ckpt)
    evaluate_reward = evaluate(env, agent, True)
    logger.info('Evaluate reward: {}'.format(evaluate_reward))  # print the evaluation reward
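Examples #2 through #5 rely on an evaluate helper whose tail is only visible in the fragment under Example #6. A minimal sketch consistent with that fragment, assuming five greedy episodes and the action_mapping utility from parl.utils (PARL 1.x); in the act_dim + 1 variants the extra model output would additionally have to be folded into the four rotor commands before env.step:

import numpy as np
from parl.utils import action_mapping  # maps a [-1, 1] model output into the env's action range

def evaluate(env, agent, render=False):
    eval_reward = []
    for _ in range(5):  # five evaluation episodes is an assumed count
        obs = env.reset()
        total_reward, done = 0, False
        while not done:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            action = np.clip(np.squeeze(action), -1.0, 1.0)
            action = action_mapping(action, env.action_space.low[0],
                                    env.action_space.high[0])
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if render:
                env.render()
        eval_reward.append(total_reward)
    return np.mean(eval_reward)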
Example #3
def main():
    # Create the quadrotor environment
    env = make_env("Quadrotor", task="no_collision", seed=1)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0] + 1
    model = QuadrotorModel(act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
    ckpt = 'steps_970464_reward_467.17.ckpt'
    agent.restore(ckpt)
    evaluate_reward = evaluate(env, agent, render=True)
    logger.info('Evaluate reward: {}'.format(evaluate_reward))
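QuadrotorModel itself never appears in these snippets. A minimal actor-critic sketch in the PARL 1.x (paddle.fluid) API, closely following the official PARL DDPG quadrotor example; the hidden size is an assumption, and the TD3 variants in Examples #1 and #7 would additionally accept max_action and scale the tanh policy output by it:

import parl
from parl import layers  # PARL 1.x wrappers around paddle.fluid layers

class ActorModel(parl.Model):
    def __init__(self, act_dim):
        self.fc1 = layers.fc(size=100, act='tanh')
        self.fc2 = layers.fc(size=act_dim, act='tanh')

    def policy(self, obs):
        hid = self.fc1(obs)
        return self.fc2(hid)  # actions in [-1, 1]

class CriticModel(parl.Model):
    def __init__(self):
        self.fc1 = layers.fc(size=100, act='tanh')
        self.fc2 = layers.fc(size=1, act=None)

    def value(self, obs, act):
        concat = layers.concat([obs, act], axis=1)
        Q = self.fc2(self.fc1(concat))
        return layers.squeeze(Q, axes=[1])

class QuadrotorModel(parl.Model):
    def __init__(self, act_dim):
        self.actor_model = ActorModel(act_dim)
        self.critic_model = CriticModel()

    def policy(self, obs):
        return self.actor_model.policy(obs)

    def value(self, obs, act):
        return self.critic_model.value(obs, act)

    def get_actor_params(self):
        return self.actor_model.parameters()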
Example #4
def main():
    # Create the quadrotor environment
    env = make_env("Quadrotor_hovering_control", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)

    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)
    ckpt = 'steps_700176.ckpt'  # set ckpt to the best-performing checkpoint saved during your training run
    agent.restore(ckpt)
    evaluate_reward = evaluate(env, agent)
    logger.info('Evaluate reward: {}'.format(evaluate_reward))  # print the evaluation reward
Example #5
def main():
    # Create the quadrotor environment
    env = make_env("Quadrotor_hovering_control", task="hovering_control")
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    print(obs_dim, act_dim)
    model = QuadrotorModel(act_dim + 1)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm, obs_dim, act_dim + 1)
    # if os.path.exists('model_dir/steps_140848.ckpt'):
    #     agent.restore('model_dir/steps_140848.ckpt')
    #     print("Restore succeeded")
    # parl also ships a built-in ReplayMemory for DDPG; it can be imported directly from parl.utils
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim + 1)
    # Start training
    test_flag = 0
    total_steps = 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, rpm)
        total_steps += steps
        # logger.info('Steps: {} Train reward: {}'.format(total_steps, train_reward))  # print the training reward

        if total_steps // TEST_EVERY_STEPS >= test_flag:  # evaluate the model every TEST_EVERY_STEPS steps
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1

            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}'.format(
                total_steps, evaluate_reward))  # print the evaluation reward

            # save the model after every evaluation, named by the training step count
            ckpt = 'model_dir/steps_{}.ckpt'.format(total_steps)
            agent.save(ckpt)
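run_episode, which drives the training loop above, is also not shown. A minimal sketch: Gaussian exploration noise on the policy output, transitions appended to the parl.utils ReplayMemory, and a learn step once the buffer is warm. REWARD_SCALE, MEMORY_WARMUP_SIZE, BATCH_SIZE and the noise scale are hypothetical names and values, and treating output 0 as a shared base value folded into the other four outputs is an assumption about the act_dim + 1 trick:

import numpy as np
from parl.utils import action_mapping

REWARD_SCALE = 0.01         # hypothetical reward scaling
MEMORY_WARMUP_SIZE = 10000  # hypothetical warm-up threshold
BATCH_SIZE = 256            # hypothetical batch size

def run_episode(env, agent, rpm):
    obs = env.reset()
    total_reward, steps = 0, 0
    while True:
        steps += 1
        batch_obs = np.expand_dims(obs, axis=0)
        action = agent.predict(batch_obs.astype('float32'))
        action = np.squeeze(action)
        # Gaussian exploration noise, clipped back into [-1, 1]
        action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
        # assumption: output 0 is a shared base value added to the other four
        env_action = np.clip(action[0] + action[1:], -1.0, 1.0)
        env_action = action_mapping(env_action, env.action_space.low[0],
                                    env.action_space.high[0])
        next_obs, reward, done, info = env.step(env_action)
        # store the raw (act_dim + 1)-dimensional action, matching the rpm shape
        rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
        if rpm.size() > MEMORY_WARMUP_SIZE:
            # assumes QuadrotorAgent.learn takes the five batch arrays in this order
            agent.learn(*rpm.sample_batch(BATCH_SIZE))
        obs = next_obs
        total_reward += reward
        if done:
            break
    return total_reward, steps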
Example #6
            env.render()
        eval_reward.append(total_reward)
    return np.mean(eval_reward)


if __name__ == "__main__":
    # Create the quadrotor environment
    # env = make_env("Quadrotor", task="velocity_control", seed=0)
    env = Quadrotor(task="velocity_control", seed=0)
    env.reset()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build the agent with the parl framework

    model = QuadrotorModel(act_dim=act_dim)
    algorithm = DDPG(model,
                     gamma=GAMMA,
                     tau=TAU,
                     actor_lr=ACTOR_LR,
                     critic_lr=CRITIC_LR)
    agent = QuadrotorAgent(algorithm=algorithm,
                           obs_dim=obs_dim + 3,  # +3: target-velocity components appended to the observation
                           act_dim=act_dim)

    # parl also ships a built-in ReplayMemory for DDPG; it can be imported directly from parl.utils
    rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim + 3, act_dim)

    best_test_reward = -10000
    # agent.restore('model_dir/best.ckpt')
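Example #6 breaks off right after initializing best_test_reward; together with the commented-out restore line, the intent is evidently to keep only the best-scoring checkpoint. A sketch of how the loop typically continues, reusing the loop shape from Example #5 (note that in this velocity_control variant the helpers would also have to append the three target-velocity components to the observation, matching obs_dim + 3):

    # continuation sketch: save a checkpoint only when the evaluation reward improves
    test_flag, total_steps = 0, 0
    while total_steps < TRAIN_TOTAL_STEPS:
        train_reward, steps = run_episode(env, agent, rpm)
        total_steps += steps
        if total_steps // TEST_EVERY_STEPS >= test_flag:
            while total_steps // TEST_EVERY_STEPS >= test_flag:
                test_flag += 1
            evaluate_reward = evaluate(env, agent)
            logger.info('Steps {}, Test reward: {}'.format(total_steps, evaluate_reward))
            if evaluate_reward > best_test_reward:
                best_test_reward = evaluate_reward
                agent.save('model_dir/best.ckpt')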
Example #7
# Create the quadrotor environment
env = make_env("Quadrotor", task="no_collision", seed=1)
env.reset()
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0] + 1

# CHANGE: TD3 needs the environment's action bound
max_action = float(env.action_space.high[0])
print("max action: ", max_action)

#model = QuadrotorModel(act_dim)
#algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)

# CHANGE: swap the DDPG setup above for TD3
model = QuadrotorModel(act_dim, max_action)
algorithm = parl.algorithms.TD3(model,
                                max_action=max_action,
                                gamma=GAMMA,
                                tau=TAU,
                                actor_lr=ACTOR_LR,
                                critic_lr=CRITIC_LR)
agent = QuadrotorAgent(algorithm, obs_dim, act_dim)
rpm = ReplayMemory(int(MEMORY_SIZE), obs_dim, act_dim)

# Start training
test_flag = 0
total_steps = 0
best_reward = -float('inf')
while total_steps < TRAIN_TOTAL_STEPS:
    train_reward, steps = run_episode(env, agent, rpm)
    total_steps += steps

    if total_steps // TEST_EVERY_STEPS >= test_flag:  # evaluate every TEST_EVERY_STEPS steps
        while total_steps // TEST_EVERY_STEPS >= test_flag:
            test_flag += 1
        evaluate_reward = run_evaluate_episode(env, agent, max_action)
        logger.info('Steps {}, Test reward: {}'.format(total_steps, evaluate_reward))
        if evaluate_reward > best_reward:  # keep only the best checkpoint
            best_reward = evaluate_reward
            agent.save('model_dir/best.ckpt')