Example #1
def test():
    env = gym.make('CartPole-v0')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)

    model = CartpoleModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(model,
                                    act_dim=action_dim,
                                    gamma=GAMMA,
                                    lr=LEARNING_RATE)
    agent = CartpoleAgent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # initial exploration probability
        e_greed_decrement=1e-6
    )  # the exploration probability decays gradually during training

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')

    eval_reward = evaluate(agent, env, True)
    logger.info('test_reward:{}'.format(eval_reward))
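The evaluate helper called above is not part of this snippet. Below is a minimal sketch of what it plausibly does, assuming the agent exposes a greedy predict(obs) method and that the third positional argument toggles rendering (both assumptions about the surrounding project, not code taken from it):

import numpy as np

def evaluate(agent, env, render=False, eval_episodes=5):
    # Hypothetical helper: run a few greedy episodes and return the mean reward.
    rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        episode_reward, done = 0.0, False
        while not done:
            action = agent.predict(obs)  # greedy action, no exploration
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
        rewards.append(episode_reward)
    return np.mean(rewards)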
Example #2
File: train.py  Project: YuechengLiu/PARL
def main():
    env = gym.make("CartPole-v0")
    model = CartpoleModel(act_dim=ACT_DIM)
    alg = parl.algorithms.PolicyGradient(model, lr=LEARNING_RATE)
    agent = CartpoleAgent(alg, obs_dim=OBS_DIM, act_dim=ACT_DIM)

    # if the file already exists, restore parameters from it
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            _, _, reward_list = run_episode(env, agent, train_or_test='test')
            total_reward = np.sum(reward_list)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')
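calc_reward_to_go is referenced but not defined in this snippet. A minimal sketch of the standard reward-to-go computation it presumably implements; the discount factor and the mean/std normalization below are assumptions, not values taken from the project:

import numpy as np

def calc_reward_to_go(reward_list, gamma=0.99):
    # Discounted cumulative reward from each step to the end of the episode.
    reward_to_go = np.zeros(len(reward_list), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(reward_list))):
        running = reward_list[t] + gamma * running
        reward_to_go[t] = running
    # Normalizing the returns usually stabilizes policy-gradient training (optional).
    reward_to_go = (reward_to_go - reward_to_go.mean()) / (reward_to_go.std() + 1e-8)
    return reward_to_go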
Example #3
def main():
    env = gym.make('CartPole-v0')
    action_dim = env.action_space.n
    obs_shape = env.observation_space.shape

    rpm = ReplayMemory(MEMORY_SIZE)

    model = CartpoleModel(act_dim=action_dim)
    algorithm = parl.algorithms.DQN(
        model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = CartpoleAgent(
        algorithm,
        obs_dim=obs_shape[0],
        act_dim=action_dim,
        e_greed=0.1,  # initial exploration probability
        e_greed_decrement=1e-6
    )  # the exploration probability decays gradually during training

    while len(rpm) < MEMORY_WARMUP_SIZE:  # warm up replay memory
        run_episode(agent, env, rpm)

    max_episode = 2000

    # start training
    episode = 0
    while episode < max_episode:
        # train part
        for i in range(0, 50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1

        eval_reward = evaluate(agent, env)
        logger.info('episode:{}    test_reward:{}'.format(
            episode, eval_reward))
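run_episode(agent, env, rpm) is used both for the replay-memory warm-up and for the training loop, but its body is not shown here. A rough sketch, assuming the agent exposes sample() (epsilon-greedy) and learn() methods, that rpm.sample(BATCH_SIZE) returns batched (obs, action, reward, next_obs, done) arrays, and that LEARN_FREQ and BATCH_SIZE are defined in the surrounding file (all hypothetical names):

def run_episode(agent, env, rpm):
    # One training episode: epsilon-greedy sampling, replay storage, periodic learning.
    total_reward, step = 0.0, 0
    obs = env.reset()
    while True:
        step += 1
        action = agent.sample(obs)  # epsilon-greedy action
        next_obs, reward, done, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, done))
        # Learn every few steps once the replay memory is warmed up.
        if len(rpm) > MEMORY_WARMUP_SIZE and step % LEARN_FREQ == 0:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_done = \
                rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)
        total_reward += reward
        obs = next_obs
        if done:
            break
    return total_reward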
Example #4
File: train.py  Project: universea/MoRL
def main():
    logger.info("------------------- SAC ---------------------")
    logger.info('Env: {}, Seed: {}'.format(args.env, args.seed))
    logger.info("---------------------------------------------")
    logger.set_dir('./{}_{}'.format(args.env, args.seed))

    env = ContinuousCartPoleEnv()  # continuous-action CartPole instead of gym.make(args.env); SAC needs a continuous action space
    env.seed(args.seed)
    obs_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print('obs_dim, action_dim', (obs_dim, action_dim))

    # Initialize model, algorithm, agent, replay_memory
    model = CartpoleModel(obs_dim, action_dim)
    algorithm = SAC(model,
                    gamma=GAMMA,
                    tau=TAU,
                    alpha=args.alpha,
                    actor_lr=ACTOR_LR,
                    critic_lr=CRITIC_LR)
    agent = CartpoleAgent(algorithm)
    rpm = ReplayMemory(max_size=MEMORY_SIZE,
                       obs_dim=obs_dim,
                       act_dim=action_dim)

    # load model and evaluate
    if os.path.exists('sac_model.ckpt'):
        logger.info("restore model succeed and test !")
        agent.restore('sac_model.ckpt')
        run_evaluate_episodes(agent, env, EVAL_EPISODES, render=True)
        exit()

    total_steps = 0
    test_flag = 0
    while total_steps < args.train_total_steps:
        # Train episode
        episode_reward, episode_steps = run_train_episode(agent, env, rpm)
        total_steps += episode_steps

        tensorboard.add_scalar('train/episode_reward', episode_reward,
                               total_steps)
        logger.info('Total Steps: {} Reward: {}'.format(
            total_steps, episode_reward))

        # Evaluate episode
        if (total_steps + 1) // args.test_every_steps >= test_flag:
            while (total_steps + 1) // args.test_every_steps >= test_flag:
                test_flag += 1
            avg_reward = run_evaluate_episodes(agent,
                                               env,
                                               EVAL_EPISODES,
                                               render=False)
            tensorboard.add_scalar('eval/episode_reward', avg_reward,
                                   total_steps)
            logger.info('Evaluation over: {} episodes, Reward: {}'.format(
                EVAL_EPISODES, avg_reward))
    agent.save('sac_model.ckpt')
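run_train_episode is not included in this snippet. A sketch of a typical SAC training episode, assuming agent.sample returns a continuous action, that the replay memory supports append / size / sample_batch, and that WARMUP_STEPS and BATCH_SIZE are defined elsewhere in the file (all assumptions, not code from the project):

def run_train_episode(agent, env, rpm):
    # One SAC training episode: store transitions, update from random minibatches.
    obs = env.reset()
    episode_reward, episode_steps, done = 0.0, 0, False
    while not done:
        episode_steps += 1
        action = agent.sample(obs)  # stochastic action from the current policy
        next_obs, reward, done, _ = env.step(action)
        rpm.append(obs, action, reward, next_obs, done)
        obs = next_obs
        episode_reward += reward
        # Update the networks once enough transitions have been collected.
        if rpm.size() > WARMUP_STEPS:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_done = \
                rpm.sample_batch(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
                        batch_done)
    return episode_reward, episode_steps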
Example #5
File: train.py  Project: YuechengLiu/PARL
def main():
    env = gym.make('CartPole-v0')
    model = CartpoleModel(obs_dim=OBS_DIM, act_dim=ACT_DIM)
    alg = parl.algorithms.PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg)

    for i in range(1000):  # 1000 episodes
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            _, _, reward_list = run_episode(env, agent, train_or_test='test')
            total_reward = np.sum(reward_list)
            logger.info('Test reward: {}'.format(total_reward))
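The run_episode(env, agent, train_or_test) helper used by the policy-gradient examples is also not shown. A minimal sketch of the trajectory collection it likely performs, assuming the agent exposes sample() for stochastic training actions and predict() for greedy evaluation:

def run_episode(env, agent, train_or_test='train'):
    # Roll out one episode and collect the trajectory for the policy-gradient update.
    obs_list, action_list, reward_list = [], [], []
    obs = env.reset()
    while True:
        obs_list.append(obs)
        if train_or_test == 'train':
            action = agent.sample(obs)   # sample an action from the policy distribution
        else:
            action = agent.predict(obs)  # greedy action for evaluation
        action_list.append(action)
        obs, reward, done, _ = env.step(action)
        reward_list.append(reward)
        if done:
            break
    return obs_list, action_list, reward_list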
Example #6
def main():
    env = gym.make('CartPole-v0')
    # env = env.unwrapped  # remove the TimeLimit wrapper (200-step cap of CartPole-v0)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # build an agent
    model = CartpoleModel(obs_dim=obs_dim, act_dim=act_dim)
    alg = morl.algorithms.PolicyGradient(model, lr=LEARNING_RATE)
    agent = CartpoleAgent(alg)

    # load model and evaluate
    if os.path.exists('simple_model.ckpt'):
        logger.info("restore model succeed and test !")
        agent.restore('simple_model.ckpt')
        evaluate(env, agent, render=True)
        exit()

    for i in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if i % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                i, sum(reward_list)))

        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)

        agent.learn(batch_obs, batch_action, batch_reward)
        if (i + 1) % 100 == 0:
            total_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(total_reward))

    # save the parameters to ./simple_model.ckpt
    agent.save('simple_model.ckpt')