Code example #1
File: run_trpo.py  Project: JacobImai/learning_to_run
def train(config, trial_dir=None, visualize=False):
    pid = os.getpid()
    logger, log_dir = prepare_for_logging("pid_{}".format(pid))

    # create environment
    env = NIPS(visualize)
    logger.info("pid={}, env={}".format(pid, id(env)))
    if trial_dir is not None and os.path.exists(
            trial_dir) and config['agent'] == 'DDPG':
        logger.info("Loading config from {} ...".format(trial_dir))
        with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
            config = pickle.load(f)
    # config["scale_action"] = scale_action
    config["title_prefix"] = "RunEnv"

    # observation processor
    if "ob_processor" not in config or config["ob_processor"] == "dummy":
        ob_processor = ObservationProcessor()
    elif config["ob_processor"] == "2ndorder":
        ob_processor = SecondOrderAugmentor()
    else:
        ob_processor = BodySpeedAugmentor()
    config["ob_aug_dim"] = ob_processor.get_aug_dim()

    # snapshot info
    if "save_snapshot_every" not in config:
        config["save_snapshot_every"] = 500
    save_snapshot_every = config["save_snapshot_every"]

    # save config
    with open(os.path.join(log_dir, "config.pk"), "wb") as f:
        pickle.dump(config, f)
    util.print_settings(logger, config, env)

    # DDPG
    if config['agent'] == 'DDPG':
        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # print networks
        agent.actor.summary()
        agent.target_actor.summary()
        agent.critic.summary()

        # add callbacks
        def p_info(episode_info):
            util.print_episode_info(logger, episode_info, pid)

        def save_nets(episode_info):
            paths = {}
            paths["actor"] = os.path.join(log_dir, "actor.h5")
            paths["critic"] = os.path.join(log_dir, "critic.h5")
            paths["target"] = os.path.join(log_dir, "target.h5")
            agent = episode_info["agent"]
            agent.save_models(paths)

        def save_snapshots(episode_info):
            agent = episode_info["agent"]
            episode = episode_info["episode"]
            if episode % save_snapshot_every == 0:
                paths = {}
                paths["actor"] = os.path.join(log_dir,
                                              "actor_{}.h5".format(episode))
                paths["critic"] = os.path.join(log_dir,
                                               "critic_{}.h5".format(episode))
                paths["target"] = os.path.join(log_dir,
                                               "target_{}.h5".format(episode))
                agent.save_models(paths)
                memory_path = os.path.join(log_dir, "replaybuffer.npz")
                agent.save_memory(memory_path)
                logger.info("Snapshots saved. (pid={})".format(pid))

        agent.on_episode_end.append(p_info)
        agent.on_episode_end.append(save_nets)
        agent.on_episode_end.append(save_snapshots)

        # load existing model
        if trial_dir is not None and os.path.exists(trial_dir):
            logger.info("Loading networks from {} ...".format(trial_dir))
            paths = {}
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
            paths = {
                k: os.path.join(trial_dir, v)
                for k, v in paths.items()
            }
            logger.info("Paths to models: {}".format(paths))
            agent.load_models(paths)
            memory_path = os.path.join(trial_dir, "replaybuffer.npz")
            if os.path.exists(memory_path):
                agent.load_memory(memory_path)
                logger.info("Replay buffer loaded.")

        # learn
        util.print_sec_header(logger, "Training")
        reward_hist, steps_hist = agent.learn(
            total_episodes=config["total_episodes"],
            max_steps=config["max_steps"])
        env.close()

        # send result
        img_file = os.path.join(log_dir, "train_stats.png")
        util.plot_stats(reward_hist, steps_hist, img_file)
        log_file = os.path.join(log_dir, "train.log")
        title = log_dir + "_" + config["title_prefix"]
        util.send_email(title, [img_file], [log_file], SMTP_SERVER)

    # TRPO
    elif config['agent'] == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        def env_maker(visualize=False):
            env = NIPS(visualize=visualize)
            monitor_dir = os.path.join(log_dir, "gym_monitor")
            env = gym.wrappers.Monitor(env,
                                       directory=monitor_dir,
                                       video_callable=False,
                                       force=False,
                                       resume=True,
                                       write_upon_reset=True)
            return env

        # replace the plain env with a monitored environment built by env_maker
        del env
        env = env_maker()

        agent = TRPO(
            env,
            env_maker,
            logger,
            log_dir,
            ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )

        if trial_dir is not None and os.path.exists(trial_dir):
            agent.load_models(trial_dir)
        agent.learn()

    logger.info("Finished (pid={}).".format(pid))
Code example #2
File: run_trpo.py  Project: JacobImai/learning_to_run
def test(agent, trial_dir, test_episode, visual_flag, submit_flag):
    pid = os.getpid()
    logger, _ = prepare_for_logging("pid_{}".format(pid), False)

    logger.info("trial_dir={}".format(trial_dir))
    if not os.path.exists(trial_dir):
        logger.info("trial_dir does not exist")
        return

    # create environment
    env = NIPS(visualize=visual_flag)

    # load config
    with open(os.path.join(trial_dir, "config.pk"), "rb") as f:
        config = pickle.load(f)

    if agent == 'DDPG':
        config["scale_action"] = scale_action

        # observation processor
        if "ob_processor" not in config or config["ob_processor"] == "dummy":
            ob_processor = ObservationProcessor()
        elif config["ob_processor"] == "2ndorder":
            ob_processor = SecondOrderAugmentor()
        else:
            ob_processor = BodySpeedAugmentor()
        config["ob_aug_dim"] = ob_processor.get_aug_dim()
        util.print_settings(logger, config, env)

        # create random process
        oup = create_rand_process(env, config)

        # create replay buffer
        memory = create_memory(env, config)

        # create ddpg agent
        agent = DDPG(env, memory, oup, ob_processor, config)
        agent.build_nets(actor_hiddens=config["actor_hiddens"],
                         scale_action=config["scale_action"],
                         critic_hiddens=config["critic_hiddens"])

        # load weights
        paths = {}
        if test_episode > 0:
            paths["actor"] = "actor_{}.h5".format(test_episode)
            paths["critic"] = "critic_{}.h5".format(test_episode)
            paths["target"] = "target_{}.h5".format(test_episode)
        else:
            paths["actor"] = "actor.h5"
            paths["critic"] = "critic.h5"
            paths["target"] = "target.h5"
        paths = {k: os.path.join(trial_dir, v) for k, v in paths.items()}
        logger.info("Paths to models: {}".format(paths))
        agent.load_models(paths)

    elif agent == 'TRPO':

        def ob_processor_maker():
            if config["ob_processor"] == "normal":
                return ObservationProcessor()
            elif config["ob_processor"] == "2ndorder":
                return SecondOrderAugmentor()
            elif config['ob_processor'] == 'bodyspeed':
                return BodySpeedAugmentor()
            else:
                raise ValueError('invalid ob processor type')

        # override the config loaded from config.pk with fixed TRPO settings
        config = {
            "agent": 'TRPO',
            "batch_size": 5000,
            "n_envs": 16,
            "n_iters": 5000,
            "ob_processor": "bodyspeed",
            # "hidden_nonlinearity": "relu",
            # "action_nonlinearity": "tanh",
            # "policy_hiddens": [128, 128, 64, 64],
            # "baseline_hiddens": [128, 128, 64, 64],
            "policy_hiddens": [256, 128, 64],
            "baseline_hiddens": [256, 128, 64],
            "hidden_nonlinearity": "tanh",
            "action_nonlinearity": None,
        }

        agent = TRPO(
            env,
            env_maker=None,
            logger=logger,
            log_dir=None,
            ob_processor_maker=ob_processor_maker,
            policy_hiddens=config['policy_hiddens'],
            baseline_hiddens=config['baseline_hiddens'],
            hidden_nonlinearity=config['hidden_nonlinearity'],
            action_nonlinearity=config['action_nonlinearity'],
            n_envs=config['n_envs'],
            batch_size=config['batch_size'],
            n_iters=config['n_iters'],
        )
        agent.load_models(trial_dir)
    else:
        raise ValueError('invalid agent type')

    if submit_flag:
        submit(agent, logger)
    else:
        rewards = []
        for i in range(10):
            steps, reward = agent.test(max_steps=1000)
            logger.info("episode={}, steps={}, reward={}".format(
                i, steps, reward))
            rewards.append(reward)
        logger.info("avg_reward={}".format(np.mean(rewards)))