def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = create_env(FLAGS.env.id,
                     seed=FLAGS.seed,
                     log_dir=FLAGS.log_dir,
                     absorbing_state=FLAGS.GAIL.learn_absorbing,
                     rescale_action=FLAGS.env.rescale_action)
    env_eval = create_env(FLAGS.env.id,
                          seed=FLAGS.seed + 1000,
                          log_dir=FLAGS.log_dir,
                          absorbing_state=FLAGS.GAIL.learn_absorbing,
                          rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               normalizer=normalizers.state)
    bc_loss = BehavioralCloningLoss(dim_state,
                                    dim_action,
                                    policy,
                                    lr=float(FLAGS.BC.lr),
                                    train_std=FLAGS.BC.train_std)

    expert_actor = Actor(dim_state, dim_action, FLAGS.SAC.actor_hidden_sizes)
    tf.get_default_session().run(tf.global_variables_initializer())

    loader = nn.ModuleDict({'actor': expert_actor})
    if FLAGS.BC.dagger:
        loader.load_state_dict(
            np.load(FLAGS.ckpt.policy_load, allow_pickle=True)[()])
        logger.warning('Load expert policy from %s' % FLAGS.ckpt.policy_load)
    runner = Runner(env, max_steps=env.max_episode_steps, rescale_action=False)

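    # Keep roughly FLAGS.GAIL.trajectory_size transitions per full-length expert
    # trajectory by keeping one of every `subsampling_rate` steps.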
    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    # load expert dataset
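    # Use a fixed seed (2020) while loading/subsampling so the resulting dataset
    # is identical across runs; the experiment seed is restored further below.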
    set_random_seed(2020)
    expert_dataset = load_expert_dataset(FLAGS.GAIL.buf_load)
    expert_reward = expert_dataset.get_average_reward()
    logger.info('Expert Reward %f', expert_reward)
    if FLAGS.GAIL.learn_absorbing:
        expert_dataset.add_absorbing_states(env)
    expert_dataset.subsample_trajectories(FLAGS.GAIL.traj_limit)
    logger.info('Original dataset size {}'.format(len(expert_dataset)))
    expert_dataset.subsample_transitions(subsampling_rate)
    logger.info('Subsampled dataset size {}'.format(len(expert_dataset)))
    logger.info('np random: %d random : %d', np.random.randint(1000),
                random.randint(0, 1000))
    expert_batch = expert_dataset.sample(10)
    expert_state = np.stack([t.obs for t in expert_batch])
    expert_action = np.stack([t.action for t in expert_batch])
    logger.info('Sampled obs: %.4f, acs: %.4f', np.mean(expert_state),
                np.mean(expert_action))
    del expert_batch, expert_state, expert_action
    set_random_seed(FLAGS.seed)

    saver = nn.ModuleDict({'policy': policy, 'normalizers': normalizers})
    print(saver)

    batch_size = FLAGS.BC.batch_size
    eval_gamma = 0.999
    for t in range(FLAGS.BC.max_iters):
        if t % FLAGS.BC.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(policy, env_eval)
            eval_returns_discount, eval_lengths_discount = evaluate(
                policy, env_eval, gamma=eval_gamma)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths))),
                             discounted_episode=dict(
                                 returns=np.mean(eval_returns_discount),
                                 lengths=int(np.mean(eval_lengths_discount)))))

        expert_batch = expert_dataset.sample(batch_size)
        expert_state = np.stack([t.obs for t in expert_batch])
        expert_action = np.stack([t.action for t in expert_batch])
        _, loss, grad_norm = bc_loss.get_loss(expert_state,
                                              expert_action,
                                              fetch='train loss grad_norm')

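        # DAgger-style aggregation: periodically roll out the expert actor (first
        # collection) or the learned policy (later collections), relabel the
        # visited states with the expert's mean actions, and append them to the
        # expert dataset.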
        if FLAGS.BC.dagger and t % FLAGS.BC.collect_freq == 0 and t > 0:
            if t // FLAGS.BC.collect_freq == 1:
                collect_policy = expert_actor
                stochastic = False
                logger.info('Collect samples with expert actor...')
            else:
                collect_policy = policy
                stochastic = True
                logger.info('Collect samples with learned policy...')
            runner.reset()
            data, ep_infos = runner.run(collect_policy,
                                        FLAGS.BC.n_collect_samples, stochastic)
            data.action = expert_actor.get_actions(data.state,
                                                   fetch='actions_mean')
            returns = [info['return'] for info in ep_infos]
            lengths = [info['length'] for info in ep_infos]
            for i in range(len(data)):
                expert_dataset.push_back(data[i].state, data[i].action,
                                         data[i].next_state, data[i].reward,
                                         data[i].mask, data[i].timeout)
            logger.info('Collect %d samples avg return = %.4f avg length = %d',
                        len(data), np.mean(returns), np.mean(lengths))
        if t % 100 == 0:
            mse_loss = policy.get_mse_loss(expert_state, expert_action)
            log_kvs(prefix='BC',
                    kvs=dict(iter=t,
                             loss=loss,
                             grad_norm=grad_norm,
                             mse_loss=mse_loss))

    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    dict_result = dict()
    for gamma in [0.9, 0.99, 0.999, 1.0]:
        eval_returns, eval_lengths = evaluate(policy, env_eval, gamma=gamma)
        dict_result[gamma] = [float(np.mean(eval_returns)), eval_returns]
        logger.info('[%s]: %.4f', gamma, np.mean(eval_returns))

    save_path = os.path.join(FLAGS.log_dir, 'evaluate.yml')
    with open(save_path, 'w') as f:
        yaml.dump(dict_result, f, default_flow_style=False)
Code example #2
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = make_env(FLAGS.env.id,
                   FLAGS.env.env_type,
                   num_env=FLAGS.env.num_env,
                   seed=FLAGS.seed,
                   log_dir=FLAGS.log_dir,
                   rescale_action=FLAGS.env.rescale_action)
    env_eval = make_env(FLAGS.env.id,
                        FLAGS.env.env_type,
                        num_env=4,
                        seed=FLAGS.seed + 1000,
                        log_dir=FLAGS.log_dir,
                        rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               normalizer=normalizers.state)
    vfn = MLPVFunction(dim_state, FLAGS.TRPO.vf_hidden_sizes,
                       normalizers.state)
    algo = TRPO(vfn=vfn,
                policy=policy,
                dim_state=dim_state,
                dim_action=dim_action,
                **FLAGS.TRPO.algo.as_dict())

    discriminator = Discriminator(dim_state,
                                  dim_action,
                                  normalizers=normalizers,
                                  **FLAGS.GAIL.discriminator.as_dict())

    tf.get_default_session().run(tf.global_variables_initializer())

    # load expert dataset
    if not os.path.exists(FLAGS.GAIL.buf_load):
        raise FileNotFoundError('Expert dataset (%s) does not exist' %
                                FLAGS.GAIL.buf_load)
    expert_dataset = Mujoco_Dset(FLAGS.GAIL.buf_load,
                                 train_fraction=FLAGS.GAIL.train_frac,
                                 traj_limitation=FLAGS.GAIL.traj_limit)

    saver = nn.ModuleDict({
        'policy': policy,
        'vfn': vfn,
        'normalizers': normalizers
    })
    runner = Runner(env,
                    max_steps=env.max_episode_steps,
                    gamma=FLAGS.TRPO.gamma,
                    lambda_=FLAGS.TRPO.lambda_)
    print(saver)

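    # GAIL loop: alternate FLAGS.GAIL.g_iters generator (TRPO) updates, using
    # rewards from the discriminator, with FLAGS.GAIL.d_iters discriminator updates.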
    max_ent_coef = FLAGS.TRPO.algo.ent_coef
    for t in range(0, FLAGS.GAIL.total_timesteps,
                   FLAGS.TRPO.rollout_samples * FLAGS.GAIL.g_iters):
        time_st = time.time()
        if t % FLAGS.GAIL.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(policy, env_eval)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths)))))

        # Generator
        generator_dataset = None
        for n_update in range(FLAGS.GAIL.g_iters):
            data, ep_infos = runner.run(policy, FLAGS.TRPO.rollout_samples)
            if FLAGS.TRPO.normalization:
                normalizers.state.update(data.state)
                normalizers.action.update(data.action)
                normalizers.diff.update(data.next_state - data.state)
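            # One-time consistency check: within each env, next_state at step i
            # must equal state at step i + 1 except across episode boundaries.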
            if t == 0 and n_update == 0:
                data_ = data.copy()
                data_ = data_.reshape(
                    [FLAGS.TRPO.rollout_samples // env.n_envs, env.n_envs])
                for e in range(env.n_envs):
                    samples = data_[:, e]
                    masks = 1 - (samples.done | samples.timeout)[...,
                                                                 np.newaxis]
                    masks = masks[:-1]
                    assert np.allclose(samples.state[1:] * masks,
                                       samples.next_state[:-1] * masks)
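            # Advance the step counter by the rollout size so eval/save
            # frequencies are measured in environment steps.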
            t += FLAGS.TRPO.rollout_samples
            data.reward = discriminator.get_reward(data.state, data.action)
            advantages, values = runner.compute_advantage(vfn, data)
            train_info = algo.train(max_ent_coef, data, advantages, values)
            fps = int(FLAGS.TRPO.rollout_samples / (time.time() - time_st))
            train_info['reward'] = np.mean(data.reward)
            train_info['fps'] = fps
            log_kvs(prefix='TRPO', kvs=dict(iter=t, **train_info))

            generator_dataset = data

        # Discriminator
        for n_update in range(FLAGS.GAIL.d_iters):
            batch_size = FLAGS.GAIL.d_batch_size
            d_train_infos = dict()
            for generator_subset in generator_dataset.iterator(batch_size):
                expert_state, expert_action = expert_dataset.get_next_batch(
                    batch_size)
                train_info = discriminator.train(expert_state, expert_action,
                                                 generator_subset.state,
                                                 generator_subset.action)
                for k, v in train_info.items():
                    if k not in d_train_infos:
                        d_train_infos[k] = []
                    d_train_infos[k].append(v)
            d_train_infos = {k: np.mean(v) for k, v in d_train_infos.items()}
            if n_update == FLAGS.GAIL.d_iters - 1:
                log_kvs(prefix='Discriminator',
                        kvs=dict(iter=t, **d_train_infos))

        if t % FLAGS.TRPO.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
Code example #3
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = create_env(FLAGS.env.id,
                     FLAGS.seed,
                     rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    # load expert dataset
    set_random_seed(2020)
    expert_dataset = load_expert_dataset(FLAGS.GAIL.buf_load)
    expert_state = np.stack([t.obs for t in expert_dataset.buffer()])
    expert_next_state = np.stack([t.next_obs for t in expert_dataset.buffer()])
    expert_done = np.stack([t.done for t in expert_dataset.buffer()])
    np.testing.assert_allclose(
        expert_next_state[:-1] * (1 - expert_done[:-1][:, None]),
        expert_state[1:] * (1 - expert_done[:-1][:, None]))
    del expert_state, expert_next_state, expert_done
    expert_reward = expert_dataset.get_average_reward()
    logger.info('Expert Reward %f', expert_reward)
    if FLAGS.GAIL.learn_absorbing:
        expert_dataset.add_absorbing_states(env)
    eval_batch = expert_dataset.sample(1024)
    eval_state = np.stack([t.obs for t in eval_batch])
    eval_action = np.stack([t.action for t in eval_batch])
    eval_next_state = np.stack([t.next_obs for t in eval_batch])
    logger.info('Sampled obs: %.4f, acs: %.4f', np.mean(eval_state),
                np.mean(eval_action))
    expert_dataset.subsample_trajectories(FLAGS.GAIL.traj_limit)
    logger.info('Original dataset size {}'.format(len(expert_dataset)))
    expert_dataset.subsample_transitions(subsampling_rate)
    logger.info('Subsampled dataset size {}'.format(len(expert_dataset)))
    logger.info('np random: %d random : %d', np.random.randint(1000),
                random.randint(0, 1000))
    set_random_seed(FLAGS.seed)

    # expert actor
    actor = Actor(dim_state,
                  dim_action,
                  hidden_sizes=FLAGS.SAC.actor_hidden_sizes)
    # generator
    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               output_diff=FLAGS.TRPO.output_diff,
                               normalizers=normalizers)
    vfn = MLPVFunction(dim_state, dim_action, FLAGS.TRPO.vf_hidden_sizes,
                       normalizers.state)
    algo = TRPO(vfn=vfn,
                policy=policy,
                dim_state=dim_state,
                dim_action=dim_action,
                **FLAGS.TRPO.algo.as_dict())

    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    if FLAGS.GAIL.reward_type == 'nn':
        expert_batch = expert_dataset.buffer()
        expert_state = np.stack([t.obs for t in expert_batch])
        loc, scale = np.mean(expert_state, axis=0,
                             keepdims=True), np.std(expert_state,
                                                    axis=0,
                                                    keepdims=True)
        del expert_batch, expert_state
        logger.info('loc = {}\nscale={}'.format(loc, scale))
        discriminator = Discriminator(dim_state,
                                      dim_action,
                                      normalizers=normalizers,
                                      subsampling_rate=subsampling_rate,
                                      loc=loc,
                                      scale=scale,
                                      **FLAGS.GAIL.discriminator.as_dict())
    else:
        raise NotImplementedError
    bc_loss = BehavioralCloningLoss(dim_state,
                                    dim_action,
                                    policy,
                                    lr=FLAGS.BC.lr,
                                    train_std=FLAGS.BC.train_std)
    tf.get_default_session().run(tf.global_variables_initializer())

    loader = nn.ModuleDict({'actor': actor})
    loader.load_state_dict(
        np.load(FLAGS.ckpt.policy_load, allow_pickle=True)[()])
    logger.info('Load policy from %s' % FLAGS.ckpt.policy_load)
    saver = nn.ModuleDict({
        'policy': policy,
        'vfn': vfn,
        'normalizers': normalizers,
        'discriminator': discriminator
    })
    print(saver)

    # update normalizers with expert statistics
    expert_state = np.stack([t.obs for t in expert_dataset.buffer()])
    expert_action = np.stack([t.action for t in expert_dataset.buffer()])
    expert_next_state = np.stack([t.next_obs for t in expert_dataset.buffer()])
    normalizers.state.update(expert_state)
    normalizers.action.update(expert_action)
    normalizers.diff.update(expert_next_state - expert_state)
    del expert_state, expert_action, expert_next_state

    eval_gamma = 0.999
    eval_returns, eval_lengths = evaluate_on_true_env(actor,
                                                      env,
                                                      gamma=eval_gamma)
    logger.warning(
        'Test policy true value = %.4f true length = %d (gamma = %f)',
        np.mean(eval_returns), np.mean(eval_lengths), eval_gamma)

    # pretrain
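    # Behavioral-cloning pretraining: fit the GaussianMLPPolicy (used here as a
    # stochastic dynamics model) to expert (state, action, next_state) tuples.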
    for n_updates in range(FLAGS.GAIL.pretrain_iters):
        expert_batch = expert_dataset.sample(FLAGS.BC.batch_size)
        expert_state = np.stack([t.obs for t in expert_batch])
        expert_action = np.stack([t.action for t in expert_batch])
        expert_next_state = np.stack([t.next_obs for t in expert_batch])
        _, loss, grad_norm = bc_loss.get_loss(expert_state,
                                              expert_action,
                                              expert_next_state,
                                              fetch='train loss grad_norm')
        if n_updates % 100 == 0:
            mse_loss = policy.get_mse_loss(expert_state, expert_action,
                                           expert_next_state)
            logger.info(
                '[Pretrain] iter = %d grad_norm = %.4f loss = %.4f mse_loss = %.4f',
                n_updates, grad_norm, loss, mse_loss)

    # virtual env
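    # The learned model is wrapped as a VirtualEnv: TRPO rollouts and both
    # evaluation environments step through the model rather than the true env.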
    virtual_env = VirtualEnv(policy,
                             env,
                             n_envs=FLAGS.env.num_env,
                             stochastic_model=True)
    virtual_runner = VirtualRunner(virtual_env,
                                   max_steps=env.max_episode_steps,
                                   gamma=FLAGS.TRPO.gamma,
                                   lambda_=FLAGS.TRPO.lambda_,
                                   rescale_action=False)
    env_eval_stochastic = VirtualEnv(policy,
                                     env,
                                     n_envs=4,
                                     stochastic_model=True)
    env_eval_deterministic = VirtualEnv(policy,
                                        env,
                                        n_envs=4,
                                        stochastic_model=False)

    max_ent_coef = FLAGS.TRPO.algo.ent_coef
    true_return = np.mean(eval_returns)
    for t in range(0, FLAGS.GAIL.total_timesteps,
                   FLAGS.TRPO.rollout_samples * FLAGS.GAIL.g_iters):
        time_st = time.time()
        if t % FLAGS.GAIL.eval_freq == 0:
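            # Evaluate the fixed expert actor inside the learned model (stochastic
            # and deterministic rollouts) and log the gap to its true-env return.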
            eval_returns_stochastic, eval_lengths_stochastic = evaluate_on_virtual_env(
                actor, env_eval_stochastic, gamma=eval_gamma)
            eval_returns_deterministic, eval_lengths_deterministic = evaluate_on_virtual_env(
                actor, env_eval_deterministic, gamma=eval_gamma)
            log_kvs(
                prefix='Evaluate',
                kvs=dict(
                    iter=t,
                    stochastic_episode=dict(
                        returns=np.mean(eval_returns_stochastic),
                        lengths=int(np.mean(eval_lengths_stochastic))),
                    episode=dict(returns=np.mean(eval_returns_deterministic),
                                 lengths=int(
                                     np.mean(eval_lengths_deterministic))),
                    evaluation_error=dict(
                        stochastic_error=true_return -
                        np.mean(eval_returns_stochastic),
                        stochastic_abs=np.abs(
                            true_return - np.mean(eval_returns_stochastic)),
                        stochastic_rel=np.abs(true_return -
                                              np.mean(eval_returns_stochastic))
                        / true_return,
                        deterministic_error=true_return -
                        np.mean(eval_returns_deterministic),
                        deterministic_abs=np.abs(
                            true_return - np.mean(eval_returns_deterministic)),
                        deterministic_rel=np.abs(true_return - np.mean(
                            eval_returns_deterministic)) / true_return)))
        # Generator
        generator_dataset = None
        for n_update in range(FLAGS.GAIL.g_iters):
            data, ep_infos = virtual_runner.run(actor,
                                                FLAGS.TRPO.rollout_samples,
                                                stochastic=False)
            # if FLAGS.TRPO.normalization:
            #     normalizers.state.update(data.state)
            #     normalizers.action.update(data.action)
            #     normalizers.diff.update(data.next_state - data.state)
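            # Sanity check at t == 0: rewards produced by the virtual runner must
            # match the true environment's reward function (env.mb_step) on the
            # same transitions.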
            if t == 0:
                np.testing.assert_allclose(data.reward,
                                           env.mb_step(data.state, data.action,
                                                       data.next_state)[0],
                                           atol=1e-4,
                                           rtol=1e-4)
            if t == 0 and n_update == 0 and not FLAGS.GAIL.learn_absorbing:
                data_ = data.copy()
                data_ = data_.reshape(
                    [FLAGS.TRPO.rollout_samples // env.n_envs, env.n_envs])
                for e in range(env.n_envs):
                    samples = data_[:, e]
                    masks = 1 - (samples.done | samples.timeout)[...,
                                                                 np.newaxis]
                    masks = masks[:-1]
                    assert np.allclose(samples.state[1:] * masks,
                                       samples.next_state[:-1] * masks)
            t += FLAGS.TRPO.rollout_samples
            data.reward = discriminator.get_reward(data.state, data.action,
                                                   data.next_state)
            advantages, values = virtual_runner.compute_advantage(vfn, data)
            train_info = algo.train(max_ent_coef, data, advantages, values)
            fps = int(FLAGS.TRPO.rollout_samples / (time.time() - time_st))
            train_info['reward'] = np.mean(data.reward)
            train_info['fps'] = fps

            expert_batch = expert_dataset.sample(256)
            expert_state = np.stack([t.obs for t in expert_batch])
            expert_action = np.stack([t.action for t in expert_batch])
            expert_next_state = np.stack([t.next_obs for t in expert_batch])
            train_mse_loss = policy.get_mse_loss(expert_state, expert_action,
                                                 expert_next_state)
            eval_mse_loss = policy.get_mse_loss(eval_state, eval_action,
                                                eval_next_state)
            train_info['mse_loss'] = dict(train=train_mse_loss,
                                          eval=eval_mse_loss)
            log_kvs(prefix='TRPO', kvs=dict(iter=t, **train_info))

            generator_dataset = data

        # Discriminator
        for n_update in range(FLAGS.GAIL.d_iters):
            batch_size = FLAGS.GAIL.d_batch_size
            d_train_infos = dict()
            for generator_subset in generator_dataset.iterator(batch_size):
                expert_batch = expert_dataset.sample(batch_size)
                expert_state = np.stack([t.obs for t in expert_batch])
                expert_action = np.stack([t.action for t in expert_batch])
                expert_next_state = np.stack(
                    [t.next_obs for t in expert_batch])
                expert_mask = None
                train_info = discriminator.train(
                    expert_state,
                    expert_action,
                    expert_next_state,
                    generator_subset.state,
                    generator_subset.action,
                    generator_subset.next_state,
                    expert_mask,
                )
                for k, v in train_info.items():
                    if k not in d_train_infos:
                        d_train_infos[k] = []
                    d_train_infos[k].append(v)
            d_train_infos = {k: np.mean(v) for k, v in d_train_infos.items()}
            if n_update == FLAGS.GAIL.d_iters - 1:
                log_kvs(prefix='Discriminator',
                        kvs=dict(iter=t, **d_train_infos))

        if t % FLAGS.TRPO.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    dict_result = dict()
    for gamma in [0.9, 0.99, 0.999, 1.0]:
        eval_returns, eval_lengths = evaluate_on_virtual_env(
            actor, env_eval_stochastic, gamma=gamma)
        dict_result[gamma] = [float(np.mean(eval_returns)), eval_returns]
        logger.info('[%s]: %.4f', gamma, np.mean(eval_returns))

    save_path = os.path.join(FLAGS.log_dir, 'evaluate.yml')
    with open(save_path, 'w') as f:
        yaml.dump(dict_result, f, default_flow_style=False)
Code example #4
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = make_env(FLAGS.env.id,
                   FLAGS.env.env_type,
                   num_env=FLAGS.env.num_env,
                   seed=FLAGS.seed,
                   log_dir=FLAGS.log_dir)
    state_spec = env.observation_space
    action_spec = env.action_space

    logger.info('[{}]: state_spec:{}, action_spec:{}'.format(
        FLAGS.env.id, state_spec.shape, action_spec.n))

    dtype = gen_dtype(env,
                      'state action next_state mu reward done timeout info')
    buffer = ReplayBuffer(env.n_envs,
                          FLAGS.ACER.n_steps,
                          stacked_frame=FLAGS.env.env_type == 'atari',
                          dtype=dtype,
                          size=FLAGS.ACER.buffer_size)

    if len(state_spec.shape) == 3:
        policy = CNNPolicy(state_spec, action_spec)
    else:
        policy = MLPPolicy(state_spec, action_spec)

    algo = ACER(state_spec,
                action_spec,
                policy,
                lr=FLAGS.ACER.lr,
                lrschedule=FLAGS.ACER.lrschedule,
                total_timesteps=FLAGS.ACER.total_timesteps,
                ent_coef=FLAGS.ACER.ent_coef,
                q_coef=FLAGS.ACER.q_coef,
                trust_region=FLAGS.ACER.trust_region)
    runner = Runner(env,
                    max_steps=env.max_episode_steps,
                    gamma=FLAGS.ACER.gamma)
    saver = nn.ModuleDict({'policy': policy})
    print(saver)

    tf.get_default_session().run(tf.global_variables_initializer())
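    # Synchronize ACER's average (old) policy network with the current policy
    # before training starts.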
    algo.update_old_policy(0.)

    n_steps = FLAGS.ACER.n_steps
    n_batches = n_steps * env.n_envs
    n_stages = FLAGS.ACER.total_timesteps // n_batches

    returns = collections.deque(maxlen=40)
    lengths = collections.deque(maxlen=40)
    replay_reward = collections.deque(maxlen=40)
    time_st = time.time()
    for t in range(n_stages):
        data, ep_infos = runner.run(policy, n_steps)
        returns.extend([info['return'] for info in ep_infos])
        lengths.extend([info['length'] for info in ep_infos])

        if t == 0:  # check runner
            indices = np.arange(0, n_batches, env.n_envs)
            for _ in range(env.n_envs):
                samples = data[indices]
                masks = 1 - (samples.done | samples.timeout)
                masks = masks[:-1]
                masks = np.reshape(masks,
                                   [-1] + [1] * len(samples.state.shape[1:]))
                np.testing.assert_allclose(samples.state[1:] * masks,
                                           samples.next_state[:-1] * masks)
                indices += 1

        buffer.store_episode(data)
        if t == 1:  # check buffer
            data_ = buffer.sample(idx=[1 for _ in range(env.n_envs)])
            check_data_equal(data_, data, ('state', 'action', 'next_state',
                                           'mu', 'reward', 'done', 'timeout'))

        # on-policy training
        qret = runner.compute_qret(policy, data)
        train_info = algo.train(data, qret, t * n_batches)
        replay_reward.append(np.mean(data.reward))
        # off-policy training
        if t * n_batches > FLAGS.ACER.replay_start:
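            # The number of off-policy (replay) updates per on-policy step is
            # drawn from a Poisson distribution with mean replay_ratio, as in the
            # ACER paper.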
            n = np.random.poisson(FLAGS.ACER.replay_ratio)
            for _ in range(n):
                data = buffer.sample()
                qret = runner.compute_qret(policy, data)
                algo.train(data, qret, t * n_batches)
                replay_reward.append(np.mean(data.reward))

        if t * n_batches % FLAGS.ACER.log_interval == 0:
            fps = int(t * n_batches / (time.time() - time_st))
            kvs = dict(iter=t * n_batches,
                       episode=dict(
                           returns=np.mean(returns) if len(returns) > 0 else 0,
                           lengths=np.mean(lengths).astype(np.int32)
                           if len(lengths) > 0 else 0),
                       **train_info,
                       replay_reward=np.mean(replay_reward)
                       if len(replay_reward) > 0 else 0.,
                       fps=fps)
            log_kvs(prefix='ACER', kvs=kvs)

        if t * n_batches % FLAGS.ACER.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
Code example #5
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = create_env(FLAGS.env.id,
                     seed=FLAGS.seed,
                     rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               output_diff=FLAGS.TRPO.output_diff,
                               normalizers=normalizers)
    bc_loss = BehavioralCloningLoss(dim_state,
                                    dim_action,
                                    policy,
                                    lr=float(FLAGS.BC.lr),
                                    train_std=FLAGS.BC.train_std)

    actor = Actor(dim_state, dim_action, FLAGS.SAC.actor_hidden_sizes)
    tf.get_default_session().run(tf.global_variables_initializer())

    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    # load expert dataset
    set_random_seed(2020)
    expert_dataset = load_expert_dataset(FLAGS.GAIL.buf_load)
    expert_state = np.stack([t.obs for t in expert_dataset.buffer()])
    expert_next_state = np.stack([t.next_obs for t in expert_dataset.buffer()])
    expert_done = np.stack([t.done for t in expert_dataset.buffer()])
    np.testing.assert_allclose(
        expert_next_state[:-1] * (1 - expert_done[:-1][:, None]),
        expert_state[1:] * (1 - expert_done[:-1][:, None]))
    del expert_state, expert_next_state, expert_done
    expert_reward = expert_dataset.get_average_reward()
    logger.info('Expert Reward %f', expert_reward)
    if FLAGS.GAIL.learn_absorbing:
        expert_dataset.add_absorbing_states(env)
    eval_batch = expert_dataset.sample(1024)
    eval_state = np.stack([t.obs for t in eval_batch])
    eval_action = np.stack([t.action for t in eval_batch])
    eval_next_state = np.stack([t.next_obs for t in eval_batch])
    logger.info('Sampled obs: %.4f, acs: %.4f', np.mean(eval_state),
                np.mean(eval_action))
    expert_dataset.subsample_trajectories(FLAGS.GAIL.traj_limit)
    logger.info('Original dataset size {}'.format(len(expert_dataset)))
    expert_dataset.subsample_transitions(subsampling_rate)
    logger.info('Subsampled dataset size {}'.format(len(expert_dataset)))
    logger.info('np random: %d random : %d', np.random.randint(1000),
                random.randint(0, 1000))
    set_random_seed(FLAGS.seed)

    loader = nn.ModuleDict({'actor': actor})
    loader.load_state_dict(
        np.load(FLAGS.ckpt.policy_load, allow_pickle=True)[()])
    logger.warning('Load expert policy from %s' % FLAGS.ckpt.policy_load)
    saver = nn.ModuleDict({'policy': policy, 'normalizers': normalizers})
    print(saver)

    # update normalizers with expert statistics
    expert_state = np.stack([t.obs for t in expert_dataset.buffer()])
    expert_action = np.stack([t.action for t in expert_dataset.buffer()])
    expert_next_state = np.stack([t.next_obs for t in expert_dataset.buffer()])
    normalizers.state.update(expert_state)
    normalizers.action.update(expert_action)
    normalizers.diff.update(expert_next_state - expert_state)
    del expert_state, expert_action, expert_next_state

    eval_gamma = 0.999
    eval_returns, eval_lengths = evaluate_on_true_env(actor,
                                                      env,
                                                      gamma=eval_gamma)
    logger.warning(
        'Test policy true value = %.4f true length = %d (gamma = %f)',
        np.mean(eval_returns), np.mean(eval_lengths), eval_gamma)

    # virtual env
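    # Model-based evaluation only: the expert actor is rolled out inside the
    # learned model (stochastic and deterministic variants) to measure model
    # error; BC training itself uses the expert dataset.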
    env_eval_stochastic = VirtualEnv(policy,
                                     env,
                                     n_envs=4,
                                     stochastic_model=True)
    env_eval_deterministic = VirtualEnv(policy,
                                        env,
                                        n_envs=4,
                                        stochastic_model=False)

    batch_size = FLAGS.BC.batch_size
    true_return = np.mean(eval_returns)
    for t in range(FLAGS.BC.max_iters):
        if t % FLAGS.BC.eval_freq == 0:
            eval_returns_stochastic, eval_lengths_stochastic = evaluate_on_virtual_env(
                actor, env_eval_stochastic, gamma=eval_gamma)
            eval_returns_deterministic, eval_lengths_deterministic = evaluate_on_virtual_env(
                actor, env_eval_deterministic, gamma=eval_gamma)
            log_kvs(
                prefix='Evaluate',
                kvs=dict(
                    iter=t,
                    stochastic_episode=dict(
                        returns=np.mean(eval_returns_stochastic),
                        lengths=int(np.mean(eval_lengths_stochastic))),
                    episode=dict(returns=np.mean(eval_returns_deterministic),
                                 lengths=int(
                                     np.mean(eval_lengths_deterministic))),
                    evaluation_error=dict(
                        stochastic_error=true_return -
                        np.mean(eval_returns_stochastic),
                        stochastic_abs=np.abs(
                            true_return - np.mean(eval_returns_stochastic)),
                        stochastic_rel=np.abs(true_return -
                                              np.mean(eval_returns_stochastic))
                        / true_return,
                        deterministic_error=true_return -
                        np.mean(eval_returns_deterministic),
                        deterministic_abs=np.abs(
                            true_return - np.mean(eval_returns_deterministic)),
                        deterministic_rel=np.abs(true_return - np.mean(
                            eval_returns_deterministic)) / true_return)))

        expert_batch = expert_dataset.sample(batch_size)
        expert_state = np.stack([t.obs for t in expert_batch])
        expert_action = np.stack([t.action for t in expert_batch])
        expert_next_state = np.stack([t.next_obs for t in expert_batch])
        _, loss, grad_norm = bc_loss.get_loss(expert_state,
                                              expert_action,
                                              expert_next_state,
                                              fetch='train loss grad_norm')

        if t % 100 == 0:
            train_mse_loss = policy.get_mse_loss(expert_state, expert_action,
                                                 expert_next_state)
            eval_mse_loss = policy.get_mse_loss(eval_state, eval_action,
                                                eval_next_state)
            log_kvs(prefix='BC',
                    kvs=dict(iter=t,
                             grad_norm=grad_norm,
                             loss=loss,
                             mse_loss=dict(train=train_mse_loss,
                                           eval=eval_mse_loss)))

    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    dict_result = dict()
    for gamma in [0.9, 0.99, 0.999, 1.0]:
        eval_returns, eval_lengths = evaluate_on_virtual_env(
            actor, env_eval_stochastic, gamma=gamma)
        dict_result[gamma] = [float(np.mean(eval_returns)), eval_returns]
        logger.info('[%s]: %.4f', gamma, np.mean(eval_returns))

    save_path = os.path.join(FLAGS.log_dir, 'evaluate.yml')
    with open(save_path, 'w') as f:
        yaml.dump(dict_result, f, default_flow_style=False)
Code example #6
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = create_env(FLAGS.env.id, seed=FLAGS.seed, rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    # expert actor
    actor = Actor(dim_state, dim_action, init_std=0.)
    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    expert_state, expert_action, expert_next_state, expert_reward = collect_samples_from_true_env(
        env=env, actor=actor, nb_episode=FLAGS.GAIL.traj_limit, subsampling_rate=subsampling_rate)
    logger.info('Collect % d samples avg return = %.4f', len(expert_state), np.mean(expert_reward))
    eval_state, eval_action, eval_next_state, eval_reward = collect_samples_from_true_env(
        env=env, actor=actor, nb_episode=3, seed=FLAGS.seed)
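    # Three fresh episodes from the true env are held out to track the model's
    # evaluation MSE during training.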
    loc, scale = np.mean(expert_state, axis=0, keepdims=True), np.std(expert_state, axis=0, keepdims=True)
    logger.info('loc = {}\nscale={}'.format(loc, scale))

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state, dim_action, FLAGS.TRPO.policy_hidden_sizes,
                               output_diff=FLAGS.TRPO.output_diff, normalizers=normalizers)
    bc_loss = BehavioralCloningLoss(dim_state, dim_action, policy, lr=float(FLAGS.BC.lr), train_std=FLAGS.BC.train_std)

    tf.get_default_session().run(tf.global_variables_initializer())
    set_random_seed(FLAGS.seed)

    saver = nn.ModuleDict({'policy': policy, 'normalizers': normalizers})
    print(saver)

    # update normalizers with expert statistics
    normalizers.state.update(expert_state)
    normalizers.action.update(expert_action)
    normalizers.diff.update(expert_next_state - expert_state)

    eval_gamma = 0.999
    eval_returns, eval_lengths = evaluate_on_true_env(actor, env, gamma=eval_gamma)
    logger.warning('Test policy true value = %.4f true length = %d (gamma = %f)',
                   np.mean(eval_returns), np.mean(eval_lengths), eval_gamma)

    # virtual env
    env_eval_stochastic = VirtualEnv(policy, env, n_envs=4, stochastic_model=True)
    env_eval_deterministic = VirtualEnv(policy, env, n_envs=4, stochastic_model=False)

    batch_size = FLAGS.BC.batch_size
    true_return = np.mean(eval_returns)
    for t in range(FLAGS.BC.max_iters):
        if t % FLAGS.BC.eval_freq == 0:
            eval_returns_stochastic, eval_lengths_stochastic = evaluate_on_virtual_env(
                actor, env_eval_stochastic, gamma=eval_gamma)
            eval_returns_deterministic, eval_lengths_deterministic = evaluate_on_virtual_env(
                actor, env_eval_deterministic, gamma=eval_gamma)
            log_kvs(prefix='Evaluate', kvs=dict(
                iter=t, stochastic_episode=dict(
                    returns=np.mean(eval_returns_stochastic), lengths=int(np.mean(eval_lengths_stochastic))
                ), episode=dict(
                    returns=np.mean(eval_returns_deterministic), lengths=int(np.mean(eval_lengths_deterministic))
                ),  evaluation_error=dict(
                    stochastic_error=true_return-np.mean(eval_returns_stochastic),
                    stochastic_abs=np.abs(true_return-np.mean(eval_returns_stochastic)),
                    stochastic_rel=np.abs(true_return-np.mean(eval_returns_stochastic))/true_return,
                    deterministic_error=true_return-np.mean(eval_returns_deterministic),
                    deterministic_abs=np.abs(true_return - np.mean(eval_returns_deterministic)),
                    deterministic_rel=np.abs(true_return-np.mean(eval_returns_deterministic))/true_return
                )
            ))

        indices = np.random.randint(low=0, high=len(expert_state), size=batch_size)
        expert_state_ = expert_state[indices]
        expert_action_ = expert_action[indices]
        expert_next_state_ = expert_next_state[indices]
        _, loss, grad_norm = bc_loss.get_loss(expert_state_, expert_action_, expert_next_state_,
                                              fetch='train loss grad_norm')

        if t % 100 == 0:
            train_mse_loss = policy.get_mse_loss(expert_state_, expert_action_, expert_next_state_)
            eval_mse_loss = policy.get_mse_loss(eval_state, eval_action, eval_next_state)
            log_kvs(prefix='BC', kvs=dict(
                iter=t, grad_norm=grad_norm, loss=loss, mse_loss=dict(train=train_mse_loss, eval=eval_mse_loss)
            ))

    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    dict_result = dict()
    for gamma in [0.9, 0.99, 0.999, 1.0]:
        eval_returns, eval_lengths = evaluate_on_virtual_env(actor, env_eval_deterministic, gamma=gamma)
        dict_result[gamma] = [float(np.mean(eval_returns)), eval_returns]
        logger.info('[%s]: %.4f', gamma, np.mean(eval_returns))

    save_path = os.path.join(FLAGS.log_dir, 'evaluate.yml')
    with open(save_path, 'w') as f:
        yaml.dump(dict_result, f, default_flow_style=False)
Code example #7
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = create_env(FLAGS.env.id,
                     seed=FLAGS.seed,
                     log_dir=FLAGS.log_dir,
                     absorbing_state=FLAGS.GAIL.learn_absorbing,
                     rescale_action=FLAGS.env.rescale_action)
    env_eval = create_env(FLAGS.env.id,
                          seed=FLAGS.seed + 1000,
                          log_dir=FLAGS.log_dir,
                          absorbing_state=FLAGS.GAIL.learn_absorbing,
                          rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    # load expert dataset
    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    set_random_seed(2020)
    expert_dataset = load_expert_dataset(FLAGS.GAIL.buf_load)
    expert_reward = expert_dataset.get_average_reward()
    logger.info('Expert Reward %f', expert_reward)
    if FLAGS.GAIL.learn_absorbing:
        expert_dataset.add_absorbing_states(env)
    expert_dataset.subsample_trajectories(FLAGS.GAIL.traj_limit)
    logger.info('Original dataset size {}'.format(len(expert_dataset)))
    expert_dataset.subsample_transitions(subsampling_rate)
    logger.info('Subsampled dataset size {}'.format(len(expert_dataset)))
    logger.info('np random: %d random : %d', np.random.randint(1000),
                random.randint(0, 1000))
    expert_batch = expert_dataset.sample(10)
    expert_state = np.stack([t.obs for t in expert_batch])
    expert_action = np.stack([t.action for t in expert_batch])
    logger.info('Sampled obs: %.4f, acs: %.4f', np.mean(expert_state),
                np.mean(expert_action))
    del expert_batch, expert_state, expert_action
    set_random_seed(FLAGS.seed)

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               normalizer=normalizers.state)
    vfn = MLPVFunction(dim_state, FLAGS.TRPO.vf_hidden_sizes,
                       normalizers.state)
    algo = TRPO(vfn=vfn,
                policy=policy,
                dim_state=dim_state,
                dim_action=dim_action,
                **FLAGS.TRPO.algo.as_dict())

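    # Reward model: a neural-network discriminator ('nn'), or a linear reward
    # over (state, action) features ('simplex' / 'l2').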
    if FLAGS.GAIL.reward_type == 'nn':
        expert_batch = expert_dataset.buffer()
        expert_state = np.stack([t.obs for t in expert_batch])
        loc, scale = np.mean(expert_state, axis=0,
                             keepdims=True), np.std(expert_state,
                                                    axis=0,
                                                    keepdims=True)
        del expert_batch, expert_state
        discriminator = Discriminator(dim_state,
                                      dim_action,
                                      normalizers=normalizers,
                                      subsampling_rate=subsampling_rate,
                                      loc=loc,
                                      scale=scale,
                                      **FLAGS.GAIL.discriminator.as_dict())
    elif FLAGS.GAIL.reward_type in {'simplex', 'l2'}:
        discriminator = LinearReward(
            dim_state, dim_action, simplex=FLAGS.GAIL.reward_type == 'simplex')
    else:
        raise NotImplementedError
    tf.get_default_session().run(tf.global_variables_initializer())

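    # Linear reward models need the expert features up front (discriminator.build),
    # unlike the neural discriminator, which is trained adversarially in the loop.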
    if FLAGS.GAIL.reward_type != 'nn':
        expert_batch = expert_dataset.buffer()
        expert_state = np.stack([t.obs for t in expert_batch])
        expert_action = np.stack([t.action for t in expert_batch])
        discriminator.build(expert_state, expert_action)
        del expert_batch, expert_state, expert_action

    saver = nn.ModuleDict({
        'policy': policy,
        'vfn': vfn,
        'normalizers': normalizers,
        'discriminator': discriminator
    })
    runner = Runner(env,
                    max_steps=env.max_episode_steps,
                    gamma=FLAGS.TRPO.gamma,
                    lambda_=FLAGS.TRPO.lambda_,
                    add_absorbing_state=FLAGS.GAIL.learn_absorbing)
    print(saver)

    max_ent_coef = FLAGS.TRPO.algo.ent_coef
    eval_gamma = 0.999
    for t in range(0, FLAGS.GAIL.total_timesteps,
                   FLAGS.TRPO.rollout_samples * FLAGS.GAIL.g_iters):
        time_st = time.time()
        if t % FLAGS.GAIL.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(policy, env_eval)
            eval_returns_discount, eval_lengths_discount = evaluate(
                policy, env_eval, gamma=eval_gamma)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths))),
                             discounted_episode=dict(
                                 returns=np.mean(eval_returns_discount),
                                 lengths=int(np.mean(eval_lengths_discount)))))

        # Generator
        generator_dataset = None
        for n_update in range(FLAGS.GAIL.g_iters):
            data, ep_infos = runner.run(policy, FLAGS.TRPO.rollout_samples)
            if FLAGS.TRPO.normalization:
                normalizers.state.update(data.state)
                normalizers.action.update(data.action)
                normalizers.diff.update(data.next_state - data.state)
            if t == 0 and n_update == 0 and not FLAGS.GAIL.learn_absorbing:
                data_ = data.copy()
                data_ = data_.reshape(
                    [FLAGS.TRPO.rollout_samples // env.n_envs, env.n_envs])
                for e in range(env.n_envs):
                    samples = data_[:, e]
                    masks = 1 - (samples.done | samples.timeout)[...,
                                                                 np.newaxis]
                    masks = masks[:-1]
                    assert np.allclose(samples.state[1:] * masks,
                                       samples.next_state[:-1] * masks)
            t += FLAGS.TRPO.rollout_samples
            data.reward = discriminator.get_reward(data.state, data.action)
            advantages, values = runner.compute_advantage(vfn, data)
            train_info = algo.train(max_ent_coef, data, advantages, values)
            fps = int(FLAGS.TRPO.rollout_samples / (time.time() - time_st))
            train_info['reward'] = np.mean(data.reward)
            train_info['fps'] = fps

            expert_batch = expert_dataset.sample(256)
            expert_state = np.stack([t.obs for t in expert_batch])
            expert_action = np.stack([t.action for t in expert_batch])
            train_info['mse_loss'] = policy.get_mse_loss(
                expert_state, expert_action)
            log_kvs(prefix='TRPO', kvs=dict(iter=t, **train_info))

            generator_dataset = data

        # Discriminator
        if FLAGS.GAIL.reward_type in {'nn', 'vb'}:
            for n_update in range(FLAGS.GAIL.d_iters):
                batch_size = FLAGS.GAIL.d_batch_size
                d_train_infos = dict()
                for generator_subset in generator_dataset.iterator(batch_size):
                    expert_batch = expert_dataset.sample(batch_size)
                    expert_state = np.stack([t.obs for t in expert_batch])
                    expert_action = np.stack([t.action for t in expert_batch])
                    expert_mask = np.stack([
                        t.mask for t in expert_batch
                    ]).flatten() if FLAGS.GAIL.learn_absorbing else None
                    train_info = discriminator.train(
                        expert_state,
                        expert_action,
                        generator_subset.state,
                        generator_subset.action,
                        expert_mask,
                    )
                    for k, v in train_info.items():
                        if k not in d_train_infos:
                            d_train_infos[k] = []
                        d_train_infos[k].append(v)
                d_train_infos = {
                    k: np.mean(v)
                    for k, v in d_train_infos.items()
                }
                if n_update == FLAGS.GAIL.d_iters - 1:
                    log_kvs(prefix='Discriminator',
                            kvs=dict(iter=t, **d_train_infos))
        else:
            train_info = discriminator.train(generator_dataset.state,
                                             generator_dataset.action)
            log_kvs(prefix='Discriminator', kvs=dict(iter=t, **train_info))

        if t % FLAGS.TRPO.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    dict_result = dict()
    for gamma in [0.9, 0.99, 0.999, 1.0]:
        eval_returns, eval_lengths = evaluate(policy, env_eval, gamma=gamma)
        dict_result[gamma] = [float(np.mean(eval_returns)), eval_returns]
        logger.info('[%s]: %.4f', gamma, np.mean(eval_returns))

    save_path = os.path.join(FLAGS.log_dir, 'evaluate.yml')
    with open(save_path, 'w') as f:
        yaml.dump(dict_result, f, default_flow_style=False)
Code example #8
File: main.py  Project: liziniu/RLX
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = make_env(FLAGS.env.id,
                   FLAGS.env.env_type,
                   num_env=FLAGS.env.num_env,
                   seed=FLAGS.seed,
                   log_dir=FLAGS.log_dir,
                   rescale_action=FLAGS.env.rescale_action)
    env_eval = make_env(FLAGS.env.id,
                        FLAGS.env.env_type,
                        num_env=4,
                        seed=FLAGS.seed + 1000,
                        log_dir=FLAGS.log_dir)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    actor = Actor(dim_state,
                  dim_action,
                  hidden_sizes=FLAGS.TD3.actor_hidden_sizes)
    critic = Critic(dim_state,
                    dim_action,
                    hidden_sizes=FLAGS.TD3.critic_hidden_sizes)
    td3 = TD3(dim_state,
              dim_action,
              actor=actor,
              critic=critic,
              **FLAGS.TD3.algo.as_dict())

    tf.get_default_session().run(tf.global_variables_initializer())
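    # Synchronize the target actor/critic networks with the online networks
    # before training starts.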
    td3.update_actor_target(tau=0.0)
    td3.update_critic_target(tau=0.0)

    dtype = gen_dtype(env, 'state action next_state reward done timeout')
    buffer = Dataset(dtype=dtype, max_size=FLAGS.TD3.buffer_size)
    saver = nn.ModuleDict({'actor': actor, 'critic': critic})
    print(saver)

    n_steps = np.zeros(env.n_envs)
    n_returns = np.zeros(env.n_envs)

    train_returns = collections.deque(maxlen=40)
    train_lengths = collections.deque(maxlen=40)
    states = env.reset()
    time_st = time.time()
    for t in range(FLAGS.TD3.total_timesteps):
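        # Warm-up: act uniformly at random for the first init_random_steps
        # environment steps to seed the replay buffer before TD3 updates start.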
        if t < FLAGS.TD3.init_random_steps:
            actions = np.array(
                [env.action_space.sample() for _ in range(env.n_envs)])
        else:
            raw_actions = actor.get_actions(states)
            noises = np.random.normal(loc=0.,
                                      scale=FLAGS.TD3.explore_noise,
                                      size=raw_actions.shape)
            actions = np.clip(raw_actions + noises, -1, 1)
        next_states, rewards, dones, infos = env.step(actions)
        n_returns += rewards
        n_steps += 1
        timeouts = n_steps == env.max_episode_steps
        terminals = np.copy(dones)
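        # Time-limit truncations are not true terminals: clear their done flag
        # so the critic can still bootstrap across timeouts.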
        for e, info in enumerate(infos):
            if info.get('TimeLimit.truncated', False):
                terminals[e] = False

        transitions = [
            states, actions,
            next_states.copy(), rewards, terminals,
            timeouts.copy()
        ]
        buffer.extend(np.rec.fromarrays(transitions, dtype=dtype))

        indices = np.where(dones | timeouts)[0]
        if len(indices) > 0:
            next_states[indices] = env.partial_reset(indices)

            train_returns.extend(n_returns[indices])
            train_lengths.extend(n_steps[indices])
            n_returns[indices] = 0
            n_steps[indices] = 0
        states = next_states.copy()

        if t == 2000:
            assert env.n_envs == 1
            samples = buffer.sample(size=None, indices=np.arange(2000))
            masks = 1 - (samples.done | samples.timeout)[..., np.newaxis]
            masks = masks[:-1]
            assert np.allclose(samples.state[1:] * masks,
                               samples.next_state[:-1] * masks)

        if t >= FLAGS.TD3.init_random_steps:
            samples = buffer.sample(FLAGS.TD3.batch_size)
            train_info = td3.train(samples)
            if t % FLAGS.TD3.log_freq == 0:
                fps = int(t / (time.time() - time_st))
                train_info['fps'] = fps
                log_kvs(prefix='TD3',
                        kvs=dict(iter=t,
                                 episode=dict(
                                     returns=np.mean(train_returns)
                                     if len(train_returns) > 0 else 0.,
                                     lengths=int(
                                         np.mean(train_lengths)
                                         if len(train_lengths) > 0 else 0)),
                                 **train_info))

        if t % FLAGS.TD3.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(actor,
                                                  env_eval,
                                                  deterministic=False)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths)))))

        if t % FLAGS.TD3.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
Code example #9
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = make_env(FLAGS.env.id,
                   FLAGS.env.env_type,
                   num_env=FLAGS.env.num_env,
                   seed=FLAGS.seed,
                   log_dir=FLAGS.log_dir,
                   rescale_action=FLAGS.env.rescale_action)
    env_eval = make_env(FLAGS.env.id,
                        FLAGS.env.env_type,
                        num_env=4,
                        seed=FLAGS.seed + 1000,
                        log_dir=FLAGS.log_dir)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               normalizer=normalizers.state)
    vfn = MLPVFunction(dim_state, FLAGS.TRPO.vf_hidden_sizes,
                       normalizers.state)
    algo = TRPO(vfn=vfn,
                policy=policy,
                dim_state=dim_state,
                dim_action=dim_action,
                **FLAGS.TRPO.algo.as_dict())

    tf.get_default_session().run(tf.global_variables_initializer())

    saver = nn.ModuleDict({
        'policy': policy,
        'vfn': vfn,
        'normalizers': normalizers
    })
    runner = Runner(env,
                    max_steps=env.max_episode_steps,
                    gamma=FLAGS.TRPO.gamma,
                    lambda_=FLAGS.TRPO.lambda_,
                    partial_episode_bootstrapping=FLAGS.TRPO.peb)
    print(saver)

    max_ent_coef = FLAGS.TRPO.algo.ent_coef
    train_returns = collections.deque(maxlen=40)
    train_lengths = collections.deque(maxlen=40)
    for t in range(0, FLAGS.TRPO.total_timesteps, FLAGS.TRPO.rollout_samples):
        time_st = time.time()
        if t % FLAGS.TRPO.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(policy, env_eval)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths)))))

        data, ep_infos = runner.run(policy, FLAGS.TRPO.rollout_samples)
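        # One-off consistency check on the first rollout: within each env's
        # stream, next_state[i] should equal state[i + 1] unless an episode
        # ended in between (masked out via done | timeout).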
        if t == 0:
            data_ = data.copy()
            data_ = data_.reshape(
                [FLAGS.TRPO.rollout_samples // env.n_envs, env.n_envs])
            for e in range(env.n_envs):
                samples = data_[:, e]
                masks = 1 - (samples.done | samples.timeout)[..., np.newaxis]
                masks = masks[:-1]
                assert np.allclose(samples.state[1:] * masks,
                                   samples.next_state[:-1] * masks)

        if FLAGS.TRPO.normalization:
            normalizers.state.update(data.state)
            normalizers.action.update(data.action)
            normalizers.diff.update(data.next_state - data.state)
        advantages, values = runner.compute_advantage(vfn, data)
        train_info = algo.train(max_ent_coef, data, advantages, values)
        train_returns.extend([info['return'] for info in ep_infos])
        train_lengths.extend([info['length'] for info in ep_infos])
        fps = int(FLAGS.TRPO.rollout_samples / (time.time() - time_st))
        train_info['fps'] = fps
        log_kvs(prefix='TRPO',
                kvs=dict(iter=t,
                         episode=dict(
                             returns=np.mean(train_returns)
                             if len(train_returns) > 0 else 0.,
                             lengths=int(
                                 np.mean(train_lengths)
                                 if len(train_lengths) > 0 else 0)),
                         **train_info))

        # range() resets t on the next iteration, so this only advances t to the
        # end of the current rollout for the save-frequency check below.
        t += FLAGS.TRPO.rollout_samples
        if t % FLAGS.TRPO.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
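
runner.compute_advantage above is driven by the Runner's gamma and lambda_,
which points at generalized advantage estimation (GAE). A minimal
single-environment sketch of that recursion, under that assumption and with
illustrative names rather than the repo's API:

import numpy as np

def gae_advantages(rewards, values, last_value, dones, gamma=0.99, lam=0.95):
    """Backward GAE recursion for one environment.

    rewards, values, dones have length T; last_value is V(s_T). With partial-
    episode bootstrapping, time-limit truncations are passed in as done=False.
    """
    T = len(rewards)
    advantages = np.zeros(T)
    next_value, next_adv = last_value, 0.0
    for i in reversed(range(T)):
        nonterminal = 1.0 - float(dones[i])
        delta = rewards[i] + gamma * next_value * nonterminal - values[i]
        next_adv = delta + gamma * lam * nonterminal * next_adv
        advantages[i] = next_adv
        next_value = values[i]
    returns = advantages + values  # regression targets for the value function
    return advantages, returns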
Code Example #10
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = make_env(FLAGS.env.id,
                   FLAGS.env.env_type,
                   num_env=FLAGS.env.num_env,
                   seed=FLAGS.seed,
                   log_dir=FLAGS.log_dir,
                   rescale_action=FLAGS.env.rescale_action)
    env_eval = make_env(FLAGS.env.id,
                        FLAGS.env.env_type,
                        num_env=4,
                        seed=FLAGS.seed + 1000,
                        log_dir=FLAGS.log_dir)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    actor = Actor(dim_state,
                  dim_action,
                  hidden_sizes=FLAGS.SAC.actor_hidden_sizes)
    critic = Critic(dim_state,
                    dim_action,
                    hidden_sizes=FLAGS.SAC.critic_hidden_sizes)
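    # Default target entropy of -|A| (minus the action dimension), the common
    # heuristic for SAC's automatic temperature tuning.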
    target_entropy = FLAGS.SAC.target_entropy
    if target_entropy is None:
        target_entropy = -dim_action
    sac = SAC(dim_state,
              dim_action,
              actor=actor,
              critic=critic,
              target_entropy=target_entropy,
              **FLAGS.SAC.algo.as_dict())

    tf.get_default_session().run(tf.global_variables_initializer())
    sac.update_critic_target(tau=0.0)

    dtype = gen_dtype(env, 'state action next_state reward done')
    buffer = Dataset(dtype=dtype, max_size=FLAGS.SAC.buffer_size)
    saver = nn.ModuleDict({'actor': actor, 'critic': critic})
    print(saver)

    n_steps = np.zeros(env.n_envs)
    n_returns = np.zeros(env.n_envs)

    train_returns = collections.deque(maxlen=40)
    train_lengths = collections.deque(maxlen=40)
    states = env.reset()
    time_st = time.time()
    for t in range(FLAGS.SAC.total_timesteps):
        if t < FLAGS.SAC.init_random_steps:
            actions = np.array(
                [env.action_space.sample() for _ in range(env.n_envs)])
        else:
            actions = actor.get_actions(states)
        next_states, rewards, dones, infos = env.step(actions)
        n_returns += rewards
        n_steps += 1
        timeouts = n_steps == env.max_episode_steps
        terminals = np.copy(dones)
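        # With partial-episode bootstrapping enabled, a time-limit truncation is
        # not treated as a true terminal, so the critic keeps bootstrapping.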
        for e, info in enumerate(infos):
            if FLAGS.SAC.peb and info.get('TimeLimit.truncated', False):
                terminals[e] = False

        transitions = [states, actions, next_states.copy(), rewards, terminals]
        buffer.extend(np.rec.fromarrays(transitions, dtype=dtype))

        indices = np.where(dones | timeouts)[0]
        if len(indices) > 0:
            next_states[indices] = env.partial_reset(indices)

            train_returns.extend(n_returns[indices])
            train_lengths.extend(n_steps[indices])
            n_returns[indices] = 0
            n_steps[indices] = 0
        states = next_states.copy()

        if t >= FLAGS.SAC.init_random_steps:
            samples = buffer.sample(FLAGS.SAC.batch_size)
            train_info = sac.train(samples)
            if t % FLAGS.SAC.log_freq == 0:
                fps = int(t / (time.time() - time_st))
                train_info['fps'] = fps
                log_kvs(prefix='SAC',
                        kvs=dict(iter=t,
                                 episode=dict(
                                     returns=np.mean(train_returns)
                                     if len(train_returns) > 0 else 0.,
                                     lengths=int(
                                         np.mean(train_lengths)
                                         if len(train_lengths) > 0 else 0)),
                                 **train_info))

        if t % FLAGS.SAC.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(actor, env_eval)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths)))))

        if t % FLAGS.SAC.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())

    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
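
The -dim_action default above feeds SAC's automatic temperature tuning. As an
illustration of what that objective does (a plain-NumPy sketch, not the repo's
TensorFlow implementation), the temperature is adjusted so the policy's entropy
tracks the target:

import numpy as np

def update_log_alpha(log_alpha, log_pi_batch, target_entropy, lr=3e-4):
    # J(log_alpha) = -log_alpha * E[log_pi + target_entropy] (log_pi detached),
    # so dJ/dlog_alpha = -E[log_pi + target_entropy]; take one SGD step.
    grad = -np.mean(log_pi_batch + target_entropy)
    return log_alpha - lr * grad

dim_action = 6                       # e.g. a 6-dimensional action space
target_entropy = -dim_action         # same default as above
log_alpha = 0.0
log_pi_batch = np.random.normal(-5.0, 1.0, size=256)  # stand-in policy log-probs
log_alpha = update_log_alpha(log_alpha, log_pi_batch, target_entropy)
# If the policy entropy (-log_pi on average) exceeds the target, alpha shrinks;
# if it falls below, alpha grows, pushing the policy back toward exploration.
alpha = np.exp(log_alpha)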