Code Example #1
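# Loads a logged dataset identified by env/seed/alpha hyperparameters and solves
# TabularDualDice to estimate the target policy's per-step average reward.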
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)

    estimator = TabularDualDice(dataset.spec, gamma)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)
Code Example #2
File: run_saddle.py Project: google-research/dice_rl
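# Trains TabularSaddlePoint on batches of transitions and initial steps, and every
# 100 steps evaluates the learned policy by rolling out short on-policy episodes.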
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    algo = TabularSaddlePoint(dataset.spec, optimizer, gamma=gamma)

    losses = []
    for step in range(num_steps):
        init_batch, _ = dataset.get_episode(batch_size, truncate_episode_at=1)
        init_batch = tf.nest.map_structure(lambda t: t[:, 0, ...], init_batch)
        batch = dataset.get_step(batch_size, num_steps=2)
        loss, policy_loss = algo.train_step(init_batch, batch)
        losses.append(loss)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', np.mean(losses, 0))
            losses = []
            policy_fn, policy_info_spec = algo.get_policy()
            onpolicy_data = get_onpolicy_dataset(env_name, tabular_obs,
                                                 policy_fn, policy_info_spec)
            onpolicy_episodes, _ = onpolicy_data.get_episode(
                10, truncate_episode_at=40)
            print('estimated per step avg', np.mean(onpolicy_episodes.reward))

    print('Done!')
Code Example #3
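# Solves TabularTeQDice (with a configurable step encoding) on a dataset whose
# training horizon may differ from the evaluation horizon, then saves the estimate.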
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    step_encoding = FLAGS.step_encoding
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    max_trajectory_length_train = FLAGS.max_trajectory_length_train or max_trajectory_length

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_base = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                   'numtraj{NUM_TRAJ}').format(ENV_NAME=env_name,
                                               TAB=tabular_obs,
                                               ALPHA=alpha,
                                               SEED=seed,
                                               NUM_TRAJ=num_trajectory)
    hparam_data = hparam_base + '_maxtraj{MAX_TRAJ}'.format(
        MAX_TRAJ=max_trajectory_length_train)
    hparam_out = hparam_base + '_maxtraj{MAX_TRAJ}'.format(
        MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_data)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)

    estimator = TabularTeQDice(dataset.spec, gamma, max_trajectory_length,
                               step_encoding)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)

    print('Done!')

    if save_dir is not None:
        if not tf.io.gfile.isdir(save_dir):
            tf.io.gfile.makedirs(save_dir)
        out_fname = os.path.join(
            save_dir, hparam_out + '_enc{ENC}.npy'.format(ENC=step_encoding))
        print('Saving results to', out_fname)
        with tf.io.gfile.GFile(out_fname, 'w') as f:
            np.save(f, estimate.numpy())
Code Example #4
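# Helper that builds a Maze environment for the given seed and loads the matching
# logged dataset; env_name, num_trajectory, max_trajectory_length and load_dir are
# defined in the surrounding script.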
def load_dataset_env(seed):
    name, wall_type = env_name.split('-')
    size = int(name.split(':')[-1])
    env = maze.Maze(size, wall_type, maze_seed=seed)
    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=False,
                      ALPHA=1.0,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    dataset = Dataset.load(directory)
    return dataset, env
Code Example #5
File: create_dataset.py Project: tianxusky/dice_rl
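# Collects num_trajectory episodes in batches from an on-policy dataset, writes them
# into a TFOffpolicyDataset, saves it to disk, and sanity-checks the reloaded copy.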
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  save_dir = FLAGS.save_dir
  load_dir = FLAGS.load_dir
  force = FLAGS.force

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(save_dir, hparam_str)
  if tf.io.gfile.isdir(directory) and not force:
    raise ValueError('Directory %s already exists. Use --force to overwrite.' %
                     directory)

  np.random.seed(seed)
  tf.random.set_seed(seed)

  dataset = get_onpolicy_dataset(load_dir, env_name, tabular_obs,
                                 max_trajectory_length, alpha, seed)

  write_dataset = TFOffpolicyDataset(
      dataset.spec,
      capacity=num_trajectory * (max_trajectory_length + 1))

  batch_size = 20
  for batch_num in range(1 + (num_trajectory - 1) // batch_size):
    num_trajectory_after_batch = min(num_trajectory, batch_size * (batch_num + 1))
    num_trajectory_to_get = num_trajectory_after_batch - batch_num * batch_size
    episodes, valid_steps = dataset.get_episode(
        batch_size=num_trajectory_to_get)
    add_episodes_to_dataset(episodes, valid_steps, write_dataset)

    print('num episodes collected', write_dataset.num_total_episodes)
    print('num steps collected', write_dataset.num_steps)

    estimate = estimator_lib.get_fullbatch_average(write_dataset)
    print('per step avg on offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(write_dataset,
                                                   by_steps=False)
    print('per episode avg on offpolicy data', estimate)

  print('Saving dataset to %s.' % directory)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)
  write_dataset.save(directory)

  print('Loading dataset.')
  new_dataset = Dataset.load(directory)
  print('num loaded steps', new_dataset.num_steps)
  print('num loaded total steps', new_dataset.num_total_steps)
  print('num loaded episodes', new_dataset.num_episodes)
  print('num loaded total episodes', new_dataset.num_total_episodes)

  estimate = estimator_lib.get_fullbatch_average(new_dataset)
  print('per step avg on saved and loaded offpolicy data', estimate)
  estimate = estimator_lib.get_fullbatch_average(new_dataset,
                                                 by_steps=False)
  print('per episode avg on saved and loaded offpolicy data', estimate)

  print('Done!')
Code Example #6
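# Trains NeuralBayesDice (nu/zeta value networks plus a lambda optimizer) on the
# loaded dataset and periodically logs the mean/std of sampled dual estimates.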
def main(argv):
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target
    gamma = FLAGS.gamma
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form

    primal_regularizer = FLAGS.primal_regularizer
    dual_regularizer = FLAGS.dual_regularizer
    kl_regularizer = FLAGS.kl_regularizer
    zero_reward = FLAGS.zero_reward
    norm_regularizer = FLAGS.norm_regularizer
    zeta_pos = FLAGS.zeta_pos

    scale_reward = FLAGS.scale_reward
    shift_reward = FLAGS.shift_reward
    transform_reward = FLAGS.transform_reward

    eps_std = FLAGS.eps_std

    def reward_fn(env_step):
        reward = env_step.reward * scale_reward + shift_reward
        if transform_reward is None:
            return reward
        if transform_reward == 'exp':
            reward = tf.math.exp(reward)
        elif transform_reward == 'cuberoot':
            reward = tf.sign(reward) * tf.math.pow(tf.abs(reward), 1.0 / 3.0)
        else:
            raise ValueError(
                'Reward {} not implemented.'.format(transform_reward))
        return reward

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    train_hparam_str = (
        'nlr{NLR}_zlr{ZLR}_zeror{ZEROR}_preg{PREG}_dreg{DREG}_kreg{KREG}_nreg{NREG}_'
        'pform{PFORM}_fexp{FEXP}_zpos{ZPOS}_'
        'scaler{SCALER}_shiftr{SHIFTR}_transr{TRANSR}').format(
            NLR=nu_learning_rate,
            ZLR=zeta_learning_rate,
            ZEROR=zero_reward,
            PREG=primal_regularizer,
            DREG=dual_regularizer,
            KREG=kl_regularizer,
            NREG=norm_regularizer,
            PFORM=primal_form,
            FEXP=f_exponent,
            ZPOS=zeta_pos,
            SCALER=scale_reward,
            SHIFTR=shift_reward,
            TRANSR=transform_reward,
        )

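    # Note: the detailed train_hparam_str built above is immediately overridden by
    # the shorter string below, so only eps_std and kl_regularizer appear in the
    # save path.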
    train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std,
                                                  KL=kl_regularizer)

    if save_dir is not None:
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
        summary_writer.set_as_default()
    else:
        tf.summary.create_noop_writer()

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    #dataset = Dataset.load(directory.replace('alpha{}'.format(alpha), 'alpha0.0'))

    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    input_spec = (dataset.spec.observation, dataset.spec.action)
    nu_network = ValueNetwork(input_spec,
                              output_dim=2,
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=kernel_initializer)
    output_activation_fn = tf.math.square if zeta_pos else tf.identity
    zeta_network = ValueNetwork(input_spec,
                                output_dim=2,
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                output_activation_fn=output_activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=kernel_initializer)

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)
    lam_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)

    estimator = NeuralBayesDice(dataset.spec,
                                nu_network,
                                zeta_network,
                                nu_optimizer,
                                zeta_optimizer,
                                lam_optimizer,
                                gamma,
                                zero_reward=zero_reward,
                                f_exponent=f_exponent,
                                primal_form=primal_form,
                                reward_fn=reward_fn,
                                primal_regularizer=primal_regularizer,
                                dual_regularizer=dual_regularizer,
                                kl_regularizer=kl_regularizer,
                                eps_std=FLAGS.eps_std,
                                norm_regularizer=norm_regularizer,
                                nu_regularizer=nu_regularizer,
                                zeta_regularizer=zeta_regularizer)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    target_policy = get_target_policy(load_dir, env_name, tabular_obs,
                                      alpha_target)
    running_losses = []
    all_dual = []
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            num_samples = 100
            dual_ests = []
            for i in range(num_samples):
                dual_est = estimator.estimate_average_reward(
                    dataset, target_policy, write_summary=(i == 0))
                dual_ests.append(dual_est)
            tf.summary.scalar('dual/mean', tf.math.reduce_mean(dual_ests))
            tf.summary.scalar('dual/std', tf.math.reduce_std(dual_ests))

            tf.print('dual/mean =', tf.math.reduce_mean(dual_ests),
                     'dual/std =', tf.math.reduce_std(dual_ests))

            all_dual.append(dual_ests)
            running_losses = []
        global_step.assign_add(1)

    if save_dir is not None:
        np.save(tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'w'),
                all_dual)

    print('Done!')
Code Example #7
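# Runs TabularRobustDice: periodically re-solves the tabular nu/zeta variables and
# logs the resulting per-step average-reward estimates as TensorBoard summaries.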
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    num_steps = FLAGS.num_steps
    divergence_limit = FLAGS.divergence_limit
    algae_alpha = FLAGS.algae_alpha
    alpha_learning_rate = FLAGS.alpha_learning_rate
    train_nu_zeta_per_steps = FLAGS.train_nu_zeta_per_steps
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    train_hparam_str = ('alr{A_LR}_tnzs{TNZS}_limit{LIMIT}_'
                        'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                            A_LR=alpha_learning_rate,
                            TNZS=train_nu_zeta_per_steps,
                            LIMIT=limit_episodes,
                            GAMMA=gamma,
                            ALGAE_ALPHA=algae_alpha,
                            DIV=divergence_limit)

    if save_dir is not None:
        save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    alpha_optimizer = tf.keras.optimizers.Adam(alpha_learning_rate,
                                               beta_1=0.0,
                                               beta_2=0.0)

    episodes, valid_steps = dataset.get_all_episodes(limit=limit_episodes)
    num_samples = tf.reduce_sum(
        tf.cast(
            tf.logical_and(valid_steps, episodes.discount > 0)[:, :-1],
            tf.float32))
    estimator = TabularRobustDice(
        dataset_spec=dataset.spec,
        alpha_optimizer=alpha_optimizer,
        gamma=gamma,
        divergence_limit=divergence_limit / num_samples,
        algae_alpha=algae_alpha * np.array([1, 1]),
        limit_episodes=limit_episodes)
    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    def one_step(transitions_batch, initial_steps_batch, target_policy):
        global_step.assign_add(1)
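        # The alpha-training step is commented out in this snippet; only the global
        # step counter advances here.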
        #initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
        #                                            initial_steps_batch)
        #losses, _ = estimator.train_alpha(initial_steps_batch, transitions_batch,
        #                                  target_policy)
        #return losses

    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            if step % train_nu_zeta_per_steps == 0:
                # first solve for the primal nu_loss,
                print('Step: {}. Solve for an updated tabular nu/zeta.'.format(
                    step))
                loss = estimator.solve_nu_zeta(dataset, target_policy)
                running_losses.append(loss)
            one_step(None, None, None)

            if step % 500 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = np.mean(running_losses, 0)[0]
                for idx, est in enumerate(estimate):
                    tf.summary.scalar('estimate%d' % idx, est)
                running_estimates.append(estimate)
                print('estimated per step avg %s' % estimate)
                print('avg last 3 estimated per step avg %s' %
                      np.mean(running_estimates[-3:], axis=0))
                running_losses = []

    if save_dir is not None:
        results_filename = os.path.join(save_dir, 'results.npy')
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, running_estimates)
    print('Done!')
Code Example #8
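# Builds an ImportanceSamplingCI estimator (optionally with a trained policy network
# and a doubly-robust Q-network), shifting rewards to be non-negative, and reports
# both point estimates and confidence intervals.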
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  policy_learning_rate = FLAGS.policy_learning_rate
  q_learning_rate = FLAGS.q_learning_rate
  batch_size = FLAGS.batch_size
  mode = FLAGS.mode
  ci_method = FLAGS.ci_method
  delta = FLAGS.delta
  delta_tail = 1 - delta
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  use_trained_policy = FLAGS.use_trained_policy
  use_doubly_robust = FLAGS.use_doubly_robust
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)

  if FLAGS.num_trajectory_data is not None:
    hparam_str_data = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                       'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                           ENV_NAME=env_name,
                           TAB=tabular_obs,
                           ALPHA=alpha,
                           SEED=seed,
                           NUM_TRAJ=FLAGS.num_trajectory_data,
                           MAX_TRAJ=max_trajectory_length)
  else:
    hparam_str_data = hparam_str

  directory = os.path.join(load_dir, hparam_str_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = (
      'plr{P_LR}_tp{TRAINED_P}_batch{BATCH_SIZE}_mode{MODE}_CI{CI_METHOD}_UTP{USE_TRAINED_POLICY}_gam{GAMMA}_del{DELTA}'
  ).format(
      P_LR=policy_learning_rate,
      TRAINED_P=use_trained_policy,
      BATCH_SIZE=batch_size,
      MODE=mode,
      CI_METHOD=ci_method,
      USE_TRAINED_POLICY=use_trained_policy,
      GAMMA=gamma,
      DELTA=delta)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  def non_negative_reward_translation(env_step):
    return env_step.reward - min_reward

  def inv_non_negative_estimate_translation(estimate):
    return estimate + min_reward

  if use_trained_policy:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    policy_optimizer = tf.keras.optimizers.Adam(
        policy_learning_rate, beta_1=0.0, beta_2=0.0)
    policy_network = PolicyNetwork(
        dataset.spec.observation,
        dataset.spec.action,
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    policy_optimizer = None
    policy_network = None

  if use_doubly_robust:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    q_optimizer = tf.keras.optimizers.Adam(q_learning_rate)
    q_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    q_optimizer = None
    q_network = None

  estimator = ImportanceSamplingCI(
      dataset_spec=dataset.spec,
      policy_optimizer=policy_optimizer,
      policy_network=policy_network,
      mode=mode,
      ci_method=ci_method,
      delta_tail=delta_tail,
      gamma=gamma,
      reward_fn=non_negative_reward_translation,
      q_network=q_network,
      q_optimizer=q_optimizer)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  # Following is for policy learning + IS confidence interval
  @tf.function
  def one_step(data_batch):
    global_step.assign_add(1)
    loss = estimator.train_step(data_batch, target_policy)
    return loss

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    running_estimate_cis = []
    for step in range(num_steps):
      data_batch = dataset.get_step(batch_size, num_steps=2)
      loss = one_step(data_batch)
      running_losses.append(loss)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'loss', np.mean(running_losses, 0))
        running_losses = []
        estimate = estimator.estimate_average_reward(
            dataset, target_policy, episode_limit=num_trajectory)
        estimate = inv_non_negative_estimate_translation(estimate)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))

        estimate_ci = estimator.estimate_reward_ci(dataset, target_policy)
        estimate_ci = np.array(
            [inv_non_negative_estimate_translation(ele) for ele in estimate_ci])
        running_estimate_cis.append(estimate_ci)
        print('estimated CI per step avg %s' % estimate_ci)
        print('avg last 3 estimated CI per step avg %s' %
              np.mean(running_estimate_cis[-3:], axis=0))

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimate_cis)
  print('Done!')
Code Example #9
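# Trains NeuralCoinDice with nu, zeta and weight networks to produce confidence
# intervals on the target policy's per-step average reward.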
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  weight_learning_rate = FLAGS.weight_learning_rate
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('nlr{NU_LR}_zlr{Z_LR}_batch{BATCH_SIZE}_'
                      'gam{GAMMA}_nreg{NU_REG}_zreg{Z_REG}_algae{ALGAE_ALPHA}_'
                      'prim{PRIMAL}_div{DIV}').format(
                          NU_LR=nu_learning_rate,
                          Z_LR=zeta_learning_rate,
                          BATCH_SIZE=batch_size,
                          GAMMA=gamma,
                          NU_REG=nu_regularizer,
                          Z_REG=zeta_regularizer,
                          ALGAE_ALPHA=algae_alpha,
                          PRIMAL=primal_form,
                          DIV=divergence_limit)
  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.5, seed=1)
  hidden_dims = (64,)
  n_intervals = 1
  nu_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                            fc_layer_params=hidden_dims,
                            activation_fn=activation_fn,
                            kernel_initializer=kernel_initializer,
                            last_kernel_initializer=None,
                            output_dim=2 * 2 * n_intervals)
  zeta_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=None,
                              output_dim=2 * 2 * n_intervals)
  weight_network = ValueNetwork((dataset.spec.observation,  # initial state
                                 dataset.spec.observation,  # cur state
                                 dataset.spec.action,       # cur action
                                 dataset.spec.observation), # next state
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=None,
                                output_dim=2 * n_intervals)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate, beta_2=0.99)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate, beta_2=0.99)
  weight_optimizer = tf.keras.optimizers.Adam(weight_learning_rate, beta_2=0.99)

  estimator = NeuralCoinDice(dataset.spec,
                             nu_network, zeta_network,
                             weight_network,
                             nu_optimizer, zeta_optimizer,
                             weight_optimizer,
                             gamma=gamma,
                             divergence_limit=divergence_limit,
                             f_exponent=f_exponent,
                             primal_form=primal_form,
                             nu_regularizer=nu_regularizer,
                             zeta_regularizer=zeta_regularizer,
                             algae_alpha=algae_alpha * np.array([1, 1]),
                             unbias_algae_alpha=False,
                             closed_form_weights=True,
                             num_samples=None)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  @tf.function
  def one_step(transitions_batch, initial_steps_batch):
    global_step.assign_add(1)
    with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
      initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                  initial_steps_batch)
      losses, _ = estimator.train_step(initial_steps_batch, transitions_batch,
                                       target_policy)
    return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):

      transitions_batch = dataset.get_step(batch_size, num_steps=2)
      initial_steps_batch, _ = dataset.get_episode(
          batch_size, truncate_episode_at=1)
      losses = one_step(transitions_batch, initial_steps_batch)
      running_losses.append([t.numpy() for t in losses])

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')
Code Example #10
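# Wraps the dataset in PerturbedDataset and trains an ensemble NeuralQLearning
# estimator (num_qvalues=10) with n-step returns to estimate the per-step average.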
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    nstep_returns = FLAGS.nstep_returns
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)
    dataset = PerturbedDataset(dataset,
                               num_perturbations=10,
                               perturbation_scale=1.)
    #estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    #print('perturbed data per step avg', estimate)

    value_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=(64, 64),
        output_dim=10)
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    estimator = NeuralQLearning(dataset.spec,
                                value_network,
                                optimizer,
                                gamma,
                                num_qvalues=10)
    for step in range(num_steps):
        batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
        loss, _ = estimator.train_step(batch, target_policy)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', loss)
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            print('estimated per step avg', estimate)

    print('Done!')
Code Example #11
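# Solves an ensemble TabularQLearning estimator over several reward-perturbation
# scales and saves the resulting per-step average-reward estimate.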
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    train_hparam_str = ('gamma{GAM}_limit{LIMIT}').format(GAM=gamma,
                                                          LIMIT=limit_episodes)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    estimator = TabularQLearning(
        dataset.spec,
        gamma,
        num_qvalues=200,
        perturbation_scale=[0.0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 1.],
        default_reward_value=0.0,
        limit_episodes=limit_episodes)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)

    if save_dir is not None:
        results_dir = os.path.join(save_dir, hparam_str)
        if not tf.io.gfile.exists(results_dir):
            tf.io.gfile.makedirs(results_dir)
        results_filename = os.path.join(results_dir,
                                        'results_%s.npy' % train_hparam_str)
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, estimate)

    print('Done!')
Code Example #12
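# Trains NeuralTeQDice with step-conditioned nu/zeta networks, reloading previously
# saved network weights when available, and logs and saves the running estimates.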
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    num_trajectory_train = FLAGS.num_trajectory_train
    if num_trajectory_train is None:
        num_trajectory_train = num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    max_trajectory_length_train = FLAGS.max_trajectory_length_train
    if max_trajectory_length_train is None:
        max_trajectory_length_train = max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form
    batch_size = FLAGS.batch_size
    num_steps = FLAGS.num_steps
    save_dir = FLAGS.save_dir
    network_dir = os.path.join(save_dir, 'networks') if save_dir else None
    estimate_dir = os.path.join(save_dir, 'estimates') if save_dir else None

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_base = '{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}'.format(
        ENV_NAME=env_name, TAB=tabular_obs, ALPHA=alpha, SEED=seed)

    hparam_data = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory if num_steps == 0 else num_trajectory_train,
        MAX_TRAJ=max_trajectory_length
        if num_steps == 0 else max_trajectory_length_train)
    hparam_net = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory_train, MAX_TRAJ=max_trajectory_length_train)
    hparam_result = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory, MAX_TRAJ=max_trajectory_length)

    if estimate_dir is not None:
        if not tf.io.gfile.isdir(estimate_dir):
            tf.io.gfile.makedirs(estimate_dir)
        log_file = os.path.join(estimate_dir, hparam_result + '.log')
        print("Logging to '{0}'".format(log_file))
        sys.stdout = Logger(log_file)

    directory = os.path.join(load_dir, hparam_data)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    activation_fn = tf.nn.tanh
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, )
    step_encoding = None
    #step_encoding = 'one_hot'
    nu_network = StepValueNetwork(
        (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer,
        max_trajectory_length_train=max_trajectory_length_train,
        step_encoding=step_encoding)
    zeta_network = StepValueNetwork(
        (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer,
        max_trajectory_length_train=max_trajectory_length_train,
        step_encoding=step_encoding)
    nu_network.create_variables()
    zeta_network.create_variables()
    try:
        nu_network.load_weights(os.path.join(network_dir, hparam_net, 'nu'))
        zeta_network.load_weights(os.path.join(network_dir, hparam_net,
                                               'zeta'))
        print('loaded networks from', network_dir)
    except Exception:
        print('initialized network from scratch')

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)

    estimator = NeuralTeQDice(dataset.spec,
                              nu_network,
                              zeta_network,
                              nu_optimizer,
                              zeta_optimizer,
                              gamma,
                              f_exponent=f_exponent,
                              primal_form=primal_form,
                              nu_regularizer=nu_regularizer,
                              zeta_regularizer=zeta_regularizer)

    running_losses = []
    running_estimates = []
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            print('step', step, 'losses', np.mean(running_losses, 0))
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            running_estimates.append(estimate)
            print('estimated per step avg %f' % estimate)
            print('avg last 3 estimated per step avg %f' %
                  np.mean(running_estimates[-3:]))
            if network_dir is not None:
                nu_network.save_weights(
                    os.path.join(network_dir, hparam_net, 'nu'))
                zeta_network.save_weights(
                    os.path.join(network_dir, hparam_net, 'zeta'))
                print('saved network weights to',
                      os.path.join(network_dir, hparam_net))
            running_losses = []

    if num_steps == 0:
        estimate = estimator.estimate_average_reward(dataset, target_policy)
        running_estimates.append(estimate)
        print('eval only per step avg %f' % np.mean(running_estimates[-3:]))

    if estimate_dir is not None:
        out_fname = os.path.join(estimate_dir, hparam_result + '.npy')
        print('Saving estimation results to', out_fname)
        with tf.io.gfile.GFile(out_fname, 'w') as f:
            np.save(f, running_estimates)

    print('Done!')
Code Example #13
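# Runs TabularCoinDice on the loaded dataset and records the estimated confidence
# intervals both as TensorBoard summaries and in a results file.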
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    num_steps = FLAGS.num_steps
    divergence_limit = FLAGS.divergence_limit
    algae_alpha = FLAGS.algae_alpha
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    train_hparam_str = ('limit{LIMIT}_'
                        'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                            LIMIT=limit_episodes,
                            GAMMA=gamma,
                            ALGAE_ALPHA=algae_alpha,
                            DIV=divergence_limit)

    if save_dir is not None:
        save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    estimator = TabularCoinDice(dataset_spec=dataset.spec,
                                gamma=gamma,
                                divergence_limit=divergence_limit,
                                algae_alpha=algae_alpha * np.array([1, 1]),
                                limit_episodes=limit_episodes)
    estimator.prepare_dataset(dataset, target_policy)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)
    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            loss = estimator.train_step(dataset, target_policy)
            running_losses.append(loss)
            global_step.assign_add(1)

            if step % 10 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = np.mean(running_losses, 0)[0]
                for idx, est in enumerate(estimate):
                    tf.summary.scalar('estimate%d' % idx, est)
                running_estimates.append(estimate)
                print('estimated confidence interval %s' % estimate)
                print('avg last 3 estimated confidence interval %s' %
                      np.mean(running_estimates[-3:], axis=0))
                running_losses = []

    if save_dir is not None:
        results_filename = os.path.join(save_dir, 'results.npy')
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, running_estimates)
    print('Done!')
Code Example #14
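# Trains TabularBayesDice against a target policy built with a separate alpha_target
# and periodically reports the mean/std of the sampled reward estimates.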
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target

    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_learning_rate = FLAGS.nu_learning_rate
    solve_for_state_action_ratio = FLAGS.solve_for_state_action_ratio
    eps_std = FLAGS.eps_std
    kl_regularizer = FLAGS.kl_regularizer

    target_policy = get_target_policy(load_dir,
                                      env_name,
                                      tabular_obs,
                                      alpha=alpha_target)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std,
                                                  KL=kl_regularizer)

    if save_dir is not None:
        # Save for a specific alpha target
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    estimator = TabularBayesDice(
        dataset_spec=dataset.spec,
        gamma=gamma,
        solve_for_state_action_ratio=solve_for_state_action_ratio,
        zeta_learning_rate=zeta_learning_rate,
        nu_learning_rate=nu_learning_rate,
        kl_regularizer=kl_regularizer,
        eps_std=eps_std,
    )
    estimator.prepare_dataset(dataset, target_policy)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)
    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            loss = estimator.train_step()[0]
            running_losses.append(loss)
            global_step.assign_add(1)

            if step % 500 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = estimator.estimate_average_reward(
                    dataset, target_policy)
                tf.debugging.check_numerics(estimate, 'NaN in estimate')
                running_estimates.append(estimate)
                tf.print('est', tf.math.reduce_mean(estimate),
                         tf.math.reduce_std(estimate))

                running_losses = []

    if save_dir is not None:
        with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'),
                               'w') as f:
            np.save(f, running_estimates)
        print('saved results to %s' % save_dir)

    print('Done!')
Code Example #15
File: run_tf_env.py Project: tianxusky/dice_rl
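# Exercises the TFOffpolicyDataset API: adds steps and episodes from an environment
# dataset, saves and reloads it, and compares per-step and per-episode averages.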
def main(argv):
    seed = FLAGS.seed
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    gamma = FLAGS.gamma
    save_dir = FLAGS.save_dir

    np.random.seed(seed)
    dataset = get_env_dataset(False, 0.1, max_trajectory_length)

    write_dataset = TFOffpolicyDataset(dataset.spec)

    first_step = dataset.get_step()
    write_dataset.add_step(first_step)
    episode, valid_ids = dataset.get_episode()
    add_episodes_to_dataset(episode, valid_ids, write_dataset)
    episode_start, valid_steps = dataset.get_episode(truncate_episode_at=1)
    add_episodes_to_dataset(episode_start, valid_steps, write_dataset)

    episodes, valid_steps = dataset.get_episode(batch_size=num_trajectory)
    add_episodes_to_dataset(episodes, valid_steps, write_dataset)
    mask = (tf.cast(valid_steps, tf.float32) *
            (1 - tf.cast(episodes.is_last(), tf.float32)))
    episode_rewards = episodes.reward * mask
    print('avg step reward',
          tf.reduce_sum(episode_rewards) / tf.reduce_sum(mask))
    print('avg ep reward', tf.reduce_mean(tf.reduce_sum(episode_rewards, -1)))

    print('num steps', dataset.num_steps)
    print('num total steps', dataset.num_total_steps)
    print('num episodes', dataset.num_episodes)
    print('num total episodes', dataset.num_total_episodes)

    print('num write steps', write_dataset.num_steps)
    print('num write total steps', write_dataset.num_total_steps)
    print('num write episodes', write_dataset.num_episodes)
    print('num write total episodes', write_dataset.num_total_episodes)

    write_dataset.save(save_dir)
    new_dataset = Dataset.load(save_dir)
    print('num loaded steps', new_dataset.num_steps)
    print('num loaded total steps', new_dataset.num_total_steps)
    print('num loaded episodes', new_dataset.num_episodes)
    print('num loaded total episodes', new_dataset.num_total_episodes)

    estimate = estimator_lib.get_minibatch_average(dataset,
                                                   max_trajectory_length,
                                                   num_trajectory,
                                                   gamma=gamma)
    print('per step avg', estimate)
    estimate = estimator_lib.get_minibatch_average(dataset,
                                                   num_trajectory,
                                                   by_steps=False,
                                                   gamma=gamma)
    print('per episode avg', estimate)
    estimate = estimator_lib.get_fullbatch_average(write_dataset, gamma=gamma)
    print('per step avg on offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(write_dataset,
                                                   by_steps=False,
                                                   gamma=gamma)
    print('per episode avg on offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(new_dataset, gamma=gamma)
    print('per step avg on saved and loaded offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(new_dataset,
                                                   by_steps=False,
                                                   gamma=gamma)
    print('per episode avg on saved and loaded offpolicy data', estimate)
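The script above relies on a helper add_episodes_to_dataset that is not shown in this excerpt. The following is a plausible sketch of such a helper, assuming valid_ids marks which steps of a (possibly padded) episode batch are real and that write_dataset.add_step accepts one unbatched step at a time, as in the single-step call earlier in the script.

def add_episodes_to_dataset(episodes, valid_ids, write_dataset):
    """Writes the valid steps of one or more episodes into write_dataset."""
    # Rank-1 valid_ids means a single (unbatched) episode was passed in.
    num_episodes = 1 if tf.rank(valid_ids) == 1 else tf.shape(valid_ids)[0]
    for ep_id in range(num_episodes):
        if tf.rank(valid_ids) == 1:
            this_valid_ids = valid_ids
            this_episode = episodes
        else:
            this_valid_ids = valid_ids[ep_id, ...]
            this_episode = tf.nest.map_structure(lambda t: t[ep_id, ...],
                                                 episodes)
        episode_length = tf.shape(this_valid_ids)[0]
        for step_id in range(episode_length):
            if this_valid_ids[step_id]:
                # Slice out a single step and append it to the write dataset.
                this_step = tf.nest.map_structure(lambda t: t[step_id, ...],
                                                  this_episode)
                write_dataset.add_step(this_step)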
Code example #16
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    num_expert_trajectory = FLAGS.num_expert_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    alpha_expert = FLAGS.alpha_expert
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    embed_dim = FLAGS.embed_dim
    fourier_dim = FLAGS.fourier_dim
    embed_learning_rate = FLAGS.embed_learning_rate
    learning_rate = FLAGS.learning_rate
    finetune = FLAGS.finetune
    latent_policy = FLAGS.latent_policy
    embed_learner = FLAGS.embed_learner
    num_steps = FLAGS.num_steps
    embed_pretraining_steps = FLAGS.embed_pretraining_steps
    batch_size = FLAGS.batch_size

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha_expert,
                      SEED=seed,
                      NUM_TRAJ=num_expert_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading expert dataset.')
    expert_dataset = Dataset.load(directory)
    print('num loaded expert steps', expert_dataset.num_steps)
    print('num loaded total expert steps', expert_dataset.num_total_steps)
    print('num loaded expert episodes', expert_dataset.num_episodes)
    print('num loaded total expert episodes',
          expert_dataset.num_total_episodes)
    expert_estimate = estimator_lib.get_fullbatch_average(expert_dataset,
                                                          gamma=gamma)
    print('expert data per step avg', expert_estimate)

    hparam_dict = {
        'env_name': env_name,
        'alpha_expert': alpha_expert,
        'seed': seed,
        'num_trajectory': num_trajectory,
        'num_expert_trajectory': num_expert_trajectory,
        'max_trajectory_length': max_trajectory_length,
        'embed_learner': embed_learner,
        'embed_dim': embed_dim,
        'fourier_dim': fourier_dim,
        'embed_learning_rate': embed_learning_rate,
        'learning_rate': learning_rate,
        'latent_policy': latent_policy,
        'finetune': finetune,
    }
    hparam_str = ','.join([
        '%s=%s' % (k, str(hparam_dict[k])) for k in sorted(hparam_dict.keys())
    ])
    summary_writer = tf.summary.create_file_writer(
        os.path.join(save_dir, hparam_str, 'train'))

    if embed_learner == 'sgd' or not embed_learner:
        algo = TabularBCSGD(dataset.spec,
                            gamma=gamma,
                            embed_dim=embed_dim,
                            embed_learning_rate=embed_learning_rate,
                            learning_rate=learning_rate,
                            finetune=finetune,
                            latent_policy=latent_policy)
    elif embed_learner == 'svd':
        algo = TabularBCSVD(dataset.spec,
                            gamma=gamma,
                            embed_dim=embed_dim,
                            learning_rate=learning_rate)
    elif embed_learner == 'energy':
        algo = TabularBCEnergy(dataset.spec,
                               gamma=gamma,
                               embed_dim=embed_dim,
                               fourier_dim=fourier_dim,
                               embed_learning_rate=embed_learning_rate,
                               learning_rate=learning_rate)
    else:
        raise ValueError('embed learner %s not supported' % embed_learner)

    if embed_learner == 'svd':
        embed_dict = algo.solve(dataset)
        with summary_writer.as_default():
            for k, v in embed_dict.items():
                tf.summary.scalar(f'embed/{k}', v, step=0)
                print('embed', k, v)
    else:
        algo.prepare_datasets(dataset, expert_dataset)
        if embed_learner is not None:
            for step in range(embed_pretraining_steps):
                batch = dataset.get_step(batch_size, num_steps=2)
                embed_dict = algo.train_embed(batch)
                if step % FLAGS.eval_interval == 0:
                    with summary_writer.as_default():
                        for k, v in embed_dict.items():
                            tf.summary.scalar(f'embed/{k}', v, step=step)
                            print('embed', step, k, v)

    for step in range(num_steps):
        batch = expert_dataset.get_step(batch_size, num_steps=2)
        info_dict = algo.train_step(batch)
        if step % FLAGS.eval_interval == 0:
            with summary_writer.as_default():
                for k, v in info_dict.items():
                    tf.summary.scalar(f'bc/{k}', v, step=step)
                    print('bc', k, v)

            policy_fn, policy_info_spec = algo.get_policy()
            onpolicy_data = get_onpolicy_dataset(env_name, tabular_obs,
                                                 policy_fn, policy_info_spec)
            onpolicy_episodes, _ = onpolicy_data.get_episode(
                100, truncate_episode_at=max_trajectory_length)
            with summary_writer.as_default():
                tf.print('eval/reward', np.mean(onpolicy_episodes.reward))
                tf.summary.scalar('eval/reward',
                                  np.mean(onpolicy_episodes.reward),
                                  step=step)
Code example #17
def get_minibatch_average(
        dataset: Dataset,
        batch_size: int,
        num_batches: int = 1,
        by_steps: bool = True,
        truncate_episode_at: Optional[int] = None,
        reward_fn: Optional[Callable] = None,
        weight_fn: Optional[Callable] = None,
        gamma: Union[float, tf.Tensor] = 1.0) -> Union[float, tf.Tensor]:
    """Computes average reward via randomly sampled mini-batches.

    Samples steps or episodes from the dataset and computes average reward.

    Args:
      dataset: The dataset to sample experience from.
      batch_size: The number of steps (when by_steps is True) or episodes to
        sample per batch.
      num_batches: The number of batches to use for estimation.
      by_steps: Whether to sample batches of steps (default) or episodes.
      truncate_episode_at: If sampling by episodes, where to truncate episodes
        from the environment, if at all.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward. When
        sampling by episode, valid_steps is also passed into reward_fn.
      weight_fn: A function that takes in an EnvStep and returns a weight for
        that step. If not specified, defaults to gamma ** step_num. When
        sampling by episode, valid_steps is also passed into weight_fn.
      gamma: The discount factor to use for the default reward/weight functions.

    Returns:
      An estimate of the average reward.
    """
    if reward_fn is None:
        if by_steps:
            reward_fn = _default_by_steps_reward_fn
        else:
            reward_fn = lambda *args: _default_by_episodes_reward_fn(
                *args, gamma=gamma)

    if weight_fn is None:
        if by_steps:
            weight_fn = lambda *args: _default_by_steps_weight_fn(*args,
                                                                  gamma=gamma)
        else:
            weight_fn = _default_by_episodes_weight_fn

    total_reward = 0.
    total_weight = 0.
    for _ in range(num_batches):
        if by_steps:
            if isinstance(dataset, OnpolicyDataset):
                steps = dataset.get_step(num_steps=batch_size)
            else:
                steps = dataset.get_step(batch_size)
            rewards = reward_fn(steps)
            weights = weight_fn(steps)
        else:
            episodes, valid_steps = dataset.get_episode(
                batch_size, truncate_episode_at=truncate_episode_at)
            rewards = reward_fn(episodes, valid_steps)
            weights = weight_fn(episodes, valid_steps)

        rewards = common_lib.reverse_broadcast(rewards, weights)
        weights = common_lib.reverse_broadcast(weights, rewards)
        total_reward += tf.reduce_sum(rewards * weights, axis=0)
        total_weight += tf.reduce_sum(weights, axis=0)

    return total_reward / total_weight
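For illustration, a minimal usage sketch of get_minibatch_average; the dataset directory, batch sizes, and discount below are hypothetical.

dataset = Dataset.load('/tmp/env_data')  # Hypothetical saved dataset.

# Per-step average reward, estimated from 10 random mini-batches of 64 steps.
per_step = get_minibatch_average(dataset, batch_size=64, num_batches=10,
                                 gamma=0.99)
print('minibatch per step avg', per_step)

# Per-episode average using the default episode reward/weight functions, from
# mini-batches of 16 episodes truncated at 100 steps.
per_episode = get_minibatch_average(dataset, batch_size=16, num_batches=10,
                                    by_steps=False, truncate_episode_at=100,
                                    gamma=0.99)
print('minibatch per episode avg', per_episode)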