Example #1
    def estimate_average_reward(self, dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy):
        """Estimates value (average per-step reward) of policy.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """
        def reward_fn(env_step, valid_steps, value_index=None):
            """Computes average initial Q-values of episodes."""
            # env_step is an episode, and we just want the first step.
            if tf.rank(valid_steps) == 1:
                first_step = tf.nest.map_structure(lambda t: t[0, ...],
                                                   env_step)
            else:
                first_step = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                   env_step)

            value = self._get_average_value(self._value_network, first_step,
                                            target_policy)
            if value_index is None:
                return value
            return value[..., value_index]

        def weight_fn(env_step, valid_steps):
            return tf.ones([tf.shape(valid_steps)[0]], dtype=tf.float32)

        if self._num_qvalues is None:
            return (1 - self._gamma) * estimator_lib.get_fullbatch_average(
                dataset,
                limit=None,
                by_steps=False,
                truncate_episode_at=1,
                reward_fn=reward_fn,
                weight_fn=weight_fn)
        else:
            estimates = []
            for i in range(self._num_qvalues):
                estimates.append(
                    (1 - self._gamma) * estimator_lib.get_fullbatch_average(
                        dataset,
                        limit=None,
                        by_steps=False,
                        truncate_episode_at=1,
                        reward_fn=lambda *args, i=i: reward_fn(
                            *args, value_index=i),
                        weight_fn=weight_fn))
            return np.array(estimates)
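
The reward_fn above slices out the first step of each episode with tf.nest.map_structure before averaging initial Q-values. A minimal standalone sketch of that slicing pattern, using a hypothetical nested episode structure in place of the library's EnvStep, looks like this:

import tensorflow as tf

# Hypothetical nested episode batch: 4 episodes of 10 steps each.
episode = {
    'observation': tf.random.uniform([4, 10, 8]),
    'action': tf.random.uniform([4, 10], maxval=3, dtype=tf.int32),
    'reward': tf.random.uniform([4, 10]),
}

# Same pattern as reward_fn above: keep only the first step of every episode.
first_step = tf.nest.map_structure(lambda t: t[:, 0, ...], episode)
print(first_step['observation'].shape)  # (4, 8)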
Example #2
    def estimate_average_reward(self, dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy):
        """Estimates value (average per-step reward) of policy.

    The estimation is based on solved values of zeta, so one should call
    solve() before calling this function.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """
        def weight_fn(env_step):
            index = self._get_index(env_step.observation, env_step.action)
            zeta = self._zeta[index]

            policy_ratio = 1.0
            if not self._solve_for_state_action_ratio:
                tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(
                    env_step)
                target_log_probabilities = target_policy.distribution(
                    tfagents_timestep).action.log_prob(env_step.action)
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())

            return tf.cast(zeta * policy_ratio, tf.float32)

        return estimator_lib.get_fullbatch_average(dataset,
                                                   limit=None,
                                                   by_steps=True,
                                                   reward_fn=self._reward_fn,
                                                   weight_fn=weight_fn)
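
The weight_fn above corrects for the mismatch between behavior and target policies with an importance ratio computed in log space. A small self-contained sketch of that ratio, with made-up log-probabilities standing in for the policy and dataset calls:

import tensorflow as tf

# Hypothetical log-probabilities of the logged actions under each policy.
target_log_probabilities = tf.constant([-0.1, -2.3, -0.7])
behavior_log_probabilities = tf.constant([-0.5, -0.5, -0.7])

# Same computation as weight_fn above: pi_target(a|s) / pi_behavior(a|s),
# evaluated as the exponential of a log difference.
policy_ratio = tf.exp(target_log_probabilities - behavior_log_probabilities)
print(policy_ratio.numpy())  # approx. [1.49, 0.17, 1.0]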
Example #3
    def estimate_average_reward(self, dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy):
        """Estimates value (average per-step reward) of policy.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """
        def weight_fn(env_step):
            zeta = self._get_value(self._zeta_network, env_step)

            policy_ratio = 1.0
            if not self._solve_for_state_action_ratio:
                tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(
                    env_step)
                target_log_probabilities = target_policy.distribution(
                    tfagents_timestep).action.log_prob(env_step.action)
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())

            return zeta * common_lib.reverse_broadcast(policy_ratio, zeta)

        return estimator_lib.get_fullbatch_average(dataset,
                                                   limit=None,
                                                   by_steps=True,
                                                   reward_fn=self._reward_fn,
                                                   weight_fn=weight_fn)
Example #4
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    algo = TabularSaddlePoint(dataset.spec, optimizer, gamma=gamma)

    losses = []
    for step in range(num_steps):
        init_batch, _ = dataset.get_episode(batch_size, truncate_episode_at=1)
        init_batch = tf.nest.map_structure(lambda t: t[:, 0, ...], init_batch)
        batch = dataset.get_step(batch_size, num_steps=2)
        loss, policy_loss = algo.train_step(init_batch, batch)
        losses.append(loss)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', np.mean(losses, 0))
            losses = []
            policy_fn, policy_info_spec = algo.get_policy()
            onpolicy_data = get_onpolicy_dataset(env_name, tabular_obs,
                                                 policy_fn, policy_info_spec)
            onpolicy_episodes, _ = onpolicy_data.get_episode(
                10, truncate_episode_at=40)
            print('estimated per step avg', np.mean(onpolicy_episodes.reward))

    print('Done!')
Example #5
    def estimate_average_reward(self, dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy):
        """Estimates value (average per-step reward) of policy.

    The estimation is based on solved values of zeta, so one should call
    solve() before calling this function.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """
        def weight_fn(env_step):
            if self._step_encoding is not None:
                zeta = 0.
                for step_num in range(self._max_trajectory_length):
                    index = self._get_index(env_step.observation,
                                            env_step.action, step_num)
                    zeta += self._gamma**step_num * self._zeta[index]
                zeta *= (1 - self._gamma) / (
                    1 - self._gamma**(self._num_steps - 1))
            else:
                index = self._get_index(env_step.observation, env_step.action,
                                        env_step.step_num)
                zeta = self._zeta[index]
                zeta = tf.where(
                    env_step.step_num >= self._max_trajectory_length,
                    tf.zeros_like(zeta), zeta)

            policy_ratio = 1.0
            if not self._solve_for_state_action_ratio:
                tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(
                    env_step)
                target_log_probabilities = target_policy.distribution(
                    tfagents_timestep).action.log_prob(env_step.action)
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())

            return tf.cast(zeta, dtype=tf.float32) * tf.cast(policy_ratio,
                                                             dtype=tf.float32)

        return estimator_lib.get_fullbatch_average(dataset,
                                                   limit=None,
                                                   by_steps=True,
                                                   reward_fn=self._reward_fn,
                                                   weight_fn=weight_fn)
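
When no step encoding is used, the weight_fn above zeroes out zeta for steps at or beyond the maximum trajectory length via tf.where. A toy version of that masking, with made-up zeta values and step indices:

import tensorflow as tf

# Made-up per-step zeta values and step indices for a batch of sampled steps.
zeta = tf.constant([0.7, 1.3, 0.9, 1.1])
step_num = tf.constant([2, 5, 39, 40])
max_trajectory_length = 40

# Same masking pattern as above: steps past the horizon get zero weight.
zeta = tf.where(step_num >= max_trajectory_length, tf.zeros_like(zeta), zeta)
print(zeta.numpy())  # [0.7 1.3 0.9 0. ]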
Example #6
    def estimate_average_reward(self,
                                dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy,
                                num_samples=100):
        """Estimates value (average per-step reward) of policy.

    The estimation is based on solved values of zeta, so one should call
    solve() before calling this function.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.
      num_samples: number of posterior samples.

    Returns:
      A tensor with num_samples samples of estimated average per-step reward
      of the target policy.
    """
        nu_sigma = tf.sqrt(tf.exp(self._nu_log_sigma))
        eps = tf.random.normal(
            tf.concat([[num_samples], tf.shape(nu_sigma)], axis=-1), 0,
            self._eps_std)
        nu = self._nu_mu + nu_sigma * eps
        self._zeta = (tf.einsum('bn,nm->bm', nu, self._td_residuals) /
                      tf.math.sqrt(1e-8 + self._total_weights))

        def weight_fn(env_step):
            index = self._get_index(env_step.observation, env_step.action)
            zeta = tf.gather(self._zeta,
                             tf.tile(index[None, :], [num_samples, 1]),
                             batch_dims=1)
            policy_ratio = 1.0
            if not self._solve_for_state_action_ratio:
                tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(
                    env_step)
                target_log_probabilities = target_policy.distribution(
                    tfagents_timestep).action.log_prob(env_step.action)
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())

            return tf.cast(zeta * policy_ratio, tf.float32)

        return estimator_lib.get_fullbatch_average(dataset,
                                                   limit=None,
                                                   by_steps=True,
                                                   reward_fn=self._reward_fn,
                                                   weight_fn=weight_fn)
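
The posterior-sampling weight_fn above looks up, for every sampled transition, a separate zeta value per posterior sample; tf.gather with batch_dims=1 performs one lookup per row of the tiled index matrix. A shape-only sketch with random stand-ins:

import tensorflow as tf

num_samples, num_transitions, table_size = 3, 5, 10
# Stand-ins: one zeta table per posterior sample, plus shared indices.
zeta_table = tf.random.normal([num_samples, table_size])
index = tf.random.uniform([num_transitions], maxval=table_size, dtype=tf.int32)

# Same gather pattern as above: each posterior sample reads the same
# state-action indices from its own row of the table.
zeta = tf.gather(zeta_table, tf.tile(index[None, :], [num_samples, 1]),
                 batch_dims=1)
print(zeta.shape)  # (3, 5)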
Example #7
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  save_dir = FLAGS.save_dir
  load_dir = FLAGS.load_dir
  force = FLAGS.force

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(save_dir, hparam_str)
  if tf.io.gfile.isdir(directory) and not force:
    raise ValueError('Directory %s already exists. Use --force to overwrite.' %
                     directory)

  np.random.seed(seed)
  tf.random.set_seed(seed)

  dataset = get_onpolicy_dataset(load_dir, env_name, tabular_obs,
                                 max_trajectory_length, alpha, seed)

  write_dataset = TFOffpolicyDataset(
      dataset.spec,
      capacity=num_trajectory * (max_trajectory_length + 1))

  batch_size = 20
  for batch_num in range(1 + (num_trajectory - 1) // batch_size):
    num_trajectory_after_batch = min(num_trajectory, batch_size * (batch_num + 1))
    num_trajectory_to_get = num_trajectory_after_batch - batch_num * batch_size
    episodes, valid_steps = dataset.get_episode(
        batch_size=num_trajectory_to_get)
    add_episodes_to_dataset(episodes, valid_steps, write_dataset)

    print('num episodes collected: %d' % write_dataset.num_total_episodes)
    print('num steps collected: %d' % write_dataset.num_steps)

    estimate = estimator_lib.get_fullbatch_average(write_dataset)
    print('per step avg on offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(write_dataset,
                                                   by_steps=False)
    print('per episode avg on offpolicy data', estimate)

  print('Saving dataset to %s.' % directory)
  if not tf.io.gfile.isdir(directory):
    tf.io.gfile.makedirs(directory)
  write_dataset.save(directory)

  print('Loading dataset.')
  new_dataset = Dataset.load(directory)
  print('num loaded steps', new_dataset.num_steps)
  print('num loaded total steps', new_dataset.num_total_steps)
  print('num loaded episodes', new_dataset.num_episodes)
  print('num loaded total episodes', new_dataset.num_total_episodes)

  estimate = estimator_lib.get_fullbatch_average(new_dataset)
  print('per step avg on saved and loaded offpolicy data', estimate)
  estimate = estimator_lib.get_fullbatch_average(new_dataset,
                                                 by_steps=False)
  print('per episode avg on saved and loaded offpolicy data', estimate)

  print('Done!')
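
The collection loop above walks through num_trajectory episodes in chunks of batch_size, with a possibly smaller final chunk. The batching arithmetic on its own (plain Python, no TF required):

num_trajectory, batch_size = 47, 20
for batch_num in range(1 + (num_trajectory - 1) // batch_size):
    num_after_batch = min(num_trajectory, batch_size * (batch_num + 1))
    num_to_get = num_after_batch - batch_num * batch_size
    print(batch_num, num_to_get)  # 0 20, 1 20, 2 7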
Example #8
def main(argv):
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target
    gamma = FLAGS.gamma
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form

    primal_regularizer = FLAGS.primal_regularizer
    dual_regularizer = FLAGS.dual_regularizer
    kl_regularizer = FLAGS.kl_regularizer
    zero_reward = FLAGS.zero_reward
    norm_regularizer = FLAGS.norm_regularizer
    zeta_pos = FLAGS.zeta_pos

    scale_reward = FLAGS.scale_reward
    shift_reward = FLAGS.shift_reward
    transform_reward = FLAGS.transform_reward

    kl_regularizer = FLAGS.kl_regularizer
    eps_std = FLAGS.eps_std

    def reward_fn(env_step):
        reward = env_step.reward * scale_reward + shift_reward
        if transform_reward is None:
            return reward
        if transform_reward == 'exp':
            reward = tf.math.exp(reward)
        elif transform_reward == 'cuberoot':
            reward = tf.sign(reward) * tf.math.pow(tf.abs(reward), 1.0 / 3.0)
        else:
            raise ValueError(
                'Reward {} not implemented.'.format(transform_reward))
        return reward

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    train_hparam_str = (
        'nlr{NLR}_zlr{ZLR}_zeror{ZEROR}_preg{PREG}_dreg{DREG}_kreg{KREG}_nreg{NREG}_'
        'pform{PFORM}_fexp{FEXP}_zpos{ZPOS}_'
        'scaler{SCALER}_shiftr{SHIFTR}_transr{TRANSR}').format(
            NLR=nu_learning_rate,
            ZLR=zeta_learning_rate,
            ZEROR=zero_reward,
            PREG=primal_regularizer,
            DREG=dual_regularizer,
            KREG=kl_regularizer,
            NREG=norm_regularizer,
            PFORM=primal_form,
            FEXP=f_exponent,
            ZPOS=zeta_pos,
            SCALER=scale_reward,
            SHIFTR=shift_reward,
            TRANSR=transform_reward,
        )

    # Note: this overrides the longer train_hparam_str built above.
    train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std,
                                                  KL=kl_regularizer)

    if save_dir is not None:
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
        summary_writer.set_as_default()
    else:
        tf.summary.create_noop_writer()

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    #dataset = Dataset.load(directory.replace('alpha{}'.format(alpha), 'alpha0.0'))

    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    input_spec = (dataset.spec.observation, dataset.spec.action)
    nu_network = ValueNetwork(input_spec,
                              output_dim=2,
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=kernel_initializer)
    output_activation_fn = tf.math.square if zeta_pos else tf.identity
    zeta_network = ValueNetwork(input_spec,
                                output_dim=2,
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                output_activation_fn=output_activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=kernel_initializer)

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)
    lam_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)

    estimator = NeuralBayesDice(dataset.spec,
                                nu_network,
                                zeta_network,
                                nu_optimizer,
                                zeta_optimizer,
                                lam_optimizer,
                                gamma,
                                zero_reward=zero_reward,
                                f_exponent=f_exponent,
                                primal_form=primal_form,
                                reward_fn=reward_fn,
                                primal_regularizer=primal_regularizer,
                                dual_regularizer=dual_regularizer,
                                kl_regularizer=kl_regularizer,
                                eps_std=FLAGS.eps_std,
                                norm_regularizer=norm_regularizer,
                                nu_regularizer=nu_regularizer,
                                zeta_regularizer=zeta_regularizer)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    target_policy = get_target_policy(load_dir, env_name, tabular_obs,
                                      alpha_target)
    running_losses = []
    all_dual = []
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            num_samples = 100
            dual_ests = []
            for i in range(num_samples):
                dual_est = estimator.estimate_average_reward(
                    dataset, target_policy, write_summary=(i == 0))
                dual_ests.append(dual_est)
            tf.summary.scalar('dual/mean', tf.math.reduce_mean(dual_ests))
            tf.summary.scalar('dual/std', tf.math.reduce_std(dual_ests))

            tf.print('dual/mean =', tf.math.reduce_mean(dual_ests),
                     'dual/std =', tf.math.reduce_std(dual_ests))

            all_dual.append(dual_ests)
            running_losses = []
        global_step.assign_add(1)

    if save_dir is not None:
        np.save(tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'), 'w'),
                all_dual)

    print('Done!')
Example #9
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    num_steps = FLAGS.num_steps
    divergence_limit = FLAGS.divergence_limit
    algae_alpha = FLAGS.algae_alpha
    alpha_learning_rate = FLAGS.alpha_learning_rate
    train_nu_zeta_per_steps = FLAGS.train_nu_zeta_per_steps
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    train_hparam_str = ('alr{A_LR}_tnzs{TNZS}_limit{LIMIT}_'
                        'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                            A_LR=alpha_learning_rate,
                            TNZS=train_nu_zeta_per_steps,
                            LIMIT=limit_episodes,
                            GAMMA=gamma,
                            ALGAE_ALPHA=algae_alpha,
                            DIV=divergence_limit)

    if save_dir is not None:
        save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    alpha_optimizer = tf.keras.optimizers.Adam(alpha_learning_rate,
                                               beta_1=0.0,
                                               beta_2=0.0)

    episodes, valid_steps = dataset.get_all_episodes(limit=limit_episodes)
    num_samples = tf.reduce_sum(
        tf.cast(
            tf.logical_and(valid_steps, episodes.discount > 0)[:, :-1],
            tf.float32))
    estimator = TabularRobustDice(
        dataset_spec=dataset.spec,
        alpha_optimizer=alpha_optimizer,
        gamma=gamma,
        divergence_limit=divergence_limit / num_samples,
        algae_alpha=algae_alpha * np.array([1, 1]),
        limit_episodes=limit_episodes)
    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)

    def one_step(transitions_batch, initial_steps_batch, target_policy):
        global_step.assign_add(1)
        #initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
        #                                            initial_steps_batch)
        #losses, _ = estimator.train_alpha(initial_steps_batch, transitions_batch,
        #                                  target_policy)
        #return losses

    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            if step % train_nu_zeta_per_steps == 0:
                # first solve for the primal nu_loss,
                print('Step: {}. Solve for an updated tabular nu/zeta.'.format(
                    step))
                loss = estimator.solve_nu_zeta(dataset, target_policy)
                running_losses.append(loss)
            one_step(None, None, None)

            if step % 500 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = np.mean(running_losses, 0)[0]
                for idx, est in enumerate(estimate):
                    tf.summary.scalar('estimate%d' % idx, est)
                running_estimates.append(estimate)
                print('estimated per step avg %s' % estimate)
                print('avg last 3 estimated per step avg %s' %
                      np.mean(running_estimates[-3:], axis=0))
                running_losses = []

    if save_dir is not None:
        results_filename = os.path.join(save_dir, 'results.npy')
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, running_estimates)
    print('Done!')
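
The num_samples computation in this example counts only transitions that are both valid and not past a terminal step (discount > 0), dropping the last column since it has no successor. A toy check of that expression:

import tensorflow as tf

# Toy batch: 2 episodes of 3 steps; the second episode has one padded step.
valid_steps = tf.constant([[True, True, True], [True, True, False]])
discount = tf.constant([[1., 1., 0.], [1., 0., 0.]])

# Same counting expression as above: valid, non-terminal steps, excluding the
# final column (which has no next step to transition to).
num_samples = tf.reduce_sum(
    tf.cast(tf.logical_and(valid_steps, discount > 0)[:, :-1], tf.float32))
print(num_samples.numpy())  # 3.0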
Example #10
    def estimate_average_reward(self, dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy):
        """Estimates value (average per-step reward) of policy.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """
        def weight_fn(env_step):
            zeta = self._get_value(self._zeta_network, env_step)
            policy_ratio = 1.0
            if not self._solve_for_state_action_ratio:
                tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(
                    env_step)
                target_log_probabilities = target_policy.distribution(
                    tfagents_timestep).action.log_prob(env_step.action)
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())
            return zeta * common_lib.reverse_broadcast(policy_ratio, zeta)

        def init_nu_fn(env_step, valid_steps):
            """Computes average initial nu values of episodes."""
            # env_step is an episode, and we just want the first step.
            if tf.rank(valid_steps) == 1:
                first_step = tf.nest.map_structure(lambda t: t[0, ...],
                                                   env_step)
            else:
                first_step = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                   env_step)
            value = self._get_average_value(self._nu_network, first_step,
                                            target_policy)
            return value

        nu_zero = (1 - self._gamma) * estimator_lib.get_fullbatch_average(
            dataset,
            limit=None,
            by_steps=False,
            truncate_episode_at=1,
            reward_fn=init_nu_fn)

        dual_step = estimator_lib.get_fullbatch_average(
            dataset,
            limit=None,
            by_steps=True,
            reward_fn=self._reward_fn,
            weight_fn=weight_fn)

        tf.summary.scalar('nu_zero', nu_zero)
        tf.summary.scalar('lam', self._norm_regularizer * self._lam)
        tf.summary.scalar('dual_step', dual_step)

        constraint, f_nu, f_zeta = self._eval_constraint_and_regs(
            dataset, target_policy)
        lagrangian = nu_zero + self._norm_regularizer * self._lam + constraint
        overall = (lagrangian + self._primal_regularizer * f_nu -
                   self._dual_regularizer * f_zeta)
        tf.summary.scalar('constraint', constraint)
        tf.summary.scalar('nu_reg', self._primal_regularizer * f_nu)
        tf.summary.scalar('zeta_reg', self._dual_regularizer * f_zeta)
        tf.summary.scalar('lagrangian', lagrangian)
        tf.summary.scalar('overall', overall)
        tf.print('step', tf.summary.experimental.get_step(), 'nu_zero =',
                 nu_zero, 'lam =', self._norm_regularizer * self._lam,
                 'dual_step =', dual_step, 'constraint =', constraint,
                 'preg =', self._primal_regularizer * f_nu, 'dreg =',
                 self._dual_regularizer * f_zeta, 'lagrangian =', lagrangian,
                 'overall =', overall)

        return dual_step
Example #11
def main(argv):
    env_name = FLAGS.env_name
    data_name = FLAGS.data_name
    seed = FLAGS.seed
    policy_load_dir = FLAGS.policy_load_dir
    data_load_dir = FLAGS.data_load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    nstep_returns = FLAGS.nstep_returns
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    target_policy, env = get_target_policy(policy_load_dir, env_name)

    directory = os.path.join(data_load_dir,
                             'yifan_%s_%s' % (env_name, data_name))
    print('Loading dataset.')
    onpolicy_dataset = TFAgentsOnpolicyDataset(env, target_policy, 1000)
    write_dataset = TFOffpolicyDataset(onpolicy_dataset.spec)
    batch_size = 20
    num_trajectory = 10
    for batch_num in range(1 + (num_trajectory - 1) // batch_size):
        print(batch_num)
        num_trajectory_after_batch = min(num_trajectory,
                                         batch_size * (batch_num + 1))
        num_trajectory_to_get = num_trajectory_after_batch - batch_num * batch_size
        episodes, valid_steps = onpolicy_dataset.get_episode(
            batch_size=num_trajectory_to_get)
        add_episodes_to_dataset(episodes, valid_steps, write_dataset)
    dataset = write_dataset
    """
  dataset = Dataset.load(directory)
  """
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)

    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)
    dataset = PerturbedDataset(
        dataset,
        num_perturbations=None,  #10,
        perturbation_scale=1.)

    value_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=(64, 64),
        output_dim=None)  #10)
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    estimator = NeuralQLearning(
        dataset.spec,
        value_network,
        optimizer,
        gamma,
        num_qvalues=None,  #10,
        num_samples=1)
    for step in range(num_steps):
        batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
        loss, _ = estimator.train_step(batch, target_policy)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', loss)
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            print('estimated per step avg', estimate)

    print('Done!')
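
The training loop above requests nstep_returns + 1 consecutive steps per sampled transition, presumably so an n-step backup can use n rewards plus one bootstrap state. The sketch below only illustrates that arithmetic with random stand-ins; it is not the estimator's actual loss:

import tensorflow as tf

gamma, nstep_returns, batch = 0.99, 3, 5
# Hypothetical batch of nstep_returns + 1 consecutive rewards per sample.
rewards = tf.random.uniform([batch, nstep_returns + 1])

# Discounted sum of the first n rewards; the (n + 1)-th step would supply the
# bootstrap state in an actual n-step update.
discounts = gamma ** tf.range(nstep_returns, dtype=tf.float32)
nstep_return = tf.reduce_sum(rewards[:, :nstep_returns] * discounts, axis=1)
print(nstep_return.shape)  # (5,)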
Example #12
    def estimate_average_reward(self,
                                dataset: dataset_lib.OffpolicyDataset,
                                target_policy: tf_policy.TFPolicy,
                                write_summary: bool = False):
        """Estimates value (average per-step reward) of policy.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.
      write_summary: Whether to write (and print) summary statistics of the
        estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """
        def weight_fn(env_step):
            zeta, _, _ = self._sample_value(self._zeta_network, env_step)
            policy_ratio = 1.0
            if not self._solve_for_state_action_ratio:
                tfagents_timestep = dataset_lib.convert_to_tfagents_timestep(
                    env_step)
                target_log_probabilities = target_policy.distribution(
                    tfagents_timestep).action.log_prob(env_step.action)
                policy_ratio = tf.exp(target_log_probabilities -
                                      env_step.get_log_probability())
            return zeta * common_lib.reverse_broadcast(policy_ratio, zeta)

        def init_nu_fn(env_step, valid_steps):
            """Computes average initial nu values of episodes."""
            # env_step is an episode, and we just want the first step.
            if tf.rank(valid_steps) == 1:
                first_step = tf.nest.map_structure(lambda t: t[0, ...],
                                                   env_step)
            else:
                first_step = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                   env_step)
            value, _, _ = self._sample_average_value(self._nu_network,
                                                     first_step, target_policy)
            return value

        dual_step = estimator_lib.get_fullbatch_average(
            dataset,
            limit=None,
            by_steps=True,
            reward_fn=self._reward_fn,
            weight_fn=weight_fn)

        nu_zero = (1 - self._gamma) * estimator_lib.get_fullbatch_average(
            dataset,
            limit=None,
            by_steps=False,
            truncate_episode_at=1,
            reward_fn=init_nu_fn)

        if not write_summary:
            return dual_step

        tf.summary.scalar('eval/dual_step', dual_step)
        tf.summary.scalar('eval/nu_zero', nu_zero)
        tf.print('step', tf.summary.experimental.get_step(), 'dual_step =',
                 dual_step, 'nu_zero =', nu_zero)

        return dual_step
Example #13
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  gamma = FLAGS.gamma
  assert 0 <= gamma < 1.
  nu_learning_rate = FLAGS.nu_learning_rate
  zeta_learning_rate = FLAGS.zeta_learning_rate
  nu_regularizer = FLAGS.nu_regularizer
  zeta_regularizer = FLAGS.zeta_regularizer
  weight_learning_rate = FLAGS.weight_learning_rate
  divergence_limit = FLAGS.divergence_limit
  algae_alpha = FLAGS.algae_alpha
  f_exponent = FLAGS.f_exponent
  primal_form = FLAGS.primal_form
  batch_size = FLAGS.batch_size
  num_steps = FLAGS.num_steps

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)
  directory = os.path.join(load_dir, hparam_str)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = ('nlr{NU_LR}_zlr{Z_LR}_batch{BATCH_SIZE}_'
                      'gam{GAMMA}_nreg{NU_REG}_zreg{Z_REG}_algae{ALGAE_ALPHA}_'
                      'prim{PRIMAL}_div{DIV}').format(
                          NU_LR=nu_learning_rate,
                          Z_LR=zeta_learning_rate,
                          BATCH_SIZE=batch_size,
                          GAMMA=gamma,
                          NU_REG=nu_regularizer,
                          Z_REG=zeta_regularizer,
                          ALGAE_ALPHA=algae_alpha,
                          PRIMAL=primal_form,
                          DIV=divergence_limit)
  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

  activation_fn = tf.nn.relu
  kernel_initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.5, seed=1)
  hidden_dims = (64,)
  n_intervals = 1
  nu_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                            fc_layer_params=hidden_dims,
                            activation_fn=activation_fn,
                            kernel_initializer=kernel_initializer,
                            last_kernel_initializer=None,
                            output_dim=2 * 2 * n_intervals)
  zeta_network = ValueNetwork((dataset.spec.observation, dataset.spec.action),
                              fc_layer_params=hidden_dims,
                              activation_fn=activation_fn,
                              kernel_initializer=kernel_initializer,
                              last_kernel_initializer=None,
                              output_dim=2 * 2 * n_intervals)
  weight_network = ValueNetwork((dataset.spec.observation,  # initial state
                                 dataset.spec.observation,  # cur state
                                 dataset.spec.action,       # cur action
                                 dataset.spec.observation), # next state
                                fc_layer_params=hidden_dims,
                                activation_fn=activation_fn,
                                kernel_initializer=kernel_initializer,
                                last_kernel_initializer=None,
                                output_dim=2 * n_intervals)

  nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate, beta_2=0.99)
  zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate, beta_2=0.99)
  weight_optimizer = tf.keras.optimizers.Adam(weight_learning_rate, beta_2=0.99)

  estimator = NeuralCoinDice(dataset.spec,
                             nu_network, zeta_network,
                             weight_network,
                             nu_optimizer, zeta_optimizer,
                             weight_optimizer,
                             gamma=gamma,
                             divergence_limit=divergence_limit,
                             f_exponent=f_exponent,
                             primal_form=primal_form,
                             nu_regularizer=nu_regularizer,
                             zeta_regularizer=zeta_regularizer,
                             algae_alpha=algae_alpha * np.array([1, 1]),
                             unbias_algae_alpha=False,
                             closed_form_weights=True,
                             num_samples=None)

  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  @tf.function
  def one_step(transitions_batch, initial_steps_batch):
    global_step.assign_add(1)
    with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
      initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                  initial_steps_batch)
      losses, _ = estimator.train_step(initial_steps_batch, transitions_batch,
                                       target_policy)
    return losses

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    for step in range(num_steps):

      transitions_batch = dataset.get_step(batch_size, num_steps=2)
      initial_steps_batch, _ = dataset.get_episode(
          batch_size, truncate_episode_at=1)
      losses = one_step(transitions_batch, initial_steps_batch)
      running_losses.append([t.numpy() for t in losses])

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'losses', np.mean(running_losses, 0))
        estimate = np.mean(running_losses, 0)[0]
        for idx, est in enumerate(estimate):
          tf.summary.scalar('estimate%d' % idx, est)
        running_estimates.append(estimate)
        print('estimated confidence interval %s' % estimate)
        print('avg last 3 estimated confidence interval %s' %
              np.mean(running_estimates[-3:], axis=0))
        running_losses = []

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimates)
  print('Done!')
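
The one_step function above wraps its training step in tf.summary.record_if so that summaries are only written every 25th global step. A minimal sketch of that pattern, with a hypothetical log directory and a dummy scalar:

import tensorflow as tf

# Hypothetical writer and step counter; only every 25th step records scalars.
writer = tf.summary.create_file_writer('/tmp/record_if_demo')
global_step = tf.Variable(0, dtype=tf.int64)
tf.summary.experimental.set_step(global_step)

with writer.as_default():
    for _ in range(100):
        global_step.assign_add(1)
        with tf.summary.record_if(tf.math.mod(global_step, 25) == 0):
            tf.summary.scalar('loss', tf.random.uniform([]))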
Example #14
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    learning_rate = FLAGS.learning_rate
    nstep_returns = FLAGS.nstep_returns
    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)
    dataset = PerturbedDataset(dataset,
                               num_perturbations=10,
                               perturbation_scale=1.)
    #estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    #print('perturbed data per step avg', estimate)

    value_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=(64, 64),
        output_dim=10)
    optimizer = tf.keras.optimizers.Adam(learning_rate)

    estimator = NeuralQLearning(dataset.spec,
                                value_network,
                                optimizer,
                                gamma,
                                num_qvalues=10)
    for step in range(num_steps):
        batch = dataset.get_step(batch_size, num_steps=nstep_returns + 1)
        loss, _ = estimator.train_step(batch, target_policy)
        if step % 100 == 0 or step == num_steps - 1:
            print('step', step, 'loss', loss)
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            print('estimated per step avg', estimate)

    print('Done!')
Example #15
  def estimate_average_reward(self, dataset: dataset_lib.OffpolicyDataset,
                              target_policy: tf_policy.TFPolicy):
    """Estimates value (average per-step reward) of policy.

    Args:
      dataset: The dataset to sample experience from.
      target_policy: The policy whose value we want to estimate.

    Returns:
      Estimated average per-step reward of the target policy.
    """

    def reward_fn(env_step, valid_steps, qvalues=self._point_qvalues):
      """Computes average initial Q-values of episodes."""
      # env_step is an episode, and we just want the first step.
      if tf.rank(valid_steps) == 1:
        first_step = tf.nest.map_structure(lambda t: t[0, ...], env_step)
      else:
        first_step = tf.nest.map_structure(lambda t: t[:, 0, ...], env_step)

      if self._solve_for_state_action_value:
        indices = self._get_index(first_step.observation[:, None],
                                  np.arange(self._num_actions)[None, :])
        initial_qvalues = tf.cast(tf.gather(qvalues, indices), tf.float32)

        tfagents_first_step = dataset_lib.convert_to_tfagents_timestep(
            first_step)
        initial_target_probs = target_policy.distribution(
            tfagents_first_step).action.probs_parameter()
        value = tf.reduce_sum(initial_qvalues * initial_target_probs, axis=-1)
      else:
        indices = self._get_index(first_step.observation, first_step.action)
        value = tf.cast(tf.gather(qvalues, indices), tf.float32)

      return value

    def weight_fn(env_step, valid_steps):
      return tf.ones([tf.shape(valid_steps)[0]], dtype=tf.float32)

    if self._num_qvalues is None:
      return (1 - self._gamma) * estimator_lib.get_fullbatch_average(
          dataset,
          limit=None,
          by_steps=False,
          truncate_episode_at=1,
          reward_fn=reward_fn,
          weight_fn=weight_fn)
    else:
      estimates = []
      for i in range(self._num_qvalues):
        estimates.append([])
        for j in range(self._num_perturbations):
          estimates[-1].append(
              (1 - self._gamma) * estimator_lib.get_fullbatch_average(
                  dataset,
                  limit=None,
                  by_steps=False,
                  truncate_episode_at=1,
                  reward_fn=lambda *args: reward_fn(
                      *args, qvalues=self._ensemble_qvalues[i, :, j]),
                  weight_fn=weight_fn))
      return np.array(estimates)
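
When solving for state-action values, reward_fn above builds one flat index per (initial state, action) pair by broadcasting the observation column against all actions. The _get_index helper belongs to the library and its exact scheme is not shown here; the sketch below mirrors only the broadcasting, using a made-up obs * num_actions + action indexing:

import numpy as np

num_actions = 4
observation = np.array([2, 5, 7])                        # [batch] of state ids
actions = np.arange(num_actions)[None, :]                # [1, num_actions]

# Hypothetical flat indexing; the real _get_index may differ.
indices = observation[:, None] * num_actions + actions   # [batch, num_actions]
print(indices.shape)  # (3, 4)
print(indices[0])     # [ 8  9 10 11]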
Example #16
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    num_trajectory_train = FLAGS.num_trajectory_train
    if num_trajectory_train is None:
        num_trajectory_train = num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    max_trajectory_length_train = FLAGS.max_trajectory_length_train
    if max_trajectory_length_train is None:
        max_trajectory_length_train = max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    nu_learning_rate = FLAGS.nu_learning_rate
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_regularizer = FLAGS.nu_regularizer
    zeta_regularizer = FLAGS.zeta_regularizer
    f_exponent = FLAGS.f_exponent
    primal_form = FLAGS.primal_form
    batch_size = FLAGS.batch_size
    num_steps = FLAGS.num_steps
    save_dir = FLAGS.save_dir
    network_dir = os.path.join(save_dir, 'networks') if save_dir else None
    estimate_dir = os.path.join(save_dir, 'estimates') if save_dir else None

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_base = '{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}'.format(
        ENV_NAME=env_name, TAB=tabular_obs, ALPHA=alpha, SEED=seed)

    hparam_data = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory if num_steps == 0 else num_trajectory_train,
        MAX_TRAJ=max_trajectory_length
        if num_steps == 0 else max_trajectory_length_train)
    hparam_net = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory_train, MAX_TRAJ=max_trajectory_length_train)
    hparam_result = hparam_base + '_numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}'.format(
        NUM_TRAJ=num_trajectory, MAX_TRAJ=max_trajectory_length)

    if estimate_dir is not None:
        if not tf.io.gfile.isdir(estimate_dir):
            tf.io.gfile.makedirs(estimate_dir)
        log_file = os.path.join(estimate_dir, hparam_result + '.log')
        print("Logging to '{0}'".format(log_file))
        sys.stdout = Logger(log_file)

    directory = os.path.join(load_dir, hparam_data)
    print('Loading dataset from', directory)
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    activation_fn = tf.nn.tanh
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, )
    step_encoding = None
    #step_encoding = 'one_hot'
    nu_network = StepValueNetwork(
        (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer,
        max_trajectory_length_train=max_trajectory_length_train,
        step_encoding=step_encoding)
    zeta_network = StepValueNetwork(
        (dataset.spec.observation, dataset.spec.action, dataset.spec.step_num),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer,
        max_trajectory_length_train=max_trajectory_length_train,
        step_encoding=step_encoding)
    nu_network.create_variables()
    zeta_network.create_variables()
    try:
        nu_network.load_weights(os.path.join(network_dir, hparam_net, 'nu'))
        zeta_network.load_weights(os.path.join(network_dir, hparam_net,
                                               'zeta'))
        print('loaded networks from', network_dir)
    except Exception:
        print('initialized network from scratch')

    nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
    zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)

    estimator = NeuralTeQDice(dataset.spec,
                              nu_network,
                              zeta_network,
                              nu_optimizer,
                              zeta_optimizer,
                              gamma,
                              f_exponent=f_exponent,
                              primal_form=primal_form,
                              nu_regularizer=nu_regularizer,
                              zeta_regularizer=zeta_regularizer)

    running_losses = []
    running_estimates = []
    for step in range(num_steps):
        transitions_batch = dataset.get_step(batch_size, num_steps=2)
        initial_steps_batch, _ = dataset.get_episode(batch_size,
                                                     truncate_episode_at=1)
        initial_steps_batch = tf.nest.map_structure(lambda t: t[:, 0, ...],
                                                    initial_steps_batch)
        losses = estimator.train_step(initial_steps_batch, transitions_batch,
                                      target_policy)
        running_losses.append(losses)
        if step % 500 == 0 or step == num_steps - 1:
            print('step', step, 'losses', np.mean(running_losses, 0))
            estimate = estimator.estimate_average_reward(
                dataset, target_policy)
            running_estimates.append(estimate)
            print('estimated per step avg %f' % estimate)
            print('avg last 3 estimated per step avg %f' %
                  np.mean(running_estimates[-3:]))
            if network_dir is not None:
                nu_network.save_weights(
                    os.path.join(network_dir, hparam_net, 'nu'))
                zeta_network.save_weights(
                    os.path.join(network_dir, hparam_net, 'zeta'))
                print('saved network weights to',
                      os.path.join(network_dir, hparam_net))
            running_losses = []

    if num_steps == 0:
        estimate = estimator.estimate_average_reward(dataset, target_policy)
        running_estimates.append(estimate)
        print('eval only per step avg %f' % np.mean(running_estimates[-3:]))

    if estimate_dir is not None:
        out_fname = os.path.join(estimate_dir, hparam_result + '.npy')
        print('Saving estimation results to', out_fname)
        with tf.io.gfile.GFile(out_fname, 'w') as f:
            np.save(f, running_estimates)

    print('Done!')
Example #17
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    num_steps = FLAGS.num_steps
    divergence_limit = FLAGS.divergence_limit
    algae_alpha = FLAGS.algae_alpha
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    train_hparam_str = ('limit{LIMIT}_'
                        'gam{GAMMA}_algae{ALGAE_ALPHA}_div{DIV}').format(
                            LIMIT=limit_episodes,
                            GAMMA=gamma,
                            ALGAE_ALPHA=algae_alpha,
                            DIV=divergence_limit)

    if save_dir is not None:
        save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    estimator = TabularCoinDice(dataset_spec=dataset.spec,
                                gamma=gamma,
                                divergence_limit=divergence_limit,
                                algae_alpha=algae_alpha * np.array([1, 1]),
                                limit_episodes=limit_episodes)
    estimator.prepare_dataset(dataset, target_policy)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)
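    # Each train_step returns a vector of losses; the loop below treats the
    # first component of the averaged losses as the confidence-interval
    # estimate and logs it every 10 steps.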
    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            loss = estimator.train_step(dataset, target_policy)
            running_losses.append(loss)
            global_step.assign_add(1)

            if step % 10 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = np.mean(running_losses, 0)[0]
                for idx, est in enumerate(estimate):
                    tf.summary.scalar('estimate%d' % idx, est)
                running_estimates.append(estimate)
                print('estimated confidence interval %s' % estimate)
                print('avg last 3 estimated confidence interval %s' %
                      np.mean(running_estimates[-3:], axis=0))
                running_losses = []

    if save_dir is not None:
        results_filename = os.path.join(save_dir, 'results.npy')
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, running_estimates)
    print('Done!')
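Like the other run scripts in this listing, the main() above assumes module-level absl flag definitions that are not shown. The following is a minimal sketch of what they might look like; the flag names mirror the FLAGS accesses in the code, while the defaults and help strings are illustrative assumptions only:

from absl import app
from absl import flags

FLAGS = flags.FLAGS

# Flag names taken from the FLAGS accesses above; defaults are placeholders.
flags.DEFINE_string('env_name', 'grid', 'Environment name.')
flags.DEFINE_integer('seed', 0, 'Random seed.')
flags.DEFINE_bool('tabular_obs', True, 'Whether to use tabular observations.')
flags.DEFINE_integer('num_trajectory', 100, 'Number of trajectories in the dataset.')
flags.DEFINE_integer('max_trajectory_length', 40, 'Maximum trajectory length.')
flags.DEFINE_float('alpha', 0.0, 'Behavior policy mixture parameter.')
flags.DEFINE_string('load_dir', '/tmp/dice_rl', 'Directory to load the dataset from.')
flags.DEFINE_string('save_dir', None, 'Directory to write summaries and results to.')
flags.DEFINE_float('gamma', 0.99, 'Discount factor.')
flags.DEFINE_integer('num_steps', 100000, 'Number of training steps.')
flags.DEFINE_float('divergence_limit', 0.1, 'Divergence limit.')
flags.DEFINE_float('algae_alpha', 0.01, 'AlgaeDICE regularizer coefficient.')
flags.DEFINE_integer('limit_episodes', None, 'Optional cap on the number of episodes used.')

if __name__ == '__main__':
  app.run(main)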
Example #18
def main(argv):
    seed = FLAGS.seed
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    gamma = FLAGS.gamma
    save_dir = FLAGS.save_dir

    np.random.seed(seed)
    dataset = get_env_dataset(False, 0.1, max_trajectory_length)
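    # Build a writable TFOffpolicyDataset with the same spec and copy steps
    # and episodes into it, exercising add_step and add_episodes_to_dataset
    # as well as the save/load round trip below.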

    write_dataset = TFOffpolicyDataset(dataset.spec)

    first_step = dataset.get_step()
    write_dataset.add_step(first_step)
    episode, valid_ids = dataset.get_episode()
    add_episodes_to_dataset(episode, valid_ids, write_dataset)
    episode_start, valid_steps = dataset.get_episode(truncate_episode_at=1)
    add_episodes_to_dataset(episode_start, valid_steps, write_dataset)

    episodes, valid_steps = dataset.get_episode(batch_size=num_trajectory)
    add_episodes_to_dataset(episodes, valid_steps, write_dataset)
    mask = (tf.cast(valid_steps, tf.float32) *
            (1 - tf.cast(episodes.is_last(), tf.float32)))
    episode_rewards = episodes.reward * mask
    print('avg step reward',
          tf.reduce_sum(episode_rewards) / tf.reduce_sum(mask))
    print('avg ep reward', tf.reduce_mean(tf.reduce_sum(episode_rewards, -1)))

    print('num steps', dataset.num_steps)
    print('num total steps', dataset.num_total_steps)
    print('num episodes', dataset.num_episodes)
    print('num total episodes', dataset.num_total_episodes)

    print('num write steps', write_dataset.num_steps)
    print('num write total steps', write_dataset.num_total_steps)
    print('num write episodes', write_dataset.num_episodes)
    print('num write total episodes', write_dataset.num_total_episodes)

    write_dataset.save(save_dir)
    new_dataset = Dataset.load(save_dir)
    print('num loaded steps', new_dataset.num_steps)
    print('num loaded total steps', new_dataset.num_total_steps)
    print('num loaded episodes', new_dataset.num_episodes)
    print('num loaded total episodes', new_dataset.num_total_episodes)
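    # Compare per-step and per-episode averages on the original dataset, the
    # copied write_dataset, and the reloaded dataset.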

    estimate = estimator_lib.get_minibatch_average(dataset,
                                                   max_trajectory_length,
                                                   num_trajectory,
                                                   gamma=gamma)
    print('per step avg', estimate)
    estimate = estimator_lib.get_minibatch_average(dataset,
                                                   num_trajectory,
                                                   by_steps=False,
                                                   gamma=gamma)
    print('per episode avg', estimate)
    estimate = estimator_lib.get_fullbatch_average(write_dataset, gamma=gamma)
    print('per step avg on offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(write_dataset,
                                                   by_steps=False,
                                                   gamma=gamma)
    print('per episode avg on offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(new_dataset, gamma=gamma)
    print('per step avg on saved and loaded offpolicy data', estimate)
    estimate = estimator_lib.get_fullbatch_average(new_dataset,
                                                   by_steps=False,
                                                   gamma=gamma)
    print('per episode avg on saved and loaded offpolicy data', estimate)
Example #19
def main(argv):
  env_name = FLAGS.env_name
  seed = FLAGS.seed
  tabular_obs = FLAGS.tabular_obs
  num_trajectory = FLAGS.num_trajectory
  max_trajectory_length = FLAGS.max_trajectory_length
  alpha = FLAGS.alpha
  load_dir = FLAGS.load_dir
  save_dir = FLAGS.save_dir
  policy_learning_rate = FLAGS.policy_learning_rate
  q_learning_rate = FLAGS.q_learning_rate
  batch_size = FLAGS.batch_size
  mode = FLAGS.mode
  ci_method = FLAGS.ci_method
  delta = FLAGS.delta
  delta_tail = 1 - delta
  gamma = FLAGS.gamma
  num_steps = FLAGS.num_steps
  use_trained_policy = FLAGS.use_trained_policy
  use_doubly_robust = FLAGS.use_doubly_robust
  assert 0 <= gamma < 1.

  target_policy = get_target_policy(load_dir, env_name, tabular_obs)

  hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                    ENV_NAME=env_name,
                    TAB=tabular_obs,
                    ALPHA=alpha,
                    SEED=seed,
                    NUM_TRAJ=num_trajectory,
                    MAX_TRAJ=max_trajectory_length)

  if FLAGS.num_trajectory_data is not None:
    hparam_str_data = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                       'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                           ENV_NAME=env_name,
                           TAB=tabular_obs,
                           ALPHA=alpha,
                           SEED=seed,
                           NUM_TRAJ=FLAGS.num_trajectory_data,
                           MAX_TRAJ=max_trajectory_length)
  else:
    hparam_str_data = hparam_str

  directory = os.path.join(load_dir, hparam_str_data)
  print('Loading dataset.')
  dataset = Dataset.load(directory)
  all_steps = dataset.get_all_steps()
  max_reward = tf.reduce_max(all_steps.reward)
  min_reward = tf.reduce_min(all_steps.reward)
  print('num loaded steps', dataset.num_steps)
  print('num loaded total steps', dataset.num_total_steps)
  print('num loaded episodes', dataset.num_episodes)
  print('num loaded total episodes', dataset.num_total_episodes)
  print('min reward', min_reward, 'max reward', max_reward)

  estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
  print('data per step avg', estimate)

  train_hparam_str = (
      'plr{P_LR}_tp{TRAINED_P}_batch{BATCH_SIZE}_mode{MODE}_CI{CI_METHOD}_UTP{USE_TRAINED_POLICY}_gam{GAMMA}_del{DELTA}'
  ).format(
      P_LR=policy_learning_rate,
      TRAINED_P=use_trained_policy,
      BATCH_SIZE=batch_size,
      MODE=mode,
      CI_METHOD=ci_method,
      USE_TRAINED_POLICY=use_trained_policy,
      GAMMA=gamma,
      DELTA=delta)

  if save_dir is not None:
    save_dir = os.path.join(save_dir, hparam_str, train_hparam_str)
    summary_writer = tf.summary.create_file_writer(logdir=save_dir)
  else:
    summary_writer = tf.summary.create_noop_writer()

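  # Rewards are shifted to be non-negative before being fed to the estimator,
  # and the shift is undone when reporting estimates and confidence intervals.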
  def non_negative_reward_translation(env_step):
    return env_step.reward - min_reward

  def inv_non_negative_estimate_translation(estimate):
    return estimate + min_reward

  if use_trained_policy:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    policy_optimizer = tf.keras.optimizers.Adam(
        policy_learning_rate, beta_1=0.0, beta_2=0.0)
    policy_network = PolicyNetwork(
        dataset.spec.observation,
        dataset.spec.action,
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    policy_optimizer = None
    policy_network = None

  if use_doubly_robust:
    activation_fn = tf.nn.relu
    kernel_initializer = tf.keras.initializers.GlorotUniform()
    hidden_dims = (64, 64)
    q_optimizer = tf.keras.optimizers.Adam(q_learning_rate)
    q_network = ValueNetwork(
        (dataset.spec.observation, dataset.spec.action),
        fc_layer_params=hidden_dims,
        activation_fn=activation_fn,
        kernel_initializer=kernel_initializer,
        last_kernel_initializer=kernel_initializer)
  else:
    q_optimizer = None
    q_network = None

  estimator = ImportanceSamplingCI(
      dataset_spec=dataset.spec,
      policy_optimizer=policy_optimizer,
      policy_network=policy_network,
      mode=mode,
      ci_method=ci_method,
      delta_tail=delta_tail,
      gamma=gamma,
      reward_fn=non_negative_reward_translation,
      q_network=q_network,
      q_optimizer=q_optimizer)
  global_step = tf.Variable(0, dtype=tf.int64)
  tf.summary.experimental.set_step(global_step)

  # Following is for policy learning + IS confidence interval
  @tf.function
  def one_step(data_batch):
    global_step.assign_add(1)
    loss = estimator.train_step(data_batch, target_policy)
    return loss

  with summary_writer.as_default():
    running_losses = []
    running_estimates = []
    running_estimate_cis = []
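    # Train on sampled two-step transition batches; every 500 steps report the
    # point estimate and the confidence interval, undoing the reward shift.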
    for step in range(num_steps):
      data_batch = dataset.get_step(batch_size, num_steps=2)
      loss = one_step(data_batch)
      running_losses.append(loss)

      if step % 500 == 0 or step == num_steps - 1:
        print('step', step, 'loss', np.mean(running_losses, 0))
        running_losses = []
        estimate = estimator.estimate_average_reward(
            dataset, target_policy, episode_limit=num_trajectory)
        estimate = inv_non_negative_estimate_translation(estimate)
        running_estimates.append(estimate)
        print('estimated per step avg %s' % estimate)
        print('avg last 3 estimated per step avg %s' %
              np.mean(running_estimates[-3:], axis=0))

        estimate_ci = estimator.estimate_reward_ci(dataset, target_policy)
        estimate_ci = np.array(
            [inv_non_negative_estimate_translation(ele) for ele in estimate_ci])
        running_estimate_cis.append(estimate_ci)
        print('estimated CI per step avg %s' % estimate_ci)
        print('avg last 3 estimated CI per step avg %s' %
              np.mean(running_estimate_cis[-3:], axis=0))

  if save_dir is not None:
    results_filename = os.path.join(save_dir, 'results.npy')
    with tf.io.gfile.GFile(results_filename, 'w') as f:
      np.save(f, running_estimate_cis)
  print('Done!')
Example #20
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    alpha = FLAGS.alpha
    alpha_target = FLAGS.alpha_target

    num_steps = FLAGS.num_steps
    batch_size = FLAGS.batch_size
    zeta_learning_rate = FLAGS.zeta_learning_rate
    nu_learning_rate = FLAGS.nu_learning_rate
    solve_for_state_action_ratio = FLAGS.solve_for_state_action_ratio
    eps_std = FLAGS.eps_std
    kl_regularizer = FLAGS.kl_regularizer
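    # Evaluate a target policy (mixture parameter alpha_target) with
    # TabularBayesDice on a dataset collected under behavior mixture alpha.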

    target_policy = get_target_policy(load_dir,
                                      env_name,
                                      tabular_obs,
                                      alpha=alpha_target)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)

    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('behavior per-step',
          estimator_lib.get_fullbatch_average(dataset, gamma=gamma))

    train_hparam_str = ('eps{EPS}_kl{KL}').format(EPS=eps_std,
                                                  KL=kl_regularizer)

    if save_dir is not None:
        # Save for a specific alpha target
        target_hparam_str = hparam_str.replace(
            'alpha{}'.format(alpha),
            'alpha{}_alphat{}'.format(alpha, alpha_target))
        save_dir = os.path.join(save_dir, target_hparam_str, train_hparam_str)
        summary_writer = tf.summary.create_file_writer(logdir=save_dir)
    else:
        summary_writer = tf.summary.create_noop_writer()

    estimator = TabularBayesDice(
        dataset_spec=dataset.spec,
        gamma=gamma,
        solve_for_state_action_ratio=solve_for_state_action_ratio,
        zeta_learning_rate=zeta_learning_rate,
        nu_learning_rate=nu_learning_rate,
        kl_regularizer=kl_regularizer,
        eps_std=eps_std,
    )
    estimator.prepare_dataset(dataset, target_policy)

    global_step = tf.Variable(0, dtype=tf.int64)
    tf.summary.experimental.set_step(global_step)
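    # Every 500 steps, report the averaged losses and the mean and standard
    # deviation of the (vector-valued) reward estimate, checking for NaNs.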
    with summary_writer.as_default():
        running_losses = []
        running_estimates = []
        for step in range(num_steps):
            loss = estimator.train_step()[0]
            running_losses.append(loss)
            global_step.assign_add(1)

            if step % 500 == 0 or step == num_steps - 1:
                print('step', step, 'losses', np.mean(running_losses, 0))
                estimate = estimator.estimate_average_reward(
                    dataset, target_policy)
                tf.debugging.check_numerics(estimate, 'NaN in estimate')
                running_estimates.append(estimate)
                tf.print('est', tf.math.reduce_mean(estimate),
                         tf.math.reduce_std(estimate))

                running_losses = []

    if save_dir is not None:
        with tf.io.gfile.GFile(os.path.join(save_dir, 'results.npy'),
                               'w') as f:
            np.save(f, running_estimates)
        print('saved results to %s' % save_dir)

    print('Done!')
Example #21
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    num_expert_trajectory = FLAGS.num_expert_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    alpha_expert = FLAGS.alpha_expert
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    embed_dim = FLAGS.embed_dim
    fourier_dim = FLAGS.fourier_dim
    embed_learning_rate = FLAGS.embed_learning_rate
    learning_rate = FLAGS.learning_rate
    finetune = FLAGS.finetune
    latent_policy = FLAGS.latent_policy
    embed_learner = FLAGS.embed_learner
    num_steps = FLAGS.num_steps
    embed_pretraining_steps = FLAGS.embed_pretraining_steps
    batch_size = FLAGS.batch_size

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha_expert,
                      SEED=seed,
                      NUM_TRAJ=num_expert_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading expert dataset.')
    expert_dataset = Dataset.load(directory)
    print('num loaded expert steps', expert_dataset.num_steps)
    print('num loaded total expert steps', expert_dataset.num_total_steps)
    print('num loaded expert episodes', expert_dataset.num_episodes)
    print('num loaded total expert episodes',
          expert_dataset.num_total_episodes)
    expert_estimate = estimator_lib.get_fullbatch_average(expert_dataset,
                                                          gamma=gamma)
    print('expert data per step avg', expert_estimate)

    hparam_dict = {
        'env_name': env_name,
        'alpha_expert': alpha_expert,
        'seed': seed,
        'num_trajectory': num_trajectory,
        'num_expert_trajectory': num_expert_trajectory,
        'max_trajectory_length': max_trajectory_length,
        'embed_learner': embed_learner,
        'embed_dim': embed_dim,
        'fourier_dim': fourier_dim,
        'embed_learning_rate': embed_learning_rate,
        'learning_rate': learning_rate,
        'latent_policy': latent_policy,
        'finetune': finetune,
    }
    hparam_str = ','.join([
        '%s=%s' % (k, str(hparam_dict[k])) for k in sorted(hparam_dict.keys())
    ])
    summary_writer = tf.summary.create_file_writer(
        os.path.join(save_dir, hparam_str, 'train'))
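    # Choose the behavioral-cloning variant according to embed_learner:
    # 'sgd', 'svd' (closed-form solve), or 'energy'; anything else is an error.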

    if embed_learner == 'sgd' or not embed_learner:
        algo = TabularBCSGD(dataset.spec,
                            gamma=gamma,
                            embed_dim=embed_dim,
                            embed_learning_rate=embed_learning_rate,
                            learning_rate=learning_rate,
                            finetune=finetune,
                            latent_policy=latent_policy)
    elif embed_learner == 'svd':
        algo = TabularBCSVD(dataset.spec,
                            gamma=gamma,
                            embed_dim=embed_dim,
                            learning_rate=learning_rate)
    elif embed_learner == 'energy':
        algo = TabularBCEnergy(dataset.spec,
                               gamma=gamma,
                               embed_dim=embed_dim,
                               fourier_dim=fourier_dim,
                               embed_learning_rate=embed_learning_rate,
                               learning_rate=learning_rate)
    else:
        raise ValueError('embed learner %s not supported' % embed_learner)

    if embed_learner == 'svd':
        embed_dict = algo.solve(dataset)
        with summary_writer.as_default():
            for k, v in embed_dict.items():
                tf.summary.scalar(f'embed/{k}', v, step=0)
                print('embed', k, v)
    else:
        algo.prepare_datasets(dataset, expert_dataset)
        if embed_learner is not None:
            for step in range(embed_pretraining_steps):
                batch = dataset.get_step(batch_size, num_steps=2)
                embed_dict = algo.train_embed(batch)
                if step % FLAGS.eval_interval == 0:
                    with summary_writer.as_default():
                        for k, v in embed_dict.items():
                            tf.summary.scalar(f'embed/{k}', v, step=step)
                            print('embed', step, k, v)

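    # Behavioral cloning on the expert dataset; at each eval interval, log the
    # BC metrics and evaluate the current policy with 100 on-policy episodes.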
    for step in range(num_steps):
        batch = expert_dataset.get_step(batch_size, num_steps=2)
        info_dict = algo.train_step(batch)
        if step % FLAGS.eval_interval == 0:
            with summary_writer.as_default():
                for k, v in info_dict.items():
                    tf.summary.scalar(f'bc/{k}', v, step=step)
                    print('bc', k, v)

            policy_fn, policy_info_spec = algo.get_policy()
            onpolicy_data = get_onpolicy_dataset(env_name, tabular_obs,
                                                 policy_fn, policy_info_spec)
            onpolicy_episodes, _ = onpolicy_data.get_episode(
                100, truncate_episode_at=max_trajectory_length)
            with summary_writer.as_default():
                tf.print('eval/reward', np.mean(onpolicy_episodes.reward))
                tf.summary.scalar('eval/reward',
                                  np.mean(onpolicy_episodes.reward),
                                  step=step)
Example #22
def main(argv):
    env_name = FLAGS.env_name
    seed = FLAGS.seed
    tabular_obs = FLAGS.tabular_obs
    num_trajectory = FLAGS.num_trajectory
    max_trajectory_length = FLAGS.max_trajectory_length
    alpha = FLAGS.alpha
    load_dir = FLAGS.load_dir
    save_dir = FLAGS.save_dir
    gamma = FLAGS.gamma
    assert 0 <= gamma < 1.
    limit_episodes = FLAGS.limit_episodes

    target_policy = get_target_policy(load_dir, env_name, tabular_obs)

    hparam_str = ('{ENV_NAME}_tabular{TAB}_alpha{ALPHA}_seed{SEED}_'
                  'numtraj{NUM_TRAJ}_maxtraj{MAX_TRAJ}').format(
                      ENV_NAME=env_name,
                      TAB=tabular_obs,
                      ALPHA=alpha,
                      SEED=seed,
                      NUM_TRAJ=num_trajectory,
                      MAX_TRAJ=max_trajectory_length)
    directory = os.path.join(load_dir, hparam_str)
    print('Loading dataset.')
    dataset = Dataset.load(directory)
    all_steps = dataset.get_all_steps()
    max_reward = tf.reduce_max(all_steps.reward)
    min_reward = tf.reduce_min(all_steps.reward)
    print('num loaded steps', dataset.num_steps)
    print('num loaded total steps', dataset.num_total_steps)
    print('num loaded episodes', dataset.num_episodes)
    print('num loaded total episodes', dataset.num_total_episodes)
    print('min reward', min_reward, 'max reward', max_reward)

    train_hparam_str = ('gamma{GAM}_limit{LIMIT}').format(GAM=gamma,
                                                          LIMIT=limit_episodes)

    estimate = estimator_lib.get_fullbatch_average(dataset, gamma=gamma)
    print('data per step avg', estimate)

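    # Solve tabular Q-learning with an ensemble of 200 Q-value estimates over
    # several reward perturbation scales, then save the resulting estimates.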
    estimator = TabularQLearning(
        dataset.spec,
        gamma,
        num_qvalues=200,
        perturbation_scale=[0.0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.4, 1.],
        default_reward_value=0.0,
        limit_episodes=limit_episodes)
    estimate = estimator.solve(dataset, target_policy)
    print('estimated per step avg', estimate)

    if save_dir is not None:
        results_dir = os.path.join(save_dir, hparam_str)
        if not tf.io.gfile.exists(results_dir):
            tf.io.gfile.makedirs(results_dir)
        results_filename = os.path.join(results_dir,
                                        'results_%s.npy' % train_hparam_str)
        with tf.io.gfile.GFile(results_filename, 'w') as f:
            np.save(f, estimate)

    print('Done!')