Example 1
    def __init__(self,
                 sess,
                 env,
                 reward_model,
                 dynamics_model,
                 traj_len=100,
                 rollout_len=None,
                 query_loss_opt='pref_uncertainty',
                 imitation_kwargs=None,
                 use_rand_policy=False,
                 query_type='sketch',
                 guided_search=False):

        # avoid mutating a default dict shared across instances
        if imitation_kwargs is None:
            imitation_kwargs = {}

        if query_type not in ['pref', 'sketch', 'demo']:
            raise ValueError('unrecognized query_type: %s' % query_type)

        if traj_len > env.max_ep_len + 1:
            traj_len = env.max_ep_len + 1

        if rollout_len is None:
            rollout_len = env.max_ep_len

        if traj_len > rollout_len + 1:
            raise ValueError('traj_len cannot exceed rollout_len + 1')

        if use_rand_policy and guided_search:
            raise ValueError(
                'use_rand_policy and guided_search are mutually exclusive')

        if guided_search and query_loss_opt == 'pref_uncertainty':
            raise ValueError(
                'guided_search does not support pref_uncertainty')

        # look up the query loss method by name instead of using eval
        self.query_loss = getattr(self, '%s_query_loss' % query_loss_opt)

        self.sess = sess
        self.query_type = query_type
        self.env = env
        self.reward_model = reward_model
        self.dynamics_model = dynamics_model
        self.traj_len = traj_len
        self.rollout_len = rollout_len
        self.use_rand_policy = use_rand_policy
        self.guided_search = guided_search
        self.imitation_kwargs = imitation_kwargs
        self.query_loss_opt = query_loss_opt

        if self.use_rand_policy:
            self.imitator = utils.make_random_policy(self.env)
        else:
            query_loss_opt_for_pol = (query_loss_opt
                                      if self.guided_search else 'max_rew')
            if 'plan_horizon' not in self.imitation_kwargs:
                self.imitation_kwargs['plan_horizon'] = self.rollout_len
            self.imitator = make_imitation_policy(
                self.sess,
                self.env,
                self.reward_model,
                self.dynamics_model,
                query_loss_opt=query_loss_opt_for_pol,
                **self.imitation_kwargs)
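The constructor above guards its option combinations up front, takes an optional kwargs dict, and dispatches to a loss method chosen by name. A minimal standalone sketch of the two reusable patterns here, using hypothetical names (Optimizer, foo_query_loss) that are not part of this codebase:

class Optimizer(object):

    def __init__(self, query_loss_opt='foo', imitation_kwargs=None):
        # a None default avoids mutating a dict that would otherwise be
        # shared across every instance
        if imitation_kwargs is None:
            imitation_kwargs = {}
        self.imitation_kwargs = imitation_kwargs
        # look up the loss method by name; a typo fails with AttributeError
        self.query_loss = getattr(self, '%s_query_loss' % query_loss_opt)

    def foo_query_loss(self, trajs):
        return 0.0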
Example 2
def main():
  unused_sess = utils.make_tf_session(gpu_mode=False)

  env = envs.make_clfbandit_env(verbose=True)
  env.expert_policy = env.make_expert_policy()
  env.random_policy = utils.make_random_policy(env)
  trans_env = envs.make_clfbandit_trans_env(env)

  utils.run_ep(env.expert_policy, env)
  utils.run_ep(env.random_policy, env)
  utils.run_ep(trans_env.expert_policy, trans_env)
  utils.run_ep(trans_env.random_policy, trans_env)

  logging.info('OK')
Example 3
import os
import pickle
import warnings

from matplotlib import pyplot as plt
import matplotlib.animation
import matplotlib as mpl

warnings.filterwarnings('ignore')

sess = utils.make_tf_session(gpu_mode=False)

def plot_traj(traj, *args, **kwargs):
    return utils.plot_trajs([traj], *args, **kwargs)

env = envs.make_carracing_env(sess, load_reward=True)
random_policy = utils.make_random_policy(env)

trans_env = None

encoder = load_wm_pretrained_vae(sess, env)
dynamics_model = load_wm_pretrained_rnn(encoder, sess, env)

with open(os.path.join(utils.carracing_data_dir, 'rnn_enc_demo_rollouts.pkl'),
          'rb') as f:
    demo_rollouts = pickle.load(f)

with open(os.path.join(utils.carracing_data_dir, 'rnn_enc_aug_rollouts.pkl'),
          'rb') as f:
    aug_rollouts = pickle.load(f)

# use the observation at timestep 50 of the second-to-last demo rollout
# as the default initial observation
env.default_init_obs = demo_rollouts[-2][50][0]
Example 4
def main():
  sess = utils.make_tf_session(gpu_mode=False)

  env = envs.make_carracing_env(sess)
  trans_env = envs.make_carracing_trans_env(sess)
  random_policy = utils.make_random_policy(env)

  utils.run_ep(random_policy, env, max_ep_len=3, render=False)
  trans_rollout = utils.run_ep(
      random_policy, trans_env, max_ep_len=3, render=False)

  logging.info('envs and policies OK')

  raw_demo_rollouts = [
      utils.run_ep(random_policy, env, max_ep_len=3, render=False)
      for _ in range(n_demo_rollouts)
  ]
  raw_aug_rollouts = [
      utils.run_ep(random_policy, env, max_ep_len=3, render=False)
      for _ in range(n_aug_rollouts)
  ]
  raw_aug_rollouts += raw_demo_rollouts

  # flatten the observations from every rollout into a single array
  raw_aug_obses = []
  for rollout in raw_aug_rollouts:
    for x in rollout:
      raw_aug_obses.append(x[0])
  raw_aug_obses = np.array(raw_aug_obses)
  raw_aug_obs_data = utils.split_rollouts({'obses': raw_aug_obses})

  logging.info('data collection OK')

  encoder = VAEModel(
      sess,
      env,
      learning_rate=0.0001,
      kl_tolerance=0.5,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'enc_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'enc.tf'))

  encoder.train(
      raw_aug_obs_data,
      iterations=1,
      ftol=1e-4,
      learning_rate=1e-3,
      val_update_freq=1,
      verbose=False)

  # swap in the pretrained encoder for the remainder of the test
  encoder = load_wm_pretrained_vae(sess, env)

  encoder.save()

  encoder.load()

  obs = raw_aug_rollouts[0][0][0]
  latent = encoder.encode_frame(obs)
  unused_recon = encoder.decode_latent(latent)

  logging.info('encoder OK')

  raw_aug_traj_data = utils.split_rollouts(
      utils.vectorize_rollouts(
          raw_aug_rollouts, env.max_ep_len, preserve_trajs=True))

  abs_model = AbsorptionModel(
      sess,
      env,
      n_layers=1,
      layer_size=32,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'abs_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'abs.tf'))

  dynamics_model = MDNRNNDynamicsModel(
      encoder,
      sess,
      env,
      scope=str(uuid.uuid4()),
      tf_file=os.path.join(test_data_dir, 'dyn.tf'),
      scope_file=os.path.join(test_data_dir, 'dyn_scope.pkl'),
      abs_model=abs_model)

  dynamics_model.train(
      raw_aug_traj_data,
      iterations=1,
      learning_rate=1e-3,
      ftol=1e-4,
      batch_size=2,
      val_update_freq=1,
      verbose=False)

  # swap in the pretrained dynamics model for the remainder of the test
  dynamics_model = load_wm_pretrained_rnn(encoder, sess, env)

  dynamics_model.save()

  dynamics_model.load()

  demo_traj_data = utils.rnn_encode_rollouts(raw_demo_rollouts, env, encoder,
                                             dynamics_model)
  aug_traj_data = utils.rnn_encode_rollouts(raw_aug_rollouts, env, encoder,
                                            dynamics_model)
  demo_rollouts = utils.rollouts_of_traj_data(demo_traj_data)
  aug_rollouts = utils.rollouts_of_traj_data(aug_traj_data)
  demo_data = utils.split_rollouts(utils.flatten_traj_data(demo_traj_data))
  aug_data = utils.split_rollouts(utils.flatten_traj_data(aug_traj_data))

  env.default_init_obs = aug_rollouts[0][0][0]

  trans_rollouts = utils.rollouts_of_traj_data(
      utils.rnn_encode_rollouts([trans_rollout], trans_env, encoder,
                                dynamics_model))
  trans_env.default_init_obs = trans_rollouts[0][0][0]

  logging.info('mdnrnn dynamics OK')

  demo_data_for_reward_model = demo_data
  demo_rollouts_for_reward_model = demo_rollouts

  sketch_data_for_reward_model = aug_data
  sketch_rollouts_for_reward_model = aug_rollouts

  reward_init_kwargs = {
      'n_rew_nets_in_ensemble': 2,
      'n_layers': 1,
      'layer_size': 32,
      'scope': str(uuid.uuid4()),
      'scope_file': os.path.join(test_data_dir, 'true_rew_scope.pkl'),
      'tf_file': os.path.join(test_data_dir, 'true_rew.tf'),
      'rew_func_input': "s'",
      'use_discrete_rewards': True
  }

  reward_train_kwargs = {
      'demo_coeff': 1.,
      'sketch_coeff': 1.,
      'iterations': 1,
      'ftol': 1e-4,
      'batch_size': 2,
      'learning_rate': 1e-3,
      'val_update_freq': 1,
      'verbose': False
  }

  data = envs.make_carracing_rew(
      sess,
      env,
      sketch_data=sketch_data_for_reward_model,
      reward_init_kwargs=reward_init_kwargs,
      reward_train_kwargs=reward_train_kwargs)
  env.__dict__.update(data)
  trans_env.__dict__.update(data)

  autolabels = reward_models.autolabel_prefs(
      aug_rollouts, env, segment_len=env.max_ep_len + 1)

  pref_logs_for_reward_model = autolabels
  pref_data_for_reward_model = utils.split_prefs(autolabels)

  logging.info('autolabels OK')

  for rew_func_input in ['s', 'sa', "s'"]:
    reward_model = reward_models.RewardModel(
        sess,
        env,
        n_rew_nets_in_ensemble=2,
        n_layers=1,
        layer_size=32,
        scope=str(uuid.uuid4()),
        scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
        tf_file=os.path.join(test_data_dir, 'rew.tf'),
        rew_func_input=rew_func_input,
        use_discrete_rewards=True)

  for demo_data in [None, demo_data_for_reward_model]:
    for sketch_data in [None, sketch_data_for_reward_model]:
      for pref_data in [None, pref_data_for_reward_model]:
        if pref_data is None and sketch_data is None:
          continue
        reward_model.train(
            demo_data=demo_data,
            sketch_data=sketch_data,
            pref_data=pref_data,
            demo_coeff=1.,
            sketch_coeff=1.,
            iterations=1,
            ftol=1e-4,
            batch_size=2,
            learning_rate=1e-3,
            val_update_freq=1,
            verbose=False)

  reward_model.save()

  reward_model.load()

  logging.info('reward models OK')

  for query_loss_opt in [
      'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew', 'max_nov'
  ]:
    for init_obs in [None, env.default_init_obs]:
      for join_trajs_at_init_state in [True, False]:
        for shoot_steps in [1, 2]:
          # skip multi-step shooting when starting from the default initial obs
          if (shoot_steps > 1 and
              np.array(init_obs == env.default_init_obs).all()):
            continue
          traj_opt = GDTrajOptimizer(
              sess,
              env,
              reward_model,
              dynamics_model,
              traj_len=2,
              n_trajs=2,
              prior_coeff=1.,
              diversity_coeff=0.,
              query_loss_opt=query_loss_opt,
              opt_init_obs=(init_obs is None),
              join_trajs_at_init_state=join_trajs_at_init_state,
              shoot_steps=shoot_steps,
              learning_rate=1e-2)

          traj_opt.run(
              init_obs=init_obs,
              iterations=1,
              ftol=1e-4,
              verbose=False,
          )

  logging.info('grad descent traj opt OK')

  imitation_kwargs = {'plan_horizon': 10, 'n_blind_steps': 2, 'test_mode': True}

  for n_eval_rollouts in [0, 1]:
    reward_models.evaluate_reward_model(
        sess,
        env,
        trans_env,
        reward_model,
        dynamics_model,
        offpol_eval_rollouts=sketch_rollouts_for_reward_model,
        n_eval_rollouts=n_eval_rollouts,
        imitation_kwargs=imitation_kwargs)

  logging.info('reward eval OK')

  for query_loss_opt in [
      'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew', 'max_nov',
      'unif'
  ]:
    for use_rand_policy in [False, True]:
      traj_opt = StochTrajOptimizer(
          sess,
          env,
          reward_model,
          dynamics_model,
          traj_len=2,
          rollout_len=2,
          query_loss_opt=query_loss_opt,
          use_rand_policy=use_rand_policy)

      for init_obs in [None, env.default_init_obs]:
        traj_opt.run(n_trajs=2, n_samples=2, init_obs=init_obs, verbose=False)

  logging.info('stoch traj opt OK')

  reward_model = reward_models.RewardModel(
      sess,
      env,
      n_rew_nets_in_ensemble=2,
      n_layers=1,
      layer_size=32,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'rew.tf'),
      rew_func_input="s'",
      use_discrete_rewards=True)

  rew_optimizer = InteractiveRewardOptimizer(sess, env, trans_env, reward_model,
                                             dynamics_model)

  reward_train_kwargs = {
      'demo_coeff': 1.,
      'sketch_coeff': 1.,
      'iterations': 1,
      'ftol': 1e-4,
      'batch_size': 2,
      'learning_rate': 1e-3,
      'val_update_freq': 1,
      'verbose': False
  }

  dynamics_train_kwargs = {
      'iterations': 1,
      'batch_size': 2,
      'learning_rate': 1e-3,
      'ftol': 1e-4,
      'val_update_freq': 1,
      'verbose': False
  }

  gd_traj_opt_init_kwargs = {
      'traj_len': env.max_ep_len,
      'n_trajs': 2,
      'prior_coeff': 1.,
      'diversity_coeff': 1.,
      'query_loss_opt': 'pref_uncertainty',
      'opt_init_obs': False,
      'learning_rate': 1e-2,
      'join_trajs_at_init_state': False
  }

  gd_traj_opt_run_kwargs = {
      'init_obs': env.default_init_obs,
      'iterations': 1,
      'ftol': 1e-4,
      'verbose': False,
  }

  unused_stoch_traj_opt_init_kwargs = {
      'traj_len': 2,
      'rollout_len': 2,
      'query_loss_opt': 'pref_uncertainty'
  }

  unused_stoch_traj_opt_run_kwargs = {
      'n_samples': 2,
      'init_obs': None,
      'verbose': False
  }

  eval_kwargs = {'n_eval_rollouts': 1}

  for init_train in [True, False]:
    for query_type in ['pref', 'sketch']:
      rew_optimizer.run(
          demo_rollouts=demo_rollouts_for_reward_model,
          sketch_rollouts=sketch_rollouts_for_reward_model,
          pref_logs=pref_logs_for_reward_model,
          rollouts_for_dyn=raw_aug_rollouts,
          reward_train_kwargs=reward_train_kwargs,
          dynamics_train_kwargs=dynamics_train_kwargs,
          traj_opt_cls=GDTrajOptimizer,
          traj_opt_run_kwargs=gd_traj_opt_run_kwargs,
          traj_opt_init_kwargs=gd_traj_opt_init_kwargs,
          imitation_kwargs=imitation_kwargs,
          eval_kwargs=eval_kwargs,
          init_train_dyn=init_train,
          init_train_rew=init_train,
          n_imitation_rollouts_per_dyn_update=1,
          n_queries=1,
          reward_update_freq=1,
          reward_eval_freq=1,
          dyn_update_freq=1,
          verbose=False,
          query_type=query_type)

  rew_optimizer.save()

  rew_optimizer.load()

  logging.info('rqst OK')
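The keyword-argument dicts assembled above (reward_train_kwargs, dynamics_train_kwargs, the traj_opt kwargs) mirror the signatures that reward_model.train, dynamics_model.train, and the trajectory optimizers were called with earlier in this example, so InteractiveRewardOptimizer.run presumably forwards them by keyword splatting. A hedged sketch of that forwarding pattern, with a hypothetical helper name (update_models) rather than the actual implementation:

def update_models(reward_model, dynamics_model, demo_data, sketch_data,
                  pref_data, traj_data, reward_train_kwargs,
                  dynamics_train_kwargs):
    # forward the pre-built kwargs dicts into each model's train method
    reward_model.train(
        demo_data=demo_data,
        sketch_data=sketch_data,
        pref_data=pref_data,
        **reward_train_kwargs)
    dynamics_model.train(traj_data, **dynamics_train_kwargs)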
Example 5
def main():
    sess = utils.make_tf_session(gpu_mode=False)

    env = envs.make_pointmass_env()
    trans_env = envs.make_pointmass_trans_env(env)
    expert_policy = env.make_expert_policy()
    random_policy = utils.make_random_policy(env)

    default_init_obs = env.default_init_obs

    utils.run_ep(expert_policy, env)
    utils.run_ep(random_policy, env)
    utils.run_ep(expert_policy, trans_env)
    utils.run_ep(random_policy, trans_env)

    logging.info('envs and policies OK')

    demo_rollouts = [
        utils.run_ep(expert_policy, env) for _ in range(n_demo_rollouts)
    ]

    aug_rollouts = demo_rollouts + [
        utils.run_ep(random_policy, env) for _ in range(n_aug_rollouts)
    ]

    demo_data = utils.split_rollouts(
        utils.vectorize_rollouts(demo_rollouts, env.max_ep_len))
    aug_data = utils.split_rollouts(
        utils.vectorize_rollouts(aug_rollouts, env.max_ep_len))

    unused_demo_traj_data = utils.split_rollouts(
        utils.vectorize_rollouts(demo_rollouts,
                                 env.max_ep_len,
                                 preserve_trajs=True))
    unused_aug_traj_data = utils.split_rollouts(
        utils.vectorize_rollouts(aug_rollouts,
                                 env.max_ep_len,
                                 preserve_trajs=True))

    logging.info('data collection OK')

    abs_model = AbsorptionModel(sess,
                                env,
                                n_layers=1,
                                layer_size=32,
                                scope=str(uuid.uuid4()),
                                scope_file=os.path.join(
                                    test_data_dir, 'abs_scope.pkl'),
                                tf_file=os.path.join(test_data_dir, 'abs.tf'))

    dynamics_model = MLPDynamicsModel(
        sess,
        env,
        n_layers=1,
        layer_size=32,
        scope=str(uuid.uuid4()),
        scope_file=os.path.join(test_data_dir, 'dyn_scope.pkl'),
        tf_file=os.path.join(test_data_dir, 'dyn.tf'),
        abs_model=abs_model)

    dynamics_model.train(aug_data,
                         iterations=1,
                         ftol=1e-4,
                         learning_rate=1e-3,
                         batch_size=4,
                         val_update_freq=1,
                         verbose=False)

    dynamics_model.save()

    dynamics_model.load()

    logging.info('dynamics model OK')

    demo_data_for_reward_model = demo_data
    demo_rollouts_for_reward_model = demo_rollouts
    sketch_data_for_reward_model = aug_data
    sketch_rollouts_for_reward_model = aug_rollouts

    autolabels = reward_models.autolabel_prefs(
        sketch_rollouts_for_reward_model, env, segment_len=env.max_ep_len + 1)

    pref_logs_for_reward_model = autolabels
    pref_data_for_reward_model = utils.split_prefs(autolabels)

    logging.info('autolabels OK')

    for rew_func_input in ['sa', 's', "s'"]:
        reward_model = reward_models.RewardModel(
            sess,
            env,
            n_rew_nets_in_ensemble=4,
            n_layers=1,
            layer_size=64,
            scope=str(uuid.uuid4()),
            scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
            tf_file=os.path.join(test_data_dir, 'rew.tf'),
            rew_func_input=rew_func_input,
            use_discrete_rewards=True)

    for demo_data in [None, demo_data_for_reward_model]:
        for sketch_data in [None, sketch_data_for_reward_model]:
            for pref_data in [None, pref_data_for_reward_model]:
                if pref_data is None and sketch_data is None:
                    continue
                reward_model.train(demo_data=demo_data,
                                   sketch_data=sketch_data,
                                   pref_data=pref_data,
                                   demo_coeff=1.,
                                   sketch_coeff=1.,
                                   iterations=1,
                                   ftol=1e-4,
                                   batch_size=4,
                                   learning_rate=1e-3,
                                   val_update_freq=1,
                                   verbose=False)

    reward_model.save()

    reward_model.load()

    logging.info('reward models OK')

    for query_loss_opt in [
            'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew',
            'max_nov'
    ]:
        for init_obs in [None, default_init_obs]:
            for join_trajs_at_init_state in [True, False]:
                for query_type in ['pref', 'demo', 'sketch']:
                    if query_type == 'pref' and query_loss_opt == 'max_nov':
                        continue

                    for shoot_steps in [1, 2]:
                        traj_optimizer = GDTrajOptimizer(
                            sess,
                            env,
                            reward_model,
                            dynamics_model,
                            traj_len=2,
                            n_trajs=2,
                            prior_coeff=1.,
                            diversity_coeff=0.,
                            query_loss_opt=query_loss_opt,
                            opt_init_obs=(init_obs is None),
                            join_trajs_at_init_state=join_trajs_at_init_state,
                            shoot_steps=shoot_steps,
                            learning_rate=1e-2,
                            query_type=query_type)

                        traj_optimizer.run(
                            init_obs=init_obs,
                            iterations=1,
                            ftol=1e-4,
                            verbose=False,
                        )

    logging.info('grad descent traj opt OK')

    imitation_kwargs = {
        'plan_horizon': 10,
        'n_blind_steps': 2,
        'test_mode': True
    }

    for n_eval_rollouts in [0, 1]:
        reward_models.evaluate_reward_model(
            sess,
            env,
            trans_env,
            reward_model,
            dynamics_model,
            offpol_eval_rollouts=sketch_rollouts_for_reward_model,
            n_eval_rollouts=n_eval_rollouts,
            imitation_kwargs=imitation_kwargs)

    logging.info('reward eval OK')

    for query_loss_opt in [
            'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew',
            'max_nov', 'unif'
    ]:
        for use_rand_policy in [False, True]:
            traj_optimizer = StochTrajOptimizer(
                sess,
                env,
                reward_model,
                dynamics_model,
                traj_len=2,
                rollout_len=2,
                query_loss_opt=query_loss_opt,
                use_rand_policy=use_rand_policy)

            for init_obs in [None, default_init_obs]:
                traj_optimizer.run(n_trajs=2,
                                   n_samples=2,
                                   init_obs=init_obs,
                                   verbose=False)

    logging.info('stoch traj opt OK')

    reward_model = reward_models.RewardModel(
        sess,
        env,
        n_rew_nets_in_ensemble=4,
        n_layers=1,
        layer_size=64,
        scope=str(uuid.uuid4()),
        scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
        tf_file=os.path.join(test_data_dir, 'rew.tf'),
        use_discrete_rewards=True)

    rew_optimizer = InteractiveRewardOptimizer(sess, env, trans_env,
                                               reward_model, dynamics_model)

    reward_train_kwargs = {
        'demo_coeff': 1.,
        'sketch_coeff': 1.,
        'iterations': 1,
        'ftol': 1e-4,
        'batch_size': 4,
        'learning_rate': 1e-3,
        'val_update_freq': 1,
        'verbose': False
    }

    dynamics_train_kwargs = {
        'iterations': 1,
        'batch_size': 4,
        'learning_rate': 1e-3,
        'ftol': 1e-4,
        'val_update_freq': 1,
        'verbose': False
    }

    gd_traj_opt_init_kwargs = {
        'traj_len': env.max_ep_len,
        'n_trajs': 2,
        'prior_coeff': 1.,
        'diversity_coeff': 1.,
        'query_loss_opt': 'pref_uncertainty',
        'opt_init_obs': False,
        'learning_rate': 1e-2,
        'join_trajs_at_init_state': False
    }

    gd_traj_opt_run_kwargs = {
        'init_obs': default_init_obs,
        'iterations': 1,
        'ftol': 1e-4,
        'verbose': False
    }

    unused_stoch_traj_opt_init_kwargs = {
        'traj_len': 2,
        'rollout_len': 2,
        'query_loss_opt': 'pref_uncertainty'
    }

    unused_stoch_traj_opt_run_kwargs = {
        'n_samples': 2,
        'init_obs': None,
        'verbose': False
    }

    eval_kwargs = {'n_eval_rollouts': 1}

    for init_train in [True, False]:
        for query_type in ['pref', 'sketch']:
            rew_optimizer.run(demo_rollouts=demo_rollouts_for_reward_model,
                              sketch_rollouts=sketch_rollouts_for_reward_model,
                              pref_logs=pref_logs_for_reward_model,
                              rollouts_for_dyn=aug_rollouts,
                              reward_train_kwargs=reward_train_kwargs,
                              dynamics_train_kwargs=dynamics_train_kwargs,
                              traj_opt_cls=GDTrajOptimizer,
                              traj_opt_run_kwargs=gd_traj_opt_run_kwargs,
                              traj_opt_init_kwargs=gd_traj_opt_init_kwargs,
                              imitation_kwargs=imitation_kwargs,
                              eval_kwargs=eval_kwargs,
                              init_train_dyn=init_train,
                              init_train_rew=init_train,
                              n_imitation_rollouts_per_dyn_update=1,
                              n_queries=1,
                              reward_update_freq=1,
                              reward_eval_freq=1,
                              dyn_update_freq=1,
                              verbose=False,
                              query_type=query_type)

    rew_optimizer.save()

    rew_optimizer.load()

    logging.info('rqst OK')
Example 6
    def _run(
        self,
        init_obs=None,
        act_seq=None,
        iterations=10000,
        ftol=1e-6,
        min_iters=2,
        verbose=False,
        warm_start=False,
        init_with_lbfgs=False,
        init_act_seq=None,
        init_traj=None,
    ):

        if (init_obs is not None) == self.opt_init_obs:
            raise ValueError(
                'provide init_obs iff the initial observation is not optimized')

        if (act_seq is not None) == self.opt_act_seq:
            raise ValueError(
                'provide act_seq iff the action sequence is not optimized')

        if act_seq is not None and init_act_seq is not None:
            raise ValueError('act_seq and init_act_seq are mutually exclusive')

        if init_act_seq is not None and warm_start:
            raise ValueError(
                'init_act_seq and warm_start are mutually exclusive')

        # for the 'unif' option, sample trajectories at random instead of
        # optimizing them
        if self.query_loss_opt == 'unif':
            if self.env.name == 'clfbandit':
                std = np.exp(-self.prior_coeff)

                def rand_traj():
                    obs = np.random.normal(0, std,
                                           self.env.n_z_dim)[np.newaxis, :]
                    next_obs = self.env.absorbing_state[np.newaxis, :]
                    return np.concatenate((obs, next_obs), axis=0)

                trajs_eval = [rand_traj() for _ in range(self.n_trajs)]
                act_seqs_eval = [[self.env.action_space.sample()]
                                 for _ in range(self.n_trajs)]
            elif self.env.name == 'pointmass':
                unif_env = envs.make_pointmass_env()
                random_policy = utils.make_random_policy(unif_env)
                unif_rollouts = [
                    utils.run_ep(random_policy, unif_env, max_ep_len=1)
                    for _ in range(self.n_trajs)
                ]
                trajs_eval = [
                    utils.traj_of_rollout(rollout) for rollout in unif_rollouts
                ]
                act_seqs_eval = [
                    utils.act_seq_of_rollout(rollout)
                    for rollout in unif_rollouts
                ]
            else:
                raise ValueError(
                    'unif not supported for env: %s' % self.env.name)
            loss_eval = 0.
            return {
                'traj': trajs_eval,
                'act_seq': act_seqs_eval,
                'loss': loss_eval
            }

        scopes = [self.opt_scope]
        if not warm_start:
            scopes.append(self.traj_scope)
        utils.init_tf_vars(self.sess, scopes, use_cache=True)

        feed_dict = {}
        assign_ops = []
        if init_act_seq is not None:
            feed_dict[self.init_act_seq_ph] = init_act_seq
            assign_ops.append(self.assign_init_act_seq)
        if init_traj is not None:
            self.obs_dim = (self.env.n_z_dim if self.env.name == 'carracing'
                            else self.env.n_obs_dim)
            feed_dict[self.init_traj_ph] = init_traj[1:, :self.obs_dim]
            assign_ops.append(self.assign_init_traj)
        if assign_ops:
            self.sess.run(assign_ops, feed_dict=feed_dict)

        # rebuild the feed dict for the optimization itself
        feed_dict = {}
        if init_obs is not None:
            feed_dict[self.init_obs_ph] = init_obs() if callable(
                init_obs) else init_obs
        if act_seq is not None:
            feed_dict[self.act_seq_ph] = act_seq

        if verbose:
            print('iters loss')

        if init_with_lbfgs:
            self.lbfgs_optimizer.minimize(self.sess, feed_dict=feed_dict)

        loss_evals = []
        loss_eval, trajs_eval, act_seqs_eval = self.sess.run(
            [self.loss, self.trajs, self.act_seqs], feed_dict=feed_dict)
        best_eval = {
            'traj': trajs_eval,
            'act_seq': act_seqs_eval,
            'loss': loss_eval
        }
        #start_time = time.time() # uncomment for profiling
        for t in range(iterations):
            loss_eval, trajs_eval, act_seqs_eval, _ = self.sess.run(
                [self.loss, self.trajs, self.act_seqs, self.update_op],
                feed_dict=feed_dict)

            if verbose:
                print('%d %f' % (t, loss_eval))

            loss_evals.append(loss_eval)

            if loss_eval < best_eval['loss']:
                best_eval = {
                    'traj': trajs_eval,
                    'act_seq': act_seqs_eval,
                    'loss': loss_eval
                }

            if ftol is not None and utils.converged(
                    loss_evals, ftol, min_iters=min_iters):
                break
        # uncomment for profiling
        #print('call to update_op: %0.3f' % ((time.time() - start_time) / t))
        #print('iterations: %d' % t)

        if verbose:
            plt.plot(loss_evals)
            plt.show()

        return best_eval
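The loop above stops early once utils.converged(loss_evals, ftol, min_iters=min_iters) reports convergence; that helper is not shown in this section. A minimal sketch of one plausible criterion, assuming convergence means the relative change between the last two losses falls below ftol after at least min_iters evaluations (the real utils.converged may differ):

def converged(loss_evals, ftol, min_iters=2):
    # illustrative relative-change test, not the actual utils.converged
    if len(loss_evals) < max(min_iters, 2):
        return False
    prev, curr = loss_evals[-2], loss_evals[-1]
    denom = max(abs(prev), 1e-12)
    return abs(curr - prev) / denom < ftol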