def __init__(self,
             sess,
             env,
             reward_model,
             dynamics_model,
             traj_len=100,
             rollout_len=None,
             query_loss_opt='pref_uncertainty',
             imitation_kwargs=None,
             use_rand_policy=False,
             query_type='sketch',
             guided_search=False):
  if query_type not in ['pref', 'sketch', 'demo']:
    raise ValueError('unrecognized query_type: %s' % query_type)
  if traj_len > env.max_ep_len + 1:
    traj_len = env.max_ep_len + 1
  if rollout_len is None:
    rollout_len = env.max_ep_len
  if traj_len > rollout_len + 1:
    raise ValueError('traj_len must be at most rollout_len + 1')
  if use_rand_policy and guided_search:
    raise ValueError('use_rand_policy and guided_search are mutually exclusive')
  if guided_search and query_loss_opt == 'pref_uncertainty':
    raise ValueError("guided_search does not support 'pref_uncertainty'")

  # Look up the query loss method by name instead of eval'ing a string.
  self.query_loss = getattr(self, '%s_query_loss' % query_loss_opt)

  self.sess = sess
  self.query_type = query_type
  self.env = env
  self.reward_model = reward_model
  self.dynamics_model = dynamics_model
  self.traj_len = traj_len
  self.rollout_len = rollout_len
  self.use_rand_policy = use_rand_policy
  self.guided_search = guided_search
  # Copy to avoid mutating a shared (default) kwargs dict across instances.
  self.imitation_kwargs = dict(imitation_kwargs or {})
  self.query_loss_opt = query_loss_opt

  if self.use_rand_policy:
    self.imitator = utils.make_random_policy(self.env)
  else:
    query_loss_opt_for_pol = (
        query_loss_opt if self.guided_search else 'max_rew')
    if 'plan_horizon' not in self.imitation_kwargs:
      self.imitation_kwargs['plan_horizon'] = self.rollout_len
    self.imitator = make_imitation_policy(
        self.sess,
        self.env,
        self.reward_model,
        self.dynamics_model,
        query_loss_opt=query_loss_opt_for_pol,
        **self.imitation_kwargs)
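# Example usage (hypothetical values, mirroring how the sampling-based
# optimizer is exercised in the smoke tests; assumes this constructor belongs
# to StochTrajOptimizer and that sess, env, reward_model, and dynamics_model
# have already been built):
#
#   traj_opt = StochTrajOptimizer(
#       sess, env, reward_model, dynamics_model,
#       traj_len=2, rollout_len=2,
#       query_loss_opt='pref_uncertainty', use_rand_policy=False)
#   queries = traj_opt.run(n_trajs=2, n_samples=2, init_obs=None, verbose=False)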
def main():
  unused_sess = utils.make_tf_session(gpu_mode=False)

  env = envs.make_clfbandit_env(verbose=True)
  env.expert_policy = env.make_expert_policy()
  env.random_policy = utils.make_random_policy(env)
  trans_env = envs.make_clfbandit_trans_env(env)

  utils.run_ep(env.expert_policy, env)
  utils.run_ep(env.random_policy, env)
  utils.run_ep(trans_env.expert_policy, trans_env)
  utils.run_ep(trans_env.random_policy, trans_env)
  logging.info('OK')
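# Minimal entry-point sketch (an assumption about how this smoke test is
# invoked; the repo may already provide its own runner or guard elsewhere):
if __name__ == '__main__':
  main()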
from matplotlib import pyplot as plt
import matplotlib.animation
import matplotlib as mpl

import warnings
warnings.filterwarnings('ignore')

sess = utils.make_tf_session(gpu_mode=False)

plot_traj = lambda traj, *args, **kwargs: utils.plot_trajs(
    [traj], *args, **kwargs)

env = envs.make_carracing_env(sess, load_reward=True)
random_policy = utils.make_random_policy(env)
trans_env = None

encoder = load_wm_pretrained_vae(sess, env)
dynamics_model = load_wm_pretrained_rnn(encoder, sess, env)

with open(
    os.path.join(utils.carracing_data_dir, 'rnn_enc_demo_rollouts.pkl'),
    'rb') as f:
  demo_rollouts = pickle.load(f)

with open(
    os.path.join(utils.carracing_data_dir, 'rnn_enc_aug_rollouts.pkl'),
    'rb') as f:
  aug_rollouts = pickle.load(f)

env.default_init_obs = demo_rollouts[-2][50][0]
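# Hypothetical plotting example (assumes utils.traj_of_rollout, used elsewhere
# in this repo, applies to the loaded RNN-encoded rollouts):
#
#   plot_traj(utils.traj_of_rollout(demo_rollouts[0]))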
def main():
  sess = utils.make_tf_session(gpu_mode=False)

  env = envs.make_carracing_env(sess)
  trans_env = envs.make_carracing_trans_env(sess)
  random_policy = utils.make_random_policy(env)

  utils.run_ep(random_policy, env, max_ep_len=3, render=False)
  trans_rollout = utils.run_ep(
      random_policy, trans_env, max_ep_len=3, render=False)
  logging.info('envs and policies OK')

  raw_demo_rollouts = [
      utils.run_ep(random_policy, env, max_ep_len=3, render=False)
      for _ in range(n_demo_rollouts)
  ]
  raw_aug_rollouts = [
      utils.run_ep(random_policy, env, max_ep_len=3, render=False)
      for _ in range(n_aug_rollouts)
  ]
  raw_aug_rollouts += raw_demo_rollouts
  raw_aug_obses = []
  for rollout in raw_aug_rollouts:
    for x in rollout:
      raw_aug_obses.append(x[0])
  raw_aug_obses = np.array(raw_aug_obses)
  raw_aug_obs_data = utils.split_rollouts({'obses': raw_aug_obses})
  logging.info('data collection OK')

  encoder = VAEModel(
      sess,
      env,
      learning_rate=0.0001,
      kl_tolerance=0.5,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'enc_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'enc.tf'))
  encoder.train(
      raw_aug_obs_data,
      iterations=1,
      ftol=1e-4,
      learning_rate=1e-3,
      val_update_freq=1,
      verbose=False)
  encoder = load_wm_pretrained_vae(sess, env)
  encoder.save()
  encoder.load()
  obs = raw_aug_rollouts[0][0][0]
  latent = encoder.encode_frame(obs)
  unused_recon = encoder.decode_latent(latent)
  logging.info('encoder OK')

  raw_aug_traj_data = utils.split_rollouts(
      utils.vectorize_rollouts(
          raw_aug_rollouts, env.max_ep_len, preserve_trajs=True))
  abs_model = AbsorptionModel(
      sess,
      env,
      n_layers=1,
      layer_size=32,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'abs_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'abs.tf'))
  dynamics_model = MDNRNNDynamicsModel(
      encoder,
      sess,
      env,
      scope=str(uuid.uuid4()),
      tf_file=os.path.join(test_data_dir, 'dyn.tf'),
      scope_file=os.path.join(test_data_dir, 'dyn_scope.pkl'),
      abs_model=abs_model)
  dynamics_model.train(
      raw_aug_traj_data,
      iterations=1,
      learning_rate=1e-3,
      ftol=1e-4,
      batch_size=2,
      val_update_freq=1,
      verbose=False)
  dynamics_model = load_wm_pretrained_rnn(encoder, sess, env)
  dynamics_model.save()
  dynamics_model.load()

  demo_traj_data = utils.rnn_encode_rollouts(raw_demo_rollouts, env, encoder,
                                             dynamics_model)
  aug_traj_data = utils.rnn_encode_rollouts(raw_aug_rollouts, env, encoder,
                                            dynamics_model)
  demo_rollouts = utils.rollouts_of_traj_data(demo_traj_data)
  aug_rollouts = utils.rollouts_of_traj_data(aug_traj_data)
  demo_data = utils.split_rollouts(utils.flatten_traj_data(demo_traj_data))
  aug_data = utils.split_rollouts(utils.flatten_traj_data(aug_traj_data))
  env.default_init_obs = aug_rollouts[0][0][0]
  trans_rollouts = utils.rollouts_of_traj_data(
      utils.rnn_encode_rollouts([trans_rollout], trans_env, encoder,
                                dynamics_model))
  trans_env.default_init_obs = trans_rollouts[0][0][0]
  logging.info('mdnrnn dynamics OK')

  demo_data_for_reward_model = demo_data
  demo_rollouts_for_reward_model = demo_rollouts
  sketch_data_for_reward_model = aug_data
  sketch_rollouts_for_reward_model = aug_rollouts

  reward_init_kwargs = {
      'n_rew_nets_in_ensemble': 2,
      'n_layers': 1,
      'layer_size': 32,
      'scope': str(uuid.uuid4()),
      'scope_file': os.path.join(test_data_dir, 'true_rew_scope.pkl'),
      'tf_file': os.path.join(test_data_dir, 'true_rew.tf'),
      'rew_func_input': "s'",
      'use_discrete_rewards': True
  }
  reward_train_kwargs = {
      'demo_coeff': 1.,
      'sketch_coeff': 1.,
      'iterations': 1,
      'ftol': 1e-4,
      'batch_size': 2,
      'learning_rate': 1e-3,
      'val_update_freq': 1,
      'verbose': False
  }
  data = envs.make_carracing_rew(
      sess,
      env,
      sketch_data=sketch_data_for_reward_model,
      reward_init_kwargs=reward_init_kwargs,
      reward_train_kwargs=reward_train_kwargs)
  env.__dict__.update(data)
  trans_env.__dict__.update(data)

  autolabels = reward_models.autolabel_prefs(
      aug_rollouts, env, segment_len=env.max_ep_len + 1)
  pref_logs_for_reward_model = autolabels
  pref_data_for_reward_model = utils.split_prefs(autolabels)
  logging.info('autolabels OK')

  for rew_func_input in ['s', 'sa', "s'"]:
    reward_model = reward_models.RewardModel(
        sess,
        env,
        n_rew_nets_in_ensemble=2,
        n_layers=1,
        layer_size=32,
        scope=str(uuid.uuid4()),
        scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
        tf_file=os.path.join(test_data_dir, 'rew.tf'),
        rew_func_input=rew_func_input,
        use_discrete_rewards=True)
    for demo_data in [None, demo_data_for_reward_model]:
      for sketch_data in [None, sketch_data_for_reward_model]:
        for pref_data in [None, pref_data_for_reward_model]:
          if pref_data is None and sketch_data is None:
            continue
          reward_model.train(
              demo_data=demo_data,
              sketch_data=sketch_data,
              pref_data=pref_data,
              demo_coeff=1.,
              sketch_coeff=1.,
              iterations=1,
              ftol=1e-4,
              batch_size=2,
              learning_rate=1e-3,
              val_update_freq=1,
              verbose=False)
  reward_model.save()
  reward_model.load()
  logging.info('reward models OK')

  for query_loss_opt in [
      'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew', 'max_nov'
  ]:
    for init_obs in [None, env.default_init_obs]:
      for join_trajs_at_init_state in [True, False]:
        for shoot_steps in [1, 2]:
          if (shoot_steps > 1 and
              np.array(init_obs == env.default_init_obs).all()):
            continue
          traj_opt = GDTrajOptimizer(
              sess,
              env,
              reward_model,
              dynamics_model,
              traj_len=2,
              n_trajs=2,
              prior_coeff=1.,
              diversity_coeff=0.,
              query_loss_opt=query_loss_opt,
              opt_init_obs=(init_obs is None),
              join_trajs_at_init_state=join_trajs_at_init_state,
              shoot_steps=shoot_steps,
              learning_rate=1e-2)
          traj_opt.run(
              init_obs=init_obs,
              iterations=1,
              ftol=1e-4,
              verbose=False,
          )
  logging.info('grad descent traj opt OK')

  imitation_kwargs = {'plan_horizon': 10, 'n_blind_steps': 2, 'test_mode': True}
  for n_eval_rollouts in [0, 1]:
    reward_models.evaluate_reward_model(
        sess,
        env,
        trans_env,
        reward_model,
        dynamics_model,
        offpol_eval_rollouts=sketch_rollouts_for_reward_model,
        n_eval_rollouts=n_eval_rollouts,
        imitation_kwargs=imitation_kwargs)
  logging.info('reward eval OK')

  for query_loss_opt in [
      'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew', 'max_nov',
      'unif'
  ]:
    for use_rand_policy in [False, True]:
      traj_opt = StochTrajOptimizer(
          sess,
          env,
          reward_model,
          dynamics_model,
          traj_len=2,
          rollout_len=2,
          query_loss_opt=query_loss_opt,
          use_rand_policy=use_rand_policy)
      for init_obs in [None, env.default_init_obs]:
        traj_opt.run(n_trajs=2, n_samples=2, init_obs=init_obs, verbose=False)
  logging.info('stoch traj opt OK')

  reward_model = reward_models.RewardModel(
      sess,
      env,
      n_rew_nets_in_ensemble=2,
      n_layers=1,
      layer_size=32,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'rew.tf'),
      rew_func_input="s'",
      use_discrete_rewards=True)
  rew_optimizer = InteractiveRewardOptimizer(sess, env, trans_env,
                                             reward_model, dynamics_model)
  reward_train_kwargs = {
      'demo_coeff': 1.,
      'sketch_coeff': 1.,
      'iterations': 1,
      'ftol': 1e-4,
      'batch_size': 2,
      'learning_rate': 1e-3,
      'val_update_freq': 1,
      'verbose': False
  }
  dynamics_train_kwargs = {
      'iterations': 1,
      'batch_size': 2,
      'learning_rate': 1e-3,
      'ftol': 1e-4,
      'val_update_freq': 1,
      'verbose': False
  }
  gd_traj_opt_init_kwargs = {
      'traj_len': env.max_ep_len,
      'n_trajs': 2,
      'prior_coeff': 1.,
      'diversity_coeff': 1.,
      'query_loss_opt': 'pref_uncertainty',
      'opt_init_obs': False,
      'learning_rate': 1e-2,
      'join_trajs_at_init_state': False
  }
  gd_traj_opt_run_kwargs = {
      'init_obs': env.default_init_obs,
      'iterations': 1,
      'ftol': 1e-4,
      'verbose': False,
  }
  unused_stoch_traj_opt_init_kwargs = {
      'traj_len': 2,
      'rollout_len': 2,
      'query_loss_opt': 'pref_uncertainty'
  }
  unused_stoch_traj_opt_run_kwargs = {
      'n_samples': 2,
      'init_obs': None,
      'verbose': False
  }
  eval_kwargs = {'n_eval_rollouts': 1}
  for init_train in [True, False]:
    for query_type in ['pref', 'sketch']:
      rew_optimizer.run(
          demo_rollouts=demo_rollouts_for_reward_model,
          sketch_rollouts=sketch_rollouts_for_reward_model,
          pref_logs=pref_logs_for_reward_model,
          rollouts_for_dyn=raw_aug_rollouts,
          reward_train_kwargs=reward_train_kwargs,
          dynamics_train_kwargs=dynamics_train_kwargs,
          traj_opt_cls=GDTrajOptimizer,
          traj_opt_run_kwargs=gd_traj_opt_run_kwargs,
          traj_opt_init_kwargs=gd_traj_opt_init_kwargs,
          imitation_kwargs=imitation_kwargs,
          eval_kwargs=eval_kwargs,
          init_train_dyn=init_train,
          init_train_rew=init_train,
          n_imitation_rollouts_per_dyn_update=1,
          n_queries=1,
          reward_update_freq=1,
          reward_eval_freq=1,
          dyn_update_freq=1,
          verbose=False,
          query_type=query_type)
  rew_optimizer.save()
  rew_optimizer.load()
  logging.info('rqst OK')
def main():
  sess = utils.make_tf_session(gpu_mode=False)

  env = envs.make_pointmass_env()
  trans_env = envs.make_pointmass_trans_env(env)
  expert_policy = env.make_expert_policy()
  random_policy = utils.make_random_policy(env)
  default_init_obs = env.default_init_obs

  utils.run_ep(expert_policy, env)
  utils.run_ep(random_policy, env)
  utils.run_ep(expert_policy, trans_env)
  utils.run_ep(random_policy, trans_env)
  logging.info('envs and policies OK')

  demo_rollouts = [
      utils.run_ep(expert_policy, env) for _ in range(n_demo_rollouts)
  ]
  aug_rollouts = demo_rollouts + [
      utils.run_ep(random_policy, env) for _ in range(n_aug_rollouts)
  ]
  demo_data = utils.split_rollouts(
      utils.vectorize_rollouts(demo_rollouts, env.max_ep_len))
  aug_data = utils.split_rollouts(
      utils.vectorize_rollouts(aug_rollouts, env.max_ep_len))
  unused_demo_traj_data = utils.split_rollouts(
      utils.vectorize_rollouts(
          demo_rollouts, env.max_ep_len, preserve_trajs=True))
  unused_aug_traj_data = utils.split_rollouts(
      utils.vectorize_rollouts(
          aug_rollouts, env.max_ep_len, preserve_trajs=True))
  logging.info('data collection OK')

  abs_model = AbsorptionModel(
      sess,
      env,
      n_layers=1,
      layer_size=32,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'abs_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'abs.tf'))
  dynamics_model = MLPDynamicsModel(
      sess,
      env,
      n_layers=1,
      layer_size=32,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'dyn_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'dyn.tf'),
      abs_model=abs_model)
  dynamics_model.train(
      aug_data,
      iterations=1,
      ftol=1e-4,
      learning_rate=1e-3,
      batch_size=4,
      val_update_freq=1,
      verbose=False)
  dynamics_model.save()
  dynamics_model.load()
  logging.info('dynamics model OK')

  demo_data_for_reward_model = demo_data
  demo_rollouts_for_reward_model = demo_rollouts
  sketch_data_for_reward_model = aug_data
  sketch_rollouts_for_reward_model = aug_rollouts

  autolabels = reward_models.autolabel_prefs(
      sketch_rollouts_for_reward_model, env, segment_len=env.max_ep_len + 1)
  pref_logs_for_reward_model = autolabels
  pref_data_for_reward_model = utils.split_prefs(autolabels)
  logging.info('autolabels OK')

  for rew_func_input in ['sa', 's', "s'"]:
    reward_model = reward_models.RewardModel(
        sess,
        env,
        n_rew_nets_in_ensemble=4,
        n_layers=1,
        layer_size=64,
        scope=str(uuid.uuid4()),
        scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
        tf_file=os.path.join(test_data_dir, 'rew.tf'),
        rew_func_input=rew_func_input,
        use_discrete_rewards=True)
    for demo_data in [None, demo_data_for_reward_model]:
      for sketch_data in [None, sketch_data_for_reward_model]:
        for pref_data in [None, pref_data_for_reward_model]:
          if pref_data is None and sketch_data is None:
            continue
          reward_model.train(
              demo_data=demo_data,
              sketch_data=sketch_data,
              pref_data=pref_data,
              demo_coeff=1.,
              sketch_coeff=1.,
              iterations=1,
              ftol=1e-4,
              batch_size=4,
              learning_rate=1e-3,
              val_update_freq=1,
              verbose=False)
  reward_model.save()
  reward_model.load()
  logging.info('reward models OK')

  for query_loss_opt in [
      'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew', 'max_nov'
  ]:
    for init_obs in [None, default_init_obs]:
      for join_trajs_at_init_state in [True, False]:
        for query_type in ['pref', 'demo', 'sketch']:
          if query_type == 'pref' and query_loss_opt == 'max_nov':
            continue
          for shoot_steps in [1, 2]:
            traj_optimizer = GDTrajOptimizer(
                sess,
                env,
                reward_model,
                dynamics_model,
                traj_len=2,
                n_trajs=2,
                prior_coeff=1.,
                diversity_coeff=0.,
                query_loss_opt=query_loss_opt,
                opt_init_obs=(init_obs is None),
                join_trajs_at_init_state=join_trajs_at_init_state,
                shoot_steps=shoot_steps,
                learning_rate=1e-2,
                query_type=query_type)
            traj_optimizer.run(
                init_obs=init_obs,
                iterations=1,
                ftol=1e-4,
                verbose=False,
            )
  logging.info('grad descent traj opt OK')

  imitation_kwargs = {
      'plan_horizon': 10,
      'n_blind_steps': 2,
      'test_mode': True
  }
  for n_eval_rollouts in [0, 1]:
    reward_models.evaluate_reward_model(
        sess,
        env,
        trans_env,
        reward_model,
        dynamics_model,
        offpol_eval_rollouts=sketch_rollouts_for_reward_model,
        n_eval_rollouts=n_eval_rollouts,
        imitation_kwargs=imitation_kwargs)
  logging.info('reward eval OK')

  for query_loss_opt in [
      'pref_uncertainty', 'rew_uncertainty', 'max_rew', 'min_rew', 'max_nov',
      'unif'
  ]:
    for use_rand_policy in [False, True]:
      traj_optimizer = StochTrajOptimizer(
          sess,
          env,
          reward_model,
          dynamics_model,
          traj_len=2,
          rollout_len=2,
          query_loss_opt=query_loss_opt,
          use_rand_policy=use_rand_policy)
      for init_obs in [None, default_init_obs]:
        traj_optimizer.run(
            n_trajs=2, n_samples=2, init_obs=init_obs, verbose=False)
  logging.info('stoch traj opt OK')

  reward_model = reward_models.RewardModel(
      sess,
      env,
      n_rew_nets_in_ensemble=4,
      n_layers=1,
      layer_size=64,
      scope=str(uuid.uuid4()),
      scope_file=os.path.join(test_data_dir, 'rew_scope.pkl'),
      tf_file=os.path.join(test_data_dir, 'rew.tf'),
      use_discrete_rewards=True)
  rew_optimizer = InteractiveRewardOptimizer(sess, env, trans_env,
                                             reward_model, dynamics_model)
  reward_train_kwargs = {
      'demo_coeff': 1.,
      'sketch_coeff': 1.,
      'iterations': 1,
      'ftol': 1e-4,
      'batch_size': 4,
      'learning_rate': 1e-3,
      'val_update_freq': 1,
      'verbose': False
  }
  dynamics_train_kwargs = {
      'iterations': 1,
      'batch_size': 4,
      'learning_rate': 1e-3,
      'ftol': 1e-4,
      'val_update_freq': 1,
      'verbose': False
  }
  gd_traj_opt_init_kwargs = {
      'traj_len': env.max_ep_len,
      'n_trajs': 2,
      'prior_coeff': 1.,
      'diversity_coeff': 1.,
      'query_loss_opt': 'pref_uncertainty',
      'opt_init_obs': False,
      'learning_rate': 1e-2,
      'join_trajs_at_init_state': False
  }
  gd_traj_opt_run_kwargs = {
      'init_obs': default_init_obs,
      'iterations': 1,
      'ftol': 1e-4,
      'verbose': False
  }
  unused_stoch_traj_opt_init_kwargs = {
      'traj_len': 2,
      'rollout_len': 2,
      'query_loss_opt': 'pref_uncertainty'
  }
  unused_stoch_traj_opt_run_kwargs = {
      'n_samples': 2,
      'init_obs': None,
      'verbose': False
  }
  eval_kwargs = {'n_eval_rollouts': 1}
  for init_train in [True, False]:
    for query_type in ['pref', 'sketch']:
      rew_optimizer.run(
          demo_rollouts=demo_rollouts_for_reward_model,
          sketch_rollouts=sketch_rollouts_for_reward_model,
          pref_logs=pref_logs_for_reward_model,
          rollouts_for_dyn=aug_rollouts,
          reward_train_kwargs=reward_train_kwargs,
          dynamics_train_kwargs=dynamics_train_kwargs,
          traj_opt_cls=GDTrajOptimizer,
          traj_opt_run_kwargs=gd_traj_opt_run_kwargs,
          traj_opt_init_kwargs=gd_traj_opt_init_kwargs,
          imitation_kwargs=imitation_kwargs,
          eval_kwargs=eval_kwargs,
          init_train_dyn=init_train,
          init_train_rew=init_train,
          n_imitation_rollouts_per_dyn_update=1,
          n_queries=1,
          reward_update_freq=1,
          reward_eval_freq=1,
          dyn_update_freq=1,
          verbose=False,
          query_type=query_type)
  rew_optimizer.save()
  rew_optimizer.load()
  logging.info('rqst OK')
def _run(
    self,
    init_obs=None,
    act_seq=None,
    iterations=10000,
    ftol=1e-6,
    min_iters=2,
    verbose=False,
    warm_start=False,
    init_with_lbfgs=False,
    init_act_seq=None,
    init_traj=None,
):
  if (init_obs is not None) == self.opt_init_obs:
    raise ValueError('init_obs must be provided iff opt_init_obs is False')
  if (act_seq is not None) == self.opt_act_seq:
    raise ValueError('act_seq must be provided iff opt_act_seq is False')
  if act_seq is not None and init_act_seq is not None:
    raise ValueError('act_seq and init_act_seq are mutually exclusive')
  if init_act_seq is not None and warm_start:
    raise ValueError('init_act_seq and warm_start are mutually exclusive')

  # Uniform baseline: skip gradient descent and return random trajectories.
  if self.query_loss_opt == 'unif':
    if self.env.name == 'clfbandit':
      std = np.exp(-self.prior_coeff)

      def rand_traj():
        obs = np.random.normal(0, std, self.env.n_z_dim)[np.newaxis, :]
        next_obs = self.env.absorbing_state[np.newaxis, :]
        return np.concatenate((obs, next_obs), axis=0)

      trajs_eval = [rand_traj() for _ in range(self.n_trajs)]
      act_seqs_eval = [[self.env.action_space.sample()]
                       for _ in range(self.n_trajs)]
    elif self.env.name == 'pointmass':
      unif_env = envs.make_pointmass_env()
      random_policy = utils.make_random_policy(unif_env)
      unif_rollouts = [
          utils.run_ep(random_policy, unif_env, max_ep_len=1)
          for _ in range(self.n_trajs)
      ]
      trajs_eval = [utils.traj_of_rollout(rollout) for rollout in unif_rollouts]
      act_seqs_eval = [
          utils.act_seq_of_rollout(rollout) for rollout in unif_rollouts
      ]
    else:
      raise ValueError(
          "query_loss_opt 'unif' is not supported for env %s" % self.env.name)
    loss_eval = 0.
    return {'traj': trajs_eval, 'act_seq': act_seqs_eval, 'loss': loss_eval}

  scopes = [self.opt_scope]
  if not warm_start:
    scopes.append(self.traj_scope)
  utils.init_tf_vars(self.sess, scopes, use_cache=True)

  feed_dict = {}
  assign_ops = []
  if init_act_seq is not None:
    feed_dict[self.init_act_seq_ph] = init_act_seq
    assign_ops.append(self.assign_init_act_seq)
  if init_traj is not None:
    self.obs_dim = (
        self.env.n_z_dim if self.env.name == 'carracing' else
        self.env.n_obs_dim)
    feed_dict[self.init_traj_ph] = init_traj[1:, :self.obs_dim]
    assign_ops.append(self.assign_init_traj)
  if assign_ops:
    self.sess.run(assign_ops, feed_dict=feed_dict)

  feed_dict = {}
  if init_obs is not None:
    feed_dict[self.init_obs_ph] = init_obs() if callable(init_obs) else init_obs
  if act_seq is not None:
    feed_dict[self.act_seq_ph] = act_seq

  if verbose:
    print('iters loss')

  if init_with_lbfgs:
    self.lbfgs_optimizer.minimize(self.sess, feed_dict=feed_dict)

  loss_evals = []
  loss_eval, trajs_eval, act_seqs_eval = self.sess.run(
      [self.loss, self.trajs, self.act_seqs], feed_dict=feed_dict)
  best_eval = {'traj': trajs_eval, 'act_seq': act_seqs_eval, 'loss': loss_eval}

  #start_time = time.time()  # uncomment for profiling
  for t in range(iterations):
    loss_eval, trajs_eval, act_seqs_eval, _ = self.sess.run(
        [self.loss, self.trajs, self.act_seqs, self.update_op],
        feed_dict=feed_dict)

    if verbose:
      print('%d %f' % (t, loss_eval))

    loss_evals.append(loss_eval)
    if loss_eval < best_eval['loss']:
      best_eval = {
          'traj': trajs_eval,
          'act_seq': act_seqs_eval,
          'loss': loss_eval
      }

    if ftol is not None and utils.converged(
        loss_evals, ftol, min_iters=min_iters):
      break

  # uncomment for profiling
  #print('call to update_op: %0.3f' % ((time.time() - start_time) / t))
  #print('iterations: %d' % t)

  if verbose:
    plt.plot(loss_evals)
    plt.show()

  return best_eval
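# Example of driving this optimizer as in the smoke tests (hypothetical values;
# `run` is assumed to be the public wrapper that forwards to `_run`):
#
#   traj_opt = GDTrajOptimizer(
#       sess, env, reward_model, dynamics_model,
#       traj_len=2, n_trajs=2, prior_coeff=1., diversity_coeff=0.,
#       query_loss_opt='pref_uncertainty', opt_init_obs=False,
#       join_trajs_at_init_state=False, shoot_steps=1, learning_rate=1e-2)
#   best = traj_opt.run(
#       init_obs=env.default_init_obs, iterations=1, ftol=1e-4, verbose=False)
#   # best is a dict with keys 'traj', 'act_seq', and 'loss'.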