def find_deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories(expert_trajectories, learner_policy, limit_trajs, data_subsamp_freq, ipython_after_eval):
    # Load the learner's policy
    policy_file, policy_key = util.split_h5_name(learner_policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Load the expert trajectories
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = imitate_mj.load_dataset(
        expert_trajectories, limit_trajs, data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Generate the actions according to the learner's policy for the expert's observations
    learner_actions_Bstacked_Da = policy.sample_actions(exobs_Bstacked_Do)[0]

    # Calculate the deviation histogram
    action_deviations = np.linalg.norm(exa_Bstacked_Da - learner_actions_Bstacked_Da, axis=1)

    # Plot the histogram
    # sns.kdeplot(action_deviations, shade=True)  # FIXME: Uncomment the following
    plt.figure()
    plt.hist(action_deviations, bins=100)
    plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories.png')
    plt.show()

    if ipython_after_eval:
        import IPython; IPython.embed()
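# The per-state deviation metric used above (and in the next function) is the Euclidean
# norm of the difference between the expert action and the learner action for the same
# observation. The sketch below is a minimal, self-contained illustration of that
# computation on random dummy data; the array shapes (B state-action pairs of dimension
# Da) are assumptions for illustration only, not values used by this codebase.
def _example_action_deviation_histogram():
    import numpy as np
    import matplotlib.pyplot as plt
    B, Da = 1000, 6  # hypothetical number of state-action pairs and action dimension
    expert_a_B_Da = np.random.randn(B, Da)
    learner_a_B_Da = expert_a_B_Da + 0.1 * np.random.randn(B, Da)  # learner close to expert
    # Row-wise L2 norm: one deviation value per state-action pair
    deviations_B = np.linalg.norm(expert_a_B_Da - learner_a_B_Da, axis=1)
    plt.hist(deviations_B, bins=100)
    plt.show()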
def find_deviation_of_agent_actions_from_expert_actions_for_underperforming_trajectories(learner_trajectories, expert_policy, lower_bound_reward, ipython_after_eval, generate_plot):
    obs, a, r, l = find_underperforming_trajectories(learner_trajectories, lower_bound_reward)
    print(type(obs))

    # Load the expert's policy
    policy_file, policy_key = util.split_h5_name(expert_policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Generate the actions according to the expert's policy for the observations in the underperforming trajectories
    expert_actions = policy.sample_actions(obs.reshape((-1, obs.shape[-1])))[0].reshape((-1, a.shape[1], a.shape[2]))

    # Calculate the deviation histogram
    action_deviations = np.linalg.norm(expert_actions.reshape((-1, a.shape[-1])) - a.reshape((-1, a.shape[-1])), axis=1)

    if generate_plot:
        plt.figure()
        plt.hist(action_deviations, bins=100)
        plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_underperforming_learner_trajectories.png')
        plt.show()

    if ipython_after_eval:
        import IPython; IPython.embed()
def load_trained_policy_and_mdp(env_name, policy_state_str):
    """
    Creates the specialized MDP and policy objects needed to sample expert
    trajectories for a given environment.

    Returns:
        mdp: An instance of `RLGymMDP`, similar to a real gym env except with
            customized obs/action spaces and an internal `RLGymSim` object.
        policy: The agent's policy, encoded as either rl.GaussianPolicy for
            continuous actions, or rl.GibbsPolicy for discrete actions.
        train_args: A dictionary of arguments (argparse-style) from the trained
            policy's TRPO run.
    """
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    print 'MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)

    # Initialize the policy
    nn.reset_global_scope()
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args
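# Example usage of load_trained_policy_and_mdp, following the rollout pattern used by
# the evaluation scripts below. This is a sketch only: the h5 path, snapshot key and
# environment name are hypothetical placeholders, and it assumes util, h5py and json
# have been imported at module level as in the rest of this file.
def _example_rollout_with_loaded_policy():
    # A policy "name" is an h5 file path plus a key inside it, e.g. '<file>.h5/snapshots/iterNNNNNNN'
    policy_state_str = 'expert_policies/hopper_trpo.h5/snapshots/iter0000500'  # hypothetical path
    mdp, policy, train_args = load_trained_policy_and_mdp('Hopper-v1', policy_state_str)
    sim = mdp.new_sim()
    sim.reset()
    total_reward = 0.
    while not sim.done:
        # sample_actions returns the sampled actions as its first element; take the single row
        a = policy.sample_actions(sim.obs[None, :], True)[0][0, :]  # True => deterministic
        total_reward += sim.step(a)
    print('Return: %f' % total_reward)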
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() # MDP options parser.add_argument('policy', type=str) parser.add_argument('output_dir', type=str) parser.add_argument('--deterministic', default=1, type=int) parser.add_argument('--max_steps', type=int, required=True) parser.add_argument('--env_name', type=str, default=None) args = parser.parse_args() util.mkdir_p(args.output_dir) assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir print 'Writing to', args.output_dir # Load the saved state policy_file, policy_key = util.split_h5_name(args.policy) print 'Loading policy parameters from %s in %s' % (policy_key, policy_file) with h5py.File(policy_file, 'r') as f: train_args = json.loads(f.attrs['args']) dset = f[policy_key] import pprint pprint.pprint(dict(dset.attrs)) # Initialize the MDP env_name = train_args['env_name'] if args.env_name is None else args.env_name print 'Loading environment', env_name mdp = rllabenv.RLLabMDP(env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) util.header('Max steps is {}'.format(args.max_steps)) # Initialize the policy and load its parameters enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy') else: policy_cfg = rl.GibbsPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy') policy.load_h5(policy_file, policy_key) # Animate sim = mdp.new_sim() steps = 0 exit = False while not exit: sim.reset() while not sim.done: a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:] sim.step(a) sim.draw() viewer = sim.env.viewer data, w, h = viewer.get_image() image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:] cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1]) print steps steps += 1 if steps >= args.max_steps: exit = True break
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() # MDP options parser.add_argument('policy', type=str) parser.add_argument('--eval_only', action='store_true') parser.add_argument('--max_traj_len', type=int, default=None) # only used for saving parser.add_argument('--out', type=str, default=None) parser.add_argument('--count', type=int, default=None) parser.add_argument('--deterministic', action='store_true') args = parser.parse_args() # Load the saved state policy_file, policy_key = util.split_h5_name(args.policy) print 'Loading policy parameters from %s in %s' % (policy_key, policy_file) with h5py.File(policy_file, 'r') as f: train_args = json.loads(f.attrs['args']) dset = f[policy_key] import pprint pprint.pprint(dict(dset.attrs)) # Initialize the MDP env_name = train_args['env_name'] print 'Loading environment', env_name mdp = rlgymenv.RLGymMDP(env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) if args.max_traj_len is None: args.max_traj_len = mdp.env_spec.timestep_limit util.header('Max traj len is {}'.format(args.max_traj_len)) # Initialize the policy and load its parameters enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy') else: policy_cfg = rl.GibbsPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy') policy.load_h5(policy_file, policy_key) if args.eval_only: n = 50 print 'Evaluating based on {} trajs'.format(n) if False: eval_trajbatch = mdp.sim_mp( policy_fn=lambda obs_B_Do: policy.sample_actions(obs_B_Do, args.deterministic), obsfeat_fn=lambda obs:obs, cfg=policyopt.SimConfig( min_num_trajs=n, min_total_sa=-1, batch_size=None, max_traj_len=args.max_traj_len)) returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1) avgr = eval_trajbatch.r.stacked.mean() lengths = np.array([len(traj) for traj in eval_trajbatch]) ent = policy._compute_actiondist_entropy(eval_trajbatch.adist.stacked).mean() print 'ret: {} +/- {}'.format(returns.mean(), returns.std()) print 'avgr: {}'.format(avgr) print 'len: {} +/- {}'.format(lengths.mean(), lengths.std()) print 'ent: {}'.format(ent) print returns else: returns = [] lengths = [] sim = mdp.new_sim() for i_traj in xrange(n): print i_traj, n sim.reset() totalr = 0. 
l = 0 while not sim.done: a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:] r = sim.step(a) totalr += r l += 1 returns.append(totalr) lengths.append(l) import IPython; IPython.embed() elif args.out is not None: # Sample trajs and write to file print 'Saving traj samples to file: {}'.format(args.out) assert not os.path.exists(args.out) assert args.count > 0 # Simulate to create a trajectory batch util.header('Sampling {} trajectories of maximum length {}'.format(args.count, args.max_traj_len)) trajs = [] for i in tqdm.trange(args.count): trajs.append(mdp.sim_single( lambda obs: policy.sample_actions(obs, args.deterministic), lambda obs: obs, args.max_traj_len)) trajbatch = policyopt.TrajBatch.FromTrajs(trajs) print print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean() # Save the trajs to a file with h5py.File(args.out, 'w') as f: def write(name, a): # chunks of 128 trajs each f.create_dataset(name, data=a, chunks=(min(128, a.shape[0]),)+a.shape[1:], compression='gzip', compression_opts=9) # Right-padded trajectory data write('obs_B_T_Do', trajbatch.obs.padded(fill=0.)) write('a_B_T_Da', trajbatch.a.padded(fill=0.)) write('r_B_T', trajbatch.r.padded(fill=0.)) # Trajectory lengths write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32)) # Also save args to this script argstr = json.dumps(vars(args), separators=(',', ':'), indent=2) f.attrs['args'] = argstr else: # Animate sim = mdp.new_sim() raw_obs, normalized_obs = [], [] while True: sim.reset() totalr = 0. steps = 0 while not sim.done: raw_obs.append(sim.obs[None,:]) normalized_obs.append(policy.compute_internal_normalized_obsfeat(sim.obs[None,:])) a = policy.sample_actions(sim.obs[None,:], args.deterministic)[0][0,:] r = sim.step(a) totalr += r steps += 1 sim.draw() if steps % 1000 == 0: tmpraw = np.concatenate(raw_obs, axis=0) tmpnormed = np.concatenate(normalized_obs, axis=0) print 'raw mean, raw std, normed mean, normed std' print np.stack([tmpraw.mean(0), tmpraw.std(0), tmpnormed.mean(0), tmpnormed.std(0)]) print 'Steps: %d, return: %.5f' % (steps, totalr)
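# The trajectory file written above stores right-padded arrays plus per-trajectory
# lengths. Below is a minimal sketch of reading such a file back; the filename is a
# hypothetical placeholder, while the dataset names match the write() calls above.
def _example_read_saved_trajectories(filename='trajs_hopper.h5'):
    import h5py
    with h5py.File(filename, 'r') as f:
        obs_B_T_Do = f['obs_B_T_Do'][...]   # observations, right-padded with zeros
        a_B_T_Da = f['a_B_T_Da'][...]       # actions, right-padded with zeros
        r_B_T = f['r_B_T'][...]             # rewards, right-padded with zeros
        len_B = f['len_B'][...]             # true length of each trajectory
    # Per-trajectory returns: padding is zero, so summing the full row is safe
    returns_B = r_B_T.sum(axis=1)
    print('Loaded %d trajectories, average return %f' % (len(len_B), returns_B.mean()))
    # Example: recover the unpadded observations of trajectory 0
    obs_0 = obs_B_T_Do[0, :len_B[0], :]
    return obs_B_T_Do, a_B_T_Da, r_B_T, len_B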
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() # MDP options parser.add_argument('policy', type=str) parser.add_argument('--eval_only', action='store_true') parser.add_argument('--max_traj_len', type=int, default=None) # only used for saving parser.add_argument('--out', type=str, default=None) parser.add_argument('--count', type=int, default=None) parser.add_argument('--deterministic', action='store_true') args = parser.parse_args() # Load the saved state policy_file, policy_key = util.split_h5_name(args.policy) print 'Loading policy parameters from %s in %s' % (policy_key, policy_file) with h5py.File(policy_file, 'r') as f: train_args = json.loads(f.attrs['args']) dset = f[policy_key] import pprint pprint.pprint(dict(dset.attrs)) # Initialize the MDP env_name = train_args['env_name'] print 'Loading environment', env_name mdp = rlgymenv.RLGymMDP(env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) if args.max_traj_len is None: args.max_traj_len = mdp.env_spec.timestep_limit util.header('Max traj len is {}'.format(args.max_traj_len)) # Initialize the policy and load its parameters enable_obsnorm = bool(train_args['enable_obsnorm'] ) if 'enable_obsnorm' in train_args else train_args[ 'obsnorm_mode'] != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy') else: policy_cfg = rl.GibbsPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy') policy.load_h5(policy_file, policy_key) if args.eval_only: n = 50 print 'Evaluating based on {} trajs'.format(n) if False: eval_trajbatch = mdp.sim_mp( policy_fn=lambda obs_B_Do: policy.sample_actions( obs_B_Do, args.deterministic), obsfeat_fn=lambda obs: obs, cfg=policyopt.SimConfig(min_num_trajs=n, min_total_sa=-1, batch_size=None, max_traj_len=args.max_traj_len)) returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1) avgr = eval_trajbatch.r.stacked.mean() lengths = np.array([len(traj) for traj in eval_trajbatch]) ent = policy._compute_actiondist_entropy( eval_trajbatch.adist.stacked).mean() print 'ret: {} +/- {}'.format(returns.mean(), returns.std()) print 'avgr: {}'.format(avgr) print 'len: {} +/- {}'.format(lengths.mean(), lengths.std()) print 'ent: {}'.format(ent) print returns else: returns = [] lengths = [] sim = mdp.new_sim() for i_traj in xrange(n): iteration = 0 sim.reset() totalr = 0. 
l = 0 while not sim.done and iteration < args.max_traj_len: a = policy.sample_actions(sim.obs[None, :], bool( args.deterministic))[0][0, :] r = sim.step(a) totalr += r l += 1 iteration += 1 print i_traj, n, totalr, iteration returns.append(totalr) lengths.append(l) print 'Avg Return: ', np.array(returns).mean() print 'Std Return: ', np.array(returns).std() #import IPython; IPython.embed() elif args.out is not None: # Sample trajs and write to file print 'Saving traj samples to file: {}'.format(args.out) assert not os.path.exists(args.out) assert args.count > 0 # Simulate to create a trajectory batch util.header('Sampling {} trajectories of maximum length {}'.format( args.count, args.max_traj_len)) trajs = [] for i in tqdm.trange(args.count): trajs.append( mdp.sim_single( lambda obs: policy.sample_actions(obs, args.deterministic), lambda obs: obs, args.max_traj_len)) trajbatch = policyopt.TrajBatch.FromTrajs(trajs) print print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean() # Save the trajs to a file with h5py.File(args.out, 'w') as f: def write(name, a): # chunks of 128 trajs each f.create_dataset(name, data=a, chunks=(min(128, a.shape[0]), ) + a.shape[1:], compression='gzip', compression_opts=9) # Right-padded trajectory data write('obs_B_T_Do', trajbatch.obs.padded(fill=0.)) write('a_B_T_Da', trajbatch.a.padded(fill=0.)) write('r_B_T', trajbatch.r.padded(fill=0.)) # Trajectory lengths write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32)) # Also save args to this script argstr = json.dumps(vars(args), separators=(',', ':'), indent=2) f.attrs['args'] = argstr else: # Animate sim = mdp.new_sim() raw_obs, normalized_obs = [], [] tret_list = [] iteration = 0 while iteration < 50: sim.reset() totalr = 0. steps = 0 while not sim.done: raw_obs.append(sim.obs[None, :]) normalized_obs.append( policy.compute_internal_normalized_obsfeat( sim.obs[None, :])) a = policy.sample_actions(sim.obs[None, :], args.deterministic)[0][0, :] r = sim.step(a) totalr += r steps += 1 sim.draw() if steps % args.max_traj_len == 0: tmpraw = np.concatenate(raw_obs, axis=0) tmpnormed = np.concatenate(normalized_obs, axis=0) print 'raw mean, raw std, normed mean, normed std' print np.stack([ tmpraw.mean(0), tmpraw.std(0), tmpnormed.mean(0), tmpnormed.std(0) ]) break print 'Steps: %d, return: %.5f' % (steps, totalr) tret_list.append(totalr) iteration += 1 print 'Avg Return: ', np.array(tret_list).mean() print 'Std Return: ', np.array(tret_list).std()
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() parser.add_argument('--mode', choices=MODES, required=True) # Expert dataset parser.add_argument('--data', type=str, required=True) parser.add_argument('--resume_training', action='store_true', help="Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf") parser.add_argument('--checkpoint', type=str, help="Load from checkpoint if provided and if --resume_training") parser.add_argument('--limit_trajs', type=int, required=True, help="How many expert trajectories to be used for training. If None : full dataset is used.") parser.add_argument('--data_subsamp_freq', type=int, required=True, help="A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)") # MDP options parser.add_argument('--env_name', type=str, required=True) parser.add_argument('--max_traj_len', type=int, default=None) # Policy architecture parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE) parser.add_argument('--tiny_policy', action='store_true') parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata') # Behavioral cloning optimizer parser.add_argument('--bclone_lr', type=float, default=1e-3) parser.add_argument('--bclone_batch_size', type=int, default=128) # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100) parser.add_argument('--bclone_eval_ntrajs', type=int, default=20) parser.add_argument('--bclone_eval_freq', type=int, default=1000) parser.add_argument('--bclone_train_frac', type=float, default=.7) # Imitation optimizer parser.add_argument('--discount', type=float, default=.995) parser.add_argument('--lam', type=float, default=.97) parser.add_argument('--max_iter', type=int, default=1000000) parser.add_argument('--policy_max_kl', type=float, default=.01) parser.add_argument('--policy_cg_damping', type=float, default=.1) parser.add_argument('--no_vf', type=int, default=0) parser.add_argument('--vf_max_kl', type=float, default=.01) parser.add_argument('--vf_cg_damping', type=float, default=.1) parser.add_argument('--policy_ent_reg', type=float, default=0.) 
parser.add_argument('--reward_type', type=str, default='nn') # parser.add_argument('--linear_reward_bin_features', type=int, default=0) parser.add_argument('--reward_max_kl', type=float, default=.01) parser.add_argument('--reward_lr', type=float, default=.01) parser.add_argument('--reward_steps', type=int, default=1) parser.add_argument('--reward_ent_reg_weight', type=float, default=.001) parser.add_argument('--reward_include_time', type=int, default=0) parser.add_argument('--sim_batch_size', type=int, default=None) parser.add_argument('--min_total_sa', type=int, default=50000) parser.add_argument('--favor_zero_expert_reward', type=int, default=0) # Saving stuff parser.add_argument('--print_freq', type=int, default=1) parser.add_argument('--save_freq', type=int, default=20) parser.add_argument('--plot_freq', type=int, default=0) parser.add_argument('--log', type=str, required=False) args = parser.parse_args() # Initialize the MDP if args.tiny_policy: assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set' args.policy_hidden_spec = TINY_ARCHITECTURE argstr = json.dumps(vars(args), separators=(',', ':'), indent=2) print(argstr) print "\n\n========== Policy network specifications loaded ===========\n\n" mdp = rlgymenv.RLGymMDP(args.env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) print "\n\n========== MDP initialized ===========\n\n" # Initialize the policy enable_obsnorm = args.obsnorm_mode != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=args.policy_hidden_spec, min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy') else: policy_cfg = rl.GibbsPolicyConfig( hidden_spec=args.policy_hidden_spec, enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy') #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>. 
if args.resume_training: if args.checkpoint is not None: file, policy_key = util.split_h5_name(args.checkpoint) policy_file = file[:-3]+'_policy.h5' policy.load_h5(policy_file, policy_key) util.header('Policy architecture') for v in policy.get_trainable_variables(): util.header('- %s (%d parameters)' % (v.name, v.get_value().size)) util.header('Total: %d parameters' % (policy.get_num_params(),)) print "\n\n========== Policy initialized ===========\n\n" # Load expert data exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset( args.data, args.limit_trajs, args.data_subsamp_freq) assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size assert ext_Bstacked.ndim == 1 print "\n\n========== Expert data loaded ===========\n\n" # Start optimization max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit print 'Max traj len:', max_traj_len if args.mode == 'bclone': # For behavioral cloning, only print output when evaluating args.print_freq = args.bclone_eval_freq args.save_freq = args.bclone_eval_freq reward, vf = None, None #There is no role of the reward function or value function in behavioral cloning opt = imitation.BehavioralCloningOptimizer( mdp, policy, lr=args.bclone_lr, batch_size=args.bclone_batch_size, obsfeat_fn=lambda o:o, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, eval_sim_cfg=policyopt.SimConfig( min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), eval_freq=args.bclone_eval_freq, train_frac=args.bclone_train_frac) print "======= Behavioral Cloning optimizer initialized =======" elif args.mode == 'ga': if args.reward_type == 'nn': reward = imitation.TransitionClassifier( #Add resume training functionality hidden_spec=args.policy_hidden_spec, obsfeat_space=mdp.obs_space, action_space=mdp.action_space, max_kl=args.reward_max_kl, adam_lr=args.reward_lr, adam_steps=args.reward_steps, ent_reg_weight=args.reward_ent_reg_weight, enable_inputnorm=True, include_time=bool(args.reward_include_time), time_scale=1./mdp.env_spec.timestep_limit, favor_zero_expert_reward=bool(args.favor_zero_expert_reward), varscope_name='TransitionClassifier') #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>. 
if args.resume_training: if args.checkpoint is not None: file, reward_key = util.split_h5_name(args.checkpoint) reward_file = file[:-3]+'_reward.h5' print reward_file reward.load_h5(reward_file, reward_key) elif args.reward_type in ['l2ball', 'simplex']: reward = imitation.LinearReward( obsfeat_space=mdp.obs_space, action_space=mdp.action_space, mode=args.reward_type, enable_inputnorm=True, favor_zero_expert_reward=bool(args.favor_zero_expert_reward), include_time=bool(args.reward_include_time), time_scale=1./mdp.env_spec.timestep_limit, exobs_Bex_Do=exobs_Bstacked_Do, exa_Bex_Da=exa_Bstacked_Da, ext_Bex=ext_Bstacked) else: raise NotImplementedError(args.reward_type) vf = None if bool(args.no_vf) else rl.ValueFunc( #Add resume training functionality hidden_spec=args.policy_hidden_spec, obsfeat_space=mdp.obs_space, enable_obsnorm=args.obsnorm_mode != 'none', enable_vnorm=True, max_kl=args.vf_max_kl, damping=args.vf_cg_damping, time_scale=1./mdp.env_spec.timestep_limit, varscope_name='ValueFunc') if args.resume_training: if args.checkpoint is not None: file, vf_key = util.split_h5_name(args.checkpoint) vf_file = file[:-3]+'_vf.h5' vf.load_h5(vf_file, vf_key) opt = imitation.ImitationOptimizer( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig( min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked) # Set observation normalization if args.obsnorm_mode == 'expertdata': policy.update_obsnorm(exobs_Bstacked_Do) if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da) if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do)) print "======== Observation normalization done ========" # Run optimizer print "======== Optimization begins ========" # Trial: make checkpoints for policy, reward and vf policy_log = nn.TrainingLog(args.log[:-3]+'_policy.h5', [('args', argstr)]) reward_log = nn.TrainingLog(args.log[:-3]+'_reward.h5', [('args', argstr)]) vf_log = nn.TrainingLog(args.log[:-3]+'_vf.h5', [('args', argstr)]) for i in xrange(args.max_iter): #Optimization step iter_info = opt.step() #Log and plot #pdb.set_trace() policy_log.write(iter_info, print_header=i % (20*args.print_freq) == 0, display=i % args.print_freq == 0 ## FIXME: AS remove comment ) reward_log.write(iter_info, print_header=i % (20*args.print_freq) == 0, display=i % args.print_freq == 0 ## FIXME: AS remove comment ) vf_log.write(iter_info, print_header=i % (20*args.print_freq) == 0, display=i % args.print_freq == 0 ## FIXME: AS remove comment ) if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None: policy_log.write_snapshot(policy, i) reward_log.write_snapshot(reward, i) vf_log.write_snapshot(vf, i) if args.plot_freq != 0 and i % args.plot_freq == 0: exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1) pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1) # Plot reward import matplotlib.pyplot as plt _, ax = plt.subplots() idx1, idx2 = 0,1 range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()), max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max())) range2 = 
(min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()), max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max())) reward.plot(ax, idx1, idx2, range1, range2, n=100) # Plot expert data ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert') # Plot policy samples ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice') ax.legend() plt.show()
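# The training loop above writes three parallel logs whose names are derived from --log
# by replacing the '.h5' suffix, and --resume_training expects a checkpoint that follows
# the same scheme. A small sketch of that naming convention; the paths and iteration
# number are hypothetical placeholders, not defaults of this codebase.
def _example_checkpoint_naming(log='runs/gail_hopper.h5', checkpoint_iter=100):
    policy_log_file = log[:-3] + '_policy.h5'   # written by nn.TrainingLog above
    reward_log_file = log[:-3] + '_reward.h5'
    vf_log_file = log[:-3] + '_vf.h5'
    # A value passed via --checkpoint is '<file>.h5/snapshots/iterNNNNNNN';
    # util.split_h5_name() splits it into the file path and the in-file key.
    checkpoint = '%s/snapshots/iter%07d' % (log, checkpoint_iter)
    return policy_log_file, reward_log_file, vf_log_file, checkpoint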
def main(): """ If we have trained policies and snapshots, I think we can use this to watch videos of our agent in action. I don't think I can use this without doing some training first. This doesn't do training itself; we need to provide a policy, but the h5 file has to also be a directory which contains other information (see the yaml files for what I believe are similar examples). I'm not sure why we have rl giving us Gaussian policies vs Gibbs policies. What's the difference? They should just be functions mapping from states to actions? After that, it seems like we're just simulating stuff and hopefully a video would appear if I can get this to run. """ np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() # MDP options parser.add_argument('policy', type=str) parser.add_argument('output_dir', type=str) parser.add_argument('--deterministic', default=1, type=int) parser.add_argument('--max_steps', type=int, required=True) parser.add_argument('--env_name', type=str, default=None) args = parser.parse_args() util.mkdir_p(args.output_dir) assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir print 'Writing to', args.output_dir # Load the saved state policy_file, policy_key = util.split_h5_name(args.policy) print 'Loading policy parameters from %s in %s' % (policy_key, policy_file) with h5py.File(policy_file, 'r') as f: train_args = json.loads(f.attrs['args']) dset = f[policy_key] import pprint pprint.pprint(dict(dset.attrs)) # Initialize the MDP env_name = train_args['env_name'] if args.env_name is None else args.env_name print 'Loading environment', env_name mdp = rlgymenv.RLGymMDP(env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) util.header('Max steps is {}'.format(args.max_steps)) # Initialize the policy and load its parameters enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy') else: policy_cfg = rl.GibbsPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy') policy.load_h5(policy_file, policy_key) # Animate sim = mdp.new_sim() steps = 0 exit = False while not exit: sim.reset() while not sim.done: a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:] sim.step(a) sim.draw() viewer = sim.env.viewer data, w, h = viewer.get_image() image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:] cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1]) print steps steps += 1 if steps >= args.max_steps: exit = True break
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() parser.add_argument('--mode', choices=MODES, required=True) # Expert dataset parser.add_argument('--data', type=str, required=True) parser.add_argument( '--resume_training', action='store_true', help= "Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf" ) parser.add_argument( '--checkpoint', type=str, help="Load from checkpoint if provided and if --resume_training") parser.add_argument( '--limit_trajs', type=int, required=True, help= "How many expert trajectories to be used for training. If None : full dataset is used." ) parser.add_argument( '--data_subsamp_freq', type=int, required=True, help= "A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)" ) # MDP options parser.add_argument('--env_name', type=str, required=True) parser.add_argument('--max_traj_len', type=int, default=None) # Policy architecture parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE) parser.add_argument('--tiny_policy', action='store_true') parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata') # Behavioral cloning optimizer parser.add_argument('--bclone_lr', type=float, default=1e-3) parser.add_argument('--bclone_batch_size', type=int, default=128) # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100) parser.add_argument('--bclone_eval_ntrajs', type=int, default=20) parser.add_argument('--bclone_eval_freq', type=int, default=1000) parser.add_argument('--bclone_train_frac', type=float, default=.7) # Imitation optimizer parser.add_argument('--discount', type=float, default=.995) parser.add_argument('--lam', type=float, default=.97) parser.add_argument('--max_iter', type=int, default=1000000) parser.add_argument('--policy_max_kl', type=float, default=.01) parser.add_argument('--policy_cg_damping', type=float, default=.1, help="TRPO parameter") parser.add_argument('--no_vf', type=int, default=0) parser.add_argument('--vf_max_kl', type=float, default=.01) parser.add_argument('--vf_cg_damping', type=float, default=.1) parser.add_argument('--policy_ent_reg', type=float, default=0.) parser.add_argument('--reward_type', type=str, default='nn') # parser.add_argument('--linear_reward_bin_features', type=int, default=0) parser.add_argument('--reward_max_kl', type=float, default=.01, help="TRPO parameter") parser.add_argument('--reward_lr', type=float, default=.01) parser.add_argument('--reward_steps', type=int, default=1) parser.add_argument('--reward_ent_reg_weight', type=float, default=.001) parser.add_argument('--reward_include_time', type=int, default=0) parser.add_argument('--sim_batch_size', type=int, default=None) parser.add_argument('--min_total_sa', type=int, default=50000) parser.add_argument('--favor_zero_expert_reward', type=int, default=0) # Saving stuff parser.add_argument('--print_freq', type=int, default=1) parser.add_argument('--save_freq', type=int, default=20) parser.add_argument('--plot_freq', type=int, default=0) parser.add_argument('--log', type=str, required=False) # CVaR parameters parser.add_argument('--useCVaR', action='store_true') parser.add_argument('--CVaR_alpha', type=float, default=0.9) parser.add_argument('--CVaR_beta', type=float, default=0.) parser.add_argument('--CVaR_lr', type=float, default=0.01) # !!! 
The following argument --disc_CVaR_weight is not of use and should be removed parser.add_argument( '--disc_CVaR_weight', type=float, default=1., help= "Weight given to CVaR loss for the discriminator. Added by Anirban for smooth convergence." ) parser.add_argument('--CVaR_Lambda_not_trainable', action='store_false') parser.add_argument('--CVaR_Lambda_val_if_not_trainable', type=float, default=0.5) #Filtering expert trajectories parser.add_argument('--use_expert_traj_filtering', action='store_true') parser.add_argument('--expert_traj_filt_percentile_threshold', type=float, default=20) # Additive state prior formulation parser.add_argument('--use_additiveStatePrior', action='store_true') parser.add_argument('--additiveStatePrior_weight', type=float, default=1.) parser.add_argument('--n_gmm_components', type=int, default=5) parser.add_argument('--cov_type_gmm', type=str, default='diag') parser.add_argument('--familiarity_alpha', type=float, default=10000000) parser.add_argument('--familiarity_beta', type=float, default=100) parser.add_argument('--kickThreshold_percentile', type=float, default=100.0) parser.add_argument('--appendFlag', action='store_true') args = parser.parse_args() if args.useCVaR: print ">>>>>>>>>>>>>>>>>>> TRAINING RAIL <<<<<<<<<<<<<<<<<<<" elif args.use_additiveStatePrior: print ">>>>>>>>>>>>>>>>>>> USING ADDITIVE STATE PRIOR <<<<<<<<<<<<<<<<<<<" else: print ">>>>>>>>> TRAINING GAIL <<<<<<<<<<" # Initialize the MDP if args.tiny_policy: assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set' args.policy_hidden_spec = TINY_ARCHITECTURE argstr = json.dumps(vars(args), separators=(',', ':'), indent=2) print(argstr) print "\n\n========== Policy network specifications loaded ===========\n\n" mdp = rlgymenv.RLGymMDP(args.env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) print "\n\n========== MDP initialized ===========\n\n" # Initialize the policy enable_obsnorm = args.obsnorm_mode != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=args.policy_hidden_spec, min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy', args.useCVaR) else: policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec, enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy', args.useCVaR) offset = 0 #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>. if args.resume_training: if args.checkpoint is not None: file, policy_key = util.split_h5_name(args.checkpoint) offset = int(policy_key.split('/')[-1][4:]) print '\n**************************************************' print 'Resuming from checkpoint : %d of %s' % (offset, file) print '**************************************************\n' if args.appendFlag and file != args.log: raise RuntimeError( 'Log file and checkpoint should have the same name if appendFlag is on. 
%s vs %s' % file, args.log) policy_file = file[:-3] + '_policy.h5' # Because we're naming the file as *_policy.h5 itself policy.load_h5(policy_file, policy_key) util.header('Policy architecture') for v in policy.get_trainable_variables(): util.header('- %s (%d parameters)' % (v.name, v.get_value().size)) util.header('Total: %d parameters' % (policy.get_num_params(), )) print "\n\n========== Policy initialized ===========\n\n" # Load expert data exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset( args.data, args.limit_trajs, args.data_subsamp_freq, len_filtering=args.use_expert_traj_filtering, len_filter_threshold=args.expert_traj_filt_percentile_threshold) assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size assert ext_Bstacked.ndim == 1 print "\n\n========== Expert data loaded ===========\n\n" print '\n==================== Hyperparams ====================' print '\texpert_traj_filt_percentile_threshold = %f' % args.expert_traj_filt_percentile_threshold print '\tfamiliarity_alpha = %f' % args.familiarity_alpha print '\tfamiliarity_beta = %f' % args.familiarity_beta print '\tkickThreshold_percentile = %f' % args.kickThreshold_percentile print '==============================================\n' # Start optimization max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit print 'Max traj len:', max_traj_len if args.mode == 'bclone': # For behavioral cloning, only print output when evaluating args.print_freq = args.bclone_eval_freq args.save_freq = args.bclone_eval_freq reward, vf = None, None #There is no role of the reward function or value function in behavioral cloning opt = imitation.BehavioralCloningOptimizer( mdp, policy, lr=args.bclone_lr, batch_size=args.bclone_batch_size, obsfeat_fn=lambda o: o, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, eval_sim_cfg=policyopt.SimConfig( min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), eval_freq=args.bclone_eval_freq, train_frac=args.bclone_train_frac) print "======= Behavioral Cloning optimizer initialized =======" elif args.mode == 'ga': if args.reward_type == 'nn': reward = imitation.TransitionClassifier( #Add resume training functionality hidden_spec=args.policy_hidden_spec, obsfeat_space=mdp.obs_space, action_space=mdp.action_space, max_kl=args.reward_max_kl, adam_lr=args.reward_lr, adam_steps=args.reward_steps, ent_reg_weight=args.reward_ent_reg_weight, enable_inputnorm=True, include_time=bool(args.reward_include_time), time_scale=1. / mdp.env_spec.timestep_limit, favor_zero_expert_reward=bool(args.favor_zero_expert_reward), varscope_name='TransitionClassifier', useCVaR=args.useCVaR, CVaR_loss_weightage=args.disc_CVaR_weight) #Load from checkpoint if provided <<<<<<<<<<<<<=============================>>>>>>>>>>>>>>>>. if args.resume_training: if args.checkpoint is not None: file, reward_key = util.split_h5_name(args.checkpoint) reward_file = file[:-3] + '_reward.h5' print reward_file reward.load_h5(reward_file, reward_key) elif args.reward_type in ['l2ball', 'simplex']: reward = imitation.LinearReward( obsfeat_space=mdp.obs_space, action_space=mdp.action_space, mode=args.reward_type, enable_inputnorm=True, favor_zero_expert_reward=bool(args.favor_zero_expert_reward), include_time=bool(args.reward_include_time), time_scale=1. 
/ mdp.env_spec.timestep_limit, exobs_Bex_Do=exobs_Bstacked_Do, exa_Bex_Da=exa_Bstacked_Da, ext_Bex=ext_Bstacked) else: raise NotImplementedError(args.reward_type) vf = None if bool( args.no_vf) else rl.ValueFunc( #Add resume training functionality hidden_spec=args.policy_hidden_spec, obsfeat_space=mdp.obs_space, enable_obsnorm=args.obsnorm_mode != 'none', enable_vnorm=True, max_kl=args.vf_max_kl, damping=args.vf_cg_damping, time_scale=1. / mdp.env_spec.timestep_limit, varscope_name='ValueFunc') if args.resume_training: if args.checkpoint is not None: file, vf_key = util.split_h5_name(args.checkpoint) vf_file = file[:-3] + '_vf.h5' vf.load_h5(vf_file, vf_key) if args.useCVaR: opt = imitation.ImitationOptimizer_CVaR( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig(min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping, useCVaR=True), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked, #For CVaR CVaR_alpha=args.CVaR_alpha, CVaR_beta=args.CVaR_beta, CVaR_lr=args.CVaR_lr, CVaR_Lambda_trainable=args.CVaR_Lambda_not_trainable, CVaR_Lambda_val_if_not_trainable=args. CVaR_Lambda_val_if_not_trainable, offset=offset + 1) elif args.use_additiveStatePrior: opt = imitation.ImitationOptimizer_additiveStatePrior( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig(min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping, useCVaR=False), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked, n_gmm_components=args.n_gmm_components, cov_type_gmm=args.cov_type_gmm, additiveStatePrior_weight=args.additiveStatePrior_weight, alpha=args.familiarity_alpha, beta=args.familiarity_beta, kickThreshold_percentile=args.kickThreshold_percentile, offset=offset + 1) else: opt = imitation.ImitationOptimizer( mdp=mdp, discount=args.discount, lam=args.lam, policy=policy, sim_cfg=policyopt.SimConfig(min_num_trajs=-1, min_total_sa=args.min_total_sa, batch_size=args.sim_batch_size, max_traj_len=max_traj_len), step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping, useCVaR=False), reward_func=reward, value_func=vf, policy_obsfeat_fn=lambda obs: obs, reward_obsfeat_fn=lambda obs: obs, policy_ent_reg=args.policy_ent_reg, ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da, ex_t=ext_Bstacked) # Set observation normalization if args.obsnorm_mode == 'expertdata': policy.update_obsnorm(exobs_Bstacked_Do) if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da) if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do)) print "======== Observation normalization done ========" # Run optimizer print "======== Optimization begins ========" # Trial: make checkpoints for policy, reward and vf policy_log = nn.TrainingLog(args.log[:-3] + '_policy.h5', [('args', argstr)], args.appendFlag) reward_log = nn.TrainingLog(args.log[:-3] + '_reward.h5', [('args', argstr)], args.appendFlag) vf_log = nn.TrainingLog(args.log[:-3] + 
'_vf.h5', [('args', argstr)], args.appendFlag) kickStatesData = [] print '\n**************************************' print 'Running iterations from %d to %d' % (offset + 1, args.max_iter) for i in xrange(offset + 1, args.max_iter): # for i in range(1): #FIXME: this is just for studying the insides of the training algo # All training a.k.a. optimization happens in the next line!!! -_- # pdb.set_trace() iter_info = opt.step( i, kickStatesData) if args.use_additiveStatePrior else opt.step(i) #========= The rest is fluff ============= #Log and plot #pdb.set_trace() policy_log.write( iter_info, print_header=i % (20 * args.print_freq) == 0, # display=False display=i % args.print_freq == 0 ## FIXME: AS remove comment ) # reward_log.write(iter_info, # print_header=i % (20*args.print_freq) == 0, # display=False # # display=i % args.print_freq == 0 ## FIXME: AS remove comment # ) # vf_log.write(iter_info, # print_header=i % (20*args.print_freq) == 0, # display=False # # display=i % args.print_freq == 0 ## FIXME: AS remove comment # ) #FIXME: problem running this on 211 and 138. No problem on 151 if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None: policy_log.write_snapshot(policy, i) reward_log.write_snapshot(reward, i) vf_log.write_snapshot(vf, i) # analysisFile=open(args.log[:-3]+'_kickedStates' + str(i) + '.pkl', 'wb') analysisFile = open(args.log[:-3] + '_kickedStates.pkl', 'wb') pkl.dump({'kickStatesData': kickStatesData}, analysisFile, protocol=2) analysisFile.close() if args.plot_freq != 0 and i % args.plot_freq == 0: exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1) pdata_M_Doa = np.concatenate( [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1) # Plot reward import matplotlib.pyplot as plt _, ax = plt.subplots() idx1, idx2 = 0, 1 range1 = (min(exdata_N_Doa[:, idx1].min(), pdata_M_Doa[:, idx1].min()), max(exdata_N_Doa[:, idx1].max(), pdata_M_Doa[:, idx1].max())) range2 = (min(exdata_N_Doa[:, idx2].min(), pdata_M_Doa[:, idx2].min()), max(exdata_N_Doa[:, idx2].max(), pdata_M_Doa[:, idx2].max())) reward.plot(ax, idx1, idx2, range1, range2, n=100) # Plot expert data ax.scatter(exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2], color='blue', s=1, label='expert') # Plot policy samples ax.scatter(pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2], color='red', s=1, label='apprentice') ax.legend() plt.show()
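# The additive-state-prior run above periodically dumps kickStatesData to
# '<log>_kickedStates.pkl' with pickle protocol 2. A minimal sketch of loading it back
# for offline analysis; the filename is a hypothetical placeholder, and the structure of
# the individual records is not assumed here.
def _example_load_kicked_states(log='runs/rail_hopper.h5'):
    import pickle
    with open(log[:-3] + '_kickedStates.pkl', 'rb') as f:
        kick_states_data = pickle.load(f)['kickStatesData']
    print('Loaded %d kick-state records' % len(kick_states_data))
    return kick_states_data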
def main(): np.set_printoptions(suppress=True, precision=5, linewidth=1000) parser = argparse.ArgumentParser() # MDP options parser.add_argument('policy', type=str) parser.add_argument('--eval_only', action='store_true') parser.add_argument('--max_traj_len', type=int, default=None) # only used for saving parser.add_argument('--out', type=str, default=None) parser.add_argument('--count', type=int, default=None) parser.add_argument('--deterministic', action='store_true') args = parser.parse_args() #filenames = os.listdir(args.policy) csvf = open(args.policy[:-3] + '.csv', 'w') csvwriter = csv.writer(csvf) dataf = open(args.policy[:-3] + 'full.csv', 'w') datawriter = csv.writer(dataf) #csvwriter.writerow(['filename', 'average', 'std']) # Load the saved state if args.policy.find('reacher') > 0: key_iter = 200 elif args.policy.find('humanoid') > 0: key_iter = 1500 else: key_iter = 500 policy_file, policy_key = util.split_h5_name(args.policy + '/snapshots/iter%07d' % key_iter) print 'Loading policy parameters from %s in %s' % (policy_key, policy_file) with h5py.File(policy_file, 'r') as f: train_args = json.loads(f.attrs['args']) dset = f[policy_key] import pprint pprint.pprint(dict(dset.attrs)) if args.policy.find('shared1') > 0: sharednet = True else: sharednet = False # Initialize the MDP env_name = train_args['env_name'] print 'Loading environment', env_name mdp = rlgymenv.RLGymMDP(env_name) util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)) if args.max_traj_len is None: args.max_traj_len = mdp.env_spec.timestep_limit util.header('Max traj len is {}'.format(args.max_traj_len)) # Initialize the policy and load its parameters enable_obsnorm = bool(train_args['enable_obsnorm'] ) if 'enable_obsnorm' in train_args else train_args[ 'obsnorm_mode'] != 'none' if isinstance(mdp.action_space, policyopt.ContinuousSpace): policy_cfg = rl.GaussianPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], min_stdev=0., init_logstdev=0., enable_obsnorm=enable_obsnorm) policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy', use_shared_std_network=sharednet) else: policy_cfg = rl.GibbsPolicyConfig( hidden_spec=train_args['policy_hidden_spec'], enable_obsnorm=enable_obsnorm) policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy', use_shared_std_network=sharednet) policy.load_h5(policy_file, policy_key) n = 50 print 'Evaluating based on {} trajs'.format(n) returns = [] lengths = [] sim = mdp.new_sim() for i_traj in xrange(n): iteration = 0 sim.reset() totalr = 0. l = 0 while not sim.done and iteration < args.max_traj_len: a = policy.sample_actions(sim.obs[None, :], bool(args.deterministic))[0][0, :] r = sim.step(a) totalr += r l += 1 iteration += 1 print i_traj, n, totalr, iteration datawriter.writerow([i_traj, n, totalr, iteration]) returns.append(totalr) lengths.append(l) avg, std = np.array(returns).mean(), np.array(returns).std() print 'Avg Return: ', avg, 'Std: ', std csvwriter.writerow([args.policy, avg, std]) del policy #import IPython; IPython.embed() csvf.close() dataf.close()