Example #1
def find_deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories(expert_trajectories, learner_policy, limit_trajs, data_subsamp_freq, ipython_after_eval):
	# Load the learner's policy
	policy_file, policy_key = util.split_h5_name(learner_policy)
	print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
	with h5py.File(policy_file, 'r') as f:
	    train_args = json.loads(f.attrs['args'])
	    dset = f[policy_key]
	    import pprint
	    pprint.pprint(dict(dset.attrs))

	# Initialize the MDP
	env_name = train_args['env_name']
	print 'Loading environment', env_name
	mdp = rlgymenv.RLGymMDP(env_name)
	util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

	# Initialize the policy and load its parameters
	enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
	if isinstance(mdp.action_space, policyopt.ContinuousSpace):
	    policy_cfg = rl.GaussianPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        min_stdev=0.,
	        init_logstdev=0.,
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
	else:
	    policy_cfg = rl.GibbsPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

	policy.load_h5(policy_file, policy_key)

	# Load the expert trajectories
	exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = imitate_mj.load_dataset(
	    expert_trajectories, limit_trajs, data_subsamp_freq)
	assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
	assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
	assert ext_Bstacked.ndim == 1



	# Generate the actions according to the learner's policy for the expert's observations
	learner_actions_Bstacked_Da = policy.sample_actions(exobs_Bstacked_Do)[0]

	# Compute the per-timestep L2 deviation between expert and learner actions:
	action_deviations = np.linalg.norm(exa_Bstacked_Da - learner_actions_Bstacked_Da, axis=1)

	# Plot the histogram of deviations
	plt.figure()
	plt.hist(action_deviations, bins=100)
	plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories.png')
	plt.show()	

	if ipython_after_eval:
		import IPython; IPython.embed()    
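
A note on what Example #1 computes: the learner's policy is evaluated on the expert's stored observations, and the deviation is the per-row Euclidean distance between the expert's actions and the sampled learner actions. A minimal, self-contained sketch of that computation on toy arrays (the array shapes and names here are illustrative, not taken from the code above):

import numpy as np
import matplotlib.pyplot as plt

B, Da = 1000, 6                                               # number of transitions, action dimension
expert_a_B_Da = np.random.randn(B, Da)                        # stand-in for the expert's actions
learner_a_B_Da = expert_a_B_Da + 0.1*np.random.randn(B, Da)   # stand-in for sampled learner actions

# Row-wise L2 distance between paired actions, as in np.linalg.norm(..., axis=1) above
deviations_B = np.linalg.norm(expert_a_B_Da - learner_a_B_Da, axis=1)

plt.hist(deviations_B, bins=100)
plt.savefig('toy_action_deviation_hist.png')
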
Example #2
def find_deviation_of_agent_actions_from_expert_actions_for_underperforming_trajectories(learner_trajectories, expert_policy, lower_bound_reward, ipython_after_eval, generate_plot):
	obs,a,r,l = find_underperforming_trajectories(learner_trajectories, lower_bound_reward)
	print(type(obs))
	# Load the expert's policy
	policy_file, policy_key = util.split_h5_name(expert_policy)
	print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
	with h5py.File(policy_file, 'r') as f:
	    train_args = json.loads(f.attrs['args'])
	    dset = f[policy_key]
	    import pprint
	    pprint.pprint(dict(dset.attrs))

	# Initialize the MDP
	env_name = train_args['env_name']
	print 'Loading environment', env_name
	mdp = rlgymenv.RLGymMDP(env_name)
	util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

	# Initialize the policy and load its parameters
	enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
	if isinstance(mdp.action_space, policyopt.ContinuousSpace):
	    policy_cfg = rl.GaussianPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        min_stdev=0.,
	        init_logstdev=0.,
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
	else:
	    policy_cfg = rl.GibbsPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

	policy.load_h5(policy_file, policy_key)
	
	# Generate the actions according to the expert's policy for the observations in the underperforming trajs

	expert_actions = policy.sample_actions(obs.reshape((-1,obs.shape[-1])))[0].reshape((-1,a.shape[1],a.shape[2]))
	

	# Compute the per-timestep L2 deviation between expert and learner actions:
	action_deviations = np.linalg.norm(expert_actions.reshape((-1,a.shape[-1])) - a.reshape((-1,a.shape[-1])), axis=1)
	if generate_plot:
		plt.figure()
		plt.hist(action_deviations, bins=100)
		plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_underperforming_learner_trajectories.png')
		plt.show()	
	if ipython_after_eval:
		import IPython; IPython.embed() 
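
find_underperforming_trajectories is used above but not shown in these examples. Judging from how its outputs are consumed (obs and a are per-trajectory arrays reshaped as (num_trajs, T, dim), and only trajectories whose return falls below lower_bound_reward are kept), a hypothetical sketch could look like the one below. The h5-style keys (obs_B_T_Do, a_B_T_Da, r_B_T, len_B) are borrowed from the trajectory files written by the sampling scripts later on this page; this is an assumption, not the original implementation.

import numpy as np

def find_underperforming_trajectories(learner_trajectories, lower_bound_reward):
    # Hypothetical sketch: assumes learner_trajectories is a dict (or h5 group)
    # of right-padded arrays keyed like the trajectory files written below.
    obs_B_T_Do = np.asarray(learner_trajectories['obs_B_T_Do'])
    a_B_T_Da = np.asarray(learner_trajectories['a_B_T_Da'])
    r_B_T = np.asarray(learner_trajectories['r_B_T'])
    len_B = np.asarray(learner_trajectories['len_B'])

    returns_B = r_B_T.sum(axis=1)            # padded rewards are zero, so a plain sum works
    keep = returns_B < lower_bound_reward    # keep only the underperforming trajectories
    return obs_B_T_Do[keep], a_B_T_Da[keep], r_B_T[keep], len_B[keep]
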
Example #3
def load_trained_policy_and_mdp(env_name, policy_state_str):
    """ Creates the specialized MDP and policy objects needed to sample expert
    trajectories for a given environment.

    Returns:
        mdp: An instance of `RLGymMDP`, similar to a real gym env except with
            customized obs/action spaces and an internal `RLGymSim` object.
        policy: The agent's policy, encoded as either rl.GaussianPolicy for
            continuous actions, or rl.GibbsPolicy for discrete actions.
        train_args: A dictionary of arguments (like argparse dicts) based on the
            trained policy's TRPO run.
    """
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    print 'MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)

    # Initialize the policy
    nn.reset_global_scope()
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args
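
A short usage sketch for load_trained_policy_and_mdp. The policy string format ('file.h5' plus a snapshot key, joined the way util.split_h5_name expects) and the environment name are assumptions for illustration; the rollout loop reuses only calls that appear elsewhere on this page (mdp.new_sim, sim.reset, sim.step, policy.sample_actions):

# Hypothetical policy string: an .h5 file and the key of a saved snapshot inside it.
policy_state_str = 'trained_policies/hopper_trpo.h5/snapshots/iter0000500'

mdp, policy, train_args = load_trained_policy_and_mdp('Hopper-v1', policy_state_str)

# Roll out a single trajectory with the loaded policy.
sim = mdp.new_sim()
sim.reset()
total_return = 0.
while not sim.done:
    # sample_actions returns (actions, action_dist); take the single action row
    a = policy.sample_actions(sim.obs[None, :])[0][0, :]
    total_return += sim.step(a)
print('Return: %f' % total_return)
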
Example #4
def load_trained_policy_and_mdp(env_name, policy_state_str):
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print('Loading policy parameters from %s in %s' %
          (policy_key, policy_file))
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print('Loading environment', env_name)
    mdp = rlgymenv.RLGymMDP(env_name)
    print('MDP observation space, action space sizes: %d, %d\n' %
          (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    nn.reset_global_scope()
    enable_obsnorm = (bool(train_args['enable_obsnorm'])
                      if 'enable_obsnorm' in train_args
                      else train_args['obsnorm_mode'] != 'none')
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args
Example #5
def load_trained_policy_and_mdp(env_name, policy_state_str):
    import gym
    import policyopt
    from policyopt import nn, rl
    from environments import rlgymenv

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(policy_state_str)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])

    # Initialize the MDP
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    print 'MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size)

    # Initialize the policy
    nn.reset_global_scope()
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load the policy parameters
    policy.load_h5(policy_file, policy_key)

    return mdp, policy, train_args
Example #6
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rllabenv.RLLabMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])

            print steps
            steps += 1

            if steps >= args.max_steps:
                exit = True
                break
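
The loop above only dumps numbered PNG frames into output_dir; assembling them into a video is left to an external tool. One common follow-up (an assumption, not part of the original script) is to stitch the frames with ffmpeg, e.g. from Python:

import subprocess

# Assumes ffmpeg is on PATH; 30 fps and H.264/yuv420p are arbitrary but widely compatible choices.
subprocess.check_call([
    'ffmpeg', '-framerate', '30',
    '-i', 'output_dir/img_%08d.png',   # matches the '%s/img_%08d.png' pattern used above
    '-c:v', 'libx264', '-pix_fmt', 'yuv420p',
    'rollout.mp4',
])
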
Example #7
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int, default=None) # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)

        if False:
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs:obs,
                cfg=policyopt.SimConfig(
                    min_num_trajs=n, min_total_sa=-1,
                    batch_size=None, max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            returns = []
            lengths = []
            sim = mdp.new_sim()
            for i_traj in xrange(n):
                print i_traj, n
                sim.reset()
                totalr = 0.
                l = 0
                while not sim.done:
                    a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                returns.append(totalr)
                lengths.append(l)
        import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)

        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(mdp.sim_single(
                lambda obs: policy.sample_actions(obs, args.deterministic),
                lambda obs: obs,
                args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)

        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()

        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:
            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name, data=a, chunks=(min(128, a.shape[0]),)+a.shape[1:], compression='gzip', compression_opts=9)

            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))

            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []
        while True:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None,:])
                normalized_obs.append(policy.compute_internal_normalized_obsfeat(sim.obs[None,:]))

                a = policy.sample_actions(sim.obs[None,:], args.deterministic)[0][0,:]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()

                if steps % 1000 == 0:
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([tmpraw.mean(0), tmpraw.std(0), tmpnormed.mean(0), tmpnormed.std(0)])
            print 'Steps: %d, return: %.5f' % (steps, totalr)
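
The --out branch above writes right-padded trajectory arrays plus the script's arguments into an HDF5 file. A minimal sketch of reading such a file back, using only the keys written above (the file name is illustrative):

import json
import h5py

with h5py.File('expert_trajs.h5', 'r') as f:
    obs_B_T_Do = f['obs_B_T_Do'][...]       # right-padded observations
    a_B_T_Da = f['a_B_T_Da'][...]           # right-padded actions
    r_B_T = f['r_B_T'][...]                 # right-padded rewards
    len_B = f['len_B'][...]                 # true trajectory lengths
    sampling_args = json.loads(f.attrs['args'])

returns_B = r_B_T.sum(axis=1)               # padding is zero, so a plain sum gives the return
print('Loaded %d trajectories, average return %.3f' % (len_B.shape[0], returns_B.mean()))
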
Example #8
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int,
                        default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters
    enable_obsnorm = (bool(train_args['enable_obsnorm'])
                      if 'enable_obsnorm' in train_args
                      else train_args['obsnorm_mode'] != 'none')
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)

        if False:
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(
                    obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs: obs,
                cfg=policyopt.SimConfig(min_num_trajs=n,
                                        min_total_sa=-1,
                                        batch_size=None,
                                        max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(
                eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            returns = []
            lengths = []
            sim = mdp.new_sim()

            for i_traj in xrange(n):
                iteration = 0
                sim.reset()
                totalr = 0.
                l = 0
                while not sim.done and iteration < args.max_traj_len:
                    a = policy.sample_actions(sim.obs[None, :],
                                              bool(
                                                  args.deterministic))[0][0, :]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                    iteration += 1

                print i_traj, n, totalr, iteration
                returns.append(totalr)
                lengths.append(l)

            print 'Avg Return: ', np.array(returns).mean()
            print 'Std Return: ', np.array(returns).std()
        #import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)

        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(
            args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(
                mdp.sim_single(
                    lambda obs: policy.sample_actions(obs, args.deterministic),
                    lambda obs: obs, args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)

        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()

        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:

            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name,
                                 data=a,
                                 chunks=(min(128, a.shape[0]), ) + a.shape[1:],
                                 compression='gzip',
                                 compression_opts=9)

            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B',
                  np.array([len(traj) for traj in trajbatch], dtype=np.int32))

            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []

        tret_list = []
        iteration = 0
        while iteration < 50:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None, :])
                normalized_obs.append(
                    policy.compute_internal_normalized_obsfeat(
                        sim.obs[None, :]))

                a = policy.sample_actions(sim.obs[None, :],
                                          args.deterministic)[0][0, :]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()

                if steps % args.max_traj_len == 0:
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([
                        tmpraw.mean(0),
                        tmpraw.std(0),
                        tmpnormed.mean(0),
                        tmpnormed.std(0)
                    ])
                    break
            print 'Steps: %d, return: %.5f' % (steps, totalr)
            tret_list.append(totalr)
            iteration += 1

        print 'Avg Return: ', np.array(tret_list).mean()
        print 'Std Return: ', np.array(tret_list).std()
Example #9
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--resume_training', action='store_true', help="Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf") 
    parser.add_argument('--checkpoint', type=str, help="Load from checkpoint if provided and if --resume_training") 
    parser.add_argument('--limit_trajs', type=int, required=True, help="How many expert trajectories to be used for training. If None : full dataset is used.") 
    parser.add_argument('--data_subsamp_freq', type=int, required=True, help="A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)")
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load from checkpoint if provided
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            policy_file = file[:-3]+'_policy.h5'
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(),))

    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    print "\n\n========== Expert data loaded ===========\n\n"

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None #There is no role of the reward function or value function in behavioral cloning
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o:o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier( #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
            # Load from checkpoint if provided
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3]+'_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc( #Add resume training functionality
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1./mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        if args.resume_training:
            if args.checkpoint is not None:
                file, vf_key = util.split_h5_name(args.checkpoint)
                vf_file = file[:-3]+'_vf.h5'
                vf.load_h5(vf_file, vf_key)

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(
                min_num_trajs=-1, min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

        print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"

    # Trial: make checkpoints for policy, reward and vf
    policy_log = nn.TrainingLog(args.log[:-3]+'_policy.h5', [('args', argstr)])
    reward_log = nn.TrainingLog(args.log[:-3]+'_reward.h5', [('args', argstr)])
    vf_log = nn.TrainingLog(args.log[:-3]+'_vf.h5', [('args', argstr)])
    

    for i in xrange(args.max_iter):
        
        #Optimization step
        iter_info = opt.step() 

        # Log and plot
        policy_log.write(iter_info,
                         print_header=i % (20*args.print_freq) == 0,
                         display=i % args.print_freq == 0)
        reward_log.write(iter_info,
                         print_header=i % (20*args.print_freq) == 0,
                         display=i % args.print_freq == 0)
        vf_log.write(iter_info,
                     print_header=i % (20*args.print_freq) == 0,
                     display=i % args.print_freq == 0)
        

        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            reward_log.write_snapshot(reward, i)
            vf_log.write_snapshot(vf, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0,1
            range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()), max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max()))
            range2 = (min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()), max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice')

            ax.legend()
            plt.show()
Example #10
def main():
    """
    If we have trained policies and snapshots, I think we can use this to watch
    videos of our agent in action. I don't think I can use this without doing
    some training first. This doesn't do training itself; we need to provide a
    policy, but the h5 file has to also be a directory which contains other
    information (see the yaml files for what I believe are similar examples).

    I'm not sure why we have rl giving us Gaussian policies vs Gibbs policies.
    What's the difference? They should just be functions mapping from states to
    actions?

    After that, it seems like we're just simulating stuff and hopefully a video
    would appear if I can get this to run.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)

    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])

            print steps
            steps += 1

            if steps >= args.max_steps:
                exit = True
                break
Example #11
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument(
        '--resume_training',
        action='store_true',
        help=
        "Resume training from a checkpoint: --policy_checkpoint. Currently only supports GAIL with nn policy, reward and vf"
    )
    parser.add_argument(
        '--checkpoint',
        type=str,
        help="Load from checkpoint if provided and if --resume_training")
    parser.add_argument(
        '--limit_trajs',
        type=int,
        required=True,
        help=
        "How many expert trajectories to be used for training. If None : full dataset is used."
    )
    parser.add_argument(
        '--data_subsamp_freq',
        type=int,
        required=True,
        help=
        "A number between 0 and max_traj_len. Rate of subsampling of expert trajectories while creating the dataset of expert transitions (state-action)"
    )
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)

    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping',
                        type=float,
                        default=.1,
                        help="TRPO parameter")
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl',
                        type=float,
                        default=.01,
                        help="TRPO parameter")
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)
    # CVaR parameters
    parser.add_argument('--useCVaR', action='store_true')
    parser.add_argument('--CVaR_alpha', type=float, default=0.9)
    parser.add_argument('--CVaR_beta', type=float, default=0.)
    parser.add_argument('--CVaR_lr', type=float, default=0.01)
    # NOTE: the following argument --disc_CVaR_weight is unused and should be removed
    parser.add_argument(
        '--disc_CVaR_weight',
        type=float,
        default=1.,
        help=
        "Weight given to CVaR loss for the discriminator. Added by Anirban for smooth convergence."
    )
    parser.add_argument('--CVaR_Lambda_not_trainable', action='store_false')
    parser.add_argument('--CVaR_Lambda_val_if_not_trainable',
                        type=float,
                        default=0.5)
    #Filtering expert trajectories
    parser.add_argument('--use_expert_traj_filtering', action='store_true')
    parser.add_argument('--expert_traj_filt_percentile_threshold',
                        type=float,
                        default=20)
    # Additive state prior formulation
    parser.add_argument('--use_additiveStatePrior', action='store_true')
    parser.add_argument('--additiveStatePrior_weight', type=float, default=1.)
    parser.add_argument('--n_gmm_components', type=int, default=5)
    parser.add_argument('--cov_type_gmm', type=str, default='diag')
    parser.add_argument('--familiarity_alpha', type=float, default=10000000)
    parser.add_argument('--familiarity_beta', type=float, default=100)

    parser.add_argument('--kickThreshold_percentile',
                        type=float,
                        default=100.0)
    parser.add_argument('--appendFlag', action='store_true')

    args = parser.parse_args()

    if args.useCVaR:
        print ">>>>>>>>>>>>>>>>>>> TRAINING RAIL <<<<<<<<<<<<<<<<<<<"
    elif args.use_additiveStatePrior:
        print ">>>>>>>>>>>>>>>>>>> USING ADDITIVE STATE PRIOR <<<<<<<<<<<<<<<<<<<"
    else:
        print ">>>>>>>>> TRAINING GAIL <<<<<<<<<<"

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy', args.useCVaR)
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy', args.useCVaR)

    offset = 0
    # Load from checkpoint if provided
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            offset = int(policy_key.split('/')[-1][4:])
            print '\n**************************************************'
            print 'Resuming from checkpoint : %d of %s' % (offset, file)
            print '**************************************************\n'

            if args.appendFlag and file != args.log:
                raise RuntimeError(
                    'Log file and checkpoint should have the same name if appendFlag is on. %s vs %s'
                    % (file, args.log))

            policy_file = file[:-3] + '_policy.h5'  # Because we're naming the file as *_policy.h5 itself
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data

    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data,
        args.limit_trajs,
        args.data_subsamp_freq,
        len_filtering=args.use_expert_traj_filtering,
        len_filter_threshold=args.expert_traj_filt_percentile_threshold)

    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    print "\n\n========== Expert data loaded ===========\n\n"

    print '\n==================== Hyperparams ===================='
    print '\texpert_traj_filt_percentile_threshold = %f' % args.expert_traj_filt_percentile_threshold
    print '\tfamiliarity_alpha = %f' % args.familiarity_alpha
    print '\tfamiliarity_beta = %f' % args.familiarity_beta
    print '\tkickThreshold_percentile = %f' % args.kickThreshold_percentile
    print '==============================================\n'

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None  #There is no role of the reward function or value function in behavioral cloning
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier',
                useCVaR=args.useCVaR,
                CVaR_loss_weightage=args.disc_CVaR_weight)
            # Load from checkpoint if provided
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3] + '_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(
            args.no_vf) else rl.ValueFunc(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                enable_obsnorm=args.obsnorm_mode != 'none',
                enable_vnorm=True,
                max_kl=args.vf_max_kl,
                damping=args.vf_cg_damping,
                time_scale=1. / mdp.env_spec.timestep_limit,
                varscope_name='ValueFunc')
        if args.resume_training:
            if args.checkpoint is not None:
                file, vf_key = util.split_h5_name(args.checkpoint)
                vf_file = file[:-3] + '_vf.h5'
                vf.load_h5(vf_file, vf_key)
        if args.useCVaR:
            opt = imitation.ImitationOptimizer_CVaR(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=True),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked,
                # CVaR-specific options
                CVaR_alpha=args.CVaR_alpha,
                CVaR_beta=args.CVaR_beta,
                CVaR_lr=args.CVaR_lr,
                CVaR_Lambda_trainable=args.CVaR_Lambda_not_trainable,
                CVaR_Lambda_val_if_not_trainable=args.CVaR_Lambda_val_if_not_trainable,
                offset=offset + 1)
        elif args.use_additiveStatePrior:
            opt = imitation.ImitationOptimizer_additiveStatePrior(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=False),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked,
                n_gmm_components=args.n_gmm_components,
                cov_type_gmm=args.cov_type_gmm,
                additiveStatePrior_weight=args.additiveStatePrior_weight,
                alpha=args.familiarity_alpha,
                beta=args.familiarity_beta,
                kickThreshold_percentile=args.kickThreshold_percentile,
                offset=offset + 1)
        else:
            opt = imitation.ImitationOptimizer(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=False),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

        print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"

    # Set up training logs (checkpoints) for the policy, reward, and value function
    policy_log = nn.TrainingLog(args.log[:-3] + '_policy.h5',
                                [('args', argstr)], args.appendFlag)
    reward_log = nn.TrainingLog(args.log[:-3] + '_reward.h5',
                                [('args', argstr)], args.appendFlag)
    vf_log = nn.TrainingLog(args.log[:-3] + '_vf.h5', [('args', argstr)],
                            args.appendFlag)
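    # Note: the three logs share the base name of args.log; e.g. (illustrative)
    # if args.log were 'runs/gail.h5', snapshots would go to runs/gail_policy.h5,
    # runs/gail_reward.h5 and runs/gail_vf.h5. The resume code above derives
    # '<checkpoint base>_reward.h5' / '_vf.h5' the same way.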

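    # Diagnostics collected by the additive-state-prior optimizer (passed into opt.step
    # below and periodically dumped to a pickle alongside the snapshots)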
    kickStatesData = []

    print '\n**************************************'
    print 'Running iterations from %d to %d' % (offset + 1, args.max_iter)

    for i in xrange(offset + 1, args.max_iter):
        # All training (the actual optimization update) happens in this single opt.step() call
        iter_info = opt.step(
            i, kickStatesData) if args.use_additiveStatePrior else opt.step(i)

        # ======== The rest is logging and plotting ========

        # Log and plot
        policy_log.write(
            iter_info,
            print_header=i % (20 * args.print_freq) == 0,
            display=i % args.print_freq == 0)
        # reward_log.write(iter_info,
        #         print_header=i % (20*args.print_freq) == 0,
        #         display=False
        #         # display=i % args.print_freq == 0 ## FIXME: AS remove comment
        #         )
        # vf_log.write(iter_info,
        #         print_header=i % (20*args.print_freq) == 0,
        #         display=False
        #         # display=i % args.print_freq == 0 ## FIXME: AS remove comment
        #         )

        # FIXME: problems running the snapshot-saving block below on machines 211 and 138; no problem on 151
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            if reward is not None:
                reward_log.write_snapshot(reward, i)
            if vf is not None:
                vf_log.write_snapshot(vf, i)

            # Dump the kicked-states diagnostics collected so far
            with open(args.log[:-3] + '_kickedStates.pkl', 'wb') as analysisFile:
                pkl.dump({'kickStatesData': kickStatesData}, analysisFile, protocol=2)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1],
                       exdata_N_Doa[:, idx2],
                       color='blue',
                       s=1,
                       label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1],
                       pdata_M_Doa[:, idx2],
                       color='red',
                       s=1,
                       label='apprentice')

            ax.legend()
            plt.show()
Example #12
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int,
                        default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    #filenames = os.listdir(args.policy)
    csvf = open(args.policy[:-3] + '.csv', 'w')
    csvwriter = csv.writer(csvf)

    dataf = open(args.policy[:-3] + 'full.csv', 'w')
    datawriter = csv.writer(dataf)
    #csvwriter.writerow(['filename', 'average', 'std'])

    # Load the saved state
    if args.policy.find('reacher') > 0:
        key_iter = 200
    elif args.policy.find('humanoid') > 0:
        key_iter = 1500
    else:
        key_iter = 500

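    # Snapshots are stored inside the policy h5 file under keys of the form /snapshots/iterNNNNNNN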
    policy_file, policy_key = util.split_h5_name(
        args.policy + '/snapshots/iter%07d' % key_iter)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    sharednet = args.policy.find('shared1') > 0

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters

    enable_obsnorm = (bool(train_args['enable_obsnorm'])
                      if 'enable_obsnorm' in train_args
                      else train_args['obsnorm_mode'] != 'none')
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg,
                                   mdp.obs_space,
                                   mdp.action_space,
                                   'GaussianPolicy',
                                   use_shared_std_network=sharednet)
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg,
                                mdp.obs_space,
                                mdp.action_space,
                                'GibbsPolicy',
                                use_shared_std_network=sharednet)
    policy.load_h5(policy_file, policy_key)

    n = args.count if args.count is not None else 50  # number of evaluation trajectories
    print 'Evaluating based on {} trajs'.format(n)

    returns = []
    lengths = []
    sim = mdp.new_sim()

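    # Roll out n trajectories with the loaded policy, recording per-trajectory return and length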
    for i_traj in xrange(n):
        sim.reset()
        totalr = 0.
        steps = 0
        while not sim.done and steps < args.max_traj_len:
            a = policy.sample_actions(sim.obs[None, :],
                                      bool(args.deterministic))[0][0, :]
            r = sim.step(a)
            totalr += r
            steps += 1

        print i_traj, n, totalr, steps
        datawriter.writerow([i_traj, n, totalr, steps])
        returns.append(totalr)
        lengths.append(steps)
    avg, std = np.array(returns).mean(), np.array(returns).std()
    print 'Avg Return: ', avg, 'Std: ', std
    csvwriter.writerow([args.policy, avg, std])
    del policy
    #import IPython; IPython.embed()

    csvf.close()
    dataf.close()
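
# Illustrative usage (hypothetical paths; assumes this script is saved as eval_policy.py
# with a standard "if __name__ == '__main__': main()" guard):
#   python eval_policy.py runs/humanoid_run1.h5 --deterministic
# This would evaluate the iteration-1500 snapshot (the path contains 'humanoid') and write
# runs/humanoid_run1.csv (one summary row) and runs/humanoid_run1full.csv (one row per trajectory).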