Example 1
    def __init__(self, input_B_Di, input_shape, output_shape, initializer):
        assert len(input_shape) == len(output_shape) == 1
        util.header('Affine(in=%d, out=%d)' %
                    (input_shape[0], output_shape[0]))
        self._output_shape = (output_shape[0], )
        with variable_scope(type(self).__name__) as self.__varscope:
            if initializer is None:
                # initializer = np.random.randn(input_shape[0], output_shape[0]) * np.sqrt(2./input_shape[0])

                # Glorot & Bengio (2010) uniform initialization
                s = np.sqrt(6. / (input_shape[0] + output_shape[0]))
                initializer = np.random.uniform(low=-s,
                                                high=s,
                                                size=(input_shape[0],
                                                      output_shape[0]))

            else:
                assert initializer.shape == (input_shape[0], output_shape[0])
            self.W_Di_Do = get_variable(
                'W', initializer.astype(theano.config.floatX))
            self.b_1_Do = get_variable('b',
                                       np.zeros((1, output_shape[0]),
                                                dtype=theano.config.floatX),
                                       broadcastable=(True, False))
            self._output_B_Do = input_B_Di.dot(self.W_Di_Do) + self.b_1_Do
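
For reference, a minimal NumPy-only sketch of the Glorot & Bengio (2010) uniform initialization used above, assuming only the layer's fan-in and fan-out sizes:

import numpy as np

def glorot_uniform(fan_in, fan_out, rng=np.random):
    # Draw W ~ Uniform(-s, s) with s = sqrt(6 / (fan_in + fan_out)),
    # the same bound computed for the Affine layer's weight matrix above.
    s = np.sqrt(6. / (fan_in + fan_out))
    return rng.uniform(low=-s, high=s, size=(fan_in, fan_out))

W_Di_Do = glorot_uniform(64, 32)  # example sizes, purely illustrative
assert W_Di_Do.shape == (64, 32)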
Example 2
def find_deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories(expert_trajectories, learner_policy, limit_trajs, data_subsamp_freq, ipython_after_eval):
	# Load the learner's policy
	policy_file, policy_key = util.split_h5_name(learner_policy)
	print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
	with h5py.File(policy_file, 'r') as f:
	    train_args = json.loads(f.attrs['args'])
	    dset = f[policy_key]
	    import pprint
	    pprint.pprint(dict(dset.attrs))

	# Initialize the MDP
	env_name = train_args['env_name']
	print 'Loading environment', env_name
	mdp = rlgymenv.RLGymMDP(env_name)
	util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

	# Initialize the policy and load its parameters
	enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
	if isinstance(mdp.action_space, policyopt.ContinuousSpace):
	    policy_cfg = rl.GaussianPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        min_stdev=0.,
	        init_logstdev=0.,
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
	else:
	    policy_cfg = rl.GibbsPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

	policy.load_h5(policy_file, policy_key)

	# Load the expert trajectories
	exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = imitate_mj.load_dataset(
	    expert_trajectories, limit_trajs, data_subsamp_freq)
	assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
	assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
	assert ext_Bstacked.ndim == 1



	# Generate the actions according to the learner's policy for the expert's observations
	learner_actions_Bstacked_Da = policy.sample_actions(exobs_Bstacked_Do)[0]

	# Calculate the action deviations used for the histogram:
	action_deviations = np.linalg.norm(exa_Bstacked_Da - learner_actions_Bstacked_Da, axis=1)

	# Plot the histogram
	# sns.kdeplot(action_deviations,shade=True)

	# FIXME: Uncomment the following
	plt.figure()
	plt.hist(action_deviations, bins=100)
	plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_expert_trajectories.png')
	plt.show()	

	if ipython_after_eval:
		import IPython; IPython.embed()    
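
A minimal self-contained sketch of the deviation computation above, with hypothetical random arrays standing in for the loaded expert and learner actions:

import numpy as np
import matplotlib.pyplot as plt

# Hypothetical stand-ins for the (B, Da) action arrays built above.
exa_Bstacked_Da = np.random.randn(1000, 6)
learner_actions_Bstacked_Da = exa_Bstacked_Da + 0.1 * np.random.randn(1000, 6)

# Per-observation L2 distance between expert and learner actions,
# i.e. the quantity that gets histogrammed.
action_deviations = np.linalg.norm(exa_Bstacked_Da - learner_actions_Bstacked_Da, axis=1)

plt.figure()
plt.hist(action_deviations, bins=100)
plt.savefig('deviation_histogram_sketch.png')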
Example 3
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('env', type=str)
    parser.add_argument('--num_eval_trajs', type=int, default=50)
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--out', type=str, default=None)

    args = parser.parse_args()

    # Initialize the mdp
    mdp = rlgymenv.RLGymMDP(args.env)
    env = gym.make(args.env)
    print "Initialized environment %s" % args.env
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Run the simulation
    returns = []
    lengths = []
    sim = mdp.new_sim()

    for i_traj in range(args.num_eval_trajs):
        print i_traj, args.num_eval_trajs
        sim.reset()
        totalr = 0.
        l = 0
        while not sim.done and l < args.max_traj_len:
            #a = [np.random.uniform(mdp.action_space.low[i], mdp.action_space.high[i]) for i in range(len(mdp.action_space.shape[0]))]
            a = env.action_space.sample()
            if isinstance(mdp.action_space, policyopt.FiniteSpace):
                a = np.asarray([a])
            r = sim.step(a)
            totalr += r
            l += 1
        returns.append(totalr)
        lengths.append(l)
    print "Mean reward: {}, Std reward: {}, Mean length: {}, Std length: {}\n".format(
        np.asarray(returns).mean(),
        np.asarray(returns).std(),
        np.asarray(lengths).mean(),
        np.asarray(lengths).std())
    if args.out is not None:
        with open(args.out, 'w') as f:
            f.write(
                "Mean reward: {}, Std reward: {}, Mean length: {}, Std length: {}\n"
                .format(
                    np.asarray(returns).mean(),
                    np.asarray(returns).std(),
                    np.asarray(lengths).mean(),
                    np.asarray(lengths).std()))
            f.close()
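
The same random-policy baseline can be sketched with the plain gym API alone, assuming the classic reset/step interface (4-tuple step return) that this codebase targets; the environment name is only an example:

import gym
import numpy as np

env = gym.make('CartPole-v0')  # hypothetical environment name
returns = []
for _ in range(5):
    env.reset()
    done, total_r = False, 0.
    while not done:
        # Sample a uniformly random action, as in the loop above.
        _, r, done, _ = env.step(env.action_space.sample())
        total_r += r
    returns.append(total_r)
print('Mean reward: {}, Std reward: {}'.format(np.mean(returns), np.std(returns)))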
Example 4
 def __init__(self, input_B_Di, output_shape, func):
     util.header('Nonlinearity(func=%s)' % func)
     self._output_shape = output_shape
     with variable_scope(type(self).__name__) as self.__varscope:
         self._output_B_Do = {
             'relu': tensor.nnet.relu,
             'lrelu': lambda x: tensor.nnet.relu(x, .01),
             'elu': tensor.nnet.elu,
             'tanh': tensor.tanh,
         }[func](input_B_Di)
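
For reference, a NumPy-only sketch of the same name-to-function dispatch, with hypothetical NumPy stand-ins for the Theano ops used above:

import numpy as np

nonlinearities = {
    'relu': lambda x: np.maximum(x, 0.),
    'lrelu': lambda x: np.where(x > 0., x, .01 * x),  # leaky ReLU with slope 0.01
    'tanh': np.tanh,
}

x = np.array([-2., -.5, 0., .5, 2.])
y = nonlinearities['lrelu'](x)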
Example 5
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    policystr = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    trajbatch, _, _ = exec_saved_policy(
        env_name,
        policystr,
        num_trajs,
        deterministic=deterministic,
        max_traj_len=None)
    returns = trajbatch.r.padded(fill=0.).sum(axis=1)
    lengths = np.array([len(traj) for traj in trajbatch])
    util.header('{} gets return {} +/- {}'.format(policystr, returns.mean(), returns.std()))
    return returns, lengths
Example 6
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    policystr = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    trajbatch, _, _ = exec_saved_policy(
        env_name,
        policystr,
        num_trajs,
        deterministic=deterministic,
        max_traj_len=None)
    returns = trajbatch.r.padded(fill=0.).sum(axis=1)
    lengths = np.array([len(traj) for traj in trajbatch])
    util.header('{} gets return {} +/- {}'.format(policystr, returns.mean(), returns.std()))
    return returns, lengths
Example 7
def phase1_train(spec, specfilename):
    util.header('=== Phase 1: training ===')

    # Generate array job that trains all algorithms
    # over all tasks, for all dataset sizes (3 loops)

    taskname2dset = gen_taskname2outfile(spec)

    # Make checkpoint dir. All outputs go here
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    util.mkdir_p(checkptdir)
    # Make sure checkpoint dir is empty
    assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)

    # Assemble the commands to run on the cluster
    cmd_templates, outputfilenames, argdicts = [], [], []
    for alg in spec['training']['algorithms']:
        for task in spec['tasks']:
            for num_trajs in spec['training']['dataset_num_trajs']:
                assert num_trajs <= spec['training']['full_dataset_num_trajs']
                for run in range(spec['training']['runs']):
                    # A string identifier. Used in filenames for this run
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    cmd_templates.append(alg['cmd'].replace('\n', ' ').strip())
                    outputfilenames.append(strid + '.txt')
                    argdicts.append({
                        'env': task['env'],
                        'dataset': taskname2dset[task['name']],
                        'num_trajs': num_trajs,
                        'cuts_off_on_success': int(task['cuts_off_on_success']),
                        'data_subsamp_freq': task['data_subsamp_freq'],
                        'out': os.path.join(checkptdir, strid + '.h5'),
                    })

    pbsopts = spec['options']['pbs']
    runpbs(
        cmd_templates, outputfilenames, argdicts,
        jobname=pbsopts['jobname'], queue=pbsopts['queue'], nodes=1, ppn=pbsopts['ppn'],
        job_range=pbsopts['range'] if 'range' in pbsopts else None,
        qsub_script_copy=os.path.join(checkptdir, 'qsub_script.sh')
    )

    # Copy the pipeline yaml file to the output dir too
    shutil.copyfile(specfilename, os.path.join(checkptdir, 'pipeline.yaml'))

    # Keep git commit
    import subprocess
    git_hash = subprocess.check_output('git rev-parse HEAD', shell=True).strip()
    with open(os.path.join(checkptdir, 'git_hash.txt'), 'w') as f:
        f.write(git_hash + '\n')
Example 8
def eval_snapshot(env_name, checkptfile, snapshot_idx, num_trajs, deterministic):
    """ Called during evaluation stage, prints results on screen and returns
    data which we save in a results `.h5` file. """
    policystr = '{}/snapshots/iter{:07d}'.format(checkptfile, snapshot_idx)
    trajbatch, _, _ = exec_saved_policy(
        env_name,
        policystr,
        num_trajs,
        deterministic=deterministic,
        max_traj_len=None)
    returns = trajbatch.r.padded(fill=0.).sum(axis=1)
    lengths = np.array([len(traj) for traj in trajbatch])
    util.header('{} gets return {} +/- {}'.format(policystr, returns.mean(), returns.std()))
    return returns, lengths
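
A minimal sketch of what trajbatch.r.padded(fill=0.).sum(axis=1) computes, using a toy list of ragged per-trajectory reward arrays (hypothetical data):

import numpy as np

rewards_per_traj = [np.array([1., 1., 1.]), np.array([2., 2.])]  # hypothetical rewards
T = max(len(r) for r in rewards_per_traj)

# Right-pad each trajectory's rewards with zeros, then sum over time;
# padding with 0 leaves the per-trajectory return unchanged.
r_B_T = np.array([np.pad(r, (0, T - len(r)), mode='constant') for r in rewards_per_traj])
returns = r_B_T.sum(axis=1)                             # array([3., 4.])
lengths = np.array([len(r) for r in rewards_per_traj])  # array([3, 2])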
Example 9
def find_deviation_of_agent_actions_from_expert_actions_for_underperforming_trajectories(learner_trajectories, expert_policy, lower_bound_reward, ipython_after_eval, generate_plot):
	obs,a,r,l = find_underperforming_trajectories(learner_trajectories, lower_bound_reward)
	print(type(obs))
	# Load the expert's policy
	policy_file, policy_key = util.split_h5_name(expert_policy)
	print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
	with h5py.File(policy_file, 'r') as f:
	    train_args = json.loads(f.attrs['args'])
	    dset = f[policy_key]
	    import pprint
	    pprint.pprint(dict(dset.attrs))

	# Initialize the MDP
	env_name = train_args['env_name']
	print 'Loading environment', env_name
	mdp = rlgymenv.RLGymMDP(env_name)
	util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

	# Initialize the policy and load its parameters
	enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
	if isinstance(mdp.action_space, policyopt.ContinuousSpace):
	    policy_cfg = rl.GaussianPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        min_stdev=0.,
	        init_logstdev=0.,
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
	else:
	    policy_cfg = rl.GibbsPolicyConfig(
	        hidden_spec=train_args['policy_hidden_spec'],
	        enable_obsnorm=enable_obsnorm)
	    policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

	policy.load_h5(policy_file, policy_key)
	
	# Generate the actions according to the expert's policy for the observations in the underperforming trajs

	expert_actions = policy.sample_actions(obs.reshape((-1,obs.shape[-1])))[0].reshape((-1,a.shape[1],a.shape[2]))
	

	# Calculate the action deviations used for the histogram:
	action_deviations = np.linalg.norm(expert_actions.reshape((-1,a.shape[-1])) - a.reshape((-1,a.shape[-1])), axis=1)
	if generate_plot:
		plt.figure()
		plt.hist(action_deviations, bins=100)
		plt.savefig('deviation_of_agent_actions_from_expert_actions_for_observations_from_underperforming_learner_trajectories.png')
		plt.show()	
	if ipython_after_eval:
		import IPython; IPython.embed() 
Example 10
def phase1_train(spec, specfilename):
    """ In the normal code, this rounds up a long list of commands of the form
    `python (script name) (arguments)` which can be run on a cluster.

    It's really cool how this works. The `cmd_templates` list turns into a bunch
    of python script calls, except it has string formatting to allow the
    arguments to fill them in. A much better way than writing a long bash
    script! (Actually, to *get* a bash script, just write these one by one to a
    file and then I think running the file is OK.)

    I modified this to run sequentially.
    """
    util.header('=== Phase 1: training ===')

    # Generate array job that trains (1) all algorithms over (2) all tasks, for
    # (3) all dataset sizes, so yes it's three loops.
    taskname2dset = gen_taskname2outfile(spec)

    # Make checkpoint dir. All outputs go here
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    util.mkdir_p(checkptdir)
    # Make sure checkpoint dir is empty
    assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)

    # Assemble the commands to run on the cluster
    cmd_templates, outputfilenames, argdicts = [], [], []
    for alg in spec['training']['algorithms']:
        for task in spec['tasks']:
            for num_trajs in spec['training']['dataset_num_trajs']:
                assert num_trajs <= spec['training']['full_dataset_num_trajs']
                for run in range(spec['training']['runs']):
                    # A string identifier. Used in filenames for this run
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    cmd_templates.append(alg['cmd'].replace('\n', ' ').strip())
                    outputfilenames.append(strid + '.txt')
                    argdicts.append({
                        'env': task['env'],
                        'dataset': taskname2dset[task['name']],
                        'num_trajs': num_trajs,
                        'cuts_off_on_success': int(task['cuts_off_on_success']),
                        'data_subsamp_freq': task['data_subsamp_freq'],
                        'out': os.path.join(checkptdir, strid + '.h5'),
                    })

    # (New code from Daniel) Put commands in a list and run them sequentially.
    all_commands = [x.format(**y) for (x,y) in zip(cmd_templates,argdicts)]
    print("Total number of commands to run: {}.".format(len(all_commands)))
    for command in all_commands:
        subprocess.call(command.split(" "))
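
A toy illustration of the template-plus-argdict pattern described in the docstring above; the script name and flags here are hypothetical:

cmd_templates = ['python train_script.py --env {env} --num_trajs {num_trajs} --out {out}']
argdicts = [{'env': 'Hopper-v1', 'num_trajs': 4, 'out': '/tmp/run0.h5'}]

all_commands = [x.format(**y) for (x, y) in zip(cmd_templates, argdicts)]
# all_commands[0] == 'python train_script.py --env Hopper-v1 --num_trajs 4 --out /tmp/run0.h5'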
Example 11
    def __init__(self, input_B_Di, input_shape, layerspec_json):
        '''
        Args:
            layerspec_json (string): JSON string describing the layers
        '''
        assert len(input_shape) >= 1
        self.input_B_Di = input_B_Di

        layerspec = json.loads(layerspec_json)
        util.header('Loading feedforward net specification')
        print(json.dumps(layerspec, indent=2, separators=(',', ': ')))

        self.layers = []
        with variable_scope(type(self).__name__) as self.__varscope:

            prev_output, prev_output_shape = input_B_Di, input_shape

            for i_layer, ls in enumerate(layerspec):
                with variable_scope('layer_%d' % i_layer):
                    if ls['type'] == 'reshape':
                        _check_keys(ls, ['type', 'new_shape'], [])
                        self.layers.append(
                            ReshapeLayer(prev_output, ls['new_shape']))

                    elif ls['type'] == 'fc':
                        _check_keys(ls, ['type', 'n'], ['initializer'])
                        self.layers.append(
                            AffineLayer(prev_output,
                                        prev_output_shape,
                                        output_shape=(ls['n'], ),
                                        initializer=_parse_initializer(ls)))

                    elif ls['type'] == 'nonlin':
                        _check_keys(ls, ['type', 'func'], [])
                        self.layers.append(
                            NonlinearityLayer(prev_output, prev_output_shape,
                                              ls['func']))

                    else:
                        raise NotImplementedError('Unknown layer type %s' %
                                                  ls['type'])

                prev_output = self.layers[-1].output
                prev_output_shape = self.layers[-1].output_shape
        self._output, self._output_shape = prev_output, prev_output_shape
Example 12
def phase1_train(spec, specfilename):
    util.header('=== Phase 1: training ===')

    # Generate array job that trains all algorithms
    # over all tasks, for all dataset sizes (3 loops)

    taskname2dset = gen_taskname2outfile(spec)

    # Make checkpoint dir. All outputs go here
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    util.mkdir_p(checkptdir)
    # Make sure checkpoint dir is empty
    assert not os.listdir(checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)

    # Assemble the commands to run on the cluster
    cmd_templates, outputfilenames, argdicts = [], [], []
    for alg in spec['training']['algorithms']:
        for task in spec['tasks']:
            for num_trajs in spec['training']['dataset_num_trajs']:
                assert num_trajs <= spec['training']['full_dataset_num_trajs']
                for run in range(spec['training']['runs']):
                    # A string identifier. Used in filenames for this run
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    cmd_templates.append(alg['cmd'].replace('\n', ' ').strip())
                    outputfilenames.append(strid + '.txt')
                    argdicts.append({
                        'env': task['env'],
                        'dataset': taskname2dset[task['name']],
                        'num_trajs': num_trajs,
                        'cuts_off_on_success': int(task['cuts_off_on_success']),
                        'data_subsamp_freq': task['data_subsamp_freq'],
                        'out': os.path.join(checkptdir, strid + '.h5'),
                    })

    for x, y in zip(cmd_templates, argdicts):
        subprocess.call(x.format(**y).split(" "))

    # Copy the pipeline yaml file to the output dir too
    shutil.copyfile(specfilename, os.path.join(checkptdir, 'pipeline.yaml'))

    # Keep git commit
    git_hash = subprocess.check_output('git rev-parse HEAD', shell=True).strip()
    with open(os.path.join(checkptdir, 'git_hash.txt'), 'w') as f:
        f.write(git_hash + '\n')
Example 13
def phase0_sampletrajs(spec, specfilename):
    """ The first phase, sampling expert trajectories from TRPO. 
    
    This *can* be done sequentially on one computer, no need to worry. This
    *will* save the .h5 files according to `storagedir` in the specs, so
    manually remove if needed. 
    
    This will sample `full_dataset_num_trajs` expert trajectories. I think it
    might be better to have that value be perhaps 50, since then I can use those
    values directly when plotting the expert performance alongside the
    algorithms, to be consistent in getting 50 samples.
    
    Just note that sampling more than 10 trajectories (or whatever our limit is)
    will **not** change the actual dataset, i.e. if we need 10 out of 20
    trajectories, the `load_datasets` method will always load the first 10, and
    not randomly pick 10 out of the 20.
    """
    util.header('=== Phase 0: Sampling trajs from expert policies ===')
    num_trajs = spec['training']['full_dataset_num_trajs']
    util.header('Sampling {} trajectories'.format(num_trajs))

    # Make filenames and check if they're valid first
    taskname2outfile = gen_taskname2outfile(spec, assert_not_exists=True)

    # Sample trajs for each task
    for task in spec['tasks']:
        # Execute the policy
        trajbatch, policy, _ = exec_saved_policy(
            task['env'], task['policy'], num_trajs,
            deterministic=spec['training']['deterministic_expert'],
            max_traj_len=None)
        
        # Quick evaluation
        returns = trajbatch.r.padded(fill=0.).sum(axis=1)
        avgr = trajbatch.r.stacked.mean()
        lengths = np.array([len(traj) for traj in trajbatch])
        ent = policy._compute_actiondist_entropy(trajbatch.adist.stacked).mean()
        print 'returns.shape: {}'.format(returns.shape)
        print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
        print 'avgr: {}'.format(avgr)
        print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
        print 'ent: {}'.format(ent)

        # Save the trajs to a file. Pad in case uneven lengths, but typically
        # the experts last the full duration so the lengths will be equivalent.
        with h5py.File(taskname2outfile[task['name']], 'w') as f:
            def write(dsetname, a):
                f.create_dataset(dsetname, data=a, compression='gzip', compression_opts=9)
            # Right-padded trajectory data using custom RaggedArray class.
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # # Also save args to this script
            # argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            # f.attrs['args'] = argstr
        util.header('Wrote {}'.format(taskname2outfile[task['name']]))
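
A minimal h5py sketch of the file layout written above, with small hypothetical padded arrays in place of the real trajectory batch:

import h5py
import numpy as np

obs_B_T_Do = np.zeros((2, 5, 3), dtype=np.float32)  # hypothetical padded observations
a_B_T_Da = np.zeros((2, 5, 1), dtype=np.float32)    # hypothetical padded actions
r_B_T = np.zeros((2, 5), dtype=np.float32)          # hypothetical padded rewards
len_B = np.array([5, 4], dtype=np.int32)            # true (unpadded) lengths

with h5py.File('/tmp/expert_trajs_sketch.h5', 'w') as f:
    for name, arr in [('obs_B_T_Do', obs_B_T_Do), ('a_B_T_Da', a_B_T_Da),
                      ('r_B_T', r_B_T), ('len_B', len_B)]:
        f.create_dataset(name, data=arr, compression='gzip', compression_opts=9)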
Example 14
def phase0_sampletrajs(spec, specfilename):
    util.header('=== Phase 0: Sampling trajs from expert policies ===')

    num_trajs = spec['training']['full_dataset_num_trajs']
    util.header('Sampling {} trajectories'.format(num_trajs))

    # Make filenames and check if they're valid first
    taskname2outfile = gen_taskname2outfile(spec, assert_not_exists=True)

    # Sample trajs for each task
    for task in spec['tasks']:
        # Execute the policy
        trajbatch, policy, _ = exec_saved_policy(
            task['env'],
            task['policy'],
            num_trajs,
            deterministic=spec['training']['deterministic_expert'],
            max_traj_len=None)

        # Quick evaluation
        returns = trajbatch.r.padded(fill=0.).sum(axis=1)
        avgr = trajbatch.r.stacked.mean()
        lengths = np.array([len(traj) for traj in trajbatch])
        ent = policy._compute_actiondist_entropy(
            trajbatch.adist.stacked).mean()
        print('ret: {} +/- {}'.format(returns.mean(), returns.std()))
        print('avgr: {}'.format(avgr))
        print('len: {} +/- {}'.format(lengths.mean(), lengths.std()))
        print('ent: {}'.format(ent))

        # Save the trajs to a file
        with h5py.File(taskname2outfile[task['name']], 'w') as f:

            def write(dsetname, a):
                f.create_dataset(dsetname,
                                 data=a,
                                 compression='gzip',
                                 compression_opts=9)

            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B',
                  np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # # Also save args to this script
            # argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            # f.attrs['args'] = argstr
        util.header('Wrote {}'.format(taskname2outfile[task['name']]))
Example 15
def phase0_sampletrajs(spec, specfilename):
    util.header('=== Phase 0: Sampling trajs from expert policies ===')

    num_trajs = spec['training']['full_dataset_num_trajs']
    util.header('Sampling {} trajectories'.format(num_trajs))

    # Make filenames and check if they're valid first
    taskname2outfile = gen_taskname2outfile(spec, assert_not_exists=True)

    # Sample trajs for each task
    for task in spec['tasks']:
        # Execute the policy
        trajbatch, policy, _ = exec_saved_policy(
            task['env'], task['policy'], num_trajs,
            deterministic=spec['training']['deterministic_expert'],
            max_traj_len=None)

        # Quick evaluation
        returns = trajbatch.r.padded(fill=0.).sum(axis=1)
        avgr = trajbatch.r.stacked.mean()
        lengths = np.array([len(traj) for traj in trajbatch])
        ent = policy._compute_actiondist_entropy(trajbatch.adist.stacked).mean()
        print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
        print 'avgr: {}'.format(avgr)
        print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
        print 'ent: {}'.format(ent)

        # Save the trajs to a file
        with h5py.File(taskname2outfile[task['name']], 'w') as f:
            def write(dsetname, a):
                f.create_dataset(dsetname, data=a, compression='gzip', compression_opts=9)
            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))
            # # Also save args to this script
            # argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            # f.attrs['args'] = argstr
        util.header('Wrote {}'.format(taskname2outfile[task['name']]))
Example 16
def phase1_train(spec, specfilename):
    util.header('=== Phase 1: training ===')

    # Generate array job that trains all algorithms
    # over all tasks, for all dataset sizes (3 loops)

    taskname2dset = gen_taskname2outfile(spec)

    # Theano GPU command prefix
    gpu_cmd_prefix = 'THEANO_FLAGS=mode=FAST_RUN,floatX=float32,device=gpu'

    # Make checkpoint dir. All outputs go here
    checkptdir = os.path.join(spec['options']['storagedir'],
                              spec['options']['checkpt_subdir'])
    util.mkdir_p(checkptdir)
    # Make sure checkpoint dir is empty
    assert not os.listdir(
        checkptdir), 'Checkpoint directory {} is not empty!'.format(checkptdir)

    # Assemble the commands to run on the cluster
    cmd_templates, outputfilenames, argdicts = [], [], []
    for alg in spec['training']['algorithms']:
        for task in spec['tasks']:
            for num_trajs in spec['training']['dataset_num_trajs']:
                assert num_trajs <= spec['training']['full_dataset_num_trajs']
                for run in range(spec['training']['runs']):
                    # A string identifier. Used in filenames for this run
                    strid = 'alg={},task={},num_trajs={},run={}'.format(
                        alg['name'], task['name'], num_trajs, run)
                    # Check whether to prepend the Theano GPU flags
                    if spec['training']['use_gpu']:
                        cmd_templates.append(
                            gpu_cmd_prefix + ' ' +
                            alg['cmd'].replace('\n', ' ').strip())
                    else:
                        cmd_templates.append(alg['cmd'].replace('\n',
                                                                ' ').strip())
                    outputfilenames.append(strid + '.txt')
                    argdicts.append({
                        'env': task['env'],
                        'dataset': taskname2dset[task['name']],
                        'num_trajs': num_trajs,
                        'cuts_off_on_success': int(task['cuts_off_on_success']),
                        'data_subsamp_freq': task['data_subsamp_freq'],
                        'out': os.path.join(checkptdir, strid + '.h5'),
                    })

    pbsopts = spec['options']['pbs']
    #    runpbs(
    #        cmd_templates, outputfilenames, argdicts,
    #        jobname=pbsopts['jobname'], queue=pbsopts['queue'], nodes=1, ppn=pbsopts['ppn'],
    #        job_range=pbsopts['range'] if 'range' in pbsopts else None,
    #        qsub_script_copy=os.path.join(checkptdir, 'qsub_script.sh')
    #    )
    runcmds(cmd_templates,
            outputfilenames,
            argdicts,
            jobname=pbsopts['jobname'],
            outputfile_dir=os.path.join(
                checkptdir, 'logs_%s_%s' %
                (pbsopts['jobname'],
                 datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S'))))

    # Copy the pipeline yaml file to the output dir too
    shutil.copyfile(specfilename, os.path.join(checkptdir, 'pipeline.yaml'))
Example 17
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--env_name', type=str, required=True)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--enable_obsnorm', type=int, default=1)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--use_tanh', type=int, default=0)
    # Optimizer
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    # Sampling
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=100000)
    # Saving stuff
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    if args.tiny_policy or args.use_tanh:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy or --use_tanh is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE

        if args.use_tanh:
            arch = json.loads(args.policy_hidden_spec)
            for layer in arch:
                if layer['type'] == 'nonlin':
                    layer['func'] = 'tanh'
            args.policy_hidden_spec = json.dumps(arch)
        print 'Modified architecture:', args.policy_hidden_spec

    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    util.header('Policy architecture')
    policy.print_trainable_variables()

    vf = rl.ValueFunc(
        hidden_spec=args.policy_hidden_spec,
        obsfeat_space=mdp.obs_space,
        enable_obsnorm=bool(args.enable_obsnorm),
        enable_vnorm=True,
        max_kl=args.vf_max_kl,
        damping=args.vf_cg_damping,
        time_scale=1./mdp.env_spec.timestep_limit,
        varscope_name='ValueFunc')

    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len
    opt = rl.SamplingPolicyOptimizer(
        mdp=mdp,
        discount=args.discount,
        lam=args.lam,
        policy=policy,
        sim_cfg=SimConfig(
            min_num_trajs=-1,
            min_total_sa=args.min_total_sa,
            batch_size=args.sim_batch_size,
            max_traj_len=max_traj_len),
        step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
        value_func=vf,
        obsfeat_fn=lambda obs: obs,
    )

    log = nn.TrainingLog(args.log, [('args', argstr)])

    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info, print_header=i % 20 == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
Example 18
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    parser.add_argument('--seed', type=int, default=0)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    parser.add_argument('--use_shared_std_network', type=int, default=0)
    # Generative Moment matching
    parser.add_argument('--kernel_batchsize', type=int, default=1000)
    parser.add_argument('--kernel_reg_weight', type=float, default=0.)
    parser.add_argument('--use_median_heuristic', type=int, default=1)
    parser.add_argument('--use_logscale_reward', type=int)
    parser.add_argument('--reward_epsilon', type=float, default=0.0001)
    # Auto-Encoder Information
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)
    parser.add_argument('--save_reward', type=int, default=0)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy',
                                   bool(args.use_shared_std_network))
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space,
                                mdp.action_space, 'GibbsPolicy',
                                bool(args.use_shared_std_network))

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq, args.seed)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    elif args.mode == 'gmmil':
        if args.use_median_heuristic == 0:
            bandwidth_params = [
                1.0, 1.0 / 2.0, 1.0 / 5.0, 1.0 / 10.0, 1.0 / 40.0, 1.0 / 80.0
            ]
        else:
            bandwidth_params = []

        if args.reward_type == 'mmd':
            reward = gmmil.MMDReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked,
                kernel_bandwidth_params=bandwidth_params,
                kernel_reg_weight=args.kernel_reg_weight,
                kernel_batchsize=args.kernel_batchsize,
                use_median_heuristic=args.use_median_heuristic,
                use_logscale_reward=bool(args.use_logscale_reward),
                save_reward=bool(args.save_reward),
                epsilon=args.reward_epsilon)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20 * args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1],
                       exdata_N_Doa[:, idx2],
                       color='blue',
                       s=1,
                       label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1],
                       pdata_M_Doa[:, idx2],
                       color='red',
                       s=1,
                       label='apprentice')

            ax.legend()
            plt.show()
Example 19
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')

    # add a spec for transition classifier
    parser.add_argument('--clf_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)

    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=100)
    parser.add_argument('--log', type=str, required=False)

    # Sequential model
    parser.add_argument('--seq_model', type=int, default=0)
    parser.add_argument('--time_step', type=int, default=10)

    args = parser.parse_args()

    # Initialize the MDP
    if not args.seq_model:
        if args.tiny_policy:
            assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
            args.policy_hidden_spec = TINY_ARCHITECTURE
        argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
        print(argstr)
    # Add sequential model
    else:
        if args.tiny_policy:
            assert args.policy_hidden_spec == SEQ_SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
            args.policy_hidden_spec = SEQ_TINY_ARCHITECTURE
#        # change the default architecture to fit sequential model
#        if args.policy_hidden_spec == SIMPLE_ARCHITECTURE:
#            args.policy_hidden_spec = SEQ_SIMPLE_ARCHITECTURE
        if args.clf_hidden_spec == SIMPLE_ARCHITECTURE:
            args.clf_hidden_spec = SEQ_SIMPLE_ARCHITECTURE
        argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'

    if not args.seq_model:
        if isinstance(mdp.action_space, policyopt.ContinuousSpace):
            policy_cfg = rl.GaussianPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space,
                                       mdp.action_space, 'GaussianPolicy')
        else:
            policy_cfg = rl.GibbsPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                enable_obsnorm=enable_obsnorm)
            policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space,
                                    mdp.action_space, 'GibbsPolicy')
    # Add sequential model
    else:
        if isinstance(mdp.action_space, policyopt.ContinuousSpace):
            policy_cfg = rl.SeqGaussianPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                time_step=args.time_step,  # add time step
                min_stdev=0.,
                init_logstdev=0.,
                enable_obsnorm=enable_obsnorm,
                enable_actnorm=False)  # XXX actnorm is not implemented yet
            policy = rl.SeqGaussianPolicy(policy_cfg, mdp.obs_space,
                                          mdp.action_space,
                                          'SeqGaussianPolicy')
        else:
            policy_cfg = rl.SeqGibbsPolicyConfig(
                hidden_spec=args.policy_hidden_spec,
                time_step=args.time_step,  # add time step
                enable_obsnorm=enable_obsnorm,
                enable_actnorm=False)  # XXX actnorm is not implemented yet
            policy = rl.SeqGibbsPolicy(policy_cfg, mdp.obs_space,
                                       mdp.action_space, 'SeqGibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    #    print 'Debug: exobs_Bstacked_Do dtype:', exobs_Bstacked_Do.dtype
    #    print 'Debug: exa_Bstacked_Da dtype:', exa_Bstacked_Da.dtype
    #    print 'Debug: ext_Bstacked dtype:', ext_Bstacked.dtype

    #    assert 1 == 0

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len:', max_traj_len)

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        #        args.print_freq = args.bclone_eval_freq
        #        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len,
                smp_traj_len=-1),
            eval_freq=args.bclone_eval_freq,  # XXX set a value when using bclone
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len,
                                        smp_traj_len=-1),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping,
                              sequential_model=False),  # sequential_model flag added; not used in 'ga' mode
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Add Sequential Model
    elif args.mode == 'sga':
        if args.reward_type == 'nn':
            reward = imitation.SequentialTransitionClassifier(
                hidden_spec=args.clf_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                time_step=args.time_step,  # add time step
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='SequentialTransitionClassifier')
#        elif args.reward_type in ['l2ball', 'simplex']:
#            reward = imitation.LinearReward(
#                obsfeat_space=mdp.obs_space,
#                action_space=mdp.action_space,
#                mode=args.reward_type,
#                enable_inputnorm=True,
#                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
#                include_time=bool(args.reward_include_time),
#                time_scale=1./mdp.env_spec.timestep_limit,
#                exobs_Bex_Do=exobs_Bstacked_Do,
#                exa_Bex_Da=exa_Bstacked_Da,
#                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.SequentialValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            time_step=args.time_step,  # add time step
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='SequentialValueFunc')

        opt = imitation.SequentialImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SeqSimConfig(
                min_num_trajs=-1,
                min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len,
                time_step=args.time_step),  # add time step
            step_func=rl.TRPO(
                max_kl=args.policy_max_kl,
                damping=args.policy_cg_damping,
                sequential_model=False),  # XXX sequential TRPO not used here
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        if not args.seq_model:
            policy.update_obsnorm(exobs_Bstacked_Do)
            if reward is not None:
                reward.update_inputnorm(
                    opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
            if vf is not None:
                vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))
        # Add sequential model
        else:
            Bstacked = exobs_Bstacked_Do.shape[0]
            Do = exobs_Bstacked_Do.shape[1]
            T = args.time_step
            exobs_BT_Do = exobs_Bstacked_Do[:T * (Bstacked // T), :]
            exa_BT_Da = exa_Bstacked_Da[:T * (Bstacked // T), :]
            # reshape:(B*T, ...) => (B, T, ...)
            exobs_B_T_Do = np.reshape(
                exobs_BT_Do, (Bstacked // T, T, exobs_Bstacked_Do.shape[1]))
            exa_B_T_Da = np.reshape(
                exa_BT_Da, (Bstacked // T, T, exa_Bstacked_Da.shape[1]))
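            # Illustration of the reshape above with hypothetical sizes: for
            # Bstacked = 10 stacked transitions and time_step T = 4, the
            # trailing 2 transitions are dropped, so
            #   exobs_Bstacked_Do: (10, Do) -> exobs_BT_Do: (8, Do)
            #                                -> exobs_B_T_Do: (2, 4, Do)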
            print("Debug: exobs_Bstacked_Do:", exobs_Bstacked_Do.shape[0],
                  exobs_Bstacked_Do.shape[1])
            print("Debug: exobs_B_T_Do:", exobs_B_T_Do.shape[0],
                  exobs_B_T_Do.shape[1], exobs_B_T_Do.shape[2])
            # XXX use original policy (not sequential)
            policy.update_obsnorm(exobs_Bstacked_Do)
            if reward is not None:
                reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_B_T_Do),
                                        exa_B_T_Da)
            if vf is not None:
                vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    # log = nn.TrainingLog(args.log, [('args', argstr)])
    log = nn.BasicTrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        #        log.write(iter_info, print_header=i % (20*args.print_freq) == 0, display=i % args.print_freq == 0)
        log.add_log(iter_info,
                    print_header=i % (20 * args.print_freq) == 0,
                    display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            print('%i/%i iters done. Saving snapshot.' % (i, args.max_iter))
            #            log.write_snapshot(policy, i)
            log.write_snapshot(policy, i)

        if args.mode == 'ga' and args.plot_freq != 0 and i % args.plot_freq == 0:
            print('%i/%i iters done. Saving plot.' % (i, args.max_iter))
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)
            # convert dtype to follow theano config
            exdata_N_Doa = exdata_N_Doa.astype(theano.config.floatX)
            pdata_M_Doa = pdata_M_Doa.astype(theano.config.floatX)
            #            print 'Debug: exobs_Bstacked_Do dtype:', exobs_Bstacked_Do.dtype    # float32
            #            print 'Debug: exa_Bstacked_Da dtype:', exa_Bstacked_Da.dtype    # int64
            #            print 'Debug: opt.last_sampbatch.obs.stacked dtype:', opt.last_sampbatch.obs.stacked.dtype    # float32
            #            print 'Debug: opt.last_sampbatch.a.stacked dtype:', opt.last_sampbatch.a.stacked.dtype    # int64
            #            print 'Debug: exdata_N_Doa dtype:', exdata_N_Doa.dtype    # float32
            #            print 'Debug: pdata_M_Doa dtype:', pdata_M_Doa.dtype    # float32

            # Plot reward
            #            import matplotlib
            #            matplotlib.use('Agg')
            #            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))

            #            print 'Debug: range1 types:', type(range1[0]), type(range1[1])    # float32, float32
            #            print 'Debug: range2 types:', type(range2[0]), type(range2[1])    # float32, float32

            x, y, z = reward.plot(ax, idx1, idx2, range1, range2, n=100)
            plot = [
                x, y, z, exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2],
                pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2]
            ]
            log.write_plot(plot, i)

            # Plot expert data (disabled)
            # ax.scatter(exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2],
            #            color='blue', s=1, label='expert')

            # Plot policy samples (disabled)
            # ax.scatter(pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2],
            #            color='red', s=1, label='apprentice')

            # ax.legend()
            # plt.show()
            # plt.savefig()

#        if args.mode == 'sga' and args.plot_freq != 0 and i % args.plot_freq == 0:
#            print('%i/%i iters done. Saving plot.' % (i, args.max_iter))
#            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
#            # (optionally flatten sampled (B, T, ...) batches to (B*T, ...)
#            #  before concatenating)
#            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked,
#                                          opt.last_sampbatch.a.stacked], axis=1)
#            # convert dtype to follow theano config
#            exdata_N_Doa = exdata_N_Doa.astype(theano.config.floatX)
#            pdata_M_Doa = pdata_M_Doa.astype(theano.config.floatX)
#
#            # Plot reward
#            _, ax = plt.subplots()
#            idx1, idx2 = 0, 1
#            range1 = (min(exdata_N_Doa[:, idx1].min(), pdata_M_Doa[:, idx1].min()),
#                      max(exdata_N_Doa[:, idx1].max(), pdata_M_Doa[:, idx1].max()))
#            range2 = (min(exdata_N_Doa[:, idx2].min(), pdata_M_Doa[:, idx2].min()),
#                      max(exdata_N_Doa[:, idx2].max(), pdata_M_Doa[:, idx2].max()))
#
#            # For the sequential model, pass the sequence length to reward.plot
#            # XXX take care of memory usage!
#            x, y, z = reward.plot(ax, idx1, idx2, range1, range2, args.time_step, n=100)
#            plot = [x, y, z, exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2],
#                    pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2]]
#            log.write_plot(plot, i)
#
#            # Plot expert data / policy samples (disabled)
#            # ax.scatter(exdata_N_Doa[:, idx1], exdata_N_Doa[:, idx2], color='blue', s=1, label='expert')
#            # ax.scatter(pdata_M_Doa[:, idx1], pdata_M_Doa[:, idx2], color='red', s=1, label='apprentice')
#            # ax.legend()
#            # plt.show()

    # Write log
    print('Training is done. Saving log.')
    log.write_log()
    log.close()
Esempio n. 20
0
    def __init__(self, input_, new_shape):
        self._output_shape = tuple(new_shape)
        util.header('Reshape(new_shape=%s)' % (str(self._output_shape), ))
        with variable_scope(type(self).__name__) as self.__varscope:
            self._output = input_.reshape((-1, ) + self._output_shape)
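
A minimal numpy sketch (not part of the layer above) of what this reshape does: the
leading batch dimension is kept via -1 and the remaining axes take new_shape, so a
hypothetical (2, 12) input with new_shape=(3, 4) becomes (2, 3, 4).

import numpy as np

x_B_D = np.arange(24).reshape(2, 12)     # batch of 2, flat feature size 12
new_shape = (3, 4)                       # hypothetical target shape
out = x_B_D.reshape((-1, ) + new_shape)  # -> (2, 3, 4), batch axis preserved
assert out.shape == (2, 3, 4)
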
Esempio n. 21
0
def main():
    """ 
    NOTE! Don't forget that these are effectively called directly from the yaml
    files. They call imitate_mj.py with their own arguments, so check there if
    some of the values differ from the default ones.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer (the 128 batch size and 0.7 train fraction are the settings from the paper).
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    print("\n\tNow initializing the policy:")
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))
    print("\tFinished initializing the policy.\n")

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            # FYI: this is the GAIL case. Note that it doesn't take in any of
            # the raw expert data, unlike the other reward types. And we call
            # them `reward types` since the optimizer can use their output in
            # some way to improve itself.
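            # (Presumably the TransitionClassifier acts as the GAIL
            # discriminator: it is trained to tell expert (obs, action) pairs
            # from policy samples, and its output is turned into the surrogate
            # reward that the policy optimizer maximizes.)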
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            # FEM or game-theoretic apprenticeship learning, respectively.
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        # All three of these 'advanced' IL algorithms use neural network value
        # functions to reduce variance for policy gradient estimates.
        print("\n\tThe **VALUE** function (may have action concatenated):")
        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1. / mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                        min_total_sa=args.min_total_sa,
                                        batch_size=args.sim_batch_size,
                                        max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl,
                              damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer, i.e. {BehavioralCloning,Imitation}Optimizer.
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info,
                  print_header=i % (20 * args.print_freq) == 0,
                  display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1],
                       exdata_N_Doa[:, idx2],
                       color='blue',
                       s=1,
                       label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1],
                       pdata_M_Doa[:, idx2],
                       color='red',
                       s=1,
                       label='apprentice')

            ax.legend()
            plt.show()
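
As the docstring above notes, this main() is normally driven from yaml files that
supply the command-line arguments. Below is a hedged sketch of what such an
invocation of imitate_mj.py might look like for the GAIL ('ga') mode; the file
paths, environment name, and log location are hypothetical, not taken from the
repository.

import subprocess

subprocess.check_call([
    'python', 'imitate_mj.py',
    '--mode', 'ga',
    '--data', 'expert_trajs/trajs_hopper.h5',   # hypothetical expert dataset
    '--limit_trajs', '25',
    '--data_subsamp_freq', '20',
    '--env_name', 'Hopper-v1',                  # hypothetical gym environment id
    '--log', 'training_logs/gail_hopper.h5',    # hypothetical output log
])
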
Esempio n. 22
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--limit_trajs', type=int, required=True)
    parser.add_argument('--data_subsamp_freq', type=int, required=True)
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(),))

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o:o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc(
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1./mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(
                min_num_trajs=-1, min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

    # Run optimizer
    log = nn.TrainingLog(args.log, [('args', argstr)])
    for i in xrange(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info, print_header=i % (20*args.print_freq) == 0, display=i % args.print_freq == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0,1
            range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()), max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max()))
            range2 = (min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()), max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice')

            ax.legend()
            plt.show()
Esempio n. 23
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_traj_len', type=int, default=None)
    parser.add_argument('--env_name', type=str, required=True)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--enable_obsnorm', type=int, default=1)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--use_tanh', type=int, default=0)
    # Optimizer
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    # Sampling
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=100000)
    # Saving stuff
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    if args.tiny_policy or args.use_tanh:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy or --use_tanh is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE

        if args.use_tanh:
            arch = json.loads(args.policy_hidden_spec)
            for layer in arch:
                if layer['type'] == 'nonlin':
                    layer['func'] = 'tanh'
            args.policy_hidden_spec = json.dumps(arch)
        print('Modified architecture:', args.policy_hidden_spec)

    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=bool(args.enable_obsnorm))
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=bool(
                                              args.enable_obsnorm))
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')

    util.header('Policy architecture')
    policy.print_trainable_variables()

    vf = rl.ValueFunc(hidden_spec=args.policy_hidden_spec,
                      obsfeat_space=mdp.obs_space,
                      enable_obsnorm=bool(args.enable_obsnorm),
                      enable_vnorm=True,
                      max_kl=args.vf_max_kl,
                      damping=args.vf_cg_damping,
                      time_scale=1. / mdp.env_spec.timestep_limit,
                      varscope_name='ValueFunc')

    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print('Max traj len:', max_traj_len)
    opt = rl.SamplingPolicyOptimizer(
        mdp=mdp,
        discount=args.discount,
        lam=args.lam,
        policy=policy,
        sim_cfg=SimConfig(min_num_trajs=-1,
                          min_total_sa=args.min_total_sa,
                          batch_size=args.sim_batch_size,
                          max_traj_len=max_traj_len),
        step_func=rl.TRPO(max_kl=args.policy_max_kl,
                          damping=args.policy_cg_damping),
        value_func=vf,
        obsfeat_fn=lambda obs: obs,
    )

    log = nn.TrainingLog(args.log, [('args', argstr)])

    for i in range(args.max_iter):
        iter_info = opt.step()
        log.write(iter_info, print_header=i % 20 == 0)
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            log.write_snapshot(policy, i)
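
A small sketch of the --use_tanh rewrite performed above. The exact layer schema of
SIMPLE_ARCHITECTURE is assumed here (only the 'type' and 'func' fields that the
script touches are used), so the concrete spec below is illustrative.

import json

spec = json.dumps([
    {'type': 'fc', 'n': 64},              # hypothetical hidden layer entry
    {'type': 'nonlin', 'func': 'relu'},
    {'type': 'fc', 'n': 64},
    {'type': 'nonlin', 'func': 'relu'},
])

arch = json.loads(spec)
for layer in arch:
    if layer['type'] == 'nonlin':
        layer['func'] = 'tanh'            # swap every nonlinearity to tanh
spec = json.dumps(arch)
print(spec)
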
Esempio n. 24
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int, default=None) # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)

        if False:
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs:obs,
                cfg=policyopt.SimConfig(
                    min_num_trajs=n, min_total_sa=-1,
                    batch_size=None, max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            returns = []
            lengths = []
            sim = mdp.new_sim()
            for i_traj in xrange(n):
                print i_traj, n
                sim.reset()
                totalr = 0.
                l = 0
                while not sim.done:
                    a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                returns.append(totalr)
                lengths.append(l)
        import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)

        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(mdp.sim_single(
                lambda obs: policy.sample_actions(obs, args.deterministic),
                lambda obs: obs,
                args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)

        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()

        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:
            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name, data=a, chunks=(min(128, a.shape[0]),)+a.shape[1:], compression='gzip', compression_opts=9)

            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B', np.array([len(traj) for traj in trajbatch], dtype=np.int32))

            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []
        while True:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None,:])
                normalized_obs.append(policy.compute_internal_normalized_obsfeat(sim.obs[None,:]))

                a = policy.sample_actions(sim.obs[None,:], args.deterministic)[0][0,:]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()

                if steps % 1000 == 0:
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([tmpraw.mean(0), tmpraw.std(0), tmpnormed.mean(0), tmpnormed.std(0)])
            print 'Steps: %d, return: %.5f' % (steps, totalr)
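
A minimal sketch of reading back a trajectory file written by the --out branch
above; the filename here is hypothetical, and only the datasets created by
write() are assumed to exist.

import json
import h5py
import numpy as np

with h5py.File('trajs_sampled.h5', 'r') as f:   # hypothetical output of --out
    obs_B_T_Do = f['obs_B_T_Do'][...]           # right-padded observations
    a_B_T_Da = f['a_B_T_Da'][...]               # right-padded actions
    r_B_T = f['r_B_T'][...]                     # right-padded rewards
    len_B = f['len_B'][...]                     # true length of each trajectory
    print(json.loads(f.attrs['args']))          # args used when sampling

# Per-trajectory returns, ignoring the zero padding beyond each length
returns = np.array([r_B_T[b, :len_B[b]].sum() for b in range(len(len_B))])
print(returns.mean())
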
Esempio n. 25
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument('--resume_training', action='store_true', help="Resume training from the checkpoint given by --checkpoint. Currently only supports GAIL with an nn policy, reward, and vf")
    parser.add_argument('--checkpoint', type=str, help="Checkpoint to load from if --resume_training is set")
    parser.add_argument('--limit_trajs', type=int, required=True, help="How many expert trajectories to use for training. If None, the full dataset is used.")
    parser.add_argument('--data_subsamp_freq', type=int, required=True, help="A number between 0 and max_traj_len. Rate at which expert trajectories are subsampled when building the dataset of expert (state, action) transitions")
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec', type=str, default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode', choices=OBSNORM_MODES, default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)
    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping', type=float, default=.1)
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl', type=float, default=.01)
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)

    args = parser.parse_args()

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')

    # Load the policy from a checkpoint if provided
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            policy_file = file[:-3]+'_policy.h5'
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(),))

    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data
    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data, args.limit_trajs, args.data_subsamp_freq)
    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    print "\n\n========== Expert data loaded ===========\n\n"

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None  # Behavioral cloning uses neither a reward function nor a value function
        opt = imitation.BehavioralCloningOptimizer(
            mdp, policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o:o,
            ex_obs=exobs_Bstacked_Do, ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs, min_total_sa=-1,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier( #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier')
            # Load the reward network from a checkpoint if provided
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3]+'_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1./mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(args.no_vf) else rl.ValueFunc( #Add resume training functionality
            hidden_spec=args.policy_hidden_spec,
            obsfeat_space=mdp.obs_space,
            enable_obsnorm=args.obsnorm_mode != 'none',
            enable_vnorm=True,
            max_kl=args.vf_max_kl,
            damping=args.vf_cg_damping,
            time_scale=1./mdp.env_spec.timestep_limit,
            varscope_name='ValueFunc')
        if args.resume_training:
            if args.checkpoint is not None:
                file, vf_key = util.split_h5_name(args.checkpoint)
                vf_file = file[:-3]+'_vf.h5'
                vf.load_h5(vf_file, vf_key)

        opt = imitation.ImitationOptimizer(
            mdp=mdp,
            discount=args.discount,
            lam=args.lam,
            policy=policy,
            sim_cfg=policyopt.SimConfig(
                min_num_trajs=-1, min_total_sa=args.min_total_sa,
                batch_size=args.sim_batch_size, max_traj_len=max_traj_len),
            step_func=rl.TRPO(max_kl=args.policy_max_kl, damping=args.policy_cg_damping),
            reward_func=reward,
            value_func=vf,
            policy_obsfeat_fn=lambda obs: obs,
            reward_obsfeat_fn=lambda obs: obs,
            policy_ent_reg=args.policy_ent_reg,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None: reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do), exa_Bstacked_Da)
        if vf is not None: vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

        print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"

    # Trial: make checkpoints for policy, reward and vf
    policy_log = nn.TrainingLog(args.log[:-3]+'_policy.h5', [('args', argstr)])
    reward_log = nn.TrainingLog(args.log[:-3]+'_reward.h5', [('args', argstr)])
    vf_log = nn.TrainingLog(args.log[:-3]+'_vf.h5', [('args', argstr)])
    

    for i in xrange(args.max_iter):
        
        #Optimization step
        iter_info = opt.step() 

        #Log and plot
        #pdb.set_trace()
        policy_log.write(iter_info,
                print_header=i % (20*args.print_freq) == 0, 
                display=i % args.print_freq == 0 ## FIXME: AS remove comment
                )
        reward_log.write(iter_info, 
                print_header=i % (20*args.print_freq) == 0, 
                display=i % args.print_freq == 0 ## FIXME: AS remove comment
                )
        vf_log.write(iter_info, 
                print_header=i % (20*args.print_freq) == 0, 
                display=i % args.print_freq == 0 ## FIXME: AS remove comment
                )
        

        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            reward_log.write_snapshot(reward, i)
            vf_log.write_snapshot(vf, i)

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da], axis=1)
            pdata_M_Doa = np.concatenate([opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked], axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0,1
            range1 = (min(exdata_N_Doa[:,idx1].min(), pdata_M_Doa[:,idx1].min()), max(exdata_N_Doa[:,idx1].max(), pdata_M_Doa[:,idx1].max()))
            range2 = (min(exdata_N_Doa[:,idx2].min(), pdata_M_Doa[:,idx2].min()), max(exdata_N_Doa[:,idx2].max(), pdata_M_Doa[:,idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:,idx1], exdata_N_Doa[:,idx2], color='blue', s=1, label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:,idx1], pdata_M_Doa[:,idx2], color='red', s=1, label='apprentice')

            ax.legend()
            plt.show()
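
A minimal sketch, not part of the original script, of how one of the policy snapshots written by write_snapshot above could be loaded back later; it assumes the 'snapshots/iter%07d' key layout used by the evaluation scripts below and the same util helper module used throughout these examples.

import util  # assumed: the same helper module used throughout these examples

def load_policy_snapshot(policy, log_path, snapshot_iter):
    # Snapshots land in '<log>_policy.h5' under keys like 'snapshots/iter0000020'.
    policy_h5 = log_path[:-3] + '_policy.h5'
    policy_file, policy_key = util.split_h5_name(
        policy_h5 + '/snapshots/iter%07d' % snapshot_iter)
    policy.load_h5(policy_file, policy_key)
    return policy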
Example n. 26
0
def main():
    """
    If we have trained policies and snapshots, I think we can use this to watch
    videos of our agent in action. I don't think I can use this without doing
    some training first. This doesn't do training itself; we need to provide a
    policy, but the h5 file has to also be a directory which contains other
    information (see the yaml files for what I believe are similar examples).

    I'm not sure why we have rl giving us Gaussian policies vs Gibbs policies.
    What's the difference? They should just be functions mapping from states to
    actions?

    After that, it seems like we're just simulating stuff and hopefully a video
    would appear if I can get this to run.
    """
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)

    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])

            print steps
            steps += 1

            if steps >= args.max_steps:
                exit = True
                break
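
The frames written above (img_00000000.png, img_00000001.png, ...) can be stitched into a video offline; a minimal sketch of one way to do that with ffmpeg (the frame rate is a guess and should match the environment's control frequency):

    ffmpeg -framerate 30 -i img_%08d.png -c:v libx264 -pix_fmt yuv420p rollout.mp4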
Example n. 27
0
    def print_trainable_variables(self):
        for v in self.get_trainable_variables():
            util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
        util.header('Total: %d parameters' % (self.get_num_params(), ))
Example n. 28
0
def phase2_eval(spec, specfilename):
    util.header('=== Phase 2: evaluating trained models ===')
    import pandas as pd

    taskname2dset = gen_taskname2outfile(spec)

    # This is where model logs are stored.
    # We will also store the evaluation here.
    checkptdir = os.path.join(spec['options']['storagedir'], spec['options']['checkpt_subdir'])
    print 'Evaluating results in {}'.format(checkptdir)

    results_full_path = os.path.join(checkptdir, spec['options']['results_filename'])
    print 'Will store results in {}'.format(results_full_path)
    if os.path.exists(results_full_path):
        raise RuntimeError('Results file {} already exists'.format(results_full_path))

    # First, pre-determine which evaluations we have to do
    evals_to_do = []
    nonexistent_checkptfiles = []
    for task in spec['tasks']:
        # See how well the algorithms did...
        for alg in spec['training']['algorithms']:
            # ...on various dataset sizes
            for num_trajs in spec['training']['dataset_num_trajs']:
                # for each rerun, for mean / error bars later
                for run in range(spec['training']['runs']):
                    # Make sure the checkpoint file exists (maybe PBS dropped some jobs)
                    strid = 'alg={},task={},num_trajs={},run={}'.format(alg['name'], task['name'], num_trajs, run)
                    checkptfile = os.path.join(checkptdir, strid + '.h5')
                    if not os.path.exists(checkptfile):
                        nonexistent_checkptfiles.append(checkptfile)
                    evals_to_do.append((task, alg, num_trajs, run, checkptfile))

    if nonexistent_checkptfiles:
        print 'Cannot find checkpoint files:\n', '\n'.join(nonexistent_checkptfiles)
        raise RuntimeError

    # Walk through all saved checkpoints
    collected_results = []
    for i_eval, (task, alg, num_trajs, run, checkptfile) in enumerate(evals_to_do):
        util.header('Evaluating run {}/{}: alg={},task={},num_trajs={},run={}'.format(
            i_eval+1, len(evals_to_do), alg['name'], task['name'], num_trajs, run))

        # Load the task's traj dataset to see how well the expert does
        with h5py.File(taskname2dset[task['name']], 'r') as trajf:
            # Expert's true return and traj lengths
            ex_traj_returns = trajf['r_B_T'][...].sum(axis=1)
            ex_traj_lengths = trajf['len_B'][...]

        # Load the checkpoint file
        with pd.HDFStore(checkptfile, 'r') as f:
            log_df = f['log']
            log_df.set_index('iter', inplace=True)

            # Evaluate true return for the learned policy
            if alg['name'] == 'bclone':
                # Pick the policy with the best validation accuracy
                best_snapshot_idx = log_df['valacc'].argmax()
                alg_traj_returns, alg_traj_lengths = eval_snapshot(
                    task['env'], checkptfile, best_snapshot_idx,
                    spec['options']['eval_num_trajs'], deterministic=True)

            elif any(alg['name'].startswith(s) for s in ('ga', 'fem', 'simplex')):
                # Evaluate the last saved snapshot
                snapshot_names = f.root.snapshots._v_children.keys()
                assert all(name.startswith('iter') for name in snapshot_names)
                snapshot_inds = sorted([int(name[len('iter'):]) for name in snapshot_names])
                best_snapshot_idx = snapshot_inds[-1]
                alg_traj_returns, alg_traj_lengths = eval_snapshot(
                    task['env'], checkptfile, best_snapshot_idx,
                    spec['options']['eval_num_trajs'], deterministic=True)

            else:
                raise NotImplementedError('Analysis not implemented for {}'.format(alg['name']))

            collected_results.append({
                # Trial info
                'alg': alg['name'],
                'task': task['name'],
                'num_trajs': num_trajs,
                'run': run,
                # Expert performance
                'ex_traj_returns': ex_traj_returns,
                'ex_traj_lengths': ex_traj_lengths,
                # Learned policy performance
                'alg_traj_returns': alg_traj_returns,
                'alg_traj_lengths': alg_traj_lengths,
            })

    collected_results = pd.DataFrame(collected_results)
    with pd.HDFStore(results_full_path, 'w') as outf:
        outf['results'] = collected_results
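
A minimal sketch, not in the original, of how the results file written above can be read back for analysis; the path and the 'results' key match the code above, while the aggregation shown is only an example:

import numpy as np
import pandas as pd

def load_phase2_results(results_full_path):
    # phase2_eval stores a single DataFrame under the key 'results'.
    with pd.HDFStore(results_full_path, 'r') as f:
        results = f['results']
    # Each row holds arrays of per-trajectory returns; summarize them per row.
    results['alg_mean_return'] = results['alg_traj_returns'].apply(lambda r: float(np.mean(r)))
    results['ex_mean_return'] = results['ex_traj_returns'].apply(lambda r: float(np.mean(r)))
    return results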
Example n. 29
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int,
                        default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    #filenames = os.listdir(args.policy)
    csvf = open(args.policy[:-3] + '.csv', 'w')
    csvwriter = csv.writer(csvf)

    dataf = open(args.policy[:-3] + 'full.csv', 'w')
    datawriter = csv.writer(dataf)
    #csvwriter.writerow(['filename', 'average', 'std'])

    # Load the saved state
    if args.policy.find('reacher') > 0:
        key_iter = 200
    elif args.policy.find('humanoid') > 0:
        key_iter = 1500
    else:
        key_iter = 500

    policy_file, policy_key = util.split_h5_name(args.policy +
                                                 '/snapshots/iter%07d' %
                                                 key_iter)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    if args.policy.find('shared1') > 0:
        sharednet = True
    else:
        sharednet = False

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters

    enable_obsnorm = bool(train_args['enable_obsnorm']
                          ) if 'enable_obsnorm' in train_args else train_args[
                              'obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg,
                                   mdp.obs_space,
                                   mdp.action_space,
                                   'GaussianPolicy',
                                   use_shared_std_network=sharednet)
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg,
                                mdp.obs_space,
                                mdp.action_space,
                                'GibbsPolicy',
                                use_shared_std_network=sharednet)
    policy.load_h5(policy_file, policy_key)

    n = 50
    print 'Evaluating based on {} trajs'.format(n)

    returns = []
    lengths = []
    sim = mdp.new_sim()

    for i_traj in xrange(n):
        iteration = 0
        sim.reset()
        totalr = 0.
        l = 0
        while not sim.done and iteration < args.max_traj_len:
            a = policy.sample_actions(sim.obs[None, :],
                                      bool(args.deterministic))[0][0, :]
            r = sim.step(a)
            totalr += r
            l += 1
            iteration += 1

        print i_traj, n, totalr, iteration
        datawriter.writerow([i_traj, n, totalr, iteration])
        returns.append(totalr)
        lengths.append(l)
    avg, std = np.array(returns).mean(), np.array(returns).std()
    print 'Avg Return: ', avg, 'Std: ', std
    csvwriter.writerow([args.policy, avg, std])
    del policy
    #import IPython; IPython.embed()

    csvf.close()
    dataf.close()
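
A minimal sketch, not part of the original, of reading the per-trajectory CSV written above and recomputing the summary statistics; the column names are assumptions, since datawriter.writerow emits no header row:

import pandas as pd

def summarize_full_csv(path):
    # Columns follow the datawriter.writerow([i_traj, n, totalr, iteration]) call above.
    df = pd.read_csv(path, header=None, names=['i_traj', 'n', 'return', 'length'])
    # ddof=0 matches np.array(returns).std() as used in the script.
    return df['return'].mean(), df['return'].std(ddof=0)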
Example n. 30
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('--eval_only', action='store_true')
    parser.add_argument('--max_traj_len', type=int,
                        default=None)  # only used for saving
    parser.add_argument('--out', type=str, default=None)
    parser.add_argument('--count', type=int, default=None)
    parser.add_argument('--deterministic', action='store_true')
    args = parser.parse_args()

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name']
    print 'Loading environment', env_name
    mdp = rlgymenv.RLGymMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    if args.max_traj_len is None:
        args.max_traj_len = mdp.env_spec.timestep_limit
    util.header('Max traj len is {}'.format(args.max_traj_len))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']
                          ) if 'enable_obsnorm' in train_args else train_args[
                              'obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    if args.eval_only:
        n = 50
        print 'Evaluating based on {} trajs'.format(n)

        if False:  # flip to True to evaluate with the multiprocess simulator (mdp.sim_mp) instead of the loop below
            eval_trajbatch = mdp.sim_mp(
                policy_fn=lambda obs_B_Do: policy.sample_actions(
                    obs_B_Do, args.deterministic),
                obsfeat_fn=lambda obs: obs,
                cfg=policyopt.SimConfig(min_num_trajs=n,
                                        min_total_sa=-1,
                                        batch_size=None,
                                        max_traj_len=args.max_traj_len))
            returns = eval_trajbatch.r.padded(fill=0.).sum(axis=1)
            avgr = eval_trajbatch.r.stacked.mean()
            lengths = np.array([len(traj) for traj in eval_trajbatch])
            ent = policy._compute_actiondist_entropy(
                eval_trajbatch.adist.stacked).mean()
            print 'ret: {} +/- {}'.format(returns.mean(), returns.std())
            print 'avgr: {}'.format(avgr)
            print 'len: {} +/- {}'.format(lengths.mean(), lengths.std())
            print 'ent: {}'.format(ent)
            print returns
        else:
            returns = []
            lengths = []
            sim = mdp.new_sim()

            for i_traj in xrange(n):
                iteration = 0
                sim.reset()
                totalr = 0.
                l = 0
                while not sim.done and iteration < args.max_traj_len:
                    a = policy.sample_actions(sim.obs[None, :],
                                              bool(
                                                  args.deterministic))[0][0, :]
                    r = sim.step(a)
                    totalr += r
                    l += 1
                    iteration += 1

                print i_traj, n, totalr, iteration
                returns.append(totalr)
                lengths.append(l)

            print 'Avg Return: ', np.array(returns).mean()
            print 'Std Return: ', np.array(returns).std()
        #import IPython; IPython.embed()

    elif args.out is not None:
        # Sample trajs and write to file
        print 'Saving traj samples to file: {}'.format(args.out)

        assert not os.path.exists(args.out)
        assert args.count > 0
        # Simulate to create a trajectory batch
        util.header('Sampling {} trajectories of maximum length {}'.format(
            args.count, args.max_traj_len))
        trajs = []
        for i in tqdm.trange(args.count):
            trajs.append(
                mdp.sim_single(
                    lambda obs: policy.sample_actions(obs, args.deterministic),
                    lambda obs: obs, args.max_traj_len))
        trajbatch = policyopt.TrajBatch.FromTrajs(trajs)

        print
        print 'Average return:', trajbatch.r.padded(fill=0.).sum(axis=1).mean()

        # Save the trajs to a file
        with h5py.File(args.out, 'w') as f:

            def write(name, a):
                # chunks of 128 trajs each
                f.create_dataset(name,
                                 data=a,
                                 chunks=(min(128, a.shape[0]), ) + a.shape[1:],
                                 compression='gzip',
                                 compression_opts=9)

            # Right-padded trajectory data
            write('obs_B_T_Do', trajbatch.obs.padded(fill=0.))
            write('a_B_T_Da', trajbatch.a.padded(fill=0.))
            write('r_B_T', trajbatch.r.padded(fill=0.))
            # Trajectory lengths
            write('len_B',
                  np.array([len(traj) for traj in trajbatch], dtype=np.int32))

            # Also save args to this script
            argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
            f.attrs['args'] = argstr

    else:
        # Animate
        sim = mdp.new_sim()
        raw_obs, normalized_obs = [], []

        tret_list = []
        iteration = 0
        while iteration < 50:
            sim.reset()
            totalr = 0.
            steps = 0
            while not sim.done:
                raw_obs.append(sim.obs[None, :])
                normalized_obs.append(
                    policy.compute_internal_normalized_obsfeat(
                        sim.obs[None, :]))

                a = policy.sample_actions(sim.obs[None, :],
                                          args.deterministic)[0][0, :]
                r = sim.step(a)
                totalr += r
                steps += 1
                sim.draw()

                if steps % args.max_traj_len == 0:
                    tmpraw = np.concatenate(raw_obs, axis=0)
                    tmpnormed = np.concatenate(normalized_obs, axis=0)
                    print 'raw mean, raw std, normed mean, normed std'
                    print np.stack([
                        tmpraw.mean(0),
                        tmpraw.std(0),
                        tmpnormed.mean(0),
                        tmpnormed.std(0)
                    ])
                    break
            print 'Steps: %d, return: %.5f' % (steps, totalr)
            tret_list.append(totalr)
            iteration += 1

        print 'Avg Return: ', np.array(tret_list).mean()
        print 'Std Return: ', np.array(tret_list).std()
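
A minimal sketch, not from the original script, of reading back a trajectory file saved by the --out branch above; the dataset names match the write() calls in that branch:

import h5py
import numpy as np

def load_saved_trajs(path):
    with h5py.File(path, 'r') as f:
        obs_B_T_Do = f['obs_B_T_Do'][...]
        a_B_T_Da = f['a_B_T_Da'][...]
        r_B_T = f['r_B_T'][...]
        len_B = f['len_B'][...]
    # Arrays are right-padded with zeros; use the per-trajectory lengths to mask the padding.
    returns = np.array([r_B_T[b, :len_B[b]].sum() for b in range(len(len_B))])
    return obs_B_T_Do, a_B_T_Da, r_B_T, len_B, returns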
Example n. 32
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    # MDP options
    parser.add_argument('policy', type=str)
    parser.add_argument('output_dir', type=str)
    parser.add_argument('--deterministic', default=1, type=int)
    parser.add_argument('--max_steps', type=int, required=True)
    parser.add_argument('--env_name', type=str, default=None)
    args = parser.parse_args()

    util.mkdir_p(args.output_dir)
    assert not os.listdir(args.output_dir), '%s is not empty' % args.output_dir
    print 'Writing to', args.output_dir

    # Load the saved state
    policy_file, policy_key = util.split_h5_name(args.policy)
    print 'Loading policy parameters from %s in %s' % (policy_key, policy_file)
    with h5py.File(policy_file, 'r') as f:
        train_args = json.loads(f.attrs['args'])
        dset = f[policy_key]
        import pprint
        pprint.pprint(dict(dset.attrs))

    # Initialize the MDP
    env_name = train_args['env_name'] if args.env_name is None else args.env_name
    print 'Loading environment', env_name
    mdp = rllabenv.RLLabMDP(env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' % (mdp.obs_space.dim, mdp.action_space.storage_size))

    util.header('Max steps is {}'.format(args.max_steps))

    # Initialize the policy and load its parameters
    enable_obsnorm = bool(train_args['enable_obsnorm']) if 'enable_obsnorm' in train_args else train_args['obsnorm_mode'] != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GaussianPolicy')
    else:
        policy_cfg = rl.GibbsPolicyConfig(
            hidden_spec=train_args['policy_hidden_spec'],
            enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space, 'GibbsPolicy')
    policy.load_h5(policy_file, policy_key)

    # Animate
    sim = mdp.new_sim()
    steps = 0
    exit = False
    while not exit:
        sim.reset()
        while not sim.done:
            a = policy.sample_actions(sim.obs[None,:], bool(args.deterministic))[0][0,:]
            sim.step(a)
            sim.draw()
            viewer = sim.env.viewer
            data, w, h = viewer.get_image()
            image = np.fromstring(data, dtype='uint8').reshape(h, w, 3)[::-1,:,:]
            cv2.imwrite('%s/img_%08d.png' % (args.output_dir, steps), image[:,:,::-1])

            print steps
            steps += 1

            if steps >= args.max_steps:
                exit = True
                break
Example n. 33
0
def main():
    np.set_printoptions(suppress=True, precision=5, linewidth=1000)

    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=MODES, required=True)
    # Expert dataset
    parser.add_argument('--data', type=str, required=True)
    parser.add_argument(
        '--resume_training',
        action='store_true',
        help=
        "Resume training from the checkpoint given by --checkpoint. Currently only supports GAIL with an nn policy, reward and value function"
    )
    parser.add_argument(
        '--checkpoint',
        type=str,
        help="Checkpoint to load from when --resume_training is set")
    parser.add_argument(
        '--limit_trajs',
        type=int,
        required=True,
        help=
        "How many expert trajectories to use for training. If None, the full dataset is used."
    )
    parser.add_argument(
        '--data_subsamp_freq',
        type=int,
        required=True,
        help=
        "A number between 0 and max_traj_len: the subsampling rate applied to expert trajectories when building the dataset of expert (state, action) transitions"
    )
    # MDP options
    parser.add_argument('--env_name', type=str, required=True)
    parser.add_argument('--max_traj_len', type=int, default=None)
    # Policy architecture
    parser.add_argument('--policy_hidden_spec',
                        type=str,
                        default=SIMPLE_ARCHITECTURE)
    parser.add_argument('--tiny_policy', action='store_true')
    parser.add_argument('--obsnorm_mode',
                        choices=OBSNORM_MODES,
                        default='expertdata')
    # Behavioral cloning optimizer
    parser.add_argument('--bclone_lr', type=float, default=1e-3)
    parser.add_argument('--bclone_batch_size', type=int, default=128)
    # parser.add_argument('--bclone_eval_nsa', type=int, default=128*100)
    parser.add_argument('--bclone_eval_ntrajs', type=int, default=20)
    parser.add_argument('--bclone_eval_freq', type=int, default=1000)
    parser.add_argument('--bclone_train_frac', type=float, default=.7)
    # Imitation optimizer
    parser.add_argument('--discount', type=float, default=.995)

    parser.add_argument('--lam', type=float, default=.97)
    parser.add_argument('--max_iter', type=int, default=1000000)
    parser.add_argument('--policy_max_kl', type=float, default=.01)
    parser.add_argument('--policy_cg_damping',
                        type=float,
                        default=.1,
                        help="TRPO parameter")
    parser.add_argument('--no_vf', type=int, default=0)
    parser.add_argument('--vf_max_kl', type=float, default=.01)
    parser.add_argument('--vf_cg_damping', type=float, default=.1)
    parser.add_argument('--policy_ent_reg', type=float, default=0.)
    parser.add_argument('--reward_type', type=str, default='nn')
    # parser.add_argument('--linear_reward_bin_features', type=int, default=0)
    parser.add_argument('--reward_max_kl',
                        type=float,
                        default=.01,
                        help="TRPO parameter")
    parser.add_argument('--reward_lr', type=float, default=.01)
    parser.add_argument('--reward_steps', type=int, default=1)
    parser.add_argument('--reward_ent_reg_weight', type=float, default=.001)
    parser.add_argument('--reward_include_time', type=int, default=0)
    parser.add_argument('--sim_batch_size', type=int, default=None)
    parser.add_argument('--min_total_sa', type=int, default=50000)
    parser.add_argument('--favor_zero_expert_reward', type=int, default=0)
    # Saving stuff
    parser.add_argument('--print_freq', type=int, default=1)
    parser.add_argument('--save_freq', type=int, default=20)
    parser.add_argument('--plot_freq', type=int, default=0)
    parser.add_argument('--log', type=str, required=False)
    # CVaR parameters
    parser.add_argument('--useCVaR', action='store_true')
    parser.add_argument('--CVaR_alpha', type=float, default=0.9)
    parser.add_argument('--CVaR_beta', type=float, default=0.)
    parser.add_argument('--CVaR_lr', type=float, default=0.01)
    # NOTE: --disc_CVaR_weight is unused and should be removed
    parser.add_argument(
        '--disc_CVaR_weight',
        type=float,
        default=1.,
        help=
        "Weight given to CVaR loss for the discriminator. Added by Anirban for smooth convergence."
    )
    parser.add_argument('--CVaR_Lambda_not_trainable', action='store_false')
    parser.add_argument('--CVaR_Lambda_val_if_not_trainable',
                        type=float,
                        default=0.5)
    #Filtering expert trajectories
    parser.add_argument('--use_expert_traj_filtering', action='store_true')
    parser.add_argument('--expert_traj_filt_percentile_threshold',
                        type=float,
                        default=20)
    # Additive state prior formulation
    parser.add_argument('--use_additiveStatePrior', action='store_true')
    parser.add_argument('--additiveStatePrior_weight', type=float, default=1.)
    parser.add_argument('--n_gmm_components', type=int, default=5)
    parser.add_argument('--cov_type_gmm', type=str, default='diag')
    parser.add_argument('--familiarity_alpha', type=float, default=10000000)
    parser.add_argument('--familiarity_beta', type=float, default=100)

    parser.add_argument('--kickThreshold_percentile',
                        type=float,
                        default=100.0)
    parser.add_argument('--appendFlag', action='store_true')

    args = parser.parse_args()
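
    # Example invocation (a sketch; the script name, dataset path and environment
    # name below are placeholders, not taken from this file):
    #   python imitate_mj.py --mode ga --env_name Hopper-v1 --data expert_trajs.h5 \
    #       --limit_trajs 25 --data_subsamp_freq 20 --log runs/gail_hopper.h5
    # To continue a previous run, add:
    #   --resume_training --checkpoint runs/gail_hopper.h5/snapshots/iter0000100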

    if args.useCVaR:
        print ">>>>>>>>>>>>>>>>>>> TRAINING RAIL <<<<<<<<<<<<<<<<<<<"
    elif args.use_additiveStatePrior:
        print ">>>>>>>>>>>>>>>>>>> USING ADDITIVE STATE PRIOR <<<<<<<<<<<<<<<<<<<"
    else:
        print ">>>>>>>>> TRAINING GAIL <<<<<<<<<<"

    # Initialize the MDP
    if args.tiny_policy:
        assert args.policy_hidden_spec == SIMPLE_ARCHITECTURE, 'policy_hidden_spec must remain unspecified if --tiny_policy is set'
        args.policy_hidden_spec = TINY_ARCHITECTURE
    argstr = json.dumps(vars(args), separators=(',', ':'), indent=2)
    print(argstr)
    print "\n\n========== Policy network specifications loaded ===========\n\n"

    mdp = rlgymenv.RLGymMDP(args.env_name)
    util.header('MDP observation space, action space sizes: %d, %d\n' %
                (mdp.obs_space.dim, mdp.action_space.storage_size))

    print "\n\n========== MDP initialized ===========\n\n"

    # Initialize the policy
    enable_obsnorm = args.obsnorm_mode != 'none'
    if isinstance(mdp.action_space, policyopt.ContinuousSpace):
        policy_cfg = rl.GaussianPolicyConfig(
            hidden_spec=args.policy_hidden_spec,
            min_stdev=0.,
            init_logstdev=0.,
            enable_obsnorm=enable_obsnorm)
        policy = rl.GaussianPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                   'GaussianPolicy', args.useCVaR)
    else:
        policy_cfg = rl.GibbsPolicyConfig(hidden_spec=args.policy_hidden_spec,
                                          enable_obsnorm=enable_obsnorm)
        policy = rl.GibbsPolicy(policy_cfg, mdp.obs_space, mdp.action_space,
                                'GibbsPolicy', args.useCVaR)

    offset = 0
    # Load the policy from a checkpoint if one is provided
    if args.resume_training:
        if args.checkpoint is not None:
            file, policy_key = util.split_h5_name(args.checkpoint)
            offset = int(policy_key.split('/')[-1][4:])
            print '\n**************************************************'
            print 'Resuming from checkpoint : %d of %s' % (offset, file)
            print '**************************************************\n'

            if args.appendFlag and file != args.log:
                raise RuntimeError(
                    'Log file and checkpoint should have the same name if appendFlag is on. %s vs %s'
                    % (file, args.log))

            policy_file = file[:-3] + '_policy.h5'  # Because we're naming the file as *_policy.h5 itself
            policy.load_h5(policy_file, policy_key)

    util.header('Policy architecture')
    for v in policy.get_trainable_variables():
        util.header('- %s (%d parameters)' % (v.name, v.get_value().size))
    util.header('Total: %d parameters' % (policy.get_num_params(), ))

    print "\n\n========== Policy initialized ===========\n\n"

    # Load expert data

    exobs_Bstacked_Do, exa_Bstacked_Da, ext_Bstacked = load_dataset(
        args.data,
        args.limit_trajs,
        args.data_subsamp_freq,
        len_filtering=args.use_expert_traj_filtering,
        len_filter_threshold=args.expert_traj_filt_percentile_threshold)

    assert exobs_Bstacked_Do.shape[1] == mdp.obs_space.storage_size
    assert exa_Bstacked_Da.shape[1] == mdp.action_space.storage_size
    assert ext_Bstacked.ndim == 1

    print "\n\n========== Expert data loaded ===========\n\n"

    print '\n==================== Hyperparams ===================='
    print '\texpert_traj_filt_percentile_threshold = %f' % args.expert_traj_filt_percentile_threshold
    print '\tfamiliarity_alpha = %f' % args.familiarity_alpha
    print '\tfamiliarity_beta = %f' % args.familiarity_beta
    print '\tkickThreshold_percentile = %f' % args.kickThreshold_percentile
    print '==============================================\n'

    # Start optimization
    max_traj_len = args.max_traj_len if args.max_traj_len is not None else mdp.env_spec.timestep_limit
    print 'Max traj len:', max_traj_len

    if args.mode == 'bclone':
        # For behavioral cloning, only print output when evaluating
        args.print_freq = args.bclone_eval_freq
        args.save_freq = args.bclone_eval_freq

        reward, vf = None, None  # The reward and value functions play no role in behavioral cloning
        opt = imitation.BehavioralCloningOptimizer(
            mdp,
            policy,
            lr=args.bclone_lr,
            batch_size=args.bclone_batch_size,
            obsfeat_fn=lambda o: o,
            ex_obs=exobs_Bstacked_Do,
            ex_a=exa_Bstacked_Da,
            eval_sim_cfg=policyopt.SimConfig(
                min_num_trajs=args.bclone_eval_ntrajs,
                min_total_sa=-1,
                batch_size=args.sim_batch_size,
                max_traj_len=max_traj_len),
            eval_freq=args.bclone_eval_freq,
            train_frac=args.bclone_train_frac)

        print "======= Behavioral Cloning optimizer initialized ======="

    elif args.mode == 'ga':
        if args.reward_type == 'nn':
            reward = imitation.TransitionClassifier(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                max_kl=args.reward_max_kl,
                adam_lr=args.reward_lr,
                adam_steps=args.reward_steps,
                ent_reg_weight=args.reward_ent_reg_weight,
                enable_inputnorm=True,
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                varscope_name='TransitionClassifier',
                useCVaR=args.useCVaR,
                CVaR_loss_weightage=args.disc_CVaR_weight)
            # Load the reward network from a checkpoint if one is provided
            if args.resume_training:
                if args.checkpoint is not None:
                    file, reward_key = util.split_h5_name(args.checkpoint)
                    reward_file = file[:-3] + '_reward.h5'
                    print reward_file
                    reward.load_h5(reward_file, reward_key)

        elif args.reward_type in ['l2ball', 'simplex']:
            reward = imitation.LinearReward(
                obsfeat_space=mdp.obs_space,
                action_space=mdp.action_space,
                mode=args.reward_type,
                enable_inputnorm=True,
                favor_zero_expert_reward=bool(args.favor_zero_expert_reward),
                include_time=bool(args.reward_include_time),
                time_scale=1. / mdp.env_spec.timestep_limit,
                exobs_Bex_Do=exobs_Bstacked_Do,
                exa_Bex_Da=exa_Bstacked_Da,
                ext_Bex=ext_Bstacked)
        else:
            raise NotImplementedError(args.reward_type)

        vf = None if bool(
            args.no_vf) else rl.ValueFunc(  #Add resume training functionality
                hidden_spec=args.policy_hidden_spec,
                obsfeat_space=mdp.obs_space,
                enable_obsnorm=args.obsnorm_mode != 'none',
                enable_vnorm=True,
                max_kl=args.vf_max_kl,
                damping=args.vf_cg_damping,
                time_scale=1. / mdp.env_spec.timestep_limit,
                varscope_name='ValueFunc')
        if args.resume_training and args.checkpoint is not None and vf is not None:
            file, vf_key = util.split_h5_name(args.checkpoint)
            vf_file = file[:-3] + '_vf.h5'
            vf.load_h5(vf_file, vf_key)
        if args.useCVaR:
            opt = imitation.ImitationOptimizer_CVaR(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=True),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked,
                #For CVaR
                CVaR_alpha=args.CVaR_alpha,
                CVaR_beta=args.CVaR_beta,
                CVaR_lr=args.CVaR_lr,
                CVaR_Lambda_trainable=args.CVaR_Lambda_not_trainable,
                CVaR_Lambda_val_if_not_trainable=args.
                CVaR_Lambda_val_if_not_trainable,
                offset=offset + 1)
        elif args.use_additiveStatePrior:
            opt = imitation.ImitationOptimizer_additiveStatePrior(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=False),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked,
                n_gmm_components=args.n_gmm_components,
                cov_type_gmm=args.cov_type_gmm,
                additiveStatePrior_weight=args.additiveStatePrior_weight,
                alpha=args.familiarity_alpha,
                beta=args.familiarity_beta,
                kickThreshold_percentile=args.kickThreshold_percentile,
                offset=offset + 1)
        else:
            opt = imitation.ImitationOptimizer(
                mdp=mdp,
                discount=args.discount,
                lam=args.lam,
                policy=policy,
                sim_cfg=policyopt.SimConfig(min_num_trajs=-1,
                                            min_total_sa=args.min_total_sa,
                                            batch_size=args.sim_batch_size,
                                            max_traj_len=max_traj_len),
                step_func=rl.TRPO(max_kl=args.policy_max_kl,
                                  damping=args.policy_cg_damping,
                                  useCVaR=False),
                reward_func=reward,
                value_func=vf,
                policy_obsfeat_fn=lambda obs: obs,
                reward_obsfeat_fn=lambda obs: obs,
                policy_ent_reg=args.policy_ent_reg,
                ex_obs=exobs_Bstacked_Do,
                ex_a=exa_Bstacked_Da,
                ex_t=ext_Bstacked)

    # Set observation normalization
    if args.obsnorm_mode == 'expertdata':
        policy.update_obsnorm(exobs_Bstacked_Do)
        if reward is not None:
            reward.update_inputnorm(opt.reward_obsfeat_fn(exobs_Bstacked_Do),
                                    exa_Bstacked_Da)
        if vf is not None:
            vf.update_obsnorm(opt.policy_obsfeat_fn(exobs_Bstacked_Do))

        print "======== Observation normalization done ========"

    # Run optimizer
    print "======== Optimization begins ========"

    # Trial: make checkpoints for policy, reward and vf
    policy_log = nn.TrainingLog(args.log[:-3] + '_policy.h5',
                                [('args', argstr)], args.appendFlag)
    reward_log = nn.TrainingLog(args.log[:-3] + '_reward.h5',
                                [('args', argstr)], args.appendFlag)
    vf_log = nn.TrainingLog(args.log[:-3] + '_vf.h5', [('args', argstr)],
                            args.appendFlag)

    kickStatesData = []

    print '\n**************************************'
    print 'Running iterations from %d to %d' % (offset + 1, args.max_iter)

    for i in xrange(offset + 1, args.max_iter):
        # for i in range(1): #FIXME: this is just for studying the insides of the training algo

        # All of the training (the optimization step) happens in the next line
        # pdb.set_trace()
        iter_info = opt.step(
            i, kickStatesData) if args.use_additiveStatePrior else opt.step(i)

        # ========= The rest is logging, checkpointing and plotting =========

        #Log and plot
        #pdb.set_trace()
        policy_log.write(
            iter_info,
            print_header=i % (20 * args.print_freq) == 0,
            # display=False
            display=i % args.print_freq == 0  ## FIXME: AS remove comment
        )
        # reward_log.write(iter_info,
        #         print_header=i % (20*args.print_freq) == 0,
        #         display=False
        #         # display=i % args.print_freq == 0 ## FIXME: AS remove comment
        #         )
        # vf_log.write(iter_info,
        #         print_header=i % (20*args.print_freq) == 0,
        #         display=False
        #         # display=i % args.print_freq == 0 ## FIXME: AS remove comment
        #         )

        #FIXME: problem running this on 211 and 138. No problem on 151
        if args.save_freq != 0 and i % args.save_freq == 0 and args.log is not None:
            policy_log.write_snapshot(policy, i)
            reward_log.write_snapshot(reward, i)
            vf_log.write_snapshot(vf, i)

            # analysisFile=open(args.log[:-3]+'_kickedStates' + str(i) + '.pkl', 'wb')
            analysisFile = open(args.log[:-3] + '_kickedStates.pkl', 'wb')
            pkl.dump({'kickStatesData': kickStatesData},
                     analysisFile,
                     protocol=2)
            analysisFile.close()

        if args.plot_freq != 0 and i % args.plot_freq == 0:
            exdata_N_Doa = np.concatenate([exobs_Bstacked_Do, exa_Bstacked_Da],
                                          axis=1)
            pdata_M_Doa = np.concatenate(
                [opt.last_sampbatch.obs.stacked, opt.last_sampbatch.a.stacked],
                axis=1)

            # Plot reward
            import matplotlib.pyplot as plt
            _, ax = plt.subplots()
            idx1, idx2 = 0, 1
            range1 = (min(exdata_N_Doa[:, idx1].min(),
                          pdata_M_Doa[:, idx1].min()),
                      max(exdata_N_Doa[:, idx1].max(),
                          pdata_M_Doa[:, idx1].max()))
            range2 = (min(exdata_N_Doa[:, idx2].min(),
                          pdata_M_Doa[:, idx2].min()),
                      max(exdata_N_Doa[:, idx2].max(),
                          pdata_M_Doa[:, idx2].max()))
            reward.plot(ax, idx1, idx2, range1, range2, n=100)

            # Plot expert data
            ax.scatter(exdata_N_Doa[:, idx1],
                       exdata_N_Doa[:, idx2],
                       color='blue',
                       s=1,
                       label='expert')

            # Plot policy samples
            ax.scatter(pdata_M_Doa[:, idx1],
                       pdata_M_Doa[:, idx2],
                       color='red',
                       s=1,
                       label='apprentice')

            ax.legend()
            plt.show()