Example #1
class PendulumPID(object):

    def __init__(self, Kp, Kd, Kp_swing, **kwargs):
        self._env = TfEnv(GymEnv('Pendulum-v0',
                                 record_video=kwargs.get("record_video", False),
                                 record_log=kwargs.get("record_log", False)))
        # self._env = PendulumEnv()
        config_path = kwargs.get("config_path", None)
        config = {}
        if config_path is not None:
            config = read_yaml_file(config_path)
        
        self._alpha_tol = kwargs.get("stability_region", config.get("stablity-active-region", 0.0))
        self._Kmag = kwargs.get("swing_up_torque", config.get("max-swing-up-torque", 1.0))
        self._dt = kwargs.get("time_delta", config.get("time-delta", 1.0))
        self._length = kwargs.get("length", config.get("length", 1.0))
        self._mass = kwargs.get("mass", config.get("mass", 1.0))
        self._g = kwargs.get("g_val", config.get("gravitation", 10.0))
        self._target = kwargs.get("target", config.get("target-angle", 0.0))
        
        self.reset(Kp, Kd, Kp_swing, Ki=kwargs.get('Ki', 0.0))
        print("PID init")
        
    def reset(self, Kp, Kd, Kp_swing, Ki=0.0):
        self._int, self._diff, self._Ki = 0.0, 0.0, Ki
        self._Kp, self._Kd, self._Kp_swing = Kp, Kd, Kp_swing
        self._last_obs = self._env.reset()
        self._alpha_dot_prev = self._last_obs[2]
    
    @property
    def env(self):
        return self._env

    def step(self):
        Ip = self._length / 2.0
        # Pendulum-v0 observations are [cos(theta), sin(theta), theta_dot];
        # arccos recovers |theta| in [0, pi].
        alpha, alpha_dot = np.arccos(self._last_obs[0]), self._last_obs[2]
        # alpha_dotdot = (alpha_dot - self._alpha_dot_prev)/self._dt
        self._alpha_dot_prev = alpha_dot

        PE = self._mass * self._g * Ip * np.sin(alpha)
        KE = self._mass * alpha_dot**2 * self._length**2 * 0.5
        MAX_PE = self._mass * self._g * Ip
        # INR = self._mass * alpha_dotdot * Ip**2
        # Kp_swing = 0.02

        if abs(alpha - self._target) > self._alpha_tol:  # swing up
            # TODO: the units do not agree; find a solution.
            new_tau = self._Kp_swing * np.sign(alpha_dot) * (MAX_PE - PE)
            # new_tau = np.sign(alpha_dot) * (self._Kmag / (1 + np.exp(-self._Kp_swing * alpha)))
            error = alpha
        else:                                            # stabilization PID
            error = np.sign(alpha_dot) * (KE + PE)
            self._int += error * self._dt
            _Cd = (error - self._diff) / self._dt if self._dt > 0 else 0
            new_tau = self._Kp * error + self._Ki * self._int + self._Kd * _Cd
            self._diff = error
        self._last_obs, r, d, info = self._env.step([new_tau])
        info.update({"error": error, "action": [new_tau]})

        return self._last_obs, r, d, info
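
A minimal usage sketch for the controller above, assuming the class definition is in scope (the gain values are illustrative, not tuned):

pid = PendulumPID(Kp=8.0, Kd=1.5, Kp_swing=0.03)
for _ in range(500):
    obs, reward, done, info = pid.step()
    pid.env.render()
    if done:
        break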
Example #2
def test_pointmaze(policy):
    test_env = TfEnv(CustomGymEnv('PointMazeRight-v0'))
    for i in range(5):
        done = False
        s = test_env.reset()
        reward = 0
        steps = 0
        while not done:
            # policy is assumed to be an |S| x |A| table of action probabilities
            a = np.random.choice(policy.shape[1], p=policy[s])
            s_, r, done, _ = test_env.step(a)
            s = s_
            steps += 1
            reward += r
        print('Average per-step reward is {}'.format(reward / steps))
Example #3
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to snapshot file')
    parser.add_argument('--pixel', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--multistep', action='store_true')
    parser.add_argument('--step_size', type=int, default=5)
    parser.add_argument('--zero_action', action='store_true')
    parser.add_argument('--gt_action', action='store_true')

    args = parser.parse_args()

    with tf.Session() as sess:
        data = joblib.load(args.file)
        _encoder = data['encoder']
        _inverse_model = data['inverse_model']
        _forward_model = data['forward_model']

        if args.pixel:
            env = TfEnv(normalize(env=GymEnv(PIXEL_ENV, record_video=False,
                                             log_dir='/tmp/gym_test', record_log=False)))
        else:
            env = TfEnv(normalize(env=GymEnv(STATE_ENV, record_video=False,
                                             log_dir='/tmp/gym_test', record_log=False)))

        # Rebuild models
        act_space = env.action_space
        obs_space = env.observation_space
        qpos_dim = env.wrapped_env._wrapped_env.env.env.init_qpos.shape[0]

        s1_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        s2_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        a_ph = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        
        clipped_a = tf.clip_by_value(a_ph, -1.0, 1.0)
        encoder1 = _encoder.get_weight_tied_copy(observation_input=s1_ph)
        encoder2 = _encoder.get_weight_tied_copy(observation_input=s2_ph)
        inverse_model = _inverse_model.get_weight_tied_copy(feature_input1=encoder1.output, 
                                                            feature_input2=encoder2.output)
        forward_model = _forward_model.get_weight_tied_copy(feature_input=encoder1.output,
                                                            action_input=clipped_a)

        # Load test data
        dataset_paths, datasets = load_dataset(args.pixel, args.multistep)


        env.reset()
        for dataset_path, data_dict in zip(dataset_paths, datasets):
            
            ef_xyz_pred_diff = []
            ef_xyz_diff = []
            action_diff = []
            qpos_diff = []
            qpos_pred_diff = []
            if args.multistep:
                print ("===== Using multisteping testing, stepsize: %d" % args.step_size)
            
            print ("========================================")
            print ("===== Evaluating inverse model on %s" % dataset_path)
            # states = data_dict['states']
            # next_states = data_dict['next_states']
            # obs = data_dict['obs']
            # next_obs = data_dict['next_obs']
            # actions = data_dict['actions']
            if args.multistep:
                states, next_states, obs, next_obs, actions = load_data_multistep(data_dict, pixel=args.pixel, step_size=args.step_size)
            else:
                states, next_states, obs, next_obs, actions = load_data(data_dict, args.pixel)
            actions = np.clip(actions, -1.0, 1.0)

            if args.render:
                fig, [ax1, ax2, ax3] = plt.subplots(1, 3)
                plt.ion()
                ax1.set_title("t=0")
                ax2.set_title("t=1 after action")
                ax3.set_title("t=1 after predicted action")


            for state, next_state, ob, next_ob, action in zip(states, next_states, obs, next_obs, actions):
                # print (state.shape)
                if args.multistep:
                    # Set state, get real img1
                    set_state(env, state[0], qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)

                    o = ob[0]
                    # next_o = next_ob[0]
                    next_o = next_ob[-1]
                    for _ in range(args.step_size):
                        # Get predicted action from inverse model
                        pred_action = sess.run(inverse_model.output, {
                            s1_ph: [o],
                            s2_ph: [next_o],
                        })[0]
                        if args.gt_action:
                            pred_action = action[_]

                        if args.zero_action:
                            pred_action = np.zeros_like(action[_])

                        # ob = next_o
                        # next_o = next_ob[_]

                        # Step predicted action
                        o, r, d, env_info = env.step(pred_action)

                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)


                    # Get real img2 and real ef position
                    set_state(env, next_state[args.step_size-1], qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)
                    if args.render:
                        o_img = get_render_img(env)


                else:
                    # Set state, get real img1
                    # import pdb; pdb.set_trace()
                    set_state(env, state, qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    # print ("Real: ", _end_ef_pos)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)

                    # Get predicted action from inverse model
                    pred_action = sess.run(inverse_model.output, {
                        s1_ph: [ob],
                        s2_ph: [next_ob],
                    })[0]

                    if args.zero_action:
                        pred_action = np.zeros_like(pred_action)
                    if args.gt_action:
                        pred_action = action


                    # Step action
                    env.step(pred_action)

                    # print (np.linalg.norm(next_state - get_state(env)))

                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    # print ("Sim pos", s_end_ef_pos)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)

                    # Get real img2 and real ef position
                    set_state(env, next_state, qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)

                    # print (np.linalg.norm(s_qpos - o_qpos))
                    
                    # print (np.linalg.norm(o_end_ef_pos - s_end_ef_pos))

                if args.render:
                    o_img = get_render_img(env)
                    ax1.imshow(img)
                    ax2.imshow(o_img)
                    ax3.imshow(s_img)
                    plt.show()
                    plt.pause(0.1)

                    # print ("Actual action: ", action)
                    # print ("Predicted action: ", pred_action)

                ef_xyz_pred_diff.append(np.linalg.norm(o_end_ef_pos - s_end_ef_pos))
                ef_xyz_diff.append(np.linalg.norm(o_end_ef_pos - _end_ef_pos))
                qpos_pred_diff.append(np.linalg.norm(o_qpos - s_qpos))
                qpos_diff.append(np.linalg.norm(o_qpos - _qpos))
                
                action_diff.append(((action - pred_action)**2).mean())

            # print ("===== 1. real s1, real s2 end effector position L2 distance       mean:  %.5f, std: %.5f" % (np.mean(ef_xyz_diff), np.std(ef_xyz_diff)))
            # print ("===== 2. real s2, sim  s2 end effector position L2 distance       mean:  %.5f, std: %.5f" % (np.mean(ef_xyz_pred_diff), np.std(ef_xyz_pred_diff)))
            # print ("===== 3. real s1, real s2 joint position        L2 distance       mean:  %.5f, std: %.5f" % (np.mean(qpos_diff), np.std(qpos_diff)))
            # print ("===== 4. real s2, sim  s2 joint position        L2 distance       mean:  %.5f, std: %.5f" % (np.mean(qpos_pred_diff), np.std(qpos_pred_diff)))
            # if not args.multistep:
            #     print ("===== 5. action - pred_action (per dim)      sq L2 distance       mean:  %.5f, std: %.5f" % (np.mean(action_diff), np.std(action_diff)))
            # print ("===== 6. action                                                   mean:  %.5f, std: %.5f" % (np.mean(np.abs(actions).mean(axis=1)), np.std(actions.mean(axis=1))))

            print ("===== 1. real s1, real s2 end effector position L2 distance       med:  %.5f, std: %.5f" % (np.median(ef_xyz_diff), np.std(ef_xyz_diff)))
            print ("===== 2. real s2, sim  s2 end effector position L2 distance       med:  %.5f, std: %.5f" % (np.median(ef_xyz_pred_diff), np.std(ef_xyz_pred_diff)))
            print ("===== 3. real s1, real s2 joint position        L2 distance       med:  %.5f, std: %.5f" % (np.median(qpos_diff), np.std(qpos_diff)))
            print ("===== 4. real s2, sim  s2 joint position        L2 distance       med:  %.5f, std: %.5f" % (np.median(qpos_pred_diff), np.std(qpos_pred_diff)))
            if not args.multistep:
                print ("===== 5. action - pred_action (per dim)      sq L2 distance       med:  %.5f, std: %.5f" % (np.median(action_diff), np.std(action_diff)))
            print ("===== 6. action                                                   med:  %.5f, std: %.5f" % (np.median(np.abs(np.median(actions, axis=1))), np.std(np.median(actions, axis=1))))
Example #4
def main():

    name = 'Exp180512_simple_baseline_striker'

    EPI.init('striker', num_of_params=2)

    sess = tf.Session()
    sess.__enter__()
    algo = pickle.load(open(os.getcwd() + "/" + name + "/pickle.p", "rb"))

    env = TfEnv(normalize(GymEnv('StrikerAvg-v0')))
    core_env = env.wrapped_env.wrapped_env.env.env

    target_sample_size = 1000
    egreedy = 0.2

    data = []
    rollouts = []
    while len(rollouts) < target_sample_size:
        observation = env.reset()
        core_env.change_env(np.array([0.1, 0.1]))
        old_ball_pos = core_env.model.data.qpos[-9:-7]
        for i in range(200):
            if np.random.rand() < egreedy:
                action = env.action_space.sample()
            else:
                action, d = algo.policy.get_action(observation)
            ball_pos = core_env.model.data.qpos[-9:-7]
            if np.linalg.norm(ball_pos - old_ball_pos) > 0.005:
                full_state = core_env.state_vector()
                rollouts.append([full_state, action])
            next_observation, reward, terminal, reward_dict = env.step(action)
            observation = next_observation
            old_ball_pos = ball_pos
            if terminal or len(rollouts) == target_sample_size:
                break

    print('Rollout...')
    for i in range(5):
        for j in range(5):
            env_id = int((i * 5 + j))  # default: 1, 2
            core_env.change_env(scale=np.array([i * 0.1, j * 0.1]))
            print(core_env.env_id)
            print(core_env.scale)

            for rollout in rollouts:
                state = rollout[0]
                observation = core_env.force_reset_model(qpos=state[:16],
                                                         qvel=state[16:])
                action = rollout[1]
                before = np.concatenate([
                    core_env.model.data.qpos[7:9, 0],
                    core_env.model.data.qvel[7:9, 0],
                    core_env.get_body_com("tips_arm")
                ])
                next_observation, reward, terminal, reward_dict = env.step(
                    action)
                after = np.concatenate([
                    core_env.model.data.qpos[7:9, 0],
                    core_env.model.data.qvel[7:9, 0],
                    core_env.get_body_com("tips_arm")
                ])
                data.append(
                    np.concatenate([
                        before, after,
                        np.array([core_env.env_id]), core_env.scale
                    ]))
                observation = next_observation

    data = np.array(data)

    g = lambda s, num: [s + str(i) for i in range(num)]
    columns = g('obs', 7) + g('next_obs', 7) + g('env_id', 1) + g('env_vec', 2)
    df = pd.DataFrame(data, columns=columns)
    df.to_csv('../EPI/envs/striker_data_vine.csv')
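
A minimal sketch of loading the generated CSV back into arrays, assuming the column layout written above:

import pandas as pd

df = pd.read_csv('../EPI/envs/striker_data_vine.csv', index_col=0)
before = df[['obs%d' % i for i in range(7)]].values       # qpos[7:9], qvel[7:9], tips_arm xyz
after = df[['next_obs%d' % i for i in range(7)]].values
env_vec = df[['env_vec%d' % i for i in range(2)]].values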
Example #5
        start_state = env_ref._wrapped_env.start_state
        goal_state = env_ref._wrapped_env.goal_state
        env._wrapped_env.__init__(env._wrapped_env.params,
                                  grid=grid,
                                  b0=b0,
                                  start_state=start_state,
                                  goal_state=goal_state)
        env._wrapped_env.generate_grid = False
        env._wrapped_env.generate_b0_start_goal = False

        o = env.reset()
        agent.reset()
        path_length = 0

        while True:
            a, agent_info = agent.get_action(o)
            next_o, r, d, env_info = env.step(a)
            path_length += 1
            if d:
                break
            o = next_o

        if path_length < max_path_length:
            success += 1
            path_lengths = np.append(path_lengths, path_length)
    mean_path_length = np.mean(path_lengths)
    print(name)
    print('success: ', success)
    print('mean length: ', mean_path_length)
Example #6
    n_itr = 1
    sess.run(tf.global_variables_initializer())
    # sampler_cls.start_worker()
    for itr in range(n_itr):
        # rollout(env, policy, animated=True, max_path_length=1000)
        o = env.reset()
        policy.reset()
        d = False
        while not d:
            env.render()
            flat_obs = policy.observation_space.flatten(o)
            mean, log_std = [x[0] for x in policy._f_dist([flat_obs])]
            # rnd = np.random.normal(size=mean.shape)
            # action = rnd * np.exp(log_std) + mean
            action = mean
            next_o, r, d, env_info = env.step(action)
            o = next_o

    # sampler_cls.shutdown_worker()
    if created_session:
        sess.close()

    # done = False
    # obs = env.reset()
    # rewards = []
    # pdb.set_trace()
    # while not done:
    #     action, actor_info = policy.get_actions(obs.reshape(-1,111))
    #     obs, reward, done, info =  env.step(action)
    #     rewards.append(reward)
    #
Example #7
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import rllab_logdir
from inverse_rl.utils.hyper_sweep import run_sweep_parallel, run_sweep_serial

# Loads a policy from the given pickle file and records a video.
if __name__ == "__main__":
    #filename='data/ant_data_collect/2018_05_25_13_42_59_0/itr_1499.pkl'
    #filename='data/ant_data_collect/2018_05_23_15_21_40_0/itr_1499.pkl'
    #filename='data/ant_data_collect/2018_05_19_07_56_37_1/itr_1499.pkl'
    #filename='data/ant_data_collect/2018_05_19_07_56_37_0/itr_1485.pkl'
    #filename='data/ant_state_irl/2018_05_26_08_51_16_0/itr_999.pkl'
    #filename='data/ant_state_irl/2018_05_26_08_51_16_1/itr_999.pkl'
    #filename='data/ant_state_irl/2018_05_26_08_51_16_2/itr_999.pkl'
    filename = 'data/ant_transfer/2018_05_26_16_06_05_4/itr_999.pkl'
    import gym
    import joblib
    import rllab.misc.logger as rllablogger
    tf.reset_default_graph()
    with tf.Session(config=get_session_config()) as sess:
        rllablogger.set_snapshot_dir("data/video")
        saved = joblib.load(filename)
        env = TfEnv(
            CustomGymEnv('CustomAnt-v0', record_video=True, record_log=True)
        )  # Use 'DisabledAnt-v0' instead for the transfer task
        policy = saved['policy']
        observation = env.reset()
        for _ in range(1000):
            env.render()
            action, rest = policy.get_action(observation)
            observation, reward, done, info = env.step(action)
Example #8
j = 0
try: 
	obs_list = np.zeros([NUM] + list(obs_shape), np.uint8)
	state_list = np.zeros([NUM] + list(state_shape), np.float32)
	action_list = np.zeros([NUM] + list(action_shape), np.float32)
	done_list = np.zeros([NUM],  np.uint8)
	term_list = np.zeros([NUM], np.uint8)

	for j in range(TOTAL_NUM // NUM):
		i = 0
		while i < NUM:
			if i % 10000 == 0:
				print ("Collected: %d samples"%i)
			action = env.action_space.sample()
			next_obs, r, done, _ = env.step(action)

			obs_list[i] = obs
			action_list[i] = action
			done_list[i] = done
			term_list[i] = False

			if done:
				obs = env.reset()
			else:
				obs = next_obs
			i += 1

		save_dict = { "obs":obs_list,"action_list":action_list,"done_list":done_list,"term_list":term_list, "state_list":state_list }
		with open("/home/fred/pixel-data/{}.pkl".format(j), 'wb+') as handle:
			pickle.dump(save_dict, handle)
Example #9
def main():

    name = 'Exp180418_simple_baseline_hopper'

    EPI.init('hopper', num_of_params=8)

    sess = tf.Session()
    sess.__enter__()
    algo = pickle.load(open(os.getcwd() + "/" + name + "/pickle.p", "rb"))

    env = TfEnv(normalize(GymEnv('HopperAvg-v0')))
    core_env = env.wrapped_env.wrapped_env.env.env

    target_sample_size = 500
    egreedy = 0.2

    data = []
    rollouts = []
    sample_size = 0
    while sample_size < target_sample_size:
        observation = env.reset()
        core_env.change_env(scale=np.array(
            [0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1]),
                            env_id=0)
        episode_size = 0
        while True:
            if np.random.rand() < egreedy:
                action = env.action_space.sample()
            else:
                action, d = algo.policy.get_action(observation)
            full_state = core_env.state_vector()
            rollouts.append([full_state, action])
            next_observation, reward, terminal, reward_dict = env.step(action)
            episode_size += 1
            sample_size += 1
            observation = next_observation
            if terminal or sample_size == target_sample_size:
                break

    print('Rollout...')
    scale_list = pd.read_csv('../EPI/envs/hopper_env_list.csv').values
    for i in range(100):
        env_id = i
        core_env.change_env(scale=scale_list[i, 1:], env_id=i)
        print(core_env.env_id)
        print(core_env.scale)
        for rollout in rollouts:
            state = rollout[0]
            observation = core_env.force_reset_model(qpos=state[0:6],
                                                     qvel=state[6:12])
            action = rollout[1]
            next_observation, reward, terminal, reward_dict = env.step(action)
            data.append(
                np.concatenate([
                    observation, action, next_observation,
                    np.array([env_id]), core_env.scale,
                    np.array([reward, terminal * 1])
                ]))
            sample_size += 1
            observation = next_observation

    data = np.array(data)

    g = lambda s, num: [s + str(i) for i in range(num)]
    columns = g('obs', len(observation)) + g('ac', len(action)) + g(
        'next_obs', len(observation)) + g('env_id', 1) + g(
            'env_vec', 8) + ['reward'] + ['terminal']
    df = pd.DataFrame(data, columns=columns)
    df.to_csv('../EPI/envs/hopper_data_vine.csv')
Example #10
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('env', type=str, help='name of gym env')
    parser.add_argument('name', type=str, help='name of database to store')
    parser.add_argument('--num',
                        type=int,
                        default=300000,
                        help='number of samples to collect')
    parser.add_argument('--with_state', action='store_true')
    parser.add_argument('--state_obs', action='store_true')
    parser.add_argument('--start_index', type=int, default=0)
    parser.add_argument('--restore_env', type=str, default=None)
    parser.add_argument('--policy_traj', type=str, default=None)
    args = parser.parse_args()

    # Build env
    env = TfEnv(normalize(env=GymEnv(args.env, record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))

    if args.restore_env is not None:
        with tf.Session() as sess:
            data = joblib.load(args.restore_env)
            env = data['env']

    with tf.Session() as sess:
        if args.policy_traj is not None:
            data = joblib.load(args.policy_traj)
            env = data['env']
            policy = data['policy']
        for i in range(args.num // NUM_CHUNK):
            filename = args.name + '/' + str(i + args.start_index) + '.tfrecord'
            writer = tf.python_io.TFRecordWriter(filename)
            logger.log('Start collecting data, saving to {}'.format(filename))

            obs = env.reset()
            env_infos = dict()
            next_obs = None

            start_time = time.time()
            j = 0
            while j < NUM_CHUNK:
                if args.policy_traj is not None:
                    policy_action, _ = policy.get_action(obs)
                    action = np.clip(policy_action, -1, 1)
                    # import pdb; pdb.set_trace()
                else:
                    # print("random action")
                    action = env.action_space.sample()

                next_obs, reward, done, env_infos = env.step(action)
                # env.render()
                if args.state_obs:
                    # import pdb; pdb.set_trace()
                    feature = {
                        'obs': _floats_feature(obs),
                        'next_obs': _floats_feature(next_obs),
                        'action': _floats_feature(action),
                    }
                else:
                    feature = {
                        'obs':
                        _bytes_feature(obs.astype(np.uint8).tostring()),
                        'next_obs':
                        _bytes_feature(next_obs.astype(np.uint8).tostring()),
                        'action':
                        _floats_feature(action),
                    }

                if args.with_state:
                    state = get_state(env)
                    feature['state'] = _floats_feature(state)

                if env_infos['contact']:
                    j += 1
                    # env.render()
                    # print("transition involve contact, saving index {}".format(j))
                    example = tf.train.Example(features=tf.train.Features(
                        feature=feature))

                    writer.write(example.SerializeToString())

                if done:
                    obs = env.reset()
                else:
                    obs = next_obs
            writer.close()

            logger.log(
                'Finished collecting, elapsed time: {}'.format(time.time() -
                                                               start_time))
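
A minimal sketch of reading the records back, assuming the float-valued layout written in the state_obs branch above (the image branch stores obs/next_obs as uint8 byte strings and would need np.frombuffer instead):

import numpy as np
import tensorflow as tf

def read_chunk(filename):
    # Iterate over serialized tf.train.Example protos (TF1 API, matching the writer above).
    for record in tf.python_io.tf_record_iterator(filename):
        example = tf.train.Example()
        example.ParseFromString(record)
        feats = example.features.feature
        obs = np.array(feats['obs'].float_list.value, dtype=np.float32)
        next_obs = np.array(feats['next_obs'].float_list.value, dtype=np.float32)
        action = np.array(feats['action'].float_list.value, dtype=np.float32)
        yield obs, next_obs, action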
Example #11
    rewards = []

    pdb.set_trace()
    observation = env.reset()

    for _ in range(T):
        # policy.get_action() returns a pair of values. The second one returns a dictionary, whose values contains
        # sufficient statistics for the action distribution. It should at least contain entries that would be
        # returned by calling policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
        # Storing these statistics is useful, e.g., when forming importance sampling ratios. In our case it is
        # not needed.
        action, _ = policy.get_action(observation)
        # Recall that the last entry of the tuple stores diagnostic information about the environment. In our
        # case it is not needed.
        pdb.set_trace()
        next_observation, reward, terminal, _ = env.step(action)
        observations.append(observation)
        actions.append(action)
        rewards.append(reward)
        observation = next_observation
        if terminal:
            # Finish rollout if terminal state reached
            break

    # We need to compute the empirical return for each time step along the
    # trajectory
    returns = []
    return_so_far = 0
    for t in range(len(rewards) - 1, -1, -1):
        return_so_far = rewards[t] + discount * return_so_far
        returns.append(return_so_far)
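
    # The loop above accumulates returns backwards in time; a typical completion
    # (not shown in this fragment) restores chronological order before use:
    returns = returns[::-1]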
Example #12
def main():

    import matplotlib.pyplot as plt
    plt.ion()

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('model_path', type=str, help="path of trained model")
    parser.add_argument('--cos_forward', action='store_true')
    parser.add_argument('--norm_input', action='store_true')
    parser.add_argument('--mode',
                        type=str,
                        choices=['render', 'record'],
                        default='render')
    parser.add_argument('--data_path', type=str, default='/tmp/data')
    parser.add_argument('--num_sample', type=int, default=100000)

    args = parser.parse_args()

    with tf.Session() as sess:
        data = joblib.load(args.model_path)
        _encoder = data["encoder"]
        _inverse_model = data["inverse_model"]
        _forward_model = data["forward_model"]

        env = TfEnv(
            normalize(env=GymEnv('Box3dReachPixel-v11',
                                 record_video=False,
                                 log_dir='/tmp/gym_test',
                                 record_log=False)))

        s1_ph = tf.placeholder(tf.float32,
                               [None] + list(env.observation_space.shape))
        s2_ph = tf.placeholder(tf.float32,
                               [None] + list(env.observation_space.shape))

        action_ph = tf.placeholder(tf.float32,
                                   [None] + list(env.action_space.shape))

        encoder1 = _encoder.get_weight_tied_copy(observation_input=s1_ph)
        encoder2 = _encoder.get_weight_tied_copy(observation_input=s2_ph)

        inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=encoder1.output, feature_input2=encoder2.output)
        forward_model = _forward_model.get_weight_tied_copy(
            feature_input=encoder1.output, action_input=action_ph)
        if args.cos_forward:
            forward_loss = cos_loss(encoder2.output, forward_model.output)
        else:
            forward_loss = tf.reduce_mean(
                tf.square(encoder2.output - forward_model.output))

        inverse_loss = tf.reduce_mean(
            tf.square(action_ph - inverse_model.output))

        # Start running the env
        obs = env.reset()
        next_obs = None
        x = []
        inverse_losses_results = []
        forward_losses_results = []

        if args.mode == 'render':
            f, (ax1, ax2) = plt.subplots(2)
            ax1.set_title("Inverse loss")
            ax2.set_title("Forward loss")
        elif args.mode == 'record':
            images = np.zeros([args.num_sample, 500, 500, 3], dtype='uint8')
            inverse_losses = np.zeros(args.num_sample, dtype='float32')
            forward_losses = np.zeros(args.num_sample, dtype='float32')
            boxes_contacts = np.zeros(args.num_sample, dtype='uint8')
            table_contacts = np.zeros(args.num_sample, dtype='uint8')

        for t in range(args.num_sample):
            if t % LOG_FREQ == 0:
                print("Sample: {}".format(t))
            action = env.action_space.sample()
            next_obs, reward, done, env_info = env.step(action)
            if args.mode == 'render':
                env.render()
            elif args.mode == 'record':
                img = env.wrapped_env._wrapped_env.env.env.render(
                    mode='rgb_array')
                images[t, :, :, :] = img

            inverse_loss_result, forward_loss_result = sess.run(
                [inverse_loss, forward_loss], {
                    s1_ph: [obs / 255.0 - 0.5],
                    s2_ph: [next_obs / 255.0 - 0.5],
                    action_ph: [action]
                })

            if args.mode == 'render':
                x.append(t)
                inverse_losses_results.append(inverse_loss_result)
                forward_losses_results.append(forward_loss_result)
                ax1.plot(x, inverse_losses_results, c="blue")
                ax2.plot(x, forward_losses_results, c="blue")
                plt.pause(0.001)
                plt.show()
            elif args.mode == 'record':
                boxes_contacts[t] = env_info["contact_reward"]
                table_contacts[t] = env_info["table_contact_reward"]
                forward_losses[t] = forward_loss_result
                inverse_losses[t] = inverse_loss_result
            if done:
                obs = env.reset()
            else:
                obs = next_obs

        if args.mode == 'record':
            data_dict = dict(images=images,
                             forward_losses=forward_losses,
                             inverse_losses=inverse_losses,
                             boxes_contacts=boxes_contacts,
                             table_contacts=table_contacts)
            joblib.dump(data_dict, args.data_path)
            print("Saved data to {}".format(args.data_path))
Example #13
params['obs_len'] = len(params['observe_directions'])
params['num_state'] = params['grid_n'] * params['grid_m']
params['traj_limit'] = 4 * (params['grid_n'] * params['grid_m'])  # 4 * (params['grid_n'] + params['grid_m'])
params['R_step'] = [params['R_step']] * params['num_action']
params['R_step'][params['stayaction']] = params['R_stay']
params['kdist'] = -0.1

env = GridBase(params)
env.generate_grid = True
env.generate_b0_start_goal = True
env.reset()
env.generate_grid = False
env.generate_b0_start_goal = False
env1 = TfEnv(env)
env1_params = env1.get_param_values()
print(env1.step(1))

env2 = TfEnv(GridBase(params))
env2.set_param_values(env1.get_param_values())
env2.reset()
print(env2.step(1))

env3 = TfEnv(GridBase(params))
env3._wrapped_env.__init__(env1._wrapped_env.params,
                           grid=env1._wrapped_env.grid,
                           b0=env1._wrapped_env.b0,
                           start_state=env1._wrapped_env.start_state,
                           goal_state=env1._wrapped_env.goal_state)
env3._wrapped_env.generate_grid = False
env3._wrapped_env.generate_b0_start_goal = False
env3.reset()
print(env3.step(1))
Example #14
from sandbox.rocky.tf.envs.base import TfEnv
from inverse_rl.envs.env_utils import CustomGymEnv
import gym

env = TfEnv(
    CustomGymEnv('CustomAnt-v0',
                 record_video=False,
                 record_log=False,
                 force_reset=False))
done = False
obs = env.reset()
while not done:
    env.render()
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)

env = TfEnv(
    CustomGymEnv('DisabledAnt-v0',
                 record_video=False,
                 record_log=False,
                 force_reset=False))
done = False
obs = env.reset()
while not done:
    env.render()
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)
Example #15
mbs = []
# warmup
rollouts = []
for i in range(100):
    rollout = []
    env.reset()
    for t in range(100):
        obs = env.get_current_obs()
        states = [obs]
        for layer in range(n_layers):
            obs = abstractors[layer](obs)
            states.append(obs)
        rollout.append(states)

        action, _ = policy.get_action(obs)
        env.step(action)
    rollouts.append(rollout)

# fit generative model to warmup
for abs_layer in range(n_layers):
    obs = []
    nexts = []
    for rollout in rollouts:
        for t in range(len(rollout) - 1):
            obs.append(rollout[t][abs_layer + 1])
            nexts.append(rollout[t + 1][abs_layer + 1])
    models[abs_layer].fit(np.array(obs), nexts, n_steps=5000)

# fit planner
for abs_layer in range(n_layers):
    obs = []