class PendulumPID(object):
    def __init__(self, Kp, Kd, Kp_swing, **kwargs):
        self._env = TfEnv(GymEnv('Pendulum-v0',
                                 record_video=kwargs.get("record_video", False),
                                 record_log=kwargs.get("record_log", False)))
        # self._env = PendulumEnv()
        config_path = kwargs.get("config_path", None)
        config = {}
        if config_path is not None:
            config = read_yaml_file(config_path)
        self._alpha_tol = kwargs.get("stability_region", config.get("stablity-active-region", 0.0))
        self._Kmag = kwargs.get("swing_up_torque", config.get("max-swing-up-torque", 1.0))
        self._dt = kwargs.get("time_delta", config.get("time-delta", 1.0))
        self._length = kwargs.get("length", config.get("length", 1.0))
        self._mass = kwargs.get("mass", config.get("mass", 1.0))
        self._g = kwargs.get("g_val", config.get("gravitation", 10.0))
        self._target = kwargs.get("target", config.get("target-angle", 0.0))
        self.reset(Kp, Kd, Kp_swing, Ki=kwargs.get('Ki', 0.0))
        print("PID init")

    def reset(self, Kp, Kd, Kp_swing, Ki=0.0):
        self._int, self._diff, self._Ki = 0.0, 0.0, Ki
        self._Kp, self._Kd, self._Kp_swing = Kp, Kd, Kp_swing
        self._last_obs = self._env.reset()
        self._alpha_dot_prev = self._last_obs[2]

    @property
    def env(self):
        return self._env

    def step(self):
        Ip = self._length / 2.0
        # Pendulum-v0 observations are [cos(alpha), sin(alpha), alpha_dot].
        alpha, alpha_dot = np.arccos(self._last_obs[0]), self._last_obs[2]
        # alpha_dotdot = (alpha_dot - self._alpha_dot_prev) / self._dt
        self._alpha_dot_prev = alpha_dot
        PE = self._mass * self._g * Ip * np.sin(alpha)
        KE = self._mass * alpha_dot ** 2 * self._length ** 2 * 0.5
        MAX_PE = self._mass * self._g * Ip
        # INR = self._mass * alpha_dotdot * Ip**2
        # Kp_swing = 0.02
        if abs(alpha - self._target) > self._alpha_tol:
            # Swing up: pump energy toward the upright maximum.
            # TODO: the units do not agree; find a solution.
            new_taw = self._Kp_swing * np.sign(alpha_dot) * (MAX_PE - PE)
            # new_taw = np.sign(alpha_dot) * (self._Kmag / (1 + np.exp(-self._Kp_swing * alpha)))
            error = alpha
        else:
            # Stabilization PID on the energy error.
            error = np.sign(alpha_dot) * (KE + PE)
            self._int += error * self._dt
            _Cd = (error - self._diff) / self._dt if self._dt > 0 else 0
            new_taw = self._Kp * error + self._Ki * self._int + self._Kd * _Cd
            self._diff = error
        self._last_obs, r, d, info = self._env.step([new_taw])
        info.update({"error": error, "action": [new_taw]})
        return self._last_obs, r, d, info
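# A minimal driver sketch for the controller above (illustrative only): the
# gains and time_delta are placeholder values, not tuned ones, and the episode
# length is arbitrary.
pid = PendulumPID(Kp=8.0, Kd=1.0, Kp_swing=0.02, time_delta=0.05)
for _ in range(200):
    obs, reward, done, info = pid.step()
    if done:
        break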
def test_pointmaze(policy):
    test_env = TfEnv(CustomGymEnv('PointMazeRight-v0'))
    for i in range(5):
        done = False
        s = test_env.reset()
        reward = 0
        steps = 0
        while not done:
            # Sample an action from the tabular policy's distribution for state s.
            a = np.random.choice(policy.shape[1], p=policy[s])
            s_, r, done, _ = test_env.step(a)
            steps += 1
            reward += r
            s = s_
        print('Average per-step reward is {}'.format(reward / steps))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=str, help='path to snapshot file')
    parser.add_argument('--pixel', action='store_true')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--multistep', action='store_true')
    parser.add_argument('--step_size', type=int, default=5)
    parser.add_argument('--zero_action', action='store_true')
    parser.add_argument('--gt_action', action='store_true')
    args = parser.parse_args()

    with tf.Session() as sess:
        data = joblib.load(args.file)
        _encoder = data['encoder']
        _inverse_model = data['inverse_model']
        _forward_model = data['forward_model']

        if args.pixel:
            env = TfEnv(normalize(env=GymEnv(PIXEL_ENV, record_video=False,
                                             log_dir='/tmp/gym_test', record_log=False)))
        else:
            env = TfEnv(normalize(env=GymEnv(STATE_ENV, record_video=False,
                                             log_dir='/tmp/gym_test', record_log=False)))

        # Rebuild models
        act_space = env.action_space
        obs_space = env.observation_space
        qpos_dim = env.wrapped_env._wrapped_env.env.env.init_qpos.shape[0]
        s1_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        s2_ph = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
        a_ph = tf.placeholder(tf.float32, [None, act_space.flat_dim])
        clipped_a = tf.clip_by_value(a_ph, -1.0, 1.0)
        encoder1 = _encoder.get_weight_tied_copy(observation_input=s1_ph)
        encoder2 = _encoder.get_weight_tied_copy(observation_input=s2_ph)
        inverse_model = _inverse_model.get_weight_tied_copy(feature_input1=encoder1.output,
                                                            feature_input2=encoder2.output)
        forward_model = _forward_model.get_weight_tied_copy(feature_input=encoder1.output,
                                                            action_input=clipped_a)

        # Load test data
        dataset_paths, datasets = load_dataset(args.pixel, args.multistep)
        env.reset()

        for dataset_path, data_dict in zip(dataset_paths, datasets):
            ef_xyz_pred_diff = []
            ef_xyz_diff = []
            action_diff = []
            qpos_diff = []
            qpos_pred_diff = []
            if args.multistep:
                print("===== Using multistep testing, step size: %d" % args.step_size)
            print("========================================")
            print("===== Evaluating inverse model on %s" % dataset_path)

            if args.multistep:
                states, next_states, obs, next_obs, actions = load_data_multistep(
                    data_dict, pixel=args.pixel, step_size=args.step_size)
            else:
                states, next_states, obs, next_obs, actions = load_data(data_dict, args.pixel)
            actions = np.clip(actions, -1.0, 1.0)

            if args.render:
                fig, [ax1, ax2, ax3] = plt.subplots(1, 3)
                plt.ion()
                ax1.set_title("t=0")
                ax2.set_title("t=1 after action")
                ax3.set_title("t=1 after predicted action")

            for state, next_state, ob, next_ob, action in zip(states, next_states,
                                                              obs, next_obs, actions):
                if args.multistep:
                    # Set state, get real img1
                    set_state(env, state[0], qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)
                    o = ob[0]
                    next_o = next_ob[-1]
                    for _ in range(args.step_size):
                        # Get predicted action from inverse model
                        pred_action = sess.run(inverse_model.output, {
                            s1_ph: [o],
                            s2_ph: [next_o],
                        })[0]
                        if args.gt_action:
                            pred_action = action[_]
                        if args.zero_action:
                            pred_action = np.zeros_like(action[_])
                        # Step predicted action
                        o, r, d, env_info = env.step(pred_action)
                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)
                    # Get real img2 and real ef position
                    set_state(env, next_state[args.step_size - 1], qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)
                    if args.render:
                        o_img = get_render_img(env)
                else:
                    # Set state, get real img1
                    set_state(env, state, qpos_dim)
                    _end_ef_pos = get_ef_pos(env)
                    _qpos = get_qpos(env)
                    if args.render:
                        img = get_render_img(env)
                    # Get predicted action from inverse model
                    pred_action = sess.run(inverse_model.output, {
                        s1_ph: [ob],
                        s2_ph: [next_ob],
                    })[0]
                    if args.zero_action:
                        pred_action = np.zeros_like(pred_action)
                    if args.gt_action:
                        pred_action = action
                    # Step action
                    env.step(pred_action)
                    # Get sim_img2 and sim ef position
                    s_end_ef_pos = get_ef_pos(env)
                    s_qpos = get_qpos(env)
                    if args.render:
                        s_img = get_render_img(env)
                    # Get real img2 and real ef position
                    set_state(env, next_state, qpos_dim)
                    o_end_ef_pos = get_ef_pos(env)
                    o_qpos = get_qpos(env)
                    if args.render:
                        o_img = get_render_img(env)

                if args.render:
                    ax1.imshow(img)
                    ax2.imshow(o_img)
                    ax3.imshow(s_img)
                    plt.show()
                    plt.pause(0.1)

                ef_xyz_pred_diff.append(np.linalg.norm(o_end_ef_pos - s_end_ef_pos))
                ef_xyz_diff.append(np.linalg.norm(o_end_ef_pos - _end_ef_pos))
                qpos_pred_diff.append(np.linalg.norm(o_qpos - s_qpos))
                qpos_diff.append(np.linalg.norm(o_qpos - _qpos))
                action_diff.append(((action - pred_action) ** 2).mean())

            print("===== 1. real s1, real s2 end effector position L2 distance med: %.5f, std: %.5f"
                  % (np.median(ef_xyz_diff), np.std(ef_xyz_diff)))
            print("===== 2. real s2, sim s2 end effector position L2 distance med: %.5f, std: %.5f"
                  % (np.median(ef_xyz_pred_diff), np.std(ef_xyz_pred_diff)))
            print("===== 3. real s1, real s2 joint position L2 distance med: %.5f, std: %.5f"
                  % (np.median(qpos_diff), np.std(qpos_diff)))
            print("===== 4. real s2, sim s2 joint position L2 distance med: %.5f, std: %.5f"
                  % (np.median(qpos_pred_diff), np.std(qpos_pred_diff)))
            if not args.multistep:
                print("===== 5. action - pred_action (per dim) sq L2 distance med: %.5f, std: %.5f"
                      % (np.median(action_diff), np.std(action_diff)))
                print("===== 6. action med: %.5f, std: %.5f"
                      % (np.median(np.abs(np.median(actions, axis=1))),
                         np.std(np.median(actions, axis=1))))
def main():
    name = 'Exp180512_simple_baseline_striker'
    EPI.init('striker', num_of_params=2)
    sess = tf.Session()
    sess.__enter__()
    algo = pickle.load(open(os.getcwd() + "/" + name + "/pickle.p", "rb"))
    env = TfEnv(normalize(GymEnv('StrikerAvg-v0')))
    core_env = env.wrapped_env.wrapped_env.env.env

    target_sample_size = 1000
    egreedy = 0.2
    data = []
    rollouts = []
    while len(rollouts) < target_sample_size:
        observation = env.reset()
        core_env.change_env(np.array([0.1, 0.1]))
        old_ball_pos = core_env.model.data.qpos[-9:-7]
        for i in range(200):
            if np.random.rand() < egreedy:
                action = env.action_space.sample()
            else:
                action, d = algo.policy.get_action(observation)
            ball_pos = core_env.model.data.qpos[-9:-7]
            # Only keep states where the ball actually moved.
            if np.linalg.norm(ball_pos - old_ball_pos) > 0.005:
                full_state = core_env.state_vector()
                rollouts.append([full_state, action])
            next_observation, reward, terminal, reward_dict = env.step(action)
            observation = next_observation
            old_ball_pos = ball_pos
            if terminal or len(rollouts) == target_sample_size:
                break

    print('Rollout...')
    for i in range(5):
        for j in range(5):
            env_id = int(i * 5 + j)  # default: 1, 2
            core_env.change_env(scale=np.array([i * 0.1, j * 0.1]))
            print(core_env.env_id)
            print(core_env.scale)
            for rollout in rollouts:
                state = rollout[0]
                observation = core_env.force_reset_model(qpos=state[:16], qvel=state[16:])
                action = rollout[1]
                before = np.concatenate([
                    core_env.model.data.qpos[7:9, 0],
                    core_env.model.data.qvel[7:9, 0],
                    core_env.get_body_com("tips_arm")
                ])
                next_observation, reward, terminal, reward_dict = env.step(action)
                after = np.concatenate([
                    core_env.model.data.qpos[7:9, 0],
                    core_env.model.data.qvel[7:9, 0],
                    core_env.get_body_com("tips_arm")
                ])
                data.append(np.concatenate([
                    before, after,
                    np.array([core_env.env_id]),
                    core_env.scale
                ]))
                observation = next_observation

    data = np.array(data)
    g = lambda s, num: [s + str(i) for i in range(num)]
    columns = g('obs', 7) + g('next_obs', 7) + g('env_id', 1) + g('env_vec', 2)
    df = pd.DataFrame(data, columns=columns)
    df.to_csv('../EPI/envs/striker_data_vine.csv')
start_state = env_ref._wrapped_env.start_state
goal_state = env_ref._wrapped_env.goal_state
env._wrapped_env.__init__(env._wrapped_env.params, grid=grid, b0=b0,
                          start_state=start_state, goal_state=goal_state)
env._wrapped_env.generate_grid = False
env._wrapped_env.generate_b0_start_goal = False

o = env.reset()
agent.reset()
path_length = 0
while True:
    a, agent_info = agent.get_action(o)
    next_o, r, d, env_info = env.step(a)
    path_length += 1
    if d:
        break
    o = next_o

if path_length < max_path_length:
    success += 1
path_lengths = np.append(path_lengths, path_length)
mean_path_length = np.mean(path_lengths)
print(name)
print('success: ', success)
print('mean length: ', mean_path_length)
n_itr = 1
sess.run(tf.global_variables_initializer())
# sampler_cls.start_worker()
for itr in range(n_itr):
    # rollout(env, policy, animated=True, max_path_length=1000)
    o = env.reset()
    policy.reset()
    d = False
    while not d:
        env.render()
        flat_obs = policy.observation_space.flatten(o)
        mean, log_std = [x[0] for x in policy._f_dist([flat_obs])]
        # To sample stochastically instead of acting deterministically:
        # rnd = np.random.normal(size=mean.shape)
        # action = rnd * np.exp(log_std) + mean
        action = mean
        next_o, r, d, env_info = env.step(action)
        o = next_o
# sampler_cls.shutdown_worker()
if created_session:
    sess.close()

# done = False
# obs = env.reset()
# rewards = []
# while not done:
#     action, actor_info = policy.get_actions(obs.reshape(-1, 111))
#     obs, reward, done, info = env.step(action)
#     rewards.append(reward)
from inverse_rl.envs.env_utils import CustomGymEnv
from inverse_rl.utils.log_utils import rllab_logdir
from inverse_rl.utils.hyper_sweep import run_sweep_parallel, run_sweep_serial

# Loads a policy from the given pickle file and records a video.
if __name__ == "__main__":
    # filename = 'data/ant_data_collect/2018_05_25_13_42_59_0/itr_1499.pkl'
    # filename = 'data/ant_data_collect/2018_05_23_15_21_40_0/itr_1499.pkl'
    # filename = 'data/ant_data_collect/2018_05_19_07_56_37_1/itr_1499.pkl'
    # filename = 'data/ant_data_collect/2018_05_19_07_56_37_0/itr_1485.pkl'
    # filename = 'data/ant_state_irl/2018_05_26_08_51_16_0/itr_999.pkl'
    # filename = 'data/ant_state_irl/2018_05_26_08_51_16_1/itr_999.pkl'
    # filename = 'data/ant_state_irl/2018_05_26_08_51_16_2/itr_999.pkl'
    filename = 'data/ant_transfer/2018_05_26_16_06_05_4/itr_999.pkl'

    import gym
    import joblib
    import rllab.misc.logger as rllablogger

    tf.reset_default_graph()
    with tf.Session(config=get_session_config()) as sess:
        rllablogger.set_snapshot_dir("data/video")
        saved = joblib.load(filename)
        # Switch to 'DisabledAnt-v0' for the transfer task.
        env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=True, record_log=True))
        policy = saved['policy']
        observation = env.reset()
        for _ in range(1000):
            env.render()
            action, rest = policy.get_action(observation)
            observation, reward, done, info = env.step(action)
j = 0
try:
    obs_list = np.zeros([NUM] + list(obs_shape), np.uint8)
    state_list = np.zeros([NUM] + list(state_shape), np.float32)
    action_list = np.zeros([NUM] + list(action_shape), np.float32)
    done_list = np.zeros([NUM], np.uint8)
    term_list = np.zeros([NUM], np.uint8)
    for j in range(TOTAL_NUM // NUM):  # integer division; one pickle file per chunk
        i = 0
        while i < NUM:
            if i % 10000 == 0:
                print("Collected: %d samples" % i)
            action = env.action_space.sample()
            next_obs, r, done, _ = env.step(action)
            obs_list[i] = obs
            action_list[i] = action
            done_list[i] = done
            term_list[i] = False
            if done:
                obs = env.reset()
            else:
                obs = next_obs  # advance to the next observation
            i += 1
        save_dict = {
            "obs": obs_list,
            "action_list": action_list,
            "done_list": done_list,
            "term_list": term_list,
            "state_list": state_list,
        }
        with open("/home/fred/pixel-data/{}.pkl".format(j), 'wb+') as handle:
            pickle.dump(save_dict, handle)
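# A minimal sketch (assumed reader-side code, not part of the collector above)
# for loading one saved chunk back; the path and key names mirror save_dict.
import pickle

with open("/home/fred/pixel-data/0.pkl", 'rb') as handle:
    chunk = pickle.load(handle)
print(chunk["obs"].shape)          # [NUM] + obs_shape, uint8
print(chunk["action_list"].shape)  # [NUM] + action_shape, float32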
def main():
    name = 'Exp180418_simple_baseline_hopper'
    EPI.init('hopper', num_of_params=8)
    sess = tf.Session()
    sess.__enter__()
    algo = pickle.load(open(os.getcwd() + "/" + name + "/pickle.p", "rb"))
    env = TfEnv(normalize(GymEnv('HopperAvg-v0')))
    core_env = env.wrapped_env.wrapped_env.env.env

    target_sample_size = 500
    egreedy = 0.2
    data = []
    rollouts = []
    sample_size = 0
    while sample_size < target_sample_size:
        observation = env.reset()
        core_env.change_env(scale=np.array([0.1, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1]),
                            env_id=0)
        episode_size = 0
        while True:
            if np.random.rand() < egreedy:
                action = env.action_space.sample()
            else:
                action, d = algo.policy.get_action(observation)
            full_state = core_env.state_vector()
            rollouts.append([full_state, action])
            next_observation, reward, terminal, reward_dict = env.step(action)
            episode_size += 1
            sample_size += 1
            observation = next_observation
            if terminal or sample_size == target_sample_size:
                break

    print('Rollout...')
    scale_list = pd.read_csv('../EPI/envs/hopper_env_list.csv').values
    for i in range(100):
        env_id = i
        core_env.change_env(scale=scale_list[i, 1:], env_id=i)
        print(core_env.env_id)
        print(core_env.scale)
        for rollout in rollouts:
            state = rollout[0]
            observation = core_env.force_reset_model(qpos=state[0:6], qvel=state[6:12])
            action = rollout[1]
            next_observation, reward, terminal, reward_dict = env.step(action)
            data.append(np.concatenate([
                observation, action, next_observation,
                np.array([env_id]), core_env.scale,
                np.array([reward, terminal * 1])
            ]))
            sample_size += 1
            observation = next_observation

    data = np.array(data)
    g = lambda s, num: [s + str(i) for i in range(num)]
    columns = (g('obs', len(observation)) + g('ac', len(action)) +
               g('next_obs', len(observation)) + g('env_id', 1) +
               g('env_vec', 8) + ['reward'] + ['terminal'])
    df = pd.DataFrame(data, columns=columns)
    df.to_csv('../EPI/envs/hopper_data_vine.csv')
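# A minimal sketch (assumed consumer-side code) of reading the vine dataset
# written above; the column names follow the `columns` list built in main().
import pandas as pd

df = pd.read_csv('../EPI/envs/hopper_data_vine.csv', index_col=0)
obs = df[[c for c in df.columns if c.startswith('obs')]].values
env_vec = df[[c for c in df.columns if c.startswith('env_vec')]].values
print(obs.shape, env_vec.shape)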
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('env', type=str, help='name of gym env')
    parser.add_argument('name', type=str, help='name of database to store')
    parser.add_argument('--num', type=int, default=300000,
                        help='number of samples to collect')
    parser.add_argument('--with_state', action='store_true')
    parser.add_argument('--state_obs', action='store_true')
    parser.add_argument('--start_index', type=int, default=0)
    parser.add_argument('--restore_env', type=str, default=None)
    parser.add_argument('--policy_traj', type=str, default=None)
    args = parser.parse_args()

    # Build env
    env = TfEnv(normalize(env=GymEnv(args.env, record_video=False,
                                     log_dir='/tmp/gym_test', record_log=False)))
    if args.restore_env is not None:
        with tf.Session() as sess:
            data = joblib.load(args.restore_env)
            env = data['env']

    with tf.Session() as sess:
        if args.policy_traj is not None:
            data = joblib.load(args.policy_traj)
            env = data['env']
            policy = data['policy']

        for i in range(args.num // NUM_CHUNK):
            filename = args.name + '/' + str(i + args.start_index) + '.tfrecord'
            writer = tf.python_io.TFRecordWriter(filename)
            logger.log('Start collecting data, saving to {}'.format(filename))
            obs = env.reset()
            env_infos = dict()
            next_obs = None
            start_time = time.time()
            j = 0
            while j < NUM_CHUNK:
                if args.policy_traj is not None:
                    policy_action, _ = policy.get_action(obs)
                    action = np.clip(policy_action, -1, 1)
                else:
                    action = env.action_space.sample()
                next_obs, reward, done, env_infos = env.step(action)
                if args.state_obs:
                    feature = {
                        'obs': _floats_feature(obs),
                        'next_obs': _floats_feature(next_obs),
                        'action': _floats_feature(action),
                    }
                else:
                    feature = {
                        'obs': _bytes_feature(obs.astype(np.uint8).tostring()),
                        'next_obs': _bytes_feature(next_obs.astype(np.uint8).tostring()),
                        'action': _floats_feature(action),
                    }
                if args.with_state:
                    state = get_state(env)
                    feature['state'] = _floats_feature(state)
                if env_infos['contact']:
                    # Transition involves contact; keep it.
                    j += 1
                    example = tf.train.Example(features=tf.train.Features(feature=feature))
                    writer.write(example.SerializeToString())
                if done:
                    obs = env.reset()
                else:
                    obs = next_obs
            writer.close()
            logger.log('Finished collecting, elapsed time: {}'.format(
                time.time() - start_time))
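# A minimal sketch (assumed reader-side code) for inspecting records written by
# the TF1 TFRecordWriter above; 'data/0.tfrecord' is a placeholder path, and the
# feature keys mirror the dict built in main().
import tensorflow as tf

for record in tf.python_io.tf_record_iterator('data/0.tfrecord'):
    example = tf.train.Example()
    example.ParseFromString(record)
    action = example.features.feature['action'].float_list.value
    print(list(action))
    break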
observations = []
actions = []
rewards = []
observation = env.reset()
for _ in range(T):
    # policy.get_action() returns a pair of values. The second one is a dictionary
    # whose values contain sufficient statistics for the action distribution. It
    # should at least contain entries that would be returned by calling
    # policy.dist_info(), which is the non-symbolic analog of policy.dist_info_sym().
    # Storing these statistics is useful, e.g., when forming importance sampling
    # ratios. In our case it is not needed.
    action, _ = policy.get_action(observation)
    # Recall that the last entry of the tuple stores diagnostic information about
    # the environment. In our case it is not needed.
    next_observation, reward, terminal, _ = env.step(action)
    observations.append(observation)
    actions.append(action)
    rewards.append(reward)
    observation = next_observation
    if terminal:
        # Finish rollout if terminal state reached
        break

# We need to compute the empirical return for each time step along the
# trajectory; iterate backwards so each return accumulates the discounted tail.
returns = []
return_so_far = 0
for t in range(len(rewards) - 1, -1, -1):
    return_so_far = rewards[t] + discount * return_so_far
    returns.append(return_so_far)
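# A quick numeric check (illustrative only) of the backward accumulation above:
# with rewards [1, 1, 1] and discount 0.9 the loop visits t = 2, 1, 0 and yields
# 1.0, then 1 + 0.9 * 1.0 = 1.9, then 1 + 0.9 * 1.9 = 2.71, so `returns` is
# built backwards in time and must be reversed to align with the trajectory.
rewards_demo, discount_demo = [1.0, 1.0, 1.0], 0.9
returns_demo, acc = [], 0.0
for r_t in reversed(rewards_demo):
    acc = r_t + discount_demo * acc
    returns_demo.append(acc)
print(returns_demo[::-1])  # approximately [2.71, 1.9, 1.0]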
def main():
    import matplotlib.pyplot as plt
    plt.ion()

    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str, help="name of gym env")
    parser.add_argument('model_path', type=str, help="path of trained model")
    parser.add_argument('--cos_forward', action='store_true')
    parser.add_argument('--norm_input', action='store_true')
    parser.add_argument('--mode', type=str, choices=['render', 'record'], default='render')
    parser.add_argument('--data_path', type=str, default='/tmp/data')
    parser.add_argument('--num_sample', type=int, default=100000)
    args = parser.parse_args()

    with tf.Session() as sess:
        data = joblib.load(args.model_path)
        _encoder = data["encoder"]
        _inverse_model = data["inverse_model"]
        _forward_model = data["forward_model"]

        env = TfEnv(normalize(env=GymEnv('Box3dReachPixel-v11',
                                         record_video=False,
                                         log_dir='/tmp/gym_test',
                                         record_log=False)))
        s1_ph = tf.placeholder(tf.float32, [None] + list(env.observation_space.shape))
        s2_ph = tf.placeholder(tf.float32, [None] + list(env.observation_space.shape))
        action_ph = tf.placeholder(tf.float32, [None] + list(env.action_space.shape))
        encoder1 = _encoder.get_weight_tied_copy(observation_input=s1_ph)
        encoder2 = _encoder.get_weight_tied_copy(observation_input=s2_ph)
        inverse_model = _inverse_model.get_weight_tied_copy(
            feature_input1=encoder1.output, feature_input2=encoder2.output)
        forward_model = _forward_model.get_weight_tied_copy(
            feature_input=encoder1.output, action_input=action_ph)

        if args.cos_forward:
            forward_loss = cos_loss(encoder2.output, forward_model.output)
        else:
            forward_loss = tf.reduce_mean(
                tf.square(encoder2.output - forward_model.output))
        inverse_loss = tf.reduce_mean(tf.square(action_ph - inverse_model.output))

        # Start running the env
        obs = env.reset()
        next_obs = None
        x = []
        inverse_losses_results = []
        forward_losses_results = []
        if args.mode == 'render':
            f, (ax1, ax2) = plt.subplots(2)
            ax1.set_title("Inverse loss")
            ax2.set_title("Forward loss")
        elif args.mode == 'record':
            images = np.zeros([args.num_sample, 500, 500, 3], dtype='uint8')
            inverse_losses = np.zeros(args.num_sample, dtype='float32')
            forward_losses = np.zeros(args.num_sample, dtype='float32')
            boxes_contacts = np.zeros(args.num_sample, dtype='uint8')
            table_contacts = np.zeros(args.num_sample, dtype='uint8')

        for t in range(args.num_sample):
            if t % LOG_FREQ == 0:
                print("Sample: {}".format(t))
            action = env.action_space.sample()
            next_obs, reward, done, env_info = env.step(action)
            if args.mode == 'render':
                env.render()
            elif args.mode == 'record':
                img = env.wrapped_env._wrapped_env.env.env.render(mode='rgb_array')
                images[t, :, :, :] = img
            inverse_loss_result, forward_loss_result = sess.run(
                [inverse_loss, forward_loss], {
                    s1_ph: [obs / 255.0 - 0.5],
                    s2_ph: [next_obs / 255.0 - 0.5],
                    action_ph: [action]
                })
            if args.mode == 'render':
                x.append(t)
                inverse_losses_results.append(inverse_loss_result)
                forward_losses_results.append(forward_loss_result)
                ax1.plot(x, inverse_losses_results, c="blue")
                ax2.plot(x, forward_losses_results, c="blue")
                plt.pause(0.001)
                plt.show()
            elif args.mode == 'record':
                boxes_contacts[t] = env_info["contact_reward"]
                table_contacts[t] = env_info["table_contact_reward"]
                forward_losses[t] = forward_loss_result
                inverse_losses[t] = inverse_loss_result
            if done:
                obs = env.reset()
            else:
                obs = next_obs

        if args.mode == 'record':
            data_dict = dict(images=images,
                             forward_losses=forward_losses,
                             inverse_losses=inverse_losses,
                             boxes_contacts=boxes_contacts,
                             table_contacts=table_contacts)
            joblib.dump(data_dict, args.data_path)
            print("Saved data to {}".format(args.data_path))
params['obs_len'] = len(params['observe_directions'])
params['num_state'] = params['grid_n'] * params['grid_m']
params['traj_limit'] = 4 * (params['grid_n'] * params['grid_m'])  # 4 * (params['grid_n'] + params['grid_m'])
params['R_step'] = [params['R_step']] * params['num_action']
params['R_step'][params['stayaction']] = params['R_stay']
params['kdist'] = -0.1

env = GridBase(params)
env.generate_grid = True
env.generate_b0_start_goal = True
env.reset()
env.generate_grid = False
env.generate_b0_start_goal = False

env1 = TfEnv(env)
env1_params = env1.get_param_values()
print(env1.step(1))

env2 = TfEnv(GridBase(params))
env2.set_param_values(env1.get_param_values())
env2.reset()
print(env2.step(1))

env3 = TfEnv(GridBase(params))
env3._wrapped_env.__init__(env1._wrapped_env.params,
                           grid=env1._wrapped_env.grid,
                           b0=env1._wrapped_env.b0,
                           start_state=env1._wrapped_env.start_state,
                           goal_state=env1._wrapped_env.goal_state)
env3._wrapped_env.generate_grid = False
env3._wrapped_env.generate_b0_start_goal = False
env3.reset()
print(env3.step(1))
from sandbox.rocky.tf.envs.base import TfEnv
from inverse_rl.envs.env_utils import CustomGymEnv
import gym

env = TfEnv(CustomGymEnv('CustomAnt-v0', record_video=False,
                         record_log=False, force_reset=False))
done = False
obs = env.reset()
while not done:
    env.render()
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)

env = TfEnv(CustomGymEnv('DisabledAnt-v0', record_video=False,
                         record_log=False, force_reset=False))
done = False
obs = env.reset()
while not done:
    env.render()
    action = env.action_space.sample()
    _, _, done, _ = env.step(action)
mbs = []

# warmup
rollouts = []
for i in range(100):
    rollout = []
    env.reset()
    for t in range(100):
        obs = env.get_current_obs()
        states = [obs]
        for layer in range(n_layers):
            obs = abstractors[layer](obs)
            states.append(obs)
        rollout.append(states)
        action = policy.get_action(obs)
        env.step(action)
    rollouts.append(rollout)

# fit generative model to warmup
for abs_layer in range(n_layers):
    obs = []
    nexts = []
    for rollout in rollouts:
        for t in range(len(rollout) - 1):
            obs.append(rollout[t][abs_layer + 1])
            nexts.append(rollout[t + 1][abs_layer + 1])
    models[abs_layer].fit(np.array(obs), nexts, n_steps=5000)

# fit planner
for abs_layer in range(n_layers):
    obs = []