def env_factory(env_name):
  gym_env = gym.make(env_name)
  gym_spec = gym.spec(env_name)
  if gym_spec.max_episode_steps in [0, None]:
    # Add TimeLimit wrapper.
    gym_env = time_limit.TimeLimit(gym_env, max_episode_steps=1000)
  tf_env = tf_py_environment.TFPyEnvironment(gym_wrapper.GymWrapper(gym_env))
  return tf_env
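A minimal usage sketch of env_factory; the environment id is illustrative, and the imports simply mirror the modules the function relies on rather than the original file's import list.

# Hypothetical usage sketch (not from the original file).
import gym
from gym.wrappers import time_limit
from tf_agents.environments import gym_wrapper, tf_py_environment

tf_env = env_factory('HalfCheetah-v2')  # illustrative environment id
time_step = tf_env.reset()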
def __init__(self, env_params):
    self.name = env_params['environment']
    self.eval_interval = env_params['EvalIntervalMilSteps'] * 1000000
    self.eval_episodes = env_params['EvalEpisodes']

    if env_params['environment'] == "MountainCarContinuous-v1":
        self.instance = MCv1()
    elif env_params['environment'] == "Pendulum-v1":
        self.instance = time_limit.TimeLimit(pendulum_v1.PendulumEnv(), 200)
    else:
        self.instance = gym.make(env_params['environment'])

    # total number of steps allowed in a run
    self.TOTAL_STEPS_LIMIT = env_params['TotalMilSteps'] * 1000000
    # self.TOTAL_EPISODES_LIMIT = env_params['TotalEpisodes']

    # maximum number of steps allowed for each episode;
    # if -1, keep gym's default setting
    if env_params['EpisodeSteps'] != -1:
        self.EPISODE_STEPS_LIMIT = env_params['EpisodeSteps']
        self.instance._max_episode_steps = env_params['EpisodeSteps']
    else:
        self.EPISODE_STEPS_LIMIT = self.instance._max_episode_steps

    # state info
    self.state_dim = self.get_state_dim()
    self.state_range = self.get_state_range()
    self.state_min = self.get_state_min()
    self.state_max = self.get_state_max()
    self.state_bounded = not (
        np.any(np.isinf(self.instance.observation_space.high)) or
        np.any(np.isinf(self.instance.observation_space.low)))

    # action info
    self.action_dim = self.get_action_dim()
    self.action_range = self.get_action_range()
    self.action_min = self.get_action_min()
    self.action_max = self.get_action_max()
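A hedged usage sketch of the wrapper above. The class name `Environment` is a placeholder (the snippet only shows __init__), and the parameter values are illustrative; only the dictionary keys are taken from the constructor.

# Illustrative only; `Environment` is an assumed name for the class above.
env_params = {
    'environment': 'HalfCheetah-v2',   # falls through to the gym.make branch
    'EvalIntervalMilSteps': 0.01,      # evaluate every 10,000 steps
    'EvalEpisodes': 5,
    'TotalMilSteps': 1.0,              # 1,000,000 total steps
    'EpisodeSteps': -1,                # keep gym's default episode length
}
env = Environment(env_params)
print(env.state_dim, env.action_dim, env.EPISODE_STEPS_LIMIT)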
                       action_permutation=act_perm, soft_mirror=soft_mirror)


if __name__ == '__main__':
    append_o = []  # [0.4458616, 0.63732893, 0.98086248, 0.94058195, 0.01685923]
    if len(sys.argv) > 1:
        if sys.argv[1] == 'Minitaur':
            from pybullet_envs.minitaur.envs import minitaur_reactive_env
            from gym.wrappers import time_limit
            env = time_limit.TimeLimit(
                minitaur_reactive_env.MinitaurReactiveEnv(
                    render=True,
                    accurate_motor_model_enabled=True,
                    urdf_version='rainbow_dash_v0',
                    train_UP=len(append_o) > 0,
                    resample_MP=False),
                max_episode_steps=1000)
        else:
            env = gym.make(sys.argv[1])
    else:
        env = gym.make('DartWalker3dRestricted-v1')

    if len(append_o) > 0 and sys.argv[1] != 'Minitaur':
        from gym import spaces
        env.env.obs_dim += len(append_o)
        high = np.inf * np.ones(env.env.obs_dim)
        low = -high
        env.env.observation_space = spaces.Box(low, high)
        env.observation_space = spaces.Box(low, high)
def episode_limit(env):
    # `max_steps` is not a parameter here; it is assumed to be defined in the
    # enclosing scope (e.g. parsed from the command line by the surrounding
    # script).
    return time_limit.TimeLimit(env, max_episode_steps=max_steps)
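A small illustrative call; the environment id and the value of max_steps are assumptions, and max_steps must be visible in the scope where episode_limit is defined.

# Illustrative usage only.
import gym

max_steps = 1000  # assumed value; normally supplied by the surrounding script
env = episode_limit(gym.make('Hopper-v2'))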
policy_path = args.policy_path
osi_iteration = args.osi_iteration
training_sample_num = args.training_sample_num
dyn_params = args.dyn_params

# Set up the environments; the Minitaur environment is configured differently.
if args.env == 'Minitaur':
    from pybullet_envs.minitaur.envs import minitaur_reactive_env
    from gym.wrappers import time_limit
    env_hist = time_limit.TimeLimit(
        minitaur_reactive_env.MinitaurReactiveEnv(
            render=False,
            accurate_motor_model_enabled=True,
            urdf_version='rainbow_dash_v0',
            include_obs_history=OSI_hist,
            include_act_history=0,
            train_UP=False),
        max_episode_steps=1000)
    env_up = time_limit.TimeLimit(
        minitaur_reactive_env.MinitaurReactiveEnv(
            render=False,
            accurate_motor_model_enabled=True,
            urdf_version='rainbow_dash_v0',
            include_obs_history=1,
            include_act_history=0,
            train_UP=True),
        max_episode_steps=1000)
else:
    env_hist = gym.make(args.env)
def main(_):
  tf.random.set_seed(FLAGS.seed)
  np.random.seed(FLAGS.seed)

  hparam_str = make_hparam_string(seed=FLAGS.seed, env_name=FLAGS.env_name)
  summary_writer = tf.summary.create_file_writer(
      os.path.join(FLAGS.save_dir, 'tb', hparam_str))
  summary_writer.set_as_default()

  if FLAGS.d4rl:
    d4rl_env = gym.make(FLAGS.env_name)
    gym_spec = gym.spec(FLAGS.env_name)
    if gym_spec.max_episode_steps in [0, None]:
      # Add TimeLimit wrapper.
      gym_env = time_limit.TimeLimit(d4rl_env, max_episode_steps=1000)
    else:
      gym_env = d4rl_env
    gym_env.seed(FLAGS.seed)
    env = tf_py_environment.TFPyEnvironment(gym_wrapper.GymWrapper(gym_env))

    behavior_dataset = D4rlDataset(
        d4rl_env,
        normalize_states=FLAGS.normalize_states,
        normalize_rewards=FLAGS.normalize_rewards,
        noise_scale=FLAGS.noise_scale,
        bootstrap=FLAGS.bootstrap)
  else:
    env = suite_mujoco.load(FLAGS.env_name)
    env.seed(FLAGS.seed)
    env = tf_py_environment.TFPyEnvironment(env)

    data_file_name = os.path.join(
        FLAGS.data_dir, FLAGS.env_name, '0',
        f'dualdice_{FLAGS.behavior_policy_std}.pckl')
    behavior_dataset = Dataset(
        data_file_name,
        FLAGS.num_trajectories,
        normalize_states=FLAGS.normalize_states,
        normalize_rewards=FLAGS.normalize_rewards,
        noise_scale=FLAGS.noise_scale,
        bootstrap=FLAGS.bootstrap)

  tf_dataset = behavior_dataset.with_uniform_sampling(FLAGS.sample_batch_size)
  tf_dataset_iter = iter(tf_dataset)

  if FLAGS.d4rl:
    with tf.io.gfile.GFile(FLAGS.d4rl_policy_filename, 'rb') as f:
      policy_weights = pickle.load(f)
    actor = utils.D4rlActor(
        env, policy_weights, is_dapg='dapg' in FLAGS.d4rl_policy_filename)
  else:
    actor = Actor(env.observation_spec().shape[0], env.action_spec())
    actor.load_weights(behavior_dataset.model_filename)

  policy_returns = utils.estimate_monte_carlo_returns(
      env, FLAGS.discount, actor, FLAGS.target_policy_std,
      FLAGS.num_mc_episodes)
  logging.info('Estimated Per-Step Average Returns=%f', policy_returns)

  if 'fqe' in FLAGS.algo or 'dr' in FLAGS.algo:
    model = QFitter(env.observation_spec().shape[0],
                    env.action_spec().shape[0], FLAGS.lr, FLAGS.weight_decay,
                    FLAGS.tau)
  elif 'mb' in FLAGS.algo:
    model = ModelBased(env.observation_spec().shape[0],
                       env.action_spec().shape[0],
                       learning_rate=FLAGS.lr,
                       weight_decay=FLAGS.weight_decay)
  elif 'dual_dice' in FLAGS.algo:
    model = DualDICE(env.observation_spec().shape[0],
                     env.action_spec().shape[0], FLAGS.weight_decay)
  if 'iw' in FLAGS.algo or 'dr' in FLAGS.algo:
    behavior = BehaviorCloning(env.observation_spec().shape[0],
                               env.action_spec(), FLAGS.lr,
                               FLAGS.weight_decay)

  @tf.function
  def get_target_actions(states):
    return actor(
        tf.cast(behavior_dataset.unnormalize_states(states),
                env.observation_spec().dtype),
        std=FLAGS.target_policy_std)[1]

  @tf.function
  def get_target_logprobs(states, actions):
    log_probs = actor(
        tf.cast(behavior_dataset.unnormalize_states(states),
                env.observation_spec().dtype),
        actions=actions,
        std=FLAGS.target_policy_std)[2]
    if tf.rank(log_probs) > 1:
      log_probs = tf.reduce_sum(log_probs, -1)
    return log_probs

  min_reward = tf.reduce_min(behavior_dataset.rewards)
  max_reward = tf.reduce_max(behavior_dataset.rewards)
  min_state = tf.reduce_min(behavior_dataset.states, 0)
  max_state = tf.reduce_max(behavior_dataset.states, 0)

  @tf.function
  def update_step():
    (states, actions, next_states, rewards, masks, weights,
     _) = next(tf_dataset_iter)
    initial_actions = get_target_actions(behavior_dataset.initial_states)
    next_actions = get_target_actions(next_states)

    if 'fqe' in FLAGS.algo or 'dr' in FLAGS.algo:
      model.update(states, actions, next_states, next_actions, rewards, masks,
                   weights, FLAGS.discount, min_reward, max_reward)
    elif 'mb' in FLAGS.algo:
      model.update(states, actions, next_states, rewards, masks, weights)
    elif 'dual_dice' in FLAGS.algo:
      model.update(behavior_dataset.initial_states, initial_actions,
                   behavior_dataset.initial_weights, states, actions,
                   next_states, next_actions, masks, weights, FLAGS.discount)

    if 'iw' in FLAGS.algo or 'dr' in FLAGS.algo:
      behavior.update(states, actions, weights)

  gc.collect()

  for i in tqdm.tqdm(range(FLAGS.num_updates), desc='Running Training'):
    update_step()

    if i % FLAGS.eval_interval == 0:
      if 'fqe' in FLAGS.algo:
        pred_returns = model.estimate_returns(behavior_dataset.initial_states,
                                              behavior_dataset.initial_weights,
                                              get_target_actions)
      elif 'mb' in FLAGS.algo:
        pred_returns = model.estimate_returns(behavior_dataset.initial_states,
                                              behavior_dataset.initial_weights,
                                              get_target_actions,
                                              FLAGS.discount, min_reward,
                                              max_reward, min_state, max_state)
      elif FLAGS.algo in ['dual_dice']:
        pred_returns, pred_ratio = model.estimate_returns(iter(tf_dataset))
        tf.summary.scalar('train/pred ratio', pred_ratio, step=i)
      elif 'iw' in FLAGS.algo or 'dr' in FLAGS.algo:
        discount = FLAGS.discount
        _, behavior_log_probs = behavior(behavior_dataset.states,
                                         behavior_dataset.actions)
        target_log_probs = get_target_logprobs(behavior_dataset.states,
                                               behavior_dataset.actions)

        offset = 0.0
        rewards = behavior_dataset.rewards
        if 'dr' in FLAGS.algo:
          # Doubly-robust is effectively the same as importance-weighting but
          # transforming rewards at (s,a) to r(s,a) + gamma * V^pi(s') -
          # Q^pi(s,a) and adding an offset to each trajectory equal to
          # V^pi(s0).
          offset = model.estimate_returns(behavior_dataset.initial_states,
                                          behavior_dataset.initial_weights,
                                          get_target_actions)
          q_values = (model(behavior_dataset.states, behavior_dataset.actions) /
                      (1 - discount))
          n_samples = 10
          next_actions = [
              get_target_actions(behavior_dataset.next_states)
              for _ in range(n_samples)
          ]
          next_q_values = sum([
              model(behavior_dataset.next_states, next_action) / (1 - discount)
              for next_action in next_actions
          ]) / n_samples
          rewards = rewards + discount * next_q_values - q_values

        # Now we compute the self-normalized importance weights.
        # Self-normalization happens over trajectories per-step, so we
        # restructure the dataset as [num_trajectories, num_steps].
        num_trajectories = len(behavior_dataset.initial_states)
        max_trajectory_length = np.max(behavior_dataset.steps) + 1
        trajectory_weights = behavior_dataset.initial_weights
        trajectory_starts = np.where(np.equal(behavior_dataset.steps, 0))[0]

        batched_rewards = np.zeros([num_trajectories, max_trajectory_length])
        batched_masks = np.zeros([num_trajectories, max_trajectory_length])
        batched_log_probs = np.zeros(
            [num_trajectories, max_trajectory_length])

        for traj_idx, traj_start in enumerate(trajectory_starts):
          traj_end = (trajectory_starts[traj_idx + 1]
                      if traj_idx + 1 < len(trajectory_starts) else
                      len(rewards))
          traj_length = traj_end - traj_start
          batched_rewards[traj_idx, :traj_length] = rewards[
              traj_start:traj_end]
          batched_masks[traj_idx, :traj_length] = 1.
          batched_log_probs[traj_idx, :traj_length] = (
              -behavior_log_probs[traj_start:traj_end] +
              target_log_probs[traj_start:traj_end])

        batched_weights = (
            batched_masks *
            (discount**np.arange(max_trajectory_length))[None, :])

        clipped_log_probs = np.clip(batched_log_probs, -6., 2.)
        cum_log_probs = batched_masks * np.cumsum(clipped_log_probs, axis=1)
        cum_log_probs_offset = np.max(cum_log_probs, axis=0)
        cum_probs = np.exp(cum_log_probs - cum_log_probs_offset[None, :])
        avg_cum_probs = (
            np.sum(cum_probs * trajectory_weights[:, None], axis=0) /
            (1e-10 + np.sum(batched_masks * trajectory_weights[:, None],
                            axis=0)))
        norm_cum_probs = cum_probs / (1e-10 + avg_cum_probs[None, :])

        weighted_rewards = batched_weights * batched_rewards * norm_cum_probs
        trajectory_values = np.sum(weighted_rewards, axis=1)
        avg_trajectory_value = (
            (1 - discount) *
            np.sum(trajectory_values * trajectory_weights) /
            np.sum(trajectory_weights))
        pred_returns = offset + avg_trajectory_value

      pred_returns = behavior_dataset.unnormalize_rewards(pred_returns)

      tf.summary.scalar('train/pred returns', pred_returns, step=i)
      logging.info('pred returns=%f', pred_returns)
      tf.summary.scalar('train/true minus pred returns',
                        policy_returns - pred_returns, step=i)
      logging.info('true minus pred returns=%f',
                   policy_returns - pred_returns)
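A toy numpy sketch of the self-normalized per-step importance weights computed above; the trajectories, log ratios, and uniform trajectory weights are made up for illustration and do not come from the original file.

# Illustrative only: two fake trajectories of length 3 with per-step
# log density ratios log pi(a|s) - log mu(a|s).
import numpy as np

log_ratios = np.array([[0.2, -0.1, 0.3],
                       [-0.4, 0.5, 0.1]])
# Cumulative log ratio up to each step, clipped as in the code above.
cum_log = np.cumsum(np.clip(log_ratios, -6., 2.), axis=1)
# Subtract the per-step max over trajectories for numerical stability.
cum_probs = np.exp(cum_log - cum_log.max(axis=0, keepdims=True))
# Self-normalize: divide by the per-step average over trajectories
# (uniform trajectory weights in this toy example).
norm_weights = cum_probs / (1e-10 + cum_probs.mean(axis=0, keepdims=True))
print(norm_weights)  # per-trajectory, per-step normalized importance weights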
def train(env_id, num_timesteps, seed, batch_size, clip, schedule, mirror,
          warmstart, train_up, dyn_params):
    from policy_transfer.ppo import ppo_sgd
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)

    if env_id == 'Minitaur':
        from pybullet_envs.minitaur.envs import minitaur_reactive_env
        from gym.wrappers import time_limit
        env = time_limit.TimeLimit(
            minitaur_reactive_env.MinitaurReactiveEnv(
                render=False,
                accurate_motor_model_enabled=True,
                urdf_version='rainbow_dash_v0',
                train_UP=False,
                resample_MP=False),
            max_episode_steps=1000)
    else:
        env = gym.make(env_id)

    if train_up:
        if env.env.train_UP is not True:
            env.env.train_UP = True
            env.env.resample_MP = True

            from gym import spaces
            env.env.param_manager.activated_param = dyn_params
            env.env.param_manager.controllable_param = dyn_params
            env.env.obs_dim += len(env.env.param_manager.activated_param)
            high = np.inf * np.ones(env.env.obs_dim)
            low = -high
            env.env.observation_space = spaces.Box(low, high)
            env.observation_space = spaces.Box(low, high)

            if hasattr(env.env, 'obs_perm'):
                obpermapp = np.arange(
                    len(env.env.obs_perm),
                    len(env.env.obs_perm) +
                    len(env.env.param_manager.activated_param))
                env.env.obs_perm = np.concatenate([env.env.obs_perm, obpermapp])

    with open(logger.get_dir() + "/envinfo.txt", "w") as text_file:
        text_file.write(str(env.env.__dict__))

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                         hid_size=64, num_hid_layers=3)

    def policy_mirror_fn(name, ob_space, ac_space):
        return MirrorPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                            hid_size=64, num_hid_layers=3,
                            observation_permutation=env.env.env.obs_perm,
                            action_permutation=env.env.env.act_perm,
                            soft_mirror=(mirror == 2))

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"),
        allow_early_resets=True)
    env.seed(seed + MPI.COMM_WORLD.Get_rank())
    gym.logger.setLevel(logging.WARN)

    if mirror:
        pol_func = policy_mirror_fn
    else:
        pol_func = policy_fn

    if len(warmstart) > 0:
        warmstart_params = joblib.load(warmstart)
    else:
        warmstart_params = None

    ppo_sgd.learn(
        env, pol_func,
        max_timesteps=num_timesteps,
        timesteps_per_batch=int(batch_size),
        clip_param=clip,
        entcoeff=0.0,
        optim_epochs=5,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule=schedule,
        callback=callback,
        init_policy_params=warmstart_params,
    )
    env.close()
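A plausible invocation of train(); the argument values and the dyn_params indices are illustrative assumptions, only the environment id appears elsewhere in these snippets.

# Illustrative call only; values are not taken from the original experiments.
train(env_id='DartWalker3dRestricted-v1',
      num_timesteps=2000000,
      seed=0,
      batch_size=4000,
      clip=0.2,
      schedule='linear',
      mirror=0,
      warmstart='',
      train_up=True,
      dyn_params=[0, 1])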
run_cma = args.run_cma == 'True'
max_step = args.max_step
use_sparse_rew = args.sparse_rew == 'True'

if args.env == 'Minitaur':
    from pybullet_envs.minitaur.envs import minitaur_reactive_env
    from gym.wrappers import time_limit
    obs_in = 1
    act_in = 0
    if testing_mode == 'HIST':
        obs_in = 10
        act_in = 10
    env = time_limit.TimeLimit(
        minitaur_reactive_env.MinitaurReactiveEnv(
            render=False,
            accurate_motor_model_enabled=True,
            urdf_version='rainbow_dash_v0',
            include_obs_history=obs_in,
            include_act_history=act_in,
            train_UP=False,
            resample_MP=False),
        max_episode_steps=1000)
else:
    env = gym.make(args.env)

if hasattr(env.env, 'disableViewer'):
    env.env.disableViewer = False


def policy_fn(name, ob_space, ac_space):
    hid_size = 64
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                ac_space=ac_space, hid_size=hid_size,
                                num_hid_layers=3)


def policy_mirror_fn(name, ob_space, ac_space):
    obpermapp = np.arange(len(env.env.obs_perm),
                          len(env.env.obs_perm) + UP_dim)
    ob_perm = np.concatenate([env.env.obs_perm, obpermapp])