def run_experiment(args):
    torch.set_num_threads(1)  # see: https://github.com/pytorch/pytorch/issues/13757

    from apex import env_factory, create_logger

    # # Environment
    # if(args.env in ["Cassie-v0", "Cassie-mimic-v0", "Cassie-mimic-walking-v0"]):
    #     # NOTE: importing cassie for some reason breaks openai gym, BUG?
    #     from cassie import CassieEnv, CassieTSEnv, CassieIKEnv
    #     from cassie.no_delta_env import CassieEnv_nodelta
    #     from cassie.speed_env import CassieEnv_speed
    #     from cassie.speed_double_freq_env import CassieEnv_speed_dfreq
    #     from cassie.speed_no_delta_env import CassieEnv_speed_no_delta
    #
    #     # set up cassie environment
    #     # import gym_cassie
    #     # env_fn = gym_factory(args.env_name)
    #     # env_fn = make_env_fn(state_est=args.state_est)
    #     # env_fn = functools.partial(CassieEnv_speed_dfreq, "walking", clock_based=True, state_est=args.state_est)
    #     env_fn = functools.partial(CassieIKEnv, clock_based=True, state_est=args.state_est)
    #     print(env_fn().clock_inds)
    #     obs_dim = env_fn().observation_space.shape[0]
    #     action_dim = env_fn().action_space.shape[0]
    #
    #     # Mirror Loss
    #     if args.mirror:
    #         if args.state_est:
    #             # with state estimator
    #             env_fn = functools.partial(
    #                 SymmetricEnv, env_fn,
    #                 mirrored_obs=[0.1, 1, 2, 3, 4, -10, -11, 12, 13, 14, -5, -6, 7, 8, 9, 15, 16, 17,
    #                               18, 19, 20, -26, -27, 28, 29, 30, -21, -22, 23, 24, 25, 31, 32, 33,
    #                               37, 38, 39, 34, 35, 36, 43, 44, 45, 40, 41, 42, 46, 47, 48],
    #                 mirrored_act=[-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4])
    #         else:
    #             # without state estimator
    #             env_fn = functools.partial(
    #                 SymmetricEnv, env_fn,
    #                 mirrored_obs=[0.1, 1, 2, 3, 4, 5, -13, -14, 15, 16, 17, 18, 19, -6, -7, 8, 9, 10,
    #                               11, 12, 20, 21, 22, 23, 24, 25, -33, -34, 35, 36, 37, 38, 39, -26,
    #                               -27, 28, 29, 30, 31, 32, 40, 41, 42],
    #                 mirrored_act=[-5, -6, 7, 8, 9, -0.1, -1, 2, 3, 4])
    # else:
    #     import gym
    #     env_fn = gym_factory(args.env_name)
    #     # max_episode_steps = env_fn()._max_episode_steps
    #     obs_dim = env_fn().observation_space.shape[0]
    #     action_dim = env_fn().action_space.shape[0]
    #     max_episode_steps = 1000

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name, state_est=args.state_est, mirror=args.mirror, speed=args.speed)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.previous is not None:
        # NOTE: this only restores the actor; policy_copy and critic are not reloaded here
        policy = torch.load(args.previous)
        print("loaded model from {}".format(args.previous))
    else:
        policy = GaussianMLP_Actor(obs_dim, action_dim, env_name=args.env_name,
                                   nonlinearity=torch.nn.functional.relu, bounded=True,
                                   init_std=np.exp(-2), learn_std=False, normc_init=False)
        policy_copy = GaussianMLP_Actor(obs_dim, action_dim, env_name=args.env_name,
                                        nonlinearity=torch.nn.functional.relu, bounded=True,
                                        init_std=np.exp(-2), learn_std=False, normc_init=False)
        critic = GaussianMLP_Critic(obs_dim, env_name=args.env_name,
                                    nonlinearity=torch.nn.functional.relu, bounded=True,
                                    init_std=np.exp(-2), learn_std=False, normc_init=False)

        # estimate observation normalization statistics once and share them across networks
        policy.obs_mean, policy.obs_std = map(torch.Tensor,
                                              get_normalization_params(iter=args.input_norm_steps,
                                                                       noise_std=1,
                                                                       policy=policy,
                                                                       env_fn=env_fn))
        critic.obs_mean = policy.obs_mean
        policy_copy.obs_mean = policy.obs_mean
        critic.obs_std = policy.obs_std
        policy_copy.obs_std = policy.obs_std

        # put the networks in evaluation mode (train(False), rather than train(0), for newer PyTorch)
        policy_copy.train(False)
        policy.train(False)
        critic.train(False)

    print("obs_dim: {}, action_dim: {}".format(obs_dim, action_dim))

    if args.mirror:
        algo = MirrorPPO(args=vars(args))
    else:
        algo = PPO(args=vars(args))

    # create a tensorboard logging object
    logger = create_logger(args)

    print()
    print("Synchronous Distributed Proximal Policy Optimization:")
    print("\tenv: {}".format(args.env_name))
    print("\tmax traj len: {}".format(args.max_traj_len))
    print("\tseed: {}".format(args.seed))
    print("\tmirror: {}".format(args.mirror))
    print("\tnum procs: {}".format(args.num_procs))
    print("\tlr: {}".format(args.lr))
    print("\teps: {}".format(args.eps))
    print("\tlam: {}".format(args.lam))
    print("\tgamma: {}".format(args.gamma))
    print("\tentropy coeff: {}".format(args.entropy_coeff))
    print("\tclip: {}".format(args.clip))
    print("\tminibatch size: {}".format(args.minibatch_size))
    print("\tepochs: {}".format(args.epochs))
    print("\tnum steps: {}".format(args.num_steps))
    print("\tuse gae: {}".format(args.use_gae))
    print("\tmax grad norm: {}".format(args.max_grad_norm))
    print()

    algo.train(env_fn, policy, policy_copy, critic, args.n_itr, logger=logger)
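# Illustrative sketch (not part of the training code above): the commented-out
# SymmetricEnv wrapper encodes the left/right mirror map as signed index vectors
# (mirrored_obs / mirrored_act). Assuming the usual convention in this codebase,
# abs(i) selects the source index and the sign negates the value, with 0.1 standing
# in for index 0 so that its sign remains meaningful. Under that assumption the
# mirror transform used by the symmetry loss in MirrorPPO would look like:
def _mirror_example(vec, mirror_inds):
    import numpy as np
    mirror_inds = np.asarray(mirror_inds, dtype=np.float64)
    vec = np.asarray(vec)
    # e.g. _mirror_example(obs, mirrored_obs) gives the mirrored observation
    return np.sign(mirror_inds) * vec[np.abs(mirror_inds).astype(int)]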
def __init__(self, args):
    self.logger = create_logger(args)
def run_experiment(args):
    from apex import env_factory, create_logger

    # wrapper function for creating parallelized envs
    env_thunk = env_factory(args.env_name)
    with env_thunk() as env:
        obs_space = env.observation_space.shape[0]
        act_space = env.action_space.shape[0]

    # wrapper function for creating parallelized policies
    def policy_thunk():
        from rl.policies.actor import FF_Actor, LSTM_Actor, Linear_Actor

        if args.load_model is not None:
            return torch.load(args.load_model)
        else:
            if not args.recurrent:
                policy = Linear_Actor(obs_space, act_space, hidden_size=args.hidden_size).float()
            else:
                policy = LSTM_Actor(obs_space, act_space, hidden_size=args.hidden_size).float()

            # policy parameters should be zero initialized according to ARS paper
            for p in policy.parameters():
                p.data = torch.zeros(p.shape)
            return policy

    # the 'black box' function that will get passed into ARS
    def eval_fn(policy, env, reward_shift, traj_len, visualize=False, normalize=False):
        if hasattr(policy, 'init_hidden_state'):
            policy.init_hidden_state()

        state = torch.tensor(env.reset()).float()
        rollout_reward = 0
        done = False
        timesteps = 0
        while not done and timesteps < traj_len:
            if normalize:
                state = policy.normalize_state(state)
            action = policy.forward(state).detach().numpy()
            state, reward, done, _ = env.step(action)
            state = torch.tensor(state).float()

            rollout_reward += reward - reward_shift
            timesteps += 1
        return rollout_reward, timesteps

    import locale, os, time
    locale.setlocale(locale.LC_ALL, '')

    print("Augmented Random Search:")
    print("\tenv: {}".format(args.env_name))
    print("\tseed: {}".format(args.seed))
    print("\ttimesteps: {:n}".format(args.timesteps))
    print("\tstd: {}".format(args.std))
    print("\tdeltas: {}".format(args.deltas))
    print("\tstep size: {}".format(args.lr))
    print("\treward shift: {}".format(args.reward_shift))
    print()

    algo = ARS(policy_thunk, env_thunk, deltas=args.deltas, step_size=args.lr, std=args.std,
               workers=args.workers, redis_addr=args.redis)

    if args.algo not in ['v1', 'v2']:
        print("Valid arguments for --algo are 'v1' and 'v2'")
        exit(1)
    elif args.algo == 'v2':
        normalize_states = True  # ARS V2 normalizes states before querying the policy
    else:
        normalize_states = False

    def black_box(p, env):
        return eval_fn(p, env, args.reward_shift, args.traj_len, normalize=normalize_states)

    avg_reward = 0
    timesteps = 0
    i = 0

    logger = create_logger(args)

    # if args.save_model is None:
    #     args.save_model = os.path.join(logger.dir, 'actor.pt')
    args.save_model = os.path.join(logger.dir, 'actor.pt')

    env = env_thunk()
    while timesteps < args.timesteps:
        if not i % args.average_every:
            avg_reward = 0
            print()

        start = time.time()
        samples = algo.step(black_box)
        elapsed = time.time() - start

        # evaluate the current policy over 10 rollouts
        iter_reward = 0
        for eval_rollout in range(10):
            reward, _ = eval_fn(algo.policy, env, 0, args.traj_len, normalize=normalize_states)
            iter_reward += reward / 10

        timesteps += samples
        avg_reward += iter_reward
        secs_per_sample = 1000 * elapsed / samples
        print(("iter {:4d} | "
               "ret {:6.2f} | "
               "last {:3d} iters: {:6.2f} | "
               "{:0.4f}s per 1k steps | "
               "timesteps {:10n}").format(i + 1,
                                          iter_reward,
                                          (i % args.average_every) + 1,
                                          avg_reward / ((i % args.average_every) + 1),
                                          secs_per_sample,
                                          timesteps), end="\r")
        i += 1

        logger.add_scalar('eval', iter_reward, timesteps)
        torch.save(algo.policy, args.save_model)
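# Illustrative sketch (not the ARS class used above): a single, serial ARS update as
# described in the ARS paper, included only to clarify what the --deltas, --std, and
# --lr (step size) arguments control. The distributed implementation driven by
# algo.step() above may differ (Redis workers, top-direction selection, reward-std
# scaling), so treat this as an assumption-laden example, not the repo's algorithm.
def _ars_update_example(params, rollout, num_deltas, std, step_size):
    """params: flat numpy parameter vector; rollout(p) -> scalar return for params p."""
    import numpy as np
    directions = [np.random.randn(*params.shape) for _ in range(num_deltas)]
    r_plus = np.array([rollout(params + std * d) for d in directions])
    r_minus = np.array([rollout(params - std * d) for d in directions])
    # finite-difference estimate of the return gradient along the sampled directions
    grad = sum((rp - rm) * d for rp, rm, d in zip(r_plus, r_minus, directions))
    return params + step_size / num_deltas * grad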
def run_experiment(args):
    from time import time

    from apex import env_factory, create_logger
    from rl.policies.critic import FF_Critic, LSTM_Critic
    from rl.policies.actor import FF_Actor, LSTM_Actor

    import locale, os, random
    locale.setlocale(locale.LC_ALL, '')

    # wrapper function for creating parallelized envs
    env = env_factory(args.env_name)()
    eval_env = env_factory(args.env_name)()

    # Set seeds
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if hasattr(env, 'seed'):
        env.seed(args.seed)

    obs_space = env.observation_space.shape[0]
    act_space = env.action_space.shape[0]

    if args.recurrent:
        actor = LSTM_Actor(obs_space, act_space, hidden_size=args.hidden_size,
                           env_name=args.env_name, hidden_layers=args.layers)
        critic = LSTM_Critic(obs_space, act_space, hidden_size=args.hidden_size,
                             env_name=args.env_name, hidden_layers=args.layers)
    else:
        actor = FF_Actor(obs_space, act_space, hidden_size=args.hidden_size,
                         env_name=args.env_name, hidden_layers=args.layers)
        critic = FF_Critic(obs_space, act_space, hidden_size=args.hidden_size,
                           env_name=args.env_name, hidden_layers=args.layers)

    algo = DPG(actor, critic, args.a_lr, args.c_lr, discount=args.discount, tau=args.tau,
               center_reward=args.center_reward, normalize=args.normalize)

    replay_buff = ReplayBuffer(obs_space, act_space, args.timesteps)

    if algo.recurrent:
        print("Recurrent Deterministic Policy Gradients:")
    else:
        print("Deep Deterministic Policy Gradients:")
    print("\tenv: {}".format(args.env_name))
    print("\tseed: {}".format(args.seed))
    print("\ttimesteps: {:n}".format(args.timesteps))
    print("\tactor_lr: {}".format(args.a_lr))
    print("\tcritic_lr: {}".format(args.c_lr))
    print("\tdiscount: {}".format(args.discount))
    print("\ttau: {}".format(args.tau))
    print("\tnorm reward: {}".format(args.center_reward))
    print("\tbatch_size: {}".format(args.batch_size))
    print("\twarmup period: {:n}".format(args.start_timesteps))
    print()

    iter = 0
    episode_reward = 0
    episode_timesteps = 0

    # create a tensorboard logging object
    logger = create_logger(args)

    if args.save_actor is None:
        args.save_actor = os.path.join(logger.dir, 'actor.pt')

    if args.save_critic is None:
        args.save_critic = os.path.join(logger.dir, 'critic.pt')

    # Keep track of some statistics for each episode
    training_start = time()
    episode_start = time()
    episode_loss = 0
    update_steps = 0
    best_reward = None

    # Fill replay buffer, update policy until n timesteps have passed
    timesteps = 0
    state = env.reset().astype(np.float32)
    while timesteps < args.timesteps:
        buffer_ready = (algo.recurrent and iter > args.batch_size) or \
                       (not algo.recurrent and replay_buff.size > args.batch_size)
        warmup = timesteps < args.start_timesteps

        state, r, done = collect_experience(algo.behavioral_actor, env, replay_buff, state,
                                            episode_timesteps, max_len=args.traj_len,
                                            random_action=warmup, noise=args.expl_noise,
                                            do_trajectory=algo.recurrent, normalize=algo.normalize)
        episode_reward += r
        episode_timesteps += 1
        timesteps += 1

        # Update the policy once our replay buffer is big enough
        if buffer_ready and done and not warmup:
            update_steps = 0
            if not algo.recurrent:
                num_updates = episode_timesteps * args.updates
            else:
                num_updates = args.updates

            for _ in range(num_updates):
                u_loss, u_steps = algo.update_policy(replay_buff, args.batch_size, traj_len=args.traj_len)
                episode_loss += u_loss / num_updates
                update_steps += u_steps

        if done:
            episode_elapsed = (time() - episode_start)
            episode_secs_per_sample = episode_elapsed / episode_timesteps

            logger.add_scalar(args.env_name + ' episode length', episode_timesteps, iter)
            logger.add_scalar(args.env_name + ' episode reward', episode_reward, iter)
            logger.add_scalar(args.env_name + ' critic loss', episode_loss, iter)

            # rough estimate of remaining wall-clock time
            completion = 1 - float(timesteps) / args.timesteps
            avg_sample_r = (time() - training_start) / timesteps
            secs_remaining = avg_sample_r * args.timesteps * completion
            hrs_remaining = int(secs_remaining // (60 * 60))
            min_remaining = int(secs_remaining - hrs_remaining * 60 * 60) // 60

            if iter % args.eval_every == 0 and iter != 0:
                eval_reward = eval_policy(algo.behavioral_actor, eval_env, max_traj_len=args.traj_len)

                logger.add_scalar(args.env_name + ' eval episode', eval_reward, iter)
                logger.add_scalar(args.env_name + ' eval timestep', eval_reward, timesteps)

                print("evaluation after {:4d} episodes | return: {:7.3f} | timesteps {:9n}{:100s}".format(
                    iter, eval_reward, timesteps, ''))

                if best_reward is None or eval_reward > best_reward:
                    torch.save(algo.behavioral_actor, args.save_actor)
                    torch.save(algo.behavioral_critic, args.save_critic)
                    best_reward = eval_reward
                    print("\t(best policy so far! saving to {})".format(args.save_actor))

        try:
            print("episode {:5d} | episode timestep {:5d}/{:5d} | return {:5.1f} | "
                  "update timesteps: {:7n} | {:3.1f}s/1k samples | "
                  "approx. {:3d}h {:02d}m remain\t\t\t\t".format(
                      iter, episode_timesteps, args.traj_len, episode_reward, update_steps,
                      1000 * episode_secs_per_sample, hrs_remaining, min_remaining), end='\r')
        except NameError:
            pass

        if done:
            if hasattr(algo.behavioral_actor, 'init_hidden_state'):
                algo.behavioral_actor.init_hidden_state()

            episode_start, episode_reward, episode_timesteps, episode_loss = time(), 0, 0, 0
            iter += 1
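# Illustrative sketch (not the DPG implementation above): the tau argument printed and
# passed to DPG controls the soft "Polyak" update of the target networks in DDPG-style
# algorithms. The helper below only demonstrates that standard update rule; target_net
# and online_net are placeholder names, not objects defined in this file.
def _polyak_update_example(target_net, online_net, tau):
    import torch
    # target <- (1 - tau) * target + tau * online, applied parameter-wise, so the
    # target network slowly tracks the online network and stabilizes bootstrapped targets
    with torch.no_grad():
        for target_p, online_p in zip(target_net.parameters(), online_net.parameters()):
            target_p.mul_(1.0 - tau).add_(tau * online_p)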
def run_experiment(args):
    from apex import env_factory, create_logger

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name, state_est=args.state_est, mirror=args.mirror, history=args.history)
    max_traj_len = args.max_traj_len

    # Start ray (NOTE: include_webui/redis_address are arguments from older Ray releases)
    ray.init(num_gpus=0, include_webui=True, redis_address=args.redis_address)

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    max_action = 1.0
    # max_action = float(env.action_space.high[0])

    print()
    print("Synchronous Twin-Delayed Deep Deterministic policy gradients:")
    print("\tenv: {}".format(args.env_name))
    print("\tmax traj len: {}".format(args.max_traj_len))
    print("\tseed: {}".format(args.seed))
    print("\tmirror: {}".format(args.mirror))
    print("\tnum procs: {}".format(args.num_procs))
    print("\tmin steps: {}".format(args.min_steps))
    print("\ta_lr: {}".format(args.a_lr))
    print("\tc_lr: {}".format(args.c_lr))
    print("\ttau: {}".format(args.tau))
    print("\tgamma: {}".format(args.discount))
    print("\tact noise: {}".format(args.act_noise))
    print("\tparam noise: {}".format(args.param_noise))
    if args.param_noise:
        print("\tnoise scale: {}".format(args.noise_scale))
    print("\tbatch size: {}".format(args.batch_size))
    print("\tpolicy noise: {}".format(args.policy_noise))
    print("\tnoise clip: {}".format(args.noise_clip))
    print("\tpolicy freq: {}".format(args.policy_freq))
    print()

    # Initialize policy, replay buffer
    policy = TD3(state_dim, action_dim, max_action, a_lr=args.a_lr, c_lr=args.c_lr, env_name=args.env_name)
    replay_buffer = ReplayBuffer()

    # create a tensorboard logging object
    logger = create_logger(args)

    # Initialize param noise, or set to None (not referenced again in this function)
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.05,
                                         desired_action_stddev=args.noise_scale,
                                         adaptation_coefficient=1.05) if args.param_noise else None

    total_timesteps = 0
    total_updates = 0
    timesteps_since_eval = 0
    episode_num = 0

    # Evaluate untrained policy once
    ret, eplen = evaluate_policy(env_fn(), policy)
    logger.add_scalar("Test/Return", ret, total_updates)
    logger.add_scalar("Test/Eplen", eplen, total_updates)
    policy.save(logger.dir)

    while total_timesteps < args.max_timesteps:
        # collect parallel experience and add to replay buffer
        merged_transitions, episode_timesteps = parallel_collect_experience(
            policy, env_fn, args.act_noise, args.min_steps, max_traj_len, num_procs=args.num_procs)
        replay_buffer.add_parallel(merged_transitions)

        total_timesteps += episode_timesteps
        timesteps_since_eval += episode_timesteps
        episode_num += args.num_procs

        # Logging rollouts
        print("Total T: {} Episode Num: {} Episode T: {}".format(total_timesteps, episode_num, episode_timesteps))

        # update the policy
        avg_q1, avg_q2, q_loss, pi_loss, avg_action = policy.train(
            replay_buffer, episode_timesteps, args.batch_size, args.discount, args.tau,
            args.policy_noise, args.noise_clip, args.policy_freq)
        total_updates += episode_timesteps  # this is how many iterations we did updates for

        # Logging training
        logger.add_scalar("Train/avg_q1", avg_q1, total_updates)
        logger.add_scalar("Train/avg_q2", avg_q2, total_updates)
        logger.add_scalar("Train/q_loss", q_loss, total_updates)
        logger.add_scalar("Train/pi_loss", pi_loss, total_updates)
        logger.add_histogram("Train/avg_action", avg_action, total_updates)

        # Evaluate episode
        if timesteps_since_eval >= args.eval_freq:
            timesteps_since_eval = 0
            ret, eplen = evaluate_policy(env_fn(), policy)

            # Logging Eval
            logger.add_scalar("Test/Return", ret, total_updates)
            logger.add_scalar("Test/Eplen", eplen, total_updates)
            logger.add_histogram("Test/avg_action", avg_action, total_updates)

            # Logging Totals
            logger.add_scalar("Misc/Timesteps", total_timesteps, total_updates)
            logger.add_scalar("Misc/ReplaySize", replay_buffer.ptr, total_updates)

            print("Total T: {}\tEval Return: {}\t Eval Eplen: {}".format(total_timesteps, ret, eplen))

            if args.save_models:
                policy.save()

    # Final evaluation
    ret, eplen = evaluate_policy(env_fn(), policy)
    logger.add_scalar("Test/Return", ret, total_updates)
    logger.add_scalar("Test/Eplen", eplen, total_updates)

    # Final Policy Save
    if args.save_models:
        policy.save()
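# Illustrative sketch (not the TD3 class above): the policy_noise, noise_clip, and
# policy_freq arguments correspond to TD3's target policy smoothing, clipped double-Q
# targets, and delayed actor updates (the actor and target networks are only updated
# every policy_freq critic steps). The helper below shows the standard target
# computation only; actor_target, critic1_target, and critic2_target are placeholder
# callables, not objects defined in this file, and the real policy.train() may differ.
def _td3_target_example(actor_target, critic1_target, critic2_target,
                        next_state, reward, not_done,
                        discount, policy_noise, noise_clip, max_action):
    import torch
    with torch.no_grad():
        # target policy smoothing: add clipped noise to the target action
        next_action = actor_target(next_state)
        noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-max_action, max_action)
        # clipped double-Q: take the minimum of the two target critics
        target_q = torch.min(critic1_target(next_state, next_action),
                             critic2_target(next_state, next_action))
        return reward + not_done * discount * target_q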
def run_experiment(args):
    from apex import env_factory, create_logger

    torch.set_num_threads(1)

    # wrapper function for creating parallelized envs
    env_fn = env_factory(args.env_name, traj=args.traj, state_est=args.state_est,
                         dynamics_randomization=args.dyn_random, mirror=args.mirror,
                         clock_based=args.clock_based, history=args.history)
    obs_dim = env_fn().observation_space.shape[0]
    action_dim = env_fn().action_space.shape[0]

    # Set seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if args.previous is not None:
        policy = torch.load(args.previous + "actor.pt")
        critic = torch.load(args.previous + "critic.pt")
        # TODO: add ability to load previous hyperparameters, if this is something that we even want
        # with open(args.previous + "experiment.pkl", 'rb') as file:
        #     args = pickle.loads(file.read())
        print("loaded model from {}".format(args.previous))
    else:
        if args.recurrent:
            policy = Gaussian_LSTM_Actor(obs_dim, action_dim, fixed_std=np.exp(-2), env_name=args.env_name)
            critic = LSTM_V(obs_dim)
        else:
            policy = Gaussian_FF_Actor(obs_dim, action_dim, fixed_std=np.exp(-2), env_name=args.env_name)
            critic = FF_V(obs_dim)

        # estimate observation normalization statistics once and share them with the critic
        with torch.no_grad():
            policy.obs_mean, policy.obs_std = map(torch.Tensor,
                                                  get_normalization_params(iter=args.input_norm_steps,
                                                                           noise_std=1,
                                                                           policy=policy,
                                                                           env_fn=env_fn))
        critic.obs_mean = policy.obs_mean
        critic.obs_std = policy.obs_std

    print("obs_dim: {}, action_dim: {}".format(obs_dim, action_dim))

    # create a tensorboard logging object
    logger = create_logger(args)

    algo = PPO(args=vars(args), save_path=logger.dir)

    print()
    print("Synchronous Distributed Proximal Policy Optimization:")
    print("\tenv: {}".format(args.env_name))
    print("\trun name: {}".format(args.run_name))
    print("\tmax traj len: {}".format(args.max_traj_len))
    print("\tseed: {}".format(args.seed))
    print("\tmirror: {}".format(args.mirror))
    print("\tnum procs: {}".format(args.num_procs))
    print("\tlr: {}".format(args.lr))
    print("\teps: {}".format(args.eps))
    print("\tlam: {}".format(args.lam))
    print("\tgamma: {}".format(args.gamma))
    print("\tentropy coeff: {}".format(args.entropy_coeff))
    print("\tclip: {}".format(args.clip))
    print("\tminibatch size: {}".format(args.minibatch_size))
    print("\tepochs: {}".format(args.epochs))
    print("\tnum steps: {}".format(args.num_steps))
    print("\tuse gae: {}".format(args.use_gae))
    print("\tmax grad norm: {}".format(args.max_grad_norm))
    print()

    algo.train(env_fn, policy, critic, args.n_itr, logger=logger)
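# Illustrative sketch (not the actual get_normalization_params helper used above): the
# input-normalization pre-pass presumably rolls the environment out for roughly
# args.input_norm_steps steps with exploratory actions and records the mean and standard
# deviation of the observations, which are then frozen and shared by the actor and critic.
# Everything below (the random-action rollout, the epsilon term) is an assumption made for
# illustration, not the repo's implementation, which takes the untrained policy as input.
def _estimate_obs_stats_example(env_fn, steps, noise_std=1.0):
    import numpy as np
    env = env_fn()
    obs_log = []
    state = env.reset()
    for _ in range(steps):
        obs_log.append(np.asarray(state, dtype=np.float64))
        # random exploratory action; the real helper perturbs the policy's actions instead
        action = np.random.randn(env.action_space.shape[0]) * noise_std
        state, _, done, _ = env.step(action)
        if done:
            state = env.reset()
    obs = np.stack(obs_log)
    # small epsilon keeps the later division by obs_std well defined
    return obs.mean(axis=0), obs.std(axis=0) + 1e-8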