def aux(nom1, nom2, g, e, sequences, tree):
    if nom1 not in tree.keys():
        seq1 = [sequences[int(nom1)]]
    else:
        g1, g2 = tree[nom1][0], tree[nom1][1]
        seq1 = aux(g1, g2, g, e, sequences, tree)
    if nom2 not in tree.keys():
        seq2 = [sequences[int(nom2)]]
    else:
        g1, g2 = tree[nom2][0], tree[nom2][1]
        seq2 = aux(g1, g2, g, e, sequences, tree)
    traceback = TD3.NW_affine_multi(seq1, seq2, g, e, f)[1]
    l1, l2 = TD3.affiche_multi(seq1, seq2, traceback)
    alignments = l1 + l2
    return alignments
def test(test_epoch):
    env = gym.make('FetchReach-v1')
    # state_dim = env.observation_space.shape[0]
    state_dim = env.observation_space["desired_goal"].shape[0] + \
        env.observation_space["observation"].shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    policy = TD3.TD3(state_dim, action_dim, max_action)
    # loadModel(policy, "models/model_36021.pt")
    filename = 'TD3_FetchReach-v1_311204_500000.0'
    policy.load(filename, './pytorch_models')
    for _ in range(1000):
        tr_obser = env.reset()
        total_reward = 0
        step_count = 0
        while True:
            env.render()
            # Flatten the goal-based dict observation into a single vector
            tr_obser = np.concatenate((tr_obser["observation"], tr_obser["desired_goal"]), axis=0)
            tr_action = policy.select_action(tr_obser)
            tr_obser, tr_reward, is_terminal, _ = env.step(tr_action)
            total_reward += tr_reward
            step_count += 1
            if is_terminal:
                break
    env.close()
    return
def play(env):
    p_td3 = TD3.TD3(env.observation_space, env.action_space, 1)
    p_td3.load("TD3_Tennis_12", directory="pytorch_models")
    p_ddpg = mDDPG.DDPG(env.observation_space, env.action_space, 1)
    p_ddpg.load("mDDPG_Tennis_12", directory="pytorch_models")
    policies = [p_ddpg, p_td3]
    scores = []
    for _ in range(100):
        obs = env.reset(train_mode=True)
        done = False
        episode_score = np.zeros(env.num_agents, dtype=np.float64)
        while not done:
            action = [
                policy.select_action(np.array(observation))
                for policy, observation in zip(policies, obs)
            ]
            obs, reward, done = env.step(action)
            episode_score += reward
            done = np.any(done)
        print("Scored: {:.2f} {:.2f}".format(episode_score[0], episode_score[1]))
        scores.append(episode_score.max())
    scores = np.array(scores)
    print("Mean score {:0.2f} over {}".format(scores.mean(), len(scores)))
def eval(args):
    file_name = f"{args.policy}_{args.domain_name}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.domain_name}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(
        os.path.join(args.log_root, '{}_{}_base'.format(args.domain_name, args.task_name)))
    result_path = safe_path(os.path.join(log_path, 'results'))
    model_path = safe_path(os.path.join(log_path, 'models_bak'))

    env = dmc2gym.make(domain_name=args.domain_name,
                       task_name=args.task_name,
                       seed=0,
                       visualize_reward=False,
                       from_pixels=False,
                       height=256,
                       width=256,
                       frame_skip=args.frame_skip)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(os.path.join(model_path, '{}'.format(policy_file)))

    # Evaluate untrained policy
    eval_policy(policy, env, args.seed)
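# `safe_path` above is referenced but not defined in this snippet. A minimal
# sketch of what it appears to do (create the directory if missing and return
# the path unchanged) -- an assumption, not the original implementation:
import os

def safe_path(path):
    # Create the directory (and any parents) if it does not exist yet.
    os.makedirs(path, exist_ok=True)
    return path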
def main():
    """
    Train the TD3 model.
    :return:
    """
    # 1. Create the environment
    env = gym.make('Pendulum-v0')
    # 2. Fix seeds so experiments are reproducible
    env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    tf.random.set_seed(RANDOMSEED)
    # 3. Train the TD3 agent
    td3 = TD3.TD3(env)
    td3.train()
def run_policy(env_name):
    test_rewards = []
    # NOTE: the env_name argument is immediately shadowed below; the second
    # assignment wins, so this always runs on ContinuousCartPoleEnv.
    env_name = "MountainCarContinuous-v0"
    env_name = "ContinuousCartPoleEnv"
    random_seed = 0
    n_episodes = 10
    lr = 0.002
    max_timesteps = 2000
    render = True
    save_gif = True

    filename = "TD3_{}_{}".format(env_name, random_seed)
    filename += '_solved'
    env = gym.make(env_name)
    directory = "./preTrained/" + str(env_name)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    policy = TD3(lr, state_dim, action_dim, max_action)
    policy.load_actor(directory, filename)

    for ep in range(1, n_episodes + 1):
        ep_reward = 0
        state = env.reset()
        for t in range(max_timesteps):
            action = policy.select_action(state)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
                if save_gif:
                    img = env.render(mode='rgb_array')
                    img = Image.fromarray(img)
                    img.save('./gif/{}.jpg'.format(t))
            if done:
                break
        test_rewards.append(ep_reward)
        print('Evaluation Episode: {}\tEpisode Reward: {}'.format(ep, int(ep_reward)))
        # ep_reward = 0
    env.close()
    plot(test_rewards)
def eval(args):
    file_name = f"{args.policy}_{args.env}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(os.path.join(args.log_root, '{}_3leg'.format(args.env)))
    model_path = safe_path(os.path.join(log_path, 'models'))

    env = gym.make(args.env)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(os.path.join(model_path, '{}'.format(policy_file)))

    # Evaluate the loaded policy
    imgpath = safe_path(os.path.join(model_path, 'eval_imgs'))
    eval_policy(policy, args.env, args.seed, 100, model_path, imgpath)
def aux(nom1, nom2, g, e, sequences, tree):
    if nom1 not in tree.keys():  # nom1 is a leaf
        seq1 = [sequences[int(nom1)]]
        desc1 = [desc[int(nom1)]]
    else:
        g1, g2 = tree[nom1][0], tree[nom1][1]
        seq1, desc1 = aux(g1, g2, g, e, sequences, tree)
    if nom2 not in tree.keys():
        seq2 = [sequences[int(nom2)]]
        desc2 = [desc[int(nom2)]]
    else:
        g1, g2 = tree[nom2][0], tree[nom2][1]
        seq2, desc2 = aux(g1, g2, g, e, sequences, tree)
    traceback = NW_affine_multi_structure(seq1, seq2, g, e, cout_structural, desc)[1]
    l1, l2 = TD3.affiche_multi(seq1, seq2, traceback)
    alignments = l1 + l2
    new_desc = desc1 + desc2
    return alignments, new_desc
def moduleShow(args):
    env = gym.make(args.env_name)
    state_dim = env.observation_space["observation"].shape[0] + \
        env.observation_space["desired_goal"].shape[0]
    # state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action)
    elif args.policy_name == "OurDDPG":
        policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action)

    replay_buffer = utils.ReplayBuffer()

    # Evaluate untrained policy
    evaluations = [evaluate_policy(policy)]

    obs = env.reset()
def play():
    unity = UnityEnvironment(file_name=executable(), no_graphics=False)
    env = UnityWrapper(unity, train_mode=False)
    policy = TD3.TD3(env.observation_space, env.action_space, 1)
    policy.load("TD3_Reacher-v2_3", directory="pytorch_models")
    for _ in range(5):
        obs = env.reset()
        done = False
        episode_score = np.zeros(20, dtype=np.float64)
        while not done:
            action = [
                policy.select_action(np.array(observation)) for observation in obs
            ]
            obs, reward, done, _ = env.step(action)
            episode_score += reward
            done = np.any(done)
        print("Scored: {:.2f}".format(episode_score.mean()))
    unity.close()
env = gym.make(args.env_name)
eval_env = gym.make(args.env_name)

# Set seeds
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy and buffer
policy = TD3.TD3(state_dim, action_dim, max_action, device, K=args.K, rpf=args.rpf)
if args.priority:
    replay_buffer = utils.PriorityReplayBuffer(timesteps=args.max_timesteps,
                                               alpha=args.alpha,
                                               beta=args.beta,
                                               eps=args.eps)
else:
    replay_buffer = utils.ReplayBuffer()

total_timesteps = 0
total_episodes = 0
episode_timesteps = 0
done = True
if args.save_models and not os.path.exists("./pytorch_models"):
    os.makedirs("./pytorch_models")

env = gym.make(args.env_name)

# Set seeds
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action)

replay_buffer = utils.ReplayBuffer()

# Evaluate untrained policy
evaluations = [evaluate_policy(policy)]

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True

while total_timesteps < args.max_timesteps:
# if not os.path.exists("./data"):
#     os.makedirs("./data")

# Set seeds
env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Initialize policy
if args.policy_name == "TD3":
    policy = TD3.TD3(state_dim, action_dim, max_action, args.seed)
elif args.policy_name == "OurDDPG":
    policy = OurDDPG.DDPG(state_dim, action_dim, max_action)
elif args.policy_name == "DDPG":
    policy = DDPG.DDPG(state_dim, action_dim, max_action, args.seed)

# Load model
policy.load(filename, './pytorch_models/')
# policy.load(filename, './pre_models/')

# Start evaluation
_ = evaluate_policy(policy, filename,
                    eval_episodes=args.eval_episodes,
                    visualize=args.visualize)
"action_dim": action_dim, "max_action": max_action, "discount": config['gamma'], "tau": config['tau'], "lr": config['lr'], "hidden_size": config['hidden_size'], 'cuda': config['cuda'], 'parameter_noise_mean': config['param_noise_mean'], 'parameter_noise_std': config['param_noise_std'] } # Target policy smoothing is scaled wrt the action scale kwargs["policy_noise"] = config['policy_noise'] * max_action kwargs["noise_clip"] = config['noise_clip'] * max_action kwargs["policy_freq"] = config['policy_freq'] agent = TD3.TD3(**kwargs) # Memory device = torch.device('cuda:' + str(config['cuda'])) if torch.cuda.is_available( ) and config['cuda'] >= 0 else torch.device('cpu') memory = ReplayMemory(state_dim, action_dim, config['replay_size'], device) # Training Loop total_numsteps = 0 updates = 0 # make model save path from os import path import time current_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
def eval_policy(policy, env_name, seed, eval_episodes=10):
    eval_env = gym.make(env_name)
    eval_env.seed(seed + 100)
    avg_reward = 0.
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            eval_env.render('rgb_array')
            avg_reward += reward
    avg_reward /= eval_episodes
    eval_env.close()  # VVI
    print("---------------------------------------")
    print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
    print("---------------------------------------")
    return avg_reward


env_name = "Pendulum-v0"
env = gym.make(env_name)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

model = TD3.TD3(state_dim, action_dim, max_action)
path = "models/TD3_Pendulum-v0"  # forward slash so the path also works outside Windows
model.load(path)
eval_policy(model, env_name, 0, 1)
    env = dm_control2gym.make(domain_name=domain, task_name=task)
    env_max_steps = 1000
else:
    env = gym.make(args.env_name)
    env_max_steps = env._max_episode_steps

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

if args.policy_name == 'TD3':
    policy = TD3.load('policy', 'results/{}'.format(args.name))
elif args.policy_name == 'EmbeddedTD3':
    policy = EmbeddedTD3.load('policy', 'results/{}'.format(args.name))
elif args.policy_name == 'random':
    if args.decoder:
        decoder = load_decoder(args.env_name, args.decoder)
        policy = RandomEmbeddedPolicy(1, decoder, None)
    elif args.dummy_decoder:
        decoder = DummyDecoder(action_dim, args.dummy_traj_len, env.action_space)
        policy = RandomEmbeddedPolicy(1, decoder, 1)
    else:
        policy = RandomPolicy(env.action_space)
elif args.policy_name == 'constant':
    policy = ConstantPolicy(env.action_space)
else:
def main(args):
    file_name = f"{args.policy}_{args.env}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(os.path.join(args.log_root, '{}_base'.format(args.env)))
    result_path = safe_path(os.path.join(log_path, 'results'))
    model_path = safe_path(os.path.join(log_path, 'models'))

    '''
    ### s2r hacks
    s2r_parser = argparse.ArgumentParser()
    s2r_parser.add_argument("--encoder_type", default="mlp")
    s2r_parser.add_argument("--end_effector", default=True)
    s2r_parser.add_argument("--screen_width", type=int, default=480)
    s2r_parser.add_argument("--screen_height", type=int, default=480)
    s2r_parser.add_argument("--action_repeat", type=int, default=1)
    s2r_parser.add_argument("--puck_friction", type=float, default=2.0)
    s2r_parser.add_argument("--puck_mass", type=float, default=0.01)
    s2r_parser.add_argument("--unity", default=False)
    s2r_parser.add_argument("--unity_editor", default=False)
    s2r_parser.add_argument("--virtual_display", default=None)
    s2r_parser.add_argument("--port", default=1050)
    s2r_parser.add_argument("--absorbing_state", default=False)
    s2r_parser.add_argument("--dr", default=False)
    s2r_parser.add_argument("--env", default=None)
    s2r_args = s2r_parser.parse_args()
    import ipdb; ipdb.set_trace()
    env = make_s2r_env(args.env, s2r_args, env_type="real")
    '''
    env = gym.make(args.env)
    if "SawyerPush" in args.env:
        env = SawyerECWrapper(env, args.env)
        env._max_episode_steps = 70

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    try:
        state_dim = env.observation_space.shape[0]
    except AttributeError:
        state_dim = 16  # env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    success = False
    reach_reward = 0
    push_reward = 0
    cylinder_to_target = 100

    for t in range(int(args.max_timesteps)):
        state = flatten_state(state)
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(state))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, info = env.step(action)
        next_state = flatten_state(next_state)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0
        if "first_success" in info.keys() and info["first_success"]:
            success = True
        # reach_reward += info["reward_reach"]
        # push_reward += info["reward_push"]
        # cylinder_to_target = min(cylinder_to_target, info["cylinder_to_target"])

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            # reach_reward /= episode_timesteps
            # push_reward /= episode_timesteps
            # Reach Reward: {reach_reward:.3f} Push Reward: {push_reward:.3f} cylinder_to_target: {cylinder_to_target:.3f}
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Success: {success}")
            # Reset environment
            success = False
            state, done = env.reset(), False
            episode_reward = 0
            reach_reward, push_reward = 0, 0
            cylinder_to_target = 100
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaluations.append(eval_policy(policy, args.env, args.seed))
            np.save(os.path.join(result_path, '{}'.format(file_name)), evaluations)
            if args.save_model:
                policy.save(os.path.join(model_path, '{}'.format(file_name)))
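# `flatten_state` above is called but not defined in this snippet. A plausible
# sketch, assuming goal-based dict observations should be flattened into a
# single vector and plain arrays passed through -- an assumption, not the
# original helper:
import numpy as np

def flatten_state(state):
    if isinstance(state, dict):
        # Concatenate the raw observation with the desired goal.
        return np.concatenate([np.asarray(state["observation"]).ravel(),
                               np.asarray(state["desired_goal"]).ravel()])
    return np.asarray(state).ravel()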
kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args.discount,
    "tau": float(args.tau),
    "learning_rate": float(args.learning_rate),
    "epsilon": float(epsilon),
}

# Initialize policy
if args.policy == "TD3":
    # Target policy smoothing is scaled wrt the action scale
    kwargs["policy_freq"] = args.policy_freq
    policy = TD3.TD3(**kwargs)
if args.policy == "DDPG":
    policy = DDPG.DDPG(**kwargs)
if args.policy == "newDDPG":
    policy = newDDPG.DDPG(**kwargs)
if args.policy == "newTD3":
    policy = newTD3.TD3(**kwargs)
if args.policy == "A2C":
    policy = A2C.A2C(**kwargs)

if args.load_model != "":
    policy_file = file_name if args.load_model == "default" else args.load_model
    policy.load(f"./models/{policy_file}")

replay_buffer = utils.ReplayBuffer(state_dim, action_dim=action_dim)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--policy_name", default="TD3")                        # Policy name
    parser.add_argument("--env_name", default="Pendulum-v0")                   # OpenAI gym environment name
    parser.add_argument("--replay_buffer", default="prioritized")              # Replay buffer type
    parser.add_argument("--replay_buffer_size", default=5e4, type=int)         # Replay buffer capacity
    parser.add_argument("--replay_buffer_alpha", default=0.6, type=float)      # Replay buffer prioritization weight
    parser.add_argument("--seed", default=0, type=int)                         # Sets Gym, PyTorch and Numpy seeds
    parser.add_argument("--start_timesteps", default=1e4, type=int)            # How many time steps purely random policy is run for
    parser.add_argument("--eval_freq", default=1e3, type=float)                # How often (time steps) we evaluate
    parser.add_argument("--max_timesteps", default=5e4, type=float)            # Max time steps to run environment for
    parser.add_argument("--save_models", default="True", type=bool)            # Whether models are saved (note: type=bool treats any non-empty string as True)
    parser.add_argument("--expl_noise", default=0.1, type=float)               # Std of Gaussian exploration noise
    parser.add_argument("--batch_size", default=100, type=int)                 # Batch size for both actor and critic
    parser.add_argument("--discount", default=0.99, type=float)                # Discount factor
    parser.add_argument("--tau", default=0.005, type=float)                    # Target network update rate
    parser.add_argument("--policy_noise", default=0.2, type=float)             # Noise added to target policy during critic update
    parser.add_argument("--noise_clip", default=0.5, type=float)               # Range to clip target policy noise
    parser.add_argument("--policy_freq", default=2, type=int)                  # Frequency of delayed policy updates
    parser.add_argument("--lr_actor", default=0.001, type=float)               # Learning rate of actor
    parser.add_argument("--lr_critic", default=0.001, type=float)              # Learning rate of critic
    parser.add_argument("--prioritized_replay_eps", default=1e-3, type=float)  # Replay buffer epsilon (PER)
    parser.add_argument("--prioritized_replay_beta0", default=0.4, type=float) # Replay buffer initial beta (PER)
    args = parser.parse_args()

    # Training kwargs
    kwargs = {
        "policy_name": args.policy_name,
        "env_name": args.env_name,
        "replay_buffer": args.replay_buffer,
        "replay_buffer_size": args.replay_buffer_size,
        "replay_buffer_alpha": args.replay_buffer_alpha,
        "seed": args.seed,
        "start_timesteps": args.start_timesteps,
        "eval_freq": args.eval_freq,
        "max_timesteps": args.max_timesteps,
        "save_models": args.save_models,
        "expl_noise": args.expl_noise,
        "batch_size": args.batch_size,
        "discount": args.discount,
        "tau": args.tau,
        "policy_noise": args.policy_noise,
        "noise_clip": args.noise_clip,
        "policy_freq": args.policy_freq,
        "lr_actor": args.lr_actor,
        "prioritized_replay_eps": args.prioritized_replay_eps,
        "prioritized_replay_beta0": args.prioritized_replay_beta0
    }

    # clear the terminal
    os.system('cls' if os.name == 'nt' else 'clear')

    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_models and not os.path.exists("./pytorch_models"):
        os.makedirs("./pytorch_models")

    # Time stamp for repeated test names
    ts = time.time()
    ts = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H-%M-%S')
    test_name = "%s_%s_%s_%s" % (args.policy_name, args.env_name, str(args.seed), ts)
    plot_name = "%s_%s_%s_%s_plot.png" % (args.policy_name, args.env_name, str(args.seed), ts)
    kwargs_name = "%s_%s_%s_%s_kwargs.csv" % (args.policy_name, args.env_name, str(args.seed), ts)
    scores_name = "%s_%s_%s_%s_scores.csv" % (args.policy_name, args.env_name, str(args.seed), ts)

    print("---------------------------------------")
    print("Settings: %s" % (test_name))
    utils.save_kwargs(kwargs, "./results/%s" % (kwargs_name))
    print("---------------------------------------")

    # Environment and agent instantiation
    env = gym.make(args.env_name)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Instantiate replay buffer
    if args.replay_buffer == "vanilla":
        replay_buffer = rb.ReplayBuffer(size=args.replay_buffer_size)
        PER = False
    elif args.replay_buffer == "prioritized":
        replay_buffer = rb.PrioritizedReplayBuffer(
            size=int(np.round(np.sqrt(args.replay_buffer_size))),
            alpha=args.replay_buffer_alpha)
        PER = True
        prioritized_replay_beta_iters = args.max_timesteps
        prioritized_replay_beta0 = args.prioritized_replay_beta0
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)

    # Instantiate policy
    if args.policy_name == "TD3":
        policy = TD3.TD3(state_dim, action_dim, max_action, args.lr_actor,
                         args.lr_critic, PER, args.prioritized_replay_eps)
    elif args.policy_name == "DDPG":
        policy = DDPG.DDPG(state_dim, action_dim, max_action, args.lr_actor,
                           args.lr_critic, PER, args.prioritized_replay_eps)

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    # Training loop #######################################
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_rewards = []
    done = True

    while total_timesteps < args.max_timesteps:
        if done:
            if total_timesteps != 0:
                print('Total T: {} Episode Num: {} Episode T: {} Reward: {}'.format(
                    total_timesteps, episode_num, episode_timesteps, episode_reward))
                episode_rewards.append(episode_reward)

                # PER beta scheduled update
                if PER:
                    beta = beta_schedule.value(total_timesteps)
                else:
                    beta = 0.

                # Policy update step
                if args.policy_name == "TD3":
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, args.policy_noise,
                                 args.noise_clip, args.policy_freq, beta)
                else:
                    policy.train(replay_buffer, episode_timesteps, args.batch_size,
                                 args.discount, args.tau, beta)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(evaluate_policy(env, policy))
                # save evaluation
                # if args.save_models:
                policy.save(test_name, directory="./pytorch_models")
                # np.save("./results/%s" % (test_name), evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.select_action(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)
        done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        episode_reward += reward

        # Push experience into replay buffer
        experience = (obs, action, reward, new_obs, done_bool)
        replay_buffer.add(experience)

        obs = new_obs
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    evaluations.append(evaluate_policy(env, policy))

    # Save results
    if args.save_models:
        policy.save("%s" % (test_name), directory="./pytorch_models")
    # np.save("./results/%s" % (evaluations_file), evaluations)
    # np.save("./results/%s" % ('rewards.txt'), episode_rewards)
    utils.save_scores(episode_rewards, "./results/%s" % (scores_name))
    utils.plot(episode_rewards, "./results/%s" % (plot_name), 1)
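# `LinearSchedule` above matches the helper of the same name from OpenAI
# Baselines: linear interpolation from initial_p to final_p over
# schedule_timesteps, clamped at final_p afterwards. A minimal self-contained
# sketch of that behavior, in case the import is not available:
class LinearSchedule:
    def __init__(self, schedule_timesteps, initial_p=1.0, final_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.initial_p = initial_p
        self.final_p = final_p

    def value(self, t):
        # Fraction of the schedule completed, capped at 1.
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)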
def load_policy(load_from):
    # Initialize policy
    start_step = 0
    if args.policy == "TD3":
        import TD3
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * kwargs['max_action']
        kwargs["noise_clip"] = args.noise_clip * kwargs['max_action']
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        import OurDDPG
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        import DDPG
        policy = DDPG.DDPG(**kwargs)

    # create experiment directory (may not be used)
    exp_cnt = 0
    load_model_path = ''
    results_dir = os.path.join(args.savedir, args.exp_name + '%02d' % exp_cnt)
    while os.path.exists(results_dir):
        exp_cnt += 1
        results_dir = os.path.join(args.savedir, args.exp_name + '%02d' % exp_cnt)

    # load model if necessary
    if load_from != "":
        if os.path.isdir(load_from):
            print("loading latest model from dir: {}".format(load_from))
            # find last file
            search_path = os.path.join(load_from, '*.pt')
            model_files = glob(search_path)
            if not len(model_files):
                raise FileNotFoundError(
                    'could not find model exp files at {}'.format(search_path))
            else:
                load_model_path = sorted(model_files)[-1]
        else:
            load_model_path = load_from
        print("loading model from file: {}".format(load_model_path))
        policy.load(load_model_path)
        # TODO
        # utils.load_info_dict(load_model_base)
        try:
            start_step = int(load_model_path[-13:-3])
        except ValueError:
            try:
                start_step = policy.step
            except AttributeError:
                print('unable to get start step from name - set it manually')
        # store in old dir
        if not args.continue_in_new_dir:
            results_dir = os.path.split(load_model_path)[0]
            print("continuing in loaded directory")
            print(results_dir)
        else:
            print("resuming in new directory")
            print(results_dir)
    else:
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
    print('storing results in: {}'.format(results_dir))
    return policy, start_step, results_dir, load_model_path
def f(a, b):
    return TD3.cout_blosum(a, b, g=g)
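# `f` closes over a gap penalty `g` from the enclosing scope and is the
# substitution-cost function handed to TD3.NW_affine_multi in aux() above.
# Hypothetical usage, assuming g has already been set:
# g = 10
# cost = f('A', 'R')  # BLOSUM-based cost of aligning residues A and R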
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {"state_dim": state_dim,
              "action_dim": action_dim,
              "max_action": max_action,
              "discount": variant['discount'],
              "tau": variant['tau'],
              'network_class': NETWORK_CLASSES[variant['network_class']]}

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                  hidden_dim=variant['hidden_dim'],
                                  fourier_dim=variant['fourier_dim'],
                                  sigma=variant['sigma'],
                                  concatenate_fourier=variant['concatenate_fourier'],
                                  train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])

    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise'] * max_action
        kwargs["noise_clip"] = variant['noise_clip'] * max_action
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']  # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [float(env.action_space.low.min()),
                                  float(env.action_space.high.max())]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        raise RuntimeError

    # load replay buffer
    replay_buffer = torch.load(os.path.join(variant['replay_buffer_folder'],
                                            'generated_replay_buffer.pt'))

    policy_optimizer = torch.optim.Adam(policy.actor.parameters(), lr=variant['lr'])
    qf_optimizer = torch.optim.Adam(policy.critic.Q1.parameters(), lr=variant['lr'])

    # split into train and val for both action and q_value
    indices = np.arange(replay_buffer.max_size)
    random.shuffle(indices)
    train_indices = indices[:int(0.9 * len(indices))]
    val_indices = indices[int(0.9 * len(indices)):]
    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(replay_buffer.state[train_indices]).float(),
        torch.tensor(replay_buffer.action[train_indices]).float(),
        torch.tensor(replay_buffer.correct_action[train_indices]).float(),
        torch.tensor(replay_buffer.q_value[train_indices]).float())
    val_dataset = torch.utils.data.TensorDataset(
        torch.tensor(replay_buffer.state[val_indices]).float(),
        torch.tensor(replay_buffer.action[val_indices]).float(),
        torch.tensor(replay_buffer.correct_action[val_indices]).float(),
        torch.tensor(replay_buffer.q_value[val_indices]).float())

    # train a network on it
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=variant['batch_size'],
                                               shuffle=True, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=variant['batch_size'],
                                             shuffle=True, pin_memory=True)

    train_q_losses = []
    train_policy_losses = []
    val_q_losses = []
    val_policy_losses = []
    for _ in trange(variant['n_train_epochs']):
        total_q_loss = 0
        total_policy_loss = 0
        for (state, action, correct_action, q) in train_loader:
            state = state.to(DEVICE)
            action = action.to(DEVICE)
            correct_action = correct_action.to(DEVICE)
            q = q.to(DEVICE)
            q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
            policy_preds = policy.actor(state).mean
            q_loss = F.mse_loss(q_preds, q)
            policy_loss = F.mse_loss(policy_preds, correct_action)
            qf_optimizer.zero_grad()
            policy_optimizer.zero_grad()
            q_loss.backward()
            policy_loss.backward()
            qf_optimizer.step()
            policy_optimizer.step()
            total_q_loss += q_loss.item()
            total_policy_loss += policy_loss.item()

        # get validation stats
        total_val_q_loss = 0
        total_val_policy_loss = 0
        with torch.no_grad():
            for (state, action, correct_action, q) in val_loader:
                state = state.to(DEVICE)
                action = action.to(DEVICE)
                correct_action = correct_action.to(DEVICE)
                q = q.to(DEVICE)
                q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
                policy_preds = policy.actor(state).mean
                q_loss = F.mse_loss(q_preds, q)
                policy_loss = F.mse_loss(policy_preds, correct_action)
                total_val_q_loss += q_loss.item()
                total_val_policy_loss += policy_loss.item()

        train_q_losses.append(total_q_loss / len(train_loader))
        train_policy_losses.append(total_policy_loss / len(train_loader))
        val_q_losses.append(total_val_q_loss / len(val_loader))
        val_policy_losses.append(total_val_policy_loss / len(val_loader))
        print(f'train: qf loss: {train_q_losses[-1]:.4f}, policy loss: {train_policy_losses[-1]:.4f}')
        print(f'val: qf loss: {val_q_losses[-1]:.4f}, policy loss: {val_policy_losses[-1]:.4f}')

    # evaluate the resulting policy
    eval_return = eval_policy(policy, variant['env'], variant['seed'],
                              eval_episodes=variant['eval_episodes'])

    # save the results
    to_save = dict(
        train_q_losses=train_q_losses,
        train_policy_losses=train_policy_losses,
        val_q_losses=val_q_losses,
        val_policy_losses=val_policy_losses,
        eval_return=eval_return,
        qf=policy.critic.Q1.state_dict(),
        policy=policy.actor.state_dict()
    )
    torch.save(to_save, os.path.join(variant['replay_buffer_folder'],
                                     f'{variant["network_class"]}_distillation.pt'))
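# Reading the artifact saved above is symmetric; a hypothetical usage sketch
# (same variant dict assumed):
# saved = torch.load(os.path.join(variant['replay_buffer_folder'],
#                                 f"{variant['network_class']}_distillation.pt"))
# print(saved['eval_return'], saved['train_q_losses'][-1])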
def __init__(self):
    self.computer = 'kelsey'
    super().__init__('walk_node_{}'.format(self.computer))  # Remember to use ros_bridge
    self.pkl_folder = 'pkl_walk_vanilla'
    self.pause_on_nn = False
    self.score = 0
    self.reward = 0.0
    self.distance_new = 0.0
    self.distance_old = 0.0
    self.reward_total = 0.0
    self.joint_states_init = [
        0.0 - hip_x_min_temp, 0.0 - hip_y_min_temp, 0.0 - knee_min_temp,
        0.0 - ankle_y_min_temp, 0.0 - ankle_x_min_temp, 0.0 - hip_x_min_temp,
        0.0 - hip_y_min_temp, 0.0 - knee_min_temp, 0.0 - ankle_y_min_temp,
        0.0 - ankle_x_min_temp, 0.0 - hip_z_min_temp, 0.0 - hip_z_min_temp
    ]  # , [0.0, 0.0, 0.0, 0.0]]#, [0.0], [0.0], [0.0, 0.0, 0.0]]
    self.joint_states = self.joint_states_init
    self.true_joint_states_init = [
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 0.0, 0.0, 0.0
    ]
    self.true_joint_states = self.true_joint_states_init
    self.score_hist = []
    self.distance_hist_long = []
    self.fallen_status = 0
    self.vel_init = [0, 0, 0]
    self.positions_init = [0, 0.80]
    self.rpy_init = [0., 0., math.pi]
    self.rpy_vel_init = [0, 0, 0]
    self.y_pos_init = [0.0]
    self.s = self.s1 = classes.State(self.joint_states, self.rpy_init,
                                     self.rpy_vel_init, self.positions_init,
                                     self.vel_init, self.true_joint_states,
                                     self.y_pos_init)
    self.gamma = 0.99  # discount for future rewards
    self.batch_size = 128  # num of transitions sampled from replay buffer
    self.num_actions = 12
    self.action_init = [
        (hip_x_min_temp / (hip_x_min_temp - hip_x_max_temp)),
        (hip_y_min_temp / (hip_y_min_temp - hip_y_max_temp)),
        (knee_min_temp / (knee_min_temp - knee_max_temp)),
        (ankle_y_min_temp / (ankle_y_min_temp - ankle_y_max_temp)),
        (ankle_x_min_temp / (ankle_x_min_temp - ankle_x_max_temp)),
        (hip_x_min_temp / (hip_x_min_temp - hip_x_max_temp)),
        (hip_y_min_temp / (hip_y_min_temp - hip_y_max_temp)),
        (knee_min_temp / (knee_min_temp - knee_max_temp)),
        (ankle_y_min_temp / (ankle_y_min_temp - ankle_y_max_temp)),
        (ankle_x_min_temp / (ankle_x_min_temp - ankle_x_max_temp)),
        (hip_z_min_temp / (hip_z_min_temp - hip_z_max_temp)),
        (hip_z_min_temp / (hip_z_min_temp - hip_z_max_temp))
    ]
    self.action_init = np.array([(x * 2) - 1 for x in self.action_init])
    self.exploration_noise_init = 0.08  # .10, 0.05
    self.exploration_noise = self.exploration_noise_init
    self.polyak = 0.995  # target policy update parameter (1 - tau)
    self.policy_noise = 0.12  # .20, .10 # target policy smoothing noise
    self.noise_clip = 0.5
    self.policy_delay = 2  # delayed policy updates parameter
    self.testing = False
    if self.testing:
        self.policy_noise = 0.0
        self.exploration_noise_init = 0.0
        self.exploration_noise = 0.0
    # minimax at 1165000; lowered effort to 3.92 and renamed i to 1000 at 1419000; instant at 8000
    # policy .05, exp .04 @ 137000; reverted noise to policy .12 and exp .08 @ 200000
    self.last_saved_index = 150000
    self.distance_hist = []
    if self.last_saved_index > 0:
        self.read_pkl = True
    else:
        self.read_pkl = False
    self.i = self.last_saved_index
    v = True
    self.j = 0
    lr = .0001
    self.num_states = 22
    remove_states = []
    load_weights = True
    read_replay_buffer = True
    add_num_states = 0
    add_actions = 0
    layer_height = 250

    if self.read_pkl:
        # agent = NewAgent.load_model('./pkl/agent_{}.pkl'.format(i))
        if load_weights:
            print('reading weights...')
            self.agent = TD3.TD3(lr=lr,
                                 state_dim=self.num_states + add_num_states - len(remove_states),
                                 action_dim=self.num_actions + add_actions,
                                 max_action=1.0,
                                 layer_height=layer_height)
            self.agent.load('./{}'.format(self.pkl_folder), self.i,
                            additional_dims=add_num_states,
                            additional_actions=add_actions,
                            remove_dimensions_=remove_states)
            self.num_actions = self.num_actions + add_actions
            # print('STATES:{}'.format(agent.))
            # agent.state_dim += add_num_states
            # if add_state > 0:
        else:
            print('WARNING: LOADING FULL AGENT')
            self.agent = TD3.TD3.load_model('./{}/agent_{}.pkl'.format(
                self.pkl_folder, self.i))
        self.agent.use_scheduler = False
        if read_replay_buffer:
            print('reading replay buffer...')
            self.replay_buffer = pickle.load(
                open('./{}/replay_{}.pkl'.format(self.pkl_folder, self.i), 'rb'))
        else:
            self.replay_buffer = ReplayBuffer()
    else:
        print('creating agent')
        # agent = NewAgent(alpha=0.000005, beta=0.00001, input_dims=[3], gamma=1.01,
        #                  layer1_size=30, layer2_size=30, n_outputs=1, n_actions=26)  # 26=13*2
        # agent = Agent(alpha=0.000025, beta=0.00025, input_dims=[19], tau=0.001,
        #               env='dummy', sigma=.5, batch_size=100, layer1_size=200,
        #               layer2_size=250, n_actions=12, max_size=100000)
        self.agent = TD3.TD3(lr=lr,
                             state_dim=self.num_states,
                             action_dim=self.num_actions,
                             max_action=1.0,
                             layer_height=layer_height)
        self.replay_buffer = ReplayBuffer()

    self.state_sub = self.create_subscription(
        State, '/tori_state_{}'.format(self.computer),
        self.state_callback)  # listens for state updates
    self.reward_sub = self.create_subscription(
        State, '/tori_state_{}'.format(self.computer),
        self.reward_callback)
    self.replay_sub = self.create_subscription(
        Replay, '/replay', self.replay_callback)  # listens for replay messages
    self.replay_pub = self.create_publisher(
        Replay, '/replay', qos_profile=1)  # publishes replay to this/other computers
    self.joint_angles_pub = self.create_publisher(
        ToriJointAngles,
        '/tori_joint_command_{}'.format(self.computer))  # tells control_motion the desired joint positions
    self.checkpoint_sub = self.create_subscription(
        Float64, '/checkpoint_{}'.format(self.computer),
        self.checkpoint_callback)
    self.checkpoint_pub = self.create_publisher(
        Float64, '/checkpoint_{}'.format(self.computer),
        qos_profile=0)  # start training
    self.state_pub = self.create_publisher(
        State, '/tori_state_{}'.format(self.computer), qos_profile=1)

    state = State()
    state.fallen_status = float(self.fallen_status)
    state.orientation = self.rpy_init
    state.pos = [0., 0., 0.80]  # TODO: get position_y_spine, not necessarily minimin
    state.distance_minimum = -.02  # TODO: check this number
    state.rpy_vel = [0., 0., 0.]
    state.vel = [0., 0., 0.]
    state.sim_time = 0.0
    self.state_pub.publish(state)
    print('published!')
def train(config, args):
    if not os.path.exists("./results"):
        os.makedirs("./results")
    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    import pybulletgym
    warnings.filterwarnings("ignore")

    eps_bounds = args.reacher_epsilon_bounds  # just aliasing with a shorter variable name
    utils_object = utils.GeneralUtils(args)

    if args.tune_run:
        if args.prioritized_replay:
            args.alpha = float(config["alpha"])
            args.beta = float(config["beta"])
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))
        elif args.custom_env and args.use_hindsight:
            eps_bounds = [float(config["epsilons"][0]), float(config["epsilons"][1])]
            args.seed = int(config["seed"])
        else:
            args.discount = float(config.get("discount", args.discount))
            args.tau = float(config.get("tau", args.tau))

    if args.custom_env:
        gym.envs.register(
            id='OurReacher-v0',
            entry_point='our_reacher_env:OurReacherEnv',
            max_episode_steps=50,
            reward_threshold=100.0,
        )
        # this assumes epsilon is only used for the custom env or fetch reach,
        # where the episode is 50 timesteps
        max_episode_steps = 50
        # retrieve epsilon range [a, b]
        [a, b] = eps_bounds
        epsilons = utils_object.epsilon_calc(a, b, max_episode_steps)
        env = gym.make('OurReacher-v0', epsilon=epsilons[0], render=False)
    else:
        env = gym.make(args.env)

    if utils_object.fetch_reach and utils_object.args.fetch_reach_dense:
        env.env.reward_type = "dense"

    # Set seeds
    env.seed(int(args.seed))
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    if utils_object.fetch_reach:
        state_dim = env.reset()["observation"].shape[0]
    else:
        state_dim = env.observation_space.shape[0]
    if args.use_hindsight:
        # include both current state and goal state
        if args.custom_env:
            state_dim += 2  # reacher nonsense; goal = (x, y)
        elif utils_object.fetch_reach:
            state_dim += 3  # include fetchreach goal state (x, y, z position)
        else:
            state_dim *= 2
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        kwargs["prioritized_replay"] = args.prioritized_replay
        kwargs["use_rank"] = args.use_rank
        kwargs["use_hindsight"] = args.use_hindsight
        policy = TD3.TD3(**kwargs)
    elif args.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    exp_descriptors = [
        args.policy,
        'CustomReacher' if args.custom_env else args.env,
        f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '',
        'HER' if args.use_hindsight else '',
        f"{args.decay_type}decay-eps{f'{eps_bounds[0]}-{eps_bounds[1]}' if eps_bounds[0] != eps_bounds[1] else f'{eps_bounds[0]}'}" if args.custom_env else "",
        f"k{args.k}",
        datetime.now().strftime('%Y%m%d%H%M')
    ]
    if args.tune_run:
        # fudgy: assumes tune_run for non-HER experiments
        exp_descriptors = [
            args.policy,
            'CustomReacher' if args.custom_env else args.env,
            f"{'rank' if args.use_rank else 'proportional'}PER" if args.prioritized_replay else '',
            f"tau{args.tau}",
            f"discount{args.discount}",
            f"alpha{args.alpha}" if args.prioritized_replay else '',
            f"beta{args.beta}" if args.prioritized_replay else '',
            f"k{args.k}",
            datetime.now().strftime('%Y%m%d%H%M')
        ]
    exp_descriptors = [x for x in exp_descriptors if len(x) > 0]
    file_name = "_".join(exp_descriptors)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./models/{policy_file}")

    if args.prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      args.max_timesteps,
                                                      args.start_timesteps,
                                                      alpha=args.alpha,
                                                      beta=args.beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, args.env, args.seed, utils_object=utils_object)]

    state, done = env.reset(), False
    original_episode_reward = 0
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0
    trajectory = []

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1

        x, goal = utils_object.compute_x_goal(state, env)

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(x))
                + np.random.normal(0, max_action * args.expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        if args.use_hindsight:
            if utils_object.fetch_reach:
                goal = state["desired_goal"]
                next_x = np.concatenate([np.array(next_state["observation"]), goal])
            else:
                # env.set_goal(goal)
                next_x = np.concatenate([np.array(next_state), goal])
        elif utils_object.fetch_reach:
            next_x = np.array(next_state["observation"])
        else:
            next_x = next_state

        # Store data in replay buffer
        if not args.use_hindsight:
            replay_buffer.add(x, action, next_x, reward, done_bool)
        trajectory.append((state, action, next_state, reward, done_bool))

        state = next_state
        episode_reward += reward
        if args.custom_env:
            original_episode_reward += env.original_rewards

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            if args.use_hindsight:
                replay_buffer.add_hindsight(trajectory, goal, env, k=args.k,
                                            fetch_reach=utils_object.fetch_reach)
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f} Original Reward: {original_episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            original_episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
            if args.custom_env:
                epsilon = epsilons[episode_num]
                env.set_epsilon(epsilon)
            trajectory = []

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaled_policy = eval_policy(policy, args.env, args.seed, utils_object=utils_object)
            evaluations.append(evaled_policy)
            np.save(f"./results/{file_name}", evaluations)
            if args.save_model:
                policy.save(f"./models/{file_name}")
            if args.plot:
                plotter.plot(file_name, args.custom_env)
            if args.tune_run:
                tune.report(episode_reward_mean=evaled_policy[0])
def train(config, start_timesteps, max_timesteps, policy_noise, expl_noise,
          noise_clip, policy_freq, batch_size, seed, policy, prioritized_replay,
          env_name, eval_freq, discount, tau, use_rank):
    if prioritized_replay:
        alpha = float(config["alpha"])
        beta = float(config["beta"])
    else:
        discount = float(config["discount"])
        tau = float(config["tau"])

    import pybulletgym
    warnings.filterwarnings("ignore")

    env = gym.make(env_name)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": discount,
        "tau": tau,
    }

    # Initialize policy
    if policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = policy_noise * max_action
        kwargs["noise_clip"] = noise_clip * max_action
        kwargs["policy_freq"] = policy_freq
        kwargs["prioritized_replay"] = prioritized_replay
        kwargs["use_rank"] = use_rank
        policy = TD3.TD3(**kwargs)
    elif policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if prioritized_replay:
        replay_buffer = utils.PrioritizedReplayBuffer(state_dim, action_dim,
                                                      max_timesteps, start_timesteps,
                                                      alpha=alpha, beta=beta)
    else:
        replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * expl_noise, size=action_dim)).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t+1} Episode Num: {episode_num+1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            avg_reward = eval_policy(policy, env_name, seed)
            tune.report(episode_reward_mean=avg_reward)
            evaluations.append(avg_reward)
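# A hedged sketch of how a trainable like train() above is typically launched
# with Ray Tune's function API; the search space and the fixed arguments below
# are illustrative assumptions, not values from this codebase:
# from ray import tune
# tune.run(
#     tune.with_parameters(
#         train, start_timesteps=25000, max_timesteps=1000000,
#         policy_noise=0.2, expl_noise=0.1, noise_clip=0.5, policy_freq=2,
#         batch_size=256, seed=0, policy="TD3", prioritized_replay=True,
#         env_name="HalfCheetah-v2", eval_freq=5000, discount=0.99, tau=0.005,
#         use_rank=False),
#     config={"alpha": tune.uniform(0.4, 0.8), "beta": tune.uniform(0.4, 1.0)},
# )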
def generate_video(args):
    total_time = args.video_length * 100
    exp_path = os.path.join(DATA_DIR, "EXP_{:04d}".format(args.expID))
    if not os.path.exists(exp_path):
        raise FileNotFoundError('checkpoint does not exist')
    print('*** folder fetched: {} ***'.format(exp_path))
    os.makedirs(VIDEO_DIR, exist_ok=True)

    # Retrieve MuJoCo XML files for visualizing ========================================
    env_names = []
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        for morphology in args.morphologies:
            env_names += [
                name[:-4] for name in os.listdir(XML_DIR)
                if '.xml' in name and morphology in name
            ]
        for name in env_names:
            args.graphs[name] = utils.getGraphStructure(
                os.path.join(XML_DIR, '{}.xml'.format(name)))
    # custom envs
    else:
        if os.path.isfile(args.custom_xml):
            assert '.xml' in os.path.basename(args.custom_xml), 'No XML file found.'
            name = os.path.basename(args.custom_xml)
            env_names.append(name[:-4])  # truncate the .xml suffix
            args.graphs[name[:-4]] = utils.getGraphStructure(args.custom_xml)
        elif os.path.isdir(args.custom_xml):
            for name in os.listdir(args.custom_xml):
                if '.xml' in name:
                    env_names.append(name[:-4])
                    args.graphs[name[:-4]] = utils.getGraphStructure(
                        os.path.join(args.custom_xml, name))
    env_names.sort()

    # Set up env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        env_names, args.max_episode_steps, args.custom_xml)
    # determine the maximum number of children in all the envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(env_names, args.graphs)
    # setup agent policy
    policy = TD3.TD3(args)
    try:
        cp.load_model_only(exp_path, policy)
    except Exception:
        raise Exception(
            'policy loading failed; check the policy params '
            '(hint 1: max_children must match the trained policy; '
            'hint 2: if the trained policy used torchfold, consider passing --disable_fold)')

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # visualize ===========================================================
    for env_name in env_names:
        # create env
        env = utils.makeEnvWrapper(env_name, seed=args.seed, obs_max_len=None)()
        policy.change_morphology(args.graphs[env_name])

        # create unique temp frame dir
        count = 0
        frame_dir = os.path.join(
            VIDEO_DIR, "frames_{}_{}_{}".format(args.expID, env_name, count))
        while os.path.exists(frame_dir):
            count += 1
            frame_dir = "{}/frames_{}_{}_{}".format(VIDEO_DIR, args.expID, env_name, count)
        os.makedirs(frame_dir)

        # create video name without overwriting previously generated videos
        count = 0
        video_name = "%04d_%s_%d" % (args.expID, env_name, count)
        while os.path.exists("{}/{}.mp4".format(VIDEO_DIR, video_name)):
            count += 1
            video_name = "%04d_%s_%d" % (args.expID, env_name, count)

        # init env vars
        done = True
        print("-" * 50)
        time_step_counter = 0
        printProgressBar(0, total_time)

        while time_step_counter < total_time:
            printProgressBar(time_step_counter + 1, total_time, prefix=env_name)
            if done:
                obs = env.reset()
                done = False
                episode_reward = 0
            action = policy.select_action(np.array(obs))
            # perform action in the environment
            new_obs, reward, done, _ = env.step(action)
            episode_reward += reward
            # draw image of current frame
            image_data = env.sim.render(VIDEO_RESOLUATION[0],
                                        VIDEO_RESOLUATION[1],
                                        camera_name="track")
            img = Image.fromarray(image_data, "RGB")
            draw = ImageDraw.Draw(img)
            font = ImageFont.truetype('./misc/sans-serif.ttf', 24)
            draw.text((200, 10), "Instant Reward: " + str(reward), (255, 0, 0), font=font)
            draw.text((200, 35), "Episode Reward: " + str(episode_reward), (255, 0, 0), font=font)
            img.save(os.path.join(frame_dir, "frame-%.10d.png" % time_step_counter))
            obs = new_obs
            time_step_counter += 1

        # redirect output so it does not show on the window
        FNULL = open(os.devnull, 'w')
        # create video
        subprocess.call([
            'ffmpeg', '-framerate', '50', '-y', '-i',
            os.path.join(frame_dir, 'frame-%010d.png'), '-r', '30', '-pix_fmt',
            'yuv420p', os.path.join(VIDEO_DIR, '{}.mp4'.format(video_name))
        ], stdout=FNULL, stderr=subprocess.STDOUT)
        subprocess.call(['rm', '-rf', frame_dir])
os.makedirs("./pytorch_models") env = gym.make(args.env_name) # Set seeds env.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy if args.policy_name == "TD3": policy = TD3.TD3(state_dim, action_dim, max_action) elif args.policy_name == "BNNTD3": policy = BNNTD3.TD3(state_dim, action_dim, max_action) elif args.policy_name == "BootstrapTD3": if args.actor_branches > 0: actor_branches = args.actor_branches else: actor_branches = args.branches policy = BootstrapTD3.TD3(state_dim, action_dim, max_action, args.branches, actor_branches) elif args.policy_name == "OurDDPG": policy = OurDDPG.DDPG(state_dim, action_dim, max_action) elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action) replay_buffer = utils.ReplayBuffer()
def main(args):
    file_name = f"{args.policy}_{args.env}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.env}, Seed: {args.seed}")
    print("---------------------------------------")

    log_path = safe_path(os.path.join(args.log_root, '{}_base'.format(args.env)))
    result_path = safe_path(os.path.join(log_path, 'results'))
    model_path = safe_path(os.path.join(log_path, 'models'))

    env = gym.make(args.env)

    # Set seeds
    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise * max_action
        kwargs["noise_clip"] = args.noise_clip * max_action
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)

    replay_buffer = utils.ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env, args.seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = (policy.select_action(np.array(state)) + np.random.normal(
                0, max_action * args.expl_noise, size=action_dim)).clip(
                    -max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            policy.train(replay_buffer, args.batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaluations.append(eval_policy(policy, env, args.seed))
            np.save(os.path.join(result_path, '{}'.format(file_name)), evaluations)
            if args.save_model:
                policy.save(os.path.join(model_path, '{}'.format(file_name)))
print "---------------------------------------" print "Settings: %s" % (file_name) print "---------------------------------------" env = gym.make(args.env_name) # Set seeds env.seed(args.seed) torch.manual_seed(args.seed) np.random.seed(args.seed) state_dim = env.observation_space.shape[0] action_dim = env.action_space.shape[0] max_action = float(env.action_space.high[0]) # Initialize policy if args.policy_name == "TD3": policy = \ TD3.TD3(state_dim, action_dim, max_action, actor_lr=args.actor_lr, is_ro=args.is_ro) elif args.policy_name == "OurDDPG": policy = \ OurDDPG.DDPG(state_dim, action_dim, max_action, actor_lr=args.actor_lr, is_ro=args.is_ro) elif args.policy_name == "DDPG": policy = DDPG.DDPG(state_dim, action_dim, max_action) policy.load( "%s_%s_%s.pth" % (args.policy_name, args.env_name, str(args.seed)), "pytorch_models") evaluate_policy(policy, args.eval_episodes)
def train(args):
    # Set up directories ===========================================================
    os.makedirs(DATA_DIR, exist_ok=True)
    os.makedirs(BUFFER_DIR, exist_ok=True)
    exp_name = "EXP_%04d" % (args.expID)
    exp_path = os.path.join(DATA_DIR, exp_name)
    rb_path = os.path.join(BUFFER_DIR, exp_name)
    os.makedirs(exp_path, exist_ok=True)
    os.makedirs(rb_path, exist_ok=True)
    # save arguments
    with open(os.path.join(exp_path, 'args.txt'), 'w+') as f:
        json.dump(args.__dict__, f, indent=2)

    # Retrieve MuJoCo XML files for training ========================================
    agent_name = args.agent_name
    envs_train_names = [agent_name]
    args.graphs = dict()
    # existing envs
    if not args.custom_xml:
        args.graphs[agent_name] = utils.getGraphStructure(
            os.path.join(XML_DIR, '{}.xml'.format(agent_name)))
    # custom envs
    num_envs_train = len(envs_train_names)
    print("#" * 50 + '\ntraining envs: {}\n'.format(envs_train_names) + "#" * 50)

    # Set up training env and policy ================================================
    args.limb_obs_size, args.max_action = utils.registerEnvs(
        envs_train_names, args.max_episode_steps, args.custom_xml)
    max_num_limbs = max(
        [len(args.graphs[env_name]) for env_name in envs_train_names])
    # create vectorized training env
    obs_max_len = max(
        [len(args.graphs[env_name])
         for env_name in envs_train_names]) * args.limb_obs_size
    envs_train = [
        utils.makeEnvWrapper(name, obs_max_len, args.seed)
        for name in envs_train_names
    ]
    # envs_train = SubprocVecEnv(envs_train)  # vectorized env
    # set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # determine the maximum number of children across all the training envs
    if args.max_children is None:
        args.max_children = utils.findMaxChildren(envs_train_names, args.graphs)
    # set up agent policy
    policy = TD3.LifeLongTD3(args)

    # Create new training instance or load previous checkpoint ======================
    if cp.has_checkpoint(exp_path, rb_path):
        print("*** loading checkpoint from {} ***".format(exp_path))
        (total_timesteps, episode_num, replay_buffer, num_samples,
         loaded_path) = cp.load_checkpoint(exp_path, rb_path, policy, args)
        print("*** checkpoint loaded from {} ***".format(loaded_path))
    else:
        print("*** training from scratch ***")
        # init training vars
        total_timesteps = 0
        episode_num = 0
        num_samples = 0
        # a separate replay buffer is used per env to avoid excessive memory
        # use when there are many envs

    # Initialize training variables ================================================
    writer = SummaryWriter("%s/%s/" % (DATA_DIR, exp_name))
    s = time.time()
    # TODO: may have to move the following code into the loop
    timesteps_since_saving = 0
    this_training_timesteps = 0
    episode_timesteps = 0
    episode_reward = 0
    episode_reward_buffer = 0
    done = True

    # Start training ===========================================================
    for env_handle, env_name in zip(envs_train, envs_train_names):
        env = env_handle()
        obs = env.reset()
        replay_buffer = utils.ReplayBuffer(max_size=args.rb_max)
        policy.change_morphology(args.graphs[env_name])
        policy.graph = args.graphs[env_name]
        task_timesteps = 0
        done = False
        episode_timesteps = 0
        episode_reward = 0
        episode_reward_buffer = 0
        while task_timesteps < args.max_timesteps:
            # train and log after one episode for each env
            if done:
                # log updates and train policy
                if this_training_timesteps != 0:
                    policy.train(replay_buffer, episode_timesteps,
                                 args.batch_size, args.discount, args.tau,
                                 args.policy_noise, args.noise_clip,
                                 args.policy_freq, graphs=args.graphs,
                                 env_name=env_name)
                # add to tensorboard display
                writer.add_scalar('{}_episode_reward'.format(env_name),
                                  episode_reward, task_timesteps)
                writer.add_scalar('{}_episode_len'.format(env_name),
                                  episode_timesteps, task_timesteps)
                # print to console
                print("-" * 50 +
                      "\nExpID: {}, FPS: {:.2f}, TotalT: {}, EpisodeNum: {}, SampleNum: {}, ReplayBSize: {}"
                      .format(args.expID,
                              this_training_timesteps / (time.time() - s),
                              total_timesteps, episode_num, num_samples,
                              len(replay_buffer.storage)))
                print("{} === EpisodeT: {}, Reward: {:.2f}".format(
                    env_name, episode_timesteps, episode_reward))
                this_training_timesteps = 0
                s = time.time()

                # save model and replay buffers
                if timesteps_since_saving >= args.save_freq:
                    timesteps_since_saving = 0
                    model_saved_path = cp.save_model(
                        exp_path, policy, total_timesteps, episode_num,
                        num_samples, {env_name: replay_buffer},
                        envs_train_names, args)
                    print("*** model saved to {} ***".format(model_saved_path))
                    rb_saved_path = cp.save_replay_buffer(
                        rb_path, {env_name: replay_buffer})
                    print("*** replay buffers saved to {} ***".format(rb_saved_path))

                # reset training variables
                obs = env.reset()
                done = False
                episode_reward = 0
                episode_timesteps = 0
                episode_num += 1
                # reset the buffer that accumulates reward for a sub-env while it is not done
                episode_reward_buffer = 0

            # start sampling ===========================================================
            # sample actions randomly for a while, then according to the policy
            if task_timesteps < args.start_timesteps:
                action = np.random.uniform(low=env.action_space.low[0],
                                           high=env.action_space.high[0],
                                           size=max_num_limbs)
            else:
                # remove 0-padding of obs before feeding it to the policy (trick for vectorized envs)
                obs = np.array(obs[:args.limb_obs_size *
                                   len(args.graphs[env_name])])
                policy_action = policy.select_action(obs)
                if args.expl_noise != 0:
                    policy_action = (policy_action + np.random.normal(
                        0, args.expl_noise, size=policy_action.size)).clip(
                            env.action_space.low[0], env.action_space.high[0])
                # add 0-padding so the action size is the same for all envs
                action = np.append(
                    policy_action,
                    np.zeros(max_num_limbs - policy_action.size))

            # perform action in the environment
            new_obs, reward, done, _ = env.step(action)

            # accumulate the instant reward; once the sub-env is done, move the
            # buffered value into the episode reward
            episode_reward_buffer += reward
            if done and episode_reward == 0:
                episode_reward = episode_reward_buffer
                episode_reward_buffer = 0
            writer.add_scalar('{}_instant_reward'.format(env_name), reward,
                              task_timesteps)
            done_bool = float(done)
            if episode_timesteps + 1 == args.max_episode_steps:
                done_bool = 0
                done = True
            # remove 0-padding before storing in the replay buffer (trick for vectorized envs)
            num_limbs = len(args.graphs[env_name])
            obs = np.array(obs[:args.limb_obs_size * num_limbs])
            new_obs = np.array(new_obs[:args.limb_obs_size * num_limbs])
            action = np.array(action[:num_limbs])
            # insert the transition into the replay buffer
            replay_buffer.add((obs, new_obs, action, reward, done_bool))
            num_samples += 1

            # do not increment episode_timesteps if the sub-env is already 'done'
            if not done:
                episode_timesteps += 1
            total_timesteps += 1
            task_timesteps += 1
            this_training_timesteps += 1
            timesteps_since_saving += 1
            obs = new_obs
        policy.next_task()

    # save checkpoint after training ================================================
    model_saved_path = cp.save_model(exp_path, policy, total_timesteps,
                                     episode_num, num_samples,
                                     {envs_train_names[-1]: replay_buffer},
                                     envs_train_names, args)
    print("*** training finished and model saved to {} ***".format(model_saved_path))
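# The sampling loop above zero-pads observations and actions to a shared
# maximum size so that morphologies with different limb counts can pass
# through one interface, then strips the padding again before the policy and
# the replay buffer see the data. A self-contained sketch of that pattern
# follows; the helper names (pad_to, strip_padding) and the sizes are
# hypothetical, not part of the original utils module.
import numpy as np

def pad_to(vec, target_len):
    # right-pad a 1-D array with zeros up to a fixed length
    return np.append(vec, np.zeros(target_len - vec.size))

def strip_padding(vec, true_len):
    # drop the zero padding, keeping only the real entries
    return np.array(vec[:true_len])

# usage, mirroring the loop above (sizes are illustrative):
limb_obs_size, num_limbs, max_num_limbs = 4, 3, 5
obs = np.random.randn(limb_obs_size * max_num_limbs)      # padded obs from the env wrapper
true_obs = strip_padding(obs, limb_obs_size * num_limbs)  # what the policy actually consumes
policy_action = np.random.uniform(-1, 1, size=num_limbs)  # stand-in for policy.select_action
action = pad_to(policy_action, max_num_limbs)             # fixed-size action for env.step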