env.seed(args.seed)

"""define actor and critic"""
# hierarchical setup: a discrete manager picks one of 7 subgoals, and a
# continuous worker acts on the state concatenated with the subgoal
# (subgoal_dim is defined elsewhere in this script)
if args.model_path is None:
    policy_mgr = DiscretePolicy(state_dim, 7)
    policy_wrk = Policy(state_dim + subgoal_dim, env.action_space.shape[0], log_std=args.log_std)
    value_mgr = Value(state_dim)
    value_wrk = Value(state_dim + subgoal_dim)
else:
    # assumes the checkpoint stores the hierarchical nets saved by this script
    policy_mgr, policy_wrk, value_mgr, value_wrk, running_state = pickle.load(open(args.model_path, "rb"))
policy_mgr.to(device)
policy_wrk.to(device)
value_mgr.to(device)
value_wrk.to(device)

# optim_policy_m = torch.optim.Adam(policy_mgr.parameters(), lr=0.01)
# optim_policy_w = torch.optim.Adam(policy_wrk.parameters(), lr=0.01)
# optim_value_m = torch.optim.Adam(value_mgr.parameters(), lr=0.01)
# optim_value_w = torch.optim.Adam(value_wrk.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env, policy_mgr, policy_wrk, device, running_state=running_state, render=args.render,
              num_threads=args.num_threads)
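# --- Hedged sketch: how the manager/worker pair above could produce an
# environment action. The Agent's rollout internals are not shown in this
# fragment; select_action and the one-hot subgoal encoding below are
# assumptions for illustration, not the repository's confirmed API.
import torch

def hierarchical_act(state):
    state_t = torch.as_tensor(state, dtype=torch.float64).unsqueeze(0)
    # manager: discrete policy over the 7 subgoals, conditioned on the raw state
    subgoal = int(policy_mgr.select_action(state_t)[0])
    # worker: continuous policy conditioned on (state, one-hot subgoal),
    # assuming subgoal_dim == 7 so the subgoal index can be one-hot encoded
    onehot = torch.zeros(1, subgoal_dim, dtype=torch.float64)
    onehot[0, subgoal] = 1.0
    action = policy_wrk.select_action(torch.cat([state_t, onehot], dim=1))[0]
    return subgoal, action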
is_disc_action = len(env_dummy.action_space.shape) == 0
running_state = ZFilter((state_dim,), clip=5)
# running_reward = ZFilter((1,), demean=False, clip=10)

"""define actor and critic"""
if args.model_path is None:
    if is_disc_action:
        policy_net = DiscretePolicy(state_dim, env_dummy.action_space.n)
    else:
        policy_net = Policy(state_dim, env_dummy.action_space.shape[0], log_std=args.log_std)
    value_net = Value(state_dim)
else:
    policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
policy_net.to(device)
value_net.to(device)
del env_dummy

optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=0.01)
optimizer_value = torch.optim.Adam(value_net.parameters(), lr=0.01)

"""create agent"""
agent = Agent(env_factory, policy_net, device, running_state=running_state, render=args.render,
              num_threads=args.num_threads)


def update_params(batch):
    ...  # truncated in this fragment; a full implementation appears in train() below
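# --- Hedged sketch: env_factory is referenced above but not defined in this
# fragment. With num_threads > 1 the Agent presumably needs a per-thread
# environment constructor (defined before the Agent call) rather than a
# shared env instance; the one-argument signature below is an assumption
# for illustration.
import gym

def env_factory(thread_id):
    env = gym.make(args.env_name)
    env.seed(args.seed + thread_id)  # give each sampling thread its own seed
    return env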
def train(**kwargs):
    config = {
        "lr": kwargs['lr'],
        "gamma": kwargs['gamma'],
    }
    dtype = torch.float64
    torch.set_default_dtype(dtype)
    device = torch.device('cuda', index=args.gpu_index) if torch.cuda.is_available() else torch.device('cpu')
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu_index)

    """environment"""
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape[0]
    is_disc_action = len(env.action_space.shape) == 0
    running_state = ZFilter((state_dim,), clip=5)
    # running_reward = ZFilter((1,), demean=False, clip=10)

    """seeding"""
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    env.seed(args.seed)

    """define actor and critic"""
    if args.model_path is None:
        if is_disc_action:
            policy_net = DiscretePolicy(state_dim, env.action_space.n)
        else:
            policy_net = Policy(state_dim, env.action_space.shape[0], log_std=args.log_std)
        value_net = Value(state_dim)
    else:
        policy_net, value_net, running_state = pickle.load(open(args.model_path, "rb"))
    policy_net.to(device)
    value_net.to(device)

    # optimizers are created here, in train()'s scope, so that update_params
    # below can close over them; the learning rate comes from the tuning config
    optimizer_policy = torch.optim.Adam(policy_net.parameters(), lr=config['lr'])
    optimizer_value = torch.optim.Adam(value_net.parameters(), lr=config['lr'])

    # optimization epoch number and batch size for PPO
    optim_epochs = 10
    optim_batch_size = 64

    """create agent"""
    agent = Agent(env, policy_net, device, running_state=running_state, render=args.render,
                  num_threads=args.num_threads)

    def update_params(batch, i_iter, config):
        states = torch.from_numpy(np.stack(batch.state)).to(dtype).to(device)
        actions = torch.from_numpy(np.stack(batch.action)).to(dtype).to(device)
        rewards = torch.from_numpy(np.stack(batch.reward)).to(dtype).to(device)
        masks = torch.from_numpy(np.stack(batch.mask)).to(dtype).to(device)
        with torch.no_grad():
            values = value_net(states)
            fixed_log_probs = policy_net.get_log_prob(states, actions)

        """get advantage estimation from the trajectories"""
        advantages, returns = estimate_advantages(rewards, masks, values, config['gamma'], args.tau, device)

        """perform mini-batch PPO update"""
        optim_iter_num = int(math.ceil(states.shape[0] / optim_batch_size))
        for _ in range(optim_epochs):
            perm = np.arange(states.shape[0])
            np.random.shuffle(perm)
            perm = LongTensor(perm).to(device)

            states, actions, returns, advantages, fixed_log_probs = \
                states[perm].clone(), actions[perm].clone(), returns[perm].clone(), \
                advantages[perm].clone(), fixed_log_probs[perm].clone()

            for i in range(optim_iter_num):
                ind = slice(i * optim_batch_size, min((i + 1) * optim_batch_size, states.shape[0]))
                states_b, actions_b, advantages_b, returns_b, fixed_log_probs_b = \
                    states[ind], actions[ind], advantages[ind], returns[ind], fixed_log_probs[ind]

                ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, 1, states_b,
                         actions_b, returns_b, advantages_b, fixed_log_probs_b, args.clip_epsilon, args.l2_reg)

    def main_loop(config):
        for i_iter in range(args.max_iter_num):
            """generate multiple trajectories that reach the minimum batch_size"""
            batch, log = agent.collect_samples(args.min_batch_size)
            t0 = time.time()
            update_params(batch, i_iter, config)
            t1 = time.time()

            if i_iter % args.log_interval == 0:
                print('{}\tT_sample {:.4f}\tT_update {:.4f}\tR_min {:.2f}\tR_max {:.2f}\tR_avg {:.2f}'.format(
                    i_iter, log['sample_time'], t1 - t0, log['min_reward'], log['max_reward'], log['avg_reward']))

            if args.save_model_interval > 0 and (i_iter + 1) % args.save_model_interval == 0:
                to_device(torch.device('cpu'), policy_net, value_net)
                pickle.dump((policy_net, value_net, running_state),
                            open(os.path.join(assets_dir(), 'learned_models/{}_ppo.p'.format(args.env_name)), 'wb'))
                to_device(device, policy_net, value_net)

            """clean up gpu memory"""
            torch.cuda.empty_cache()

        return agent.evaluate()

    print(config)
    print(args)
    return main_loop(config)
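# --- Hedged sketch: estimate_advantages and ppo_step are called in train()
# but not shown in this fragment. The versions below are minimal sketches of
# what they presumably compute (GAE and the PPO clipped surrogate); the
# signatures are inferred from the call sites above, and shapes are flattened
# to 1-D for simplicity, so treat these as illustrations rather than the
# repository's actual implementations.
import torch


def estimate_advantages(rewards, masks, values, gamma, tau, device):
    # Generalized Advantage Estimation over concatenated trajectories;
    # masks[i] == 0 marks the terminal step of an episode.
    rewards, masks = rewards.view(-1).cpu(), masks.view(-1).cpu()
    values = values.view(-1).cpu()
    advantages = torch.zeros_like(rewards)
    prev_value, prev_advantage = 0.0, 0.0
    for i in reversed(range(rewards.size(0))):
        delta = rewards[i] + gamma * prev_value * masks[i] - values[i]
        advantages[i] = delta + gamma * tau * prev_advantage * masks[i]
        prev_value, prev_advantage = values[i], advantages[i]
    returns = values + advantages
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
    return advantages.to(device), returns.to(device)


def ppo_step(policy_net, value_net, optimizer_policy, optimizer_value, optim_value_iternum,
             states, actions, returns, advantages, fixed_log_probs, clip_epsilon, l2_reg):
    """update critic: regress V(s) onto empirical returns, with L2 penalty"""
    for _ in range(optim_value_iternum):
        value_loss = (value_net(states).view(-1) - returns).pow(2).mean()
        for param in value_net.parameters():
            value_loss += param.pow(2).sum() * l2_reg
        optimizer_value.zero_grad()
        value_loss.backward()
        optimizer_value.step()

    """update actor: PPO clipped surrogate objective"""
    log_probs = policy_net.get_log_prob(states, actions).view(-1)
    ratio = torch.exp(log_probs - fixed_log_probs.view(-1))
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    optimizer_policy.zero_grad()
    policy_loss.backward()
    torch.nn.utils.clip_grad_norm_(policy_net.parameters(), 40)
    optimizer_policy.step()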