def _train_batch(self, reward_fn=None):
    sess = tf.Session()
    sess.__enter__()

    def make_obs_ph(name):
        return BatchInput(self._obs_shape, name=name)

    tools = deepq.build_train(
        make_obs_ph=make_obs_ph,
        q_func=self._model,
        num_actions=self._n_action,
        optimizer=tf.train.AdamOptimizer(learning_rate=self._lr),
        gamma=self._gamma,
        grad_norm_clipping=self._grad_norm_clipping)
    act, train, update_target, debug = tools

    self._timestep = int(self._exploration_fraction * self._max_timesteps)

    U.initialize()
    update_target()

    for t in itertools.count():
        if self._prioritized_replay:
            experience = self._replay_buffer.sample(
                self._buffer_batch_size,
                beta=self._beta_schedule.value(t + 1))
            (s, a, r, s_next, dones, weights, batch_idxes) = experience
        else:
            s, a, r, s_next, dones = self._replay_buffer.sample(
                self._buffer_batch_size)
            weights, batch_idxes = np.ones_like(r), None
        if reward_fn is not None:
            # relabel the sampled rewards with the provided reward function
            r = np.array(
                [np.asscalar(reward_fn(s_i, a_i)) for s_i, a_i in zip(s, a)])

        td_errors = train(s, a, r, s_next, dones, weights)

        if self._prioritized_replay:
            new_priorities = np.abs(td_errors) + self._prioritized_replay_eps
            self._replay_buffer.update_priorities(batch_idxes, new_priorities)

        if t % self._target_network_update_freq == 0:
            logging.info("been trained {} steps".format(t))
            update_target()

        if t > 100 and t % self._policy_evaluate_freq == 0:
            logging.info("evaluating the policy...{} steps".format(t))
            if self._env is not None:
                self._evaluate_policy(act)

        if t > self._max_timesteps:
            break

    self._policy = act
    return act
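
# Usage sketch for _train_batch's reward_fn hook: reward_fn is called once
# per (s, a) pair in a sampled batch and must return a scalar, since the
# loop reduces it with np.asscalar. The linear form r(s, a) = w . phi(s, a)
# and the names `w` and `phi_fn` below are hypothetical stand-ins (e.g. a
# learned IRL reward weight and a feature map), not names from this module.
def make_linear_reward_fn(w, phi_fn):
    def reward_fn(s, a):
        # dot product of the reward weights with the state-action features
        return float(np.dot(w, phi_fn(s, a).flatten()))
    return reward_fn
# e.g. agent._train_batch(reward_fn=make_linear_reward_fn(w, phi_fn))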
def run_gym(env, policy_func, policy_name, load_model_path,
            timesteps_per_batch, number_trajs, stochastic_policy,
            save=False, reuse=False):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func(policy_name, ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi, env, timesteps_per_batch,
                                stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = (traj['ob'], traj['ac'],
                                    traj['ep_len'], traj['ep_ret'])
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)

    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')

    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(len_list),
                 rets=np.array(ret_list))

    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
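
# Example invocation of run_gym, kept as a comment so it does not execute at
# import time. This is a sketch: `policy_fn` mirrors the policy_func
# signature expected above, and the environment id and checkpoint path are
# assumptions, not values from this repository.
#
#   import gym
#   env = gym.make("CartPole-v1")
#   avg_len, avg_ret = run_gym(env,
#                              policy_func=policy_fn,
#                              policy_name="pi",
#                              load_model_path="checkpoints/ckpt.bc.10.0",
#                              timesteps_per_batch=1024,
#                              number_trajs=10,
#                              stochastic_policy=False)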
def train(data, task_desc, params, args, task_path):
    import gym

    ob_dim = data["irl"]["ob_list"][0].shape[0]
    #ob_dim = ob_dim // 4
    c = np.max([
        np.abs(np.min(data["irl"]["ob_list"])),
        np.abs(np.max(data["irl"]["ob_list"]))
    ])
    ob_low = np.ones(ob_dim) * -c
    ob_high = np.ones(ob_dim) * c
    ob_space = gym.spaces.Box(low=ob_low, high=ob_high)
    n_action = 5
    ac_space = gym.spaces.Discrete(n=n_action)

    if args.pretrain:
        model_path = os.path.join(root_path, "task", args.task, "model")
        fname = "ckpt.bc.{}.{}".format(args.traj_limitation, args.seed)
        ckpt_dir = os.path.join(model_path, fname)
        pretrained_path = os.path.join(ckpt_dir, fname)
        if not os.path.exists(os.path.join(ckpt_dir, "checkpoint")):
            print("==== pretraining starts ====")
            pretrained_path = train_bc_sepsis(task_desc, params, ob_space,
                                              ac_space, args)

    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def mlp_pi_wrapper(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size_phi=args.policy_hidden_size,
                                    num_hid_layers_phi=2,
                                    dim_phi=args.dim_phi)

    # just imitation learning
    #def mlp_pi_wrapper(name, ob_space, ac_space, reuse=False):
    #    return mlp_policy.MlpPolicyOriginal(name=name,
    #                                        ob_space=ob_space,
    #                                        ac_space=ac_space,
    #                                        reuse=reuse,
    #                                        hid_size=args.policy_hidden_size,
    #                                        num_hid_layers=2)

    env_name = task_desc["env_id"]
    scope_name = "pi.{}.{}".format(env_name.lower().split("-")[0],
                                   args.traj_limitation)
    pi_bc = mlp_pi_wrapper(scope_name, ob_space, ac_space)
    U.initialize()
    U.load_state(pretrained_path)
    phi_bc = pi_bc.featurize

    def phi_old(s, a):
        """
        TODO: if the action is discrete, one-hot encode the action and
        concatenate it with phi(s)
        """
        # expect phi(s) -> (N, state_dim)
        # expect a -> (N, action_dim)
        phi_s = phi_bc(s)
        if len(phi_s.shape) == 1:
            # s -> (1, state_dim)
            phi_s = np.expand_dims(phi_s, axis=0)
        # if a = 5, wrap it as [5]
        try:
            if a == int(a):
                a = [a]
        except (TypeError, ValueError):
            pass
        a = np.array(a)
        # if a = [5], lift it to [[5]]
        if len(a.shape) == 1:
            a = np.expand_dims(a, axis=1)
        # otherwise a is already 2-D, e.g. [[5], [3]]
        phi_sa = np.hstack((phi_s, a))
        return phi_sa

    def phi_discrete_action(n_action):
        def f(s, a):
            # expect phi(s) -> (N, state_dim)
            # expect a -> (N, action_dim)
            phi_s = phi_bc(s)
            try:
                if a == int(a):
                    a = [a]
            except (TypeError, ValueError):
                pass
            a = np.array(a)
            a_onehot = np.eye(n_action)[a.astype(int)]
            if len(phi_s.shape) == 1:
                # s -> (1, state_dim)
                phi_s = np.expand_dims(phi_s, axis=0)
            try:
                phi_sa = np.hstack((phi_s, a_onehot))
            except ValueError:
                # a came in as (N, 1), so the one-hot lookup produced
                # (N, 1, n_action); squeeze out the middle axis
                a_onehot = a_onehot.reshape(a_onehot.shape[0],
                                            a_onehot.shape[2])
                phi_sa = np.hstack((phi_s, a_onehot))
            return phi_sa

        return f

    if isinstance(ac_space, gym.spaces.Discrete):
        phi = phi_discrete_action(ac_space.n)
    elif isinstance(ac_space, gym.spaces.Box):
        phi = phi_continuous_action
    else:
        raise NotImplementedError

    D = data["irl"]
    obs = D["ob_list"].reshape(-1, D["ob_list"].shape[-1])
    obs_p1 = D["ob_next_list"].reshape(-1, D["ob_next_list"].shape[-1])
    # assuming action dof of 1
    acs = D["ac_list"].reshape(-1)
    new = D["new"].reshape(-1)

    data = {}
    data["s"] = obs
    data["a"] = acs
    data["s_next"] = obs_p1
    data["done"] = data["absorb"] = new
    data["phi_sa"] = phi(obs, acs)
    data["phi_fn"] = phi
    data["phi_fn_s"] = phi_bc
    data["psi_sa"] = data["phi_sa"]
    data["psi_fn"] = phi

    evaluator = ALEvaluator(data, task_desc["gamma"], env=None)
    data_path = os.path.join(task_path, "data")

    pi_0 = pi_bc
    phi_dim = data["phi_sa"].shape[1]
    model_id = "{}.{}".format(params["id"], params["version"])
    if model_id == "mma.0":
        result = train_mma(pi_0, phi_dim, task_desc, params, data, evaluator,
                           ob_space, ac_space)
    elif model_id == "mma.1":
        result = train_mma(pi_0, phi_dim, task_desc, params, data, evaluator,
                           ob_space, ac_space)
    elif model_id == "mma.2":
        #result = train_scirl_v2(data, phi_bc, evaluator, phi_dim, task_desc,
        #                        params)
        #result = train_scirl_v3(data, phi_bc, evaluator)
        result = train_scirl(data, phi_bc, evaluator)
    else:
        raise NotImplementedError

    name = "{}.{}.{}".format(model_id, args.n_e, args.seed)
    result_path = os.path.join(args.save_path, name + ".train.log")
    with open(result_path, "w") as f:
        # flush?
        for step in range(params["n_iteration"] + 1):
            data_points = [
                step,
                round(result["margin_mu"][step], 2),
                round(result["margin_v"][step], 2),
                round(result["a_match"][step], 2)
            ]
            f.write("{}\t{}\t{}\t{}\n".format(*data_points))

    with open(os.path.join(args.save_path, name + ".pkl"), "wb") as f:
        pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
def _train(self):
    self._buffer_list = []
    self._beta_schedule_list = []
    if self._prioritized_replay:
        self._rb = PrioritizedReplayBufferNextAction(
            self._n_train, alpha=self._prioritized_replay_alpha)
        if self._prioritized_replay_beta_iters is None:
            self._prioritized_replay_beta_iters = self._max_timesteps
        self._bs = LinearSchedule(self._prioritized_replay_beta_iters,
                                  initial_p=self._prioritized_replay_beta0,
                                  final_p=1.0)
    else:
        self._rb = ReplayBufferNextAction(self._n_train)

    D_train_zipped = zip(self._D_train["s"], self._D_train["a"],
                         self._D_train["phi_sa"], self._D_train["s_next"],
                         self._D_train["done"])
    for (s, a, phi_sa, s_next, done) in D_train_zipped:
        a_next = self._pi.act(self._mu_stochastic, s_next[np.newaxis, ...])[0]
        self._rb.add(s, a, phi_sa.flatten(), s_next, a_next, float(done))

    phi_sa_val = self._D_val["phi_sa"]
    s_val = self._D_val["s"]
    a_val = self._D_val["a"]
    s_next_val = self._D_val["s_next"]
    a_next_val = self._pi.act(self._mu_stochastic, s_next_val)[0]
    a_next_val = a_next_val[..., np.newaxis]

    sess = tf.Session()
    sess.__enter__()

    def make_obs_ph(name):
        return BatchInput(self._obs_shape, name=name)

    def make_acs_ph(name):
        return BatchInput(self._acs_shape, name=name)

    tools = build_train(
        make_obs_ph=make_obs_ph,
        make_acs_ph=make_acs_ph,
        optimizer=tf.train.AdamOptimizer(learning_rate=self._lr),
        mu_func=self._model,
        phi_sa_dim=self._mu_dim,
        grad_norm_clipping=self._grad_norm_clipping,
        gamma=self._gamma,
        scope=self._scope_name,
        reuse=True)
    mu_estimator, train, update_target = tools

    self._timestep = int(self._exploration_fraction * self._max_timesteps)

    U.initialize()
    update_target()

    for t in itertools.count():
        if self._prioritized_replay:
            experience = self._rb.sample(self._buffer_batch_size,
                                         beta=self._bs.value(t + 1))
            (s, a, phi_sa, s_next, a_next, dones, weights,
             batch_idxes) = experience
        else:
            s, a, phi_sa, s_next, a_next, dones = self._rb.sample(
                self._buffer_batch_size)
            weights, batch_idxes = np.ones(self._buffer_batch_size), None

        if len(a_next.shape) == 1:
            a_next = np.expand_dims(a_next, axis=1)

        td_errors = train(self._mu_stochastic, s, a, phi_sa, s_next, a_next,
                          dones, weights)

        if self._prioritized_replay:
            new_priorities = np.abs(td_errors) + self._prioritized_replay_eps
            self._rb.update_priorities(batch_idxes, new_priorities)

        if t % self._target_network_update_freq == 0:
            logger.log("average training td_errors: {}".format(
                td_errors.mean()))
            update_target()

        if t % self._evaluation_freq == 0:
            logger.log("been trained {} steps".format(t))
            mu_est_val = mu_estimator(self._mu_stochastic, s_val, a_val)
            mu_target_val = phi_sa_val + self._gamma * mu_estimator(
                self._mu_stochastic, s_next_val, a_next_val)
            # mean squared error, averaged over rows and cols
            td_errors_val = np.mean((mu_est_val - mu_target_val)**2)
            if td_errors_val < self._delta:
                logger.log(
                    "mean validation td_errors: {}".format(td_errors_val))
                break

        if t > self._max_timesteps:
            break

    self._mu_estimator = mu_estimator
    return mu_estimator
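
# The validation check above mirrors what build_train is assumed to optimize:
# the feature-expectation ("successor feature") estimator mu(s, a) is
# regressed toward the one-step TD target
#     phi(s, a) + gamma * mu(s', a'),   with a' ~ pi(s').
# A numpy sketch of that target; the done-masking term is an assumption
# (the validation code above does not mask terminals), and all arrays are
# hypothetical inputs.
def mu_td_target(phi_sa, mu_next, gamma, dones):
    # phi_sa:  (N, phi_dim) features of the current transitions
    # mu_next: (N, phi_dim) estimator output at (s', a')
    # dones:   (N,) 1.0 where the transition is terminal
    return phi_sa + gamma * (1.0 - dones)[:, None] * mu_next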
def learn_original(pi,
                   dataset,
                   env_name,
                   n_action,
                   prefix,
                   traj_lim,
                   seed,
                   optim_batch_size=128,
                   max_iters=5e3,
                   adam_epsilon=1e-4,
                   optim_stepsize=1e-4,
                   ckpt_dir=None,
                   plot_dir=None,
                   task_name=None,
                   verbose=False):
    """
    learn without regularization
    """
    # custom hyperparams (override the defaults above)
    seed = 0
    max_iters = 5e4
    val_per_iter = int(max_iters / 10)

    # placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")

    loss = tf.reduce_mean(tf.square(tf.to_float(ac - pi.ac)))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["val_action_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, _, _ = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)

        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, _, _ = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))
            loss_history["train_action_loss"].append(train_loss)
            loss_history["val_action_loss"].append(val_loss)

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
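
# The unregularized BC objective above is a plain regression of the policy's
# sampled action pi.ac onto the expert action: L = E[(a_expert - pi(s))^2].
# A minimal numpy version of the same loss for reference (inputs are
# hypothetical batches):
def bc_mse_loss(ac_expert, ac_pred):
    # mean squared error between expert and predicted actions
    return np.mean(np.square(ac_expert - ac_pred))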
def learn(network,
          dataset,
          env_name,
          n_action,
          prefix,
          traj_lim,
          seed,
          optim_batch_size=32,
          max_iters=1e4,
          adam_epsilon=1e-4,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          plot_dir=None,
          task_name=None,
          verbose=False):
    """
    learn with regularization
    """
    seed = 0
    alpha = 0.7
    beta = 1.0
    pi = network.pi
    T = network.T
    val_per_iter = int(max_iters / 20)

    ob = U.get_placeholder_cached(name="ob")
    T_ac = U.get_placeholder_cached(name="T_ac")
    pi_stochastic = U.get_placeholder_cached(name="pi_stochastic")
    T_stochastic = U.get_placeholder_cached(name="T_stochastic")
    ac = network.pdtype.sample_placeholder([None])
    ob_next = network.ob_next_pdtype.sample_placeholder([None])

    onehot_ac = tf.one_hot(ac, depth=n_action)
    ce_loss = tf.losses.softmax_cross_entropy(logits=pi.logits,
                                              onehot_labels=onehot_ac)
    ce_loss = tf.reduce_mean(ce_loss)
    reg_loss = tf.reduce_mean(
        tf.square(tf.to_float(ob_next - network.ob_next)))
    losses = [ce_loss, reg_loss]
    total_loss = alpha * ce_loss + beta * reg_loss

    var_list = network.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function(
        [ob, ac, T_ac, ob_next, pi_stochastic, T_stochastic],
        losses + [U.flatgrad(total_loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["train_transition_loss"] = []
    loss_history["val_action_loss"] = []
    loss_history["val_transition_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss_ce, train_loss_reg, g = lossandgrad(
            ob_expert, ac_expert, ac_expert, ob_next_expert, True, True)
        adam.update(g, optim_stepsize)

        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, ob_next_expert, info = \
                dataset.get_next_batch(-1, 'val')
            val_loss_ce, val_loss_reg, _ = lossandgrad(
                ob_expert, ac_expert, ac_expert, ob_next_expert, True, True)
            items = [train_loss_ce, train_loss_reg, val_loss_ce, val_loss_reg]
            logger.log("Training Action loss: {}\n"
                       "Training Transition loss: {}\n"
                       "Validation Action loss: {}\n"
                       "Validation Transition loss: {}\n".format(*items))
            loss_history["train_action_loss"].append(train_loss_ce)
            loss_history["train_transition_loss"].append(train_loss_reg)
            loss_history["val_action_loss"].append(val_loss_ce)
            loss_history["val_transition_loss"].append(val_loss_reg)
            #if len(loss_history["val_action_loss"]) > 1:
            #    val_loss_ce_delta = loss_history["val_action_loss"][-1] - val_loss_ce
            #    if np.abs(val_loss_ce_delta) < val_stop_threshold:
            #        logger.log("validation error seems to have converged.")
            #        break

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=network.get_variables())
    return savedir_fname
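
# The regularized BC objective above combines an action cross-entropy term
# with a next-observation regression term:
#     L = alpha * CE(pi(s), a_expert) + beta * ||s'_expert - T(s, a)||^2
# A numpy sketch of the combined loss, with the softmax cross-entropy done
# by hand; all inputs are hypothetical batches, and alpha/beta default to
# the values hard-coded in learn() above.
def bc_reg_loss(logits, ac_expert, ob_next_expert, ob_next_pred,
                alpha=0.7, beta=1.0):
    # numerically stable log-softmax over the action logits
    z = logits - logits.max(axis=1, keepdims=True)
    log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    # cross-entropy: negative log-probability of the expert action
    ce = -np.mean(log_probs[np.arange(len(ac_expert)), ac_expert])
    # transition regularizer: MSE between predicted and expert next obs
    reg = np.mean(np.square(ob_next_expert - ob_next_pred))
    return alpha * ce + beta * reg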