def fit(self, D, pi):
    """Estimate xi to compute mu, assuming the action-value function
    mu(s, a) is linearly parametrized by xi such that
    mu(s, a) = Q_phi(s, a) = xi^T psi(s).

    Parameters
    ----------
    D : dict
        dataset of transitions and feature maps
    pi : Policy
        policy to evaluate

    Returns
    -------
    xi_hat : np.ndarray
        estimated coefficient matrix

    TODO
    - vectorize this
    - phi(s, a) or phi(s): when to use which
    - which phi or psi to use?
    - check dimensionality of everything
    """
    self._D = D
    s_next = self._D["s_next"]
    absorb = self._D["done"]
    phi_sa = self._D["phi_sa"]
    psi_sa = self._D["psi_sa"]
    self._psi = self._D["psi_fn"]

    a_next = [
        pi.act(self._stochastic, s[np.newaxis, ...])[0] for s in s_next
    ]
    psi_sa_next = self._psi(s_next, a_next)
    # zero out successor features at absorbing states
    psi_sa_next[absorb.flatten(), :] = 0

    A_hat = np.zeros((self._q, self._q))
    A_hat += self._lstd_eps * np.identity(self._q)
    b_hat = np.zeros((self._q, self._p))

    psi_delta = psi_sa - self._gamma * psi_sa_next
    A_hat += psi_sa.T.dot(psi_delta)
    b_hat = psi_sa.T.dot(phi_sa)

    rank = matrix_rank(A_hat)
    if rank == self._p:
        xi_hat = solve(A_hat, b_hat)
    else:
        logger.log("condition number of A_hat\n{}".format(cond(A_hat)))
        logger.log("A_hat is not full rank {} < {}".format(rank, self._p))
        xi_hat = lstsq(A_hat, b_hat)[0]
    self._xi_hat = xi_hat
    return xi_hat

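# A minimal, self-contained sketch of the linear system that fit() above
# solves, assuming psi_sa and psi_sa_next have shape (n, q) and phi_sa has
# shape (n, p). The function name and signature below are illustrative only
# and are not part of the estimator's API.
import numpy as np
from numpy.linalg import lstsq, matrix_rank, solve


def lstd_mu_sketch(psi_sa, psi_sa_next, phi_sa, gamma, eps=1e-5):
    """Solve (Psi^T (Psi - gamma Psi') + eps I) xi = Psi^T Phi for xi."""
    q = psi_sa.shape[1]
    A = eps * np.identity(q) + psi_sa.T.dot(psi_sa - gamma * psi_sa_next)
    b = psi_sa.T.dot(phi_sa)
    if matrix_rank(A) == q:
        return solve(A, b)             # unique solution when A is full rank
    return lstsq(A, b, rcond=None)[0]  # fall back to least squares otherwise
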
def fit(self, phi_sa, phi_sa_next, r):
    """Estimate W with LSTD-Q, assuming the action-value function Q(s, a)
    is linearly parametrized by W such that Q(s, a) = W^T phi(s, a).

    Parameters
    ----------
    phi_sa : np.ndarray
        features of the observed (s, a) pairs
    phi_sa_next : np.ndarray
        features of the successor (s', a') pairs
    r : np.ndarray
        observed rewards

    Returns
    -------
    W_hat : np.ndarray
        estimated weight vector

    TODO
    - check dimensionality of everything
    """
    gamma = self._gamma
    A_hat, b_hat = fast_solve(phi_sa, phi_sa_next, r, gamma)

    rank = matrix_rank(A_hat)
    if rank == self._p:
        W_hat = solve(A_hat, b_hat)
    else:
        logger.log("condition number of A_hat\n{}".format(cond(A_hat)))
        logger.log("A_hat is not full rank {} < {}".format(rank, self._p))
        W_hat = lstsq(A_hat, b_hat)[0]
    self._W_hat = W_hat
    return W_hat

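# A rough sketch of the LSTD-Q system that a helper like fast_solve is
# presumably assembling. fast_solve is defined elsewhere in the codebase, so
# treat this as an assumption about what it computes, not its actual
# implementation. With phi_sa of shape (n, p), phi_sa_next of shape (n, p),
# and rewards r of shape (n,):
#
#   A = Phi^T (Phi - gamma * Phi')     # (p, p)
#   b = Phi^T r                        # (p,)
#   Q(s, a) = W^T phi(s, a), where W solves A W = b
def lstdq_system_sketch(phi_sa, phi_sa_next, r, gamma):
    A = phi_sa.T.dot(phi_sa - gamma * phi_sa_next)
    b = phi_sa.T.dot(r)
    return A, b
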
def train_bc(task, params, ob_space, ac_space, args, env):
    task_path = os.path.join(root_path, "task", args.task)
    plot_path = os.path.join(task_path, "result")

    dataset = GymDataset(expert_path=args.expert_path,
                         traj_limitation=args.traj_limitation)

    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size_phi=args.policy_hidden_size,
                                    num_hid_layers_phi=2,
                                    dim_phi=args.dim_phi)

    env_name = task["env_id"]
    name = "pi.{}.{}".format(env_name.lower().split("-")[0],
                             args.traj_limitation)
    pi = policy_fn(name, ob_space, ac_space)
    n_action = env.action_space.n

    fname = "ckpt.bc.{}.{}".format(args.traj_limitation, args.seed)
    savedir_fname = osp.join(args.checkpoint_dir, fname, fname)

    # train with behavior cloning only if no checkpoint exists yet
    if not os.path.exists(savedir_fname + ".index"):
        savedir_fname = learn(pi,
                              dataset,
                              env_name,
                              n_action,
                              prefix="bc",
                              seed=args.seed,
                              traj_lim=args.traj_limitation,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=osp.join(args.checkpoint_dir, fname),
                              plot_dir=plot_path,
                              task_name=task["env_id"],
                              verbose=True)
    logger.log("{} saved".format(savedir_fname))

    # avg_len, avg_ret = run_gym(env,
    #                            policy_fn,
    #                            savedir_fname,
    #                            timesteps_per_batch=args.horizon,
    #                            number_trajs=10,
    #                            stochastic_policy=args.stochastic_policy,
    #                            save=args.save_sample,
    #                            reuse=True)

    return savedir_fname

def log_info(self):
    logger.log("Total trajectories: %d" % self.num_traj)
    logger.log("Total transitions: %d" % self.num_transition)
    logger.log("Average returns: %f" % self.avg_ret)
    logger.log("Std of returns: %f" % self.std_ret)

def _train(self):
    self._buffer_list = []
    self._beta_schedule_list = []

    if self._prioritized_replay:
        self._rb = PrioritizedReplayBufferNextAction(
            self._n_train, alpha=self._prioritized_replay_alpha)
        if self._prioritized_replay_beta_iters is None:
            self._prioritized_replay_beta_iters = self._max_timesteps
        self._bs = LinearSchedule(self._prioritized_replay_beta_iters,
                                  initial_p=self._prioritized_replay_beta0,
                                  final_p=1.0)
    else:
        self._rb = ReplayBufferNextAction(self._n_train)

    D_train_zipped = zip(self._D_train["s"], self._D_train["a"],
                         self._D_train["phi_sa"], self._D_train["s_next"],
                         self._D_train["done"])

    # fill the replay buffer with the training transitions, picking a' with pi
    for (s, a, phi_sa, s_next, done) in D_train_zipped:
        a_next = self._pi.act(self._mu_stochastic, s_next[np.newaxis, ...])[0]
        self._rb.add(s, a, phi_sa.flatten(), s_next, a_next, float(done))

    phi_sa_val = self._D_val["phi_sa"]
    s_val = self._D_val["s"]
    a_val = self._D_val["a"]
    s_next_val = self._D_val["s_next"]
    a_next_val = self._pi.act(self._mu_stochastic, s_next_val)[0]
    a_next_val = a_next_val[..., np.newaxis]

    sess = tf.Session()
    sess.__enter__()

    def make_obs_ph(name):
        return BatchInput(self._obs_shape, name=name)

    def make_acs_ph(name):
        return BatchInput(self._acs_shape, name=name)

    tools = build_train(
        make_obs_ph=make_obs_ph,
        make_acs_ph=make_acs_ph,
        optimizer=tf.train.AdamOptimizer(learning_rate=self._lr),
        mu_func=self._model,
        phi_sa_dim=self._mu_dim,
        grad_norm_clipping=self._grad_norm_clipping,
        gamma=self._gamma,
        scope=self._scope_name,
        reuse=True)
    mu_estimator, train, update_target = tools

    self._timestep = int(self._exploration_fraction * self._max_timesteps)

    U.initialize()
    update_target()

    for t in itertools.count():
        if self._prioritized_replay:
            experience = self._rb.sample(self._buffer_batch_size,
                                         beta=self._bs.value(t + 1))
            (s, a, phi_sa, s_next, a_next, dones, weights,
             batch_idxes) = experience
        else:
            s, a, phi_sa, s_next, a_next, dones = self._rb.sample(
                self._buffer_batch_size)
            weights, batch_idxes = np.ones(self._buffer_batch_size), None

        if len(a_next.shape) == 1:
            a_next = np.expand_dims(a_next, axis=1)

        td_errors = train(self._mu_stochastic, s, a, phi_sa, s_next, a_next,
                          dones, weights)

        if self._prioritized_replay:
            new_priorities = np.abs(td_errors) + self._prioritized_replay_eps
            self._rb.update_priorities(batch_idxes, new_priorities)

        if t % self._target_network_update_freq == 0:
            logger.log("average training td_errors: {}".format(
                td_errors.mean()))
            update_target()

        if t % self._evaluation_freq == 0:
            logger.log("trained for {} steps".format(t))
            mu_est_val = mu_estimator(self._mu_stochastic, s_val, a_val)
            mu_target_val = phi_sa_val + self._gamma * mu_estimator(
                self._mu_stochastic, s_next_val, a_next_val)
            # mean squared TD error, averaged over rows and columns
            td_errors_val = np.mean((mu_est_val - mu_target_val)**2)
            if td_errors_val < self._delta:
                logger.log(
                    "mean validation td_errors: {}".format(td_errors_val))
                break

        if t > self._max_timesteps:
            break

    self._mu_estimator = mu_estimator
    return mu_estimator

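# The stopping criterion above compares the estimated feature expectations
# against their one-step successor-feature target
# phi(s, a) + gamma * mu(s', a'). A minimal NumPy sketch of that check; the
# function name and arguments are illustrative, not part of the class API.
import numpy as np


def validation_td_error_sketch(mu_fn, phi_sa_val, s_val, a_val, s_next_val,
                               a_next_val, gamma):
    """Mean squared TD error of mu(s, a) vs. phi(s, a) + gamma * mu(s', a')."""
    mu_est = mu_fn(s_val, a_val)                               # (n, mu_dim)
    mu_target = phi_sa_val + gamma * mu_fn(s_next_val, a_next_val)
    return np.mean((mu_est - mu_target) ** 2)
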
def learn_original(pi,
                   dataset,
                   env_name,
                   n_action,
                   prefix,
                   traj_lim,
                   seed,
                   optim_batch_size=128,
                   max_iters=5e3,
                   adam_epsilon=1e-4,
                   optim_stepsize=1e-4,
                   ckpt_dir=None,
                   plot_dir=None,
                   task_name=None,
                   verbose=False):
    """Learn a policy with Behavior Cloning, without regularization."""
    # custom hyperparameters (override the defaults above)
    seed = 0
    max_iters = 5e4
    val_per_iter = int(max_iters / 10)

    # placeholders
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")

    loss = tf.reduce_mean(tf.square(tf.to_float(ac - pi.ac)))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["val_action_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, _, _ = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)

        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, _, _ = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))
            loss_history["train_action_loss"].append(train_loss)
            loss_history["val_action_loss"].append(val_loss)

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        # create the checkpoint directory only when one was given
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname

def learn(network,
          dataset,
          env_name,
          n_action,
          prefix,
          traj_lim,
          seed,
          optim_batch_size=32,
          max_iters=1e4,
          adam_epsilon=1e-4,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          plot_dir=None,
          task_name=None,
          verbose=False):
    """Learn a policy with Behavior Cloning, with transition regularization."""
    seed = 0
    alpha = 0.7
    beta = 1.0
    pi = network.pi
    T = network.T
    val_per_iter = int(max_iters / 20)

    ob = U.get_placeholder_cached(name="ob")
    T_ac = U.get_placeholder_cached(name="T_ac")
    pi_stochastic = U.get_placeholder_cached(name="pi_stochastic")
    T_stochastic = U.get_placeholder_cached(name="T_stochastic")
    ac = network.pdtype.sample_placeholder([None])
    ob_next = network.ob_next_pdtype.sample_placeholder([None])

    onehot_ac = tf.one_hot(ac, depth=n_action)
    ce_loss = tf.losses.softmax_cross_entropy(logits=pi.logits,
                                              onehot_labels=onehot_ac)
    ce_loss = tf.reduce_mean(ce_loss)
    reg_loss = tf.reduce_mean(
        tf.square(tf.to_float(ob_next - network.ob_next)))
    losses = [ce_loss, reg_loss]
    total_loss = alpha * ce_loss + beta * reg_loss

    var_list = network.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function(
        [ob, ac, T_ac, ob_next, pi_stochastic, T_stochastic],
        losses + [U.flatgrad(total_loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["train_transition_loss"] = []
    loss_history["val_action_loss"] = []
    loss_history["val_transition_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss_ce, train_loss_reg, g = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
        adam.update(g, optim_stepsize)

        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
                -1, 'val')
            val_loss_ce, val_loss_reg, _ = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
            items = [train_loss_ce, train_loss_reg, val_loss_ce, val_loss_reg]
            logger.log("Training Action loss: {}\n"
                       "Training Transition loss: {}\n"
                       "Validation Action loss: {}\n"
                       "Validation Transition loss: {}\n".format(*items))

            loss_history["train_action_loss"].append(train_loss_ce)
            loss_history["train_transition_loss"].append(train_loss_reg)
            loss_history["val_action_loss"].append(val_loss_ce)
            loss_history["val_transition_loss"].append(val_loss_reg)

            # if len(loss_history["val_action_loss"]) > 1:
            #     val_loss_ce_delta = loss_history["val_action_loss"][-1] - val_loss_ce
            #     if np.abs(val_loss_ce_delta) < val_stop_threshold:
            #         logger.log("validation error seems to have converged.")
            #         break

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        # create the checkpoint directory only when one was given
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=network.get_variables())
    return savedir_fname

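# The objective above is a weighted sum of an action cross-entropy term and a
# next-observation regression term. A small NumPy sketch of the same
# combination; the function name and its inputs are purely illustrative.
import numpy as np


def bc_reg_loss_sketch(logits, expert_actions, pred_ob_next, expert_ob_next,
                       alpha=0.7, beta=1.0):
    """alpha * CE(pi(a|s), a_expert) + beta * ||ob_next_hat - ob_next||^2."""
    # softmax cross-entropy against the expert's (integer-coded) actions
    logits = logits - logits.max(axis=1, keepdims=True)
    log_probs = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    ce = -log_probs[np.arange(len(expert_actions)), expert_actions].mean()
    # squared error on the predicted next observation (transition regularizer)
    reg = np.mean((pred_ob_next - expert_ob_next) ** 2)
    return alpha * ce + beta * reg
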
def train_mma(pi_0,
              phi_sa_dim,
              task_desc,
              params,
              D,
              evaluator,
              ob_space=None,
              ac_space=None):
    gym.logger.setLevel(logging.WARN)
    gamma = task_desc["gamma"]
    horizon = task_desc["horizon"]
    eps = params["eps"]
    p = q = phi_sa_dim  # feature dimension, including the action dimension
    phi = D["phi_fn"]
    phi_s = D["phi_fn_s"]
    stochastic = True

    mu_estimator_type = params["mu_estimator"]
    n_action = task_desc["n_action"]
    assert isinstance(n_action, int)
    action_list = range(n_action)
    precision = params["precision"]

    mu_exp_estimator = EmpiricalMuEstimator(phi, gamma)
    mu_exp_estimator.fit(D, stochastic, return_s_init=True)
    mu_exp, s_init_list = mu_exp_estimator.estimate()

    logger.log("fitting {}".format(mu_estimator_type))

    if task_desc["type"] == "gym":
        env = gym.make(task_desc["env_id"])
        ac_space = env.action_space
        ob_space = env.observation_space
        mu_dim = p  # only for discrete actions
    elif task_desc["type"] == "sepsis":
        if ac_space is None:
            ac_space = (5, )
        if ob_space is None:
            ob_space = (46, )
        mu_dim = p
        stochastic = True

    s = D["s"]
    a = D["a"]
    if len(a.shape) == 1:
        a = np.expand_dims(a, axis=1)
    s_next = D["s_next"]
    done = D["done"]
    if len(done.shape) == 1:
        done = np.expand_dims(done, axis=1)
    phi_sa = D["phi_sa"]

    # 70/30 train/validation split over transitions
    n_transition = D["s"].shape[0]
    idx = int(n_transition * 0.7)

    D_train = {
        "s": s[:idx, :],
        "a": a[:idx, :],
        "phi_sa": phi_sa[:idx, :],
        "s_next": s_next[:idx, :],
        "done": done[:idx, :]
    }
    D_val = {
        "s": s[idx:, :],
        "a": a[idx:, :],
        "phi_sa": phi_sa[idx:, :],
        "s_next": s_next[idx:, :],
        "done": done[idx:, :]
    }

    if mu_estimator_type == "lstd":
        mu_estimator = LSTDMuEstimator(phi, gamma, D, p, q, eps, s_init_list)
    elif mu_estimator_type == "dsfn":
        mu_estimator = DeepMuEstimator(phi, gamma, D_train, D_val,
                                       s_init_list, ob_space, ac_space,
                                       mu_dim, horizon)
    else:
        raise NotImplementedError

    if params["mdp_solver"] == "lspi":
        W_0 = np.random.normal(loc=0, scale=0.1, size=p)
        mdp_solver = LSPI(D=D,
                          action_list=range(task_desc["n_action"]),
                          p=p,
                          gamma=gamma,
                          precision=precision,
                          lstd_eps=params["eps"],
                          W_0=W_0,
                          reward_fn=None,
                          stochastic=True,
                          max_iter=10)
    elif params["mdp_solver"] == "dqn":
        mdp_solver = DQNSepsis(D=D_train)
    else:
        raise NotImplementedError

    mma = MaxMarginAbbeel(pi_init=pi_0,
                          p=p,
                          phi=phi,
                          mu_exp=mu_exp,
                          mdp_solver=mdp_solver,
                          evaluator=evaluator,
                          irl_precision=params["precision"],
                          method=params["method"],
                          mu_estimator=mu_estimator,
                          stochastic=stochastic,
                          D_val=D_val)

    results = mma.run(n_iteration=params["n_iteration"])
    return results

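# One way to read the loop that mma.run drives: the projection variant of
# apprenticeship learning (Abbeel & Ng, 2004) keeps a running point mu_bar in
# the convex hull of the policy feature expectations seen so far, and the
# reward weights are w = mu_exp - mu_bar. The sketch below shows one such
# projection step as a hedged illustration; it is not necessarily how
# MaxMarginAbbeel implements its `method` options.
import numpy as np


def projection_step_sketch(mu_exp, mu_bar, mu_new):
    """One Abbeel-Ng projection update given a newly estimated mu_new."""
    d = mu_new - mu_bar
    # orthogonal projection coefficient of (mu_exp - mu_bar) onto d
    step = d.dot(mu_exp - mu_bar) / d.dot(d)
    mu_bar_next = mu_bar + step * d
    w = mu_exp - mu_bar_next           # next reward weights
    t = np.linalg.norm(w, 2)           # margin; stop once t < irl_precision
    return w, t, mu_bar_next
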