Example #1
    def _train_batch(self, reward_fn=None):
        sess = tf.Session()
        sess.__enter__()

        def make_obs_ph(name):
            return BatchInput(self._obs_shape, name=name)

        tools = deepq.build_train(
            make_obs_ph=make_obs_ph,
            q_func=self._model,
            num_actions=self._n_action,
            optimizer=tf.train.AdamOptimizer(learning_rate=self._lr),
            gamma=self._gamma,
            grad_norm_clipping=self._grad_norm_clipping)
        act, train, update_target, debug = tools

        self._timestep = int(self._exploration_fraction * self._max_timesteps)

        U.initialize()
        update_target()

        for t in itertools.count():
            if self._prioritized_replay:
                experience = self._replay_buffer.sample(
                    self._buffer_batch_size,
                    beta=self._beta_schedule.value(t + 1))
                (s, a, r, s_next, dones, weights, batch_idxes) = experience
                if reward_fn is not None:
                    # recompute rewards for the sampled transitions
                    r = np.array([float(reward_fn(s_i, a_i))
                                  for s_i, a_i in zip(s, a)])
            else:
                s, a, r, s_next, dones = self._replay_buffer.sample(
                    self._buffer_batch_size)
                if reward_fn is not None:
                    # recompute rewards for the sampled transitions
                    r = np.array([float(reward_fn(s_i, a_i))
                                  for s_i, a_i in zip(s, a)])
                weights, batch_idxes = np.ones_like(r), None
            td_errors = train(s, a, r, s_next, dones, weights)

            if self._prioritized_replay:
                new_priorities = np.abs(
                    td_errors) + self._prioritized_replay_eps
                self._replay_buffer.update_priorities(batch_idxes,
                                                      new_priorities)

            if t % self._target_network_update_freq == 0:
                logging.info("been trained {} steps".format(t))
                update_target()
            if t > 100 and t % self._policy_evaluate_freq == 0:
                logging.info("evaluating the policy...{} steps".format(t))
                if self._env is not None:
                    self._evaluate_policy(act)

            if t > self._max_timesteps:
                break

        self._policy = act
        return act
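The method above leans on the TF1-era OpenAI baselines deepq API. Below is a minimal, hedged sketch of the same build_train wiring outside of a class; the observation shape, action count, and hyperparameters are placeholders, not values from the original.

import tensorflow as tf

import baselines.common.tf_util as U
from baselines import deepq
from baselines.common.schedules import LinearSchedule
from baselines.deepq.replay_buffer import PrioritizedReplayBuffer
from baselines.deepq.utils import BatchInput

obs_shape = (4,)   # hypothetical observation shape
n_actions = 2      # hypothetical number of discrete actions

with U.make_session(num_cpu=1):
    # build the act / train / update_target graph, as in the example above
    act, train, update_target, debug = deepq.build_train(
        make_obs_ph=lambda name: BatchInput(obs_shape, name=name),
        q_func=deepq.models.mlp([64]),
        num_actions=n_actions,
        optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        gamma=0.99,
        grad_norm_clipping=10)

    replay_buffer = PrioritizedReplayBuffer(50000, alpha=0.6)
    beta_schedule = LinearSchedule(100000, initial_p=0.4, final_p=1.0)

    U.initialize()
    update_target()
    # fill replay_buffer from an environment, then loop exactly as above:
    # sample with beta_schedule.value(t), call train(...), and feed the
    # absolute TD errors back via replay_buffer.update_priorities(...).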
Example #2
def run_gym(env,
            policy_func,
            policy_name,
            load_model_path,
            timesteps_per_batch,
            number_trajs,
            stochastic_policy,
            save=False,
            reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    pi = policy_func(policy_name, ob_space, ac_space, reuse=reuse)
    U.initialize()
    # Prepare for rollouts
    # ----------------------------------------
    U.load_state(load_model_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi,
                                env,
                                timesteps_per_batch,
                                stochastic=stochastic_policy)
        obs, acs = traj['ob'], traj['ac']
        ep_len, ep_ret = traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(len_list),
                 rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
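A hypothetical call to run_gym, assuming a gym environment and an MlpPolicy-style builder similar to the mlp_policy module used in the next example; the environment id, checkpoint path, and hidden sizes are placeholders.

import gym

import baselines.common.tf_util as U
from baselines.gail import mlp_policy  # or the local mlp_policy module used on this page


def policy_fn(name, ob_space, ac_space, reuse=False):
    # placeholder builder; the exact MlpPolicy signature depends on the
    # mlp_policy variant being used
    return mlp_policy.MlpPolicy(name=name, ob_space=ob_space,
                                ac_space=ac_space, reuse=reuse,
                                hid_size=64, num_hid_layers=2)


env = gym.make("CartPole-v1")
with U.make_session(num_cpu=1):
    avg_len, avg_ret = run_gym(env,
                               policy_func=policy_fn,
                               policy_name="pi",
                               load_model_path="model/ckpt.bc.10.0",  # hypothetical checkpoint
                               timesteps_per_batch=1024,
                               number_trajs=10,
                               stochastic_policy=False)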
Example #3
def train(data, task_desc, params, args, task_path):
    import gym
    ob_dim = data["irl"]["ob_list"][0].shape[0]
    #ob_dim = ob_dim // 4
    c = np.max([
        np.abs(np.min(data["irl"]["ob_list"])),
        np.abs(np.max(data["irl"]["ob_list"]))
    ])
    ob_low = np.ones(ob_dim) * -c
    ob_high = np.ones(ob_dim) * c
    ob_space = gym.spaces.Box(low=ob_low, high=ob_high)
    n_action = 5
    ac_space = gym.spaces.Discrete(n=n_action)

    if args.pretrain:
        model_path = os.path.join(root_path, "task", args.task, "model")
        fname = "ckpt.bc.{}.{}".format(args.traj_limitation, args.seed)
        ckpt_dir = os.path.join(model_path, fname)
        pretrained_path = os.path.join(ckpt_dir, fname)

        if not os.path.exists(os.path.join(ckpt_dir, "checkpoint")):
            print("==== pretraining starts ===")
            pretrained_path = train_bc_sepsis(task_desc, params, ob_space,
                                              ac_space, args)

        U.make_session(num_cpu=1).__enter__()
        set_global_seeds(args.seed)

        def mlp_pi_wrapper(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name,
                                        ob_space=ob_space,
                                        ac_space=ac_space,
                                        reuse=reuse,
                                        hid_size_phi=args.policy_hidden_size,
                                        num_hid_layers_phi=2,
                                        dim_phi=args.dim_phi)

        # just imitation learning
        #def mlp_pi_wrapper(name, ob_space, ac_space, reuse=False):
        #    return mlp_policy.MlpPolicyOriginal(name=name,
        #                                    ob_space=ob_space,
        #                                    ac_space=ac_space,
        #                                    reuse=reuse,
        #                                    hid_size=args.policy_hidden_size,
        #                                    num_hid_layers=2)

        env_name = task_desc["env_id"]
        scope_name = "pi.{}.{}".format(env_name.lower().split("-")[0],
                                       args.traj_limitation)

        pi_bc = mlp_pi_wrapper(scope_name, ob_space, ac_space)
        U.initialize()
        U.load_state(pretrained_path)
        phi_bc = pi_bc.featurize

        def phi_old(s, a):
            """
            TODO: if action is discrete
            one hot encode action and concatenate with phi(s)
            """
            # expect phi(s) -> (N, state_dim)
            # expect a -> (N, action_dim)
            phi_s = phi_bc(s)

            if len(phi_s.shape) == 1:
                # s -> (1, state_dim)
                phi_s = np.expand_dims(phi_s, axis=0)

            # if a is a bare scalar (e.g. a = 5), wrap it in a list
            try:
                if a == int(a):
                    a = [a]
            except (TypeError, ValueError):
                # a is already array-like
                pass

            a = np.array(a)
            # if a = [5]
            if len(a.shape) == 1:
                a = np.expand_dims(a, axis=1)
            # otherwise if a = [[5], [3]]
            phi_sa = np.hstack((phi_s, a))
            return phi_sa

        def phi_discrete_action(n_action):
            def f(s, a):
                # expect phi(s) -> (N, state_dim)
                # expect a -> (N, action_dim)
                phi_s = phi_bc(s)

                # if a is a bare scalar, wrap it in a list
                try:
                    if a == int(a):
                        a = [a]
                except (TypeError, ValueError):
                    pass
                a = np.array(a)

                a_onehot = np.eye(n_action)[a.astype(int)]

                if len(phi_s.shape) == 1:
                    # s -> (1, state_dim)
                    phi_s = np.expand_dims(phi_s, axis=0)

                try:
                    phi_sa = np.hstack((phi_s, a_onehot))
                except ValueError:
                    # actions arrived with an extra axis, e.g. (N, 1, n_action)
                    a_onehot = a_onehot.reshape(a_onehot.shape[0],
                                                a_onehot.shape[2])
                    phi_sa = np.hstack((phi_s, a_onehot))
                return phi_sa

            return f

        if isinstance(ac_space, gym.spaces.Discrete):
            phi = phi_discrete_action(ac_space.n)
        elif isinstance(ac_space, gym.spaces.Box):
            phi = phi_continuous_action
        else:
            raise NotImplementedError

    D = data["irl"]
    obs = D["ob_list"].reshape(-1, D["ob_list"].shape[-1])
    obs_p1 = D["ob_next_list"].reshape(-1, D["ob_next_list"].shape[-1])
    #assuming action dof of 1
    acs = D["ac_list"].reshape(-1)
    new = D["new"].reshape(-1)

    data = {}
    data["s"] = obs
    data["a"] = acs
    data["s_next"] = obs_p1
    data["done"] = data["absorb"] = new

    data["phi_sa"] = phi(obs, acs)
    data["phi_fn"] = phi
    data["phi_fn_s"] = phi_bc

    data["psi_sa"] = data["phi_sa"]
    data["psi_fn"] = phi

    evaluator = ALEvaluator(data, task_desc["gamma"], env=None)
    data_path = os.path.join(task_path, "data")

    pi_0 = pi_bc

    phi_dim = data["phi_sa"].shape[1]

    model_id = "{}.{}".format(params["id"], params["version"])

    if model_id == "mma.0":
        result = train_mma(pi_0, phi_dim, task_desc, params, data, evaluator,
                           ob_space, ac_space)
    elif model_id == "mma.1":
        result = train_mma(pi_0, phi_dim, task_desc, params, data, evaluator,
                           ob_space, ac_space)
    elif model_id == "mma.2":
        #result = train_scirl_v2(data, phi_bc, evaluator, phi_dim, task_desc, params)
        #result = train_scirl_v3(data, phi_bc, evaluator)
        result = train_scirl(data, phi_bc, evaluator)
    else:
        raise NotImplementedError

    name = "{}.{}.{}".format(model_id, args.n_e, args.seed)
    result_path = os.path.join(args.save_path, name + ".train.log")

    with open(result_path, "w") as f:
        #flush?
        for step in range(params["n_iteration"] + 1):
            data_points = [
                step,
                round(result["margin_mu"][step], 2),
                round(result["margin_v"][step], 2),
                round(result["a_match"][step], 2)
            ]
            f.write("{}\t{}\t{}\t{}\n".format(*data_points))

    with open(os.path.join(args.save_path, name + ".pkl"), "wb") as f:
        pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
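The phi helpers in the example above append a one-hot action encoding to the state features phi(s). A self-contained numpy sketch of that shape logic, with an identity feature map and made-up dimensions standing in for the behavior-cloned featurizer:

import numpy as np


def make_phi(phi_s_fn, n_action):
    def phi(s, a):
        phi_s = np.atleast_2d(phi_s_fn(s))             # (N, feat_dim)
        a = np.asarray(a).reshape(-1)                  # (N,)
        a_onehot = np.eye(n_action)[a.astype(int)]     # (N, n_action)
        return np.hstack((phi_s, a_onehot))            # (N, feat_dim + n_action)
    return phi


phi = make_phi(lambda s: s, n_action=5)        # identity features for the demo
print(phi(np.zeros((3, 4)), [0, 2, 4]).shape)  # (3, 9)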
Example #4
    def _train(self):

        self._buffer_list = []
        self._beta_schedule_list = []
        if self._prioritized_replay:
            self._rb = PrioritizedReplayBufferNextAction(
                self._n_train, alpha=self._prioritized_replay_alpha)
            if self._prioritized_replay_beta_iters is None:
                self._prioritized_replay_beta_iters = self._max_timesteps
            self._bs = LinearSchedule(self._prioritized_replay_beta_iters,
                                      initial_p=self._prioritized_replay_beta0,
                                      final_p=1.0)
        else:
            self._rb = ReplayBufferNextAction(self._n_train)

        D_train_zipped = zip(self._D_train["s"], self._D_train["a"],
                             self._D_train["phi_sa"], self._D_train["s_next"],
                             self._D_train["done"])
        for (s, a, phi_sa, s_next, done) in D_train_zipped:

            a_next = self._pi.act(self._mu_stochastic,
                                  s_next[np.newaxis, ...])[0]
            self._rb.add(s, a, phi_sa.flatten(), s_next, a_next, float(done))

        phi_sa_val = self._D_val["phi_sa"]
        s_val = self._D_val["s"]
        a_val = self._D_val["a"]
        s_next_val = self._D_val["s_next"]

        a_next_val = self._pi.act(self._mu_stochastic, s_next_val)[0]
        a_next_val = a_next_val[..., np.newaxis]

        sess = tf.Session()
        sess.__enter__()

        def make_obs_ph(name):
            return BatchInput(self._obs_shape, name=name)

        def make_acs_ph(name):
            return BatchInput(self._acs_shape, name=name)

        tools = build_train(
            make_obs_ph=make_obs_ph,
            make_acs_ph=make_acs_ph,
            optimizer=tf.train.AdamOptimizer(learning_rate=self._lr),
            mu_func=self._model,
            phi_sa_dim=self._mu_dim,
            grad_norm_clipping=self._grad_norm_clipping,
            gamma=self._gamma,
            scope=self._scope_name,
            reuse=True)

        mu_estimator, train, update_target = tools

        self._timestep = int(self._exploration_fraction * self._max_timesteps)

        U.initialize()
        update_target()

        for t in itertools.count():
            if self._prioritized_replay:
                experience = self._rb.sample(self._buffer_batch_size,
                                             beta=self._bs.value(t + 1))
                (s, a, phi_sa, s_next, a_next, dones, weights,
                 batch_idxes) = experience
            else:
                s, a, phi_sa, s_next, a_next, dones = self._rb.sample(
                    self._buffer_batch_size)
                weights, batch_idxes = np.ones(self._buffer_batch_size), None

            if len(a_next.shape) == 1:
                a_next = np.expand_dims(a_next, axis=1)

            td_errors = train(self._mu_stochastic, s, a, phi_sa, s_next,
                              a_next, dones, weights)

            if self._prioritized_replay:
                new_priorities = np.abs(
                    td_errors) + self._prioritized_replay_eps
                self._rb.update_priorities(batch_idxes, new_priorities)

            if t % self._target_network_update_freq == 0:
                #sys.stdout.flush()
                #sys.stdout.write("average training td_errors: {}".format(td_errors.mean()))
                logger.log("average training td_errors: {}".format(
                    td_errors.mean()))
                update_target()

            if t % self._evaluation_freq == 0:
                logger.log("been trained {} steps".format(t))

                mu_est_val = mu_estimator(self._mu_stochastic, s_val, a_val)
                mu_target_val = phi_sa_val + self._gamma * mu_estimator(
                    self._mu_stochastic, s_next_val, a_next_val)
                # average over rows and cols
                td_errors_val = np.mean((mu_est_val - mu_target_val)**2)

                if td_errors_val < self._delta:
                    logger.log(
                        "mean validation td_errors: {}".format(td_errors_val))
                    break

            if t > self._max_timesteps:
                break

        self._mu_estimator = mu_estimator
        return mu_estimator
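The convergence check in the loop above compares the current feature-expectation estimate against its TD target. A numpy sketch of that target and stopping rule, with random placeholder arrays standing in for the network outputs:

import numpy as np

rng = np.random.default_rng(0)
gamma, delta = 0.99, 1e-3

phi_sa_val = rng.normal(size=(64, 9))    # phi(s, a) on a validation batch
mu_est_val = rng.normal(size=(64, 9))    # current estimate of mu(s, a)
mu_next_val = rng.normal(size=(64, 9))   # current estimate of mu(s', a')

# TD target: mu(s, a) should match phi(s, a) + gamma * mu(s', a')
mu_target_val = phi_sa_val + gamma * mu_next_val
td_error_val = np.mean((mu_est_val - mu_target_val) ** 2)
if td_error_val < delta:
    print("validation TD error below delta, stop training")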
Example #5
def learn_original(pi,
                   dataset,
                   env_name,
                   n_action,
                   prefix,
                   traj_lim,
                   seed,
                   optim_batch_size=128,
                   max_iters=5e3,
                   adam_epsilon=1e-4,
                   optim_stepsize=1e-4,
                   ckpt_dir=None,
                   plot_dir=None,
                   task_name=None,
                   verbose=False):
    """
    learn without regularization
    """
    # custom hyperparams
    seed = 0
    max_iters = 5e4

    val_per_iter = int(max_iters / 10)
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    loss = tf.reduce_mean(tf.square(tf.to_float(ac - pi.ac)))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["val_action_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert, _, _ = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert, _, _ = dataset.get_next_batch(-1, 'val')
            val_loss, _ = lossandgrad(ob_expert, ac_expert, True)
            logger.log("Training loss: {}, Validation loss: {}".format(
                train_loss, val_loss))

            loss_history["train_action_loss"].append(train_loss)
            loss_history["val_action_loss"].append(val_loss)

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        # create the checkpoint directory only when one was actually given
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=pi.get_variables())
    return savedir_fname
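The training step above follows the usual baselines TF1 pattern: compile a (loss, flat-gradient) function with U.function and step it with MpiAdam. A minimal sketch of that pattern on a toy regression loss; the model, shapes, and step size are placeholders, not the behavior-cloning policy.

import numpy as np
import tensorflow as tf

import baselines.common.tf_util as U
from baselines.common.mpi_adam import MpiAdam

with U.make_session(num_cpu=1):
    x = tf.placeholder(tf.float32, [None, 3], name="x")
    w = tf.get_variable("w", shape=[3], initializer=tf.zeros_initializer())
    # toy regression loss standing in for the behavior-cloning loss
    loss = tf.reduce_mean(tf.square(tf.reduce_sum(x * w, axis=1) - 1.0))

    var_list = [w]
    adam = MpiAdam(var_list, epsilon=1e-4)
    lossandgrad = U.function([x], [loss, U.flatgrad(loss, var_list)])

    U.initialize()
    adam.sync()
    for _ in range(100):
        batch = np.random.randn(32, 3).astype(np.float32)
        loss_val, g = lossandgrad(batch)
        adam.update(g, 1e-2)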
Example #6
def learn(network,
          dataset,
          env_name,
          n_action,
          prefix,
          traj_lim,
          seed,
          optim_batch_size=32,
          max_iters=1e4,
          adam_epsilon=1e-4,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          plot_dir=None,
          task_name=None,
          verbose=False):
    """
    learn with regularization
    """
    seed = 0
    alpha = 0.7
    beta = 1.0

    pi = network.pi
    T = network.T

    val_per_iter = int(max_iters / 20)

    ob = U.get_placeholder_cached(name="ob")
    T_ac = U.get_placeholder_cached(name="T_ac")
    pi_stochastic = U.get_placeholder_cached(name="pi_stochastic")
    T_stochastic = U.get_placeholder_cached(name="T_stochastic")

    ac = network.pdtype.sample_placeholder([None])
    ob_next = network.ob_next_pdtype.sample_placeholder([None])

    onehot_ac = tf.one_hot(ac, depth=n_action)
    ce_loss = tf.losses.softmax_cross_entropy(logits=pi.logits,
                                              onehot_labels=onehot_ac)

    ce_loss = tf.reduce_mean(ce_loss)

    reg_loss = tf.reduce_mean(
        tf.square(tf.to_float(ob_next - network.ob_next)))

    losses = [ce_loss, reg_loss]

    total_loss = alpha * ce_loss + beta * reg_loss

    var_list = network.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function(
        [ob, ac, T_ac, ob_next, pi_stochastic, T_stochastic],
        losses + [U.flatgrad(total_loss, var_list)])

    U.initialize()
    adam.sync()
    logger.log("Training a policy with Behavior Cloning")
    logger.log("with {} trajs, {} steps".format(dataset.num_traj,
                                                dataset.num_transition))

    loss_history = {}
    loss_history["train_action_loss"] = []
    loss_history["train_transition_loss"] = []
    loss_history["val_action_loss"] = []
    loss_history["val_transition_loss"] = []

    for iter_so_far in tqdm(range(int(max_iters))):
        #ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train')
        ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
            optim_batch_size, 'train')
        train_loss_ce, train_loss_reg, g = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
        adam.update(g, optim_stepsize)
        if verbose and iter_so_far % val_per_iter == 0:
            #ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            ob_expert, ac_expert, ob_next_expert, info = dataset.get_next_batch(
                -1, 'val')

            val_loss_ce, val_loss_reg, _ = lossandgrad(ob_expert, ac_expert,
                                                       ac_expert,
                                                       ob_next_expert, True,
                                                       True)
            items = [train_loss_ce, train_loss_reg, val_loss_ce, val_loss_reg]
            logger.log("Training Action loss: {}\n" \
                       "Training Transition loss: {}\n" \
                       "Validation Action loss: {}\n" \
                       "Validation Transition Loss:{}\n".format(*items))
            loss_history["train_action_loss"].append(train_loss_ce)
            loss_history["train_transition_loss"].append(train_loss_reg)
            loss_history["val_action_loss"].append(val_loss_ce)
            loss_history["val_transition_loss"].append(val_loss_reg)

            #if len(loss_history["val_action_loss"]) > 1:
            #    val_loss_ce_delta = loss_history["val_action_loss"][-1] - val_loss_ce
            #    if np.abs(val_loss_ce_delta) < val_stop_threshold:
            #        logger.log("validation error seems to have converged.")
            #        break

    plot(env_name, loss_history, traj_lim, plot_dir)

    if ckpt_dir is None:
        savedir_fname = tempfile.TemporaryDirectory().name
    else:
        # create the checkpoint directory only when one was actually given
        os.makedirs(ckpt_dir, exist_ok=True)
        ckpt_fname = "ckpt.bc.{}.{}".format(traj_lim, seed)
        savedir_fname = osp.join(ckpt_dir, ckpt_fname)
    U.save_state(savedir_fname, var_list=network.get_variables())
    return savedir_fname
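For reference, a minimal TF1 sketch of the regularized objective used above: an action cross-entropy term plus a next-observation regression term, weighted by alpha and beta. The placeholders stand in for pi.logits and network.ob_next, and the shapes are made up.

import tensorflow as tf

n_action, ob_dim = 5, 4
alpha, beta = 0.7, 1.0

logits = tf.placeholder(tf.float32, [None, n_action])      # stand-in for pi.logits
ac = tf.placeholder(tf.int32, [None])                       # expert actions
ob_next = tf.placeholder(tf.float32, [None, ob_dim])        # expert next observations
ob_next_pred = tf.placeholder(tf.float32, [None, ob_dim])   # stand-in for network.ob_next

onehot_ac = tf.one_hot(ac, depth=n_action)
ce_loss = tf.reduce_mean(
    tf.losses.softmax_cross_entropy(onehot_labels=onehot_ac, logits=logits))
reg_loss = tf.reduce_mean(tf.square(ob_next - ob_next_pred))
total_loss = alpha * ce_loss + beta * reg_loss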