예제 #1
0
def learn(env,
          policy_func,
          dataset,
          pretrained,
          optim_batch_size=128,
          max_iters=1e4,
          adam_epsilon=1e-5,
          optim_stepsize=3e-4,
          ckpt_dir=None,
          log_dir=None,
          task_name=None):
    """Fit a policy to expert data with a mean-squared-error behavior-cloning loss.

    Args:
        env: Environment; only ``observation_space`` and ``action_space`` are read.
        policy_func: Callable invoked as ``policy_func("pi", ob_space, ac_space)``
            that builds the policy network.
        dataset: Expert dataset exposing ``get_next_batch(batch_size, split)`` and
            returning ``(observations, actions)``; called with the splits
            ``'train'`` and ``'val'`` (``-1`` is passed as the validation size).
        pretrained: When falsy, loss summaries are written to ``log_dir`` and a
            checkpoint is saved under ``ckpt_dir/task_name`` at every validation
            step. When truthy, neither happens; instead the policy variables
            are saved once at the end and the save path is returned.
        optim_batch_size: Training mini-batch size.
        max_iters: Number of optimisation iterations (cast to ``int``).
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        optim_stepsize: Step size passed to ``adam.update``.
        ckpt_dir: Checkpoint directory (used only when ``pretrained`` is falsy).
        log_dir: Summary directory (used only when ``pretrained`` is falsy).
        task_name: Checkpoint file name inside ``ckpt_dir``.

    Returns:
        The path the variables were saved to when ``pretrained`` is truthy,
        otherwise ``None``.
    """
    # Validate about ten times per run. Clamp to 1 so that the modulo in the
    # training loop cannot divide by zero when max_iters < 10.
    val_per_iter = max(1, int(max_iters / 10))
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    # placeholder
    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])
    stochastic = U.get_placeholder_cached(name="stochastic")
    # Squared error between the expert action and the policy's action output.
    loss = tf.reduce_mean(tf.square(ac - pi.ac))
    var_list = pi.get_trainable_variables()
    adam = MpiAdam(var_list, epsilon=adam_epsilon)
    lossandgrad = U.function([ob, ac, stochastic],
                             [loss] + [U.flatgrad(loss, var_list)])

    if not pretrained:
        writer = U.FileWriter(log_dir)
        ep_stats = stats(["Loss"])
    U.initialize()
    adam.sync()
    logger.log("Pretraining with Behavior Cloning...")
    for iter_so_far in tqdm(range(int(max_iters))):
        ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size,
                                                      'train')
        # Distinct names keep the numeric results from rebinding the `loss`
        # tensor defined above.
        train_loss, g = lossandgrad(ob_expert, ac_expert, True)
        adam.update(g, optim_stepsize)
        if not pretrained:
            ep_stats.add_all_summary(writer, [train_loss], iter_so_far)
        if iter_so_far % val_per_iter == 0:
            ob_expert, ac_expert = dataset.get_next_batch(-1, 'val')
            # The gradient is computed but discarded; only the loss is reported.
            val_loss, _ = lossandgrad(ob_expert, ac_expert, False)
            logger.log("Validation:")
            logger.log("Loss: %f" % val_loss)
            if not pretrained:
                U.save_state(os.path.join(ckpt_dir, task_name),
                             counter=iter_so_far)
    if pretrained:
        # NOTE(review): the TemporaryDirectory object is discarded right away,
        # so only its generated name is used as the save path. This presumably
        # relies on U.save_state creating the path itself -- confirm.
        savedir_fname = tempfile.TemporaryDirectory().name
        U.save_state(savedir_fname, var_list=pi.get_variables())
        return savedir_fname
예제 #2
0
def learn(
        env,
        model_path,
        data_path,
        policy_fn,
        *,
        horizon=150,  # timesteps per actor per update
        rolloutSize=50,
        clip_param=0.2,
        entcoeff=0.02,  # clipping parameter epsilon, entropy coeff
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=32,  # optimization hypers
        gamma=0.99,
        lam=0.95,  # advantage estimation
        max_iters=0,  # time constraint
        adam_epsilon=1e-4,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        retrain=False):
    """Train a policy with the clipped-surrogate PPO objective.

    Each iteration samples rollouts, appends them to a growing list that is
    re-pickled to ``data_path + 'rollout_data.pkl'``, computes advantages and
    runs ``optim_epochs`` passes of mini-batch Adam updates.

    Args:
        env: Environment; ``observation_space`` / ``action_space`` are read and
            the object is handed to ``sample_trajectory``.
        model_path: Checkpoint path loaded via ``U.load_state`` when
            ``retrain`` is truthy.
        data_path: Prefix for the rollout pickle; ``'rollout_data.pkl'`` is
            appended by plain string concatenation.
        policy_fn: Callable ``policy_fn(name, ob_space, ac_space)`` building a
            policy; called for ``"pi"`` and ``"oldpi"``.
        horizon, rolloutSize: Forwarded to ``sample_trajectory``; their product
            with ``max_iters`` defines the linear schedule length.
        clip_param: PPO ratio clipping range.
        entcoeff: Entropy bonus coefficient.
        optim_epochs: Optimisation passes over each batch of rollouts.
        optim_stepsize: Base Adam step size.
        optim_batchsize: Mini-batch size; a falsy value means "use all samples".
        gamma, lam: Forwarded to ``add_vtarg_and_adv``.
        max_iters: Iteration limit. ``0`` disables the limit, so the loop never
            terminates on its own.
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'``. ``'linear'`` divides by
            ``horizon * rolloutSize * max_iters`` and therefore requires
            ``max_iters > 0``.
        retrain: When truthy, weights are restored from ``model_path`` first.

    Returns:
        The trained policy object ``pi``.

    Raises:
        NotImplementedError: If ``schedule`` is not one of the two supported values.
    """

    # Setup losses and policy
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param,
                             1.0 + clip_param) * atarg  #
    pol_surr = -tf.reduce_mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]  # not read below

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)  # not called below

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    p = []  # for saving the rollouts

    if retrain:
        print("Retraining the policy from saved path")
        time.sleep(2)
        U.load_state(model_path)
    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")
        # Rendering is switched on only after iteration 70.
        render = iters_so_far > 70
        rollouts = sample_trajectory(pi,
                                     env,
                                     horizon=horizon,
                                     rolloutSize=rolloutSize,
                                     stochastic=True,
                                     render=render)
        # Save rollouts: the whole history is re-written every iteration.
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        # Context manager closes the handle; the bare open() leaked one per
        # iteration.
        with open(data_file_name, "wb") as rollout_file:
            pickle.dump(p, rollout_file)

        add_vtarg_and_adv(rollouts, gamma, lam)

        # These names rebind the placeholder variables above; that is harmless
        # because every U.function has already been built.
        ob, ac, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts[
            "adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    deterministic=pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        # Only rank 0 prints the table.
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
예제 #3
0
def learn(env, model_path, data_path, policy_fn, *,
          rolloutSize, num_options=4, horizon=80,
          clip_param=0.025, ent_coeff=0.01,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, mainlr=3.25e-4, intlr=1e-4, piolr=1e-4, termlr=5e-7, optim_batchsize=100,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=20,  # time constraint
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False,
          ):
    """Core learning function: PPO over a policy with ``num_options`` options.

    Each iteration samples rollouts, re-pickles the full rollout history to
    ``data_path + 'rollout_data.pkl'``, then for every option with at least
    ``optim_batchsize`` samples runs clipped-surrogate updates followed by one
    termination-function and one interest-function gradient step. A single
    policy-over-options step is applied once per iteration.

    Args:
        env: Environment; spaces are read and it is passed to ``sample_trajectory``.
        model_path: Directory restored via ``U.load_state(model_path + '/')``
            when ``retrain`` is truthy.
        data_path: Prefix for the rollout pickle (plain string concatenation).
        policy_fn: Callable ``policy_fn(name, ob_space, ac_space, num_options=...)``.
        rolloutSize, horizon: Forwarded to ``sample_trajectory``.
        num_options: Number of options.
        clip_param: PPO clipping range; multiplied by the ``lrmult`` placeholder.
        ent_coeff: Entropy bonus coefficient for the intra-option policy.
        optim_epochs: Upper bound on optimisation passes per option.
        mainlr, intlr, piolr, termlr: Step sizes for the main loss, interest
            function, policy over options and termination function.
        optim_batchsize: Mini-batch size and minimum sample count per option.
        gamma, lam: Forwarded to ``add_vtarg_and_adv``.
        max_iters: Iteration limit; a falsy value disables the limit.
        adam_epsilon: Epsilon passed to ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'``; ``'linear'`` requires
            ``max_iters > 0`` because it divides by
            ``horizon * rolloutSize * max_iters``.
        retrain: When truthy, weights are restored before training.

    Returns:
        The trained policy object ``pi``.

    Raises:
        NotImplementedError: If ``schedule`` is not supported.

    Side effects:
        Rebinds the module-level global ``iters_so_far``.
    """
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space, num_options=num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, num_options=num_options)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])
    op_adv = tf.placeholder(dtype=tf.float32, shape=[None])  # advantage fed to the interest / option losses
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # fed with rollouts["last_betas"]

    ac = pi.pdtype.sample_placeholder([None])

    # Setup losses and stuff
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]  # not read below

    term_loss = pi.tpred * term_adv

    activated_options = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)
    # Interest loss: the interest output comes from the network (pi.intfc),
    # the option probabilities are fed in through pi_w.
    pi_I = (pi.intfc * activated_options) * pi_w / tf.expand_dims(
        tf.reduce_sum((pi.intfc * activated_options) * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    # Option-policy loss: roles swapped -- interest values are fed in through
    # the `intfc` placeholder and the probabilities come from pi.op_pi.
    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = (intfc * activated_options) * pi.op_pi / tf.expand_dims(
        tf.reduce_sum((intfc * activated_options) * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = - tf.reduce_sum(betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)])  # Since we will use a different step size.
    opgrad = U.function([ob, option, betas, op_adv, intfc, activated_options],
                        [U.flatgrad(op_loss, var_list)])  # Since we will use a different step size.
    intgrad = U.function([ob, option, betas, op_adv, pi_w, activated_options],
                         [U.flatgrad(int_loss, var_list)])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)  # not called below

    U.initialize()
    adam.sync()

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=5)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=5)  # rolling buffer for episode rewards

    datas = [0 for _ in range(num_options)]

    if retrain:
        print("Retraining to New Task !! ")
        time.sleep(2)
        U.load_state(model_path+'/')

    p = []
    max_timesteps = int(horizon * rolloutSize * max_iters)
    while True:
        if max_iters and iters_so_far >= max_iters:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        render = False

        rollouts = sample_trajectory(pi, env, horizon=horizon, rolloutSize=rolloutSize, render=render)
        # Save rollouts: the whole history is re-written every iteration.
        data = {'rollouts': rollouts}
        p.append(data)
        del data
        data_file_name = data_path + 'rollout_data.pkl'
        # Context manager closes the handle; the bare open() leaked one per
        # iteration.
        with open(data_file_name, "wb") as rollout_file:
            pickle.dump(p, rollout_file)

        add_vtarg_and_adv(rollouts, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(rollouts['opt_dur'][i]) if len(rollouts['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        # These names rebind the placeholder variables above; that is harmless
        # because every U.function has already been built.
        ob, ac, opts, atarg, tdlamret = rollouts["ob"], rollouts["ac"], rollouts["opts"], rollouts["adv"], rollouts["tdlamret"]
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Optimizing the policy
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                continue

            datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            # Skipping here also skips the termination / interest updates below.
            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            # Built-in int replaces np.int, which was removed in NumPy 1.24.
            optim_epochs_corrected = np.clip(int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data

            for _ in range(optim_epochs_corrected):
                losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"],
                                                    cur_lrmult, [opt])
                    adam.update(grads, mainlr * cur_lrmult)
                    losses.append(newlosses)

            # Optimize termination functions (uses the full rollout, not just
            # this option's indices).
            termg = termgrad(rollouts["ob"], rollouts['opts'], rollouts["op_adv"])[0]
            adam.update(termg, termlr)

            # Optimize interest functions (also over the full rollout).
            intgrads = intgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["op_probs"], rollouts["activated_options"])[0]
            adam.update(intgrads, intlr)

        # Optimize policy over options
        opgrads = opgrad(rollouts['ob'], rollouts['opts'], rollouts["last_betas"], rollouts["op_adv"], rollouts["intfc"], rollouts["activated_options"])[0]
        adam.update(opgrads, piolr)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        # Only rank 0 prints the table.
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

    return pi
class DDPG(object):
    """DDPG agent whose TensorFlow graph is assembled in ``__init__``.

    The instance holds the placeholders, the actor / critic output tensors,
    their target copies, gradient tensors and ``MpiAdam`` optimizers. A session
    must be attached through :meth:`initialize` or :meth:`load` before any
    method that uses ``self.sess`` is called; ``__init__`` does not set it.

    :meth:`step` thresholds the actor output at 0.5 and returns a list of
    0/1 values instead of the continuous action.
    """

    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
        gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
        batch_size=128, observation_range=(-500., 500.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
        adaptive_param_noise=True, adaptive_param_noise_policy_threshold=.1,
        critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        """Build placeholders, networks, optimizers and statistics ops.

        Args:
            actor: Callable network ``actor(obs)``; must expose ``vars`` and
                ``trainable_vars`` and be shallow-copyable.
            critic: Callable network ``critic(obs, action, reuse=...)``; must
                expose ``vars``, ``trainable_vars`` and ``output_vars``.
            memory: Replay buffer with ``append(...)`` and
                ``sample(batch_size=...)``.
            observation_shape: Tuple appended to ``(None,)`` for observations.
            action_shape: Tuple appended to ``(None,)`` for actions.
            param_noise: Optional parameter-noise spec (``current_stddev``,
                ``adapt``, ``get_stats`` are used).
            action_noise: Optional callable noise process with ``reset()``.
            gamma: Discount factor used in the target Q expression.
            tau: Passed to ``get_target_updates`` for the soft updates.
            normalize_returns: Create a ``RunningMeanStd`` for returns.
            enable_popart: With ``normalize_returns``, set up Pop-Art ops.
            normalize_observations: Create a ``RunningMeanStd`` for observations.
            batch_size: Replay sample size.
            observation_range: Clip range for normalized observations.
            action_range: Stored only; the clip in ``step`` is disabled.
            return_range: Clip range for normalized critic outputs / targets.
            adaptive_param_noise: Accepted but not read by this class.
            adaptive_param_noise_policy_threshold: Accepted but not read by
                this class.
            critic_l2_reg: L2 coefficient for critic kernels (0 disables).
            actor_lr: Actor step size.
            critic_lr: Critic step size.
            clip_norm: Passed to ``U.flatgrad`` for gradient clipping.
            reward_scale: Stored only; the scaling in ``store_transition`` is
                commented out.
        """
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks (shallow copies that differ only by name).
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        # One-step bootstrap target; the (1 - terminal) factor drops the
        # bootstrap term on terminal transitions.
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        """Create the init and soft-update op lists for both target networks."""
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        """Build the perturbed actor and the separate copy used to adapt the noise scale.

        Args:
            normalized_obs0: Normalized, clipped observation tensor.
        """
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        # Root-mean-square difference between unperturbed and perturbed outputs.
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        """Define the actor loss (negative mean Q), its flat gradient and optimizer."""
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in actor_shapes])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        """Define the critic loss (MSE plus optional L2), its flat gradient and optimizer."""
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms), self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            # Regularize only kernels, and skip the output layer.
            critic_reg_vars = [var for var in self.critic.trainable_vars if 'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = sum([reduce(lambda x, y: x * y, shape) for shape in critic_shapes])
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
            beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        """Create ops that rescale the critic output layers after ``ret_rms`` changes."""
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            # Expect exactly one kernel and one bias, each with a single output unit.
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        """Collect the tensors and matching names reported by :meth:`get_stats`."""
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        # Same `is not None` test that guards the creation of
        # perturbed_actor_tf in __init__.
        if self.param_noise is not None:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):
        """Run the actor on one observation and binarize its output.

        Args:
            obs: A single observation; it is wrapped in a list before feeding.
            apply_noise: Use the perturbed actor and/or add action noise when
                the corresponding noise object is configured.
            compute_Q: Also evaluate the critic on the actor output.

        Returns:
            Tuple ``(action_set, q, None, None)`` where ``action_set`` is a
            list with one 0/1 entry per element of the first output row
            (1 when the value is strictly greater than 0.5) and ``q`` is the
            critic output or ``None``.
        """
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            action += noise
        # No clipping to action_range here: the value is thresholded below.
        print('action_before_binarization: ', action[0])
        # Binarize each output at 0.5. Original author's note: this matches a
        # sigmoid output layer (a tanh output would be thresholded at 0).
        # Unlike DQN, no argmax over output pairs is used.
        action_set = [1 if value > 0.5 else 0 for value in action[0]]

        return action_set, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append a batch of transitions to memory, one row at a time.

        All arguments are indexed along their first axis; ``obs0.shape[0]``
        gives the batch size. When observation normalization is enabled each
        ``obs0`` row also updates ``obs_rms``.
        """
        # NOTE(review): reward scaling (reward *= self.reward_scale) is
        # disabled here, so reward_scale currently has no effect -- confirm
        # this is intended.

        B = obs0.shape[0]
        for b in range(B):
            self.memory.append(obs0[b], action[b], reward[b], obs1[b], terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def train(self):
        """Sample a batch, compute targets and apply one actor and one critic update.

        Returns:
            Tuple ``(critic_loss, actor_loss)`` as evaluated by the session.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q], feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })
            self.ret_rms.update(target_Q.flatten())
            # Rescale the output layers so they match the updated return statistics.
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std : np.array([old_std]),
                self.old_mean : np.array([old_mean]),
            })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q, self.ret_rms.mean, self.ret_rms.std], feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q, new_mean, new_std)
            # assert (np.abs(target_Q - target_Q_new) < 1e-3).all()
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        """Attach ``sess``, initialize all variables, sync optimizers and init the targets."""
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        """Run the soft-update ops for both target networks."""
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        """Evaluate the statistics ops on a fixed replay sample.

        Returns:
            Dict mapping stat names to values, merged with
            ``param_noise.get_stats()`` when parameter noise is configured.
        """
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        """Adapt the parameter-noise scale from the measured policy distance.

        Returns:
            ``0.`` when no parameter noise is configured, otherwise the
            distance averaged over all MPI workers.
        """
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        """Reset noise state after an episode: reset action noise and re-perturb the actor."""
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })

    # Added on top of the reference implementation: checkpoint helpers.
    def save(self, save_path):
        """
        Save the model to ``save_path`` using a fresh ``tf.train.Saver``.
        """
        saver = tf.train.Saver()
        saver.save(self.sess, save_path)

    def load(self,sess, load_path):
        """
        Restore the model from ``load_path`` into ``sess`` and attach that session.
        """
        saver = tf.train.Saver()
        print('Loading ' + load_path)
        saver.restore(sess, load_path)
        self.sess = sess
예제 #5
0
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        embedding_z,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None):
    """Alternate trust-region policy updates with discriminator updates.

    Each iteration runs ``g_step`` policy rounds (rollout, conjugate-gradient
    step with backtracking line search, value-function regression) followed by
    one pass of discriminator updates, then logs statistics.

    Args:
        env: environment exposing ``observation_space`` and ``action_space``.
        policy_func: callable ``(name, ob_space, ac_space, ...)`` building a
            policy; called for "pi" and "oldpi".
        discriminator: object providing ``get_trainable_variables()``,
            ``loss_name``, ``lossandgrad(...)`` and optionally ``obs_rms``.
        expert_dataset: object providing ``get_next_batch(batch_size)``.
        embedding_z: forwarded to ``traj_segment_generator`` as ``embedding``.
        pretrained: accepted for interface compatibility; not used here.
        pretrained_weight: if not None, path passed to ``U.load_state`` for the
            policy variables; also makes "pi" be built with ``reuse=True``.
        g_step: number of policy update rounds per iteration.
        d_step: the rollout is split into minibatches of
            ``len(ob) // d_step`` samples for the discriminator updates.
        timesteps_per_batch: forwarded to ``traj_segment_generator``.
        max_kl: KL budget used to scale the step; the line search rejects
            steps whose KL exceeds ``1.5 * max_kl``.
        cg_iters: iteration count forwarded to ``cg``.
        gamma, lam: forwarded to ``add_vtarg_and_adv``.
        entcoeff: multiplier of the mean entropy in the optimised gain.
        cg_damping: multiple of ``p`` added to the Fisher-vector product.
        vf_stepsize: step size for the value-function Adam updates.
        d_stepsize: step size for the discriminator Adam updates.
        vf_iters: number of passes over the rollout for value regression.
        max_timesteps, max_episodes, max_iters: stopping criteria; exactly one
            of them must be positive.
        callback: if truthy, called as ``callback(locals(), globals())`` at the
            start of every iteration.
        save_per_iter: checkpoint period, in iterations.
        ckpt_dir: checkpoint directory; checkpointing is skipped when None.
        log_dir: directory handed to ``U.FileWriter``.
        load_model_path: if not None, passed to ``U.load_state``.
        task_name: checkpoint file name joined onto ``ckpt_dir``.

    Returns:
        None. The function returns once a stopping criterion is met.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    # Split the trainable variables by the second component of their scope
    # name: "pol*" variables go to the trust-region step, "vf*" to Adam.
    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    # Slice the flat tangent vector back into per-variable tensors.
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Element-wise average of a numpy array across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    # Start every worker from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     embedding=embedding_z,
                                     timesteps_per_batch=timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            # `fvpargs` is bound later, inside the policy-update loop.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = next(seg_gen)
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # The Fisher-vector product is evaluated on every 5th sample only.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
                # No line search ran in this round, so report the pre-update
                # losses; otherwise `meanlosses` would be undefined (first
                # iteration) or stale when it is logged below.
                meanlosses = lossbefore
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                # Periodic sanity check that all workers hold the same params.
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        # NOTE(review): this expert batch is overwritten inside the loop below
        # before being used; the call is kept so the expert dataset is
        # consumed exactly as before.
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        # Only rank 0 writes the tabular log and the summaries.
        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
예제 #6
0
class DDPG(object):
    """DDPG agent trained on a mix of replay and demonstration transitions.

    The constructor builds the TensorFlow graph (placeholders, actor/critic
    and their target copies, losses, optimizers, statistics ops). A session
    must be attached with ``initialize`` or ``load`` before any method that
    runs the graph is called.
    """

    def __init__(self,
                 actor,
                 critic,
                 memory,
                 demon_buffer,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-1000., 1000.),
                 action_range=(-50., 50.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        """Build the computation graph and store hyper-parameters.

        Args:
            actor: callable network mapping normalized observations to actions.
            critic: callable network mapping (observation, action) to a value.
            memory: replay buffer with ``append`` and ``sample(batch_size=)``.
            demon_buffer: separate buffer holding demonstration transitions.
            observation_shape: tuple, shape of one observation.
            action_shape: tuple, shape of one action.
            param_noise: optional parameter-noise object, or None.
            action_noise: optional callable action-noise process, or None.
            gamma: discount factor used in the target Q value.
            tau: passed to ``get_target_updates`` for the soft target updates.
            normalize_returns: create a running mean/std for returns.
            enable_popart: with ``normalize_returns``, enable Pop-Art.
            normalize_observations: create a running mean/std for observations.
            batch_size: base batch size (``train`` samples twice this in total).
            observation_range: clip range for normalized observations.
            action_range: clip range applied to actions returned by ``step``.
            return_range: clip range for normalized critic outputs/targets.
            adaptive_param_noise: accepted but not used by this class.
            adaptive_param_noise_policy_threshold: accepted but not used.
            critic_l2_reg: L2 coefficient for critic kernels (0 disables).
            actor_lr: actor step size.
            critic_lr: critic step size.
            clip_norm: gradient clip norm passed to ``U.flatgrad``.
            reward_scale: stored on the instance; not read inside this class.
        """
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        # Two distinct buffers are required: simply aliasing
        # demon_memory = memory would make both names share one instance.
        self.demon_memory = demon_buffer
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, target_actor(normalized_obs1)),
            self.ret_rms)
        # One-step bootstrapped target; terminal transitions drop the bootstrap.
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        """Create the init and soft-update ops for both target networks."""
        actor_init_updates, actor_soft_updates = get_target_updates(
            self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(
            self.critic.vars, self.target_critic.vars, self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        """Build the perturbed actor copies used for parameter-space noise."""
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_param_noise_actor, self.param_noise_stddev)
        # Root-mean-square difference between the two actors' outputs.
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        """Define the actor loss, its flat gradient and its MpiAdam optimizer."""
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        # The initial value 1 keeps scalar variables (shape []) from raising.
        actor_nb_params = sum(
            reduce(lambda x, y: x * y, shape, 1) for shape in actor_shapes)
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        """Define the critic loss (with optional L2), gradient and optimizer."""
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            # Regularize every kernel except those of the output layer.
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        # The initial value 1 keeps scalar variables (shape []) from raising.
        critic_nb_params = sum(
            reduce(lambda x, y: x * y, shape, 1) for shape in critic_shapes)
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        """Create the ops that rescale the critic output layers (Pop-Art)."""
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [
                M.assign(M * self.old_std / new_std)
            ]
            self.renormalize_Q_outputs_op += [
                b.assign(
                    (b * self.old_std + self.old_mean - new_mean) / new_std)
            ]

    def setup_stats(self):
        """Collect the diagnostic ops and their names for ``get_stats``."""
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [
                tf.reduce_mean(self.obs_rms.mean),
                tf.reduce_mean(self.obs_rms.std)
            ]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        # Same test as the one guarding setup_param_noise, which is what
        # creates perturbed_actor_tf.
        if self.param_noise is not None:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def step(self, obs, apply_noise=True, compute_Q=True):
        """Compute an action (and optionally its Q value) for ``obs``.

        Args:
            obs: observation(s); wrapped in a list and passed through
                ``U.adjust_shape`` to match the ``obs0`` placeholder.
            apply_noise: use the perturbed actor and/or add action noise.
            compute_Q: also evaluate the critic on the chosen action.

        Returns:
            Tuple ``(action, q, None, None)``; ``q`` is None when
            ``compute_Q`` is False. The action is clipped to ``action_range``.
        """
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None

        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            # Shapes are deliberately not asserted equal: the original author
            # noted the addition is correct for the shapes encountered
            # (e.g. (1, 3) and (3,)).
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])

        return action, q, None, None

    def _append_transitions(self, buffer, obs0, action, reward, obs1,
                            terminal1):
        """Append each transition of a batch to ``buffer``.

        Also feeds every stored observation to ``obs_rms`` when observation
        normalization is enabled.
        """
        for b in range(obs0.shape[0]):
            buffer.append(obs0[b], action[b], reward[b], obs1[b],
                          terminal1[b])
            if self.normalize_observations:
                self.obs_rms.update(np.array([obs0[b]]))

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Store a batch of transitions in the replay memory.

        The leading axis of ``obs0`` is the batch axis; all other arguments
        are indexed along the same axis.
        """
        self._append_transitions(self.memory, obs0, action, reward, obs1,
                                 terminal1)

    def train(self):
        """Run one actor/critic update on a mixed batch.

        Samples from the replay memory and from the demonstration buffer
        (``2 * batch_size`` transitions in total, split by
        ``demons_memory_ratio``), concatenates them, computes the target Q
        values (with Pop-Art renormalization when enabled) and applies one
        MpiAdam update to the actor and to the critic.

        Returns:
            Tuple ``(critic_loss, actor_loss)``.
        """
        demons_memory_ratio = 0.5  # the ratio of demonstrations over all batches sampled
        # Get a batch from memory
        batch = self.memory.sample(batch_size=int(2 * self.batch_size *
                                                  (1 - demons_memory_ratio)))
        # Get a batch from demonstration buffer
        demon_batch = self.demon_memory.sample(
            batch_size=int(2 * self.batch_size * demons_memory_ratio))
        # concatenate two sampled batches
        for key in ('obs0', 'rewards', 'terminals1', 'obs1', 'actions'):
            batch[key] = np.concatenate((batch[key], demon_batch[key]))

        target_feed = {
            self.obs1: batch['obs1'],
            self.rewards: batch['rewards'],
            self.terminals1: batch['terminals1'].astype('float32'),
        }
        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict=target_feed)
            self.ret_rms.update(target_Q.flatten())
            # Rescale the critic output layers for the new return statistics.
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict=target_feed)

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
        return critic_loss, actor_loss

    def initialize(self, sess):
        """Attach ``sess``, initialize variables and sync the target networks."""
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        """Run the soft-update ops of the target actor and critic."""
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        """Evaluate the diagnostic ops and return them as a name -> value dict.

        Statistics of ``param_noise`` are merged in when it is configured.
        """
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        """Adapt the parameter-noise stddev from the measured policy distance.

        Returns:
            ``0.`` when no parameter noise is configured, otherwise the policy
            distance averaged over all MPI workers.
        """
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })

        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        """Reset internal state after an episode is complete."""
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })

    def save(self, save_path):
        """Save the model held by ``self.sess`` to ``save_path``."""
        saver = tf.train.Saver()
        saver.save(self.sess, save_path)

    def load(self, sess, load_path):
        """Restore the model from ``load_path`` into ``sess`` and adopt it."""
        saver = tf.train.Saver()
        print('Loading ' + load_path)
        saver.restore(sess, load_path)
        self.sess = sess

    def store_transition2demon(self, obs0, action, reward, obs1, terminal1):
        """Store a batch of transitions in the demonstration buffer.

        Mirrors ``store_transition`` but targets ``self.demon_memory``.
        """
        self._append_transitions(self.demon_memory, obs0, action, reward,
                                 obs1, terminal1)

    def feed_demon_buffer(self, path='data_memory2_21steps.p'):
        """Fill the demonstration buffer from a pickled demonstration file.

        Args:
            path: pickle file containing an iterable of episodes, each an
                iterable of steps ``(state, action, reward, new_state, done)``.
                Defaults to the file name that was previously hard-coded.
        """
        # SECURITY: pickle.load can execute arbitrary code; only feed files
        # from a trusted source.
        with open(path, 'rb') as f:
            data = pickle.load(f)
        for episode in data:
            for step in episode:
                # state, action, reward, new_state, done
                self.store_transition2demon(np.array(step[0]), step[1],
                                            step[2], step[3], step[4])
예제 #7
0
def learn(encoder,
          action_decorder,
          state_decorder,
          embedding_shape,
          *,
          dataset,
          logdir,
          batch_size,
          time_steps,
          epsilon=0.001,
          lr_rate=1e-3):
    """Train an LSTM encoder and a state decoder with a VAE-style loss.

    Builds the three networks from the given factories, defines a
    reconstruction loss (negative log-likelihood of the action and state
    decoders) plus a KL term against a fixed prior, and then loops forever:
    each iteration walks ``dataset.get_next_batch``, applies one MpiAdam
    step per batch, writes summaries and, every 10th iteration (except
    iteration 0), saves checkpoints to ``logdir`` and ``./vae_saver``.

    Args:
        encoder: Factory called as ``encoder("lstm_encoder")``.
        action_decorder: Factory called as ``action_decorder("ac_decoder")``.
        state_decorder: Factory called as ``state_decorder("state_decoder")``.
        embedding_shape: Size used for the prior's mean/logstd vectors and
            passed to ``make_pdtype``.
        dataset: Object whose ``get_next_batch(batch_size=...)`` yields
            observation arrays.
        logdir: Directory for the summary writer and the full checkpoint.
        batch_size: Used only in the reshape of the observations fed to the
            loss/gradient functions.
        time_steps: Passed to ``get_next_batch`` as its ``batch_size`` and
            used as the sequence length in reshapes.
        epsilon: Epsilon for ``MpiAdam``.
        lr_rate: Step size passed to ``adam.update``.

    NOTE(review): the ``while True`` loop has no exit condition, so this
    function never returns normally.
    """
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder")  # switched to an MLP
    obs = U.get_placeholder_cached(name="obs")  # for encoder

    ob = U.get_placeholder_cached(name="ob")
    embedding = U.get_placeholder_cached(name="embedding")

    # obss = U.get_placeholder_cached(name="obss")  ## for action decoder; could the state decoder use this too? should it be renamed to obs?
    #   ## for action decoder; the state decoder should be able to use this as well
    # embeddingss = U.get_placeholder_cached(name="embeddingss")
    ac = ac_decoder.pdtype.sample_placeholder([None])
    obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal, used as the prior over the state??? Should it be
    # a standard normal fitted to the demonstrations instead? Worth thinking about.
    from common.distributions import make_pdtype

    # Prior parameters: zero mean concatenated with zero log-std.
    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([
        tf.zeros(shape=[embedding_shape], name="mean"),
        tf.zeros(shape=[embedding_shape], name="logstd")
    ],
                               axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    recon_loss = -tf.reduce_mean(
        tf.reduce_sum(ac_decoder.pd.logp(ac) + state_decoder.pd.logp(obs_out),
                      axis=0))  # this part still needs to be reworked
    kl_loss = lstm_encoder.pd.kl(p_z)  # p(z): standard normal; this also does not look quite right
    vae_loss = recon_loss + kl_loss  # vae_loss should be computed over a batch

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]

    ## var_list
    # Only the encoder and state-decoder variables are optimised; the action
    # decoder's variables are left out (see the commented lines below).
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)
    # compute_recon_loss = U.function([ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([obs, ob, embedding, ac, obs_out], losses)
    compute_grad = U.function([obs, ob, embedding, ac, obs_out],
                              U.flatgrad(vae_loss,
                                         var_list))  # not thought through yet; may be wrong
    adam = MpiAdam(var_list, epsilon=epsilon)

    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list=en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100) ## keep the policy parameters, although this seems to be unused

    while True:
        logger.log("********** Iteration %i ************" % iters_so_far)

        # Per-iteration loss histories; only the most recent 100 batches are kept.
        recon_loss_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)

        for observations in dataset.get_next_batch(batch_size=time_steps):
            observations = observations.transpose((1, 0))
            embedding_now = lstm_encoder.get_laten_vector(observations)
            # The same latent vector is repeated once per time step.
            embeddings = np.array([embedding_now for _ in range(time_steps)])
            embeddings_reshape = embeddings.reshape((time_steps, -1))
            actions = ac_decoder.act(stochastic=True,
                                     ob=observations,
                                     embedding=embeddings_reshape)
            state_outputs = state_decoder.get_outputs(
                observations.reshape(time_steps, -1, 1),
                embeddings)  # mixture of Gaussians was missing before; added in a hurry, now done
            # NOTE(review): seven positional arguments are passed here and to
            # compute_grad below, but both functions were built with five
            # inputs ([obs, ob, embedding, ac, obs_out]); the call sites look
            # like leftovers of the commented-out seven-input version -
            # confirm against U.function.
            # From here on recon_loss / kl_loss / vae_loss name the evaluated
            # values rather than the tensors defined above.
            recon_loss, kl_loss, vae_loss = compute_losses(
                observations, observations.reshape(batch_size, time_steps,
                                                   -1), embeddings_reshape,
                observations.reshape(time_steps, -1, 1), embeddings, actions,
                state_outputs)

            g = compute_grad(observations,
                             observations.reshape(batch_size, time_steps,
                                                  -1), embeddings_reshape,
                             observations.reshape(time_steps, -1, 1),
                             embeddings, actions, state_outputs)
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)

        # Summaries use the mean over the buffers; the tabular log below
        # records only the values from the last batch of the iteration.
        ep_stats.add_all_summary(writer, [
            np.mean(recon_loss_buffer),
            np.mean(kl_loss_buffer),
            np.mean(vae_loss_buffer)
        ], iters_so_far)
        logger.record_tabular("recon_loss", recon_loss)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()
        # Checkpoint every 10th iteration, skipping iteration 0.
        if (iters_so_far % 10 == 0 and iters_so_far != 0):
            save(saver=saver,
                 sess=tf.get_default_session(),
                 logdir=logdir,
                 step=iters_so_far)
            save(saver=saver_encoder,
                 sess=tf.get_default_session(),
                 logdir="./vae_saver",
                 step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(), logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
예제 #8
0
def learn(env, model_path, data_path, policy_fn, model_learning_params, svm_grid_params, svm_params_interest,
          svm_params_guard, *, modes, rolloutSize, num_options=2,
          horizon,  # timesteps per actor per update
          clip_param, ent_coeff=0.02,  # clipping parameter epsilon, entropy coeff
          optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=160,  # optimization hypers
          gamma=0.99, lam=0.95,  # advantage estimation
          max_iters=0,  # time constraint
          adam_epsilon=1.2e-4,
          schedule='linear',  # annealing for stepsize parameters (epsilon and adam)
          retrain=False
          ):
    """
    Core learning function.

    Runs clipped-surrogate (PPO-style) policy updates, one option at a time,
    interleaved with updates of a hybrid transition model. Every iteration:
    collect rollouts with ``sample_trajectory``, rewrite
    ``data_path + '/rollout_data.pkl'`` with all rollouts gathered so far,
    call ``model.updateModel``, compute advantages with
    ``add_vtarg_and_adv`` and optimise the policy on each option's samples.

    Args:
        env: Environment exposing ``observation_space`` and ``action_space``.
        model_path: Directory holding ``hybrid_model.pkl`` and the saved
            policy state; only read when ``retrain`` is true.
        data_path: Directory that receives ``rollout_data.pkl``.
        policy_fn: Called as ``policy_fn(name, ob_space, ac_space, model,
            num_options)`` to build the new and the old policy.
        model_learning_params, svm_grid_params, svm_params_interest,
        svm_params_guard, modes: Forwarded to ``partialHybridModel``.
        rolloutSize: Forwarded to the model and to ``sample_trajectory``.
        num_options: Number of options the policy is optimised for.
        horizon: Forwarded to the model and to ``sample_trajectory``.
        clip_param: Clipping parameter; multiplied by the schedule factor.
        ent_coeff: Weight of the entropy bonus.
        optim_epochs: Upper bound on optimisation epochs per option.
        optim_stepsize: Adam step size before the schedule factor.
        optim_batchsize: Minibatch size; options with fewer samples than
            this are skipped for the iteration.
        gamma: Passed to ``add_vtarg_and_adv``; also the per-iteration decay
            factor applied to ``pi.eps``.
        lam: Passed to ``add_vtarg_and_adv``.
        max_iters: Stop after this many iterations; 0 means run forever.
        adam_epsilon: Epsilon for ``MpiAdam``.
        schedule: ``'constant'`` or ``'linear'``.
        retrain: Load the pickled model and the saved policy state from
            ``model_path`` instead of starting from scratch.

    Returns:
        The tuple ``(pi, model)``.

    Raises:
        NotImplementedError: If ``schedule`` is not one of the two supported
            values.
    """

    ob_space = env.observation_space
    ac_space = env.action_space
    if retrain:
        # NOTE: pickle.load can execute arbitrary code from the file; only
        # load models from a trusted location.
        with open(model_path + '/hybrid_model.pkl', 'rb') as model_file:
            model = pickle.load(model_file)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
    else:
        model = partialHybridModel(env, model_learning_params, svm_grid_params, svm_params_interest, svm_params_guard, horizon, modes, num_options, rolloutSize)
    pi = policy_fn("pi", ob_space, ac_space, model, num_options)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space, model, num_options)  # Network for old policy
    atarg = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf1.placeholder(dtype=tf1.float32, shape=[None])  # Empirical return

    lrmult = tf1.placeholder(name='lrmult', dtype=tf1.float32,
                             shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # Define placeholders for computing the advantage
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    ac = pi.pdtype.sample_placeholder([None])

    # Defining losses for optimization
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf1.reduce_mean(kloldnew)
    meanent = tf1.reduce_mean(ent)
    pol_entpen = (-ent_coeff) * meanent

    ratio = tf1.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf1.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    # PPO's pessimistic surrogate (L^CLIP), negated to turn the maximization
    # into a minimization problem.
    pol_surr = - tf1.reduce_mean(tf1.minimum(surr1, surr2))
    vf_loss = tf1.reduce_mean(tf1.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf1.assign(oldv, newv) for (oldv, newv) in
                                                    zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    episodes_so_far = 0
    timesteps_so_far = 0
    # The iteration counter is deliberately published as a module-level
    # global, as in the original implementation.
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards

    all_rollouts = []  # every iteration's rollouts, re-dumped to disk each time

    if retrain:
        print("Retraining to New Task !!")
        time.sleep(2)
        U.load_state(model_path+'/')
        print(pi.eps)
    max_timesteps = int(horizon * rolloutSize * max_iters)

    while True:
        if max_iters and iters_so_far >= max_iters:
            break
        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            if max_timesteps > 0:
                cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            else:
                # No time budget (max_iters == 0): there is nothing to anneal
                # over, so keep the multiplier at 1 instead of dividing by 0.
                cur_lrmult = 1.0
        else:
            raise NotImplementedError

        logger.log("************* Iteration %i *************" % iters_so_far)
        print("Collecting samples for policy optimization !! ")

        rollouts = sample_trajectory(pi, model, env, horizon=horizon, rolloutSize=rolloutSize, render=False)
        # Save rollouts (the whole history is rewritten on every iteration)
        all_rollouts.append({'rollouts': rollouts})
        data_file_name = data_path + '/rollout_data.pkl'
        with open(data_file_name, "wb") as data_file:
            pickle.dump(all_rollouts, data_file)

        # Model update
        print("Updating model !!\n")
        model.updateModel(rollouts, pi)
        print("Model graph:", model.transitionGraph.nodes)
        print("Model options:", model.transitionGraph.edges)
        for edge in list(model.transitionGraph.edges):
            src, dst = edge[0], edge[1]
            print(src, " -> ", dst, " : ", model.transitionGraph[src][dst]['weight'])

        datas = [0 for _ in range(num_options)]
        add_vtarg_and_adv(rollouts, pi, gamma, lam, num_options)

        # Distinct names keep the numpy batches apart from the placeholders
        # (ob, ac, atarg) defined above.
        seg_obs, seg_acs, opts = rollouts["seg_obs"], rollouts["seg_acs"], rollouts["des_opts"]
        advantages, tdlamret = rollouts["adv"], rollouts["tdlamret"]
        old_opts = rollouts["seg_opts"]
        similarity = 0
        for i in range(0, len(old_opts)):
            if old_opts[i] == opts[i]:
                similarity += 1

        print("Percentage similarity of options: ", similarity/len(old_opts) * 100)

        advantages = (advantages - advantages.mean()) / advantages.std()  # standardized advantage function estimate
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(seg_obs)  # update running mean/std for policy
        assign_old_eq_new()

        pi.eps = pi.eps * gamma  # reduce exploration

        # Optimizing the policy
        print("\nOptimizing policy !! \n")
        # Minibatch losses of the last optimisation epoch run in this
        # iteration. Kept under its own name and reset here so that an
        # iteration without any optimisation logs nothing, instead of
        # falling back to the list of loss tensors or to stale values.
        # NOTE(review): skipping mpi_moments/adam.update on some ranks only
        # is not safe with more than one MPI rank - confirm single-rank use.
        epoch_losses = []
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("Option- ", opt, " Batch Size: ", indices.size)
            if not indices.size:
                continue

            datas[opt] = d = Dataset(dict(ob=seg_obs[indices], ac=seg_acs[indices], atarg=advantages[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            if indices.size < optim_batchsize:
                print("Too few samples for opt - ", opt)
                continue

            optim_batchsize_corrected = optim_batchsize
            # Built-in int: the np.int alias was removed in NumPy 1.24.
            optim_epochs_corrected = np.clip(int(indices.size / optim_batchsize_corrected), 1, optim_epochs)
            print("Optim Epochs:", optim_epochs_corrected)
            logger.log("Optimizing...")
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs_corrected):
                epoch_losses = []  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize_corrected):
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt])
                    if np.isnan(newlosses).any():
                        # Skip minibatches whose losses are NaN.
                        continue
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    epoch_losses.append(newlosses)
        if epoch_losses:
            meanlosses, _, _ = mpi_moments(epoch_losses, axis=0)
            print("Mean loss ", meanlosses)
            for (lossval, name) in zipsame(meanlosses, loss_names):
                logger.record_tabular("loss_" + name, lossval)

        lrlocal = (rollouts["ep_lens"], rollouts["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("Success", rollouts["success"])
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        # Saving of the policy and the model is currently disabled:
        # if model_path and not retrain:
        #     U.save_state(model_path + '/')
        #     model_file_name = model_path + '/hybrid_model.pkl'
        #     pickle.dump(model, open(model_file_name, "wb"), pickle.HIGHEST_PROTOCOL)
        #     print("Policy and Model saved in - ", model_path)
    return pi, model
예제 #9
0
def learn(env, encoder, action_decorder, state_decorder, embedding_shape,*, dataset, optimizer, logdir, batch_size, time_steps, adam_epsilon = 0.001, lr_rate = 1e-4, vae_beta = 8):
    """Train an LSTM encoder and a state decoder with a beta-weighted VAE loss.

    Builds the networks from the given factories, defines the loss as the
    negative log-likelihood of the next observations under the state decoder
    plus ``vae_beta`` times a KL term against a fixed prior, and runs 50
    iterations over ``dataset.get_next_batch``. Summaries are written every
    iteration; checkpoints are saved to ``logdir`` and ``./vae_saver`` every
    10th iteration except iteration 0.

    Args:
        env: Passed to ``get_ob_next_ac`` only.
        encoder: Factory called as ``encoder("lstm_encoder")``.
        action_decorder: Factory called as ``action_decorder("ac_decoder")``.
        state_decorder: Factory called as ``state_decorder("state_decoder")``.
        embedding_shape: Size of the prior's mean/logstd vectors; also passed
            to ``make_pdtype``.
        dataset: Object whose ``get_next_batch(batch_size=...)`` yields items
            whose element ``[0]`` is the observation array.
        optimizer: Not referenced anywhere in this function.
        logdir: Directory for the summary writer and the full checkpoint.
        batch_size: Not referenced anywhere in this function.
        time_steps: Passed to ``get_next_batch`` as its ``batch_size`` and
            used as the sequence length in reshapes.
        adam_epsilon: Epsilon for ``MpiAdam``.
        lr_rate: Initial step size; halved at the end of each of the first
            five iterations.
        vae_beta: Weight of the KL term.
    """
    lstm_encoder = encoder("lstm_encoder")
    ac_decoder = action_decorder("ac_decoder")
    state_decoder = state_decorder("state_decoder") # there is a problem here
    ac_de_ob = U.get_placeholder_cached(name="ac_de_ob")
    en_ob = U.get_placeholder_cached(name="en_ob")  ##for encoder
    state_de_ob = U.get_placeholder_cached(name="state_de_ob")  ## for action decoder; could the state decoder use this too? should it be renamed to obs?
    ac_de_embedding = U.get_placeholder_cached(name="ac_de_embedding")  ## for action decoder; the state decoder should be able to use this as well
    state_de_embedding = U.get_placeholder_cached(name="state_de_embedding")
    # ac = ac_decoder.pdtype.sample_placeholder([None])
    # NOTE(review): ob_shape is neither a parameter nor a local of this
    # function; presumably a module-level name - confirm it exists.
    ob_next = tf.placeholder(name="ob_next", shape=[None, ob_shape], dtype=tf.float32)
    # ob_next_ac = tf.placeholder(name="ob_next_ac", shape=[ob_shape], dtype=tf.float32)
    # obs_out = state_decoder.pdtype.sample_placeholder([None])

    # p(z): standard normal distribution
    from common.distributions import make_pdtype

    # Prior parameters: zero mean concatenated with zero log-std.
    p_z_pdtype = make_pdtype(embedding_shape)
    p_z_params = U.concatenate([tf.zeros(shape=[embedding_shape], name="mean"), tf.zeros(shape=[embedding_shape], name="logstd")], axis=-1)
    p_z = p_z_pdtype.pdfromflat(p_z_params)

    # TODO: add another term to recon_loss, for the action

    recon_loss =  -tf.reduce_sum(state_decoder.pd.logp(ob_next))
    # kl_loss = lstm_encoder.pd.kl(p_z)[0] ## p(z): standard normal; this also does not look quite right
    # kl_loss = tf.maximum(lstm_encoder.pd.kl(p_z)[0], tf.constant(5.00)) ## p(z): standard normal; this also does not look quite right
    kl_loss = lstm_encoder.pd.kl(p_z)[0]
    vae_loss = tf.reduce_mean(recon_loss + vae_beta * kl_loss) # vae_loss should be computed over a batch

    ep_stats = stats(["recon_loss", "kl_loss", "vae_loss"])
    losses = [recon_loss, kl_loss, vae_loss]
    # Idea: train the action with mean squared error - step the environment
    # with the produced action to get x(t+1), then use an MSE loss, or maybe
    # try cross-entropy.


    ## var_list
    # Only the encoder and state-decoder variables are optimised; the action
    # decoder's variables are left out (see the commented lines below).
    var_list = []
    en_var_list = lstm_encoder.get_trainable_variables()
    var_list.extend(en_var_list)
    # ac_de_var_list = ac_decoder.get_trainable_variables()
    # var_list.extend(ac_de_var_list)
    state_de_var_list = state_decoder.get_trainable_variables()
    var_list.extend(state_de_var_list)
    # compute_recon_loss = U.function([ob, obs, embedding, obss, embeddingss, ac, obs_out], recon_loss)
    compute_losses = U.function([en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding, ob_next], losses)
    compute_grad = U.function([en_ob, ac_de_ob, state_de_ob, ac_de_embedding, state_de_embedding, ob_next], U.flatgrad(vae_loss, var_list)) # not thought through yet; may be wrong
    adam = MpiAdam(var_list, epsilon=adam_epsilon)


    U.initialize()
    adam.sync()

    writer = U.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    # =========================== TRAINING ===================== #
    iters_so_far = 0
    saver = tf.train.Saver(var_list=var_list, max_to_keep=100)
    saver_encoder = tf.train.Saver(var_list = en_var_list, max_to_keep=100)
    # saver_pol = tf.train.Saver(var_list=ac_de_var_list, max_to_keep=100) ## keep the policy parameters, although this seems to be unused

    while iters_so_far < 50:
        ## TODO: add multiple rounds
        logger.log("********** Iteration %i ************" % iters_so_far)
        ## Should batch_size be adjusted on every round?
        # Per-iteration loss histories; only the most recent 100 batches are kept.
        recon_loss_buffer = deque(maxlen=100)
        # recon_loss2_buffer = deque(maxlen=100)
        kl_loss_buffer = deque(maxlen=100)
        vae_loss_buffer = deque(maxlen=100)
        # i = 0
        for obs_and_next in dataset.get_next_batch(batch_size=time_steps):
            # print(i)
            # i += 1
            # Decoder inputs drop the last row of the transposed observations.
            observations = obs_and_next[0].transpose((1, 0))[:-1]
            # From here on ob_next names the numpy target array, not the
            # placeholder; compute_losses/compute_grad were already built
            # with the placeholder.
            ob_next = obs_and_next[0].transpose(1, 0)[state_decoder.receptive_field:, :]
            embedding_now = lstm_encoder.get_laten_vector(obs_and_next[0].transpose((1, 0)))
            # The same latent vector is repeated time_steps - 1 times.
            embeddings = np.array([embedding_now for _ in range(time_steps - 1)])
            embeddings_reshape = embeddings.reshape((time_steps-1, -1))
            actions = ac_decoder.act(stochastic=True, ob=observations, embedding=embeddings_reshape)
            # The result is not used below; NOTE(review): the call may still
            # matter if get_ob_next_ac has side effects on env - confirm.
            ob_next_ac = get_ob_next_ac(env, observations[-1], actions[0]) ## this still needs further changes
            # state_outputs = state_decoder.get_outputs(observations.reshape(1, time_steps, -1), embedding_now.reshape((1, 1, -1))) ## mixture of Gaussians was missing before; added in a hurry, now done
            # recon_loss = state_decoder.recon_loss(observations.reshape(1, time_steps, -1), embedding_now.reshape((1, 1, -1)))
            # recon_loss / kl_loss / vae_loss now name the evaluated values
            # rather than the tensors defined above.
            recon_loss,  kl_loss, vae_loss = compute_losses(obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1), observations.reshape(time_steps-1,-1),
                              observations.reshape(1, time_steps-1, -1), embeddings_reshape, embedding_now.reshape((1,1, -1)), ob_next)

            g = compute_grad(obs_and_next[0].transpose((1, 0)).reshape(1, time_steps, -1), observations.reshape(time_steps-1,-1),
                              observations.reshape(1, time_steps-1, -1), embeddings_reshape, embedding_now.reshape((1,1, -1)), ob_next)
            # logger.record_tabular("recon_loss", recon_loss)
            # logger.record_tabular("recon_loss2", recon_loss2)
            # logger.record_tabular("kl_loss", kl_loss)
            # logger.record_tabular("vae_loss", vae_loss)
            # logger.dump_tabular()
            adam.update(g, lr_rate)
            recon_loss_buffer.append(recon_loss)
            # recon_loss2_buffer.append(recon_loss2)
            kl_loss_buffer.append(kl_loss)
            vae_loss_buffer.append(vae_loss)
        # Summaries use the mean over the buffers; the tabular log below
        # records only the values from the last batch of the iteration.
        ep_stats.add_all_summary(writer, [np.mean(recon_loss_buffer),  np.mean(kl_loss_buffer),
                                          np.mean(vae_loss_buffer)], iters_so_far)
        logger.record_tabular("recon_loss", recon_loss)
        # logger.record_tabular("recon_loss2", recon_loss2)
        logger.record_tabular("kl_loss", kl_loss)
        logger.record_tabular("vae_loss", vae_loss)
        logger.dump_tabular()
        # Checkpoint every 10th iteration, skipping iteration 0.
        if(iters_so_far % 10 == 0 and iters_so_far != 0):
            save(saver=saver, sess=tf.get_default_session(), logdir=logdir, step=iters_so_far)
            save(saver=saver_encoder, sess=tf.get_default_session(),logdir="./vae_saver", step=iters_so_far)
            # save(saver=saver_pol, sess=tf.get_default_session(), logdir="pol_saver", step=iters_so_far)
        iters_so_far += 1
        # Halve the step size while the (already incremented) counter is
        # below 6, i.e. after each of the first five iterations.
        if iters_so_far < 6:
            lr_rate /= 2
예제 #10
0
class Planner(object):
    """LSTM encoder/decoder that generates a sequence of subgoals.

    The graph is built once in ``__init__`` under ``tf.variable_scope(scope)``.
    Training minimises a Gaussian negative log-likelihood between the
    decoder outputs and normalized goal labels, using flat gradients and
    ``MpiAdam``. Training data can come from the caller or from the internal
    ``PlanReplayBuffer``.
    """

    # presumably store_args copies the constructor arguments onto self
    # (self.inp_dim, self.scope, ...) - the methods below rely on that.
    @store_args
    def __init__(self,
                 inp_dim,
                 hid_size,
                 seq_len,
                 out_dim,
                 buffer_size,
                 batch_size=64,
                 optim_stepsize=1e-3,
                 sample_func=None,
                 norm_eps=1e-2,
                 norm_clip=5,
                 scope='planner',
                 layerNorm=False,
                 **kwargs):
        '''
        Implementation of an LSTM planner that produces a given number of
        subgoals between src and dest.
        Args:
            inp_dim : dimension for the LSTM
            hid_size : cell_state_size
            seq_len : max_timesteps
            out_dim : dimension for LSTM output
            buffer_size : passed to PlanReplayBuffer
            batch_size : number of episodes sampled from the buffer in train()
            optim_stepsize : step size passed to MpiAdam.update
            sample_func : sampling function for the buffer; when None,
                sampler.make_sample_plans() is used
            norm_eps, norm_clip : passed to the Normalizer
            scope : variable scope the network is created under
            layerNorm : only referenced by the commented-out line below
        '''
        # self.main = lstm(hid_size, layerNorm)

        self.adamepsilon = 1e-6

        # Assigned here and in train(); not read anywhere else in this class.
        self.mode = tf.contrib.learn.ModeKeys.TRAIN  # TRAIN for training, INFER for prediction, EVAL for evaluation
        self.infer_outputs = None
        with tf.variable_scope(self.scope):
            self._create_network()

        buffer_shape = [
            seq_len + 2, out_dim
        ]  # plus 2: the [0] is 'src', [1] is 'dest', [2:] are 'labels',
        if self.sample_func is None:
            from sampler import make_sample_plans
            self.sample_func = make_sample_plans()
        self.buffer = PlanReplayBuffer(buffer_shape, buffer_size,
                                       self.sample_func)

    def _create_network(self):
        """Build placeholders, the seq2seq graph, the loss and the optimizer.

        Also runs the initializer for the global variables under
        ``self.scope`` and syncs the MpiAdam state, so a default session
        must be active when this is called.
        """
        self.sess = U.get_session()

        self.inp_src = tf.placeholder(shape=[None, 1, self.inp_dim],
                                      dtype=tf.float32,
                                      name='input_src')
        self.inp_dest = tf.placeholder(shape=[None, 1, self.out_dim],
                                       dtype=tf.float32,
                                       name='input_dest')
        self.labels = tf.placeholder(shape=[None, self.seq_len, self.out_dim],
                                     dtype=tf.float32,
                                     name='label')
        self.src_seq_len = tf.placeholder(tf.int32, (None, ),
                                          name='source_sequence_length')
        self.tar_seq_len = tf.placeholder(tf.int32, (None, ),
                                          name='target_sequence_length')
        # running averages
        # with tf.variable_scope('goal_stats_src'):
        #     self.goal_stats_src = Normalizer(self.inp_dim, self.norm_eps, self.norm_clip, sess=self.sess)
        with tf.variable_scope('goal_stats_dest'):
            self.goal_stats_dest = Normalizer(self.out_dim,
                                              self.norm_eps,
                                              self.norm_clip,
                                              sess=self.sess,
                                              PLN=True)

        # normalize inp_src, and goals labels
        # NOTE(review): inp_src is normalized with the dest statistics, which
        # were sized with out_dim; presumably inp_dim == out_dim - confirm.
        inp_src = self.goal_stats_dest.normalize(self.inp_src)
        inp_dest = self.goal_stats_dest.normalize(self.inp_dest)
        goal_labels = self.goal_stats_dest.normalize(self.labels)
        with tf.variable_scope('goal_gen'):
            encoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
            encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
                encoder_cell,
                inp_src,
                sequence_length=self.src_seq_len,
                dtype=tf.float32)

            # The same cell and projection layer are shared by the training
            # and inference decoders below.
            decoder_cell = tf.nn.rnn_cell.LSTMCell(self.hid_size)
            project_layer = tf.layers.Dense(self.out_dim)

            with tf.variable_scope("decode"):
                # Training input: dest followed by the labels shifted by one
                # step (the last label is dropped).
                train_inp = tf.concat([inp_dest, goal_labels[:, :-1, :]],
                                      axis=-2)
                train_helper = tf.contrib.seq2seq.TrainingHelper(
                    train_inp, sequence_length=self.tar_seq_len)
                train_decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell,
                    train_helper,
                    encoder_state,
                    output_layer=project_layer)
                train_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                    train_decoder, maximum_iterations=self.seq_len)
                self.train_outputs = train_outputs.rnn_output

            # Inference decoder: reuses the variables of the "decode" scope
            # and starts from the normalized dest.
            with tf.variable_scope("decode", reuse=True):
                infer_helper = ContinousInferHelper(inp_dest[:, 0, :],
                                                    self.tar_seq_len)
                infer_decoder = tf.contrib.seq2seq.BasicDecoder(
                    decoder_cell,
                    infer_helper,
                    encoder_state,
                    output_layer=project_layer)
                infer_outputs, _, final_seq_len = tf.contrib.seq2seq.dynamic_decode(
                    infer_decoder, maximum_iterations=self.seq_len)
                self.infer_outputs = self.goal_stats_dest.denormalize(
                    infer_outputs.rnn_output)

            # Learned log standard deviation, one value per output dimension.
            log_sigma = tf.get_variable(name="logstd",
                                        shape=[1, self.out_dim],
                                        initializer=U.normc_initializer(0.1))

            # Gaussian negative log-likelihood of the normalized labels under
            # the decoder outputs, averaged over all remaining axes.
            goals = train_outputs.rnn_output
            loss =   0.5 * tf.reduce_sum(tf.square((goal_labels - goals)/tf.exp(log_sigma)), axis=-1) \
                + 0.5 * np.log(2*np.pi) * tf.to_float(tf.shape(self.labels)[-1]) \
                + tf.reduce_sum(log_sigma, axis=-1)
            self.loss = tf.reduce_mean(loss)
            self.tr_outputs = self.goal_stats_dest.denormalize(
                self.train_outputs
            )  # just for inspect the correctness of training

        var_list = self._vars('')
        self.grads = U.flatgrad(self.loss, var_list)
        self.adam = MpiAdam(var_list, epsilon=self.adamepsilon)

        tf.variables_initializer(self._global_vars('')).run()
        self.adam.sync()

    def train(self, use_buffer=False, justEval=False, **kwargs):
        """Run one loss/gradient evaluation and, unless ``justEval``, one Adam step.

        Args:
            use_buffer: When true, sample ``self.batch_size`` episodes from
                the replay buffer; otherwise ``kwargs`` must provide
                ``'src'``, ``'dest'`` and ``'lbl'``.
            justEval: When true, the gradient is computed but not applied.

        Returns:
            ``(loss, tr_sub_goals[-1], te_sub_goals[-1])``: the loss and the
            last entry along the first axis of the denormalized training and
            inference outputs.
        """
        self.mode = tf.contrib.learn.ModeKeys.TRAIN
        if not use_buffer:
            src = np.reshape(kwargs['src'], [-1, 1, self.inp_dim])
            dest = np.reshape(kwargs['dest'], [-1, 1, self.out_dim])
            lbl = kwargs['lbl']
        else:
            # Buffer layout per episode: [0] is src, [1] is dest, [2:] labels.
            episode_batch = self.buffer.sample(self.batch_size)
            src = np.reshape(episode_batch[:, 0, :], [-1, 1, self.inp_dim])
            lbl = episode_batch[:, 2:, :]
            dest = np.reshape(episode_batch[:, 1, :], [-1, 1, self.out_dim])
        # The source sequence always has length 1; the target always seq_len.
        src_seq_len = [1] * src.shape[0]
        tar_seq_len = [self.seq_len] * dest.shape[0]
        # compute grads
        loss, g, tr_sub_goals, te_sub_goals = self.sess.run(
            [self.loss, self.grads, self.tr_outputs, self.infer_outputs],
            feed_dict={
                self.inp_src: src,
                self.inp_dest: dest,
                self.labels: lbl,
                self.src_seq_len: src_seq_len,
                self.tar_seq_len: tar_seq_len
            })
        if not justEval:
            self.adam.update(g, stepsize=self.optim_stepsize)
        return loss, tr_sub_goals[-1], te_sub_goals[-1]

    def plan(self, src, dest):
        """Generate subgoals for the given ``src``/``dest`` pairs.

        Returns the inference outputs reversed along the time axis with
        ``dest`` appended as the final entry.
        """
        src = np.reshape(src, [-1, 1, self.inp_dim])
        dest = np.reshape(dest, [-1, 1, self.out_dim])
        src_seq_len = [1] * src.shape[0]
        tar_seq_len = [self.seq_len] * dest.shape[0]
        plan_goals = self.sess.run(self.infer_outputs,
                                   feed_dict={
                                       self.inp_src: src,
                                       self.inp_dest: dest,
                                       self.src_seq_len: src_seq_len,
                                       self.tar_seq_len: tar_seq_len
                                   })

        assert plan_goals.shape[0] == src.shape[0] and plan_goals.shape[
            1] == self.seq_len
        # Reverse the time axis before appending dest; presumably the decoder
        # emits subgoals starting from the dest side - confirm.
        plan_goals = np.flip(plan_goals, axis=-2)
        plan_goals = np.concatenate([plan_goals, dest],
                                    axis=-2)  # append the ultimate goal
        return plan_goals

    def store_episode(self, episode_batch, update_stats=True):
        """Store episodes in the buffer and optionally update the normalizer.

        episode_batch : [batch_size * (subgoal_num+1) * subgoal_dim]
        (NOTE(review): the buffer itself is created with seq_len + 2 rows per
        episode - confirm which layout callers use.)

        An empty batch is not stored, but the normalizer update and
        recompute still run, with ``isNull=True``.
        """
        isNull = episode_batch.shape[0] < 1
        if not isNull:
            self.buffer.store_episode(episode_batch)
        # logger.info("buffer store_episode done. updating statistics.")
        if update_stats:
            # Everything except row 0 of each episode feeds the statistics.
            subgoals = episode_batch[:, 1:, :]
            self.goal_stats_dest.update(subgoals, isNull=isNull)
            # logger.info("ready to recomput_stats")
            # print(subgoals)
            self.goal_stats_dest.recompute_stats(inc=episode_batch.shape[0])

    def update_normalizer_stats(self, batch):
        """Update and recompute the dest normalizer from ``batch['dest']``."""
        # self.goal_stats_src.update(batch['src'])
        self.goal_stats_dest.update(batch['dest'])
        # self.goal_stats_src.recompute_stats()
        self.goal_stats_dest.recompute_stats()

    def _vars(self, scope):
        """Return the trainable variables under ``self.scope + '/' + scope``.

        Asserts that at least one variable was found.
        """
        res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                scope=self.scope + '/' + scope)
        assert len(res) > 0
        return res

    def _global_vars(self, scope):
        """Return the global variables under ``self.scope + '/' + scope``."""
        res = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                scope=self.scope + '/' + scope)
        return res

    def save(self, save_path):
        """Save all global variables of this planner's scope to ``save_path``."""
        assert self.infer_outputs is not None
        var_list = self._global_vars('')
        U.save_variables(save_path, variables=var_list, sess=self.sess)

    def load(self, load_path):
        """Load this planner's global variables from ``load_path``.

        NOTE(review): the fallback below builds the network without entering
        ``tf.variable_scope(self.scope)``, unlike ``__init__`` - confirm it
        is ever reached, since ``__init__`` already builds the network.
        """
        if self.infer_outputs is None:
            self._create_network()
        var_list = self._global_vars('')
        U.load_variables(load_path, variables=var_list)

    def logs(self, prefix=''):
        """Return ``(key, value)`` pairs: buffer size and mean of goal stats.

        When ``prefix`` is non-empty, surrounding slashes are stripped from
        it and every key is returned as ``prefix/key``.
        """
        logs = []
        logs += [('subgoals/buff_size', self.buffer.get_current_episode_size())
                 ]
        logs += [('goals/mean',
                  np.mean(self.sess.run([self.goal_stats_dest.mean])))]
        logs += [('goals/std',
                  np.mean(self.sess.run([self.goal_stats_dest.std])))]

        if prefix != '':
            prefix = prefix.strip('/')
            return [(prefix + '/' + key, val) for key, val in logs]
        else:
            return logs
예제 #11
0
class DDPG(object):
    def __init__(self,
                 observation_shape,
                 action_shape,
                 nb_demo_kine,
                 nb_key_states,
                 batch_size=128,
                 noise_type='',
                 actor=None,
                 critic=None,
                 layer_norm=True,
                 observation_range=(-5., 5.),
                 action_range=(-1., 1.),
                 return_range=(-np.inf, np.inf),
                 normalize_returns=False,
                 normalize_observations=True,
                 reward_scale=1.,
                 clip_norm=None,
                 demo_l2_reg=0.,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 demo_lr=5e-3,
                 gamma=0.99,
                 tau=0.001,
                 enable_popart=False,
                 save_ckpt=True):
        """Build the TensorFlow graph for the agent.

        Creates the replay memory, input placeholders, actor/critic networks
        and their target copies, the optimizers, the statistics ops, a
        checkpoint saver and (when ``logger.get_dir()`` returns a directory)
        a summary writer. No session is attached here: ``self.sess`` stays
        ``None`` until ``initialize`` is called.

        Args:
            observation_shape: Tuple appended to ``(None,)`` to shape the
                ``obs0`` / ``obs1`` placeholders.
            action_shape: Tuple appended to ``(None,)`` to shape the
                ``actions`` placeholder; its last entry is the action count.
            nb_demo_kine: Number of leading observation entries used for
                ``obs_delta_kine``; also passed to the default ``Actor``.
            nb_key_states: Number of leading observation entries used for
                ``obs_delta_kstates``; also passed to the default ``Critic``
                and to ``obs_norm_partial`` (as ``nb_obs_org``).
            batch_size: Batch size used when sampling from memory.
            noise_type: Passed to ``process_noise_type`` to obtain the
                parameter-noise and action-noise objects.
            actor: Optional actor network; a default ``Actor`` is built when
                falsy.
            critic: Optional critic network; a default ``Critic`` is built
                when falsy.
            layer_norm: Forwarded to the default ``Actor`` / ``Critic``.
            observation_range: ``(low, high)`` clip applied to normalized
                observations.
            action_range: ``(low, high)`` clip applied to the target action
                (and to actions returned by ``pi``).
            return_range: ``(low, high)`` clip applied to normalized critic
                outputs and targets.
            normalize_returns: Create a ``RunningMeanStd`` for returns.
            normalize_observations: Create a ``RunningMeanStd`` for
                observations.
            reward_scale: Factor applied to rewards when they are stored.
            clip_norm: Forwarded to ``tf_util.flatgrad`` by the optimizer
                setup methods.
            demo_l2_reg: L2 coefficient for the demo loss (used when > 0).
            critic_l2_reg: L2 coefficient for the critic loss (used when > 0).
            actor_lr: Step size for the actor optimizer.
            critic_lr: Step size for the critic optimizer.
            demo_lr: Step size for the demo optimizer.
            gamma: Discount factor used in the bootstrapped target.
            tau: Passed to ``get_target_updates`` for the target networks.
            enable_popart: Together with ``normalize_returns`` enables
                ``setup_popart``.
            save_ckpt: When true the saver keeps up to 20 checkpoints and
                ``store_ckpt`` writes them; otherwise the saver is only
                intended for loading.
        """

        # Noise
        nb_actions = action_shape[-1]
        param_noise, action_noise = process_noise_type(noise_type, nb_actions)

        logger.info('param_noise', param_noise)
        logger.info('action_noise', action_noise)

        # States recording
        self.memory = Memory(limit=int(2e5),
                             action_shape=action_shape,
                             observation_shape=observation_shape)

        # Models
        self.nb_demo_kine = nb_demo_kine
        self.actor = actor or Actor(
            nb_actions, nb_demo_kine, layer_norm=layer_norm)
        self.nb_key_states = nb_key_states
        self.critic = critic or Critic(nb_key_states, layer_norm=layer_norm)
        # Count handed to obs_norm_partial below; here it equals nb_key_states.
        self.nb_obs_org = nb_key_states

        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        # self.critic_target_Q: fed with the evaluated value of
        # self.target_Q_obs0 (see train_main).
        self.critic_target_Q = tf.placeholder(tf.float32,
                                              shape=(None, 1),
                                              name='critic_target_Q')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Change in observations between consecutive steps, restricted to
        # the leading nb_demo_kine / nb_key_states entries.
        self.obs_delta_kine = (self.obs1 - self.obs0)[:, :self.nb_demo_kine]
        self.obs_delta_kstates = (self.obs1 -
                                  self.obs0)[:, :self.nb_key_states]

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range

        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.demo_lr = demo_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        # Lazily filled by get_stats with a fixed batch.
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg
        self.demo_l2_reg = demo_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None

        # Normalize (via obs_norm_partial) and clip both observation inputs.
        self.normalized_obs0 = tf.clip_by_value(
            obs_norm_partial(self.obs0, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(
            obs_norm_partial(self.obs1, self.obs_rms, self.nb_obs_org),
            self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks as renamed shallow copies of the online
        # networks.
        target_actor = copy(self.actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(self.critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across set-up parts.
        # The actor output is in [0, 1]; it is passed through act_norm before
        # being fed to the critic.
        self.actor_tf, self.demo_aprx = self.actor(self.normalized_obs0)

        # critic loss
        # normalized_critic_tf, pred_rwd, pred_obs_delta: used by critic_loss
        self.normalized_critic_tf, self.pred_rwd, self.pred_obs_delta = self.critic(
            self.normalized_obs0, act_norm(self.actions))
        # self.critic_tf: only used for logging [reference_Q_mean/std]
        self.critic_tf = ret_denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)

        # actor loss
        normalized_critic_with_actor_tf = self.critic(self.normalized_obs0,
                                                      act_norm(self.actor_tf),
                                                      reuse=True)[0]
        # self.critic_with_actor_tf: actor loss, and logging [reference_Q_tf_mean/std]
        self.critic_with_actor_tf = ret_denormalize(
            tf.clip_by_value(normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)

        # target Q: rewards + (1 - terminal) * gamma * Q_target(obs1, a_target)
        self.target_action = tf.clip_by_value(
            target_actor(normalized_obs1)[0], self.action_range[0],
            self.action_range[1])
        self.target_Q_obs1 = ret_denormalize(
            target_critic(normalized_obs1, act_norm(self.target_action))[0],
            self.ret_rms)
        self.target_Q_obs0 = self.rewards + (
            1. - self.terminals1) * gamma * self.target_Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(self.normalized_obs0)

        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()
        self.dbg_vars = self.actor.dbg_vars + self.critic.dbg_vars

        # The session is attached later by initialize().
        self.sess = None
        # Set up checkpoint saver
        self.save_ckpt = save_ckpt
        if save_ckpt:
            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=20)
        else:
            # saver for loading ckpt
            self.saver = tf.train.Saver()

        # Merge every summary registered so far; written out in train_main.
        self.main_summaries = tf.summary.merge_all()
        logdir = logger.get_dir()
        if logdir:
            self.train_writer = tf.summary.FileWriter(
                os.path.join(logdir, 'tb'), tf.get_default_graph())
        else:
            self.train_writer = None

    def setup_target_network_updates(self):
        """Create the ops that copy / blend online weights into the targets.

        For the actor and then the critic, ``get_target_updates`` is called
        with the online variables, the target variables and ``self.tau``. The
        resulting init ops and soft-update ops are stored (actor first) in
        ``self.target_init_updates`` and ``self.target_soft_updates``.
        """
        init_ops = []
        soft_ops = []
        network_pairs = ((self.actor, self.target_actor),
                         (self.critic, self.target_critic))
        for online, target in network_pairs:
            init_op, soft_op = get_target_updates(online.vars, target.vars,
                                                  self.tau)
            init_ops.append(init_op)
            soft_ops.append(soft_op)
        self.target_init_updates = init_ops
        self.target_soft_updates = soft_ops

    def setup_param_noise(self, normalized_obs0):
        """Build the perturbed actor copies used for parameter-space noise.

        Two renamed copies of ``self.actor`` are created: one whose output
        (``self.perturbed_actor_tf``) is used by ``pi`` for acting, and one
        used only to measure how far a perturbation moves the policy
        (``self.adaptive_policy_distance``), which drives the adaptation of
        the noise stddev.

        Args:
            normalized_obs0: Tensor fed to both actor copies.
        """
        assert self.param_noise is not None

        def clone_actor(name):
            # Shallow copy of the online actor under a different name.
            twin = copy(self.actor)
            twin.name = name
            return twin

        # Perturbed actor used when acting.
        perturbed = clone_actor('param_noise_actor')
        self.perturbed_actor_tf = perturbed(normalized_obs0)[0]
        logger.debug('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, perturbed, self.param_noise_stddev)

        # Separate copy used only for stddev adaptation.
        adaptive = clone_actor('adaptive_param_noise_actor')
        adaptive_out = adaptive(normalized_obs0)[0]
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive, self.param_noise_stddev)
        # Root-mean-square difference between clean and perturbed outputs.
        output_gap = self.actor_tf - adaptive_out
        self.adaptive_policy_distance = tf.sqrt(
            tf.reduce_mean(tf.square(output_gap)))

    def setup_actor_optimizer(self):
        """Create the actor and demo losses, gradients and optimizers.

        Sets up, in order:

        * ``self.actor_Q`` / ``self.actor_loss`` (mean critic value of the
          actor's action and its negation) with gradients taken over
          ``self.actor.active_vars``;
        * ``self.actor_params``, the cumulative parameter offsets of
          ``self.actor.trainable_vars`` inside a flat gradient vector (also
          read by ``train_demo``);
        * gradient summaries for the first active layer and for one of the
          last layers;
        * ``self.demo_loss`` / ``self.demo_max_loss`` (mean / max squared
          error between ``obs_delta_kine`` and ``demo_aprx``), optional L2
          regularization, and the demo optimizer over all trainable actor
          variables;
        * ``self.mimic_rwd``, the negated demo loss.
        """
        logger.info('setting up actor optimizer')
        # loss_normed = -tf.reduce_mean(self.normalized_critic_with_actor_tf)
        self.actor_Q = tf.reduce_mean(self.critic_with_actor_tf)
        self.actor_loss = -self.actor_Q
        tf.summary.scalar('actor/Q', self.actor_Q)

        # setting up actor vars/grads/optimizer
        self.actor_vars = self.actor.active_vars
        self.actor_grads = tf_util.flatgrad(self.actor_loss,
                                            self.actor_vars,
                                            clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        # Cumulative offsets: actor_params[i] is where variable i starts in a
        # flat vector over all trainable variables. int() keeps the offsets
        # integral even for scalar variables, where np.prod([]) is 1.0.
        actor_params = [0]
        for shape in actor_shapes:
            actor_params.append(actor_params[-1] + int(np.prod(shape)))
        self.actor_params = actor_params

        # Offsets relative to the first active variable. This assumes the
        # active variables are the trailing ones of trainable_vars.
        # np.asarray is required: subtracting from a plain list raises
        # TypeError (notably when n_inact == 0).
        n_inact = len(actor_shapes) - len(self.actor_vars)
        active_params = (np.asarray(actor_params[n_inact:]) -
                         actor_params[n_inact])
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_params))
        logger.info('  actor total: {}'.format(actor_params[-1]))
        logger.info('  actor active: {}'.format(active_params))

        # Mean gradient of the first active variable.
        grad = self.actor_grads[active_params[0]:active_params[1]]
        tf.summary.scalar(
            'grads/actor_layer%d_%d' %
            (n_inact // 2, active_params[1] - active_params[0]),
            tf.reduce_mean(grad))

        # Mean gradient of the third-from-last offset range.
        grad = self.actor_grads[active_params[-3]:active_params[-2]]
        tf.summary.scalar(
            'grads/actor_layer%d_%d' %
            (-1, active_params[-2] - active_params[-3]), tf.reduce_mean(grad))

        # for train_demo()
        self.demo_loss = tf.reduce_mean(
            tf.square(self.obs_delta_kine - self.demo_aprx))
        self.demo_max_loss = tf.reduce_max(
            tf.square(self.obs_delta_kine - self.demo_aprx))
        if self.demo_l2_reg > 0.:
            demo_reg_vars = self.actor.demo_reg_vars
            for var in demo_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info(
                '  applying l2 regularization for demo_aprx with {}'.format(
                    self.demo_l2_reg))
            self.demo_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.demo_l2_reg),
                weights_list=demo_reg_vars)
            self.demo_loss += self.demo_reg
        else:
            # train_demo checks for None before evaluating the regularizer.
            self.demo_reg = None

        self.demo_grads = tf_util.flatgrad(self.demo_loss,
                                           self.actor.trainable_vars,
                                           clip_norm=self.clip_norm)
        self.demo_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                      beta1=0.9,
                                      beta2=0.999,
                                      epsilon=1e-08)

        # mimic rwd
        self.mimic_rwd = -self.demo_loss
        tf.summary.scalar('actor/mimic_rwd', self.mimic_rwd)

    def setup_critic_optimizer(self):
        """Create the critic loss, its gradients, optimizer and summaries.

        The total ``self.critic_loss`` is the sum of:

        * the mean squared error between the normalized critic output and
          the normalized, clipped ``critic_target_Q`` placeholder;
        * ``self.critic_loss_step``: the squared error of the predicted
          reward plus the squared error of the predicted observation delta
          (the first ``nb_demo_kine`` entries weighted by 100);
        * an optional L2 regularization term when ``critic_l2_reg > 0``.

        Gradients are taken over ``self.critic.trainable_vars`` and a few
        per-layer gradient means are exported as summaries.
        """
        logger.info('setting up critic optimizer')

        # Q-value regression term, computed in normalized return space.
        self.normalized_critic_target_tf = tf.clip_by_value(
            ret_normalize(self.critic_target_Q, self.ret_rms),
            self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf -
                      self.normalized_critic_target_tf))
        tf.summary.scalar('critic_loss/Q_diff', self.critic_loss)
        if self.normalize_returns:
            tf.summary.scalar('critic_loss/Q_normed_critic',
                              tf.reduce_mean(self.normalized_critic_tf))
            tf.summary.scalar('critic_loss/Q_normed_target',
                              tf.reduce_mean(self.normalized_critic_target_tf))

        # One-step prediction terms: reward first ...
        self.critic_loss_step = 0
        diff_rwd = tf.reduce_mean(tf.square(self.pred_rwd - self.rewards))
        self.critic_loss_step += diff_rwd
        tf.summary.scalar('critic_loss/step_rwd', self.critic_loss_step)

        # ... then the observation delta. The error is averaged over the
        # batch axis only, so diff_obs keeps one entry per predicted state
        # dimension and can be split into the first nb_demo_kine entries
        # (scaled by critic_kine_factor) and the rest.
        critic_kine_factor = 100
        diff_obs = tf.reduce_mean(tf.square(self.pred_obs_delta -
                                            self.obs_delta_kstates),
                                  axis=0)
        diff_obs_kine = tf.reduce_mean(
            diff_obs[:self.nb_demo_kine]) * critic_kine_factor
        diff_obs_rest = tf.reduce_mean(diff_obs[self.nb_demo_kine:])
        self.critic_loss_step += (diff_obs_kine + diff_obs_rest)
        tf.summary.scalar(
            'critic_loss/step_kstates_kine_x%d' % critic_kine_factor,
            diff_obs_kine)
        tf.summary.scalar('critic_loss/step_kstates_rest', diff_obs_rest)
        tf.summary.scalar('critic_loss/step_total', self.critic_loss_step)

        self.critic_loss += self.critic_loss_step

        # Optional L2 regularization over the critic's reg_vars.
        if self.critic_l2_reg > 0.:
            critic_reg_vars = self.critic.reg_vars
            for var in critic_reg_vars:
                logger.debug('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
            tf.summary.scalar('critic_loss/reg', critic_reg)

        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]

        # Cumulative parameter offsets: critic_params[i] is where variable i
        # starts inside the flat gradient vector.
        critic_params = [0] * (len(self.critic.trainable_vars) + 1)
        for i, shape in enumerate(critic_shapes):
            critic_params[i + 1] = critic_params[i] + np.prod(shape)

        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_params))
        logger.info('  critic total: {}'.format(critic_params[-1]))
        self.critic_grads = tf_util.flatgrad(self.critic_loss,
                                             self.critic.trainable_vars,
                                             clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

        # TODO: make the following general. The offsets below are hard-coded;
        # NOTE(review): judging by the summary tags, -3 and -7 are presumably
        # the reward head and the Q head of the current Critic -- confirm
        # against the Critic definition.
        grad = self.critic_grads[critic_params[0]:critic_params[1]]
        tf.summary.scalar(
            'grads/critic_layer%d_%d' %
            (0, critic_params[1] - critic_params[0]), tf.reduce_mean(grad))
        grad = self.critic_grads[critic_params[-3]:critic_params[-2]]
        tf.summary.scalar(
            'grads/critic_layer%d_rwd_%d' %
            (-1, critic_params[-2] - critic_params[-3]), tf.reduce_mean(grad))
        grad = self.critic_grads[critic_params[-7]:critic_params[-6]]
        tf.summary.scalar(
            'grads/critic_layer%d_q_%d' %
            (-1, critic_params[-6] - critic_params[-7]), tf.reduce_mean(grad))

    def setup_popart(self):
        """Create the ops that rescale the critic output layers (Pop-Art).

        After the return statistics change, the output kernel and bias of
        both the critic and the target critic are rewritten from the old
        mean / std (fed through ``self.old_mean`` / ``self.old_std``) to the
        current ``self.ret_rms`` values. The ops are collected in
        ``self.renormalize_Q_outputs_op``.

        See https://arxiv.org/pdf/1602.07714.pdf for details.
        """
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = rescale_ops = []
        output_layers = (self.critic.output_vars,
                         self.target_critic.output_vars)
        for output_vars in output_layers:
            # Each output layer must be exactly one (kernel, bias) pair with
            # a single output unit.
            assert len(output_vars) == 2
            kernel, bias = output_vars
            assert 'kernel' in kernel.name
            assert 'bias' in bias.name
            assert kernel.get_shape()[-1] == 1
            assert bias.get_shape()[-1] == 1
            rescale_ops.append(
                kernel.assign(kernel * self.old_std / new_std))
            rescale_ops.append(
                bias.assign(
                    (bias * self.old_std + self.old_mean - new_mean) /
                    new_std))

    def setup_stats(self):
        """Collect the ops evaluated by ``get_stats`` and their names.

        The result is stored as two parallel lists, ``self.stats_ops`` and
        ``self.stats_names``.
        """
        tracked = []  # (name, op) pairs, in reporting order

        def track(name, op):
            tracked.append((name, op))

        if self.normalize_returns:
            track('zrms/ret_mean', self.ret_rms.mean)
            track('zrms/ret_std', self.ret_rms.std)

        if self.normalize_observations:
            kine = self.nb_demo_kine
            track('zrms/obs_kine_mean',
                  tf.reduce_mean(self.obs_rms.mean[:kine]))
            track('zrms/obs_kine_std',
                  tf.reduce_mean(self.obs_rms.std[:kine]))

            kstates = self.nb_key_states
            track('zrms/obs_kstates_mean',
                  tf.reduce_mean(self.obs_rms.mean[:kstates]))
            track('zrms/obs_kstates_std',
                  tf.reduce_mean(self.obs_rms.std[:kstates]))

            track('zrms/obs_mean', tf.reduce_mean(self.obs_rms.mean))
            track('zrms/obs_std', tf.reduce_mean(self.obs_rms.std))

            # For debugging partial normalisation: raw vs. normalized value
            # of the first sample on either side of the nb_obs_org boundary.
            for o_i in (self.nb_obs_org - 1, self.nb_obs_org):
                track('zobs_dbg_%d' % o_i, self.obs0[0, o_i])
                track('zobs_dbg_%d_normalized' % o_i,
                      self.normalized_obs0[0, o_i])

        track('zref/Q_mean', tf.reduce_mean(self.critic_tf))
        track('zref/Q_std', reduce_std(self.critic_tf))

        track('zref/Q_tf_mean', tf.reduce_mean(self.critic_with_actor_tf))
        track('zref/Q_tf_std', reduce_std(self.critic_with_actor_tf))

        track('zref/action_mean', tf.reduce_mean(self.actor_tf))
        track('zref/action_std', reduce_std(self.actor_tf))

        track('zref/mimic_rwd', tf.reduce_mean(self.mimic_rwd))

        if self.param_noise:
            track('zref/action_ptb_mean',
                  tf.reduce_mean(self.perturbed_actor_tf))
            track('zref/action_ptb_std', reduce_std(self.perturbed_actor_tf))

        self.stats_ops = [op for _, op in tracked]
        self.stats_names = [name for name, _ in tracked]

    def pi(self,
           obs,
           step,
           apply_param_noise=True,
           apply_action_noise=True,
           compute_Q=True,
           rollout_log=False):
        if self.param_noise is not None and apply_param_noise:
            actor_tf = self.perturbed_actor_tf
            info = 'ptb'
        else:
            actor_tf = self.actor_tf
            info = 'org'
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf],
                                      feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        # actor output is [0,1], no need to denormalise.
        # action = act_denorm(action)
        if rollout_log:
            summary_list = [('the_action/%d_rollout_%s' % (i, info), a)
                            for i, a in enumerate(action)]

        if self.action_noise is not None and apply_action_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        else:
            noise = None
        action = np.clip(action, self.action_range[0], self.action_range[1])

        if rollout_log:
            if noise is not None:
                summary_list += [('the_action/%d_rollout_noise' % i, a)
                                 for i, a in enumerate(noise)]
            self.add_list_summary(summary_list, step)
        return action, q

    def store_transition(self, storage, obs0, action, reward, obs1, terminal1):
        '''store one experience'''
        reward *= self.reward_scale
        storage.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def store_multrans(self, storage, obs0, action, reward, obs1, terminal1):
        '''store multiple experiences'''
        for i in range(len(reward)):
            storage.append(obs0[i], action[i], reward[i] * self.reward_scale,
                           obs1[i], terminal1[i])
        if self.normalize_observations:
            self.obs_rms.update(np.vstack(obs0))

    def train_demo(self,
                   obs0_pos,
                   obs1_pos,
                   obs0_neg,
                   obs1_neg,
                   step,
                   neg_pct=1.0,
                   lr_decay=1.0):
        # gradients calculated for pos and neg data separately, then combined for gradient update,
        # because only positive data are used in eval modes

        # the loss evaluated here are those before gradient update
        ops = [
            self.demo_grads, self.demo_loss, self.demo_max_loss, self.actor_Q
        ]
        pos_grads, demo_loss, max_loss, actor_Q = self.sess.run(ops,
                                                                feed_dict={
                                                                    self.obs0:
                                                                    obs0_pos,
                                                                    self.obs1:
                                                                    obs1_pos,
                                                                })
        ops = [self.demo_grads, self.demo_loss]
        neg_grads, neg_loss = self.sess.run(ops,
                                            feed_dict={
                                                self.obs0: obs0_neg,
                                                self.obs1: obs1_neg,
                                            })

        comb_grads = pos_grads - neg_grads * neg_pct
        self.demo_optimizer.update(comb_grads,
                                   stepsize=self.demo_lr * lr_decay)

        if self.demo_reg is not None:
            demo_reg = self.sess.run(self.demo_reg)
        else:
            demo_reg = 0

        # sanity check the training
        pos_g = pos_grads[self.actor_params[2]:self.actor_params[3]]
        neg_g = neg_grads[self.actor_params[2]:self.actor_params[3]]
        comb_g = comb_grads[self.actor_params[2]:self.actor_params[3]]
        summary_list = [
            ('demo_loss/train_pos', demo_loss),
            ('demo_loss/train_max', max_loss),
            ('demo_loss/train_neg', neg_loss),
            ('grads/demo_pos_layer%d_%d' % (1, len(pos_g)), np.mean(pos_g)),
            ('grads/demo_neg_layer%d_%d' % (1, len(neg_g)), np.mean(neg_g)),
            ('grads/demo_comb_layer%d_%d' % (1, len(comb_g)), np.mean(comb_g)),
            ('actor/Q', actor_Q), ('demo_loss/reg', demo_reg)
        ]
        self.add_list_summary(summary_list, step)

        return demo_loss

    def test_demo(self, obs0, obs1):
        loss_mean, loss_max = self.sess.run(
            [self.demo_loss, self.demo_max_loss],
            feed_dict={
                self.obs0: obs0,
                self.obs1: obs1,
            })
        return loss_mean, loss_max

    def eval_demo(self, obs0):
        return self.sess.run(self.demo_aprx, feed_dict={self.obs0: obs0})

    def get_mimic_rwd(self, obs0, obs1):
        mimic_rwd, demo_aprx = self.sess.run([self.mimic_rwd, self.demo_aprx],
                                             feed_dict={
                                                 self.obs0: obs0,
                                                 self.obs1: obs1
                                             })
        return mimic_rwd, demo_aprx

    def train_main(self, step):
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            ops = [
                self.ret_rms.mean, self.ret_rms.std, self.target_Q_obs0,
                self.target_Q_obs1
            ]
            old_mean, old_std, target_Q_obs0, target_Q_obs1 = self.sess.run(
                ops,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32'),
                })
            self.ret_rms.update(target_Q_obs0.flatten())
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })

            # Run sanity check. Disabled by default since it slows down things considerably.
            # print('running sanity check')
            # target_Q_new, new_mean, new_std = self.sess.run([self.target_Q_obs0, self.ret_rms.mean, self.ret_rms.std],
            # feed_dict={
            #     self.obs1: batch['obs1'],
            #     self.rewards: batch['rewards'],
            #     self.terminals1: batch['terminals1'].astype('float32'),
            # })
            # print(target_Q_new, target_Q_obs0, new_mean, new_std)
            # assert (np.abs(target_Q_obs0 - target_Q_new) < 1e-3).all()
        else:
            ops = [self.target_Q_obs0, self.target_Q_obs1]
            target_Q_obs0, target_Q_obs1 = self.sess.run(
                ops,
                feed_dict={
                    self.obs1: batch['obs1'],
                    self.rewards: batch['rewards'],
                    self.terminals1: batch['terminals1'].astype('float32')
                })

        summary_list = [
            ('critic_loss/Q_target_obs1_mean', np.mean(target_Q_obs1)),
            ('critic_loss/Q_target_obs1_std', np.std(target_Q_obs1)),
            ('critic_loss/Q_target_obs0_mean', np.mean(target_Q_obs0)),
            ('critic_loss/Q_target_obs0_std', np.std(target_Q_obs0))
        ]
        self.add_list_summary(summary_list, step)

        # Get all gradients and perform a synced update.
        ops = [
            self.main_summaries, self.actor_grads, self.actor_loss,
            self.critic_grads, self.critic_loss
        ]
        main_summaries, actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target_Q: target_Q_obs0,
                self.rewards: batch['rewards'],
                self.obs1: batch['obs1']
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        if self.train_writer:
            self.train_writer.add_summary(main_summaries, step)

        return critic_loss, actor_loss

    def initialize(self, sess, start_ckpt=None):
        self.sess = sess
        if start_ckpt:
            self.saver.restore(sess, start_ckpt)
        else:
            self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.demo_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def store_ckpt(self, save_path, epoch):
        if self.save_ckpt:
            self.saver.save(self.sess, save_path, global_step=epoch)

    def update_target_net(self):
        self.sess.run(self.target_soft_updates)

    def get_stats(self, storage):
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = storage.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops,
                               feed_dict={
                                   self.obs0: self.stats_sample['obs0'],
                                   self.obs1: self.stats_sample['obs1'],
                                   self.actions: self.stats_sample['actions'],
                               })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self, step):
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops,
                      feed_dict={
                          self.param_noise_stddev:
                          self.param_noise.current_stddev,
                      })
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict={
                                     self.obs0:
                                     batch['obs0'],
                                     self.param_noise_stddev:
                                     self.param_noise.current_stddev,
                                 })
        mean_distance = MPI.COMM_WORLD.allreduce(
            distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        self.add_list_summary([('param_noise/distance', mean_distance)], step)
        self.add_list_summary(
            [('param_noise/std', self.param_noise.current_stddev)], step)
        return mean_distance

    def reset(self):
        '''Reset internal state after an episode is complete.'''
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops,
                          feed_dict={
                              self.param_noise_stddev:
                              self.param_noise.current_stddev,
                          })

    def add_list_summary(self, summary_raw, step):
        def summary_val(k, v):
            kwargs = {'tag': k, 'simple_value': v}
            return tf.Summary.Value(**kwargs)

        if self.train_writer:
            summary_list = [summary_val(tag, val) for tag, val in summary_raw]
            self.train_writer.add_summary(tf.Summary(value=summary_list), step)
예제 #12
0
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        timesteps_per_batch,
        *,
        g_step,
        d_step,
        clip_param,
        entcoeff,
        optim_epochs,
        optim_stepsize,
        optim_batchsize,
        gamma,
        lam,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,
        callback=None,
        adam_epsilon=1e-5,
        d_stepsize=3e-4,
        schedule='constant',
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Alternate clipped-surrogate policy updates with discriminator updates.

    Each iteration runs ``g_step`` policy rounds (collect a segment, compute
    advantages, optimise the clipped surrogate with Adam) followed by one
    pass of discriminator minibatch updates against expert data, then logs
    the statistics.

    Local variable names are kept stable because ``callback`` receives
    ``locals()`` and may read them.

    Args:
        env: environment; its ``observation_space`` / ``action_space`` are
            read and it is handed to the rollout generators.
        policy_func: callable ``(name, ob_space, ac_space)`` that builds a
            policy; called for ``"pi"`` and ``"oldpi"``.
        discriminator: object providing ``get_trainable_variables()``,
            ``lossandgrad(...)``, ``loss_name`` and optionally ``obs_rms``.
        expert_dataset: object providing ``get_next_batch(n)`` returning
            ``(observations, actions)``.
        timesteps_per_batch: forwarded to the rollout generators.
        g_step: number of policy update rounds per iteration.
        d_step: the policy segment is split into minibatches of size
            ``len(ob) // d_step`` for the discriminator update.
        clip_param: clipping parameter, scaled by the learning-rate
            multiplier.
        entcoeff: entropy bonus coefficient.
        optim_epochs: optimisation epochs over each segment.
        optim_stepsize: Adam step size for the policy.
        optim_batchsize: policy minibatch size; a falsy value means the
            whole segment.
        gamma: forwarded to ``add_vtarg_and_adv``.
        lam: forwarded to ``add_vtarg_and_adv``.
        max_timesteps, max_episodes, max_iters, max_seconds: stopping
            criteria; exactly one must be positive.
        callback: optional callable invoked as
            ``callback(locals(), globals())`` at the start of an iteration.
        adam_epsilon: epsilon of the policy Adam optimiser.
        d_stepsize: Adam step size for the discriminator.
        schedule: ``'constant'`` or ``'linear'`` step-size / clip annealing.
        save_per_iter: checkpoint every this many iterations.
        ckpt_dir: checkpoint directory; ``None`` disables checkpointing.
        task: ``'sample_trajectory'`` samples trajectories and exits the
            process instead of training.
        sample_stochastic: whether the trajectory sampler acts
            stochastically.
        load_model_path: forwarded to ``sample_trajectory``.
        task_name: checkpoint file name; also forwarded to
            ``sample_trajectory``.
        max_sample_traj: forwarded to ``sample_trajectory``.

    Raises:
        AssertionError: if not exactly one stopping criterion is set.
        ValueError: if ``schedule='linear'`` is used without
            ``max_timesteps``.
        NotImplementedError: for an unknown ``schedule``.
        SystemExit: after sampling when ``task == 'sample_trajectory'``.
    """
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    # Start every worker from rank 0's policy parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    adam.sync()

    def allmean(x):
        """Average a numpy array element-wise over all MPI workers."""
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=100)

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # The linear schedule anneals against max_timesteps; without it
            # the division below would fail with a bare ZeroDivisionError.
            if not max_timesteps:
                raise ValueError(
                    "schedule='linear' requires max_timesteps > 0")
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model. Only rank 0 writes, so that concurrent workers do not
        # race on the same checkpoint files.
        if (rank == 0 and iters_so_far % save_per_iter == 0
                and ckpt_dir is not None):
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)
        for _ in range(g_step):
            seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)

            # From here on ob/ac/atarg name the sampled arrays; the graph
            # functions built above already captured the placeholders.
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate
            d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                        shuffle=not pi.recurrent)
            optim_batchsize = optim_batchsize or ob.shape[0]

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            logger.log("Optimizing...")
            logger.log(fmt_row(13, loss_names))
            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):
                    *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                batch["atarg"], batch["vtarg"],
                                                cur_lrmult)
                    adam.update(g, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)
                logger.log(fmt_row(13, np.mean(losses, axis=0)))

            logger.log("Evaluating losses...")
            losses = []
            for batch in d.iterate_once(optim_batchsize):
                newlosses = compute_losses(batch["ob"], batch["ac"],
                                           batch["atarg"], batch["vtarg"],
                                           cur_lrmult)
                losses.append(newlosses)
            meanlosses, _, _ = mpi_moments(losses, axis=0)

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        # Expert samples are drawn per minibatch inside the loop, so no
        # up-front expert batch is fetched (it would be discarded unused).
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # ----------------- logger --------------------
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        true_rewbuffer.extend(true_rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if rank == 0:
            logger.dump_tabular()
예제 #13
0
class DDPG(object):
    def __init__(self,
                 actor,
                 critic,
                 memory,
                 observation_shape,
                 action_shape,
                 param_noise=None,
                 action_noise=None,
                 gamma=0.99,
                 tau=0.001,
                 normalize_returns=False,
                 enable_popart=False,
                 normalize_observations=True,
                 batch_size=128,
                 observation_range=(-1000., 1000.),
                 action_range=(-360., 360.),
                 return_range=(-np.inf, np.inf),
                 adaptive_param_noise=True,
                 adaptive_param_noise_policy_threshold=.1,
                 critic_l2_reg=0.,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 clip_norm=None,
                 reward_scale=1.):
        """Build the placeholders, networks, optimizers and update ops.

        Args:
            actor: callable network returning ``(action, residual_action)``
                for an observation tensor; copied for the target and
                parameter-noise networks.
            critic: callable network ``critic(obs, action, reuse=...)``;
                copied for the target critic.
            memory: replay buffer providing ``append`` and ``sample``.
            observation_shape: tuple, per-sample observation shape.
            action_shape: tuple, per-sample action shape.
            param_noise: parameter-noise spec or ``None`` to disable.
            action_noise: action-noise process or ``None`` to disable.
            gamma: discount factor used in the bootstrapped target.
            tau: coefficient handed to ``get_target_updates``.
            normalize_returns: track return statistics in ``ret_rms``.
            enable_popart: with ``normalize_returns``, build the Pop-Art
                renormalisation ops.
            normalize_observations: track observation statistics in
                ``obs_rms`` (used for the critic inputs).
            batch_size: replay batch size.
            observation_range: clip range for normalised observations.
            action_range: clip range applied to actions in ``step``.
            return_range: clip range for normalised critic outputs.
            adaptive_param_noise: accepted but not used by this constructor.
            adaptive_param_noise_policy_threshold: accepted but not used by
                this constructor.
            critic_l2_reg: L2 regularisation strength for the critic.
            actor_lr: actor step size.
            critic_lr: critic step size.
            clip_norm: gradient clip norm forwarded to ``U.flatgrad``.
            reward_scale: stored on the instance.
        """
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs0')
        self.obs1 = tf.placeholder(tf.float32,
                                   shape=(None, ) + observation_shape,
                                   name='obs1')
        self.terminals1 = tf.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='terminals1')
        self.rewards = tf.placeholder(tf.float32,
                                      shape=(None, 1),
                                      name='rewards')
        self.actions = tf.placeholder(tf.float32,
                                      shape=(None, ) + action_shape,
                                      name='actions')
        self.critic_target = tf.placeholder(tf.float32,
                                            shape=(None, 1),
                                            name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32,
                                                 shape=(),
                                                 name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms', reuse=tf.AUTO_REUSE):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0],
                                           self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms', reuse=tf.AUTO_REUSE):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        # Observation normalisation interferes with the pre-initialised
        # policy, so the actor is fed the raw observations. Every network
        # that shares or copies the actor's weights (target actor,
        # parameter-noise actors) must therefore see raw observations too;
        # only the critics consume the normalised observations.
        # self.actor_tf = actor(normalized_obs0)
        self.actor_tf, self.res_actor_tf = actor(self.obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0],
                             self.return_range[1]), self.ret_rms)
        self.normalized_critic_with_actor_tf = critic(normalized_obs0,
                                                      self.res_actor_tf,
                                                      reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf,
                             self.return_range[0], self.return_range[1]),
            self.ret_rms)
        # Raw obs1, matching the input the actor itself is evaluated on.
        _, res_target_actor_action = target_actor(self.obs1)
        Q_obs1 = denormalize(
            target_critic(normalized_obs1, res_target_actor_action),
            self.ret_rms)
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            # The perturbed actors are copies of the actor, so they get the
            # same (raw) observation input as the actor.
            self.setup_param_noise(self.obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

        self.initial_state = None  # recurrent architectures not supported yet

    def setup_target_network_updates(self):
        """Create the init and soft update ops for both target networks.

        Stores two lists ordered ``[actor, critic]``:
        ``target_init_updates`` and ``target_soft_updates``, each holding the
        corresponding result of ``get_target_updates``.
        """
        network_pairs = (
            (self.actor, self.target_actor),
            (self.critic, self.target_critic),
        )
        init_updates = []
        soft_updates = []
        for online, target in network_pairs:
            init_op, soft_op = get_target_updates(online.vars, target.vars,
                                                  self.tau)
            init_updates.append(init_op)
            soft_updates.append(soft_op)
        self.target_init_updates = init_updates
        self.target_soft_updates = soft_updates

    def setup_param_noise(self, normalized_obs0):
        """Build the two perturbed copies of the actor used for param noise.

        One copy (``param_noise_actor``) provides the perturbed policy
        outputs; a second copy (``adaptive_param_noise_actor``) is used only
        to measure the distance to the unperturbed policy.

        Args:
            normalized_obs0: observation tensor both copies are applied to.
        """
        assert self.param_noise is not None

        # Perturbed actor whose outputs are exposed on the instance.
        perturbed_actor = copy(self.actor)
        perturbed_actor.name = 'param_noise_actor'
        perturbed_outputs = perturbed_actor(normalized_obs0)
        self.perturbed_actor_tf, self.perturbed_res_actor_tf = perturbed_outputs
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(
            self.actor, perturbed_actor, self.param_noise_stddev)

        # Separate copy used for adapting the noise stddev.
        adaptive_actor = copy(self.actor)
        adaptive_actor.name = 'adaptive_param_noise_actor'
        _, adaptive_res_actor_tf = adaptive_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(
            self.actor, adaptive_actor, self.param_noise_stddev)

        # Root-mean-square gap between unperturbed and perturbed outputs.
        squared_gap = tf.square(self.res_actor_tf - adaptive_res_actor_tf)
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(squared_gap))

    def setup_actor_optimizer(self):
        """Define the actor loss, its flattened gradient and its optimizer.

        The loss is the negated mean of the critic evaluated on the actor's
        own action. Shapes and parameter count are logged.
        """
        logger.info('setting up actor optimizer')
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [
            var.get_shape().as_list() for var in self.actor.trainable_vars
        ]
        # The initial value 1 makes a scalar variable (empty shape list)
        # count as one parameter; without it reduce() raises TypeError.
        actor_nb_params = sum(
            reduce(lambda x, y: x * y, shape, 1) for shape in actor_shapes)
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss,
                                      self.actor.trainable_vars,
                                      clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9,
                                       beta2=0.999,
                                       epsilon=1e-08)

    def setup_critic_optimizer(self):
        """Define the critic loss, its flattened gradient and its optimizer.

        The loss is the mean squared error between the normalised critic
        output and the normalised, clipped critic target. When
        ``critic_l2_reg`` is positive, an L2 penalty is added for the
        critic's non-output ``kernel`` variables.
        """
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(
            normalize(self.critic_target, self.ret_rms), self.return_range[0],
            self.return_range[1])
        self.critic_loss = tf.reduce_mean(
            tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            critic_reg_vars = [
                var for var in self.critic.trainable_vars
                if 'kernel' in var.name and 'output' not in var.name
            ]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(
                self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars)
            self.critic_loss += critic_reg
        critic_shapes = [
            var.get_shape().as_list() for var in self.critic.trainable_vars
        ]
        # The initial value 1 makes a scalar variable (empty shape list)
        # count as one parameter; without it reduce() raises TypeError.
        critic_nb_params = sum(
            reduce(lambda x, y: x * y, shape, 1) for shape in critic_shapes)
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss,
                                       self.critic.trainable_vars,
                                       clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9,
                                        beta2=0.999,
                                        epsilon=1e-08)

    def setup_popart(self):
        """Build the ops that rescale the critics' output layers.

        For the critic and the target critic, the output kernel and bias are
        reassigned from the old return statistics (fed through the
        ``old_std`` / ``old_mean`` placeholders) to the current ``ret_rms``
        statistics. See https://arxiv.org/pdf/1602.07714.pdf for details.
        """
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        renormalize_ops = self.renormalize_Q_outputs_op = []
        output_layers = (self.critic.output_vars,
                         self.target_critic.output_vars)
        for layer_vars in output_layers:
            assert len(layer_vars) == 2
            kernel, bias = layer_vars
            assert 'kernel' in kernel.name
            assert 'bias' in bias.name
            assert kernel.get_shape()[-1] == 1
            assert bias.get_shape()[-1] == 1
            renormalize_ops.append(
                kernel.assign(kernel * self.old_std / new_std))
            renormalize_ops.append(
                bias.assign(
                    (bias * self.old_std + self.old_mean - new_mean) /
                    new_std))

    def setup_stats(self):
        """Collect the statistic tensors and their names for ``get_stats``.

        Populates ``stats_ops`` and ``stats_names`` as parallel lists, in the
        order: return statistics, observation statistics, critic value,
        critic value on the actor's action, actor action, and (with
        parameter noise) the perturbed actor action.
        """
        tracked = []  # (name, op) pairs in reporting order

        if self.normalize_returns:
            tracked.append(('ret_rms_mean', self.ret_rms.mean))
            tracked.append(('ret_rms_std', self.ret_rms.std))

        if self.normalize_observations:
            tracked.append(('obs_rms_mean',
                            tf.reduce_mean(self.obs_rms.mean)))
            tracked.append(('obs_rms_std', tf.reduce_mean(self.obs_rms.std)))

        # Each reference tensor contributes its mean followed by its std.
        references = [
            ('reference_Q_mean', 'reference_Q_std', self.critic_tf),
            ('reference_actor_Q_mean', 'reference_actor_Q_std',
             self.critic_with_actor_tf),
            ('reference_action_mean', 'reference_action_std',
             self.res_actor_tf),
        ]
        if self.param_noise:
            references.append(('reference_perturbed_action_mean',
                               'reference_perturbed_action_std',
                               self.perturbed_res_actor_tf))

        for mean_name, std_name, tensor in references:
            tracked.append((mean_name, tf.reduce_mean(tensor)))
            tracked.append((std_name, reduce_std(tensor)))

        self.stats_ops = [op for _, op in tracked]
        self.stats_names = [name for name, _ in tracked]

    def step(self, obs, noise_factor=1., apply_noise=True, compute_Q=True):
        """Compute the action(s) for a single observation.

        Args:
            obs: observation; wrapped in a list and reshaped to fit the
                ``obs0`` placeholder.
            noise_factor: multiplier applied to the action noise sample.
            apply_noise: when true, use the parameter-perturbed residual
                action (if parameter noise is set) and add action noise (if
                action noise is set).
            compute_Q: also evaluate the critic on the actor's action.

        Returns:
            Tuple ``(action, action_res, q, None, None)``. ``action`` always
            comes from the unperturbed ``actor_tf``; ``action_res`` is the
            (possibly perturbed and noised) residual action. Both are clipped
            to ``action_range``. ``q`` is ``None`` when ``compute_Q`` is
            false.
        """
        if self.param_noise is not None and apply_noise:
            res_actor_tf = self.perturbed_res_actor_tf
        else:
            res_actor_tf = self.res_actor_tf
        feed_dict = {self.obs0: U.adjust_shape(self.obs0, [obs])}
        if compute_Q:
            action, action_res, q = self.sess.run(
                [self.actor_tf, res_actor_tf, self.critic_with_actor_tf],
                feed_dict=feed_dict)
        else:
            action, action_res = self.sess.run([self.actor_tf, res_actor_tf],
                                               feed_dict=feed_dict)
            q = None
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            # Shapes such as (1, 3) and (3,) broadcast correctly here, so no
            # shape assertion is needed.
            action_res += noise_factor * noise
        action_res = np.clip(action_res, self.action_range[0],
                             self.action_range[1])
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, action_res, q, None, None

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append a batch of transitions to the replay memory, one by one.

        The batch size is taken from ``obs0.shape[0]``; all other arguments
        are indexed with the same position. When observation normalisation
        is enabled, ``obs_rms`` is updated with each stored observation.
        Rewards are stored as given (no reward scaling is applied here).
        """
        n_transitions = obs0.shape[0]
        for idx in range(n_transitions):
            self.memory.append(obs0[idx], action[idx], reward[idx],
                               obs1[idx], terminal1[idx])
            if not self.normalize_observations:
                continue
            # The running statistics take a batch, hence the extra axis.
            single_obs_batch = np.array([obs0[idx]])
            self.obs_rms.update(single_obs_batch)

    def train(self):
        """Run one gradient update of the actor and the critic.

        Samples a replay batch, evaluates the bootstrapped target Q (running
        the Pop-Art renormalisation first when return normalisation and
        Pop-Art are both enabled), then computes actor and critic gradients
        in a single session run and applies them through the two optimizers.
        Target networks are not updated here.

        Returns:
            Tuple ``(critic_loss, actor_loss)``.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)
        target_feed = {
            self.obs1: batch['obs1'],
            self.rewards: batch['rewards'],
            self.terminals1: batch['terminals1'].astype('float32'),
        }

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict=target_feed)
            self.ret_rms.update(target_Q.flatten())
            # Rescale the critic output layers from the old return
            # statistics to the freshly updated ones.
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict=target_feed)

        # Get all gradients and perform a synced update.
        ops = [
            self.actor_grads, self.actor_loss, self.critic_grads,
            self.critic_loss
        ]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        # No per-step weight dumps or graph lookups here: they added two
        # session runs per update and failed when no 'ini_actor' scope exists.
        return critic_loss, actor_loss

    def update_critic(self):
        """Run one critic-only update followed by a target-critic soft update.

        Samples a replay batch, evaluates the bootstrapped target Q (running
        the Pop-Art renormalisation first when return normalisation and
        Pop-Art are both enabled), applies the critic gradient and then runs
        the critic's soft target update.

        Returns:
            The critic loss for the sampled batch.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)
        target_feed = {
            self.obs1: batch['obs1'],
            self.rewards: batch['rewards'],
            self.terminals1: batch['terminals1'].astype('float32'),
        }

        if self.normalize_returns and self.enable_popart:
            old_mean, old_std, target_Q = self.sess.run(
                [self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                feed_dict=target_feed)
            self.ret_rms.update(target_Q.flatten())
            # Rescale the critic output layers from the old return
            # statistics to the freshly updated ones.
            self.sess.run(self.renormalize_Q_outputs_op,
                          feed_dict={
                              self.old_std: np.array([old_std]),
                              self.old_mean: np.array([old_mean]),
                          })
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict=target_feed)

        # Get all gradients and perform a synced update.
        ops = [self.critic_grads, self.critic_loss]
        critic_grads, critic_loss = self.sess.run(
            ops,
            feed_dict={
                self.obs0: batch['obs0'],
                self.actions: batch['actions'],
                self.critic_target: target_Q,
            })
        # 1. update the eval critic
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)
        # 2. update the target critic. Reuse the soft-update op built once in
        # setup_target_network_updates ([actor, critic] order); rebuilding it
        # here would add new ops to the graph on every call.
        _, critic_soft_updates = self.target_soft_updates
        self.sess.run(critic_soft_updates)
        return critic_loss

    def initialize(self, sess):
        """Attach the session, initialise variables and sync the networks.

        Runs the global variable initializer, syncs both optimizers and then
        runs the target-network init updates.
        """
        self.sess = sess
        sess.run(tf.global_variables_initializer())
        for optimizer in (self.actor_optimizer, self.critic_optimizer):
            optimizer.sync()
        sess.run(self.target_init_updates)

    def update_target_net(self):
        """Run the soft update ops of the target networks."""
        soft_updates = self.target_soft_updates
        self.sess.run(soft_updates)

    def get_stats(self):
        """Evaluate the statistic ops on a fixed replay sample.

        The sample is drawn on the first call and reused afterwards, so that
        successive values are computed on the same inputs.

        Returns:
            Dict mapping each name in ``stats_names`` to its value, merged
            with ``param_noise.get_stats()`` when parameter noise is set.
        """
        if self.stats_sample is None:
            # Draw once and keep fixed for all further computations.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)

        sample = self.stats_sample
        feed = {
            self.obs0: sample['obs0'],
            self.actions: sample['actions'],
        }
        values = self.sess.run(self.stats_ops, feed_dict=feed)

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is None:
            return stats

        noise_stats = self.param_noise.get_stats()
        return {**stats, **noise_stats}

    def adapt_param_noise(self):
        """Adapt the parameter-noise scale from a measured policy distance.

        Perturbs the separate adaptive copy of the policy, measures its
        distance to the unperturbed policy on a replay batch, averages that
        distance over all MPI workers and passes it to
        ``param_noise.adapt``.

        Returns:
            The averaged distance, or ``0.`` when parameter noise is
            disabled.
        """
        noise_spec = self.param_noise
        if noise_spec is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the
        # next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(
            self.perturb_adaptive_policy_ops,
            feed_dict={self.param_noise_stddev: noise_spec.current_stddev})
        distance_feed = {
            self.obs0: batch['obs0'],
            self.param_noise_stddev: noise_spec.current_stddev,
        }
        distance = self.sess.run(self.adaptive_policy_distance,
                                 feed_dict=distance_feed)

        world = MPI.COMM_WORLD
        mean_distance = world.allreduce(distance,
                                        op=MPI.SUM) / world.Get_size()
        noise_spec.adapt(mean_distance)
        return mean_distance

    def reset(self):
        """Reset internal noise state after an episode is complete.

        Restarts the action-noise process when present and, with parameter
        noise enabled, runs the policy perturbation ops using the current
        noise standard deviation.
        """
        if self.action_noise is not None:
            self.action_noise.reset()

        noise_spec = self.param_noise
        if noise_spec is None:
            return

        self.sess.run(
            self.perturb_policy_ops,
            feed_dict={self.param_noise_stddev: noise_spec.current_stddev})

    #added
    def save(self, save_path):
        """Save the session's variables to ``save_path`` with a new Saver."""
        tf.train.Saver().save(self.sess, save_path)

    def load(self, sess, load_path):
        """Attach ``sess`` and restore its variables from ``load_path``."""
        self.sess = sess
        restorer = tf.train.Saver()
        restorer.restore(sess, load_path)

    #    self.sess=sess

    def load_ini(self, sess, load_path):
        """Initialise the actor from the supervised policy checkpoint.

        Attaches ``sess``, tries to restore the variables under the
        ``ini_actor`` scope and then runs the target-network init updates.
        Loading is best effort: failures are reported on stdout and the
        target init updates still run.

        Args:
            sess: session to attach and restore into.
            load_path: NOTE(review): currently unused; the checkpoint path
                is hard-coded to ``'./model/small/ini'``. Confirm with the
                callers before switching to this argument.
        """
        self.sess = sess

        # Look up the variables of the supervised initial policy.
        actor_var_list = None
        try:
            actor_var_list = tf.contrib.framework.get_variables('ini_actor')
        except Exception as exc:  # best effort; report instead of hiding
            print('Cannot get variables list!', repr(exc))

        if actor_var_list is None:
            # Without a variable list there is nothing to restore.
            print('Actor Load Failed!')
        else:
            try:
                actor_saver = tf.train.Saver(actor_var_list)
                actor_saver.restore(self.sess, './model/small/ini')
            except Exception as exc:  # best effort; report instead of hiding
                print('Actor Load Failed!', repr(exc))
            else:
                print('Actor Load Succeed!')

        # Run the target-network init updates once (the init update is a
        # full copy; the soft update follows tau).
        self.sess.run(self.target_init_updates)
예제 #14
0
class DDPG(object):
    """DDPG agent whose TensorFlow graph is built at construction time.

    The constructor creates the input placeholders, optional running
    mean/std normalizers for observations and returns, target copies of the
    actor and critic, and the loss/gradient ops consumed by `train`. No
    session is created here: call `initialize(sess)` before any method that
    runs ops (`pi`, `train`, `get_stats`, ...).
    """

    def __init__(self, actor, critic, memory, observation_shape, action_shape, param_noise=None, action_noise=None,
                 gamma=0.99, tau=0.001, normalize_returns=False, enable_popart=False, normalize_observations=True,
                 batch_size=128, observation_range=(-5., 5.), action_range=(-1., 1.), return_range=(-np.inf, np.inf),
                 critic_l2_reg=0., actor_lr=1e-4, critic_lr=1e-3, clip_norm=None, reward_scale=1.):
        """Build placeholders, networks, losses and update ops.

        Args:
            actor: Callable network `actor(obs)`; must expose `name`, `vars`
                and `trainable_vars`. It is shallow-copied to create the
                target and parameter-noise actors.
            critic: Callable network `critic(obs, action, reuse=...)`; must
                expose `name`, `vars`, `trainable_vars` and (for Pop-Art)
                `output_vars`.
            memory: Replay buffer providing `append(...)` and
                `sample(batch_size=...)`; samples are indexed by the keys
                'obs0', 'obs1', 'actions', 'rewards' and 'terminals1'.
            observation_shape: Tuple appended to `(None,)` for the
                observation placeholders.
            action_shape: Tuple appended to `(None,)` for the action
                placeholder.
            param_noise: Optional parameter-noise spec exposing
                `current_stddev`, `adapt(distance)` and `get_stats()`.
            action_noise: Optional callable returning additive action noise;
                must also expose `reset()`.
            gamma: Discount factor used in the target Q value.
            tau: Coefficient handed to `get_target_updates` for soft updates.
            normalize_returns: Track return statistics in `ret_rms`.
            enable_popart: With `normalize_returns`, also build the Pop-Art
                renormalization ops.
            normalize_observations: Track observation statistics in `obs_rms`.
            batch_size: Number of transitions sampled per call.
            observation_range: (low, high) clip applied to normalized
                observations.
            action_range: (low, high) clip applied to actions in `pi`.
            return_range: (low, high) clip applied to normalized critic
                outputs and targets.
            critic_l2_reg: L2 coefficient for the critic; disabled when 0.
            actor_lr: Step size passed to the actor optimizer.
            critic_lr: Step size passed to the critic optimizer.
            clip_norm: Gradient clip norm forwarded to `U.flatgrad`.
            reward_scale: Factor applied to rewards in `store_transition`.
        """
        # Inputs.
        self.obs0 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None,) + observation_shape, name='obs1')
        self.terminals1 = tf.placeholder(tf.float32, shape=(None, 1), name='terminals1')
        self.rewards = tf.placeholder(tf.float32, shape=(None, 1), name='rewards')
        self.actions = tf.placeholder(tf.float32, shape=(None,) + action_shape, name='actions')
        self.critic_target = tf.placeholder(tf.float32, shape=(None, 1), name='critic_target')
        self.param_noise_stddev = tf.placeholder(tf.float32, shape=(), name='param_noise_stddev')

        # Parameters.
        self.gamma = gamma
        self.tau = tau
        self.memory = memory
        self.normalize_observations = normalize_observations
        self.normalize_returns = normalize_returns
        self.action_noise = action_noise
        self.param_noise = param_noise
        self.action_range = action_range
        self.return_range = return_range
        self.observation_range = observation_range
        self.critic = critic
        self.actor = actor
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.clip_norm = clip_norm
        self.enable_popart = enable_popart
        self.reward_scale = reward_scale
        self.batch_size = batch_size
        self.stats_sample = None  # Lazily filled by get_stats().
        self.critic_l2_reg = critic_l2_reg

        # Observation normalization.
        if self.normalize_observations:
            with tf.variable_scope('obs_rms'):
                self.obs_rms = RunningMeanStd(shape=observation_shape)
        else:
            self.obs_rms = None
        normalized_obs0 = tf.clip_by_value(normalize(self.obs0, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])
        normalized_obs1 = tf.clip_by_value(normalize(self.obs1, self.obs_rms),
                                           self.observation_range[0], self.observation_range[1])

        # Return normalization.
        if self.normalize_returns:
            with tf.variable_scope('ret_rms'):
                self.ret_rms = RunningMeanStd()
        else:
            self.ret_rms = None

        # Create target networks: shallow copies that differ only by name.
        target_actor = copy(actor)
        target_actor.name = 'target_actor'
        self.target_actor = target_actor
        target_critic = copy(critic)
        target_critic.name = 'target_critic'
        self.target_critic = target_critic

        # Create networks and core TF parts that are shared across setup parts.
        self.actor_tf = actor(normalized_obs0)
        self.normalized_critic_tf = critic(normalized_obs0, self.actions)
        self.critic_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_tf, self.return_range[0], self.return_range[1]), self.ret_rms)
        # Second critic call feeds the actor's own action and reuses the variables.
        self.normalized_critic_with_actor_tf = critic(normalized_obs0, self.actor_tf, reuse=True)
        self.critic_with_actor_tf = denormalize(
            tf.clip_by_value(self.normalized_critic_with_actor_tf, self.return_range[0], self.return_range[1]),
            self.ret_rms)
        Q_obs1 = denormalize(target_critic(normalized_obs1, target_actor(normalized_obs1)), self.ret_rms)
        # One-step bootstrap target; the bootstrap term is masked on terminal transitions.
        self.target_Q = self.rewards + (1. - self.terminals1) * gamma * Q_obs1

        # Set up parts.
        if self.param_noise is not None:
            self.setup_param_noise(normalized_obs0)
        self.setup_actor_optimizer()
        self.setup_critic_optimizer()
        if self.normalize_returns and self.enable_popart:
            self.setup_popart()
        self.setup_stats()
        self.setup_target_network_updates()

    @staticmethod
    def _count_params(shapes):
        """Return the total number of scalar entries described by `shapes`.

        Each shape is a list of ints. The explicit initial value of 1 makes an
        empty shape (a scalar variable) count as one parameter instead of
        making `reduce` raise TypeError on an empty sequence.
        """
        return sum(reduce(lambda x, y: x * y, shape, 1) for shape in shapes)

    def setup_target_network_updates(self):
        """Create the hard (init) and soft update ops for both target networks."""
        actor_init_updates, actor_soft_updates = get_target_updates(self.actor.vars, self.target_actor.vars, self.tau)
        critic_init_updates, critic_soft_updates = get_target_updates(self.critic.vars, self.target_critic.vars,
                                                                      self.tau)
        self.target_init_updates = [actor_init_updates, critic_init_updates]
        self.target_soft_updates = [actor_soft_updates, critic_soft_updates]

    def setup_param_noise(self, normalized_obs0):
        """Build the perturbed actor copies used for parameter-space noise.

        Args:
            normalized_obs0: Normalized, clipped observation tensor fed to the
                perturbed actors.
        """
        assert self.param_noise is not None

        # Configure perturbed actor.
        param_noise_actor = copy(self.actor)
        param_noise_actor.name = 'param_noise_actor'
        self.perturbed_actor_tf = param_noise_actor(normalized_obs0)
        logger.info('setting up param noise')
        self.perturb_policy_ops = get_perturbed_actor_updates(self.actor, param_noise_actor, self.param_noise_stddev)

        # Configure separate copy for stddev adaptation.
        adaptive_param_noise_actor = copy(self.actor)
        adaptive_param_noise_actor.name = 'adaptive_param_noise_actor'
        adaptive_actor_tf = adaptive_param_noise_actor(normalized_obs0)
        self.perturb_adaptive_policy_ops = get_perturbed_actor_updates(self.actor, adaptive_param_noise_actor,
                                                                       self.param_noise_stddev)
        # Root-mean-square difference between unperturbed and perturbed actions.
        self.adaptive_policy_distance = tf.sqrt(tf.reduce_mean(tf.square(self.actor_tf - adaptive_actor_tf)))

    def setup_actor_optimizer(self):
        """Define the actor loss, its flattened gradient and its optimizer."""
        logger.info('setting up actor optimizer')
        # The actor is trained to maximise the critic's value of its own action.
        self.actor_loss = -tf.reduce_mean(self.critic_with_actor_tf)
        actor_shapes = [var.get_shape().as_list() for var in self.actor.trainable_vars]
        actor_nb_params = self._count_params(actor_shapes)
        logger.info('  actor shapes: {}'.format(actor_shapes))
        logger.info('  actor params: {}'.format(actor_nb_params))
        self.actor_grads = U.flatgrad(self.actor_loss, self.actor.trainable_vars, clip_norm=self.clip_norm)
        self.actor_optimizer = MpiAdam(var_list=self.actor.trainable_vars,
                                       beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_critic_optimizer(self):
        """Define the critic loss (plus optional L2 term), gradient and optimizer."""
        logger.info('setting up critic optimizer')
        normalized_critic_target_tf = tf.clip_by_value(normalize(self.critic_target, self.ret_rms),
                                                       self.return_range[0], self.return_range[1])
        self.critic_loss = tf.reduce_mean(tf.square(self.normalized_critic_tf - normalized_critic_target_tf))
        if self.critic_l2_reg > 0.:
            # Regularize only variables named like kernels, excluding the output layer.
            critic_reg_vars = [var for var in self.critic.trainable_vars if
                               'kernel' in var.name and 'output' not in var.name]
            for var in critic_reg_vars:
                logger.info('  regularizing: {}'.format(var.name))
            logger.info('  applying l2 regularization with {}'.format(self.critic_l2_reg))
            critic_reg = tc.layers.apply_regularization(
                tc.layers.l2_regularizer(self.critic_l2_reg),
                weights_list=critic_reg_vars
            )
            self.critic_loss += critic_reg
        critic_shapes = [var.get_shape().as_list() for var in self.critic.trainable_vars]
        critic_nb_params = self._count_params(critic_shapes)
        logger.info('  critic shapes: {}'.format(critic_shapes))
        logger.info('  critic params: {}'.format(critic_nb_params))
        self.critic_grads = U.flatgrad(self.critic_loss, self.critic.trainable_vars, clip_norm=self.clip_norm)
        self.critic_optimizer = MpiAdam(var_list=self.critic.trainable_vars,
                                        beta1=0.9, beta2=0.999, epsilon=1e-08)

    def setup_popart(self):
        """Build ops that rescale the critic output layers after `ret_rms` changes."""
        # See https://arxiv.org/pdf/1602.07714.pdf for details.
        self.old_std = tf.placeholder(tf.float32, shape=[1], name='old_std')
        new_std = self.ret_rms.std
        self.old_mean = tf.placeholder(tf.float32, shape=[1], name='old_mean')
        new_mean = self.ret_rms.mean

        self.renormalize_Q_outputs_op = []
        for vs in [self.critic.output_vars, self.target_critic.output_vars]:
            # Each output layer must be exactly one kernel and one bias with a single output unit.
            assert len(vs) == 2
            M, b = vs
            assert 'kernel' in M.name
            assert 'bias' in b.name
            assert M.get_shape()[-1] == 1
            assert b.get_shape()[-1] == 1
            self.renormalize_Q_outputs_op += [M.assign(M * self.old_std / new_std)]
            self.renormalize_Q_outputs_op += [b.assign((b * self.old_std + self.old_mean - new_mean) / new_std)]

    def setup_stats(self):
        """Collect the diagnostic ops and their names evaluated by `get_stats`."""
        ops = []
        names = []

        if self.normalize_returns:
            ops += [self.ret_rms.mean, self.ret_rms.std]
            names += ['ret_rms_mean', 'ret_rms_std']

        if self.normalize_observations:
            ops += [tf.reduce_mean(self.obs_rms.mean), tf.reduce_mean(self.obs_rms.std)]
            names += ['obs_rms_mean', 'obs_rms_std']

        ops += [tf.reduce_mean(self.critic_tf)]
        names += ['reference_Q_mean']
        ops += [reduce_std(self.critic_tf)]
        names += ['reference_Q_std']

        ops += [tf.reduce_mean(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_mean']
        ops += [reduce_std(self.critic_with_actor_tf)]
        names += ['reference_actor_Q_std']

        ops += [tf.reduce_mean(self.actor_tf)]
        names += ['reference_action_mean']
        ops += [reduce_std(self.actor_tf)]
        names += ['reference_action_std']

        # Same `is not None` test as __init__, so perturbed_actor_tf is
        # referenced exactly when setup_param_noise created it.
        if self.param_noise is not None:
            ops += [tf.reduce_mean(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_mean']
            ops += [reduce_std(self.perturbed_actor_tf)]
            names += ['reference_perturbed_action_std']

        self.stats_ops = ops
        self.stats_names = names

    def pi(self, obs, apply_noise=True, compute_Q=True):
        """Compute an action (and optionally its Q value) for one observation.

        Args:
            obs: A single observation; it is wrapped in a list to form a batch of one.
            apply_noise: Use the perturbed actor (if parameter noise is
                configured) and add action noise (if configured).
            compute_Q: Also evaluate the critic on the chosen actor output.

        Returns:
            `(action, q)`: the flattened action clipped to `action_range`, and
            the critic output, or None when `compute_Q` is false.
        """
        if self.param_noise is not None and apply_noise:
            actor_tf = self.perturbed_actor_tf
        else:
            actor_tf = self.actor_tf
        feed_dict = {self.obs0: [obs]}
        if compute_Q:
            action, q = self.sess.run([actor_tf, self.critic_with_actor_tf], feed_dict=feed_dict)
        else:
            action = self.sess.run(actor_tf, feed_dict=feed_dict)
            q = None
        action = action.flatten()
        if self.action_noise is not None and apply_noise:
            noise = self.action_noise()
            assert noise.shape == action.shape
            action += noise
        action = np.clip(action, self.action_range[0], self.action_range[1])
        return action, q

    def store_transition(self, obs0, action, reward, obs1, terminal1):
        """Append one transition to memory, scaling the reward by `reward_scale`."""
        reward *= self.reward_scale
        self.memory.append(obs0, action, reward, obs1, terminal1)
        if self.normalize_observations:
            self.obs_rms.update(np.array([obs0]))

    def train(self):
        """Run one actor/critic update on a sampled batch.

        Returns:
            `(critic_loss, actor_loss)` as evaluated before the update.
        """
        # Get a batch.
        batch = self.memory.sample(batch_size=self.batch_size)

        if self.normalize_returns and self.enable_popart:
            # Record the statistics before the update so the output layers can be rescaled.
            old_mean, old_std, target_Q = self.sess.run([self.ret_rms.mean, self.ret_rms.std, self.target_Q],
                                                        feed_dict={
                                                            self.obs1: batch['obs1'],
                                                            self.rewards: batch['rewards'],
                                                            self.terminals1: batch['terminals1'].astype('float32'),
                                                        })
            self.ret_rms.update(target_Q.flatten())
            self.sess.run(self.renormalize_Q_outputs_op, feed_dict={
                self.old_std: np.array([old_std]),
                self.old_mean: np.array([old_mean]),
            })
        else:
            target_Q = self.sess.run(self.target_Q, feed_dict={
                self.obs1: batch['obs1'],
                self.rewards: batch['rewards'],
                self.terminals1: batch['terminals1'].astype('float32'),
            })

        # Get all gradients and perform a synced update.
        ops = [self.actor_grads, self.actor_loss, self.critic_grads, self.critic_loss]
        actor_grads, actor_loss, critic_grads, critic_loss = self.sess.run(ops, feed_dict={
            self.obs0: batch['obs0'],
            self.actions: batch['actions'],
            self.critic_target: target_Q,
        })
        self.actor_optimizer.update(actor_grads, stepsize=self.actor_lr)
        self.critic_optimizer.update(critic_grads, stepsize=self.critic_lr)

        return critic_loss, actor_loss

    def initialize(self, sess):
        """Attach `sess`, initialize all variables, sync optimizers and copy weights to the targets."""
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())
        self.actor_optimizer.sync()
        self.critic_optimizer.sync()
        self.sess.run(self.target_init_updates)

    def update_target_net(self):
        """Apply one soft update to the target actor and critic."""
        self.sess.run(self.target_soft_updates)

    def get_stats(self):
        """Evaluate the diagnostic ops on a fixed batch and return them as a dict.

        Returns:
            Mapping from stat name to value, merged with
            `param_noise.get_stats()` when parameter noise is configured.
        """
        if self.stats_sample is None:
            # Get a sample and keep that fixed for all further computations.
            # This allows us to estimate the change in value for the same set of inputs.
            self.stats_sample = self.memory.sample(batch_size=self.batch_size)
        values = self.sess.run(self.stats_ops, feed_dict={
            self.obs0: self.stats_sample['obs0'],
            self.actions: self.stats_sample['actions'],
        })

        names = self.stats_names[:]
        assert len(names) == len(values)
        stats = dict(zip(names, values))

        if self.param_noise is not None:
            stats = {**stats, **self.param_noise.get_stats()}

        return stats

    def adapt_param_noise(self):
        """Adapt the parameter-noise scale from the measured policy distance.

        Returns:
            0. when parameter noise is disabled; otherwise the distance
            averaged over all MPI workers.
        """
        if self.param_noise is None:
            return 0.

        # Perturb a separate copy of the policy to adjust the scale for the next "real" perturbation.
        batch = self.memory.sample(batch_size=self.batch_size)
        self.sess.run(self.perturb_adaptive_policy_ops, feed_dict={
            self.param_noise_stddev: self.param_noise.current_stddev,
        })
        distance = self.sess.run(self.adaptive_policy_distance, feed_dict={
            self.obs0: batch['obs0'],
            self.param_noise_stddev: self.param_noise.current_stddev,
        })

        mean_distance = MPI.COMM_WORLD.allreduce(distance, op=MPI.SUM) / MPI.COMM_WORLD.Get_size()
        self.param_noise.adapt(mean_distance)
        return mean_distance

    def reset(self):
        """Reset the noise sources and re-perturb the actor after an episode."""
        # Reset internal state after an episode is complete.
        if self.action_noise is not None:
            self.action_noise.reset()
        if self.param_noise is not None:
            self.sess.run(self.perturb_policy_ops, feed_dict={
                self.param_noise_stddev: self.param_noise.current_stddev,
            })
예제 #15
0
def learn(
        env,
        policy_func,
        *,
        timesteps=4,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        save_per_iter=100,
        ckpt_dir=None,
        task="train",
        sample_stochastic=True,
        load_model_path=None,
        task_name=None,
        max_sample_traj=1500):
    """Train a policy with the clipped PPO surrogate on windows of observations.

    Two policies, "pi" and "oldpi", are built with
    `policy_func(name, timesteps, ob_space, ac_space)`. The policy variables
    are optimized on `pol_surr + pol_entpen`; variables whose second name
    component starts with "vf" are optimized separately on the squared
    value error, each group with its own `MpiAdam`.

    Every minibatch of observations is turned into `optim_batchsize`
    sliding windows of length `timesteps`; the first windows are padded with
    `seg["ob_reset"]`.

    Args:
        env: Environment providing `observation_space` and `action_space`.
        policy_func: Factory called as described above.
        timesteps: Length of the observation window fed to the policy.
        timesteps_per_batch: Passed to the trajectory generators.
        clip_param: Clipping parameter, multiplied by the schedule multiplier.
        entcoeff: Entropy coefficient.
        optim_epochs: Number of optimization epochs per iteration.
        optim_stepsize: Base step size for both optimizers.
        optim_batchsize: Minibatch size; a falsy value falls back to the
            number of collected observations.
        gamma, lam: Passed to `add_vtarg_and_adv`.
        max_timesteps, max_episodes, max_iters, max_seconds: Stopping
            criteria; exactly one must be positive (asserted).
        callback: Optional callable invoked as `callback(locals(), globals())`
            at the top of every iteration.
        adam_epsilon: Epsilon for both `MpiAdam` optimizers.
        schedule: 'constant' or 'linear'; anything else raises
            NotImplementedError.
        save_per_iter: Checkpoint period in iterations.
        ckpt_dir: Checkpoint directory; saving is skipped when None.
        task: When equal to 'sample_trajectory', trajectories are sampled
            and the process exits via `sys.exit()` instead of training.
        sample_stochastic: Stochastic flag for the episode generator.
        load_model_path, max_sample_traj: Forwarded to `sample_trajectory`.
        task_name: Checkpoint file name under `ckpt_dir`.

    Raises:
        AssertionError: If not exactly one time constraint is positive.
        NotImplementedError: For an unknown `schedule`.
    """
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", timesteps, ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", timesteps, ob_space,
                        ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    # NOTE(review): pi_vpred is not referenced again in the visible code.
    pi_vpred = tf.placeholder(dtype=tf.float32, shape=[None])
    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    #    ob_now = tf.placeholder(dtype=tf.float32, shape=[optim_batchsize, list(ob_space.shape)[0]])
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    # The value loss is deliberately left out of the policy objective here;
    # it is optimized separately through vf_grad / vf_adam below.
    # total_loss = pol_surr + pol_entpen + vf_loss
    total_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    # Split trainable variables by the second component of their name:
    # those starting with "vf" go to the value optimizer, the rest to the policy optimizer.
    var_list = pi.get_trainable_variables()
    vf_var_list = [
        v for v in var_list if v.name.split("/")[1].startswith("vf")
    ]
    pol_var_list = [
        v for v in var_list if not v.name.split("/")[1].startswith("vf")
    ]
    #  lossandgrad = U.function([ob, ac, atarg ,ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, pol_var_list)])
    vf_grad = U.function([ob, ac, atarg, ret, lrmult],
                         U.flatgrad(vf_loss, vf_var_list))

    # adam = MpiAdam(var_list, epsilon=adam_epsilon)
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)

    # Copies every variable of pi into the matching variable of oldpi.
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    #adam.sync()
    pol_adam.sync()
    vf_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     timesteps,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)
    traj_gen = traj_episode_generator(pi,
                                      env,
                                      timesteps_per_batch,
                                      stochastic=sample_stochastic)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    # Threshold for the "new best mean reward" printout further down.
    EpRewMean_MAX = 2.5e3
    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    if task == 'sample_trajectory':
        # not elegant, i know :(
        # Sampling mode: no training happens and the whole process exits.
        sample_trajectory(load_model_path, max_sample_traj, traj_gen,
                          task_name, sample_stochastic)
        sys.exit()

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            # NOTE(review): divides by max_timesteps, which is 0 unless the
            # timestep budget is the chosen time constraint.
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)
        # if(iters_so_far == 1):
        #     a = 1
        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, vpred, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "vpred"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        # shuffle=False keeps the samples in order, which the sliding-window
        # construction below depends on.
        d = Dataset(
            dict(ob=ob, ac=ac, atarg=atarg, vpred=vpred, vtarg=tdlamret),
            shuffle=False
        )  #d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vpred = vpred, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            # Padding for the first windows: timesteps - 1 copies of the reset observation.
            pre_obs = [seg["ob_reset"] for jmj in range(timesteps - 1)]
            for batch in d.iterate_once(optim_batchsize):
                ## Feed ob: re-process ob by inserting timesteps-1 env.reset observations at the
                ## front of batch["ob"], then split batch["ob"] with a sliding window.
                ob_now = np.append(pre_obs, batch['ob']).reshape(
                    optim_batchsize + timesteps - 1,
                    list(ob_space.shape)[0])
                # Carry the last timesteps - 1 observations over as the prefix of the next batch.
                # NOTE(review): with timesteps == 1 this slice is ob_now[-0:], i.e. the whole array.
                pre_obs = ob_now[-(timesteps - 1):]
                ob_fin = []
                for jmj in range(optim_batchsize):
                    ob_fin.append(ob_now[jmj:jmj + timesteps])
                *newlosses, g = lossandgrad(ob_fin, batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)  ### the g here seems to be all zeros
                #adam.update(g, optim_stepsize * cur_lrmult)
                pol_adam.update(g, optim_stepsize * cur_lrmult)
                vf_g = vf_grad(ob_fin, batch["ac"], batch["atarg"],
                               batch["vtarg"], cur_lrmult)
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

            # NOTE(review): this second pass runs the same policy and value updates over the
            # data again without recording losses, so each epoch sweeps the data twice —
            # confirm this is intended and not a copy-paste duplicate.
            pre_obs = [seg["ob_reset"] for jmj in range(timesteps - 1)]
            for batch in d.iterate_once(optim_batchsize):
                ## Feed ob: re-process ob by inserting timesteps-1 env.reset observations at the
                ## front of batch["ob"], then split batch["ob"] with a sliding window.
                ob_now = np.append(pre_obs, batch['ob']).reshape(
                    optim_batchsize + timesteps - 1,
                    list(ob_space.shape)[0])
                pre_obs = ob_now[-(timesteps - 1):]
                ob_fin = []
                for jmj in range(optim_batchsize):
                    ob_fin.append(ob_now[jmj:jmj + timesteps])
                *newlosses, g = lossandgrad(ob_fin, batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)  ### the g here seems to be all zeros
                #adam.update(g, optim_stepsize * cur_lrmult)
                pol_adam.update(g, optim_stepsize * cur_lrmult)
                vf_g = vf_grad(ob_fin, batch["ac"], batch["atarg"],
                               batch["vtarg"], cur_lrmult)
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)

        logger.log("Evaluating losses...")
        losses = []
        # Same window construction as in training, but only the losses are evaluated.
        loss_pre_obs = [seg["ob_reset"] for jmj in range(timesteps - 1)]
        for batch in d.iterate_once(optim_batchsize):
            ### feed ob
            ob_now = np.append(loss_pre_obs, batch['ob']).reshape(
                optim_batchsize + timesteps - 1,
                list(ob_space.shape)[0])
            loss_pre_obs = ob_now[-(timesteps - 1):]
            ob_fin = []
            for jmj in range(optim_batchsize):
                ob_fin.append(ob_now[jmj:jmj + timesteps])
            newlosses = compute_losses(ob_fin, batch["ac"], batch["atarg"],
                                       batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # Print the iteration and mean reward whenever the rolling mean sets a new maximum.
        if (np.mean(rewbuffer) > EpRewMean_MAX):
            EpRewMean_MAX = np.mean(rewbuffer)
            print(iters_so_far)
            print(np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        # Only the rank-0 worker writes the table.
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()