Example #1
    def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
        #X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                                    async=1, kfac_update=2, cold_iter=50, \
                                    weight_decay_dict=wd_dict, max_grad_norm=None)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables
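The value-function objective above is ordinary least-squares regression toward the value targets plus the collected weight-decay terms. A minimal NumPy sketch of that objective (the function name and toy numbers are illustrative, not part of the example):

import numpy as np

def value_loss(vpred_n, vtarg_n, wd_losses):
    # mean-squared error against the value targets plus the summed weight-decay terms,
    # mirroring `loss` in the example above (illustrative only)
    return np.mean(np.square(vpred_n - vtarg_n)) + sum(wd_losses)

print(value_loss(np.array([0.5, 1.2, -0.3]), np.array([0.7, 1.0, 0.0]), wd_losses=[1e-3, 2e-3]))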
Example #2
    def policy_loss_ppo(self, pi, oldpi, ac, atarg, ret):
        kl_oldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        mean_kl = U.mean(kl_oldnew)
        mean_ent = U.mean(ent)
        pol_entpen = -self._entcoeff * mean_ent

        action_prob = pi.pd.logp(ac) - oldpi.pd.logp(ac)
        action_loss = tf.exp(action_prob) * atarg

        ratio = tf.exp(action_prob)

        surr1 = ratio * atarg
        surr2 = U.clip(ratio, 1.0 - self._clip_param,
                       1.0 + self._clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        vf_loss = U.mean(tf.square(pi.vpred - ret))
        total_loss = pol_surr + pol_entpen + vf_loss

        losses = {
            'total_loss': total_loss,
            'action_loss': action_loss,
            'pol_surr': pol_surr,
            'pol_entpen': pol_entpen,
            'kl': mean_kl,
            'entropy': mean_ent,
            'vf_loss': vf_loss
        }
        return losses
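For reference, the clipped surrogate that `pol_surr` implements can be written in plain NumPy as below; the function name and toy inputs are illustrative only.

import numpy as np

def ppo_clip_surrogate(logp_new, logp_old, adv, clip_param):
    # ratio = pi_new(a|s) / pi_old(a|s), computed from log-probabilities
    ratio = np.exp(logp_new - logp_old)
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    # pessimistic (clipped) surrogate, negated so it can be minimized
    return -np.mean(np.minimum(surr1, surr2))

print(ppo_clip_surrogate(np.array([-0.9, -1.1]), np.array([-1.0, -1.0]),
                         np.array([0.5, -0.2]), clip_param=0.2))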
Example #3
 def __init__(self, ob_dim, ac_dim):
     # Here we'll construct a bunch of expressions, which will be used in two places:
     # (1) When sampling actions
     # (2) When computing loss functions, for the policy update
     # Variables specific to (1) have the word "sampled" in them,
     # whereas variables specific to (2) have the word "old" in them
     ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
     oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
     oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
     adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
     oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
     wd_dict = {}
     h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
     h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
     mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
     self.wd_dict = wd_dict
     self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
     logstd_1a = tf.expand_dims(logstd_1a, 0)
     std_1a = tf.exp(logstd_1a)
     std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
     ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
     sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
     logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
     logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
     kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
     #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
     surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
     surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
     self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
     #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
     self.compute_kl = U.function([ob_no, oldac_dist], kl)
     self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
     U.initialize() # Initialize uninitialized TF variables
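The `logprob_n` / `logprobsampled_n` expressions are the log-density of a diagonal Gaussian. A NumPy rendering of the same formula (illustrative, not part of the example):

import numpy as np

def diag_gaussian_logp(mean_na, std_na, ac_na):
    # log N(ac | mean, diag(std^2)), matching the logprob_n expression above
    ac_dim = mean_na.shape[1]
    return (- np.sum(np.log(std_na), axis=1)
            - 0.5 * np.log(2.0 * np.pi) * ac_dim
            - 0.5 * np.sum(np.square((ac_na - mean_na) / std_na), axis=1))

# sanity check: a standard normal at 0 has log-density log(1/sqrt(2*pi)) ~ -0.9189
print(diag_gaussian_logp(np.zeros((1, 1)), np.ones((1, 1)), np.zeros((1, 1))))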
Example #4
 def __init__(self, ob_dim, ac_dim):
     # Here we'll construct a bunch of expressions, which will be used in two places:
     # (1) When sampling actions
     # (2) When computing loss functions, for the policy update
     # Variables specific to (1) have the word "sampled" in them,
     # whereas variables specific to (2) have the word "old" in them
     ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
     oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
     oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
     adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
     oldlogprob_n = tf.placeholder(tf.float32, shape=[None], name='oldlogprob') # log probability of previous actions
     wd_dict = {}
     h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
     h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
     mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
     self.wd_dict = wd_dict
     self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
     logstd_1a = tf.expand_dims(logstd_1a, 0)
     std_1a = tf.exp(logstd_1a)
     std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
     ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
     sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
     logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
     logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
     kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
     #kl = .5 * U.mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
     surr = - U.mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
     surr_sampled = - U.mean(logprob_n) # Sampled loss of the policy
     self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
     #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
     self.compute_kl = U.function([ob_no, oldac_dist], kl)
     self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
     U.initialize() # Initialize uninitialized TF variables
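The `kl_div(oldac_dist, ac_dist, ac_dim)` helper is not shown in these examples; assuming the distributions are packed as [mean, std], the standard closed form for the KL between two diagonal Gaussians looks like the sketch below (an assumption about that helper, not its actual implementation).

import numpy as np

def diag_gaussian_kl(old_dist, new_dist, ac_dim):
    # KL(old || new) for diagonal Gaussians packed as [mean, std]
    mu0, std0 = old_dist[:, :ac_dim], old_dist[:, ac_dim:]
    mu1, std1 = new_dist[:, :ac_dim], new_dist[:, ac_dim:]
    return np.sum(np.log(std1 / std0)
                  + (np.square(std0) + np.square(mu0 - mu1)) / (2.0 * np.square(std1))
                  - 0.5, axis=1)

d = np.array([[0.0, 1.0]])                # mean 0, std 1
print(diag_gaussian_kl(d, d, ac_dim=1))   # KL of a distribution with itself -> [0.]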
Example #5
 def __init__(self, ob_dim, ac_dim, ac_space, bins):
     # Here we'll construct a bunch of expressions, which will be used in two places:
     # (1) When sampling actions
     # (2) When computing loss functions, for the policy update
     # Variables specific to (1) have the word "sampled" in them,
     # whereas variables specific to (2) have the word "old" in them
     ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2],
                            name="ob")  # batch of observations
     oldac_na = tf.placeholder(
         tf.int32, shape=[None, ac_dim],
         name="ac")  # batch of previous actions
     oldac_logits = tf.placeholder(
         tf.float32, shape=[None, ac_dim * bins], name="oldac_logit"
     )  # batch of previous action distributions
     adv_n = tf.placeholder(tf.float32, shape=[None],
                            name="adv")  # advantage function estimate
     self.pdtype = make_pdtype(ac_space)
     wd_dict = {}
     # forward pass
     h1 = tf.nn.tanh(
         dense(ob_no,
               64,
               "h1",
               weight_init=U.normc_initializer(1.0),
               bias_init=0.0,
               weight_loss_dict=wd_dict))
     h2 = tf.nn.tanh(
         dense(h1,
               64,
               "h2",
               weight_init=U.normc_initializer(1.0),
               bias_init=0.0,
               weight_loss_dict=wd_dict))
     logits_na = dense(h2,
                       self.pdtype.param_shape()[0],
                       "logits",
                       weight_init=U.normc_initializer(0.1),
                       bias_init=0.0,
                       weight_loss_dict=wd_dict)  # Mean control
     self.wd_dict = wd_dict
     self.pd = self.pdtype.pdfromflat(
         logits_na)  # multi-categorical distributions
     # sample action for control
     sampled_ac_na = self.pd.sample()
     # log prob for sampled actions
     logprobsampled_n = -self.pd.neglogp(sampled_ac_na)
     logprob_n = -self.pd.neglogp(oldac_na)
     # kl div
     old_pd = self.pdtype.pdfromflat(oldac_logits)
     kl = U.mean(old_pd.kl(self.pd))
     # surr loss
     surr = -U.mean(adv_n * logprob_n)
     surr_sampled = -U.mean(logprob_n)
     # expressions
     self._act = U.function([ob_no],
                            [sampled_ac_na, logits_na, logprobsampled_n])
     self.compute_kl = U.function([ob_no, oldac_logits], kl)
     self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled)
     U.initialize()
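For this example, `self.pd.neglogp` reduces to a softmax cross-entropy per action dimension; the comment above notes the distribution is multi-categorical, handling several binned dimensions at once. A single-categorical NumPy sketch of that quantity (illustrative only):

import numpy as np

def categorical_neglogp(logits, actions):
    # negative log-probability of integer actions under softmax(logits)
    logits = logits - logits.max(axis=1, keepdims=True)           # numerical stability
    logp = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return -logp[np.arange(len(actions)), actions]

print(categorical_neglogp(np.array([[2.0, 0.0, 0.0]]), np.array([0])))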
Example #6
    def __init__(
        self,
        ob_space,
        ac_space,
        model_func,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        adam_epsilon=1e-5,
    ):

        with tf.variable_scope('pi'):
            self.pi = pi = model_func(ob_space, ac_space)

        with tf.variable_scope('pi_old'):
            self.pi_old = pi_old = model_func(ob_space, ac_space)

        self.adv = tf.placeholder(
            dtype=tf.float32, shape=[None],
            name='adv')  # Target advantage function (if applicable)
        self.ret = tf.placeholder(dtype=tf.float32, shape=[None],
                                  name='ret')  # Empirical return

        self.lrmult = tf.placeholder(
            name='lrmult', dtype=tf.float32,
            shape=[])  # learning rate multiplier, updated with schedule
        clip_param = clip_param * self.lrmult  # Annealed clipping parameter epsilon

        self.ac = ac = pi.pdtype.sample_placeholder([None])

        kloldnew = pi_old.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        meankl = U.mean(kloldnew)
        meanent = U.mean(ent)
        pol_entpen = (-entcoeff) * meanent

        ratio = tf.exp(pi.pd.logp(ac) - pi_old.pd.logp(ac))  # pnew / pold
        surr1 = ratio * self.adv  # surrogate from conservative policy iteration
        surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * self.adv  #
        pol_surr = -U.mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        vf_loss = U.mean(tf.square(pi.vpred - self.ret))
        self.total_loss = pol_surr + pol_entpen + vf_loss

        # gradients
        self.grads = tf.gradients(self.total_loss, pi.train_vars)
        self.flat_grads = U.flatgrad(self.total_loss, pi.train_vars)

        # optimizer
        self.optimizer = MpiAdam(pi.train_vars, epsilon=adam_epsilon)

        # assign new pi to old pi
        self.op_assign_old_eq_new = tf.group(*[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(pi_old.global_vars, pi.global_vars)
        ])

        U.initialize()
        self.optimizer.sync()
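`op_assign_old_eq_new` copies the current policy's variables into the frozen "old" policy before each update. The same pattern with plain NumPy parameter dictionaries (names here are illustrative):

import numpy as np

def assign_old_eq_new(old_params, new_params):
    # overwrite every 'old' parameter with a copy of the current one
    for name, value in new_params.items():
        old_params[name] = value.copy()

old = {'w': np.zeros(3)}
new = {'w': np.ones(3)}
assign_old_eq_new(old, new)
print(old['w'])   # -> [1. 1. 1.]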
Example #7
    def _init(self, ob_dim, act_dim, num_units=3, num_layers=4, batch=None):
        assert batch is not None
        self.batch = batch
        ob_act = tf.placeholder(tf.float32,
                                shape=[1, ob_dim * 2],
                                name="ob_act")
        ob_train = tf.placeholder(tf.float32,
                                  shape=[batch, ob_dim * 2],
                                  name="ob_train")
        oldac_na = tf.placeholder(tf.float32,
                                  shape=[batch, act_dim],
                                  name="ac")
        action_act = tf.placeholder(tf.float32,
                                    shape=[1, act_dim],
                                    name="ac_act")
        oldac_dist = tf.placeholder(
            tf.float32, shape=[batch],
            name="oldac_dist")  # logprob for old actions
        adv_n = tf.placeholder(tf.float32, shape=[batch], name="adv")
        wd_dict = {}

        # module for execution and training
        policy_train = NormalizingFlowStateModel(ob_train,
                                                 oldac_na,
                                                 name='policy',
                                                 reuse=False,
                                                 num_units=num_units,
                                                 num_layers=num_layers)
        policy_act = NormalizingFlowStateModel(ob_act,
                                               action_act,
                                               name='policy',
                                               reuse=True,
                                               num_units=num_units,
                                               num_layers=num_layers)

        # weight decay
        self.wd_dict = {}  # TODO

        # action for execution
        self.pi_act = policy_act.y_sample
        self.log_prob_act = policy_act.log_prob

        # kl divergence
        ac_dist = policy_train.log_prob  # logprob
        kl = U.mean(oldac_dist - ac_dist)  # sample based kl

        # surr loss
        surr = -U.mean(adv_n * ac_dist)
        surr_sampled = -U.mean(ac_dist)

        # functions
        self._act = U.function([ob_act], self.pi_act)
        self._act_logprob = U.function([ob_act, action_act], self.log_prob_act)
        self.compute_kl = U.function([ob_train, oldac_na, oldac_dist], kl)
        self.update_info = ((ob_train, oldac_na, adv_n), surr, surr_sampled)
        U.initialize()
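Here the KL term is estimated from samples: `oldac_dist` holds log-probabilities of the executed actions under the old policy, and `ac_dist` those under the current one. A NumPy version of that estimator (illustrative):

import numpy as np

def sample_kl(old_logp, new_logp):
    # Monte-Carlo estimate of KL(old || new) from actions sampled by the old policy
    return np.mean(old_logp - new_logp)

print(sample_kl(np.array([-1.0, -1.2]), np.array([-1.1, -1.3])))   # -> ~0.1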
Example #8
def load_policy(env, policy_func, *,
                clip_param, entcoeff,  # clipping parameter epsilon, entropy coeff
                adam_epsilon=1e-5,
                model_path, checkpoint):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = - U.mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    U.load_state(os.path.join(model_path, 'model-{}'.format(checkpoint)))

    return pi
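`lrmult` is fed from the training loop and also anneals the clip parameter. Under the usual 'linear' schedule it is simply the remaining fraction of training, mirroring the schedule code further below (a sketch):

def linear_lrmult(timesteps_so_far, max_timesteps):
    # remaining fraction of training, clipped at zero
    return max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0)

print(linear_lrmult(250_000, 1_000_000))   # -> 0.75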
Example #9
    def policy_loss_ppo(self, pi, oldpi, ac, atarg, ret, term=None, entcoeff=None):
        kl_oldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        mean_kl = U.mean(kl_oldnew)
        mean_ent = U.mean(ent)
        entcoeff = self._entcoeff if entcoeff is None else entcoeff
        logger.info('Policy {} entropy {}'.format(pi.name, entcoeff))
        pol_entpen = -entcoeff * mean_ent

        action_prob = pi.pd.logp(ac) - oldpi.pd.logp(ac)
        action_prob = tf.check_numerics(action_prob, 'check action_prob')
        action_loss = tf.check_numerics(atarg, 'check atarg')
        action_loss = tf.exp(action_prob) * atarg
        action_loss = tf.check_numerics(action_loss, 'check action_loss')

        term_loss = None
        if term is not None:
            # ignore prob of actions if term is True
            action_prob = (1 - tf.to_float(term)) * action_prob
            if pi.term_activation == 'sigmoid':
                term_prob = tf.log(pi.term_pred + 1e-5) - tf.clip_by_value(tf.log(oldpi.term_pred + 1e-5), -20, 20)
            else:
                term_prob = pi.term_pd.logp(term) - tf.clip_by_value(oldpi.term_pd.logp(term), -20, 20)
            action_prob += term_prob
            term_loss = tf.exp(term_prob) * atarg
        ratio = tf.exp(action_prob)

        surr1 = ratio * atarg
        surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        vf_loss = U.mean(tf.square(pi.vpred - ret))
        pol_surr = tf.check_numerics(pol_surr, 'check pol_surr')
        vf_loss = tf.check_numerics(vf_loss, 'check vf_loss')
        total_loss = pol_surr + pol_entpen + vf_loss

        total_loss = tf.check_numerics(total_loss, 'check total_loss')
        losses = {'total_loss': total_loss,
                  'action_loss': action_loss,
                  'pol_surr': pol_surr,
                  'pol_entpen': pol_entpen,
                  'kl': mean_kl,
                  'entropy': mean_ent,
                  'vf_loss': vf_loss}
        if term_loss is not None:
            losses.update({'term_loss': term_loss})
        return losses
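When a termination signal is provided, the action log-ratio is masked out for terminating steps and the termination log-ratio is added instead. The same arithmetic in NumPy (illustrative):

import numpy as np

def masked_log_ratio(action_logratio, term_logratio, term):
    # drop the action log-ratio where the option terminated, keep the termination log-ratio
    term = term.astype(np.float64)
    return (1.0 - term) * action_logratio + term_logratio

print(masked_log_ratio(np.array([0.2, 0.3]), np.array([0.05, -0.1]), np.array([0, 1])))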
Example #10
    def __init__(self, sess, ob_dim, ac_dim, vf_lr=0.001, cv_lr=0.001, reuse=False):
        # Here we'll construct a bunch of expressions, which will be used in two places:
        # (1) When sampling actions
        # (2) When computing loss functions, for the policy update
        # Variables specific to (1) have the word "sampled" in them,
        # whereas variables specific to (2) have the word "old" in them
        self.relaxed = False
        self.X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        self.ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
        self.oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
        oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
        
        with tf.variable_scope("model", reuse=reuse):
            h1 = tf.nn.tanh(dense(self.ob_no, 64, "pi_h1", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            h2 = tf.nn.tanh(dense(h1, 64, "pi_h2", weight_init=U.normc_initializer(1.0), bias_init=0.0))
            mean_na = dense(h2, ac_dim, "pi", weight_init=U.normc_initializer(0.1), bias_init=0.0) # Mean control output
            self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
            logstd_1a = tf.expand_dims(logstd_1a, 0)
            self.std_1a = tf.exp(logstd_1a)
            self.std_na = tf.tile(self.std_1a, [tf.shape(mean_na)[0], 1])
            ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(self.std_na, [-1, ac_dim])], 1)
            sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
            logprobsampled_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
            self.logprob_n = - U.sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * U.sum(tf.square(ac_dist[:,:ac_dim] - self.oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
            kl = U.mean(kl_div(oldac_dist, ac_dist, ac_dim))
        

            vh1 = tf.nn.elu(dense(self.X, 64, "vf_h1", weight_init=U.normc_initializer(1.0), bias_init=0))
            vh2 = tf.nn.elu(dense(vh1, 64, "vf_h2", weight_init=U.normc_initializer(1.0), bias_init=0))
            vpred_n = dense(vh2, 1, "vf", weight_init=None, bias_init=0)
            v0 = vpred_n[:, 0]
            self.vf_optim = tf.train.AdamOptimizer(vf_lr)
        
        def act(ob):
            ac, dist, logp = sess.run([sampled_ac_na, ac_dist, logprobsampled_n], {self.ob_no: ob[None]})  # Generate a new action and its logprob
            return ac[0], dist[0], logp[0]
        def value(obs, x):
            return sess.run(v0, {self.X: x, self.ob_no:obs})
        def preproc(path):
            l = pathlength(path)
            al = np.arange(l).reshape(-1,1)/10.0
            act = path["action_dist"].astype('float32')
            X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
            return X
        def predict(obs, path):
            return value(obs, preproc(path))
        def compute_kl(ob, dist):
            return sess.run(kl, {self.ob_no: ob, oldac_dist: dist})
            
        self.mean = mean_na
        self.vf = v0
        self.act = act
        self.value = value
        self.preproc = preproc
        self.predict = predict
        self.compute_kl = compute_kl
        self.a0 = sampled_ac_na
Example #11
    def policy_loss_trpo(self, pi, oldpi, ob, ac, atarg, ret):
        raise NotImplementedError()
        kl_oldnew = oldpi.pd.kl(pi.pd)
        ent = pi.pd.entropy()
        mean_kl = U.mean(kl_oldnew)
        mean_ent = U.mean(ent)
        pol_entpen = -self._entcoeff * mean_ent

        vf_loss = U.mean(tf.square(pi.vpred - ret))

        ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))
        pol_surr = U.mean(ratio * atarg)
        pol_loss = pol_surr + pol_entpen

        losses = {'pol_loss': pol_loss,
                  'pol_surr': pol_surr,
                  'pol_entpen': pol_entpen,
                  'kl': mean_kl,
                  'entropy': mean_ent,
                  'vf_loss': vf_loss}
        return losses
Example #12
    def policy_loss_ppo_term(self, pi, oldpi, atarg, ret, term):
        if pi.term_type == 'sigmoid':
            term_prob = tf.log(pi.term_pred + 1e-5) - tf.clip_by_value(tf.log(oldpi.term_pred + 1e-5), -20, 20)
        else:
            term_prob = pi.term_pd.logp(term) - tf.clip_by_value(oldpi.term_pd.logp(term), -20, 20)
        term_loss = tf.exp(term_prob) * atarg
        ratio = tf.exp(term_prob)

        surr1 = ratio * atarg
        surr2 = U.clip(ratio, 1.0 - self._clip_param, 1.0 + self._clip_param) * atarg
        pol_surr = -U.mean(tf.minimum(surr1, surr2))
        vf_loss = U.mean(tf.square(pi.vpred - ret))
        pol_surr = tf.check_numerics(pol_surr, 'check pol_surr')
        vf_loss = tf.check_numerics(vf_loss, 'check vf_loss')
        total_loss = pol_surr + vf_loss

        total_loss = tf.check_numerics(total_loss, 'check total_loss')
        losses = {'total_loss': total_loss,
                  'pol_surr': pol_surr,
                  'vf_loss': vf_loss,
                  'term_loss': term_loss}
        return losses
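In the sigmoid branch, the termination ratio is built from floored and clipped log-probabilities to keep it numerically safe. A NumPy sketch of that log-ratio (illustrative):

import numpy as np

def termination_log_ratio(term_pred_new, term_pred_old):
    # log(p_new + 1e-5) - clip(log(p_old + 1e-5), -20, 20), as in the sigmoid branch above
    return (np.log(term_pred_new + 1e-5)
            - np.clip(np.log(term_pred_old + 1e-5), -20.0, 20.0))

print(termination_log_ratio(np.array([0.9]), np.array([0.5])))   # ~ log(0.9/0.5)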
Example #13
    def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
        X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                                    async=1, kfac_update=2, cold_iter=50, \
                                    weight_decay_dict=wd_dict, max_grad_norm=None)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables
Example #14
def learn(env, policy_func, *,
          timesteps_per_batch,  # what to train on
          log_every=None,
          log_dir=None,
          episodes_so_far=0, timesteps_so_far=0, iters_so_far=0,
          max_kl, cg_iters,
          gamma, lam,  # advantage estimation
          entcoeff=0.0,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          vf_iters=3,
          max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
          callback=None,
          **kwargs
          ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    # Target advantage function (if applicable)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g * tangent)
                    for (g, tangent) in zipsame(klgrads, tangents)])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([], [], updates=[tf.assign(oldv, newv)
                                                    for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    # GRASPING
    saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1)
    checkpoint = tf.train.latest_checkpoint(log_dir)
    if checkpoint:
        print("Restoring checkpoint: {}".format(checkpoint))
        saver.restore(U.get_session(), checkpoint)
    if hasattr(env, "set_actor"):
        def actor(obs):
            return pi.act(False, obs)[0]
        env.set_actor(actor)
    if not checkpoint and hasattr(env, "warm_init_eps"):
        pretrain(pi, env)
        saver.save(U.get_session(), osp.join(log_dir, "model"))
    # /GRASPING
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    tstart = time.time()

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    while True:
        if callback:
            callback(locals(), globals())
        should_break = False
        if max_timesteps and timesteps_so_far >= max_timesteps:
            should_break = True
        elif max_episodes and episodes_so_far >= max_episodes:
            should_break = True
        elif max_iters and iters_so_far >= max_iters:
            should_break = True

        if log_every and log_dir:
            if (iters_so_far + 1) % log_every == 0 or should_break:
                # To reduce space, don't specify global step.
                saver.save(U.get_session(), osp.join(log_dir, "model"))

            job_info = {'episodes_so_far': episodes_so_far,
                        'iters_so_far': iters_so_far, 'timesteps_so_far': timesteps_so_far}
            with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file:
                yaml.dump(job_info, file, default_flow_style=False)
                # Make sure write is instantaneous.
                file.flush()
                os.fsync(file)
            os.rename(osp.join(log_dir, "job_info_new.yaml"), osp.join(log_dir, "job_info.yaml"))

        if should_break:
            break

        logger.log("********** Iteration %i ************" % iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-10)  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"):
            pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]
        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)

        meanlosses = None
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        if meanlosses is not None:
            for (lossname, lossval) in zip(loss_names, meanlosses):
                logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                         include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))

        logger.record_tabular("EpLenMean", np.mean(lens))
        logger.record_tabular("EpRewMean", np.mean(rews))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
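The natural-gradient step above is rescaled so that the local quadratic approximation of the KL, 0.5 * s^T F s, equals max_kl before the backtracking line search starts. A small NumPy sketch of that rescaling (illustrative):

import numpy as np

def scale_step(stepdir, fisher_vector_product, max_kl):
    # shs = 0.5 * s^T F s; dividing by lm makes the quadratic KL estimate equal max_kl
    shs = 0.5 * stepdir.dot(fisher_vector_product(stepdir))
    lm = np.sqrt(shs / max_kl)
    return stepdir / lm

# toy example with F = identity: the returned step has squared norm 2 * max_kl
print(scale_step(np.array([1.0, 0.0]), lambda p: p, max_kl=0.01))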
Example #15
def learn(env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0
        ):


    optim_batchsize_ideal = optim_batchsize 
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping
    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app
    # This variable ("version_name") defines the name of the training run
    version_name = '25er_alternation_SEPARATE_optimization-ppo-ESCH-1-0-0-nI' 

    dirname = '{}_{}_{}opts_saves/'.format(version_name,gamename,num_options)
    print (dirname)

    # retrieve everything using relative paths. Create a train_results folder where the repo has been cloned
    dirname_rel = os.path.dirname(__file__)
    splitted = dirname_rel.split("/")
    dirname_rel = ("/".join(dirname_rel.split("/")[:len(splitted)-3])+"/")
    dirname = dirname_rel + "train_results/" + dirname

    # if saving -> create the necessary directories
    if wsaves:
        first=True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False

        # also copy the original source files into the folder where the training results are stored

        files = ['pposgd_simple.py','mlp_policy.py','run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(dirname_rel,'baselines/baselines/ppo1/') + files[i]
            print (src)
            #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print (dest)
            shutil.copy2(src,dest)
        # brute force copy normal env file at end of copying process:
        src = os.path.join(dirname_rel,'nfunk/envs_nf/pendulum_nf.py')          
        shutil.copy2(src,dest)
        shutil.copy2(src,dest)
        os.makedirs(dest+"assets/")
        src = os.path.join(dirname_rel,'nfunk/envs_nf/assets/clockwise.png')
        shutil.copy2(src,dest+"assets/")
    ###


    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    max_action = env.action_space.high

    # add the dimension in the observation space!
    ob_space.shape =((ob_space.shape[0] + ac_space.shape[0]),)
    print (ob_space.shape)
    print (ac_space.shape)

    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function 
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return
    pol_ov_op_ent = tf.placeholder(dtype=tf.float32, shape=None) # Entropy coefficient for policy over options


    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon for PPO


    # set up observation, option, and termination advantage placeholders
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv', dtype=tf.float32, shape=[None])

    # create variable for action
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    # probability of choosing the action under the new policy vs. the old policy (PPO ratio)
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) 
    # advantage of choosing the action
    atarg_clip = atarg
    # surrogate 1:
    surr1 = ratio * atarg_clip #atarg # surrogate from conservative policy iteration
    # surrogate 2:
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg_clip 
    # PPO's pessimistic surrogate (L^CLIP)
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) 

    # Loss on the Q-function
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    # calculate the total loss
    total_loss = vf_loss
    intra_op = pol_surr 
    
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    # log-probability of the policy over options
    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0))
    # log-probability of the policy over options under the old parameters
    old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0))
    # calculate entropy of policy over options
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)

    # calculate the ppo update for the policy over options:
    ratio_pol_ov_op = tf.exp(tf.transpose(log_pi)[option[0]] - tf.transpose(old_log_pi)[option[0]]) # pnew / pold
    term_adv_clip = term_adv 
    surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip # surrogate from conservative policy iteration
    surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param, 1.0 + clip_param) * term_adv_clip #
    pol_surr_pol_ov_op = - U.mean(tf.minimum(surr1_pol_ov_op, surr2_pol_ov_op)) # PPO's pessimistic surrogate (L^CLIP)
    
    op_loss = pol_surr_pol_ov_op - pol_ov_op_ent*tf.reduce_sum(entropy)

    # add loss of policy over options to total loss
    #total_loss += op_loss
    total_loss1 = total_loss + intra_op
    total_loss2 = total_loss + op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    # define function that we will later do gradient descent on
    lossandgrad1 = U.function([ob, ac, atarg, ret, lrmult,option, term_adv,pol_ov_op_ent], losses + [U.flatgrad(total_loss1, var_list)])
    lossandgrad2 = U.function([ob, ac, atarg, ret, lrmult,option, term_adv,pol_ov_op_ent], losses + [U.flatgrad(total_loss2, var_list)])
    
    # define adam optimizer
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # define function that will assign the current parameters to the old policy
    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)


    U.initialize()
    adam.sync()


    # Everything needed for training is now defined; from here on we start the execution:

    # initialize "savers" which will store the results
    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)


    ### Define the names of the .csv files that are going to be stored
    results=[]
    if saves:
        results = open(dirname + version_name + '_' + gamename +'_'+str(num_options)+'opts_'+'_results.csv','w')
        results_best_model = open(dirname + version_name + '_' + gamename +'_'+str(num_options)+'opts_'+'_bestmodel.csv','w')


        out = 'epoch,avg_reward'

        for opt in range(num_options): out += ',option {} dur'.format(opt)
        for opt in range(num_options): out += ',option {} std'.format(opt)
        for opt in range(num_options): out += ',option {} term'.format(opt)
        for opt in range(num_options): out += ',option {} adv'.format(opt)
        out+='\n'
        results.write(out)
        results.flush()

    # special case: if training is run with the epoch argument, a previously saved model is loaded
    if epoch >= 0:
        
        dirname = '{}_{}opts_saves/'.format(gamename,num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename,epoch)
        saver.restore(U.get_session(),filename)
    ###    


    # start training
    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    alternating_frequency = 25 # defines after how many epochs we switch optimizing between control and communication
    des_pol_op_ent = 0.1    # define policy over options entropy scheduling
    max_val = -100000       # define max_val, this will be updated to always store the best model
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True, num_options=num_options,saves=saves,results=results,rewbuffer=rewbuffer,dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        # Sample (s,a)-Transitions
        seg = seg_gen.__next__()
        # Calculate A(s,a,o) using GAE
        add_vtarg_and_adv(seg, gamma, lam)


        # calculate information for logging
        opt_d = []
        for i in range(num_options):
            dur = np.mean(seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_d)             
        print("mean op pol:", np.mean(np.array(seg['optpol_p']),axis=0))         
        print("mean term p:", np.mean(np.array(seg['term_p']),axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']),axis=0))
       

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy
        if hasattr(pi, "ob_rms_only"): pi.ob_rms_only.update(ob[:,:-ac_space.shape[0]]) # update running mean/std for policy
        assign_old_eq_new() # set old parameter values to new parameter values

        # every 1000 iterations, decay the entropy scheduling coefficient
        #if ((iters_so_far+1)%1000 and (iters_so_far+1)>=2000) == 0:
        if ((iters_so_far+1)%1000) == 0:
            des_pol_op_ent = des_pol_op_ent/10

        # every 50 iterations, save the current model weights
        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(gamename,iters_so_far)
            save_path = saver.save(U.get_session(),filename)

        # adaptively save the best model: if the current average reward is the highest so far, save it
        if (np.mean(rewbuffer)>max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: '+str(iters_so_far) + 'rew: ' + str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            filename = dirname + 'best.ckpt'.format(gamename,iters_so_far)
            save_path = saver_best.save(U.get_session(),filename)



        # minimum batch size:
        min_batch=160 
        t_advs = [[] for _ in range(num_options)]
        
        # select all the samples concerning one of the options
        # Note: the update first uses all samples from option 0, then all samples from option 1
        for opt in range(num_options):
            indices = np.where(opts==opt)[0]
            print("batch size:",indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue


            ### This part is only necessary when we use options. These checks ensure we do not discard any collected trajectories.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'],ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'],ac[indices]))
                    cat_atarg = np.concatenate((oldmap['atarg'],atarg[indices]))
                    cat_vtarg = np.concatenate((oldmap['vtarg'],tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n < min_batch) or (indices.size > min_batch and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'],ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'],ac[indices]))
                    cat_atarg = np.concatenate((oldmap['atarg'],atarg[indices]))
                    cat_vtarg = np.concatenate((oldmap['vtarg'],tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob, ac=cat_ac, atarg=cat_atarg, vtarg=cat_vtarg), shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices], ac=ac[indices], atarg=atarg[indices], vtarg=tdlamret[indices]), shuffle=not pi.recurrent)
            ###


            # define the batchsize of the optimizer:
            optim_batchsize = optim_batchsize or ob.shape[0]
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")


            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [] # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    # Calculate advantage for using specific option here
                    tadv,nodc_adv = pi.get_opt_adv(batch["ob"],[opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    # calculate the gradient
                    #VAR 1:
                    #if ((iters_so_far+1)>=2000):
                    #    *newlosses, grads = lossandgrad2(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)
                    #else:
                    #    *newlosses, grads = lossandgrad1(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)

                    if (int((iters_so_far)/alternating_frequency)%2==1):
                        *newlosses, grads = lossandgrad2(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)
                    else:
                        #print ("optim comm always")
                        *newlosses, grads = lossandgrad1(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv,des_pol_op_ent)

                    # perform gradient update
                    adam.update(grads, optim_stepsize * cur_lrmult) 
                    losses.append(newlosses)


        # do logging:
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{}"
            for _ in range(num_options): out+=",{},{},{},{}"
            out+="\n"
            

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options): info.append(opt_d[i])
            for i in range(num_options): info.append(std[i])
            for i in range(num_options): info.append(np.mean(np.array(seg['term_p']),axis=0)[i])
            for i in range(num_options): 
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
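# A tiny standalone sketch of the alternating-objective schedule used in the
# optimization loop above: every `alternating_frequency` iterations the update
# switches between lossandgrad1 and lossandgrad2. The helper below is
# illustrative only and is not part of the baselines code.
def use_second_objective(iters_so_far, alternating_frequency):
    """Return True on the iterations where lossandgrad2 should be used."""
    return (int(iters_so_far / alternating_frequency) % 2) == 1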
Example #16
0
def cmaml_loss(neglogpacs, advantage):
    #  add in correction term.
    mean_adv = U.mean(advantage)
    exploration_term = U.mean(neglogpacs) * mean_adv
    return exploration_term
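# A minimal NumPy check of the correction term built by cmaml_loss above: the
# mean negative log-probability of the taken actions times the mean advantage.
# The sample arrays in the usage comment are hypothetical, shown only to make
# the shapes concrete.
import numpy as np

def cmaml_loss_np(neglogpacs, advantage):
    return np.mean(neglogpacs) * np.mean(advantage)

# e.g. cmaml_loss_np(np.array([1.2, 0.8, 1.5]), np.array([0.5, -0.3, 1.0]))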
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        num_options=1,
        app='',
        saves=False,
        wsaves=False,
        epoch=-1,
        seed=1,
        dc=0):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)
    env.seed(seed)

    ### Book-keeping
    gamename = env.spec.id[:-3].lower()
    gamename += 'seed' + str(seed)
    gamename += app
    version_name = 'FINAL_NORM-ACT-LOWER-LR-len-400-wNoise-update1-ppo-ESCH-1-2-5-nI'

    dirname = '{}_{}_{}opts_saves/'.format(version_name, gamename, num_options)
    print(dirname)
    #input ("wait here after dirname")

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'mlp_policy.py', 'run_mujoco.py']
        first = True
        for i in range(len(files)):
            src = os.path.join(
                '/home/nfunk/Code_MA/ppoc_off_tryout/baselines/baselines/ppo1/'
            ) + files[i]
            print(src)
            #dest = os.path.join('/home/nfunk/results_NEW/ppo1/') + dirname
            dest = dirname + "src_code/"
            if (first):
                os.makedirs(dest)
                first = False
            print(dest)
            shutil.copy2(src, dest)
        # brute force copy normal env file at end of copying process:
        src = os.path.join(
            '/home/nfunk/Code_MA/ppoc_off_tryout/nfunk/envs_nf/pendulum_nf.py')
        shutil.copy2(src, dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    max_action = env.action_space.high

    # add the dimension in the observation space!
    ob_space.shape = ((ob_space.shape[0] + ac_space.shape[0]), )
    print(ob_space.shape)
    print(ac_space.shape)
    #input ("wait here where the spaces are printed!!!")
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    pol_ov_op_ent = tf.placeholder(dtype=tf.float32,
                                   shape=None)  # entropy coefficient for the policy over options

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # pdb.set_trace()
    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    atarg_clip = atarg  #tf.clip_by_value(atarg,-10,10)
    surr1 = ratio * atarg_clip  #atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param,
                   1.0 + clip_param) * atarg_clip  #atarg #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    #vf_loss = U.mean(tf.square(tf.clip_by_value(pi.vpred - ret, -10.0, 10.0)))
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    force_pi_loss = U.mean(
        tf.square(
            tf.clip_by_value(pi.op_pi, 1e-5, 1.0) -
            tf.constant([[0.05, 0.95]])))

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-5, 1.0))
    #log_pi = tf.Print(log_pi, [log_pi, tf.shape(tf.transpose(log_pi))])
    old_log_pi = tf.log(tf.clip_by_value(oldpi.op_pi, 1e-5, 1.0))
    entropy = -tf.reduce_sum(pi.op_pi * log_pi, reduction_indices=1)

    ratio_pol_ov_op = tf.exp(
        tf.transpose(log_pi)[option[0]] -
        tf.transpose(old_log_pi)[option[0]])  # pnew / pold
    term_adv_clip = term_adv  #tf.clip_by_value(term_adv,-10,10)
    surr1_pol_ov_op = ratio_pol_ov_op * term_adv_clip  # surrogate from conservative policy iteration
    surr2_pol_ov_op = U.clip(ratio_pol_ov_op, 1.0 - clip_param,
                             1.0 + clip_param) * term_adv_clip  #
    pol_surr_pol_ov_op = -U.mean(
        tf.minimum(surr1_pol_ov_op,
                   surr2_pol_ov_op))  # PPO's pessimistic surrogate (L^CLIP)

    op_loss = pol_surr_pol_ov_op - pol_ov_op_ent * tf.reduce_sum(entropy)
    #op_loss = pol_surr_pol_ov_op

    #total_loss += force_pi_loss
    total_loss += op_loss

    var_list = pi.get_trainable_variables()
    term_list = var_list[6:8]

    lossandgrad = U.function(
        [ob, ac, atarg, ret, lrmult, option, term_adv, pol_ov_op_ent],
        losses + [U.flatgrad(total_loss, var_list)])
    termloss = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)
    saver_best = tf.train.Saver(max_to_keep=1)

    ### More book-keeping
    results = []
    if saves:
        results = open(
            version_name + '_' + gamename + '_' + str(num_options) + 'opts_' +
            '_results.csv', 'w')
        results_best_model = open(
            dirname + version_name + '_' + gamename + '_' + str(num_options) +
            'opts_' + '_bestmodel.csv', 'w')

        out = 'epoch,avg_reward'

        for opt in range(num_options):
            out += ',option {} dur'.format(opt)
        for opt in range(num_options):
            out += ',option {} std'.format(opt)
        for opt in range(num_options):
            out += ',option {} term'.format(opt)
        for opt in range(num_options):
            out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch >= 0:

        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    des_pol_op_ent = 0.1
    max_val = -100000
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        std = []
        for i in range(num_options):
            logstd = np.mean(
                seg['logstds'][i]) if len(seg['logstds'][i]) > 0 else 0.
            std.append(np.exp(logstd))
        print("mean opt dur:", opt_d)
        print("mean op pol:", np.mean(np.array(seg['optpol_p']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean value val:", np.mean(np.array(seg['value_val']), axis=0))

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        if hasattr(pi, "ob_rms_only"):
            pi.ob_rms_only.update(ob[:, :-ac_space.shape[0]]
                                  )  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if (iters_so_far + 1) % 1000 == 0:
            des_pol_op_ent = des_pol_op_ent / 10

        if iters_so_far % 50 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        # adaptively save best run:
        if (np.mean(rewbuffer) > max_val) and wsaves:
            max_val = np.mean(rewbuffer)
            results_best_model.write('epoch: ' + str(iters_so_far) + ', rew: ' +
                                     str(np.mean(rewbuffer)) + '\n')
            results_best_model.flush()
            filename = dirname + 'best.ckpt'
            save_path = saver_best.save(U.get_session(), filename)

        min_batch = 160  # Arbitrary
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ### This part is only necessary when we use options. We proceed to these verifications in order not to discard any collected trajectories.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif indices.size + datas[opt].n < min_batch:
                    # pdb.set_trace()
                    oldmap = datas[opt].data_map

                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue

                elif (indices.size + datas[opt].n > min_batch and datas[opt].n
                      < min_batch) or (indices.size > min_batch
                                       and datas[opt].n < min_batch):

                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)

                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)

            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
            ###

            optim_batchsize = optim_batchsize or ob.shape[0]
            # Note: with clip bounds (10, 10) this always evaluates to 10 epochs
            # whenever num_options > 1.
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    #tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt])
                    tadv, nodc_adv = pi.get_opt_adv(batch["ob"], [opt])
                    tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    t_advs[opt].append(nodc_adv)

                    #if (opt==1):
                    #    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    #else:
                    #    *newlosses, grads = lossandgrad0(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    *newlosses, grads = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult,
                                                    [opt], tadv,
                                                    des_pol_op_ent)
                    #*newlosses, grads = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult, [opt], tadv)
                    #termg = termloss(batch["ob"], [opt], tadv)
                    #adam.update(termg[0], 5e-7 * cur_lrmult)
                    adam.update(grads, optim_stepsize * cur_lrmult)
                    losses.append(newlosses)

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Book keeping
        if saves:
            out = "{},{}"
            for _ in range(num_options):
                out += ",{},{},{},{}"
            out += "\n"

            info = [iters_so_far, np.mean(rewbuffer)]
            for i in range(num_options):
                info.append(opt_d[i])
            for i in range(num_options):
                info.append(std[i])
            for i in range(num_options):
                info.append(np.mean(np.array(seg['term_p']), axis=0)[i])
            for i in range(num_options):
                info.append(np.mean(t_advs[i]))

            results.write(out.format(*info))
            results.flush()
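# A simplified, standalone sketch of the per-option batch pooling performed in
# the loop above: when an option collected fewer than `min_batch` transitions,
# its data is pooled with what was stored on earlier iterations instead of
# being optimized on immediately. The class below is illustrative and does not
# reproduce the exact branch structure of the code above.
import numpy as np

class OptionBuffer:
    def __init__(self, min_batch=160):
        self.min_batch = min_batch
        self.stored = None  # dict with keys: ob, ac, atarg, vtarg

    def update(self, new):
        """Return a batch to optimize on, or None while still accumulating."""
        if self.stored is None:
            self.stored = new
        else:
            self.stored = {k: np.concatenate([self.stored[k], new[k]])
                           for k in new}
        if len(self.stored["ob"]) > self.min_batch:
            batch, self.stored = self.stored, None
            return batch
        return None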
Example #18
0
def learn(
        env,
        policy_func,
        discriminator,
        expert_dataset,
        pretrained,
        pretrained_weight,
        *,
        g_step,
        d_step,
        timesteps_per_batch,  # what to train on
        max_kl,
        cg_iters,
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        d_stepsize=3e-4,
        vf_iters=3,
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        save_per_iter=100,
        ckpt_dir=None,
        log_dir=None,
        load_model_path=None,
        task_name=None):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight != None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("pol")
    ]
    vf_var_list = [
        v for v in all_var_list if v.name.split("/")[1].startswith("vf")
    ]
    d_adam = MpiAdam(discriminator.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n(
        [U.sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents)])  #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    writer = U.FileWriter(log_dir)
    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     discriminator,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0]) == 1

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(discriminator.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if a pretrained weight is provided
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())
    # if a model path is provided
    if load_model_path is not None:
        U.load_state(load_model_path)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break

        # Save model
        if iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            U.save_state(os.path.join(ckpt_dir, task_name),
                         counter=iters_so_far)

        logger.log("********** Iteration %i ************" % iters_so_far)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        for _ in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()
            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "ob_rms"):
                            pi.ob_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

        g_losses = meanlosses
        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        logger.log(fmt_row(13, discriminator.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob))
        batch_size = len(ob) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (ob, ac), include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # update running mean/std for discriminator
            if hasattr(discriminator, "obs_rms"):
                discriminator.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch,
                                                      ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]
                   )  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs))
        true_rewbuffer.extend(true_rets)
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank == 0:
            logger.dump_tabular()
            g_loss_stats.add_all_summary(writer, g_losses, iters_so_far)
            d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0),
                                         iters_so_far)
            ep_stats.add_all_summary(writer, [
                np.mean(true_rewbuffer),
                np.mean(rewbuffer),
                np.mean(lenbuffer)
            ], iters_so_far)
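# A minimal sketch of the backtracking line search used in the policy update
# above: start from the full natural-gradient step and halve the step size
# until the surrogate improves while the KL constraint (1.5 * max_kl) holds.
# `eval_losses` is a hypothetical stand-in for compute_losses; it must return
# (surrogate, kl, ...) for a given flat parameter vector.
import numpy as np

def backtracking_line_search(theta_before, fullstep, eval_losses, max_kl,
                             surr_before, max_backtracks=10):
    stepsize = 1.0
    for _ in range(max_backtracks):
        theta_new = theta_before + stepsize * fullstep
        surr, kl = eval_losses(theta_new)[:2]
        if (np.isfinite([surr, kl]).all()
                and kl <= max_kl * 1.5
                and surr - surr_before > 0):
            return theta_new          # accept the step
        stepsize *= 0.5               # shrink and retry
    return theta_before               # no acceptable step found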
Example #19
0
    def __init__(self, a_name, env, policy_func, par):
        # Setup losses and stuff
        # ----------------------------------------
        self.env = env
        self.timesteps_per_actorbatch = par.timesteps_per_actorbatch
        self.optim_epochs = par.optim_epochs
        self.optim_stepsize = par.optim_stepsize
        self.optim_batchsize = par.optim_batchsize  # optimization hypers
        self.gamma = par.gamma
        self.lam = par.lam  # advantage estimation
        self.max_timesteps = par.max_timesteps
        self.max_episodes = par.max_episodes
        self.max_iters = par.max_iters
        self.max_seconds = par.max_seconds  # time constraint
        self.callback = par.callback  # you can do anything in the callback, since it takes locals(), globals()
        self.adam_epsilon = par.adam_epsilon
        self.schedule = par.schedule  # annealing for stepsize parameters (epsilon and adam)

        self.ob_space = env.observation_space
        self.ac_space = env.action_space
        self.pi = policy_func(
            a_name, self.ob_space,
            self.ac_space)  # Construct network for new policy
        self.oldpi = policy_func("old" + a_name, self.ob_space,
                                 self.ac_space)  # Network for old policy
        self.atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        self.ret = tf.placeholder(dtype=tf.float32,
                                  shape=[None])  # Empirical return

        self.lrmult = tf.placeholder(
            name='lrmult' + a_name, dtype=tf.float32,
            shape=[])  # learning rate multiplier, updated with schedule
        self.clip_param = par.clip_param * self.lrmult  # Annealed clipping parameter epsilon

        obname = str('ob' + str(learning_agent.index2))
        learning_agent.index2 += 1
        self.ob = U.get_placeholder_cached(name=obname)
        self.ac = self.pi.pdtype.sample_placeholder([None])

        self.kloldnew = self.oldpi.pd.kl(self.pi.pd)
        self.ent = self.pi.pd.entropy()
        self.meankl = U.mean(self.kloldnew)
        self.meanent = U.mean(self.ent)
        self.pol_entpen = (-par.entcoeff) * self.meanent

        self.ratio = tf.exp(
            self.pi.pd.logp(self.ac) -
            self.oldpi.pd.logp(self.ac))  # pnew / pold
        surr1 = self.ratio * self.atarg  # surrogate from conservative policy iteration
        surr2 = U.clip(self.ratio, 1.0 - self.clip_param,
                       1.0 + self.clip_param) * self.atarg  #
        self.pol_surr = -U.mean(tf.minimum(
            surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
        self.vf_loss = U.mean(tf.square(self.pi.vpred - self.ret))
        self.total_loss = self.pol_surr + self.pol_entpen + self.vf_loss
        self.losses = [
            self.pol_surr, self.pol_entpen, self.vf_loss, self.meankl,
            self.meanent
        ]
        self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

        self.var_list = self.pi.get_trainable_variables()
        self.lossandgrad = U.function(
            [self.ob, self.ac, self.atarg, self.ret, self.lrmult],
            self.losses + [U.flatgrad(self.total_loss, self.var_list)])
        self.adam = MpiAdam(self.var_list, epsilon=self.adam_epsilon)

        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                    self.oldpi.get_variables(), self.pi.get_variables())
            ])
        self.compute_losses = U.function(
            [self.ob, self.ac, self.atarg, self.ret, self.lrmult], self.losses)

        print(U.get_session())
        U.initialize()

        self.adam.sync()
def learn(
    # =========== modified part begins =========== #
    env_id,
    seed,
    robot,  # robot class with GMM params
    joint_optimization_iters,  # total number of joint optimization iterations
    design_iters,  # number of samples when updating physical design in each joint optimization iteration
    policy_iters,  # number of samples when updating robot policy in each joint optimization iteration
    # ============ modified part ends ============ #
    policy_func,
    *,
    timesteps_per_actorbatch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):

    # ================================== modification 1 ================================== #
    """
    input:  replace "env" (env class) with "env_id" (string)
            add "seed" (int)
        reason: to enable gym.make() during training
        modification detail: add following lines into learn()
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            env.seed(seed)
            env.close() # added at the end of learn()
    """
    import roboschool, gym
    from baselines import bench
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    env.seed(seed)
    # ================================== modification 1 ================================== #

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    # policy_func is the initialization of NN
    # NN structure:
    #   state -> (num_hid_layers) fully-connected layers with (hid_size) units -> (action, predicted value)
    #       num_hid_layers, hid_size: set in the file that calls "learn"
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    # placeholder for "ob"
    # created in mlppolicy.py
    ob = U.get_placeholder_cached(name="ob")
    # placeholder for "ac"
    # in common/distribution.py
    ac = pi.pdtype.sample_placeholder([None])

    # KL divergence and Entropy, defined in common/distribution.py
    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)

    # pol_entpen: Entropy Bonus encourages exploration
    # entcoeff: entropy coefficient, defined in PPO page 5, Equ. (9)
    pol_entpen = (-entcoeff) * meanent

    # probability ratio, defined in PPO page 3
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold

    # Surrogate Goal
    # defined in PPO page 3, Equ (7)
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    # Value Function Loss: square error loss for ||v_pred - v_target||
    vf_loss = U.mean(tf.square(pi.vpred - ret))

    # Total_loss (minimized here) = -L^CLIP + Value Function Loss - Entropy Bonus
    # i.e. the negative of the objective defined in PPO page 5, Equ. (9), with c1 = 1
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    # MPI-synchronized Adam optimizer
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    # oldpi = pi
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    # compute_losses evaluates the losses without applying a gradient step
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # ================================== modification 2 ================================== #
    for joint_optimization_iter in range(joint_optimization_iters):
        U.save_state('/home/yetong/Desktop/Project/models/model{}.ckpt'.format(
            joint_optimization_iter))
        logger.log("joint optimization progree: {}/{}".format(
            joint_optimization_iter, joint_optimization_iters))
        # ================================== update physical design ================================== #
        if joint_optimization_iter > 20:
            Rewards_plus = np.zeros(design_iters)
            Rewards_minum = np.zeros(design_iters)
            params = robot.sample(design_iters, to_update=True)
            for i, param in enumerate(params):
                robot.modify_file(param)
                env = gym.make(env_id)
                # myenv = env.env

                # pdb.set_trace()
                env = bench.Monitor(env, logger.get_dir())
                R = episode_generator(pi, env, gamma, stochastic=True)
                logger.log("\t update physical design: %d/%d, rew: %f" %
                           (i, 2 * design_iters, R))
                if i % 2 == 0:
                    Rewards_plus[int(i / 2)] = R
                else:
                    Rewards_minum[int(i / 2)] = R
            logger.log("prev_mu: ", robot.params_mu)
            logger.log("prev_sig: ", robot.params_sig)
            robot.update(Rewards_plus, Rewards_minum)
            logger.log("mu: ", robot.params_mu)
            logger.log("sig: ", robot.params_sig)
        # ================================== update policy ================================== #
        # params = robot.sample(design_iters)
        params = [robot.params_mu]
        for param in params:
            # reinitialize env
            robot.modify_file(param)
            env = gym.make(env_id)
            env = bench.Monitor(env, logger.get_dir())
            # ================================== modification 2 ================================== #

            # Prepare for rollouts
            # ----------------------------------------
            seg_gen = traj_segment_generator(pi,
                                             env,
                                             timesteps_per_actorbatch,
                                             stochastic=True)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            tstart = time.time()
            lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
            rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

            assert sum([
                max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0
            ]) == 1, "Only one time constraint permitted"

            while True:
                if callback: callback(locals(), globals())
                if max_timesteps and timesteps_so_far >= max_timesteps:
                    break
                elif max_episodes and episodes_so_far >= max_episodes:
                    break
                elif max_iters and iters_so_far >= max_iters:
                    break
                elif max_seconds and time.time() - tstart >= max_seconds:
                    break

                # annealing for stepsize parameters (epsilon and adam)
                if schedule == 'constant':
                    cur_lrmult = 1.0
                elif schedule == 'linear':
                    cur_lrmult = max(
                        1.0 - float(timesteps_so_far) / max_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" %
                           iters_so_far)

                seg = seg_gen.__next__()
                add_vtarg_and_adv(seg, gamma, lam)

                # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
                ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg[
                    "adv"], seg["tdlamret"]
                vpredbefore = seg[
                    "vpred"]  # predicted value function before update
                atarg = (atarg - atarg.mean()) / atarg.std(
                )  # standardized advantage function estimate
                d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                            shuffle=not pi.recurrent)
                optim_batchsize = optim_batchsize or ob.shape[0]

                if hasattr(pi, "ob_rms"):
                    pi.ob_rms.update(ob)  # update running mean/std for policy

                # oldpi = pi
                # set old parameter values to new parameter values
                assign_old_eq_new()
                logger.log("Optimizing...")
                logger.log(fmt_row(13, loss_names))
                # Here we do a bunch of optimization epochs over the data
                for _ in range(optim_epochs):
                    losses = [
                    ]  # list of tuples, each of which gives the loss for a minibatch
                    for batch in d.iterate_once(optim_batchsize):
                        *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                                    batch["atarg"],
                                                    batch["vtarg"], cur_lrmult)
                        adam.update(g, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in d.iterate_once(optim_batchsize):
                    newlosses = compute_losses(batch["ob"], batch["ac"],
                                               batch["atarg"], batch["vtarg"],
                                               cur_lrmult)
                    losses.append(newlosses)
                meanlosses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, meanlosses))
                for (lossval, name) in zipsame(meanlosses, loss_names):
                    logger.record_tabular("loss_" + name, lossval)
                logger.record_tabular(
                    "ev_tdlam_before",
                    explained_variance(vpredbefore, tdlamret))
                lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
                listoflrpairs = MPI.COMM_WORLD.allgather(
                    lrlocal)  # list of tuples
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += sum(lens)
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - tstart)
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

    # ================================== modification 1 ================================== #
    env.close()
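# A small standalone sketch of the paired-sample bookkeeping used when updating
# the physical design above: rewards from 2 * design_iters rollouts are split
# into "plus" and "minus" halves by alternating index, which is the form passed
# to robot.update(). `evaluate_design` is a hypothetical stand-in for the
# rollout performed with episode_generator.
import numpy as np

def collect_paired_rewards(params, evaluate_design, design_iters):
    rewards_plus = np.zeros(design_iters)
    rewards_minus = np.zeros(design_iters)
    for i, param in enumerate(params):   # len(params) == 2 * design_iters
        r = evaluate_design(param)
        if i % 2 == 0:
            rewards_plus[i // 2] = r
        else:
            rewards_minus[i // 2] = r
    return rewards_plus, rewards_minus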
Example #21
0
def learn(env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant' # annealing for stepsize parameters (epsilon and adam)
        ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult) 
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)            
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank()==0:
            logger.dump_tabular()
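# A minimal NumPy sketch of PPO's pessimistic clipped surrogate L^CLIP as built
# in the graphs above: ratio = exp(logp_new - logp_old), clipped to
# [1 - clip_param, 1 + clip_param], and the elementwise minimum of the two
# surrogates is averaged and negated so it can be minimized.
import numpy as np

def ppo_clip_loss(logp_new, logp_old, adv, clip_param=0.2):
    ratio = np.exp(logp_new - logp_old)                        # pnew / pold
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))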
Example #22
0
def learn(env, policy_func, *,
        timesteps_per_batch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize, # optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0, # time constraint
        noisy_nets=False,
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant', # annealing for stepsize parameters (epsilon and adam)
        desired_kl=0.02,
        logdir=".",
        agentName="PPO-Agent",
        resume = 0,
        num_parallel=1,
        num_cpu=1
        ):
    # Setup losses and stuff
    # ----------------------------------------
    rank = MPI.COMM_WORLD.Get_rank()
    ob_space = env.observation_space
    ac_space = env.action_space

    ob_size = ob_space.shape[0]
    ac_size = ac_space.shape[0]

    #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape))
    #exit(0)
    pi = policy_func("pi", ob_space, ac_space, noisy_nets) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space, noisy_nets) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vfloss1 = tf.square(pi.vpred - ret)
    vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred, -clip_param, clip_param)
    vfloss2 = tf.square(vpredclipped - ret)
    vf_loss = .5 * U.mean(tf.maximum(vfloss1, vfloss2)) # we do the same clipping-based trust region for the value function
    #vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    if noisy_nets:
        stochastic = False
    else:
        stochastic = True
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=stochastic, num_parallel=num_parallel, num_cpu=num_cpu, rank=rank, ob_size=ob_size, ac_size=ac_size, com=MPI.COMM_WORLD)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    saver = tf.train.Saver()
    if resume > 0:
        saver.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume)))
    iters_so_far = resume
    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    logF = open(os.path.join(logdir, 'log.txt'), 'a')
    logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a')

    dump_training = 0
    learn_from_training = 0
    if dump_training:
        if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'):
            with open(logdir + "\\" +'ob_list_' + str(rank) + '.pkl', 'rb') as f:
                ob_list = pickle.load(f)
        else:
            ob_list = []

        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std
        saverRMS = tf.train.Saver({"_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count})
        saverRMS.save(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf"))

        ob_np_a = np.asarray(ob_list)
        ob_np = np.reshape(ob_np_a, (-1,ob_size))
        [vpred, pdparam] = pi._vpred_pdparam(ob_np)

        print("vpred = " + str(vpred))
        print("pd_param = " + str(pdparam))
        with open('training.pkl', 'wb') as f:
            pickle.dump(ob_np, f)
            pickle.dump(vpred, f)
            pickle.dump(pdparam, f)
        exit(0)

    if learn_from_training:
        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std
        with open('training.pkl', 'rb') as f:
            ob_np = pickle.load(f)
            vpred = pickle.load(f)
            pdparam = pickle.load(f)
        num = ob_np.shape[0]
        for i in range(num):
            xp = ob_np[i][1]
            ob_np[i][1] = 0.0
            ob_np[i][18] -= xp
            ob_np[i][22] -= xp
            ob_np[i][24] -= xp
            ob_np[i][26] -= xp
            ob_np[i][28] -= xp
            ob_np[i][30] -= xp
            ob_np[i][32] -= xp
            ob_np[i][34] -= xp
        print("ob_np = " + str(ob_np))
        print("vpred = " + str(vpred))
        print("pdparam = " + str(pdparam))
        batch_size = 128

        y_vpred = tf.placeholder(tf.float32, [batch_size, ])
        y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]])

        vpred_loss = U.mean(tf.square(pi.vpred - y_vpred))
        vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam))

        total_train_loss = vpred_loss + vpdparam_loss
        #total_train_loss = vpdparam_loss
        #total_train_loss = vpred_loss
        #coef = 0.01
        #dense_all = U.dense_all
        #for a in dense_all:
        #   total_train_loss += coef * tf.nn.l2_loss(a)
        #total_train_loss = vpdparam_loss
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(total_train_loss)
        d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam), shuffle=not pi.recurrent)
        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        saverRMS = tf.train.Saver({"_sum": pi.ob_rms._sum, "_sumsq": pi.ob_rms._sumsq, "_count": pi.ob_rms._count})
        saverRMS.restore(tf.get_default_session(), os.path.join(os.path.abspath(logdir), "rms.tf"))
        if resume > 0:
            saver.restore(tf.get_default_session(),
                          os.path.join(os.path.abspath(logdir), "{}-{}".format(agentName, resume)))
        for q in range(100):
            sumLoss = 0
            for batch in d.iterate_once(batch_size):
                tl, _ = sess.run([total_train_loss, optimizer], feed_dict={pi.ob: batch["ob"], y_vpred: batch["vpred"], y_pdparam:batch["pdparam"]})
                sumLoss += tl
            print("Iteration " + str(q)+ " Loss = " + str(sumLoss))
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Save as frame 1
        try:
            saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=1)
        except:
            pass
        #exit(0)

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule in ('adaptive', 'constant'):
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0.0)
        elif schedule == 'linear_clipped':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0.2)
        elif schedule == 'cyclic':
        #    cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
            raise NotImplementedError
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel, num_cpu)
        #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"]))

        #exit(0)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]

        if dump_training:
            ob_list.append(ob.tolist())
        vpredbefore = seg["vpred"] # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        assign_old_eq_new() # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
                if desired_kl is not None and schedule == 'adaptive':
                    if newlosses[-2] > desired_kl * 2.0:
                        optim_stepsize = max(1e-8, optim_stepsize / 1.5)
                        print('kl divergence was too large = ', newlosses[-2])
                        print('New optim_stepsize = ', optim_stepsize)
                    elif newlosses[-2] < desired_kl / 2.0:
                        optim_stepsize = min(1e0, optim_stepsize * 1.5)
                        print('kl divergence was too small = ', newlosses[-2])
                        print('New optim_stepsize = ', optim_stepsize)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            #print(str(losses))
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"], batch["atarg"], batch["vtarg"], cur_lrmult)
            losses.append(newlosses)
        meanlosses,_,_ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))

        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_"+name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        rewmean = np.mean(rewbuffer)
        logger.record_tabular("EpRewMean", rewmean)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if dump_training:
            with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'wb') as f:
                pickle.dump(ob_list, f)

        if MPI.COMM_WORLD.Get_rank()==0:
            logF.write(str(rewmean) + "\n")
            logStats.write(logger.get_str() + "\n")
            logF.flush()
            logStats.flush()

            logger.dump_tabular()

            try:
                os.remove(logdir + "/checkpoint")
            except OSError:
                pass
            try:
                saver.save(tf.get_default_session(), os.path.join(logdir, agentName), global_step=iters_so_far)
            except:
                pass
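# A minimal standalone sketch of the 'adaptive' schedule used in the example
# above: the Adam step size is shrunk when the measured KL between old and new
# policy exceeds 2 * desired_kl and grown when it falls below desired_kl / 2.
# adapt_stepsize is a hypothetical name; the bounds mirror the in-loop logic
# (lower bound 1e-8, upper bound 1.0, factor 1.5).
def adapt_stepsize(stepsize, kl, desired_kl=0.02, factor=1.5,
                   min_step=1e-8, max_step=1.0):
    if kl > desired_kl * 2.0:
        return max(min_step, stepsize / factor)   # policy moved too far: slow down
    elif kl < desired_kl / 2.0:
        return min(max_step, stepsize * factor)   # policy barely moved: speed up
    return stepsize

# adapt_stepsize(3e-4, kl=0.06) -> 2e-4; adapt_stepsize(3e-4, kl=0.005) -> 4.5e-4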
Example #23
def learn(
        env,
        policy_func,
        disc,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        logdir=".",
        agentName="PPO-Agent",
        resume=0,
        num_parallel=0,
        num_cpu=1,
        num_extra=0,
        gan_batch_size=128,
        gan_num_epochs=5,
        gan_display_step=40,
        resume_disc=0,
        resume_non_disc=0,
        mocap_path="",
        gan_replay_buffer_size=1000000,
        gan_prob_to_put_in_replay=0.01,
        gan_reward_to_retrain_discriminator=5,
        use_distance=0,
        use_blend=0):
    # Deal with GAN
    if not use_distance:
        replay_buf = MyReplayBuffer(gan_replay_buffer_size)
    data = np.loadtxt(
        mocap_path + ".dat"
    )  #"D:/p4sw/devrel/libdev/flex/dev/rbd/data/bvh/motion_simple.dat");
    label = np.concatenate((np.ones(
        (data.shape[0], 1)), np.zeros((data.shape[0], 1))),
                           axis=1)

    print("Real data label = " + str(label))

    mocap_set = Dataset(dict(data=data, label=label), shuffle=True)

    # Setup losses and stuff
    # ----------------------------------------
    rank = MPI.COMM_WORLD.Get_rank()
    ob_space = env.observation_space
    ac_space = env.action_space

    ob_size = ob_space.shape[0]
    ac_size = ac_space.shape[0]

    #print("rank = " + str(rank) + " ob_space = "+str(ob_space.shape) + " ac_space = "+str(ac_space.shape))
    #exit(0)
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vfloss1 = tf.square(pi.vpred - ret)
    vpredclipped = oldpi.vpred + tf.clip_by_value(pi.vpred - oldpi.vpred,
                                                  -clip_param, clip_param)
    vfloss2 = tf.square(vpredclipped - ret)
    vf_loss = .5 * U.mean(
        tf.maximum(vfloss1, vfloss2)
    )  # we do the same clipping-based trust region for the value function
    #vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    sess = tf.get_default_session()

    avars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    non_disc_vars = [
        a for a in avars
        if not a.name.split("/")[0].startswith("discriminator")
    ]
    disc_vars = [
        a for a in avars if a.name.split("/")[0].startswith("discriminator")
    ]
    #print(str(non_disc_names))
    #print(str(disc_names))
    #exit(0)
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    disc_saver = tf.train.Saver(disc_vars, max_to_keep=None)
    non_disc_saver = tf.train.Saver(non_disc_vars, max_to_keep=None)
    saver = tf.train.Saver(max_to_keep=None)
    if resume > 0:
        saver.restore(
            tf.get_default_session(),
            os.path.join(os.path.abspath(logdir),
                         "{}-{}".format(agentName, resume)))
        if not use_distance:
            if os.path.exists(logdir + "\\" + 'replay_buf_' +
                              str(int(resume / 100) * 100) + '.pkl'):
                print("Load replay buf")
                with open(
                        logdir + "\\" + 'replay_buf_' +
                        str(int(resume / 100) * 100) + '.pkl', 'rb') as f:
                    replay_buf = pickle.load(f)
            else:
                print("Can't load replay buf " + logdir + "\\" +
                      'replay_buf_' + str(int(resume / 100) * 100) + '.pkl')
    iters_so_far = resume

    if resume_non_disc > 0:
        non_disc_saver.restore(
            tf.get_default_session(),
            os.path.join(
                os.path.abspath(logdir),
                "{}-{}".format(agentName + "_non_disc", resume_non_disc)))
        iters_so_far = resume_non_disc

    if use_distance:
        print("Use distance")
        nn = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(data)
    else:
        nn = None
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     disc,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_parallel=num_parallel,
                                     num_cpu=num_cpu,
                                     rank=rank,
                                     ob_size=ob_size,
                                     ac_size=ac_size,
                                     com=MPI.COMM_WORLD,
                                     num_extra=num_extra,
                                     iters_so_far=iters_so_far,
                                     use_distance=use_distance,
                                     nn=nn)

    if resume_disc > 0:
        disc_saver.restore(
            tf.get_default_session(),
            os.path.join(os.path.abspath(logdir),
                         "{}-{}".format(agentName + "_disc", resume_disc)))

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"
    logF = open(logdir + "\\" + 'log.txt', 'a')
    logR = open(logdir + "\\" + 'log_rew.txt', 'a')
    logStats = open(logdir + "\\" + 'log_stats.txt', 'a')
    if os.path.exists(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl'):
        with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'rb') as f:
            ob_list = pickle.load(f)
    else:
        ob_list = []

    dump_training = 0
    learn_from_training = 0
    if dump_training:
        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std
        saverRMS = tf.train.Saver({
            "_sum": pi.ob_rms._sum,
            "_sumsq": pi.ob_rms._sumsq,
            "_count": pi.ob_rms._count
        })
        saverRMS.save(tf.get_default_session(),
                      os.path.join(os.path.abspath(logdir), "rms.tf"))

        ob_np_a = np.asarray(ob_list)
        ob_np = np.reshape(ob_np_a, (-1, ob_size))
        [vpred, pdparam] = pi._vpred_pdparam(ob_np)

        print("vpred = " + str(vpred))
        print("pd_param = " + str(pdparam))
        with open('training.pkl', 'wb') as f:
            pickle.dump(ob_np, f)
            pickle.dump(vpred, f)
            pickle.dump(pdparam, f)
        exit(0)
    if learn_from_training:
        # , "mean": pi.ob_rms.mean, "std": pi.ob_rms.std

        with open('training.pkl', 'rb') as f:
            ob_np = pickle.load(f)
            vpred = pickle.load(f)
            pdparam = pickle.load(f)
        num = ob_np.shape[0]
        for i in range(num):
            xp = ob_np[i][1]
            ob_np[i][1] = 0.0
            ob_np[i][18] -= xp
            ob_np[i][22] -= xp
            ob_np[i][24] -= xp
            ob_np[i][26] -= xp
            ob_np[i][28] -= xp
            ob_np[i][30] -= xp
            ob_np[i][32] -= xp
            ob_np[i][34] -= xp
        print("ob_np = " + str(ob_np))
        print("vpred = " + str(vpred))
        print("pdparam = " + str(pdparam))
        batch_size = 128

        y_vpred = tf.placeholder(tf.float32, [
            batch_size,
        ])
        y_pdparam = tf.placeholder(tf.float32, [batch_size, pdparam.shape[1]])

        vpred_loss = U.mean(tf.square(pi.vpred - y_vpred))
        vpdparam_loss = U.mean(tf.square(pi.pdparam - y_pdparam))

        total_train_loss = vpred_loss + vpdparam_loss
        #total_train_loss = vpdparam_loss
        #total_train_loss = vpred_loss
        #coef = 0.01
        #dense_all = U.dense_all
        #for a in dense_all:
        #   total_train_loss += coef * tf.nn.l2_loss(a)
        #total_train_loss = vpdparam_loss
        optimizer = tf.train.AdamOptimizer(
            learning_rate=0.001).minimize(total_train_loss)
        d = Dataset(dict(ob=ob_np, vpred=vpred, pdparam=pdparam),
                    shuffle=not pi.recurrent)
        sess = tf.get_default_session()
        sess.run(tf.global_variables_initializer())
        saverRMS = tf.train.Saver({
            "_sum": pi.ob_rms._sum,
            "_sumsq": pi.ob_rms._sumsq,
            "_count": pi.ob_rms._count
        })
        saverRMS.restore(tf.get_default_session(),
                         os.path.join(os.path.abspath(logdir), "rms.tf"))
        if resume > 0:
            saver.restore(
                tf.get_default_session(),
                os.path.join(os.path.abspath(logdir),
                             "{}-{}".format(agentName, resume)))

        for q in range(100):
            sumLoss = 0
            for batch in d.iterate_once(batch_size):
                tl, _ = sess.run(
                    [total_train_loss, optimizer],
                    feed_dict={
                        pi.ob: batch["ob"],
                        y_vpred: batch["vpred"],
                        y_pdparam: batch["pdparam"]
                    })
                sumLoss += tl
            print("Iteration " + str(q) + " Loss = " + str(sumLoss))
        assign_old_eq_new()  # set old parameter values to new parameter values

        # Save as frame 1
        try:
            saver.save(tf.get_default_session(),
                       os.path.join(logdir, agentName),
                       global_step=1)
        except:
            pass
        #exit(0)
    if resume > 0:
        firstTime = False
    else:
        firstTime = True

    # Check accuracy
    #amocap = sess.run([disc.accuracy],
    #                feed_dict={disc.input: data,
    #                           disc.label: label})
    #print("Mocap accuracy = " + str(amocap))
    #print("Mocap label is " + str(label))

    #adata = np.array(replay_buf._storage)
    #print("adata shape = " + str(adata.shape))
    #alabel = np.concatenate((np.zeros((adata.shape[0], 1)), np.ones((adata.shape[0], 1))), axis=1)

    #areplay = sess.run([disc.accuracy],
    #                feed_dict={disc.input: adata,
    #                           disc.label: alabel})
    #print("Replay accuracy = " + str(areplay))
    #print("Replay label is " + str(alabel))
    #exit(0)
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam, timesteps_per_batch, num_parallel,
                          num_cpu)
        #print(" ob= " + str(seg["ob"])+ " rew= " + str(seg["rew"])+ " vpred= " + str(seg["vpred"])+ " new= " + str(seg["new"])+ " ac= " + str(seg["ac"])+ " prevac= " + str(seg["prevac"])+ " nextvpred= " + str(seg["nextvpred"])+ " ep_rets= " + str(seg["ep_rets"])+ " ep_lens= " + str(seg["ep_lens"]))

        #exit(0)
        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret, extra = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"], seg["extra"]

        #ob_list.append(ob.tolist())
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            #print(str(losses))
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        rewmean = np.mean(rewbuffer)
        logger.record_tabular("EpRewMean", rewmean)
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        # Train discriminator
        if not use_distance:
            print("Put in replay buf " +
                  str(int(gan_prob_to_put_in_replay * extra.shape[0] + 1)))
            replay_buf.add(extra[np.random.choice(
                extra.shape[0],
                int(gan_prob_to_put_in_replay * extra.shape[0] + 1),
                replace=True)])
            #if iters_so_far == 1:
            if not use_blend:
                if firstTime:
                    firstTime = False
                    # Train with everything we got
                    lb = np.concatenate((np.zeros(
                        (extra.shape[0], 1)), np.ones((extra.shape[0], 1))),
                                        axis=1)
                    extra_set = Dataset(dict(data=extra, label=lb),
                                        shuffle=True)
                    for e in range(10):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            batch = extra_set.next_batch(gan_batch_size)
                            _, l = sess.run(
                                [disc.optimizer_first, disc.loss],
                                feed_dict={
                                    disc.input:
                                    np.concatenate(
                                        (mbatch['data'], batch['data'])),
                                    disc.label:
                                    np.concatenate(
                                        (mbatch['label'], batch['label']))
                                })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))
                if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator:
                    for e in range(gan_num_epochs):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            data = replay_buf.sample(mbatch['data'].shape[0])
                            lb = np.concatenate((np.zeros(
                                (data.shape[0], 1)), np.ones(
                                    (data.shape[0], 1))),
                                                axis=1)
                            _, l = sess.run(
                                [disc.optimizer, disc.loss],
                                feed_dict={
                                    disc.input:
                                    np.concatenate((mbatch['data'], data)),
                                    disc.label:
                                    np.concatenate((mbatch['label'], lb))
                                })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))
            else:
                if firstTime:
                    firstTime = False
                    # Train with everything we got
                    extra_set = Dataset(dict(data=extra), shuffle=True)
                    for e in range(10):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            batch = extra_set.next_batch(gan_batch_size)
                            bf = np.random.uniform(0, 1, (gan_batch_size, 1))
                            onembf = 1 - bf
                            my_label = np.concatenate((bf, onembf), axis=1)
                            my_data = np.multiply(mbatch['data'],
                                                  bf) + np.multiply(
                                                      batch['data'], onembf)
                            _, l = sess.run([disc.optimizer_first, disc.loss],
                                            feed_dict={
                                                disc.input: my_data,
                                                disc.label: my_label
                                            })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))
                if seg['mean_ext_rew'] > gan_reward_to_retrain_discriminator:
                    for e in range(gan_num_epochs):
                        i = 0
                        for mbatch in mocap_set.iterate_once(gan_batch_size):
                            data = replay_buf.sample(mbatch['data'].shape[0])

                            bf = np.random.uniform(0, 1, (gan_batch_size, 1))
                            onembf = 1 - bf
                            my_label = np.concatenate((bf, onembf), axis=1)
                            my_data = np.multiply(mbatch['data'],
                                                  bf) + np.multiply(
                                                      data, onembf)

                            _, l = sess.run([disc.optimizer_first, disc.loss],
                                            feed_dict={
                                                disc.input: my_data,
                                                disc.label: my_label
                                            })
                            i = i + 1
                            # Display logs per step
                            if i % gan_display_step == 0 or i == 1:
                                print(
                                    'discriminator epoch %i Step %i: Minibatch Loss: %f'
                                    % (e, i, l))
                        print(
                            'discriminator epoch %i Step %i: Minibatch Loss: %f'
                            % (e, i, l))

        # if True:
        #     lb = np.concatenate((np.zeros((extra.shape[0],1)),np.ones((extra.shape[0],1))),axis=1)
        #     extra_set = Dataset(dict(data=extra,label=lb), shuffle=True)
        #     num_r = 1
        #     if iters_so_far == 1:
        #         num_r = gan_num_epochs
        #     for e in range(num_r):
        #         i = 0
        #         for batch in extra_set.iterate_once(gan_batch_size):
        #             mbatch = mocap_set.next_batch(gan_batch_size)
        #             _, l = sess.run([disc.optimizer, disc.loss], feed_dict={disc.input: np.concatenate((mbatch['data'],batch['data'])), disc.label: np.concatenate((mbatch['label'],batch['label']))})
        #             i = i + 1
        #             # Display logs per step
        #             if i % gan_display_step == 0 or i == 1:
        #                 print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l))
        #         print('discriminator epoch %i Step %i: Minibatch Loss: %f' % (e, i, l))

        if not use_distance:
            if iters_so_far % 100 == 0:
                with open(
                        logdir + "\\" + 'replay_buf_' + str(iters_so_far) +
                        '.pkl', 'wb') as f:
                    pickle.dump(replay_buf, f)

        with open(logdir + "\\" + 'ob_list_' + str(rank) + '.pkl', 'wb') as f:
            pickle.dump(ob_list, f)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logF.write(str(rewmean) + "\n")
            logR.write(str(seg['mean_ext_rew']) + "\n")
            logStats.write(logger.get_str() + "\n")
            logF.flush()
            logStats.flush()
            logR.flush()

            logger.dump_tabular()

            try:
                os.remove(logdir + "/checkpoint")
            except OSError:
                pass
            try:
                saver.save(tf.get_default_session(),
                           os.path.join(logdir, agentName),
                           global_step=iters_so_far)
            except:
                pass
            try:
                non_disc_saver.save(tf.get_default_session(),
                                    os.path.join(logdir,
                                                 agentName + "_non_disc"),
                                    global_step=iters_so_far)
            except:
                pass
            try:
                disc_saver.save(tf.get_default_session(),
                                os.path.join(logdir, agentName + "_disc"),
                                global_step=iters_so_far)
            except:
                pass
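# A minimal NumPy sketch of the use_blend discriminator batch built in the
# example above: each training row is a convex blend of a mocap frame and a
# policy-generated frame, labelled with the blend weights themselves.
# make_blended_batch is a hypothetical helper name; mocap_batch and
# policy_batch are assumed to be arrays of identical shape.
import numpy as np

def make_blended_batch(mocap_batch, policy_batch, rng=np.random):
    n = mocap_batch.shape[0]
    bf = rng.uniform(0.0, 1.0, (n, 1))                    # per-row blend factor
    label = np.concatenate((bf, 1.0 - bf), axis=1)        # soft (real, generated) label
    data = bf * mocap_batch + (1.0 - bf) * policy_batch   # convex combination of frames
    return data, label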
Example #24
def learn(env, policy_func, *,
        timesteps_per_actorbatch, # timesteps per actor per update
        clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
        optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
        gamma, lam, # advantage estimation
        max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
        callback=None, # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant' # annealing for stepsize parameters (epsilon and adam)
        ):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space) # Network for old policy
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
    surr1 = ratio * atarg # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
    pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    #seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0, max_seconds>0])==1, "Only one time constraint permitted"

    while True:
        data_path = '/Users/wjh720/Desktop/Tmp/para_%i/' % (timesteps_per_actorbatch / 100)
        U.load_state(data_path + 'para')

        test(pi, env, timesteps_per_actorbatch, stochastic=True)
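# The PPO examples above all build the same pessimistic clipped surrogate
# (L^CLIP): clip the probability ratio pi_new / pi_old to
# [1 - clip_param, 1 + clip_param] and keep the elementwise minimum of the
# clipped and unclipped advantage-weighted terms. A minimal NumPy sketch with
# a hypothetical helper name:
import numpy as np

def ppo_clip_loss(ratio, adv, clip_param=0.2):
    surr1 = ratio * adv
    surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv
    return -np.mean(np.minimum(surr1, surr2))  # negated so it can be minimized

# ppo_clip_loss(np.array([1.5, 0.7]), np.array([1.0, -1.0])) ~ -0.2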
Example #25
def render_evaluate(env, policy_func, *,
        timesteps_per_batch, # what to train on
        max_kl, cg_iters,
        gamma, lam, # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters =3,
        max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
        callback=None
        ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)    
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.compute_kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield
    
    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    # set up saver
    sess = tf.get_default_session()
    saver = tf.train.Saver()
    
    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    print("loading pretrained model")
    saver.restore(sess, callback.model_dir)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1

    import gym
    env = gym.make('Ant-v1')
    if True:
        obsall = []
        for _ in range(50):
            obs = []
            done = False
            ob = env.reset()
            #env.render()
            stochastic = 1
            obs.append(env.unwrapped.get_body_com('torso')[:2].copy())
            while not done:
                ac, vpred = pi.act(stochastic, ob)
                ob, rew, done, _ = env.step(ac)
                #env.render()
                obs.append(env.unwrapped.get_body_com('torso')[:2].copy())
            obsall.append(obs)

        if rank==0:
            logger.dump_tabular()

            if callback is not None:
                callback(locals(), globals())
    """
Example #26
def learn(
        env,
        policy_func,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        load_model=None,
        action_bias=0.4,
        action_repeat=0,
        action_repeat_rand=False,
        warmup_frames=0,
        target_kl=0.01,
        vf_loss_mult=1,
        vfloss_optim_stepsize=0.003,
        vfloss_optim_batchsize=8,
        vfloss_optim_epochs=10):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    # Not sure why they anneal clip and learning rate with the same parameter
    #clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen
    losses = [pol_surr, pol_entpen, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    lossandgrad_vfloss = U.function([ob, ac, atarg, ret], [vf_loss] +
                                    [U.flatgrad(vf_loss, var_list)])
    adam_vfloss = MpiAdam(var_list, epsilon=adam_epsilon)
    compute_vfloss = U.function([ob, ac, atarg, ret], [vf_loss])

    U.initialize()
    adam.sync()
    adam_vfloss.sync()

    if load_model:
        logger.log('Loading model: %s' % load_model)
        pi.load(load_model)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     action_bias=action_bias,
                                     action_repeat=action_repeat,
                                     action_repeat_rand=action_repeat_rand,
                                     warmup_frames=warmup_frames)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    ep_rew_file = None
    if MPI.COMM_WORLD.Get_rank() == 0:
        import wandb
        ep_rew_file = open(
            os.path.join(wandb.run.dir, 'episode_rewards.jsonl'), 'w')
        checkpoint_dir = 'checkpoints-%s' % wandb.run.id
        os.mkdir(checkpoint_dir)

    cur_lrmult = 1.0
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        elif schedule == 'target_kl':
            pass
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.next()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                result = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                     batch["vtarg"], cur_lrmult)
                newlosses = result[:-1]
                g = result[-1]
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        # vfloss optimize
        logger.log("Optimizing value function...")
        logger.log(fmt_row(13, ['vf']))
        for _ in range(vfloss_optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(vfloss_optim_batchsize):
                result = lossandgrad_vfloss(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"])
                newlosses = result[:-1]
                g = result[-1]
                adam_vfloss.update(g, vfloss_optim_stepsize)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            newlosses += compute_vfloss(batch["ob"], batch["ac"],
                                        batch["atarg"], batch["vtarg"])
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names + ['vf']):
            logger.record_tabular("loss_" + name, lossval)
        # check kl
        if schedule == 'target_kl':
            if meanlosses[2] > target_kl * 1.1:
                cur_lrmult /= 1.5
            elif meanlosses[2] < target_kl / 1.1:
                cur_lrmult *= 1.5
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if rewbuffer:
            logger.record_tabular('CurLrMult', cur_lrmult)
            logger.record_tabular('StepSize', optim_stepsize * cur_lrmult)
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMax", np.max(rewbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
            logger.record_tabular("EpRewMin", np.min(rewbuffer))
            logger.record_tabular("EpThisIter", len(lens))
            episodes_so_far += len(lens)
            timesteps_so_far += sum(lens)
            iters_so_far += 1
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            time_elapsed = time.time() - tstart
            logger.record_tabular("TimeElapsed", time_elapsed)
            if MPI.COMM_WORLD.Get_rank() == 0:
                import wandb
                ep_rew_file.write('%s\n' % json.dumps({
                    'TimeElapsed': time_elapsed,
                    'Rewards': rews
                }))
                ep_rew_file.flush()
                data = logger.Logger.CURRENT.name2val
                wandb.run.history.add(data)
                summary_data = {}
                for k, v in data.iteritems():
                    if 'Rew' in k:
                        summary_data[k] = v
                wandb.run.summary.update(summary_data)
                pi.save(
                    os.path.join(checkpoint_dir,
                                 'model-%s.ckpt' % (iters_so_far - 1)))

                logger.dump_tabular()
        else:
            logger.log('No episodes complete yet')
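# Every learn() above calls add_vtarg_and_adv(seg, gamma, lam) before the
# policy update, but its body is not shown here (some variants also take
# parallelism arguments). A standard single-trajectory GAE(lambda) sketch of
# what such a helper typically computes, with hypothetical argument names:
# advantages from discounted TD residuals, and tdlamret = adv + vpred as the
# value-function target.
import numpy as np

def gae_advantages(rew, vpred, new, nextvpred, gamma, lam):
    # rew, vpred, new: length-T arrays; new[t] == 1 marks the first step of a fresh episode
    T = len(rew)
    new = np.append(new, 0)                  # pad so new[t + 1] exists at t = T - 1
    vpred_ext = np.append(vpred, nextvpred)  # bootstrap with the value after the last step
    adv = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = rew[t] + gamma * vpred_ext[t + 1] * nonterminal - vpred_ext[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    tdlamret = adv + vpred
    return adv, tdlamret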
Example #27
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant'  # annealing for stepsize parameters (epsilon and adam)
):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    rollouts_time = 0
    optimization_time = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)
        a = time.time()

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        b = time.time()
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        grad_time = 0.0
        allreduce_time = 0.0
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                aa = time.time()
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                bb = time.time()
                adam.update(g, optim_stepsize * cur_lrmult)
                cc = time.time()
                grad_time += bb - aa
                allreduce_time += cc - bb
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("GradTime", grad_time)
        logger.record_tabular("AllReduceTime", allreduce_time)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        c = time.time()
        rollouts_time += (b - a)
        optimization_time += (c - b)
        logger.record_tabular("RolloutsTime", rollouts_time)
        logger.record_tabular("OptimizationTime", optimization_time)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
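As a quick numerical check of the pessimistic clipped surrogate constructed above, the same expressions can be evaluated with NumPy on made-up values (clip_param fixed at 0.2, four sample actions):

import numpy as np

clip_param = 0.2
ratio = np.array([0.5, 0.9, 1.0, 1.3])   # pnew / pold per sampled action (illustrative values)
atarg = np.array([1.0, 1.0, -1.0, 1.0])  # advantage estimates (illustrative values)

surr1 = ratio * atarg
surr2 = np.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
pol_surr = -np.mean(np.minimum(surr1, surr2))  # same form as the TF loss above
print(surr1, surr2, pol_surr)                  # pol_surr == -0.4 for these values

Taking the elementwise minimum keeps the objective a lower bound: ratio changes beyond the clip range earn no extra credit, while changes that make the surrogate worse are never clipped away.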
Example #28
0
    def __init__(self, env, policy,
                 emb_network, emb_size,
                 clip_param, entcoeff, # clipping parameter epsilon, entropy coeff
                 optim_epochs, optim_stepsize, optim_batchsize,# optimization hypers
                 gamma, lam, # advantage estimation
                 max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
                 adam_epsilon=1e-5,
                 schedule='constant',
                 joint_training=False
                 ):
        # Setup variables
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.gamma = gamma
        self.lam = lam
        self.max_timesteps = max_timesteps
        self.adam_epsilon = adam_epsilon
        self.schedule = schedule

        # Setup losses and stuff
        # ----------------------------------------
        with tf.name_scope('ppo'):
            ob_space = env.observation_space
            ac_space = env.action_space
            self.pi = policy # Construct network for new policy
            oldpi = Policy("old_policy", env.action_space, joint_training, emb_size, emb_network) # Network for old policy
            atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
            ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

            lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[]) # learning rate multiplier, updated with schedule
            clip_param = clip_param * lrmult # Annealed clipping parameter epsilon

            # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None] + list(ob_space.shape))
            if joint_training:
                ob = U.get_placeholder_cached(name="ob_f")
            else:
                ob = U.get_placeholder_cached(name="ob")
            ac = self.pi.pdtype.sample_placeholder([None])

            kloldnew = oldpi.pd.kl(self.pi.pd)
            ent = self.pi.pd.entropy()
            meankl = U.mean(kloldnew)
            meanent = U.mean(ent)
            pol_entpen = (-entcoeff) * meanent

            ratio = tf.exp(self.pi.pd.logp(ac) - oldpi.pd.logp(ac)) # pnew / pold
            surr1 = ratio * atarg # surrogate from conservative policy iteration
            surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg #
            pol_surr = - U.mean(tf.minimum(surr1, surr2)) # PPO's pessimistic surrogate (L^CLIP)
            vf_loss = U.mean(tf.square(self.pi.vpred - ret))
            self.total_loss = pol_surr + pol_entpen + vf_loss
            losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
            self.loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

            var_list = self.pi.get_trainable_variables()
            self.lossandgrad = U.function([ob, ac, atarg, ret, lrmult], losses + [U.flatgrad(self.total_loss, var_list)])
            self.adam = MpiAdam(var_list, epsilon=adam_epsilon)

            self.assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(oldpi.get_variables(), self.pi.get_variables())])
            self.compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

            U.initialize()
            self.adam.sync()

        # Prepare for rollouts
        # ----------------------------------------
        self.episodes_so_far = 0
        self.timesteps_so_far = 0
        self.iters_so_far = 0
        self.tstart = time.time()
        self.lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
        self.rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards
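add_vtarg_and_adv(seg, gamma, lam) is called in every training loop above but is not shown in these snippets. A sketch of the GAE(lambda) computation it is assumed to perform, following the standard recipe and using the same seg keys that appear above (rew, vpred, new, nextvpred):

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # Generalized Advantage Estimation:
    #   delta_t = r_t + gamma * V(s_{t+1}) * nonterminal - V(s_t)
    #   adv_t   = delta_t + gamma * lam * nonterminal * adv_{t+1}
    # with nonterminal = 0 at episode boundaries so credit does not leak across episodes.
    new = np.append(seg["new"], 0)                     # episode-start flags, one extra slot
    vpred = np.append(seg["vpred"], seg["nextvpred"])  # bootstrap value for the final step
    T = len(seg["rew"])
    seg["adv"] = gaelam = np.empty(T, "float32")
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["tdlamret"] = seg["adv"] + seg["vpred"]        # lambda-return used as the value target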
Example #29
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        sym_loss_weight=0.0,
        return_threshold=None,  # terminate learning if it reaches return_threshold
        op_after_init=None,
        init_policy_params=None,
        policy_scope=None,
        max_threshold=None,
        positive_rew_enforce=False,
        reward_drop_bound=None,
        min_iters=0,
        ref_policy_params=None,
        rollout_length_thershold=None):

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    if policy_scope is None:
        pi = policy_func("pi", ob_space,
                         ac_space)  # Construct network for new policy
        oldpi = policy_func("oldpi", ob_space,
                            ac_space)  # Network for old policy
    else:
        pi = policy_func(policy_scope, ob_space,
                         ac_space)  # Construct network for new policy
        oldpi = policy_func("old" + policy_scope, ob_space,
                            ac_space)  # Network for old policy

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    sym_loss = sym_loss_weight * U.mean(
        tf.square(pi.mean - pi.mirrored_mean))  # mirror symmetric loss
    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2)) + sym_loss  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent, sym_loss]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent", "sym_loss"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()

    if init_policy_params is not None:
        cur_scope = pi.get_variables()[0].name[0:pi.get_variables()[0].name.
                                               find('/')]
        orig_scope = list(init_policy_params.keys()
                          )[0][0:list(init_policy_params.keys())[0].find('/')]
        for i in range(len(pi.get_variables())):
            assign_op = pi.get_variables()[i].assign(
                init_policy_params[pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)
            assign_op = oldpi.get_variables()[i].assign(
                init_policy_params[pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)

    if ref_policy_params is not None:
        ref_pi = policy_func("ref_pi", ob_space, ac_space)
        cur_scope = ref_pi.get_variables()[0].name[0:ref_pi.get_variables()[0].
                                                   name.find('/')]
        orig_scope = list(ref_policy_params.keys()
                          )[0][0:list(ref_policy_params.keys())[0].find('/')]
        for i in range(len(ref_pi.get_variables())):
            assign_op = ref_pi.get_variables()[i].assign(
                ref_policy_params[ref_pi.get_variables()[i].name.replace(
                    cur_scope, orig_scope, 1)])
            U.get_session().run(assign_op)
        env.env.env.ref_policy = ref_pi

    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    max_thres_satisfied = max_threshold is None
    adjust_ratio = 0.0
    prev_avg_rew = -1000000
    revert_parameters = {}
    variables = pi.get_variables()
    for i in range(len(variables)):
        cur_val = variables[i].eval()
        revert_parameters[variables[i].name] = cur_val
    revert_data = [0, 0, 0]
    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        if reward_drop_bound is not None:
            lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews = map(flatten_lists, zip(*listoflrpairs))
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
            revert_iteration = False
            if np.mean(
                    rewbuffer
            ) < prev_avg_rew - reward_drop_bound:  # detect significant drop in performance, revert to previous iteration
                print("Revert Iteration!!!!!")
                revert_iteration = True
            else:
                prev_avg_rew = np.mean(rewbuffer)
            logger.record_tabular("Revert Rew", prev_avg_rew)
            if revert_iteration:  # revert iteration
                for i in range(len(pi.get_variables())):
                    assign_op = pi.get_variables()[i].assign(
                        revert_parameters[pi.get_variables()[i].name])
                    U.get_session().run(assign_op)
                episodes_so_far = revert_data[0]
                timesteps_so_far = revert_data[1]
                iters_so_far = revert_data[2]
                continue
            else:
                variables = pi.get_variables()
                for i in range(len(variables)):
                    cur_val = variables[i].eval()
                    revert_parameters[variables[i].name] = np.copy(cur_val)
                revert_data[0] = episodes_so_far
                revert_data[1] = timesteps_so_far
                revert_data[2] = iters_so_far

        if positive_rew_enforce:
            rewlocal = (seg["pos_rews"], seg["neg_pens"], seg["rew"]
                        )  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            pos_rews, neg_pens, rews = map(flatten_lists, zip(*listofrews))
            if np.mean(rews) < 0.0:
                #min_id = np.argmin(rews)
                #adjust_ratio = pos_rews[min_id]/np.abs(neg_pens[min_id])
                adjust_ratio = np.max([
                    adjust_ratio,
                    np.mean(pos_rews) / np.abs(np.mean(neg_pens))
                ])
                for i in range(len(seg["rew"])):
                    if np.abs(seg["rew"][i] - seg["pos_rews"][i] -
                              seg["neg_pens"][i]) > 1e-5:
                        print(seg["rew"][i], seg["pos_rews"][i],
                              seg["neg_pens"][i])
                        print('Reward wrong!')
                        raise ValueError('reward decomposition mismatch')
                    seg["rew"][i] = seg["pos_rews"][
                        i] + seg["neg_pens"][i] * adjust_ratio
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))
        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        if reward_drop_bound is None:
            lenbuffer.extend(lens)
            rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        logger.record_tabular("Iter", iters_so_far)
        if positive_rew_enforce:
            if adjust_ratio is not None:
                logger.record_tabular("RewardAdjustRatio", adjust_ratio)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        if max_threshold is not None:
            print('Current max return: ', np.max(rewbuffer))
            if np.max(rewbuffer) > max_threshold:
                max_thres_satisfied = True
            else:
                max_thres_satisfied = False

        return_threshold_satisfied = True
        if return_threshold is not None:
            if not (np.mean(rewbuffer) > return_threshold
                    and iters_so_far > min_iters):
                return_threshold_satisfied = False
        rollout_length_thershold_satisfied = True
        if rollout_length_thershold is not None:
            rewlocal = (seg["avg_vels"], seg["rew"])  # local values
            listofrews = MPI.COMM_WORLD.allgather(rewlocal)  # list of tuples
            avg_vels, rews = map(flatten_lists, zip(*listofrews))
            if not (np.mean(lenbuffer) > rollout_length_thershold
                    and np.mean(avg_vels) > 0.5 * env.env.env.final_tv):
                rollout_length_thershold_satisfied = False
        if rollout_length_thershold is not None or return_threshold is not None:
            if rollout_length_thershold_satisfied and return_threshold_satisfied:
                break

    return pi, np.mean(rewbuffer)
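The Dataset object used for minibatching in these loops (d.iterate_once(optim_batchsize), plus the .n and .data_map attributes touched in the option-critic example further down) is also external. A simplified sketch of the behavior the code relies on, assuming a shuffle-then-slice iterator over a dict of equal-length arrays:

import numpy as np

class Dataset(object):
    # Minimal minibatch iterator over a dict of equal-length numpy arrays.
    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        idx = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(idx)
        for start in range(0, self.n, batch_size):
            sel = idx[start:start + batch_size]
            yield {k: v[sel] for k, v in self.data_map.items()}

The actual baselines class has a few more details (e.g. handling of the final partial batch), but this is the contract the optimization loops above depend on.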
Example #30
0
def learn(env, policy_func, *,
        timesteps_per_batch, # what to train on
        max_kl, cg_iters,
        gamma, lam, # advantage estimation
        entcoeff=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters =3,
        max_timesteps=0, max_episodes=0, max_iters=0,  # time constraint
        callback=None
        ):
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)    
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    entbonus = entcoeff * meanent

    vferr = U.mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold
    surrgain = U.mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start+sz], shape))
        start += sz
    gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv)
        for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta'))
        else:
            yield
    
    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards

    assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1

    while True:        
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************"%iters_so_far)

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]
        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new() # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0)
            assert np.isfinite(stepdir).all()
            shs = .5*stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples
                assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), 
                include_final_partial_batch=False, batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        if rank==0:
            logger.dump_tabular()
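The TRPO example above never forms the Fisher matrix explicitly: cg(fisher_vector_product, g, cg_iters=...) solves F x = g using only Fisher-vector products. The cg helper is not shown; a sketch of a plain conjugate-gradient solver with that calling convention:

import numpy as np

def cg(f_Ax, b, cg_iters=10, verbose=False, residual_tol=1e-10):
    # Solve A x = b for symmetric positive-definite A, given only the map p -> A p.
    x = np.zeros_like(b)
    r = b.copy()          # residual b - A x (x starts at zero)
    p = r.copy()          # current search direction
    rdotr = r.dot(r)
    for i in range(cg_iters):
        z = f_Ax(p)
        alpha = rdotr / p.dot(z)
        x += alpha * p
        r -= alpha * z
        newrdotr = r.dot(r)
        if verbose:
            print("cg iter %d  residual norm^2 %g" % (i, newrdotr))
        if newrdotr < residual_tol:
            break
        p = r + (newrdotr / rdotr) * p
        rdotr = newrdotr
    return x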
Example #31
0
def learn(
        env,
        policy_func,
        *,
        timesteps_per_batch,  # timesteps per actor per update
        log_every=None,
        log_dir=None,
        episodes_so_far=0,
        timesteps_so_far=0,
        iters_so_far=0,
        clip_param,
        entcoeff,  # clipping parameter epsilon, entropy coeff
        optim_epochs,
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        max_timesteps=0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        **kwargs):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    # Target advantage function (if applicable)
    atarg = tf.placeholder(dtype=tf.float32, shape=[None])
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # learning rate multiplier, updated with schedule
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    # GRASPING
    saver = tf.train.Saver(var_list=U.ALREADY_INITIALIZED, max_to_keep=1)
    checkpoint = tf.train.latest_checkpoint(log_dir)
    if checkpoint:
        print("Restoring checkpoint: {}".format(checkpoint))
        saver.restore(U.get_session(), checkpoint)
    if hasattr(env, "set_actor"):

        def actor(obs):
            return pi.act(False, obs)[0]

        env.set_actor(actor)
    if not checkpoint and hasattr(env, "warm_init_eps"):
        pretrain(pi, env)
        saver.save(U.get_session(), osp.join(log_dir, "model"))
    # /GRASPING
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    tstart = time.time()

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        should_break = False
        if max_timesteps and timesteps_so_far >= max_timesteps:
            should_break = True
        elif max_episodes and episodes_so_far >= max_episodes:
            should_break = True
        elif max_iters and iters_so_far >= max_iters:
            should_break = True
        elif max_seconds and time.time() - tstart >= max_seconds:
            should_break = True

        if log_every and log_dir:
            if (iters_so_far + 1) % log_every == 0 or should_break:
                # To reduce space, don't specify global step.
                saver.save(U.get_session(), osp.join(log_dir, "model"))

            job_info = {
                'episodes_so_far': episodes_so_far,
                'iters_so_far': iters_so_far,
                'timesteps_so_far': timesteps_so_far
            }
            with open(osp.join(log_dir, "job_info_new.yaml"), 'w') as file:
                yaml.dump(job_info, file, default_flow_style=False)
                # Make sure write is instantaneous.
                file.flush()
                os.fsync(file)
            os.rename(osp.join(log_dir, "job_info_new.yaml"),
                      osp.join(log_dir, "job_info.yaml"))

        if should_break:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-10)  # standardized advantage function estimate
        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            # logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            newlosses = compute_losses(batch["ob"], batch["ac"],
                                       batch["atarg"], batch["vtarg"],
                                       cur_lrmult)
            losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        logger.record_tabular("EpLenMean", np.mean(lens))
        logger.record_tabular("EpRewMean", np.mean(rews))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()
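mpi_moments(losses, axis=0), used above to average the evaluation losses, pools the loss rows contributed by every MPI rank and returns their mean and standard deviation. A simplified sketch of that reduction with mpi4py (not the exact baselines implementation; the helper name is illustrative):

import numpy as np
from mpi4py import MPI

def mpi_moments_sketch(local_rows, comm=MPI.COMM_WORLD):
    # Pool rows across all ranks by summing count, sum and sum-of-squares,
    # then derive the global mean and std from those sufficient statistics.
    x = np.asarray(local_rows, dtype="float64")
    count = np.array([float(x.shape[0])])
    total = x.sum(axis=0)
    sqtotal = np.square(x).sum(axis=0)
    gcount = np.zeros_like(count)
    gtotal = np.zeros_like(total)
    gsqtotal = np.zeros_like(sqtotal)
    comm.Allreduce(count, gcount, op=MPI.SUM)
    comm.Allreduce(total, gtotal, op=MPI.SUM)
    comm.Allreduce(sqtotal, gsqtotal, op=MPI.SUM)
    mean = gtotal / gcount
    std = np.sqrt(np.maximum(gsqtotal / gcount - np.square(mean), 0.0))
    return mean, std, int(gcount[0])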
Example #32
0
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    num_options=1,
    app='',
    saves=False,
    wsaves=False,
    epoch=0,
    seed=1,
    dc=0,
    plots=False,
    w_intfc=True,
    switch=False,
    intlr=1e-4,
    piolr=1e-4,
    fewshot=False,
):

    optim_batchsize_ideal = optim_batchsize
    np.random.seed(seed)
    tf.set_random_seed(seed)

    ### Book-keeping
    if hasattr(env, 'NAME'):
        gamename = env.NAME.lower()
    else:
        gamename = env.spec.id[:-3].lower()

    gamename += 'seed' + str(seed)
    gamename += app

    dirname = '{}_{}opts_saves/'.format(gamename, num_options)

    if wsaves:
        first = True
        if not os.path.exists(dirname):
            os.makedirs(dirname)
            first = False
        # while os.path.exists(dirname) and first:
        #     dirname += '0'

        files = ['pposgd_simple.py', 'cnn_policy.py', 'run_miniw.py']
        for i in range(len(files)):
            src = os.path.expanduser(
                '~/baselines_intfc/baselines/ppoc_int/') + files[i]
            dest = os.path.expanduser(
                '~/baselines_intfc/baselines/ppoc_int/') + dirname
            shutil.copy2(src, dest)
    ###

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space,
                     ac_space)  # Construct network for new policy
    oldpi = policy_func("oldpi", ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    # option = tf.placeholder(dtype=tf.int32, shape=[None])

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    option = U.get_placeholder_cached(name="option")
    term_adv = U.get_placeholder(name='term_adv',
                                 dtype=tf.float32,
                                 shape=[None])
    op_adv = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    betas = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = U.mean(kloldnew)
    meanent = U.mean(ent)
    pol_entpen = (-entcoeff) * meanent

    ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg  #
    pol_surr = -U.mean(tf.minimum(
        surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)

    vf_loss = U.mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    term_loss = pi.tpred * term_adv

    # pi_w = tf.stop_gradient(pi.op_pi)
    pi_w = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    option_hot = tf.one_hot(option, depth=num_options)
    pi_I = pi.intfc * pi_w / tf.expand_dims(
        tf.reduce_sum(pi.intfc * pi_w, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    int_loss = -tf.reduce_sum(
        betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    intfc = tf.placeholder(dtype=tf.float32, shape=[None, num_options])
    pi_I = intfc * pi.op_pi / tf.expand_dims(
        tf.reduce_sum(intfc * pi.op_pi, axis=1), 1)
    pi_I = tf.clip_by_value(pi_I, 1e-6, 1 - 1e-6)
    op_loss = -tf.reduce_sum(
        betas * tf.reduce_sum(pi_I * option_hot, axis=1) * op_adv)

    log_pi = tf.log(tf.clip_by_value(pi.op_pi, 1e-20, 1.0))
    op_entropy = -tf.reduce_mean(pi.op_pi * log_pi, reduction_indices=1)
    op_loss -= 0.01 * tf.reduce_sum(op_entropy)

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult, option],
                             losses + [U.flatgrad(total_loss, var_list)])
    lossandgrad_vf = U.function([ob, ac, atarg, ret, lrmult, option],
                                losses + [U.flatgrad(vf_loss, var_list)])
    termgrad = U.function([ob, option, term_adv],
                          [U.flatgrad(term_loss, var_list)
                           ])  # Since we will use a different step size.
    opgrad = U.function([ob, option, betas, op_adv, intfc],
                        [U.flatgrad(op_loss, var_list)
                         ])  # Since we will use a different step size.
    intgrad = U.function([ob, option, betas, op_adv, pi_w],
                         [U.flatgrad(int_loss, var_list)
                          ])  # Since we will use a different step size.
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg, ret, lrmult, option], losses)

    U.initialize()
    adam.sync()

    saver = tf.train.Saver(max_to_keep=10000)

    ### More book-keeping
    results = []
    if saves:
        directory_res = "res_switch150/learnpio/lr{}/".format(
            optim_stepsize) if not fewshot else "res_fewshot/lr{}/".format(
                optim_stepsize)
        if not os.path.exists(directory_res):
            os.makedirs(directory_res)
        if w_intfc:
            results = open(
                directory_res + gamename +
                '_intfc{}_intlr{}_piolr{}'.format(int(w_intfc), intlr, piolr) +
                '_' + str(num_options) + 'opts.csv', 'w')
        else:
            results = open(
                directory_res + gamename +
                '_intfc{}_piolr{}'.format(int(w_intfc), piolr) + '_' +
                str(num_options) + 'opts.csv', 'w')
        out = 'epoch,avg_reward'

        # for opt in range(num_options): out += ',option {} dur'.format(opt)
        # # for opt in range(num_options): out += ',option {} std'.format(opt)
        # for opt in range(num_options): out += ',option {} term'.format(opt)
        # for opt in range(num_options): out += ',option {} adv'.format(opt)
        out += '\n'
        results.write(out)
        # results.write('epoch,avg_reward,option 1 dur, option 2 dur, option 1 term, option 2 term\n')
        results.flush()

    if epoch:

        dirname = '{}_{}opts_saves/'.format(gamename, num_options)
        print("Loading weights from iteration: " + str(epoch))

        filename = dirname + '{}_epoch_{}.ckpt'.format(gamename, epoch)
        saver.restore(U.get_session(), filename)
    ###

    episodes_so_far = 0
    timesteps_so_far = 0
    global iters_so_far
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     num_options=num_options,
                                     saves=saves,
                                     results=results,
                                     rewbuffer=rewbuffer,
                                     dc=dc,
                                     epoch=epoch,
                                     seed=seed,
                                     plots=plots,
                                     w_intfc=w_intfc,
                                     switch=switch)

    datas = [0 for _ in range(num_options)]

    while True:
        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam, num_options)

        opt_d = []
        for i in range(num_options):
            dur = np.mean(
                seg['opt_dur'][i]) if len(seg['opt_dur'][i]) > 0 else 0.
            opt_d.append(dur)

        print("mean opt dur:", opt_d)
        print("mean op probs:", np.mean(np.array(seg['op_probs']), axis=0))
        print("mean term p:", np.mean(np.array(seg['term_p']), axis=0))
        print("mean vpreds:", np.mean(np.array(seg['vpred']), axis=0))

        ob, ac, opts, atarg, tdlamret = seg["ob"], seg["ac"], seg["opts"], seg[
            "adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy
        assign_old_eq_new()  # set old parameter values to new parameter values

        if iters_so_far % 5 == 0 and wsaves:
            print("weights are saved...")
            filename = dirname + '{}_epoch_{}.ckpt'.format(
                gamename, iters_so_far)
            save_path = saver.save(U.get_session(), filename)

        min_batch = 160  # Arbitrary; this is the main issue for multi-option training (and for options in general)
        t_advs = [[] for _ in range(num_options)]
        for opt in range(num_options):
            indices = np.where(opts == opt)[0]
            print("batch size:", indices.size)
            opt_d[opt] = indices.size
            if not indices.size:
                t_advs[opt].append(0.)
                continue

            ########## This part is only necessary when we use options. We proceed to these verifications in order not to discard any collected trajectories.
            if datas[opt] != 0:
                if (indices.size < min_batch and datas[opt].n > min_batch):
                    datas[opt] = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue
                    # The previous dataset has already been trained on (datas[opt].n > min_batch), so we replace it,
                    # and continue without training, as indices.size is too small (indices.size < min_batch).
                    # Too small a dataset causes divergence.

                ##################################################
                elif indices.size + datas[opt].n < min_batch:
                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = Dataset(dict(ob=cat_ob,
                                              ac=cat_ac,
                                              atarg=cat_atarg,
                                              vtarg=cat_vtarg),
                                         shuffle=not pi.recurrent)
                    t_advs[opt].append(0.)
                    continue
                    # The previous dataset hasn't been trained on (datas[opt].n < min_batch), so we concatenate with new samples.
                    # The combination of both (indices.size + datas[opt].n < min_batch) is still insufficient, so we skip training.
                    # Too small a dataset causes divergence.

                ###################################################
                elif (indices.size + datas[opt].n > min_batch
                      and datas[opt].n < min_batch):
                    oldmap = datas[opt].data_map
                    cat_ob = np.concatenate((oldmap['ob'], ob[indices]))
                    cat_ac = np.concatenate((oldmap['ac'], ac[indices]))
                    cat_atarg = np.concatenate(
                        (oldmap['atarg'], atarg[indices]))
                    cat_vtarg = np.concatenate(
                        (oldmap['vtarg'], tdlamret[indices]))
                    datas[opt] = d = Dataset(dict(ob=cat_ob,
                                                  ac=cat_ac,
                                                  atarg=cat_atarg,
                                                  vtarg=cat_vtarg),
                                             shuffle=not pi.recurrent)
                    # The previous dataset hasn't been trained on (datas[opt].n < min_batch), so we concatenate with new samples.
                    # The combination of both (indices.size + datas[opt].n > min_batch) is sufficient for training.

                ##################################################
                if (indices.size > min_batch and datas[opt].n > min_batch):
                    datas[opt] = d = Dataset(dict(ob=ob[indices],
                                                  ac=ac[indices],
                                                  atarg=atarg[indices],
                                                  vtarg=tdlamret[indices]),
                                             shuffle=not pi.recurrent)
                    # The previous dataset has already been trained on (datas[opt].n > min_batch), so we replace it.
                    # The new samples are numerous enough (indices.size > min_batch), so we use them for training.

                ##################################################
            elif datas[opt] == 0:
                datas[opt] = d = Dataset(dict(ob=ob[indices],
                                              ac=ac[indices],
                                              atarg=atarg[indices],
                                              vtarg=tdlamret[indices]),
                                         shuffle=not pi.recurrent)
                # This branch is only reached on the very first iteration, before datas[opt] has been populated.
            #########

            optim_batchsize = optim_batchsize or ob.shape[0]
            # Note: with identical lower and upper bounds, this np.clip always evaluates to 10 when num_options > 1.
            optim_epochs = np.clip(
                np.int(10 * (indices.size /
                             (timesteps_per_batch / num_options))), 10,
                10) if num_options > 1 else optim_epochs
            print("optim epochs:", optim_epochs)
            logger.log("Optimizing...")

            # Here we do a bunch of optimization epochs over the data
            for _ in range(optim_epochs):
                losses = [
                ]  # list of tuples, each of which gives the loss for a minibatch
                for batch in d.iterate_once(optim_batchsize):

                    # tadv,nodc_adv = pi.get_term_adv(batch["ob"],[opt])
                    # tadv = tadv if num_options > 1 else np.zeros_like(tadv)
                    # t_advs[opt].append(nodc_adv)
                    if iters_so_far < 150 or not fewshot:
                        *newlosses, grads = lossandgrad(
                            batch["ob"], batch["ac"], batch["atarg"],
                            batch["vtarg"], cur_lrmult, [opt])
                        adam.update(grads, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    else:
                        *newlosses, grads = lossandgrad_vf(
                            batch["ob"], batch["ac"], batch["atarg"],
                            batch["vtarg"], cur_lrmult, [opt])
                        adam.update(grads, optim_stepsize * cur_lrmult)
                        losses.append(newlosses)

        if iters_so_far < 150 or not fewshot:
            termg = termgrad(seg["ob"], seg['opts'], seg["op_adv"])[0]
            adam.update(termg, 5e-7)

            if w_intfc:
                intgrads = intgrad(seg['ob'], seg['opts'], seg["last_betas"],
                                   seg["op_adv"], seg["op_probs"])[0]
                adam.update(intgrads, intlr)

        opgrad = intgrad(seg['ob'], seg['opts'], seg["last_betas"],
                         seg["op_adv"], seg["intfc"])[0]
        adam.update(opgrad, piolr)
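        # Besides the intra-option PPO step above, three auxiliary gradients are applied
        # here (this reading is inferred from the variable names, not stated in the source):
        # termgrad updates the option termination functions with a tiny fixed step (5e-7),
        # intgrad updates the interest functions (step intlr) when w_intfc is set, and the
        # final intgrad call is reused as the policy-over-options gradient (step piolr).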

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        ### Bookkeeping
        if saves:
            out = "{},{}"
            # for _ in range(num_options): out+=",{},{},{}"
            out += "\n"
            # pdb.set_trace()

            info = [iters_so_far, np.mean(rewbuffer)]

            results.write(out.format(*info))
            results.flush()
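The branch chain above amounts, roughly, to a per-option batching rule: an option's samples are held until at least min_batch of them have accumulated, the combined pool is trained on once that threshold is reached, and a fresh batch that is already large enough simply replaces whatever was held. Below is a simplified, self-contained sketch of that rule; accumulate_option_batch, make_batch, and the plain dicts of NumPy arrays are illustrative stand-ins, not names from the example.

import numpy as np


def accumulate_option_batch(buffers, opt, new_batch, min_batch):
    """Return (dataset_to_train_on or None, updated buffers) for a single option.

    buffers[opt] holds samples collected in earlier iterations that were never
    trained on because fewer than min_batch of them were available.
    """
    held = buffers.get(opt)
    n_new = len(new_batch["ob"])
    n_held = 0 if held is None else len(held["ob"])

    if n_new >= min_batch:
        # Enough fresh samples on their own: train on them and drop the held ones.
        buffers[opt] = new_batch
        return new_batch, buffers

    # Otherwise merge the held and fresh samples, field by field.
    merged = new_batch if held is None else {
        key: np.concatenate((held[key], new_batch[key])) for key in new_batch
    }
    buffers[opt] = merged
    if n_held + n_new >= min_batch:
        return merged, buffers  # the combined pool is finally large enough
    return None, buffers        # still too small: skip training this round


def make_batch(n):
    return {"ob": np.zeros(n), "ac": np.zeros(n), "atarg": np.zeros(n), "vtarg": np.zeros(n)}


buffers = {}
for n in (60, 70, 50):  # 60 -> skip, 60 + 70 = 130 -> skip, 130 + 50 = 180 -> train
    dataset, buffers = accumulate_option_batch(buffers, opt=2, new_batch=make_batch(n), min_batch=160)
    print("train" if dataset is not None else "skip", len(buffers[2]["ob"]))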
Example #33
0
    def __init__(self, a_name, env, policy_func, par):

        self.env = env
        self.timesteps_per_batch = par.timesteps_per_batch
        self.max_kl = par.max_kl
        self.cg_iters = par.cg_iters
        self.gamma = par.gamma
        self.lam = par.lam  # advantage estimation
        self.entcoeff = par.entcoeff
        self.cg_damping = par.cg_damping
        self.vf_stepsize = par.vf_stepsize
        self.vf_iters = par.vf_iters
        self.max_timesteps = par.max_timesteps
        self.max_episodes = par.max_episodes
        self.max_iters = par.max_iters
        self.callback = par.callback  # you can do anything in the callback, since it takes locals(), globals()

        self.nworkers = MPI.COMM_WORLD.Get_size()
        self.rank = MPI.COMM_WORLD.Get_rank()
        np.set_printoptions(precision=3)
        # Setup losses and stuff
        # ----------------------------------------
        self.ob_space = self.env.observation_space
        self.ac_space = self.env.action_space
        self.pi = policy_func(a_name, self.ob_space, self.ac_space)
        self.oldpi = policy_func("oldpi" + a_name, self.ob_space,
                                 self.ac_space)
        self.atarg = tf.placeholder(
            dtype=tf.float32,
            shape=[None])  # Target advantage function (if applicable)
        self.ret = tf.placeholder(dtype=tf.float32,
                                  shape=[None])  # Empirical return

        self.ob = U.get_placeholder_cached(name="ob" +
                                           str(TRPO_agent_new.index2))
        self.ac = self.pi.pdtype.sample_placeholder([None])

        self.kloldnew = self.oldpi.pd.kl(self.pi.pd)
        self.ent = self.pi.pd.entropy()
        meankl = U.mean(self.kloldnew)
        meanent = U.mean(self.ent)
        entbonus = self.entcoeff * meanent

        self.vferr = U.mean(tf.square(self.pi.vpred - self.ret))

        ratio = tf.exp(self.pi.pd.logp(self.ac) -
                       self.oldpi.pd.logp(self.ac))  # pnew / pold
        surrgain = U.mean(ratio * self.atarg)

        optimgain = surrgain + entbonus
        self.losses = [optimgain, meankl, entbonus, surrgain, meanent]
        self.loss_names = [
            "optimgain", "meankl", "entloss", "surrgain", "entropy"
        ]

        self.dist = meankl

        all_var_list = self.pi.get_trainable_variables()

        var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("pol")
        ]
        vf_var_list = [
            v for v in all_var_list if v.name.split("/")[1].startswith("vf")
        ]
        self.vfadam = MpiAdam(vf_var_list)

        self.get_flat = U.GetFlat(var_list)
        self.set_from_flat = U.SetFromFlat(var_list)
        self.klgrads = tf.gradients(self.dist, var_list)
        self.flat_tangent = tf.placeholder(dtype=tf.float32,
                                           shape=[None],
                                           name="flat_tan" +
                                           str(TRPO_agent_new.index2))

        shapes = [var.get_shape().as_list() for var in var_list]
        start = 0
        self.tangents = []
        for shape in shapes:
            sz = U.intprod(shape)
            self.tangents.append(
                tf.reshape(self.flat_tangent[start:start + sz], shape))
            start += sz

        self.gvp = tf.add_n([
            U.sum(g * tangent)
            for (g, tangent) in zipsame(self.klgrads, self.tangents)
        ])  #pylint: disable=E1111
        self.fvp = U.flatgrad(self.gvp, var_list)

        self.assign_old_eq_new = U.function(
            [], [],
            updates=[
                tf.assign(oldv, newv) for (oldv, newv) in zipsame(
                    self.oldpi.get_variables(), self.pi.get_variables())
            ])

        self.compute_losses = U.function([self.ob, self.ac, self.atarg],
                                         self.losses)
        self.compute_lossandgrad = U.function(
            [self.ob, self.ac, self.atarg],
            self.losses + [U.flatgrad(optimgain, var_list)])
        self.compute_fvp = U.function(
            [self.flat_tangent, self.ob, self.ac, self.atarg], self.fvp)
        self.compute_vflossandgrad = U.function([self.ob, self.ret],
                                                U.flatgrad(
                                                    self.vferr, vf_var_list))

        TRPO_agent_new.index2 += 1
        U.initialize()
        self.th_init = self.get_flat()
        MPI.COMM_WORLD.Bcast(self.th_init, root=0)
        self.set_from_flat(self.th_init)
        self.vfadam.sync()
        print("Init param sum", self.th_init.sum(), flush=True)
def learn(
    env,
    policy_func,
    *,
    timesteps_per_batch,  # timesteps per actor per update
    clip_param,
    entcoeff,  # clipping parameter epsilon, entropy coeff
    optim_epochs,
    optim_stepsize,
    optim_batchsize,  # optimization hypers
    gamma,
    lam,  # advantage estimation
    max_timesteps=0,
    max_episodes=0,
    max_iters=0,
    max_seconds=0,  # time constraint
    callback=None,  # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5,
    schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
    load_model_path,
    test_only,
    stochastic,
    symmetric_training=False,
    obs_names=None,
    single_episode=False,
    horizon_hack=False,
    running_avg_len=100,
    init_three=False,
    actions=None,
    symmetric_training_trick=False,
    seeds_fn=None,
    bootstrap_seeds=False,
):
    global seeds
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi", ob_space, ac_space)  # Network for new policy
    old_pi = policy_func("old_pi", ob_space,
                         ac_space)  # Network for old policy
    adv_targ = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    mask = tf.placeholder(dtype=tf.bool, shape=[None])  # Mask for the trick

    lr_mult = tf.placeholder(
        name='lr_mult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lr_mult  # Annealed clipping parameter epsilon

    ob = U.get_placeholder_cached(name="ob")
    st = U.get_placeholder_cached(name="st")
    ac = pi.pdtype.sample_placeholder([None])

    kl = old_pi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    mean_kl = U.mean(tf.boolean_mask(kl, mask))  # Mean over the batch
    mean_ent = U.mean(tf.boolean_mask(ent, mask))
    entropy_penalty = -entcoeff * mean_ent

    ratio = tf.exp(pi.pd.logp(ac) - old_pi.pd.logp(ac))  # pi_new / pi_old
    surr_1 = ratio * adv_targ  # surrogate from conservative policy iteration
    surr_2 = U.clip(ratio, 1.0 - clip_param, 1.0 + clip_param) * adv_targ  # clipped surrogate
    surr_loss = -U.mean(tf.boolean_mask(
        tf.minimum(surr_1, surr_2),
        mask))  # PPO's pessimistic surrogate (L^CLIP), mean over the batch
    vf_loss = U.mean(tf.boolean_mask(tf.square(pi.vpred - ret), mask))
    total_loss = surr_loss + entropy_penalty + vf_loss
    losses = [surr_loss, entropy_penalty, vf_loss, mean_kl, mean_ent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]
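    # Worked illustration of the clip (not from the source): with clip_param = 0.2 and a
    # positive advantage A, a ratio of 1.5 contributes min(1.5 * A, 1.2 * A) = 1.2 * A,
    # so the objective gains nothing from pushing the policy beyond the clip range;
    # with A < 0 the min keeps the unclipped 1.5 * A, preserving the full penalty.
    # That asymmetry is what makes L^CLIP a pessimistic (lower) bound.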

    var_list = pi.get_trainable_variables()
    comp_loss_and_grad = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask],
                                    losses +
                                    [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(old_v, new_v)
            for (old_v,
                 new_v) in zipsame(old_pi.get_variables(), pi.get_variables())
        ])
    comp_loss = U.function([ob, st, ac, adv_targ, ret, lr_mult, mask], losses)

    if init_three:
        assign_init_three_1 = U.function(
            [], [],
            updates=[
                tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(
                    pi.get_orig_variables(), pi.get_part_variables(1))
            ])
        assign_init_three_2 = U.function(
            [], [],
            updates=[
                tf.assign(new_v, old_v) for (old_v, new_v) in zipsame(
                    pi.get_orig_variables(), pi.get_part_variables(2))
            ])

    U.initialize()
    if load_model_path is not None:
        U.load_state(load_model_path)
        if init_three:
            assign_init_three_1()
            assign_init_three_2()
    adam.sync()

    if seeds_fn is not None:
        with open(seeds_fn) as f:
            seeds = [int(seed) for seed in f.readlines()]
    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=stochastic,
                                     single_episode=test_only
                                     or single_episode,
                                     actions=actions,
                                     bootstrap_seeds=bootstrap_seeds)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    len_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for episode lengths
    rew_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for episode rewards
    origrew_buffer = deque(
        maxlen=running_avg_len)  # rolling buffer for original episode rewards

    assert sum(
        [max_iters > 0, max_timesteps > 0, max_episodes > 0,
         max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam, horizon_hack=horizon_hack)
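        # add_vtarg_and_adv presumably fills seg["adv"] and seg["tdlamret"] with
        # GAE(gamma, lam) estimates:
        #   delta_t  = r_t + gamma * V(s_{t+1}) - V(s_t)
        #   adv_t    = sum_k (gamma * lam)^k * delta_{t+k}
        #   tdlamret = adv + V(s_t), the lambda-return used as the value-function target.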

        # ob, ac, adv_targ, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, st, ac, adv_targ, tdlamret = seg["ob"], seg["step"], seg[
            "ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update

        if symmetric_training_trick:
            first_75 = st < 75
            mask = ~np.concatenate((np.zeros_like(first_75), first_75))
        else:
            mask = np.concatenate(
                (np.ones_like(st,
                              dtype=np.bool), np.ones_like(st, dtype=np.bool)))
        if symmetric_training:
            sym_obss = []
            sym_acc = []
            for i in range(timesteps_per_batch):
                obs = OrderedDict(zip(obs_names, ob[i]))
                sym_obs = obs.copy()
                swap_legs(sym_obs)

                sym_ac = ac[i].copy()
                sym_ac = np.concatenate((sym_ac[9:], sym_ac[:9]))
                sym_obss.append(np.asarray(list(sym_obs.values())))
                sym_acc.append(sym_ac)
            sym_obss = np.asarray(sym_obss)
            sym_acc = np.asarray(sym_acc)

            ob = np.concatenate((ob, sym_obss))
            ac = np.concatenate((ac, sym_acc))
            adv_targ = np.concatenate((adv_targ, adv_targ))
            tdlamret = np.concatenate((tdlamret, tdlamret))
            vpredbefore = np.concatenate((vpredbefore, vpredbefore))
            st = np.concatenate((st, st))
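            # The mirrored copies double the batch: each observation has its legs swapped via
            # swap_legs and each action has its first nine entries exchanged with the rest,
            # while advantages, value targets, and step indices are duplicated unchanged, so
            # the policy trains on every sample together with its left/right reflection.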

        # Compute stats before updating
        if bootstrap_seeds:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"],
                       seg["easy_seeds"], seg["hard_seeds"])  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews, easy_seeds, hard_seeds = map(
                flatten_lists, zip(*listoflrpairs))
            easy_seeds = [x for x in easy_seeds if x != 0]
            hard_seeds = [x for x in hard_seeds if x != 0]
            print('seeds set sizes:', len(seeds), len(easy_seeds),
                  len(hard_seeds))
            seeds = list((set(seeds) - set(easy_seeds)) | set(hard_seeds))
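            # Seed curriculum (a reading inferred from the names): seeds reported as easy this
            # iteration are removed from the pool and seeds reported as hard are re-added, so
            # later bootstrapped rollouts concentrate on the episodes the policy still fails.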
        else:
            lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_orig_rets"]
                       )  # local values
            listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
            lens, rews, orig_rews = map(flatten_lists, zip(*listoflrpairs))

        len_buffer.extend(lens)
        rew_buffer.extend(rews)
        origrew_buffer.extend(orig_rews)
        logger.record_tabular("Iter", iters_so_far)
        logger.record_tabular("EpLenMean", np.mean(len_buffer))
        logger.record_tabular("EpRewMean", np.mean(rew_buffer))
        logger.record_tabular("EpOrigRewMean", np.mean(origrew_buffer))
        logger.record_tabular("EpOrigRewStd", np.std(origrew_buffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        n_completed = 0
        sum_completed = 0
        for ep_len, orig_rew in zip(lens, orig_rews):
            if ep_len == 1000:
                n_completed += 1
                sum_completed += orig_rew
        avg_completed = sum_completed / n_completed if n_completed > 0 else 0
        logger.record_tabular("AvgCompleted", avg_completed)
        perc_completed = 100 * n_completed / len(lens) if len(lens) > 0 else 0
        logger.record_tabular("PercCompleted", perc_completed)

        if callback: callback(locals(), globals())

        adv_targ = (adv_targ - adv_targ.mean()) / adv_targ.std(
        )  # standardized advantage function estimate
        d = Dataset(dict(ob=ob,
                         st=st,
                         ac=ac,
                         atarg=adv_targ,
                         vtarg=tdlamret,
                         mask=mask),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        if not test_only:
            logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data. I log results only for the first worker (rank=0)
        for _ in range(optim_epochs):
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *batch_losses, grads = comp_loss_and_grad(
                    batch["ob"], batch["st"], batch["ac"], batch["atarg"],
                    batch["vtarg"], cur_lrmult, batch["mask"])
                if not test_only:
                    adam.update(grads, optim_stepsize * cur_lrmult)
                losses.append(batch_losses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        losses = []
        for batch in d.iterate_once(optim_batchsize):
            batch_losses = comp_loss(batch["ob"], batch["st"], batch["ac"],
                                     batch["atarg"], batch["vtarg"],
                                     cur_lrmult, batch["mask"])
            losses.append(batch_losses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

        iters_so_far += 1