Example No. 1
 def __init__(self,
              *,
              ac_space,
              policy_network,
              value_network=None,
              ent_coef,
              vf_coef,
              max_grad_norm):
     super(Model, self).__init__(name='PPO2Model')
     self.train_model = PolicyWithValue(ac_space,
                                        policy_network,
                                        value_network,
                                        estimate_q=False)
     if MPI is not None:
         self.optimizer = MpiAdamOptimizer(
             MPI.COMM_WORLD, self.train_model.trainable_variables)
     else:
         self.optimizer = tf.keras.optimizers.Adam()
     self.ent_coef = ent_coef
     self.vf_coef = vf_coef
     self.max_grad_norm = max_grad_norm
     self.step = self.train_model.step
     self.mode = self.train_model.mode
     self.value = self.train_model.value
     self.initial_state = self.train_model.initial_state
     self.loss_names = [
         'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
         'clipfrac'
     ]
     if MPI is not None:
         sync_from_root(self.variables)
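
The loss names above correspond to PPO's clipped surrogate objective. A minimal NumPy sketch of that computation, using made-up batch values purely for illustration:

import numpy as np

# Illustrative advantages and negative log-probs (not real rollout data)
adv = np.array([0.5, -1.2, 0.3, 2.0])
old_neglogp = np.array([1.1, 0.9, 1.4, 0.7])
neglogp = np.array([1.0, 1.0, 1.3, 0.5])
cliprange = 0.2

ratio = np.exp(old_neglogp - neglogp)                 # pi_new / pi_old
pg_losses1 = -adv * ratio                             # unclipped surrogate
pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
policy_loss = np.mean(np.maximum(pg_losses1, pg_losses2))
approxkl = 0.5 * np.mean(np.square(neglogp - old_neglogp))
clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
print(policy_loss, approxkl, clipfrac)
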
Example No. 2
def switch_training_model(update, is_mpi_root, model_train, _run, iter_loss, session, comm,
                          save=True):
    if is_mpi_root and save:
        save_model(model_train, "model", update, _run)
    # Copy burnin -> train, overwriting the "train" parameters
    vars_train = tf.get_collection(tf.GraphKeys.VARIABLES, scope="ppo_iter_train")
    vars_burnin = tf.get_collection(tf.GraphKeys.VARIABLES, scope="ppo_iter_burnin")
    if not iter_loss["dont_switch_just_reset_burnin"]:
        # Copy variables over from burnin to train
        print("Switching variables")
        for train_var in vars_train:
            # Get var name: Remove the first part of the name:
            var_name = "/".join(train_var.name.split("/")[1:])
            # Construct burnin var name by prepending the name with "ppo_iter_burnin"
            burnin_var_name = "/".join(["ppo_iter_burnin", var_name])
            # Find the burnin var
            burnin_var = [v for v in tf.global_variables() if v.name == burnin_var_name][0]
            # Assign the burnin value to the train variable
            session.run(tf.assign(train_var, burnin_var))
    else:
        print("NOT switching variables")

    print("Re-initialize burnin variables")
    # Reinitialize variables in "burnin". Should make them random again.
    re_init_train_op = tf.initialize_variables(vars_burnin)
    session.run(re_init_train_op)

    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
    if MPI is not None:
        sync_from_root(session, global_variables, comm=comm) #pylint: disable=E1101
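
The name-remapping trick above (drop the leading scope from the variable name, then prepend the other scope) can be reproduced in isolation. A self-contained sketch assuming TF1-style graph mode via tf.compat.v1; the variable name "w" and its shape are illustrative:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.variable_scope("ppo_iter_train"):
    tf.get_variable("w", shape=[4, 2])
with tf.variable_scope("ppo_iter_burnin"):
    tf.get_variable("w", shape=[4, 2])

assign_ops = []
for train_var in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="ppo_iter_train"):
    suffix = "/".join(train_var.name.split("/")[1:])      # e.g. "w:0"
    burnin_name = "/".join(["ppo_iter_burnin", suffix])
    burnin_var = [v for v in tf.global_variables() if v.name == burnin_name][0]
    assign_ops.append(tf.assign(train_var, burnin_var))   # copy burnin -> train

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_ops)
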
Example No. 3
	def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
	             nsteps, ent_coef, vf_coef, max_grad_norm, agent_index, microbatch_size=None):
		self.sess = sess = get_session()

		with tf.variable_scope('ppo2_model_%i_act_and_train' % agent_index, reuse=tf.AUTO_REUSE):
			# CREATE OUR TWO MODELS
			# act_model that is used for sampling
			act_model = policy(nbatch_act, 1, sess)

			# Train model for training
			if microbatch_size is None:
				train_model = policy(nbatch_train, nsteps, sess)
			else:
				train_model = policy(microbatch_size, nsteps, sess)

		with tf.variable_scope('ppo2_model_%i_e' % agent_index, reuse=tf.AUTO_REUSE):
			e_act_model = policy(nbatch_act, 1, sess)

			# Model for the 'e'xtrinsic reward head
			if microbatch_size is None:
				e_train_model = policy(nbatch_train, nsteps, sess)
			else:
				e_train_model = policy(microbatch_size, nsteps, sess)

		with tf.variable_scope('ppo2_model_%i_c' % agent_index, reuse=tf.AUTO_REUSE):
			c_act_model = policy(nbatch_act, 1, sess)

			# Model for the 'c'uriosity reward head
			if microbatch_size is None:
				c_train_model = policy(nbatch_train, nsteps, sess)
			else:
				c_train_model = policy(microbatch_size, nsteps, sess)

		# CREATE THE PLACEHOLDERS
		self.A = A = train_model.pdtype.sample_placeholder([None])
		self.ADV = ADV = tf.placeholder(tf.float32, [None])
		self.R = R = tf.placeholder(tf.float32, [None])
		# Keep track of old actor
		self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
		# Keep track of old critic
		self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
		self.LR = LR = tf.placeholder(tf.float32, [])
		# Cliprange
		self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

		neglogpac = train_model.pd.neglogp(A)

		# Calculate the entropy
		# Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
		entropy = tf.reduce_mean(train_model.pd.entropy())

		# CALCULATE THE LOSS
		# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

		# Clip the value to reduce variability during Critic training
		# Get the predicted value
		vpred = train_model.vf
		vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
		# Unclipped value
		vf_losses1 = tf.square(vpred - R)
		# Clipped value
		vf_losses2 = tf.square(vpredclipped - R)

		vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

		# Calculate ratio (pi current policy / pi old policy)
		ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

		# Defining Loss = - J is equivalent to max J
		pg_losses = -ADV * ratio

		pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

		# Final PG loss
		pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
		approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
		clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

		# Total loss
		loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

		# UPDATE THE PARAMETERS USING LOSS
		# 1. Get the model parameters
		params = tf.trainable_variables('ppo2_model_%i_act_and_train' % agent_index)
		# 2. Build our trainer
		if MPI is not None:
			self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
		else:
			self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
		# 3. Calculate the gradients
		grads_and_var = self.trainer.compute_gradients(loss, params)
		grads, var = zip(*grads_and_var)

		if max_grad_norm is not None:
			# Clip the gradients (normalize)
			grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
		grads_and_var = list(zip(grads, var))
		# zip pairs each gradient with its associated parameter
		# For instance zip(ABCD, xyza) => Ax, By, Cz, Da

		self.grads = grads
		self.var = var
		self._train_op = self.trainer.apply_gradients(grads_and_var)
		self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
		self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

		self.train_model = train_model
		self.act_model = act_model
		self.step = act_model.step
		self.value = act_model.value
		self.initial_state = act_model.initial_state

		# self.save = functools.partial(save_variables, sess=sess)
		# self.load = functools.partial(load_variables, sess=sess)

		# END OF TRAIN MODEL

		# BEGIN OF E_MODEL

		self.e_A = e_A = e_train_model.pdtype.sample_placeholder([None])
		self.e_ADV = e_ADV = tf.placeholder(tf.float32, [None])
		self.e_R = e_R = tf.placeholder(tf.float32, [None])
		# Keep track of old actor
		self.e_OLDNEGLOGPAC = e_OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
		# Keep track of old critic
		self.e_OLDVPRED = e_OLDVPRED = tf.placeholder(tf.float32, [None])
		self.e_LR = e_LR = tf.placeholder(tf.float32, [])
		# Cliprange
		self.e_CLIPRANGE = e_CLIPRANGE = tf.placeholder(tf.float32, [])

		e_neglogpac = e_train_model.pd.neglogp(e_A)

		# Calculate the entropy
		# Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
		e_entropy = tf.reduce_mean(e_train_model.pd.entropy())

		# CALCULATE THE LOSS
		# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

		# Clip the value to reduce variability during Critic training
		# Get the predicted value
		e_vpred = e_train_model.vf
		e_vpredclipped = e_OLDVPRED + tf.clip_by_value(e_train_model.vf - e_OLDVPRED, - e_CLIPRANGE, e_CLIPRANGE)
		# Unclipped value
		e_vf_losses1 = tf.square(e_vpred - e_R)
		# Clipped value
		e_vf_losses2 = tf.square(e_vpredclipped - e_R)

		e_vf_loss = .5 * tf.reduce_mean(tf.maximum(e_vf_losses1, e_vf_losses2))

		# Calculate ratio (pi current policy / pi old policy)
		e_ratio = tf.exp(e_OLDNEGLOGPAC - e_neglogpac)

		# Defining Loss = - J is equivalent to max J
		e_pg_losses = -e_ADV * e_ratio

		e_pg_losses2 = -e_ADV * tf.clip_by_value(e_ratio, 1.0 - e_CLIPRANGE, 1.0 + e_CLIPRANGE)

		# Final PG loss
		e_pg_loss = tf.reduce_mean(tf.maximum(e_pg_losses, e_pg_losses2))
		e_approxkl = .5 * tf.reduce_mean(tf.square(e_neglogpac - e_OLDNEGLOGPAC))
		e_clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(e_ratio - 1.0), e_CLIPRANGE)))

		# Total loss for this head (value-function term only)
		e_loss = e_vf_loss * vf_coef

		# UPDATE THE PARAMETERS USING LOSS
		# 1. Get the model parameters
		e_params = tf.trainable_variables('ppo2_model_%i_e' % agent_index)
		# 2. Build our trainer
		if MPI is not None:
			self.e_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=e_LR, epsilon=1e-5)
		else:
			self.e_trainer = tf.train.AdamOptimizer(learning_rate=e_LR, epsilon=1e-5)
		# 3. Calculate the gradients
		e_grads_and_var = self.e_trainer.compute_gradients(e_loss, e_params)
		e_grads, e_var = zip(*e_grads_and_var)

		if max_grad_norm is not None:
			# Clip the gradients (normalize)
			e_grads, _e_grad_norm = tf.clip_by_global_norm(e_grads, max_grad_norm)
		e_grads_and_var = list(zip(e_grads, e_var))
		# zip pairs each gradient with its associated parameter
		# For instance zip(ABCD, xyza) => Ax, By, Cz, Da

		self.e_grads = e_grads
		self.e_var = e_var
		self._e_train_op = self.e_trainer.apply_gradients(e_grads_and_var)
		self.e_loss_names = ['e_policy_loss', 'e_value_loss', 'e_policy_entropy', 'e_approxkl', 'e_clipfrac']
		self.e_stats_list = [e_pg_loss, e_vf_loss, e_entropy, e_approxkl, e_clipfrac]

		self.e_train_model = e_train_model
		self.e_act_model = e_act_model
		self.e_value = e_act_model.value
		self.e_initial_state = e_act_model.initial_state
		self.e_step = e_act_model.step

		# END OF E_MODEL

		# BEGIN OF C_MODEL

		self.c_A = c_A = c_train_model.pdtype.sample_placeholder([None])
		self.c_ADV = c_ADV = tf.placeholder(tf.float32, [None])
		self.c_R = c_R = tf.placeholder(tf.float32, [None])
		# Keep track of old actor
		self.c_OLDNEGLOGPAC = c_OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
		# Keep track of old critic
		self.c_OLDVPRED = c_OLDVPRED = tf.placeholder(tf.float32, [None])
		self.c_LR = c_LR = tf.placeholder(tf.float32, [])
		# Cliprange
		self.c_CLIPRANGE = c_CLIPRANGE = tf.placeholder(tf.float32, [])

		c_neglogpac = c_train_model.pd.neglogp(c_A)

		# Calculate the entropy
		# Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
		c_entropy = tf.reduce_mean(c_train_model.pd.entropy())

		# CALCULATE THE LOSS
		# Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

		# Clip the value to reduce variability during Critic training
		# Get the predicted value
		c_vpred = c_train_model.vf
		c_vpredclipped = c_OLDVPRED + tf.clip_by_value(c_train_model.vf - c_OLDVPRED, - c_CLIPRANGE, c_CLIPRANGE)
		# Unclipped value
		c_vf_losses1 = tf.square(c_vpred - c_R)
		# Clipped value
		c_vf_losses2 = tf.square(c_vpredclipped - c_R)

		c_vf_loss = .5 * tf.reduce_mean(tf.maximum(c_vf_losses1, c_vf_losses2))

		# Calculate ratio (pi current policy / pi old policy)
		c_ratio = tf.exp(c_OLDNEGLOGPAC - c_neglogpac)

		# Defining Loss = - J is equivalent to max J
		c_pg_losses = -c_ADV * c_ratio

		c_pg_losses2 = -c_ADV * tf.clip_by_value(c_ratio, 1.0 - c_CLIPRANGE, 1.0 + c_CLIPRANGE)

		# Final PG loss
		c_pg_loss = tf.reduce_mean(tf.maximum(c_pg_losses, c_pg_losses2))
		c_approxkl = .5 * tf.reduce_mean(tf.square(c_neglogpac - c_OLDNEGLOGPAC))
		c_clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(c_ratio - 1.0), c_CLIPRANGE)))

		# Total loss for this head (value-function term only)
		c_loss = c_vf_loss * vf_coef

		# UPDATE THE PARAMETERS USING LOSS
		# 1. Get the model parameters
		c_params = tf.trainable_variables('ppo2_model_%i_c' % agent_index)
		# 2. Build our trainer
		if MPI is not None:
			self.c_trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=c_LR, epsilon=1e-5)
		else:
			self.c_trainer = tf.train.AdamOptimizer(learning_rate=c_LR, epsilon=1e-5)
		# 3. Calculate the gradients
		c_grads_and_var = self.c_trainer.compute_gradients(c_loss, c_params)
		c_grads, c_var = zip(*c_grads_and_var)

		if max_grad_norm is not None:
			# Clip the gradients (normalize)
			c_grads, _c_grad_norm = tf.clip_by_global_norm(c_grads, max_grad_norm)
		c_grads_and_var = list(zip(c_grads, c_var))
		# zip pairs each gradient with its associated parameter
		# For instance zip(ABCD, xyza) => Ax, By, Cz, Da

		self.c_grads = c_grads
		self.c_var = c_var
		self._c_train_op = self.c_trainer.apply_gradients(c_grads_and_var)
		self.c_loss_names = ['c_policy_loss', 'c_value_loss', 'c_policy_entropy', 'c_approxkl', 'c_clipfrac']
		self.c_stats_list = [c_pg_loss, c_vf_loss, c_entropy, c_approxkl, c_clipfrac]

		self.c_train_model = c_train_model
		self.c_act_model = c_act_model
		self.c_value = c_act_model.value
		self.c_initial_state = c_act_model.initial_state
		self.c_step = c_act_model.step

		self.save = functools.partial(save_variables, sess=sess)
		self.load = functools.partial(load_variables, sess=sess)

		# END OF C_MODEL

		initialize()
		global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
		if MPI is not None:
			sync_from_root(sess, global_variables)  # pylint: disable=E1101
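
All three heads above use the same clipped value-function loss. A NumPy sketch of that formula, with illustrative numbers:

import numpy as np

returns = np.array([1.0, 0.2, -0.5])
old_vpred = np.array([0.8, 0.0, -0.2])     # value predictions from the rollout
vpred = np.array([1.5, 0.1, -0.9])         # current value predictions
cliprange = 0.2

vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
vf_losses1 = np.square(vpred - returns)           # unclipped
vf_losses2 = np.square(vpred_clipped - returns)   # clipped
vf_loss = 0.5 * np.mean(np.maximum(vf_losses1, vf_losses2))
print(vf_loss)
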
Example No. 4
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        self.max_grad_norm = max_grad_norm
        self.head_idx_current_batch = 0
        self.critic_idx_current_batch = 0
        sess = tf.compat.v1.get_default_session()

        self.running_stats_s = RunningStats()
        self.running_stats_s_ = RunningStats()
        self.running_stats_r = RunningStats()
        self.running_stats_r_i = RunningStats()

        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps,
                             max_grad_norm)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1,
                           max_grad_norm)
        self.train_model = train_model
        # in case we don't use rep loss
        rep_loss = None
        # HEAD_IDX = tf.compat.v1.placeholder(tf.int32, [None])
        A = train_model.pdtype.sample_placeholder([None], name='A')
        A_i = train_model.A_i
        LATENT_FACTORS = train_model.pdtype.sample_placeholder(
            [
                Config.REP_LOSS_M, Config.POLICY_NHEADS, None,
                count_latent_factors(Config.ENVIRONMENT)
            ],
            name='LATENT_FACTORS')
        ADV = tf.compat.v1.placeholder(tf.float32, [None], name='ADV')
        R = tf.compat.v1.placeholder(tf.float32, [None], name='R')
        R_NCE = tf.compat.v1.placeholder(tf.float32,
                                         [Config.REP_LOSS_M, 1, None],
                                         name='R_NCE')
        OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None],
                                                name='OLDNEGLOGPAC')
        OLDNEGLOGPAC_i = tf.compat.v1.placeholder(tf.float32, [None],
                                                  name='OLDNEGLOGPAC_i')
        LR = tf.compat.v1.placeholder(tf.float32, [], name='LR')
        CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [], name='CLIPRANGE')
        # TD loss for critic
        # VF loss
        OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None],
                                            name='OLDVPRED')
        vpred = train_model.vf_train  # Same as vf_run for SNI and default, but noisy for SNI2 while the bootstrap is not
        if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
            vpred = vpred[self.critic_idx_current_batch]
        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(
            input_tensor=tf.maximum(vf_losses1, vf_losses2))

        neglogpac_train = train_model.pd_train[0].neglogp(A)
        ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train)
        pg_losses_train = -ADV * ratio_train
        pg_losses2_train = -ADV * tf.clip_by_value(
            ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(
            input_tensor=tf.maximum(pg_losses_train, pg_losses2_train))
        approxkl_train = .5 * tf.reduce_mean(
            input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC))
        clipfrac_train = tf.reduce_mean(input_tensor=tf.cast(
            tf.greater(tf.abs(ratio_train -
                              1.0), CLIPRANGE), dtype=tf.float32))

        if Config.BETA >= 0:
            entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0].
                                     _components_distribution.entropy())
        else:
            entropy = tf.reduce_mean(
                input_tensor=train_model.pd_train[0].entropy())

        # Add entropy and policy loss for the samples as well
        if Config.SNI or Config.SNI2:
            neglogpac_run = train_model.pd_run.neglogp(A)
            ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run)
            pg_losses_run = -ADV * ratio_run
            pg_losses2_run = -ADV * tf.clip_by_value(
                ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

            pg_loss += tf.reduce_mean(
                input_tensor=tf.maximum(pg_losses_run, pg_losses2_run))
            pg_loss /= 2.

            entropy += tf.reduce_mean(
                input_tensor=train_model.pd_run.entropy())
            entropy /= 2.

            approxkl_run = .5 * tf.reduce_mean(
                input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC))
            clipfrac_run = tf.reduce_mean(
                input_tensor=tf.cast(tf.greater(tf.abs(ratio_run -
                                                       1.0), CLIPRANGE),
                                     dtype=tf.float32))
        else:
            approxkl_run = tf.constant(0.)
            clipfrac_run = tf.constant(0.)

        params = tf.compat.v1.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            mpi_print('param', p, num_params)
            total_num_params += num_params

        mpi_print('total num params:', total_num_params)

        l2_loss = tf.reduce_sum(
            input_tensor=[tf.nn.l2_loss(v) for v in weight_params])

        # The first occurrence should be in the train_model

        if Config.BETA >= 0:
            info_loss = tf.compat.v1.get_collection(key="INFO_LOSS",
                                                    scope="model/info_loss")
            beta = Config.BETA

        elif Config.BETA_L2A >= 0:
            info_loss = tf.compat.v1.get_collection(key="INFO_LOSS_L2A",
                                                    scope="model/info_loss")
            beta = Config.BETA_L2A
        else:
            info_loss = [tf.constant(0.)]
            beta = 0

        # print(info_loss)
        assert len(info_loss) == 1
        info_loss = info_loss[0]

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss + tf.reduce_mean(
            train_model.curl_loss)
        aux_loss = tf.reduce_mean(train_model.curl_loss)

        if Config.SYNC_FROM_ROOT:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=LR,
                                       epsilon=1e-5)
            trainer_aux = MpiAdamOptimizer(MPI.COMM_WORLD,
                                           learning_rate=3e-3,
                                           epsilon=1e-5)
        else:
            trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR,
                                                       epsilon=1e-5)

        self.opt = trainer
        grads_and_var = trainer.compute_gradients(loss, params)

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        tot_norm = tf.zeros((1, ))
        for g, v in grads_and_var:
            tot_norm += tf.norm(g)
        tot_norm = tf.reshape(tot_norm, [])

        _train = trainer.apply_gradients(grads_and_var)

        grads_and_var_aux = trainer_aux.compute_gradients(aux_loss, params)
        grads_aux, var_aux = zip(*grads_and_var_aux)
        if max_grad_norm is not None:
            grads_aux, _grad_norm_aux = tf.clip_by_global_norm(
                grads_aux, max_grad_norm)
        grads_and_var_aux = list(zip(grads_aux, var_aux))
        _train_aux = trainer_aux.apply_gradients(grads_and_var_aux)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  infos,
                  values,
                  neglogpacs,
                  values_i,
                  returns_i,
                  states_nce,
                  anchors_nce,
                  labels_nce,
                  actions_nce,
                  neglogps_nce,
                  rewards_nce,
                  infos_nce,
                  target,
                  states=None):
            values = (values[:, self.critic_idx_current_batch]
                      if Config.CUSTOM_REP_LOSS else values)
            advs = returns - values
            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)

            if Config.CUSTOM_REP_LOSS:

                td_map = {
                    train_model.X: obs,
                    A: actions,
                    ADV: advs,
                    R: returns,
                    LR: lr,
                    CLIPRANGE: cliprange,
                    OLDNEGLOGPAC: neglogpacs,
                    OLDVPRED: values,
                    train_model.STATE_NCE:
                    states_nce.transpose(1, 2, 0, 3, 4, 5),
                    train_model.ANCH_NCE: anchors_nce,
                    train_model.LAB_NCE: labels_nce.transpose(1, 0),
                    R_NCE: rewards_nce.transpose(1, 2, 0),
                    train_model.STATE: anchors_nce,
                    train_model.A_i: actions_nce,
                    OLDNEGLOGPAC_i: neglogps_nce[:, 0,
                                                 self.head_idx_current_batch],
                    ADV_i: advs_i,
                    OLDVPRED_i: values_i,
                    R_i: returns_i
                }
            else:
                td_map = {
                    train_model.X: obs,
                    A: actions,
                    ADV: advs,
                    R: returns,
                    LR: lr,
                    CLIPRANGE: cliprange,
                    OLDNEGLOGPAC: neglogpacs,
                    OLDVPRED: values
                }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            # import ipdb;ipdb.set_trace()
            if target == 'CURL':
                return sess.run([aux_loss, _train_aux], td_map)[:-1]
            else:
                return sess.run([
                    pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train,
                    approxkl_run, clipfrac_run, l2_loss, info_loss, _train
                ], td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train',
            'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss',
            'info_loss_cv', 'rep_loss', 'value_i_loss', 'policy_loss_i',
            'gradient_norm'
        ]

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        self.rep_vec = act_model.rep_vec
        self.custom_train = train_model.custom_train

        if Config.SYNC_FROM_ROOT:
            if MPI.COMM_WORLD.Get_rank() == 0:
                initialize()

            global_variables = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
            sess.run(tf.compat.v1.global_variables_initializer())
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
        else:
            initialize()
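
The train() closure above normalizes advantages per batch with np.mean/np.std along axis 0. A standalone sketch of that step, with illustrative returns and values:

import numpy as np

returns = np.array([1.0, 2.0, 0.5, 1.5])
values = np.array([0.8, 1.5, 0.9, 1.2])

advs = returns - values
adv_mean = np.mean(advs, axis=0, keepdims=True)
adv_std = np.std(advs, axis=0, keepdims=True)
advs = (advs - adv_mean) / (adv_std + 1e-8)   # roughly zero mean, unit std
print(advs)
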
Example No. 5
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            act_model = policy(nbatch_act, 1, sess)
            train_model = policy(nbatch_train, nsteps, sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        params = tf.trainable_variables('ppo2_model')
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                   learning_rate=LR,
                                   epsilon=1e-5)
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
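
Like most of these examples, the model above clips gradients with tf.clip_by_global_norm. A NumPy sketch of what that operation does (all gradients are rescaled jointly when their combined norm exceeds max_grad_norm); the tensors are illustrative:

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]   # global norm = 13
max_grad_norm = 0.5

global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
scale = min(1.0, max_grad_norm / global_norm)
clipped = [g * scale for g in grads]
print(global_norm, clipped)
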
Example No. 6
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()

        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps)
        norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        params = tf.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            mpi_print('param', p, num_params)
            total_num_params += num_params

        mpi_print('total num params:', total_num_params)

        l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT

        if Config.SYNC_FROM_ROOT:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=LR,
                                       epsilon=1e-5)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        grads_and_var = trainer.compute_gradients(loss, params)

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values

            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run([
                pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, _train
            ], td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'l2_loss'
        ]

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load

        if Config.SYNC_FROM_ROOT:
            if MPI.COMM_WORLD.Get_rank() == 0:
                initialize()

            global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope="")
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
        else:
            initialize()
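
The L2 term above sums tf.nn.l2_loss (i.e. sum(v**2) / 2) over every parameter whose name does not contain '/b', a heuristic for skipping biases. A NumPy sketch of the same filter and sum, with made-up parameter names and values:

import numpy as np

params = {
    "model/dense/w:0": np.array([[0.5, -1.0], [2.0, 0.0]]),
    "model/dense/b:0": np.array([0.1, -0.2]),
}
weight_params = {k: v for k, v in params.items() if "/b" not in k}
l2_loss = sum(0.5 * np.sum(np.square(v)) for v in weight_params.values())
print(l2_loss)   # 0.5 * (0.25 + 1.0 + 4.0 + 0.0) = 2.625
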
Example No. 7
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()

        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps)
        norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        # VF loss
        vpred = train_model.vf_train  # Same as vf_run for SNI and default, but noisy for SNI2 while the bootstrap is not
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf_train - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        neglogpac_train = train_model.pd_train.neglogp(A)
        ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train)
        pg_losses_train = -ADV * ratio_train
        pg_losses2_train = -ADV * tf.clip_by_value(ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses_train, pg_losses2_train))
        approxkl_train = .5 * tf.reduce_mean(tf.square(neglogpac_train - OLDNEGLOGPAC))
        clipfrac_train = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio_train - 1.0), CLIPRANGE)))

        if Config.BETA >= 0:
            entropy = tf.reduce_mean(train_model.pd_train._components_distribution.entropy())
        else:
            entropy = tf.reduce_mean(train_model.pd_train.entropy())

        # Add entropy and policy loss for the samples as well
        if Config.SNI or Config.SNI2:
            neglogpac_run = train_model.pd_run.neglogp(A)
            ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run)
            pg_losses_run = -ADV * ratio_run
            pg_losses2_run = -ADV * tf.clip_by_value(ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

            pg_loss += tf.reduce_mean(tf.maximum(pg_losses_run, pg_losses2_run))
            pg_loss /= 2.

            entropy += tf.reduce_mean(train_model.pd_run.entropy())
            entropy /= 2.

            approxkl_run = .5 * tf.reduce_mean(tf.square(neglogpac_run - OLDNEGLOGPAC))
            clipfrac_run = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio_run - 1.0), CLIPRANGE)))
        else:
            approxkl_run = tf.constant(0.)
            clipfrac_run = tf.constant(0.)

        params = tf.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            mpi_print('param', p, num_params)
            total_num_params += num_params

        mpi_print('total num params:', total_num_params)

        l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])

        # The first occurrence should be in the train_model

        if Config.BETA >= 0:
            info_loss = tf.get_collection(
                key="INFO_LOSS",
                scope="model/info_loss"
            )
            beta = Config.BETA

        elif Config.BETA_L2A >= 0:
            info_loss = tf.get_collection(
                key="INFO_LOSS_L2A",
                scope="model/info_loss"
            )
            beta = Config.BETA_L2A
        else:
            info_loss = [tf.constant(0.)]
            beta = 0

        print(info_loss)
        assert len(info_loss) == 1
        info_loss = info_loss[0]

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss

        if Config.SYNC_FROM_ROOT:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        grads_and_var = trainer.compute_gradients(loss, params)

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values

            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train, approxkl_run, clipfrac_run, l2_loss, info_loss, _train],
                td_map
            )[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train', 'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss', 'info_loss_cv']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load

        if Config.SYNC_FROM_ROOT:
            if MPI.COMM_WORLD.Get_rank() == 0:
                initialize()
            
            global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
            sync_from_root(sess, global_variables) #pylint: disable=E1101
        else:
            initialize()
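
The save/load helpers above dump the evaluated parameter list with joblib and restore it through per-variable assign ops. A minimal sketch of the serialization round-trip, using plain NumPy arrays in place of sess.run(params) and a hypothetical /tmp path:

import joblib
import numpy as np

params = [np.random.randn(4, 2), np.random.randn(2)]   # stand-ins for sess.run(params)
joblib.dump(params, "/tmp/ppo_params.pkl")

loaded = joblib.load("/tmp/ppo_params.pkl")
assert all(np.array_equal(p, lp) for p, lp in zip(params, loaded))
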
Example No. 8
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 fm_coeff=0.002):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)
            act_model_clean = policy(nbatch_act, 1, sess, randomization=False)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
                train_model_clean = policy(nbatch_train,
                                           nsteps,
                                           sess,
                                           randomization=False)
            else:
                train_model = policy(microbatch_size, nsteps, sess)
                train_model_clean = policy(microbatch_size,
                                           nsteps,
                                           sess,
                                           randomization=False)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])
        # Normalizing advantage
        ADV = (ADV - tf.reduce_mean(ADV)) / (reduce_std(ADV) + 1e-8)

        ############ Training with Randomized Obs ############
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Calculate the entropy
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Calculate value loss
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate policy gradient loss
        neglogpac = train_model.pd.neglogp(A)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))

        # Record some information
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        ############################################

        ############ Training with Clean Obs ############
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Calculate the entropy
        entropy_clean = tf.reduce_mean(train_model_clean.pd.entropy())

        # Calculate value loss
        vpred_clean = train_model_clean.vf
        vpredclipped_clean = OLDVPRED + tf.clip_by_value(
            train_model_clean.vf - OLDVPRED, -CLIPRANGE, CLIPRANGE)
        vf_losses1_clean = tf.square(vpred_clean - R)
        vf_losses2_clean = tf.square(vpredclipped_clean - R)
        vf_loss_clean = .5 * tf.reduce_mean(
            tf.maximum(vf_losses1_clean, vf_losses2_clean))

        # Calculate policy gradient loss
        neglogpac_clean = train_model_clean.pd.neglogp(A)
        ratio_clean = tf.exp(OLDNEGLOGPAC - neglogpac_clean)
        pg_losses_clean = -ADV * ratio_clean
        pg_losses2_clean = -ADV * tf.clip_by_value(
            ratio_clean, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss_clean = tf.reduce_mean(
            tf.maximum(pg_losses_clean, pg_losses2_clean))

        # Record some information
        approxkl_clean = .5 * tf.reduce_mean(
            tf.square(neglogpac_clean - self.OLDNEGLOGPAC))
        clipfrac_clean = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio_clean - 1.0), self.CLIPRANGE)))
        ############################################

        ############ Calculate the total loss ############
        fm_loss = tf.losses.mean_squared_error(
            labels=tf.stop_gradient(train_model_clean.latent_fts),
            predictions=train_model.latent_fts)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + fm_loss * fm_coeff
        loss_clean = pg_loss_clean - entropy_clean * ent_coef + vf_loss_clean * vf_coef + fm_loss * fm_coeff
        self.stats_list = [
            loss, fm_loss, pg_loss, vf_loss, entropy, approxkl, clipfrac
        ]
        self.stats_list_clean = [
            loss_clean, fm_loss, pg_loss_clean, vf_loss_clean, entropy_clean,
            approxkl_clean, clipfrac_clean
        ]
        ##################################################

        ############ UPDATE THE PARAMETERS ############
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = mpi_adam.MpiAdamOptimizer(
                comm,
                learning_rate=LR,
                mpi_rank_weight=mpi_rank_weight,
                epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads_and_var_clean = self.trainer.compute_gradients(
            loss_clean, params)
        grads, var = zip(*grads_and_var)
        grads_clean, var_clean = zip(*grads_and_var_clean)
        # 4. Clip the gradient if required
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads_clean, _grad_norm = tf.clip_by_global_norm(
                grads_clean, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        grads_and_var_clean = list(zip(grads_clean, var_clean))
        ###############################################

        self.loss_names = [
            'total_loss', 'fm_loss', 'policy_loss', 'value_loss',
            'policy_entropy', 'approxkl', 'clipfrac'
        ]
        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self._train_clean_op = self.trainer.apply_gradients(
            grads_and_var_clean)
        self.fm_coeff = fm_coeff
        self.clean_flag = False
        self._init_randcnn = tf.variables_initializer(act_model.randcnn_param)

        self.train_model = train_model
        self.train_model_clean = train_model_clean
        self.act_model = act_model
        self.act_model_clean = act_model_clean
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E1101
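
The fm_loss above is a mean-squared error between the randomized model's latent features and the clean model's features, with the clean side treated as a fixed target via stop_gradient. A NumPy sketch of that distance, with illustrative feature matrices:

import numpy as np

latent_clean = np.array([[1.0, 2.0], [0.0, -1.0]])   # fixed target (stop_gradient side)
latent_rand = np.array([[0.8, 2.5], [0.2, -1.0]])

fm_loss = np.mean(np.square(latent_rand - latent_clean))
print(fm_loss)   # 0.0825
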
Example No. 9
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm, microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]


        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables) #pylint: disable=E1101
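
The entropy bonus subtracted in the total loss above (entropy * ent_coef) favors less peaked action distributions. A NumPy sketch comparing the entropy of a near-uniform and a near-deterministic categorical policy, with made-up probabilities:

import numpy as np

def categorical_entropy(p):
    p = np.asarray(p, dtype=np.float64)
    return -np.sum(p * np.log(p + 1e-12))

print(categorical_entropy([0.25, 0.25, 0.25, 0.25]))   # ~1.386 (high entropy)
print(categorical_entropy([0.97, 0.01, 0.01, 0.01]))   # much lower entropy
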
Example No. 10
 def initialize(self):
     if MPI is not None:
         sync_from_root(self.actor.trainable_variables + self.critic.trainable_variables)
     self.target_actor.set_weights(self.actor.get_weights())
     self.target_critic.set_weights(self.critic.get_weights())
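
The set_weights(get_weights()) calls above hard-copy the online actor/critic into their target networks at initialization. A small tf.keras sketch of the same pattern; the toy architecture is illustrative only:

import tensorflow as tf

def make_net():
    return tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation="relu", input_shape=(4,)),
        tf.keras.layers.Dense(1),
    ])

actor, target_actor = make_net(), make_net()
target_actor.set_weights(actor.get_weights())   # hard copy, as in initialize() above
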
Example No. 11
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            act_model = policy(nbatch_act, 1, sess)
            train_model = policy(nbatch_train, nsteps, sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        params = tf.trainable_variables('ppo2_model')
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                   learning_rate=LR,
                                   epsilon=1e-5)
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        # def save(file_name):
        #     save_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/"
        #     ps = sess.run(params)
        #     joblib.dump(ps, save_path+file_name)
        #     print("\n------------\nModel with name '{}' saved successfully!\n------------\n".format(file_name))
        #
        # def load(path_to_file):
        #     load_path = "/media/rustam/88E4BD3EE4BD2EF6/thesis/modeling/python/training/ppo_model_backups/promising_ones/"
        #     file_name = LOAD_FILENAME
        #     if path_to_file is None:
        #         path_to_file = load_path + file_name
        #     loaded_params = joblib.load(path_to_file)
        #     restores = []
        #     for p, loaded_p in zip(params, loaded_params):
        #         restores.append(p.assign(loaded_p))
        #     sess.run(restores)
        #     print("Model with name '{}' was successfully loaded!".format(file_name))

        # Use baselines' generic checkpoint helpers instead of the joblib-based
        # save/load kept (commented out) above.
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
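The loss construction above is easy to sanity-check outside the graph. A small NumPy sketch of the same clipped-surrogate quantities, with illustrative function and argument names that are not part of the snippet:

import numpy as np

def ppo_losses(neglogpac, old_neglogpac, adv, vpred, old_vpred, returns, cliprange):
    ratio = np.exp(old_neglogpac - neglogpac)                  # pi_new / pi_old
    pg_loss = np.maximum(-adv * ratio,
                         -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)).mean()
    vclipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.maximum((vpred - returns) ** 2,
                               (vclipped - returns) ** 2).mean()
    approxkl = 0.5 * np.mean((neglogpac - old_neglogpac) ** 2)
    clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)
    return pg_loss, vf_loss, approxkl, clipfrac

rng = np.random.default_rng(0)
n = 4
print(ppo_losses(rng.normal(size=n), rng.normal(size=n), rng.normal(size=n),
                 rng.normal(size=n), rng.normal(size=n), rng.normal(size=n), 0.2))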
Ejemplo n.º 12
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 microbatch_size=None,
                 unsupType='action'):
        self.sess = sess = get_session()

        # icm parameters
        self.unsup = unsupType is not None
        predictor = None
        self.numaction = ac_space.n
        designHead = 'universe'

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        if self.unsup:
            with tf.variable_scope("predictor", reuse=tf.AUTO_REUSE):
                if 'state' in unsupType:
                    self.local_ap_network = predictor = StatePredictor(
                        ob_space, ac_space, designHead, unsupType)
                else:
                    self.local_ap_network = predictor = StateActionPredictor(
                        ob_space, ac_space, designHead)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # computing predictor loss
        predloss = None
        if self.unsup:
            if 'state' in unsupType:
                predloss = constants[
                    'PREDICTION_LR_SCALE'] * predictor.forwardloss
            else:
                predloss = constants['PREDICTION_LR_SCALE'] * (
                    predictor.invloss * (1 - constants['FORWARD_LOSS_WT']) +
                    predictor.forwardloss * constants['FORWARD_LOSS_WT'])

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)

        if self.unsup:
            predgrads_and_var = self.trainer.compute_gradients(
                predloss * 20.0, predictor.var_list)

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # clip predictor gradients
        if self.unsup:
            predgrads, _ = zip(*predgrads_and_var)
            predgrads, _ = tf.clip_by_global_norm(predgrads,
                                                  constants['GRAD_NORM_CLIP'])
            predgrads_and_var = list(zip(predgrads, predictor.var_list))

            # combine the policy and predictor grads and vars
            grads_and_var = grads_and_var + predgrads_and_var
            # unzip the grads and var after adding predictor grads/vars
            grads, var = zip(*grads_and_var)

            # normalize gradients for logging
            predgrad_global_norm = tf.global_norm(predgrads)

        # normalize gradients for logging
        grad_global_norm = tf.global_norm(grads)

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'grad_global_norm'
        ]
        self.stats_list = [
            pg_loss, vf_loss, entropy, approxkl, clipfrac, grad_global_norm
        ]

        if self.unsup:
            self.loss_names += [
                'predloss', 'pred_forwardloss', 'pred_invloss',
                'predgrad_global_norm'
            ]
            self.stats_list += [
                predloss, predictor.forwardloss, predictor.invloss,
                predgrad_global_norm
            ]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        # prediction bonus function for icm (only defined when a predictor exists)
        if self.unsup:
            self.pred_bonus = predictor.pred_bonus
            self.pred_bonuses = predictor.pred_bonuses

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
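For context, the pred_bonus exposed above is what turns the predictor's error into an intrinsic reward. A minimal NumPy sketch of that idea; the scale factor eta and the names are illustrative assumptions, not taken from the snippet:

import numpy as np

def curiosity_bonus(phi_next_pred, phi_next, eta=0.01):
    # Intrinsic reward proportional to the forward model's squared error in
    # feature space: transitions it cannot predict well look "novel".
    return eta * 0.5 * np.sum((phi_next_pred - phi_next) ** 2, axis=-1)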
Ejemplo n.º 13
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 proportion_of_exp_used_for_predictor_update,
                 microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('rnd_ppo_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # Create our RND model that will generate our intrinsic rewards
        rnd_model = RND(ob_space, proportion_of_exp_used_for_predictor_update)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.INT_R = INT_R = tf.placeholder(tf.float32, [None])
        self.EXT_R = EXT_R = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])

        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        vf_loss_int = (0.5 * vf_coef) * tf.reduce_mean(
            tf.square(train_model.vf_int - self.INT_R))
        vf_loss_ext = (0.5 * vf_coef) * tf.reduce_mean(
            tf.square(train_model.vf_ext - self.EXT_R))
        vf_loss = vf_loss_int + vf_loss_ext

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss + rnd_model.rnd_loss

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('rnd_ppo_model')

        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'rnd_loss', 'policy_entropy',
            'approxkl', 'clipfrac'
        ]
        self.stats_list = [
            pg_loss, vf_loss, rnd_model.rnd_loss, entropy, approxkl, clipfrac
        ]

        self.train_model = train_model
        self.act_model = act_model
        self.rnd_model = rnd_model
        self.step = act_model.step
        self.values = act_model.values
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
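A conceptual sketch of what an RND module like the one above computes: a trained predictor is regressed onto a fixed, randomly initialized target network, and the per-observation prediction error serves as the intrinsic reward. Everything here, including the shapes, is illustrative and independent of the RND class used in the snippet:

import numpy as np

rng = np.random.default_rng(0)
obs_dim, feat_dim = 64, 16
W_target = rng.normal(size=(obs_dim, feat_dim))          # frozen random features
W_pred = rng.normal(size=(obs_dim, feat_dim)) * 0.1      # parameters being trained

def rnd_bonus(obs):
    target = obs @ W_target       # output of the fixed target network
    pred = obs @ W_pred           # output of the predictor network
    return np.mean((pred - target) ** 2, axis=-1)   # large on unfamiliar states

print(rnd_bonus(rng.normal(size=(3, obs_dim))))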
Ejemplo n.º 14
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight=1, comm=None, microbatch_size=None, disc_coeff=None, num_levels=200):
        self.sess = sess = get_session()

        self.num_levels = num_levels

        if disc_coeff is not None:
            self.disc_coeff = disc_coeff
        else:
            self.disc_coeff = tf.placeholder(tf.float32, [])

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        with tf.variable_scope('vae'):
            reconstruction = build_reconstructor(train_model.z)

        with tf.variable_scope('discriminator_model', reuse=tf.AUTO_REUSE):
            # CREATE DISCRIMINATOR MODEL
            discriminator_inputs = train_model.z

            predicted_logits = build_discriminator(discriminator_inputs, num_levels)

            self.predicted_labels = tf.nn.softmax(predicted_logits)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        self.TRAIN_GEN = tf.placeholder(tf.float32, [])

        # Seed labels for the discriminator
        self.LABELS = LABELS = tf.placeholder(tf.int32, [None])

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        # VAE-related
        reconstruction_loss = tf.reduce_mean(tf.square(tf.cast(self.train_model.X, tf.float32) - reconstruction * 255.), (1, 2, 3))
        latent_loss = -0.5 * tf.reduce_sum(1. + self.train_model.z_log_std_sq - tf.square(self.train_model.z_mean) - tf.exp(self.train_model.z_log_std_sq), 1)
        vae_loss = tf.reduce_mean(reconstruction_loss + latent_loss)
        

        discriminator_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.LABELS, logits=predicted_logits))
        discriminator_accuracy = tf.reduce_mean(tf.cast(tf.equal(self.LABELS, tf.argmax(predicted_logits, axis=-1, output_type=tf.int32)), tf.float32))

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = 1. * (pg_loss - entropy * ent_coef + vf_loss * vf_coef)

        pd_loss = tf.reduce_mean(-1. * tf.reduce_sum((1. / float(num_levels) * (tf.nn.log_softmax(predicted_logits, axis=-1))), axis=-1))

        self.update_discriminator_params(comm, discriminator_loss, mpi_rank_weight, LR, max_grad_norm)

        self.update_vae_params(comm, vae_loss, mpi_rank_weight, LR, max_grad_norm=None)

        self.update_policy_params(comm, loss, mpi_rank_weight, LR, max_grad_norm)

        # self.update_all_params(comm, loss + (self.disc_coeff * pd_loss), discriminator_loss, mpi_rank_weight, LR, max_grad_norm)

        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'discriminator_loss', 'discriminator_accuracy', 'pd_loss', 'softmax_min', 'softmax_max', 'vae_loss', 'reconstruction_loss', 'latent_loss']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac, discriminator_loss, discriminator_accuracy, pd_loss, tf.reduce_min(self.predicted_labels), tf.reduce_max(self.predicted_labels), vae_loss, tf.reduce_mean(reconstruction_loss), tf.reduce_mean(latent_loss)]
        if isinstance(self.disc_coeff, tf.Tensor):
            self.loss_names.append("disc_coeff")
            self.stats_list.append(self.disc_coeff)

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101

        self.training_i = 0
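The VAE terms assembled above reduce to a reconstruction error in pixel space plus the closed-form KL between the encoder's diagonal Gaussian and a standard normal. A NumPy sketch with illustrative array inputs (not the model's tensors):

import numpy as np

def vae_loss(x_uint8, x_recon01, z_mean, z_log_var):
    # Reconstruction error in pixel space (the decoder outputs values in [0, 1])
    recon = np.mean((x_uint8.astype(np.float32) - x_recon01 * 255.0) ** 2, axis=(1, 2, 3))
    # KL( N(z_mean, exp(z_log_var)) || N(0, I) ), summed over latent dimensions
    kl = -0.5 * np.sum(1.0 + z_log_var - z_mean ** 2 - np.exp(z_log_var), axis=1)
    return np.mean(recon + kl)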
Ejemplo n.º 15
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()

        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps)
        norm_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        # Dipam:  Add placeholder for discriminator labels and hyperparameters
        #DISC_LR = tf.placeholder(tf.float32, [])
        DISC_LAM = tf.placeholder(tf.float32, [])
        DISC_LABELS = tf.placeholder(tf.int64, [None])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        #Dipam: Add loss for domain discriminator here       
        disc_logits = train_model.disc_logits
        
        domain_onehot = tf.one_hot(DISC_LABELS, 2)
        disc_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=disc_logits, labels = domain_onehot))
        #disc_trainer = tf.train.AdamOptimizer(learning_rate = DISC_LR, epsilon=1e-5)

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        params = tf.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            mpi_print('param', p, num_params)
            total_num_params += num_params

        mpi_print('total num params:', total_num_params)

        l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT

        #if Config.SYNC_FROM_ROOT:
        #    trainer = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=LR, epsilon=1e-5)
        #else:
        orig_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        feat_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        disc_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        polc_trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)

        feat_params = tf.trainable_variables("model/features")
        disc_params = tf.trainable_variables("model/discriminator")
        polc_params = tf.trainable_variables("model/policy")

        feat_loss = loss - tf.multiply(DISC_LAM,disc_loss) # Flip gradients from discriminator

        feat_grad_var = feat_trainer.compute_gradients(feat_loss, feat_params)
        polc_grad_var = polc_trainer.compute_gradients(loss, polc_params)
        disc_grad_var = disc_trainer.compute_gradients(disc_loss, disc_params) 

        grads_and_var = orig_trainer.compute_gradients(loss, params)
        # Dipam: Compute discriminator gradients and apply here along with policy gradients
 
        grads, var = zip(*grads_and_var)
        # Dipam: Add discriminator gradients to policy gradients        
        
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

       # def apply_max_grad_norm(grads_and_var):
       #     grads, var = zip(*grads_and_var)
       # 
       #     if max_grad_norm is not None:
       #         grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
       #     return list(zip(grads, var))
# Dipam: TODO: This separate grad-norm clipping is not correct.
# Correct method: append all the grads and vars -> clip by global norm -> separate -> apply individually.
       # feat_grad_var = apply_max_grad_norm(feat_grad_var) 
       # polc_grad_var = apply_max_grad_norm(polc_grad_var)
       # disc_grad_var = apply_max_grad_norm(disc_grad_var)
        
        _train = orig_trainer.apply_gradients(grads_and_var)
        _train_feat = feat_trainer.apply_gradients(feat_grad_var)
        _train_polc = polc_trainer.apply_gradients(polc_grad_var)
        _train_disc = disc_trainer.apply_gradients(disc_grad_var)

        def train(lr, cliprange, disc_lam, obs, returns, masks, actions, values, neglogpacs, levelids, states=None):
            advs = returns - values

            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)
            
            domain_labels = levelids % 2

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values,
                    DISC_LABELS: domain_labels, DISC_LAM: disc_lam}

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            if disc_lam == 0:
                return sess.run(
                        [pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, loss,_train],
                        td_map)[:-1]
            else:
                return sess.run(
                    [pg_loss, vf_loss, entropy, approxkl, clipfrac, l2_loss, loss , feat_loss, disc_loss, 
                    _train_feat, _train_polc, _train_disc],
                    td_map)[:-3]
        self.loss_names = ['policy_grad_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss', 
                           'total_loss']
        self.disc_loss_names = ['policy_grad_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac', 'l2_loss', 
                                'total_loss', 'feat_loss', 'disc_loss']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load

        if Config.SYNC_FROM_ROOT:
            if MPI.COMM_WORLD.Get_rank() == 0:
                initialize()
            
            global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="")
            sync_from_root(sess, global_variables) #pylint: disable=E1101
        else:
            initialize()
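The key trick in this snippet is feat_loss = loss - DISC_LAM * disc_loss: through the shared features the discriminator term enters with a reversed sign, so the feature extractor is pushed to make the domain classifier worse, while disc_trainer still minimizes +disc_loss on the discriminator's own weights. A scalar sketch of that sign flip, with arbitrary numbers:

lam = 0.5
d_loss_d_w = 0.8       # gradient of the PPO loss w.r.t. a shared feature weight
d_disc_d_w = 0.3       # gradient of the discriminator loss w.r.t. the same weight

# Gradient the feature weight actually receives from feat_trainer:
d_featloss_d_w = d_loss_d_w - lam * d_disc_d_w
print(d_featloss_d_w)  # 0.65: the discriminator contribution is subtracted,
                       # i.e. the features ascend the discriminator loss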
Ejemplo n.º 16
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        self.max_grad_norm = max_grad_norm

        self.running_stats_s = RunningStats()
        self.running_stats_s_ = RunningStats()
        self.running_stats_r = RunningStats()
        self.running_stats_r_i = RunningStats()

        sess = tf.compat.v1.get_default_session()

        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps,
                             max_grad_norm)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1,
                           max_grad_norm)

        # in case we don't use rep loss
        rep_loss = 0
        SKILLS = tf.compat.v1.placeholder(
            tf.float32, shape=[nbatch_train, Config.N_SKILLS], name='mb_skill')
        A = train_model.pdtype.sample_placeholder([None])
        ADV_1 = tf.compat.v1.placeholder(tf.float32, [None])
        ADV_2 = tf.compat.v1.placeholder(tf.float32, [None])
        # Sum the extrinsic and intrinsic advantage streams; keep the
        # placeholders separate so both can actually be fed in train()
        ADV = ADV_1 + ADV_2
        R = tf.compat.v1.placeholder(tf.float32, [None])
        R_i = tf.compat.v1.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None])
        OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None])
        OLDVPRED_i = tf.compat.v1.placeholder(tf.float32, [None])
        LR = tf.compat.v1.placeholder(tf.float32, [])
        CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [])
        # VF loss
        vpred = train_model.vf_train  # Same as vf_run for SNI and default, but noisy for SNI2 while the bootstrap is not
        vpredclipped = OLDVPRED + tf.clip_by_value(
            train_model.vf_train - OLDVPRED, -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(
            input_tensor=tf.maximum(vf_losses1, vf_losses2))

        vpred_i = train_model.vf_i_train  # Same as vf_run for SNI and default, but noisy for SNI2 while the bootstrap is not
        vpredclipped_i = OLDVPRED_i + tf.clip_by_value(vpred_i - OLDVPRED_i,
                                                       -CLIPRANGE, CLIPRANGE)
        vf_losses1_i = tf.square(vpred_i - R_i)
        vf_losses2_i = tf.square(vpredclipped_i - R_i)
        vf_loss_i = .5 * tf.reduce_mean(
            input_tensor=tf.maximum(vf_losses1_i, vf_losses2_i))

        neglogpac_train = train_model.pd_train[0].neglogp(A)
        ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train)
        pg_losses_train = -ADV * ratio_train
        pg_losses2_train = -ADV * tf.clip_by_value(
            ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(
            input_tensor=tf.maximum(pg_losses_train, pg_losses2_train))
        approxkl_train = .5 * tf.reduce_mean(
            input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC))
        clipfrac_train = tf.reduce_mean(input_tensor=tf.cast(
            tf.greater(tf.abs(ratio_train -
                              1.0), CLIPRANGE), dtype=tf.float32))

        if Config.BETA >= 0:
            entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0].
                                     _components_distribution.entropy())
        else:
            entropy = tf.reduce_mean(
                input_tensor=train_model.pd_train[0].entropy())

        # Add entropy and policy loss for the samples as well
        if Config.SNI or Config.SNI2:
            neglogpac_run = train_model.pd_run.neglogp(A)
            ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run)
            pg_losses_run = -ADV * ratio_run
            pg_losses2_run = -ADV * tf.clip_by_value(
                ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

            pg_loss += tf.reduce_mean(
                input_tensor=tf.maximum(pg_losses_run, pg_losses2_run))
            pg_loss /= 2.

            entropy += tf.reduce_mean(
                input_tensor=train_model.pd_run.entropy())
            entropy /= 2.

            approxkl_run = .5 * tf.reduce_mean(
                input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC))
            clipfrac_run = tf.reduce_mean(
                input_tensor=tf.cast(tf.greater(tf.abs(ratio_run -
                                                       1.0), CLIPRANGE),
                                     dtype=tf.float32))
        else:
            approxkl_run = tf.constant(0.)
            clipfrac_run = tf.constant(0.)

        params = tf.compat.v1.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            mpi_print('param', p, num_params)
            total_num_params += num_params

        mpi_print('total num params:', total_num_params)

        l2_loss = tf.reduce_sum(
            input_tensor=[tf.nn.l2_loss(v) for v in weight_params])

        # The first occurrence should be in the train_model

        if Config.BETA >= 0:
            info_loss = tf.compat.v1.get_collection(key="INFO_LOSS",
                                                    scope="model/info_loss")
            beta = Config.BETA

        elif Config.BETA_L2A >= 0:
            info_loss = tf.compat.v1.get_collection(key="INFO_LOSS_L2A",
                                                    scope="model/info_loss")
            beta = Config.BETA_L2A
        else:
            info_loss = [tf.constant(0.)]
            beta = 0

        # print(info_loss)
        assert len(info_loss) == 1
        info_loss = info_loss[0]

        rep_loss = tf.reduce_mean(
            tf.compat.v1.losses.softmax_cross_entropy(
                onehot_labels=SKILLS, logits=train_model.discriminator_logits))

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + l2_loss * Config.L2_WEIGHT + beta * info_loss + (
            rep_loss * Config.REP_LOSS_WEIGHT + vf_loss_i * vf_coef)

        if Config.SYNC_FROM_ROOT:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=LR,
                                       epsilon=1e-5)
        else:
            trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR,
                                                       epsilon=1e-5)

        self.opt = trainer
        grads_and_var = trainer.compute_gradients(loss, params)
        # idx 40 = v_i/w_0

        grads, var = zip(*grads_and_var)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))

        tot_norm = tf.zeros((1, ))
        for g, v in grads_and_var:
            tot_norm += tf.norm(g)
        tot_norm = tf.reshape(tot_norm, [])

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  states_nce,
                  anchors_nce,
                  labels_nce,
                  obs,
                  returns,
                  returns_i,
                  masks,
                  actions,
                  values,
                  values_i,
                  skills,
                  neglogpacs,
                  states=None):
            advs = returns - values
            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)

            advs_i = returns_i - values_i
            adv_mean_i = np.mean(advs_i, axis=0, keepdims=True)
            adv_std_i = np.std(advs_i, axis=0, keepdims=True)
            advs_i = (advs_i - adv_mean_i) / (adv_std_i + 1e-8)

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV_1: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                train_model.STATE: obs,
                ADV_2: advs_i,
                OLDVPRED_i: values_i,
                R_i: returns_i,
                SKILLS: skills,
                train_model.Z: skills
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            return sess.run([
                pg_loss, vf_loss, entropy, approxkl_train, clipfrac_train,
                approxkl_run, clipfrac_run, l2_loss, info_loss, rep_loss,
                vf_loss_i, _train
            ], td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl_train',
            'clipfrac_train', 'approxkl_run', 'clipfrac_run', 'l2_loss',
            'info_loss_cv', 'discriminator_loss', 'value_loss_i'
        ]

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.value_i = act_model.value_i
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        self.rep_vec = act_model.rep_vec
        self.custom_train = train_model.custom_train

        if Config.SYNC_FROM_ROOT:
            if MPI.COMM_WORLD.Get_rank() == 0:
                initialize()

            global_variables = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
            sess.run(tf.compat.v1.global_variables_initializer())
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
        else:
            initialize()
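A NumPy sketch of how the two advantage streams in this snippet are prepared before being fed: the extrinsic and intrinsic advantages are normalized separately and the graph then adds them ahead of the clipped policy-gradient loss. Names here are illustrative:

import numpy as np

def combined_advantages(returns, values, returns_i, values_i, eps=1e-8):
    adv_e = returns - values        # extrinsic advantage stream
    adv_i = returns_i - values_i    # intrinsic advantage stream
    adv_e = (adv_e - adv_e.mean()) / (adv_e.std() + eps)
    adv_i = (adv_i - adv_i.mean()) / (adv_i.std() + eps)
    return adv_e + adv_i            # what the policy-gradient loss ultimately sees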
Ejemplo n.º 17
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 microbatch_size=None,
                 l1regpi,
                 l2regpi,
                 l1regvf,
                 l2regvf,
                 wclippi,
                 wclipvf,
                 todropoutpi,
                 dropoutpi_keep_prob,
                 dropoutpi_keep_prob_value,
                 todropoutvf,
                 dropoutvf_keep_prob,
                 dropoutvf_keep_prob_value,
                 isbnpitrainmode,
                 isbnvftrainmode):
        self.sess = sess = get_session()
        #REGULARIZATION
        self.toregularizepi = l1regpi > 0 or l2regpi > 0
        self.toregularizevf = l1regvf > 0 or l2regvf > 0
        self.todropoutpi = todropoutpi
        self.todropoutvf = todropoutvf
        self.dropoutpi_keep_prob = dropoutpi_keep_prob  #TENSOR
        self.dropoutpi_keep_prob_value = dropoutpi_keep_prob_value
        self.dropoutvf_keep_prob = dropoutvf_keep_prob
        self.dropoutvf_keep_prob_value = dropoutvf_keep_prob_value
        self.isbnpitrainmode = isbnpitrainmode
        self.isbnvftrainmode = isbnvftrainmode
        self.toweightclippi = wclippi > 0
        self.toweightclipvf = wclipvf > 0

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)
            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        if self.toregularizepi:
            print("regularizing policy network: L1 = {}, L2 = {}".format(
                l1regpi, l2regpi))
            regularizerpi = tf.contrib.layers.l1_l2_regularizer(
                scale_l1=l1regpi, scale_l2=l2regpi, scope='ppo2_model/pi')
            all_trainable_weights_pi = tf.trainable_variables('ppo2_model/pi')
            regularization_penalty_pi = tf.contrib.layers.apply_regularization(
                regularizerpi, all_trainable_weights_pi)
            loss = loss + regularization_penalty_pi
        if self.toregularizevf:
            print("regularizing value network: L1 = {}, L2 = {}".format(
                l1regvf, l2regvf))
            regularizervf = tf.contrib.layers.l1_l2_regularizer(
                scale_l1=l1regvf, scale_l2=l2regvf, scope='ppo2_model/vf')
            all_trainable_weights_vf = tf.trainable_variables('ppo2_model/vf')
            regularization_penalty_vf = tf.contrib.layers.apply_regularization(
                regularizervf, all_trainable_weights_vf)
            loss = loss + regularization_penalty_vf

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        #self._update_op = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        #with tf.control_dependencies(self._update_op):
        grads_and_var = self.trainer.compute_gradients(loss, params)

        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)

        if self.toweightclippi:
            print("clipping policy network = {}".format(wclippi))
            policyparams = tf.trainable_variables('ppo2_model/pi')
            self._wclip_ops_pi = []
            for toclipvar in policyparams:
                if 'logstd' in toclipvar.name:
                    continue
                self._wclip_ops_pi.append(
                    tf.assign(toclipvar,
                              tf.clip_by_value(toclipvar, -wclippi, wclippi)))
            self._wclip_op_pi = tf.group(*self._wclip_ops_pi)
        if self.toweightclipvf:
            print("clipping value network = {}".format(wclipvf))
            valueparams = tf.trainable_variables('ppo2_model/vf')
            self._wclip_ops_vf = []
            for toclipvar in valueparams:
                self._wclip_ops_vf.append(
                    tf.assign(toclipvar,
                              tf.clip_by_value(toclipvar, -wclipvf, wclipvf)))
            self._wclip_op_vf = tf.group(*self._wclip_ops_vf)

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        if self.toregularizepi:
            self.loss_names.append('regularization_pi')
            self.stats_list.append(regularization_penalty_pi)
        if self.toregularizevf:
            self.loss_names.append('regularization_vf')
            self.stats_list.append(regularization_penalty_vf)

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
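A NumPy sketch of the two extra mechanisms this variant adds: an L1/L2 penalty over a scope's weights (the 0.5 in the L2 term follows the tf.nn.l2_loss convention used by tf.contrib.layers) and hard weight clipping to [-wclip, wclip]. The toy weight list is illustrative:

import numpy as np

weights = [np.array([[0.7, -1.4], [0.2, 0.05]])]

def l1_l2_penalty(ws, scale_l1, scale_l2):
    return sum(scale_l1 * np.abs(w).sum() + scale_l2 * 0.5 * np.square(w).sum()
               for w in ws)

def clip_weights(ws, wclip):
    return [np.clip(w, -wclip, wclip) for w in ws]

print(l1_l2_penalty(weights, 1e-4, 1e-4), clip_weights(weights, 1.0))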
Ejemplo n.º 18
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm, adaptive_kl):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            train_model = policy(None, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        A = train_model.pdtype.sample_placeholder([None])
        MEANNOW = train_model.pdtype.sample_placeholder([None])
        LOGSTDNOW = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        NEGLOGPACNOW = tf.placeholder(tf.float32, [None])
        RHO_NOW = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # Cliprange
        CLIPRANGE = tf.placeholder(tf.float32, [])
        KLCONST = tf.placeholder(tf.float32, [])
        KL_REST = tf.placeholder(tf.float32, [None])

        neglogpac = train_model.pd.neglogp(A)
        mean = train_model.pd.mean
        logstd = train_model.pd.logstd

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(-0.5 * tf.square((A - mean) / tf.exp(logstd)) - logstd +
                       0.5 * tf.square((A - MEANNOW) / tf.exp(LOGSTDNOW)) +
                       LOGSTDNOW)
        sgn = tf.ones_like(ratio) * tf.expand_dims(tf.sign(ADV), 1)
        ratio_clip = tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        # Defining Loss = - J is equivalent to max J
        r = tf.reduce_prod(sgn * tf.minimum(ratio * sgn, ratio_clip * sgn),
                           axis=-1)
        pg_losses = -r * ADV / tf.stop_gradient(
            tf.reduce_mean(r))  # * tf.minimum(1.0,RHO_NOW)

        # Final PG loss
        # pg_loss = tf.reduce_mean(tf.stop_gradient(tf.maximum(pg_losses, pg_losses2))*(-neglogpac)) + .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))

        approxkl = .5 * tf.reduce_mean(
            tf.square(neglogpac - OLDNEGLOGPAC) * KL_REST)
        approxklold = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        approxklnow = .5 * tf.reduce_mean(
            tf.square(neglogpac - NEGLOGPACNOW) * tf.minimum(1.0, RHO_NOW))
        kloldnew = tf.reduce_mean(
            tf.reduce_sum(
                logstd - LOGSTDNOW + 0.5 *
                (tf.square(tf.exp(LOGSTDNOW)) + tf.square(mean - MEANNOW)) /
                tf.square(tf.exp(logstd)) - 0.5,
                axis=1) * KL_REST)
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        pg_loss = tf.reduce_mean(pg_losses)  # * tf.minimum(1.0,RHO_NOW))
        # Total loss# * tf.minimum(1.0,RHO_NOW))
        if adaptive_kl:
            pg_loss = pg_loss + KLCONST * kloldnew
        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        print(params)
        # 2. Build our trainer
        if MPI is not None:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=LR,
                                       epsilon=1e-5)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        _grad_norm = tf.sqrt(
            tf.reduce_sum([tf.norm(grad)**2 for grad in grads]))
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  klconst,
                  rgae,
                  trunc_rho,
                  obs,
                  returns,
                  advs,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  mean_now,
                  logstd_now,
                  kl_rest,
                  rho_now,
                  neglogpnow,
                  states=None):
            # Advantage A(s,a) = R + gamma * V(s') - V(s) is passed in as advs
            # (Returns = R + gamma * V(s')); here we only normalize it
            if rgae:
                r = np.minimum(trunc_rho, rho_now)
                radvs = r * advs
                advs = (advs - radvs.mean() / r.mean()) / (radvs.std() + 1e-8)
            else:
                advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                MEANNOW: mean_now,
                LOGSTDNOW: logstd_now,
                KLCONST: klconst,
                KL_REST: kl_rest,
                RHO_NOW: rho_now,
                NEGLOGPACNOW: neglogpnow
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run([
                pg_loss, vf_loss, entropy, approxkl, clipfrac, kloldnew,
                approxklold, approxklnow, _grad_norm, _train
            ], td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac', 'kloldnew', 'approxklold', 'approxklnow', 'gradnorm'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.meanlogstd = act_model.meanlogstd
        self.value = act_model.value
        self.values = train_model.value
        self.meanlogstds = train_model.meanlogstd
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")

        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
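The kloldnew term in this snippet is the closed-form KL between two diagonal Gaussian policies, evaluated per sample and summed over action dimensions. A NumPy sketch of the same formula, with illustrative names:

import numpy as np

def kl_diag_gaussians(mean_old, logstd_old, mean_new, logstd_new):
    var_old = np.exp(2.0 * logstd_old)
    var_new = np.exp(2.0 * logstd_new)
    per_dim = (logstd_new - logstd_old
               + (var_old + (mean_old - mean_new) ** 2) / (2.0 * var_new)
               - 0.5)
    return np.sum(per_dim, axis=-1)   # KL(old || new) per sample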
Ejemplo n.º 19
0
    def __init__(self, agent, network, nsteps, rho, ent_coef, vf_coef,
                 max_grad_norm, seed, load_path, **network_kwargs):
        super(AgentModel, self).__init__(name='MAPPO2Model')
        set_global_seeds(seed)
        # Get state_space and action_space
        ob_space = agent.observation_space
        ac_space = agent.action_space

        if isinstance(network, str):
            network_type = network
            policy_network_fn = get_network_builder(network_type)(
                **network_kwargs)
            network = policy_network_fn(ob_space.shape)

        self.train_model = PolicyWithValue(ac_space, network)
        if MPI is not None:
            self.optimizer = MpiAdamOptimizer(
                MPI.COMM_WORLD, self.train_model.trainable_variables)
        else:
            self.optimizer = tf.keras.optimizers.Adam()

        # if isinstance(network, str):
        #     network = get_network_builder(network)(**network_kwargs)
        # policy_network = network(ob_space.shape)
        # value_network = network(ob_space.shape)
        # self.train_model = pi = PolicyWithValue(ac_space, policy_network, value_network)
        # self.pi_var_list = policy_network.trainable_variables + list(pi.pdtype.trainable_variables)
        # self.vf_var_list = value_network.trainable_variables + pi.value_fc.trainable_variables

        # if MPI is not None:
        #     self.pi_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.pi_var_list)
        #     self.vf_optimizer = MpiAdamOptimizer(MPI.COMM_WORLD, self.vf_var_list)
        # else:
        #     self.pi_optimizer = tf.keras.optimizers.Adam()
        #     self.vf_optimizer = tf.keras.optimizers.Adam()
        self.agent = agent
        self.nsteps = nsteps
        self.rho = rho
        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.max_grad_norm = max_grad_norm
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
        self.loss_names = [
            'Lagrange_loss', 'sync_loss', 'policy_loss', 'value_loss',
            'policy_entropy', 'approxkl', 'clipfrac'
        ]
        if MPI is not None:
            sync_from_root(self.variables)

        self.comm_matrix = agent.comm_matrix.copy()
        self.estimates = np.ones([agent.nmates, nsteps], dtype=np.float32)
        self.multipliers = np.zeros([agent.nmates, nsteps], dtype=np.float32)
        for i, comm_i in enumerate(self.comm_matrix):
            self.estimates[i] = comm_i[self.agent.id] * self.estimates[i]

        if load_path is not None:
            load_path = osp.expanduser(load_path)
            ckpt = tf.train.Checkpoint(model=self.train_model)
            manager = tf.train.CheckpointManager(ckpt,
                                                 load_path,
                                                 max_to_keep=None)
            ckpt.restore(manager.latest_checkpoint)
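
The load_path handling above relies on TF 2.x object-based checkpointing. A minimal standalone sketch; the model and path here are illustrative, not from the original repo:

# Minimal sketch of the Checkpoint/CheckpointManager pair used above.
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
model.build((None, 8))  # create variables so there is something to checkpoint

ckpt = tf.train.Checkpoint(model=model)
manager = tf.train.CheckpointManager(ckpt, "/tmp/ppo_ckpt", max_to_keep=None)

manager.save()                            # write a checkpoint
ckpt.restore(manager.latest_checkpoint)   # restore the newest one, as in the load_path branch above
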
Ejemplo n.º 20
0
    def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale):

        sess = get_session()

        #TODO find a better way
        input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]
        self.action_shape = 36

        # Placeholders
        self.state_ = phi_state = tf.placeholder(tf.float32,
                                                 [None, *input_shape],
                                                 name="icm_state")
        self.next_state_ = phi_next_state = tf.placeholder(
            tf.float32, [None, *input_shape], name="icm_next_state")
        self.action_ = action = tf.placeholder(tf.float32, [None],
                                               name="icm_action")

        with tf.variable_scope('icm_model'):
            # Feature encoding
            # Aka pass state and next_state to create phi(state), phi(next_state)
            # state --> phi(state)
            phi_state = self.feature_encoding(self.state_)

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                # next_state to phi(next_state)
                phi_next_state = self.feature_encoding(self.next_state_)

            # INVERSE MODEL
            pred_actions_logits, pred_actions_prob = self.inverse_model(
                phi_state, phi_next_state)

            # FORWARD MODEL
            pred_phi_next_state = self.forward_model(action, phi_state)

        # CALCULATE THE ICM LOSS
        # Inverse Loss LI
        # We calculate the cross entropy between our ât and at
        # Cast the labels to int32 (required by sparse_softmax_cross_entropy_with_logits)
        labels = tf.cast(action, tf.int32)

        self.inv_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=pred_actions_logits, labels=labels),
            name="inverse_loss")

        # Forward Loss
        # LF = 1/2 || pred_phi_next_state - phi_next_state ||^2
        # TODO 0.5 * ?
        self.forw_loss = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                        name="forward_loss")

        # Todo predictor lr scale ?
        # ICM_LOSS = [(1 - beta) * LI + beta * LF ] * Predictor_Lr_scale
        self.icm_loss = (
            (1 - beta) * self.inv_loss + beta * self.forw_loss) * icm_lr_scale

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        icm_params = tf.trainable_variables('icm_model')
        # 2. Build our trainer
        icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=1e-4,
                                       epsilon=1e-5)
        # 3. Calculate the gradients
        icm_grads_and_var = icm_trainer.compute_gradients(
            self.icm_loss, icm_params)
        icm_grads, icm_var = zip(*icm_grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            icm_grads, icm_grad_norm = tf.clip_by_global_norm(
                icm_grads, max_grad_norm)
        icm_grads_and_var = list(zip(icm_grads, icm_var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        _icm_train = icm_trainer.apply_gradients(icm_grads_and_var)

        if MPI.COMM_WORLD.Get_rank() == 0:
            print("Initialize")
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        print("GLOBAL VARIABLES", global_variables)
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
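
The loss assembled in this example combines an inverse-model cross-entropy with a forward-model MSE. A NumPy sketch of just that arithmetic, with toy values (beta and icm_lr_scale picked arbitrarily, not taken from the original code):

# NumPy sketch of how the ICM loss is assembled:
# icm_loss = ((1 - beta) * LI + beta * LF) * icm_lr_scale
import numpy as np

def softmax(x):
    z = np.exp(x - x.max(axis=-1, keepdims=True))
    return z / z.sum(axis=-1, keepdims=True)

logits = np.array([[2.0, 0.5, -1.0]])          # toy predicted-action logits
labels = np.array([0])                          # true action a_t
inv_loss = -np.log(softmax(logits)[np.arange(len(labels)), labels]).mean()

phi_next      = np.array([[0.2, -0.1, 0.4]])    # toy phi(next_state)
pred_phi_next = np.array([[0.1,  0.0, 0.5]])    # toy forward-model prediction
forw_loss = np.mean((pred_phi_next - phi_next) ** 2)

beta, icm_lr_scale = 0.2, 1.0
icm_loss = ((1 - beta) * inv_loss + beta * forw_loss) * icm_lr_scale
print(inv_loss, forw_loss, icm_loss)
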
Ejemplo n.º 21
0
def learn(*,
          network,
          env,
          total_timesteps,
          iter_loss,
          arch,
          _run,
          seed=None,
          nsteps=2048,
          ent_coef=0.0,
          learning_rate=3e-4,
          lr_schedule=None,
          vf_coef=0.5,
          max_grad_norm=0.5,
          gamma=0.99,
          lam=0.95,
          log_interval=10,
          nminibatches=4,
          noptepochs=4,
          cliprange=0.2,
          load_path=None,
          mpi_rank_weight=1,
          comm=None,
          eval=None,
          **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------
    network:
       The network model. Will only work with the one in this repo because of IBAC
    env: baselines.common.vec_env.VecEnv
    total_timesteps: int
         number of timesteps (i.e. number of actions taken in the environment)
    iter_loss: dict
        the config dict as specified in default.yaml and/or overridden by command line arguments
        see sacred for further documentation
    arch: dict
        config dict similar to iter_loss
    eval: dict
        config dict similar to iter_loss
    _run:
        sacred Experiment._run object. Used for logging
    ent_coef: float
        policy entropy coefficient in the optimization objective
    seed: float
        random seed
    nsteps: int
        number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
        nenv is number of environment copies simulated in parallel)
    learning_rate: float
        learning rate
    lr_schedule: None or str
        If None, use a constant learning rate. If a string, only "linear" is implemented at the moment
    vf_coef: float
        Coefficient for vf optimisation
    max_grad_norm: float
        Max gradient norm before it's clipped
    gamma: float
        Discount factor
    lam: float
        For GAE
    log_interval: int
        number of timesteps between logging events
    nminibatches: int
        number of training minibatches per update. For recurrent policies,
        should be smaller or equal than number of environments run in parallel.
    noptepochs: int
        number of training epochs per update
    cliprange: float or function
        clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
        and 0 is the end of the training
    save_interval: int
        number of timesteps between saving events
    load_path: str
        path to load the model from
    **network_kwargs:
        keyword arguments to the policy / network builder.
        See baselines.common/policies.py/build_policy and arguments to a particular type of network
        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    # Set learning rate schedule
    lr = get_lr_fn(lr_schedule, start_learning_rate=learning_rate)

    set_global_seeds(seed)
    session = get_session()

    # if isinstance(lr, float): lr = constfn(lr)
    # else: assert callable(lr)
    if isinstance(cliprange, float): cliprange = constfn(cliprange)
    else: assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space = env.observation_space
    ac_space = env.action_space

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)
    model_fn = Model

    policy = build_policy(env, network, arch, **network_kwargs)

    # Instantiate the model object (that creates act_model and train_model)
    def create_model(scope_name, **kwargs):
        return model_fn(scope_name=scope_name,
                        policy=policy,
                        ob_space=ob_space,
                        ac_space=ac_space,
                        nbatch_act=nenvs,
                        nbatch_train=nbatch_train,
                        nsteps=nsteps,
                        ent_coef=ent_coef,
                        vf_coef=vf_coef,
                        max_grad_norm=max_grad_norm,
                        comm=comm,
                        mpi_rank_weight=mpi_rank_weight,
                        iter_loss=iter_loss,
                        arch=arch,
                        **kwargs)

    # model_train is the teacher and always executed
    # model_burnin is trained. If teacher and student are swapped, the parameters from burnin are
    # copied into the teacher and burnin is re-initialized
    model_train = create_model("ppo_iter_train")
    model_burnin = create_model(
        "ppo_iter_burnin",
        target_vf=model_train.train_model.vf_run,
        target_dist_param=model_train.train_model.pi_run)

    get_session().run(tf.variables_initializer(tf.global_variables()))
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if MPI is not None:
        sync_from_root(session, global_variables, comm=comm)  # pylint: disable=E1101

    if load_path is not None:
        print("Load model...")
    if eval["load_id"]:
        # Only works with mongodb as backend, not with tinydb
        raise NotImplementedError("Requires MongoDB backend to work")
        docs = get_docs(db_uri, db_name, "runs")
        projection = {'config': True}
        projection.update({'artifacts': True})

        doc = docs.find_one({'_id': eval["load_id"]}, projection)
        print("Loading model from db to disc")
        file_id = get_file_id(doc, eval["file_name"])
        load_path = os.path.join(logger.get_dir(),
                                 "loadmodel_{}".format(_run._id))
        save_file_from_db(file_id, load_path, db_uri, db_name)
        model_train.load(load_path)
        if eval["switch_after_load"]:
            switch_training_model(0,
                                  is_mpi_root,
                                  model_train,
                                  _run,
                                  iter_loss,
                                  session,
                                  comm,
                                  save=False)

    # Instantiate the runner object
    runner = Runner(env=env,
                    model=model_train,
                    model_burnin=model_burnin,
                    nsteps=nsteps,
                    gamma=gamma,
                    lam=lam,
                    iter_loss=iter_loss,
                    eval=eval)

    epinfobuf = deque(maxlen=100)

    burnin_data_idx = 0
    all_burnin_data = None

    assert iter_loss["timesteps_anneal"] > iter_loss["v2_buffer_size"] * env.num_envs * nsteps, \
        "{}, {}".format(iter_loss["timesteps_anneal"], iter_loss["v2_buffer_size"] * env.num_envs * nsteps)

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    current_cycle_count = 0
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        num_timesteps = update * nbatch
        # Start timer
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)

        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # 'Burnin_phase' tells us whether we need regularization
        cycle_count, alpha_reg, burnin_phase = scheduling(
            num_timesteps, iter_loss, "alpha_reg")

        if cycle_count != current_cycle_count:
            current_cycle_count = cycle_count
            if iter_loss["v2"]:
                logger.info("Training student")
                train_student(
                    teacher=model_train,
                    student=model_burnin,
                    data=all_burnin_data,
                    iter_loss=iter_loss,
                    lr=lrnow,
                    cliprange=cliprangenow,
                    nminibatches=nminibatches,
                    session=session,
                    max_idx=burnin_data_idx,
                    nenvs=env.num_envs,
                    nsteps=nsteps,
                    id=_run._id,
                )
            switch_training_model(update, is_mpi_root, model_train, _run,
                                  iter_loss, session, comm)
            # Resetting
            all_burnin_data = None
            burnin_data_idx = 0
            logger.info("Switched training model")

        tstart = time.perf_counter()

        if update % log_interval == 0 and is_mpi_root:
            logger.info('Stepping environment...')

        # Get minibatch
        obs, returns, b_returns, masks, actions, values, b_values, neglogpacs, states, b_states, epinfos, burnin_data = \
            runner.run(burnin_phase) #pylint: disable=E0632

        if burnin_phase and (iter_loss["v2"] or eval["save_latent"]):
            print("Saving data")
            if iter_loss["v2_use_files"] or eval["save_latent"]:
                # Burnin_data_idx is incremented by nsteps, which is nr. of files
                save_data(burnin_data, burnin_data_idx, _run._id, nsteps)
            else:
                if all_burnin_data is None:
                    all_burnin_data = get_all_burnin_data_dict(
                        env, iter_loss, nsteps, comm)
                for key, value in burnin_data.items():
                    all_burnin_data[key][burnin_data_idx:burnin_data_idx +
                                         nsteps] = value
            burnin_data_idx += nsteps

        if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

        epinfobuf.extend(epinfos)

        # For each minibatch, calculate the loss and append it.
        mblossvals = []
        mblossvals_burnin = []
        if states is None:  # nonrecurrent version
            # Index of each element of batch_size
            # Create the indices array
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                # Randomize the indexes
                np.random.shuffle(inds)
                # 0 to batch_size with batch_train_size step
                for start in range(0, nbatch, nbatch_train):

                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices_train = (arr[mbinds]
                                    for arr in (obs, returns, actions, values,
                                                neglogpacs))
                    slices_burnin = (arr[mbinds]
                                     for arr in (obs, b_returns, actions,
                                                 b_values, neglogpacs))
                    stats_train, train_op_train, feed = model_train.train(
                        lrnow,
                        cliprangenow,
                        *slices_train,
                    )

                    stats_burnin, train_op_burnin, feed_burnin = model_burnin.train(
                        lrnow,
                        cliprangenow,
                        *slices_burnin,
                        alpha=alpha_reg,
                    )
                    feed.update(feed_burnin)  # Needs both!

                    fetches = {}
                    if eval["eval_only"]:
                        session_outputs = {}
                    elif not burnin_phase or iter_loss["v2"]:
                        # For v2, normal PPO training is only the old policy,
                        # The student policy is trained differently
                        fetches.update({
                            "stats_train": stats_train,
                        })
                        fetches.update({"train_op": train_op_train})
                        session_outputs = session.run(fetches, feed)
                    elif (iter_loss["update_old_policy"]
                          or (iter_loss["update_old_policy_in_initial"]
                              and cycle_count == 0)):
                        fetches.update({"stats_burnin": stats_burnin})
                        fetches.update({"train_op": train_op_burnin})
                        session_outputs_burnin = session.run(fetches, feed)

                        fetches.update({
                            "stats_train": stats_train,
                        })
                        fetches.update({"train_op": train_op_train})
                        session_outputs = session.run(fetches, feed)

                        session_outputs.update(session_outputs_burnin)
                    else:
                        fetches.update({"stats_burnin": stats_burnin})
                        fetches.update({"train_op": train_op_burnin})
                        session_outputs = session.run(fetches, feed)

                    if "stats_train" in session_outputs.keys():
                        mblossvals.append(session_outputs["stats_train"])
                    else:
                        mblossvals.append(
                            [0 for loss in model_train.loss_names])

                    if "stats_burnin" in session_outputs.keys():
                        mblossvals_burnin.append(
                            session_outputs["stats_burnin"])
                    else:
                        mblossvals_burnin.append(
                            [0 for loss in model_burnin.loss_names])

        else:  # recurrent version
            raise NotImplementedError("Recurrent version not implemented")

        # Feedforward --> get losses --> update
        lossvals = np.mean(mblossvals, axis=0)
        lossvals_burnin = np.mean(mblossvals_burnin, axis=0)
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update % log_interval == 0 or update == 1:
            # Calculates if the value function is a good predictor of the returns (ev near 1)
            # or if it's just worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, returns)
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            logger.logkv('eprewmean',
                         safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean',
                         safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model_train.loss_names):
                logger.logkv('loss/' + lossname, lossval)
            for (lossval, lossname) in zip(lossvals_burnin,
                                           model_burnin.loss_names):
                logger.logkv('loss_burnin/' + lossname, lossval)
            logger.logkv("schedule/alpha_reg", alpha_reg)
            logger.logkv("schedule/current_cycle_count", current_cycle_count)
            logger.logkv("schedule/burnin_phase", burnin_phase)

            logger.dumpkvs()

    if is_mpi_root:
        save_model(model_train, "model", update, _run)
    return model_train
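
Each update in learn() anneals the optimizer hyperparameters through frac = 1 - (update - 1) / nupdates. A small sketch of that schedule; constfn mirrors the baselines helper, and the linear schedule is an assumption matching lr_schedule="linear":

# Sketch of the per-update annealing in learn(): frac goes from 1 (start of
# training) towards 0 (end), and lr / cliprange are evaluated at frac.
def constfn(val):
    return lambda _: val

def linear(start):
    return lambda frac: start * frac

nupdates = 4
lr, cliprange = linear(3e-4), constfn(0.2)
for update in range(1, nupdates + 1):
    frac = 1.0 - (update - 1.0) / nupdates
    print(update, lr(frac), cliprange(frac))
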
Ejemplo n.º 22
0
    def __init__(self, ob_space, ac_space, ent_coef, vf_coef,
                max_grad_norm, mpi_rank_weight=1, comm=None,
                normalize_observations=True, normalize_returns=True,
                use_tensorboard=False, tb_log_dir=None):
        self.sess = sess = get_session()
        self.use_tensorboard = use_tensorboard

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        # CREATE OUR TWO MODELS
        network_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]
        vnetwork_spec = [
            {
                'layer_type': 'dense',
                'units': 256,
                'activation': 'relu',
                'nodes_in': ['observation_self'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            },
            {
                'layer_type': 'dense',
                'units': 128,
                'activation': 'relu',
                'nodes_in': ['main'],
                'nodes_out': ['main']
            }
        ]

        # act_model that is used for sampling
        act_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                stochastic=True, reuse=False, build_act=True,
                trainable_vars=None, not_trainable_vars=None,
                gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                normalize_observations=normalize_observations, normalize_returns=normalize_returns)

        # Train model for training
        train_model = PpoPolicy(scope='ppo', ob_space=ob_space, ac_space=ac_space, network_spec=network_spec, v_network_spec=vnetwork_spec,
                    stochastic=True, reuse=True, build_act=True,
                    trainable_vars=None, not_trainable_vars=None,
                    gaussian_fixed_var=True, weight_decay=0.0, ema_beta=0.99999,
                    normalize_observations=normalize_observations, normalize_returns=normalize_returns)
        
        # CREATE THE PLACEHOLDERS
        self.A = A = {k: v.sample_placeholder([None]) for k, v in train_model.pdtypes.items()}
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = sum([train_model.pds[k].neglogp(A[k]) for k in train_model.pdtypes.keys()])

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        #entropy = tf.reduce_mean(train_model.entropy)
        entropy = tf.reduce_mean(sum([train_model.pds[k].entropy() for k in train_model.pdtypes.keys()]))

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.scaled_value_tensor
        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables(scope="ppo")
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm, learning_rate=LR, mpi_rank_weight=mpi_rank_weight, epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model

        self.step = act_model.act
        self.value = act_model.value
        self.initial_state = act_model.zero_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm) #pylint: disable=E1101

        if self.use_tensorboard:
            self.attach_tensorboard(tb_log_dir)
            self.tb_step = 0
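
The value head in these models is trained on the worse of the clipped and unclipped squared errors, which keeps the critic from drifting too far from its rollout-time predictions. A NumPy sketch with toy numbers, not tied to any model above:

# NumPy sketch of PPO's clipped value loss:
# vf_loss = 0.5 * mean(max((vpred - R)^2, (vpredclipped - R)^2))
import numpy as np

R         = np.array([1.0, 0.5, -0.2])   # returns
oldvpred  = np.array([0.8, 0.4,  0.1])   # value predictions at rollout time
vpred     = np.array([1.5, 0.3, -0.5])   # current value predictions
cliprange = 0.2

vpredclipped = oldvpred + np.clip(vpred - oldvpred, -cliprange, cliprange)
vf_loss = 0.5 * np.mean(np.maximum((vpred - R) ** 2, (vpredclipped - R) ** 2))
print(vf_loss)
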
Ejemplo n.º 23
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 mix_mode='nomix',
                 mix_alpha=0.2,
                 mix_beta=0.2,
                 fix_representation=False,
                 use_l2reg=False,
                 l2reg_coeff=1e-4):
        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train,
                                     nsteps,
                                     sess,
                                     mix_mode=mix_mode)
            else:
                train_model = policy(microbatch_size,
                                     nsteps,
                                     sess,
                                     mix_mode=mix_mode)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        # Interpolating the supervision
        if mix_mode == 'mixreg':
            # get coeff and indices
            coeff = train_model.coeff
            indices = train_model.indices
            other_indices = train_model.other_indices
            # mixup
            OLDNEGLOGPAC = coeff * tf.gather(OLDNEGLOGPAC, indices, axis=0) \
                    + (1 - coeff) * tf.gather(
                            OLDNEGLOGPAC, other_indices, axis=0)
            OLDVPRED = coeff * tf.gather(OLDVPRED, indices, axis=0) \
                    + (1 - coeff) * tf.gather(OLDVPRED, other_indices, axis=0)
            R = coeff * tf.gather(R, indices, axis=0) \
                    + (1 - coeff) * tf.gather(R, other_indices, axis=0)
            ADV = coeff * tf.gather(ADV, indices, axis=0) \
                    + (1 - coeff) * tf.gather(ADV, other_indices, axis=0)
            A = tf.gather(A, indices, axis=0)
        elif mix_mode == 'mixobs':
            # get indices
            indices = train_model.indices
            # gather
            OLDNEGLOGPAC = tf.gather(OLDNEGLOGPAC, train_model.indices, axis=0)
            OLDVPRED = tf.gather(OLDVPRED, train_model.indices, axis=0)
            R = tf.gather(R, train_model.indices, axis=0)
            ADV = tf.gather(ADV, train_model.indices, axis=0)
            A = tf.gather(A, train_model.indices, axis=0)
        elif mix_mode == 'nomix':
            pass
        else:
            raise ValueError(f"Unknown mixing mode: {mix_mode} !")

        # Store the nodes to be recorded
        self.loss_names = []
        self.stats_list = []

        ############ CALCULATE LOSS ############
        # Total loss = Policy gradient loss - entropy * entropy coefficient
        #   + Value coefficient * value loss

        # Normalizing advantage
        ADV = (ADV - tf.reduce_mean(ADV)) / (reduce_std(ADV) + 1e-8)

        # Calculate the entropy
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Calculate value loss
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate policy gradient loss
        neglogpac = train_model.pd.neglogp(A)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Record some information
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        self.loss_names.extend([
            'total_loss',
            'policy_loss',
            'value_loss',
            'policy_entropy',
            'approxkl',
            'clipfrac',
        ])
        self.stats_list.extend([
            loss,
            pg_loss,
            vf_loss,
            entropy,
            approxkl,
            clipfrac,
        ])
        ############################################

        ############ UPDATE THE PARAMETERS ############
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        if use_l2reg:
            weight_params = [v for v in params if '/b' not in v.name]
            l2_loss = tf.reduce_sum([tf.nn.l2_loss(v) for v in weight_params])
            self.loss_names.append('l2_loss')
            self.stats_list.append(l2_loss)
            loss = loss + l2_loss * l2reg_coeff
        if fix_representation:
            params = params[-4:]
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)
        # 4. Clip the gradient if required
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        ###############################################

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self._init_op = tf.variables_initializer(params)
        self._sync_param = lambda: sync_from_root(sess, params, comm=comm)

        self.mix_mode = mix_mode
        self.mix_alpha = mix_alpha
        # JAG: Add beta parameter
        self.mix_beta = mix_beta
        self.fix_representation = fix_representation
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.adv_gradient = act_model.adv_gradient
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # Exclude the random convolution layer from syncing
        global_variables = [
            v for v in global_variables if 'randcnn' not in v.name
        ]
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)
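
In the mixreg branch above, the supervision targets (returns, advantages, old log-probs, old values) are blended with the same coefficients and index pairs used to mix the observations. A NumPy sketch of that gather-and-blend step; the index and coefficient arrays here are hypothetical, in the real model they come from train_model:

# NumPy sketch of mixreg target mixing:
# mixed = coeff * x[indices] + (1 - coeff) * x[other_indices]
import numpy as np

R             = np.array([1.0, 2.0, 3.0, 4.0])   # e.g. returns for a batch of 4
coeff         = np.array([0.7, 0.9, 0.6, 0.8])   # per-sample mixing coefficients
indices       = np.array([0, 1, 2, 3])
other_indices = np.array([2, 3, 0, 1])           # partners to mix with

mixed_R = coeff * R[indices] + (1 - coeff) * R[other_indices]
print(mixed_R)
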
Ejemplo n.º 24
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 lf_coef,
                 max_grad_norm,
                 init_labda=1.,
                 microbatch_size=None,
                 threshold=1.):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_lyapunov_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.l_ADV = l_ADV = tf.placeholder(tf.float32, [None])
        # Both of these returns are discounted returns
        self.R = R = tf.placeholder(tf.float32, [None])

        self.v_l = v_l = tf.placeholder(tf.float32, [None])
        log_labda = tf.get_variable('ppo2_lyapunov_model/Labda',
                                    None,
                                    tf.float32,
                                    initializer=tf.log(init_labda))
        self.labda = tf.exp(log_labda)

        self.safety_threshold = tf.placeholder(tf.float32, None, 'threshold')

        self.threshold = threshold
        # self.log_labda = tf.placeholder(tf.float32, None, 'Labda')
        # self.labda = tf.constant(10.)
        # self.Lam=10.

        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.OLDLPRED = OLDLPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Get the predicted value
        lpred = train_model.lf
        lpredclipped = OLDLPRED + tf.clip_by_value(train_model.lf - OLDLPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        lf_losses1 = tf.square(lpred - v_l)
        # Clipped value
        lf_losses2 = tf.square(lpredclipped - v_l)

        lf_loss = .5 * tf.reduce_mean(tf.maximum(lf_losses1, lf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining safety loss

        lpred = train_model.lf
        lpred_ = train_model.lf_
        # self.l_lambda = tf.reduce_mean(ratio *  tf.stop_gradient(lpred_) - tf.stop_gradient(lpred))
        l_lambda1 = tf.reduce_mean(ratio * l_ADV + v_l - self.safety_threshold)
        l_lambda2 = tf.reduce_mean(
            tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE) * l_ADV +
            v_l - self.safety_threshold)

        l_lambda = tf.maximum(l_lambda1, l_lambda2)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))+ l_lambda*tf.stop_gradient(self.labda) - \
                  tf.stop_gradient(l_lambda) * log_labda
        # pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2)+ self.l_lambda * self.labda)
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + lf_loss * lf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_lyapunov_model')
        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'safety_value_loss', 'policy_entropy',
            'approxkl', 'clipfrac', 'lagrangian'
        ]
        self.stats_list = [
            pg_loss, vf_loss, lf_loss, entropy, approxkl, clipfrac, self.labda
        ]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.eval_step = act_model.eval_step
        self.value = act_model.value
        self.l_value = act_model.l_value
        self.l_value_ = act_model.l_value_
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
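
The Lyapunov variant keeps a learnable multiplier labda = exp(log_labda) and adds the safety surrogate l_lambda = max(l_lambda1, l_lambda2) to the policy loss. A NumPy sketch of just the surrogate terms, with toy values for the safety advantages and safety values:

# NumPy sketch of the safety surrogate used above:
# l_lambda = max(mean(ratio*l_ADV + v_l - d), mean(clip(ratio)*l_ADV + v_l - d))
import numpy as np

ratio     = np.array([0.9, 1.3, 1.1])
l_ADV     = np.array([0.2, -0.1, 0.4])   # advantages of the safety critic
v_l       = np.array([0.5, 0.6, 0.4])    # safety values
threshold = 1.0
cliprange = 0.2

l1 = np.mean(ratio * l_ADV + v_l - threshold)
l2 = np.mean(np.clip(ratio, 1 - cliprange, 1 + cliprange) * l_ADV + v_l - threshold)
l_lambda = max(l1, l2)
print(l_lambda)
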
Ejemplo n.º 25
0
    def __init__(self,
                 *,
                 network,
                 env,
                 lr=3e-4,
                 cliprange=0.2,
                 nsteps=128,
                 nminibatches=4,
                 noptepochs=4,
                 ent_coef=0.0,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 gamma=0.99,
                 lam=0.95,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 load_path=None,
                 **network_kwargs):
        """
        Parameters:
        ----------

        network:                          policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                                          specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                                          tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                                          neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                                          See common/models.py/lstm for more details on using recurrent nets in policies.py

        env: baselines.common.vec_env.VecEnv     environment. Needs to be vectorized for parallel environment simulation.
                                          The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class.


        lr: float or function             learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the
                                          training and 0 is the end of the training.

        cliprange: float or function      clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training
                                          and 0 is the end of the training

        nsteps: int                       number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                                          nenv is number of environment copies simulated in parallel)


        nminibatches: int                 number of training minibatches per update. For recurrent policies.py,
                                          should be smaller or equal than number of environments run in parallel.

        noptepochs: int                   number of training epochs per update

        ent_coef: float                   policy entropy coefficient in the optimization objective

        vf_coef: float                    value function loss coefficient in the optimization objective

        gamma: float                      discounting factor

        lam: float                        advantage estimation discounting factor (lambda in the paper)

        log_interval: int                 number of timesteps between logging events

        load_path: str                    path to load the model from

        **network_kwargs:                 keyword arguments to the policy / network builder. See baselines.common/policies.py.py/build_policy and arguments to a particular type of network
                                          For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

        """

        self.sess = sess = get_session()

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        policy = build_policy(env, network, **network_kwargs)

        self.env = env

        if isinstance(lr, float):
            self.lr = constfn(lr)
        else:
            assert callable(lr)
        if isinstance(cliprange, float):
            self.cliprange = constfn(cliprange)
        else:
            assert callable(cliprange)
        self.nminibatches = nminibatches

        # if eval_env is not None:
        #     eval_runner = Runner(env=eval_env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

        # Calculate the batch_size
        self.nenvs = self.env.num_envs
        self.nsteps = nsteps
        self.nbatch = self.nenvs * self.nsteps
        self.nbatch_train = self.nbatch // nminibatches
        self.noptepochs = noptepochs

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(self.nenvs, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(self.nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder(
            [None])  # action placeholder
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0),
                                   CLIPRANGE)))  # fraction of clipped ratios

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS

        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.def_path_pre = os.path.dirname(
            os.path.abspath(__file__)) + '/tmp/'

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  # pylint: disable=E1101

        if load_path is not None:
            self.load_newest(load_path)

        # Instantiate the runner object
        self.runner = Runner(env=self.env,
                             model=self,
                             nsteps=nsteps,
                             gamma=gamma,
                             lam=lam)
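
Every Model in these examples builds the same clipped surrogate objective together with the approxkl and clipfrac diagnostics it logs. A NumPy sketch of those three quantities for a toy minibatch (numbers chosen arbitrarily):

# NumPy sketch of PPO's clipped policy loss and its diagnostics:
# pg_loss  = mean(max(-ADV*ratio, -ADV*clip(ratio, 1-eps, 1+eps)))
# approxkl = 0.5 * mean((neglogpac - OLDNEGLOGPAC)^2)
# clipfrac = mean(|ratio - 1| > eps)
import numpy as np

ADV          = np.array([ 0.5, -0.3,  1.2])
OLDNEGLOGPAC = np.array([ 1.1,  0.7,  2.0])
neglogpac    = np.array([ 1.0,  0.9,  1.5])
eps = 0.2

ratio    = np.exp(OLDNEGLOGPAC - neglogpac)
pg_loss  = np.mean(np.maximum(-ADV * ratio,
                              -ADV * np.clip(ratio, 1 - eps, 1 + eps)))
approxkl = 0.5 * np.mean((neglogpac - OLDNEGLOGPAC) ** 2)
clipfrac = np.mean(np.abs(ratio - 1.0) > eps)
print(pg_loss, approxkl, clipfrac)
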
Ejemplo n.º 26
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            train_model = policy(nbatch_train, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # Cliprange
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining Loss = - J is equivalent to max J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model')
        # 2. Build our trainer
        trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                   learning_rate=LR,
                                   epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        _train = trainer.apply_gradients(grads_and_var)

        def train(lr,
                  cliprange,
                  obs,
                  returns,
                  masks,
                  actions,
                  values,
                  neglogpacs,
                  states=None):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advs = returns - values

            # Normalize the advantages
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run(
                [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train],
                td_map)[:-1]

        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)

        if MPI.COMM_WORLD.Get_rank() == 0:
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
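As a quick reference, here is a standalone NumPy sketch of the clipped policy and value losses wired up in the graph above; the array values are made up for illustration and nothing here touches the TensorFlow graph.

import numpy as np

def ppo_losses(adv, ratio, vpred, old_vpred, returns, cliprange=0.2):
    # Policy term: take the pessimistic (max) of the unclipped and clipped surrogates
    pg1 = -adv * ratio
    pg2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
    pg_loss = np.mean(np.maximum(pg1, pg2))
    # Value term: clip the new prediction around the old one before squaring
    vpred_clipped = old_vpred + np.clip(vpred - old_vpred, -cliprange, cliprange)
    vf_loss = 0.5 * np.mean(np.maximum((vpred - returns) ** 2,
                                       (vpred_clipped - returns) ** 2))
    return pg_loss, vf_loss

# ratio = exp(OLDNEGLOGPAC - neglogpac); toy numbers only
print(ppo_losses(adv=np.array([1.0, -0.5, 2.0]),
                 ratio=np.array([1.3, 0.7, 1.05]),
                 vpred=np.array([0.9, 0.2, 1.5]),
                 old_vpred=np.array([1.0, 0.1, 1.4]),
                 returns=np.array([1.2, 0.0, 1.6])))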
Ejemplo n.º 27
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 load_path,
                 skip_layers=[],
                 frozen_weights=[],
                 transfer_weights=False,
                 microbatch_size=None):
        self.sess = sess = get_session()

        with tf.variable_scope('ppo2_model', reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining the loss as -J is equivalent to maximizing J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        def print_weights(params):
            variables_names = [v.name for v in params]
            values = sess.run(variables_names)
            for k, v in zip(variables_names, values):
                if str(k) == 'ppo2_model/vf/w:0':
                    print("Variable: " + str(k))
                    print("Shape: " + str(v.shape))
                    print(v)

        # Initialise the already_initialised array
        already_inits = []

        # Transfer weights from an already trained model
        # TODO: only relevant when transfer learning is enabled
        if transfer_weights:
            # Get all variables from the model.
            variables_to_restore = {
                v.name.split(":")[0]: v
                for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
            }

            # Skip some variables during restore.
            skip_pretrained_var = skip_layers

            variables_to_restore = {
                v: variables_to_restore[v]
                for v in variables_to_restore
                if not any(x in v for x in skip_pretrained_var)
            }

            already_inits = variables_to_restore

            # Restore the remaining variables
            if variables_to_restore:
                saver_pre_trained = tf.train.Saver(
                    var_list=variables_to_restore)

                saver_pre_trained.restore(
                    sess, tf.train.latest_checkpoint(load_path))

            # Collect all trainable variables
            params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

            # Freeze certain variables
            params = tf.contrib.framework.filter_variables(
                params,
                include_patterns=['model'],
                exclude_patterns=frozen_weights)

            # Initialise all the other variables
            '''
            """Initialize all the uninitialized variables in the global scope."""
            new_variables = set(tf.global_variables())
            new_variables = tf.contrib.framework.filter_variables(
                    new_variables,
                    include_patterns=[],
                    exclude_patterns= variables_to_restore)
            tf.get_default_session().run(tf.variables_initializer(new_variables))   
            '''
        else:
            # If we are not using transfer learning
            # 1. Get the model parameters
            params = tf.trainable_variables('ppo2_model')

        # 2. Build our trainer
        if MPI is not None:
            self.trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=LR,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        #self.save = functools.partial(save_variables, sess=sess)
        #self.load = functools.partial(load_variables, sess=sess)

        initialize(already_inits)

        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        if MPI is not None:
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
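The transfer branch in this example boils down to two name filters: variables whose names match skip_layers are not restored from the checkpoint, and variables matching frozen_weights are restored but excluded from training. A minimal sketch with plain strings standing in for tf variables (the names below are hypothetical):

all_vars = ['ppo2_model/pi/w', 'ppo2_model/pi/b',
            'ppo2_model/vf/w', 'ppo2_model/conv1/w']
skip_layers = ['vf']          # skipped at restore time
frozen_weights = ['conv1']    # restored but kept frozen during training

# Variables to load from the pre-trained checkpoint
to_restore = [v for v in all_vars
              if not any(pat in v for pat in skip_layers)]
# Variables handed to the optimizer
trainable = [v for v in all_vars
             if 'model' in v and not any(pat in v for pat in frozen_weights)]

print(to_restore)  # ['ppo2_model/pi/w', 'ppo2_model/pi/b', 'ppo2_model/conv1/w']
print(trainable)   # ['ppo2_model/pi/w', 'ppo2_model/pi/b', 'ppo2_model/vf/w']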
Ejemplo n.º 28
0
    def __init__(self, ob_space, ac_space, max_grad_norm, beta, icm_lr_scale,
                 idf):

        sess = get_session()

        #TODO find a better way
        input_shape = [ob_space.shape[0], ob_space.shape[1], ob_space.shape[2]]

        # input_shape = ob_space
        print("ICM state Input shape ", np.shape(input_shape), "  ",
              input_shape)
        self.action_shape = 36
        self.idf = idf

        # Placeholders

        self.state_ = phi_state = tf.placeholder(tf.float32,
                                                 [None, *input_shape],
                                                 name="icm_state")
        self.next_state_ = phi_next_state = tf.placeholder(
            tf.float32, [None, *input_shape], name="icm_next_state")
        self.action_ = action = tf.placeholder(tf.float32, [None],
                                               name="icm_action")
        # self.R = rewards = tf.placeholder(tf.float32, shape=[None], name="maxR")

        with tf.variable_scope('icm_model'):
            # Feature encoding
            # Aka pass state and next_state to create phi(state), phi(next_state)
            # state --> phi(state)
            print("Feature Encodding of phi state with shape :: ", self.state_)
            phi_state = self.feature_encoding(self.state_)

            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                # next_state to phi(next_state)
                phi_next_state = self.feature_encoding(self.next_state_)

            # INVERSE MODEL
            if self.idf:
                pred_actions_logits, pred_actions_prob = self.inverse_model(
                    phi_state, phi_next_state)

            # FORWARD MODEL
            pred_phi_next_state = self.forward_model(action, phi_state)

        # CALCULATE THE ICM LOSS
        # Inverse Loss LI
        # We calculate the cross entropy between the predicted action â_t and the true action a_t
        # Squeeze the labels (required)
        labels = tf.cast(action, tf.int32)

        print("prediction pred_actions_logits")
        if self.idf:
            self.inv_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=pred_actions_logits, labels=labels),
                name="inverse_loss")

        # Forward Loss
        # LF = 1/2 * || pred_phi_next_state - phi_next_state ||^2
        # TODO 0.5 * ?
        self.forw_loss_axis = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                             axis=-1,
                                             name="forward_loss_axis")

        self.forw_loss = tf.reduce_mean(tf.square(
            tf.subtract(pred_phi_next_state, phi_next_state)),
                                        name="forward_loss")

        # Todo predictor lr scale ?
        # ICM_LOSS = [(1 - beta) * LI + beta * LF ] * Predictor_Lr_scale
        if self.idf:
            self.icm_loss = ((1 - beta) * self.inv_loss + beta * self.forw_loss
                             )  #* icm_lr_scale
        else:
            self.icm_loss = self.forw_loss

        ####
        # self.icm_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name)
        # print("ICM var list ::: " , self.icm_var_list)
        ####

        #
        # if max_grad_norm is not None :
        # t_icm_grads , _ = tf.clip_by_global_norm(self.icm_loss, constants['GRAD_NORM_CLIP'] )
        # t_icm_grads_and_vars = list(zip(self.icm_loss , self.icm_var_list))
        # print("\n\n\nit works \n\n\n")
        #

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        self.icm_params = tf.trainable_variables(
            'icm_model')  ## var_list same as

        ## testing phase
        self.predgrads = tf.gradients(self.icm_loss, self.icm_params)
        self.predgrads, _ = tf.clip_by_global_norm(self.predgrads,
                                                   max_grad_norm)
        self.pred_grads_and_vars = list(zip(self.predgrads, self.icm_params))

        ## testing phase

        # print("\n\nTrainable variables \n ",icm_params)
        # # 2. Build our trainer
        self.icm_trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                            learning_rate=1e-3,
                                            epsilon=1e-5)
        # # 3. Calculate the gradients
        icm_grads_and_var = self.icm_trainer.compute_gradients(
            self.icm_loss, self.icm_params)
        # # t_grads_and_var = tf.gradients()
        icm_grads, icm_var = zip(*icm_grads_and_var)

        if max_grad_norm is not None:
            #     # Clip the gradients (normalize)
            icm_grads, icm__grad_norm = tf.clip_by_global_norm(
                icm_grads, max_grad_norm)
        icm_grads_and_var = list(zip(icm_grads, icm_var))
        # # zip pairs each gradient with its associated parameter
        # # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self._icm_train = self.icm_trainer.apply_gradients(icm_grads_and_var)

        if MPI.COMM_WORLD.Get_rank() == 0:
            print("Initialize")
            initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # print("GLOBAL VARIABLES", global_variables)
        sync_from_root(sess, global_variables)  #pylint: disable=E1101
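For reference, a rough NumPy sketch of how the inverse and forward losses above are combined when idf is enabled (toy shapes, a single sample, no TensorFlow):

import numpy as np

def icm_loss(phi_next_pred, phi_next, action_logits, action, beta=0.2):
    # Forward loss: mean squared error between predicted and true next features
    forw_loss = np.mean((phi_next_pred - phi_next) ** 2)
    # Inverse loss: cross-entropy of the true action under the predicted logits
    log_probs = action_logits - np.log(np.sum(np.exp(action_logits)))
    inv_loss = -log_probs[action]
    # Same weighting as self.icm_loss when self.idf is True
    return (1 - beta) * inv_loss + beta * forw_loss

rng = np.random.default_rng(0)
print(icm_loss(rng.normal(size=8), rng.normal(size=8),
               rng.normal(size=4), action=2))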
Ejemplo n.º 29
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                 nsteps, ent_coef, vf_coef, max_grad_norm):
        self.max_grad_norm = max_grad_norm
        self.head_idx_current_batch = 0
        sess = tf.compat.v1.get_default_session()

        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps,
                             max_grad_norm)
        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1,
                           max_grad_norm)

        # in case we don't use rep loss
        rep_loss = None
        # HEAD_IDX = tf.compat.v1.placeholder(tf.int32, [None])
        LATENT_FACTORS = train_model.pdtype.sample_placeholder(
            [
                Config.REP_LOSS_M, Config.POLICY_NHEADS, Config.NUM_ENVS,
                count_latent_factors(Config.ENVIRONMENT)
            ],
            name='LATENT_FACTORS')
        ADV = tf.compat.v1.placeholder(tf.float32, [None], name='ADV')
        U_T = tf.compat.v1.placeholder(tf.float32, [None, 256, 128])
        Z_T_1 = tf.compat.v1.placeholder(tf.float32, [None, 256, 128])
        R = tf.compat.v1.placeholder(tf.float32, [None], name='R')
        R_NCE = tf.compat.v1.placeholder(
            tf.float32,
            [Config.REP_LOSS_M, Config.POLICY_NHEADS, Config.NUM_ENVS],
            name='R_NCE')
        OLDNEGLOGPAC = tf.compat.v1.placeholder(tf.float32, [None],
                                                name='OLDNEGLOGPAC')
        LR = tf.compat.v1.placeholder(tf.float32, [], name='LR')
        CLIPRANGE = tf.compat.v1.placeholder(tf.float32, [], name='CLIPRANGE')
        STEP = tf.compat.v1.placeholder(tf.float32, [], name='STEP')

        # TD loss for critic
        # VF loss
        OLDVPRED = tf.compat.v1.placeholder(tf.float32, [None],
                                            name='OLDVPRED')
        vpred = train_model.vf_train  # Same as vf_run for SNI and default, but noisy for SNI2 while the bootstrap is not
        if Config.CUSTOM_REP_LOSS and Config.POLICY_NHEADS > 1:
            vpred = vpred[self.head_idx_current_batch]
        vpredclipped = OLDVPRED + tf.clip_by_value(vpred - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(
            input_tensor=tf.maximum(vf_losses1, vf_losses2))

        neglogpac_train = train_model.pd_train[0].neglogp(train_model.A)
        ratio_train = tf.exp(OLDNEGLOGPAC - neglogpac_train)
        pg_losses_train = -ADV * ratio_train
        pg_losses2_train = -ADV * tf.clip_by_value(
            ratio_train, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(
            input_tensor=tf.maximum(pg_losses_train, pg_losses2_train))
        approxkl_train = .5 * tf.reduce_mean(
            input_tensor=tf.square(neglogpac_train - OLDNEGLOGPAC))
        clipfrac_train = tf.reduce_mean(input_tensor=tf.cast(
            tf.greater(tf.abs(ratio_train -
                              1.0), CLIPRANGE), dtype=tf.float32))

        if Config.BETA >= 0:
            entropy = tf.reduce_mean(input_tensor=train_model.pd_train[0].
                                     _components_distribution.entropy())
        else:
            entropy = tf.reduce_mean(
                input_tensor=train_model.pd_train[0].entropy())

        # Add entropy and policy loss for the samples as well
        if Config.SNI or Config.SNI2:
            neglogpac_run = train_model.pd_run.neglogp(train_model.A)
            ratio_run = tf.exp(OLDNEGLOGPAC - neglogpac_run)
            pg_losses_run = -ADV * ratio_run
            pg_losses2_run = -ADV * tf.clip_by_value(
                ratio_run, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)

            pg_loss += tf.reduce_mean(
                input_tensor=tf.maximum(pg_losses_run, pg_losses2_run))
            pg_loss /= 2.

            entropy += tf.reduce_mean(
                input_tensor=train_model.pd_run.entropy())
            entropy /= 2.

            approxkl_run = .5 * tf.reduce_mean(
                input_tensor=tf.square(neglogpac_run - OLDNEGLOGPAC))
            clipfrac_run = tf.reduce_mean(
                input_tensor=tf.cast(tf.greater(tf.abs(ratio_run -
                                                       1.0), CLIPRANGE),
                                     dtype=tf.float32))
        else:
            approxkl_run = tf.constant(0.)
            clipfrac_run = tf.constant(0.)

        adv_pred = tf.reduce_mean(
            input_tensor=tf.square(tf.stop_gradient(ADV) - train_model.adv_pi))
        # v_pred = tf.reduce_mean(input_tensor=tf.square(tf.stop_gradient(vpred) - train_model.v_pi))
        # bc = tf.reduce_mean(input_tensor=(tf.stop_gradient(OLDNEGLOGPAC)-neglogpac_train))

        params = tf.compat.v1.trainable_variables()
        weight_params = [v for v in params if '/b' not in v.name]

        total_num_params = 0

        for p in params:
            shape = p.get_shape().as_list()
            num_params = np.prod(shape)
            mpi_print('param', p, num_params)
            total_num_params += num_params

        mpi_print('total num params:', total_num_params)

        l2_loss = tf.reduce_sum(
            input_tensor=[tf.nn.l2_loss(v) for v in weight_params])

        # The first occurrence should be in the train_model

        if Config.BETA >= 0:
            info_loss = tf.compat.v1.get_collection(key="INFO_LOSS",
                                                    scope="model/info_loss")
            beta = Config.BETA

        elif Config.BETA_L2A >= 0:
            info_loss = tf.compat.v1.get_collection(key="INFO_LOSS_L2A",
                                                    scope="model/info_loss")
            beta = Config.BETA_L2A
        else:
            info_loss = [tf.constant(0.)]
            beta = 0

        # print(info_loss)
        assert len(info_loss) == 1
        info_loss = info_loss[0]
        """"
        Sinkhorn clustering of state sequences
        """

        p_t = tf.nn.log_softmax(
            tf.linalg.matmul(train_model.u_t, train_model.protos) / 0.1,
            axis=1)
        cluster_loss = -tf.compat.v1.reduce_mean(
            tf.compat.v1.reduce_sum(tf.stop_gradient(train_model.codes) * p_t,
                                    axis=1))

        #+ 0.25 * adv_pred
        pi_loss = pg_loss - entropy * ent_coef + Config.REP_LOSS_WEIGHT * train_model.rep_loss + Config.REP_LOSS_WEIGHT * cluster_loss  #+ vf_coef*vf_loss
        v_loss = vf_loss * vf_coef
        aux_loss = (
            (1 - 0.0368)**STEP
        ) * Config.REP_LOSS_WEIGHT * train_model.rep_loss + Config.REP_LOSS_WEIGHT * cluster_loss  #0.5 * v_pred + bc

        if Config.SYNC_FROM_ROOT:
            trainer = MpiAdamOptimizer(MPI.COMM_WORLD,
                                       learning_rate=LR,
                                       epsilon=1e-5)
            trainer_v = MpiAdamOptimizer(MPI.COMM_WORLD,
                                         learning_rate=LR,
                                         epsilon=1e-5)
            trainer_aux = MpiAdamOptimizer(MPI.COMM_WORLD,
                                           learning_rate=LR,
                                           epsilon=1e-5)
        else:
            trainer = tf.compat.v1.train.AdamOptimizer(learning_rate=LR,
                                                       epsilon=1e-5)
            trainer_v = tf.compat.v1.train.AdamOptimizer(learning_rate=LR,
                                                         epsilon=1e-5)
            # trainer_aux is used below regardless of SYNC_FROM_ROOT,
            # so it needs a non-MPI fallback as well
            trainer_aux = tf.compat.v1.train.AdamOptimizer(learning_rate=LR,
                                                           epsilon=1e-5)

        self.opt = trainer
        # import ipdb;ipdb.set_trace()
        pi_params = [p for p in params if 'pi_branch' in p.name]
        grads_and_var_pi = trainer.compute_gradients(pi_loss, pi_params)

        grads_pi, var_pi = zip(*grads_and_var_pi)
        if max_grad_norm is not None:
            grads_pi, _grad_norm_pi = tf.clip_by_global_norm(
                grads_pi, max_grad_norm)
        grads_and_var_pi = list(zip(grads_pi, var_pi))
        tot_norm = tf.zeros((1, ))
        for g, v in grads_and_var_pi:
            tot_norm += tf.norm(g)
        tot_norm = tf.reshape(tot_norm, [])
        _train_pi = trainer.apply_gradients(grads_and_var_pi)

        v_params = [p for p in params if 'model_0' in p.name]
        grads_and_var_v = trainer_v.compute_gradients(v_loss, v_params)
        grads_v, var_v = zip(*grads_and_var_v)
        if max_grad_norm is not None:
            grads_v, _grad_norm_v = tf.clip_by_global_norm(
                grads_v, max_grad_norm)
        grads_and_var_v = list(zip(grads_v, var_v))
        _train_v = trainer_v.apply_gradients(grads_and_var_v)

        grads_and_var_aux = trainer_aux.compute_gradients(aux_loss, pi_params)
        grads_aux, var_aux = zip(*grads_and_var_aux)
        if max_grad_norm is not None:
            grads_aux, _grad_norm_aux = tf.clip_by_global_norm(
                grads_aux, max_grad_norm)
        grads_and_var_aux = list(zip(grads_aux, var_aux))
        _train_aux = trainer_aux.apply_gradients(grads_and_var_aux)

        def train(lr,
                  cliprange,
                  states_nce,
                  anchors_nce,
                  labels_nce,
                  rewards_nce,
                  infos_nce,
                  obs,
                  returns,
                  masks,
                  actions,
                  infos,
                  values,
                  neglogpacs,
                  step,
                  states=None,
                  train_target='pi'):
            values = (values[:, self.head_idx_current_batch]
                      if Config.CUSTOM_REP_LOSS else values)
            advs = returns - values
            adv_mean = np.mean(advs, axis=0, keepdims=True)
            adv_std = np.std(advs, axis=0, keepdims=True)
            advs = (advs - adv_mean) / (adv_std + 1e-8)
            # import ipdb;ipdb.set_trace()
            td_map = {
                train_model.X: obs,
                train_model.A: actions,
                ADV: advs,
                R: returns,
                LR: lr,
                train_model.X_pi: obs,
                CLIPRANGE: cliprange,
                OLDNEGLOGPAC: neglogpacs,
                OLDVPRED: values,
                STEP: step,
                train_model.REP_PROC: states_nce
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            if train_target == 'pi':
                pi_res = sess.run([
                    pi_loss, entropy, train_model.rep_loss, cluster_loss,
                    _train_pi
                ], td_map)[:-1]
                return pi_res
            elif train_target == 'value':
                v_res = sess.run([v_loss, _train_v], td_map)[:-1]
                return v_res[0]
            elif train_target == 'aux':
                aux_res = sess.run(
                    [train_model.rep_loss, cluster_loss, _train_aux],
                    td_map)[:-1]
                return aux_res

        self.loss_names = ['policy_loss', 'rep_loss', 'value_loss']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        self.rep_vec = act_model.rep_vec
        self.custom_train = train_model.custom_train

        if Config.SYNC_FROM_ROOT:
            if MPI.COMM_WORLD.Get_rank() == 0:
                initialize()

            global_variables = tf.compat.v1.get_collection(
                tf.compat.v1.GraphKeys.GLOBAL_VARIABLES, scope="")
            sess.run(tf.compat.v1.global_variables_initializer())
            sync_from_root(sess, global_variables)  #pylint: disable=E1101
        else:
            initialize()
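The cluster term added to pi_loss and aux_loss above is a cross-entropy between fixed cluster codes and a temperature-scaled log-softmax over prototype similarities. A small NumPy sketch, with one-hot arrays standing in for the Sinkhorn codes and made-up embeddings (temperature 0.1 as in the snippet):

import numpy as np

def cluster_loss(u_t, protos, codes, temperature=0.1):
    logits = u_t @ protos / temperature                  # (batch, n_protos) similarities
    logits -= logits.max(axis=1, keepdims=True)          # stabilize the softmax
    log_p = logits - np.log(np.sum(np.exp(logits), axis=1, keepdims=True))
    return -np.mean(np.sum(codes * log_p, axis=1))       # cross-entropy against the codes

rng = np.random.default_rng(0)
u_t = rng.normal(size=(4, 8))        # batch of state embeddings
protos = rng.normal(size=(8, 3))     # three prototype vectors
codes = np.eye(3)[[0, 2, 1, 0]]      # one-hot stand-ins for the Sinkhorn codes
print(cluster_loss(u_t, protos, codes))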
Ejemplo n.º 30
0
    def __init__(self,
                 *,
                 policy,
                 ob_space,
                 ac_space,
                 nbatch_act,
                 nbatch_train,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm,
                 mpi_rank_weight=1,
                 comm=None,
                 microbatch_size=None,
                 model_index=0):
        self.sess = sess = get_session()
        self.model_index = model_index

        if MPI is not None and comm is None:
            comm = MPI.COMM_WORLD

        with tf.variable_scope('ppo2_model%s' % model_index,
                               reuse=tf.AUTO_REUSE):
            # CREATE OUR TWO MODELS
            # act_model that is used for sampling
            act_model = policy(nbatch_act, 1, sess)

            # Train model for training
            if microbatch_size is None:
                train_model = policy(nbatch_train, nsteps, sess)
            else:
                train_model = policy(microbatch_size, nsteps, sess)

        # CREATE THE PLACEHOLDERS
        self.A = A = train_model.pdtype.sample_placeholder([None])
        self.ADV = ADV = tf.placeholder(tf.float32, [None])
        self.R = R = tf.placeholder(tf.float32, [None])
        # Keep track of old actor
        self.OLDNEGLOGPAC = OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        # Keep track of old critic
        self.OLDVPRED = OLDVPRED = tf.placeholder(tf.float32, [None])
        self.LR = LR = tf.placeholder(tf.float32, [])
        # Cliprange
        self.CLIPRANGE = CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)

        # Calculate the entropy
        # Entropy is used to improve exploration by limiting premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # CALCULATE THE LOSS
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Clip the value to reduce variability during Critic training
        # Get the predicted value
        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED,
                                                   -CLIPRANGE, CLIPRANGE)
        # Unclipped value
        vf_losses1 = tf.square(vpred - R)
        # Clipped value
        vf_losses2 = tf.square(vpredclipped - R)

        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))

        # Calculate ratio (pi current policy / pi old policy)
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)

        # Defining the loss as -J is equivalent to maximizing J
        pg_losses = -ADV * ratio

        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE,
                                             1.0 + CLIPRANGE)

        # Final PG loss
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(
            tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))

        # Total loss
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # UPDATE THE PARAMETERS USING LOSS
        # 1. Get the model parameters
        params = tf.trainable_variables('ppo2_model%s' % model_index)
        # print("para",model_index,params)
        # 2. Build our trainer
        if comm is not None and comm.Get_size() > 1:
            self.trainer = MpiAdamOptimizer(comm,
                                            learning_rate=LR,
                                            mpi_rank_weight=mpi_rank_weight,
                                            epsilon=1e-5)
        else:
            self.trainer = tf.train.AdamOptimizer(learning_rate=LR,
                                                  epsilon=1e-5)
        # 3. Calculate the gradients
        grads_and_var = self.trainer.compute_gradients(loss, params)
        grads, var = zip(*grads_and_var)

        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_var = list(zip(grads, var))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        self.grads = grads
        self.var = var
        self._train_op = self.trainer.apply_gradients(grads_and_var)
        self.loss_names = [
            'policy_loss', 'value_loss', 'policy_entropy', 'approxkl',
            'clipfrac'
        ]
        self.stats_list = [pg_loss, vf_loss, entropy, approxkl, clipfrac]

        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state

        self.save = functools.partial(save_trainable_variables,
                                      scope="ppo2_model%s" % model_index,
                                      sess=sess)
        self.load = functools.partial(load_trainable_variables,
                                      scope="ppo2_model%s" % model_index,
                                      sess=sess)

        initialize()
        global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                             scope="")
        # print("global_variables",model_index,global_variables)
        if MPI is not None:
            sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E1101
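The approxkl and clipfrac entries in loss_names are cheap diagnostics of how far the updated policy has drifted within one epoch. A toy NumPy illustration of how they are computed from old and new negative log-probabilities:

import numpy as np

old_neglogp = np.array([1.20, 0.80, 2.10, 0.95])
new_neglogp = np.array([1.05, 0.90, 1.60, 0.94])
cliprange = 0.2

ratio = np.exp(old_neglogp - new_neglogp)
approxkl = 0.5 * np.mean((new_neglogp - old_neglogp) ** 2)   # quadratic KL estimate
clipfrac = np.mean(np.abs(ratio - 1.0) > cliprange)          # fraction of clipped samples

print(ratio, approxkl, clipfrac)   # clipfrac is 0.25 here: one of the four ratios is clipped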
Ejemplo n.º 31
0
def main():
    """Run DQN until the environment throws an exception."""
    # Hyperparameters
    learning_rate = 2.5e-4
    gamma = 0.99
    nstep_return = 3
    timesteps_per_proc = 50_000_000
    train_interval = 4
    target_interval = 8192
    batch_size = 512
    min_buffer_size = 20000

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Process procgen training arguments.')
    parser.add_argument('--env_name', type=str, default='starpilot')
    parser.add_argument(
        '--distribution_mode',
        type=str,
        default='easy',
        choices=["easy", "hard", "exploration", "memory", "extreme"])
    parser.add_argument('--num_levels', type=int, default=1)
    parser.add_argument('--start_level', type=int, default=0)
    parser.add_argument('--test_worker_interval', type=int, default=0)
    parser.add_argument('--run_id', type=int, default=1)
    parser.add_argument('--gpus_id', type=str, default='')
    parser.add_argument('--level_setup',
                        type=str,
                        default='procgen',
                        choices=["procgen", "oracle"])
    parser.add_argument('--mix_mode',
                        type=str,
                        default='nomix',
                        choices=['nomix', 'mixreg'])
    parser.add_argument('--mix_alpha', type=float, default=0.2)
    parser.add_argument('--use_l2reg', action='store_true')
    parser.add_argument('--data_aug',
                        type=str,
                        default='no_aug',
                        choices=['no_aug', 'cutout_color', 'crop'])
    parser.add_argument('--PER',
                        type=lambda x: bool(strtobool(x)),
                        default=True,
                        help='Whether to use PER')
    parser.add_argument('--num_envs', type=int, default=64)
    args = parser.parse_args()

    # Setup test worker
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    test_worker_interval = args.test_worker_interval
    is_test_worker = False
    if test_worker_interval > 0:
        is_test_worker = comm.Get_rank() % test_worker_interval == (
            test_worker_interval - 1)
    mpi_rank_weight = 0 if is_test_worker else 1

    num_envs = args.num_envs

    # Setup env specs
    if args.level_setup == "procgen":
        env_name = args.env_name
        num_levels = 0 if is_test_worker else args.num_levels
        start_level = args.start_level
    elif args.level_setup == "oracle":
        env_name = args.env_name
        num_levels = 0
        start_level = args.start_level

    # Setup logger
    log_comm = comm.Split(1 if is_test_worker else 0, 0)
    format_strs = ['csv', 'stdout'] if log_comm.Get_rank() == 0 else []
    logger.configure(
        dir=LOG_DIR +
        f'/{args.level_setup}/{args.mix_mode}/{env_name}/run_{args.run_id}',
        format_strs=format_strs)

    # Create env
    logger.info("creating environment")
    venv = ProcgenEnv(num_envs=num_envs,
                      env_name=env_name,
                      num_levels=num_levels,
                      start_level=start_level,
                      distribution_mode=args.distribution_mode)
    venv = VecExtractDictObs(venv, "rgb")
    venv = VecMonitor(venv=venv, filename=None, keep_buf=100)

    # Setup Tensorflow
    logger.info("creating tf session")
    if args.gpus_id:
        gpus_id = [x.strip() for x in args.gpus_id.split(',')]
        os.environ["CUDA_VISIBLE_DEVICES"] = gpus_id[rank % len(gpus_id)]
    setup_mpi_gpus()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # pylint: disable=E1101
    sess = tf.Session(config=config)
    sess.__enter__()

    # Setup Rainbow models
    logger.info("building models")
    online_net, target_net = rainbow_models(
        sess,
        venv.action_space.n,
        gym_space_vectorizer(venv.observation_space),
        min_val=REWARD_RANGE_FOR_C51[env_name][0],
        max_val=REWARD_RANGE_FOR_C51[env_name][1])
    dqn = MpiDQN(online_net,
                 target_net,
                 discount=gamma,
                 comm=comm,
                 mpi_rank_weight=mpi_rank_weight,
                 mix_mode=args.mix_mode,
                 mix_alpha=args.mix_alpha,
                 use_l2reg=args.use_l2reg,
                 data_aug=args.data_aug)
    player = NStepPlayer(VecPlayer(venv, dqn.online_net), nstep_return)
    optimize = dqn.optimize(learning_rate=learning_rate)

    # Initialize and sync variables
    sess.run(tf.global_variables_initializer())
    global_variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope="")
    if comm.Get_size() > 1:
        sync_from_root(sess, global_variables, comm=comm)  #pylint: disable=E1101

    # Training
    logger.info("training")
    if args.PER:
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0.5,
                                                        0.4,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
    else:
        # Set alpha and beta to 0 for uniform prioritization and no importance sampling
        dqn.train(num_steps=timesteps_per_proc,
                  player=player,
                  replay_buffer=PrioritizedReplayBuffer(500000,
                                                        0,
                                                        0,
                                                        epsilon=0.1),
                  optimize_op=optimize,
                  train_interval=train_interval,
                  target_interval=target_interval,
                  batch_size=batch_size,
                  min_buffer_size=min_buffer_size)
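The --PER flag only changes the (alpha, beta) arguments passed to PrioritizedReplayBuffer: alpha shapes the sampling distribution over priorities and beta the importance-sampling correction, so alpha = beta = 0 degenerates to uniform replay with unit weights. A sketch of that arithmetic in NumPy (toy priorities; this is the standard PER formula, not the buffer's actual implementation):

import numpy as np

def per_probs_and_weights(priorities, alpha, beta):
    scaled = priorities ** alpha
    probs = scaled / scaled.sum()                    # sampling probabilities
    weights = (len(priorities) * probs) ** -beta     # importance-sampling weights
    return probs, weights / weights.max()            # normalize weights to at most 1

priorities = np.array([0.1, 1.0, 2.0, 4.0])
print(per_probs_and_weights(priorities, alpha=0.5, beta=0.4))  # values used in the PER run
print(per_probs_and_weights(priorities, alpha=0.0, beta=0.0))  # uniform fallback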