Example #1
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            step_model = policy(nenvs, 1, sess)
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("a2c_model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
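
A small standalone NumPy sketch (my own illustration with toy data, not part of the project above) of the A2C objective assembled in this example: pg_loss - entropy * ent_coef + vf_loss * vf_coef.

import numpy as np

rng = np.random.default_rng(0)
nbatch, nact = 8, 4
ent_coef, vf_coef = 0.01, 0.5

logits = rng.normal(size=(nbatch, nact))             # train_model.pi
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
actions = rng.integers(nact, size=nbatch)            # A
returns = rng.normal(size=nbatch)                    # R (n-step returns)
values = rng.normal(size=nbatch)                     # V(s) predictions

advs = returns - values                                      # ADV
neglogpac = -np.log(probs[np.arange(nbatch), actions])       # -log pi(a|s)
pg_loss = (advs * neglogpac).mean()
entropy = -(probs * np.log(probs)).sum(axis=1).mean()
vf_loss = ((values - returns) ** 2).mean()
loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
print(loss)
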
Example #2
    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma,
                 max_grad_norm, lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c, trust_region,
                 alpha, delta, scope, load_path, debug, policy_inputs):
        self.sess = sess
        self.nenv = nenvs
        self.policy_inputs = policy_inputs.copy()

        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6

        self.scope = scope
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch], name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch], name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch], name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact], name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")

            self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="value_next")  # (by lzn: we revise goal-conditioned next value)

            if isinstance(ob_space, gym.spaces.Dict):
                self.obs_shape = ob_space.spaces['observation'].shape
                self.obs_dtype = ob_space.spaces['observation'].dtype
            else:
                self.obs_shape = ob_space.shape
                self.obs_dtype = ob_space.dtype
            self.achieved_goal_sh = achieved_goal_sh = ACHIEVED_GOAL_SHAPE
            self.desired_goal_sh = desired_goal_sh = DESIRED_GOAL_SHAPE
            self.desired_goal_state_sh = desired_goal_state_sh = self.obs_shape

            self.step_obs_tf = tf.placeholder(self.obs_dtype, (nenvs,) + self.obs_shape, 'step_obs')
            self.step_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs,) + achieved_goal_sh, 'step_achieved_goal')
            self.step_desired_goal_tf = tf.placeholder(tf.float32, (nenvs, ) + desired_goal_sh, 'step_desired_goal')
            self.step_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs,) + desired_goal_state_sh, 'step_desired_goal_state')

            self.train_obs_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + self.obs_shape, 'train_obs')
            self.train_achieved_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + achieved_goal_sh, 'train_achieved_goal')
            self.train_desired_goal_tf = tf.placeholder(tf.float32, (nenvs * nsteps,) + desired_goal_sh, 'train_desired_goal')
            self.train_desired_goal_state_tf = tf.placeholder(self.obs_dtype, (nenvs * nsteps,) + desired_goal_state_sh, 'train_desired_goal_state')

            # normalize embedding
            normalizer = 2500
            step_achieved_goal_tf = self.step_achieved_goal_tf / normalizer
            step_desired_goal_tf = self.step_desired_goal_tf / normalizer
            train_achieved_goal_tf = self.train_achieved_goal_tf / normalizer
            train_desired_goal_tf = self.train_desired_goal_tf / normalizer

            step_obs_tf = self.step_obs_tf
            step_desired_goal_state_tf = self.step_desired_goal_state_tf
            train_obs_tf = self.train_obs_tf
            train_desired_goal_state_tf = self.train_desired_goal_state_tf

            assert 'obs' in policy_inputs
            logger.info('policy_inputs:{}'.format(policy_inputs))
            logger.info('achieved_goal_sh:{}'.format(self.achieved_goal_sh))
            logger.info('desired_goal_sh:{}'.format(self.desired_goal_sh))
            logger.info('normalizer:{}'.format(normalizer))
            policy_inputs.remove('obs')
            if 'desired_goal_state' in policy_inputs:
                policy_inputs.remove('desired_goal_state')
                step_state_tf = tf.concat([step_obs_tf, step_desired_goal_state_tf], axis=-1, name='step_state')
                train_state_tf = tf.concat([train_obs_tf, train_desired_goal_state_tf], axis=-1, name='train_state')
            else:
                step_state_tf = step_obs_tf
                train_state_tf = train_obs_tf

            if 'achieved_goal' in policy_inputs and 'desired_goal' not in policy_inputs:
                policy_inputs.remove('achieved_goal')
                step_goal_tf = step_achieved_goal_tf
                train_goal_tf = train_achieved_goal_tf
            elif 'achieved_goal' not in policy_inputs and 'desired_goal' in policy_inputs:
                policy_inputs.remove('desired_goal')
                step_goal_tf = step_desired_goal_tf
                train_goal_tf = train_desired_goal_tf
            elif 'achieved_goal' in policy_inputs and 'desired_goal' in policy_inputs:
                policy_inputs.remove('achieved_goal')
                policy_inputs.remove('desired_goal')
                step_goal_tf = tf.concat([step_achieved_goal_tf, step_desired_goal_tf], axis=-1, name='step_goal')
                train_goal_tf = tf.concat([train_achieved_goal_tf, train_desired_goal_tf], axis=-1, name='train_goal')
            else:
                step_goal_tf, train_goal_tf = None, None
            if len(policy_inputs) > 0:
                raise ValueError("Unused policy inputs:{}".format(policy_inputs))

            self.step_model = policy(nbatch=nenvs, nsteps=1, state_placeholder=step_state_tf, sess=self.sess,
                                     goal_placeholder=step_goal_tf)
            self.train_model = policy(nbatch=nbatch, nsteps=nsteps, state_placeholder=train_state_tf,
                                      sess=self.sess, goal_placeholder=train_goal_tf, summary_stats=True)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info("========================== {} =============================".format(scope))
        for var in params:
            logger.info(var)
        logger.info("========================== {} =============================\n".format(scope))

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        # print("========================== Ema =============================")

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            # print(v.name)
            return v

        # print("========================== Ema =============================")

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch, nsteps=nsteps, state_placeholder=train_state_tf,
                                       goal_placeholder=train_goal_tf, sess=self.sess,)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to self.train_model, self.polyak_model and self.step_model
        # policy.pi holds the distribution parameters (logits); take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        # (todo by lizn, use this to calculate next value)
        v = self.v = tf.reduce_sum(train_model_p * self.train_model.q, axis=-1)  # shape is [nenvs * (nsteps)]

        # strip off last step
        # (todo by lizn, we don't need strip)
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, self.train_model.q])
        # f, f_pol, q = map(lambda x: x, [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets
        qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps, gamma)  # (todo by lizn, use new next state value)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)  # (todo by lzn: we do not need the strip the last one)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
                                axis=1)  # IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)

        # Goal loss
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = - f_pol / (f + eps)  # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps)  # These are trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            # print("=========================== gards add ==============================")
            grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]
            # print("=========================== gards add ==============================\n")
            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR, decay=rprop_alpha, epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        self.run_ops_policy = [_train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
        self.names_ops_policy = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                                 'norm_grads']
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
                avg_norm_adj]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g',
                'avg_norm_adj']
        self.names_ops_policy = [scope + "_" + x for x in self.names_ops_policy]  # scope as prefix

        self.save = functools.partial(save_variables, sess=self.sess, variables=params)

        self.initial_state = self.step_model.initial_state
        # with tf.variable_scope('stats'):
        #     with tf.variable_scope('achieved_goal'):
        #         self.ag_stats = Normalizer(size=self.achieved_goal_sh[0], sess=self.sess)
        #     with tf.variable_scope('desired_goal'):
        #         self.g_stats = Normalizer(size=self.desired_goal_sh[0], sess=self.sess)
        if debug:
            tf.global_variables_initializer().run(session=self.sess)
            load_variables(load_path, self.params, self.sess)
        else:
            tf.global_variables_initializer().run(session=self.sess)
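
The policy loss above combines a truncated importance-sampling term (loss_f) with a bias-correction term summed over all actions (loss_bc). Below is a standalone NumPy sketch with toy data (my own illustration, not code from the project above):

import numpy as np

rng = np.random.default_rng(0)
nbatch, nact, c, eps = 6, 4, 10.0, 1e-6

f = rng.dirichlet(np.ones(nact), size=nbatch)      # current policy pi(a|s)
mu = rng.dirichlet(np.ones(nact), size=nbatch)     # behaviour policy mu(a|s)
q = rng.normal(size=(nbatch, nact))                # Q(s, .) estimates
a = rng.integers(nact, size=nbatch)                # actions taken
qret = rng.normal(size=nbatch)                     # Retrace targets
v = (f * q).sum(axis=1)                            # V = E_pi[Q]

rho = f / (mu + eps)
rho_i = rho[np.arange(nbatch), a]
f_i = f[np.arange(nbatch), a]

# truncated on-policy-sample term (loss_f)
gain_f = np.log(f_i + eps) * (qret - v) * np.minimum(c, rho_i)
# bias correction, an expectation over all actions under pi (loss_bc)
adv_bc = q - v[:, None]
gain_bc = (np.log(f + eps) * adv_bc * np.maximum(0.0, 1.0 - c / (rho + eps)) * f).sum(axis=1)

loss_policy = -(gain_f.mean() + gain_bc.mean())
print(loss_policy)
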
Example #3
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 v_mix_coef=0.5,
                 max_grad_norm=0.5,
                 lr_alpha=7e-4,
                 lr_beta=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 r_ex_coef=1.0,
                 r_in_coef=0.0,
                 v_ex_coef=1.0):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch], 'A')
        R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX')
        ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX')
        RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX')
        V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX')
        DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST')
        COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT')
        LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA')
        LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA')

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum(
            train_model.r_in * tf.one_hot(A, nact), axis=1)
        ret_mix = tf.squeeze(
            tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])),
            [1]) + DIS_V_MIX_LAST
        adv_mix = ret_mix - V_MIX

        neglogpac = train_model.pd.neglogp(A)
        pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac)
        v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix),
                                        ret_mix))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss

        policy_params = train_model.policy_params
        policy_grads = tf.gradients(policy_loss, policy_params)
        if max_grad_norm is not None:
            policy_grads, policy_grad_norm = tf.clip_by_global_norm(
                policy_grads, max_grad_norm)
        policy_grads_and_vars = list(zip(policy_grads, policy_params))
        policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA,
                                                   decay=alpha,
                                                   epsilon=epsilon)
        policy_train = policy_trainer.apply_gradients(policy_grads_and_vars)

        rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params]
        policy_params_new = {}
        for grad, rms, var in zip(policy_grads, rmss, policy_params):
            ms = rms + (tf.square(grad) - rms) * (1 - alpha)
            policy_params_new[
                var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon)
        policy_new = train_model.policy_new_fn(policy_params_new, ob_space,
                                               ac_space, nbatch, nsteps)

        neglogpac_new = policy_new.pd.neglogp(A)
        ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new)
        pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new)
        v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX))
        intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss

        intrinsic_params = train_model.intrinsic_params
        intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params)
        if max_grad_norm is not None:
            intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm(
                intrinsic_grads, max_grad_norm)
        intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params))
        intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA,
                                                      decay=alpha,
                                                      epsilon=epsilon)
        intrinsic_train = intrinsic_trainer.apply_gradients(
            intrinsic_grads_and_vars)

        lr_alpha = Scheduler(v=lr_alpha,
                             nvalues=total_timesteps,
                             schedule=lrschedule)
        lr_beta = Scheduler(v=lr_beta,
                            nvalues=total_timesteps,
                            schedule=lrschedule)

        all_params = tf.global_variables()

        def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex,
                  v_mix, dis_v_mix_last, coef_mat):
            advs_ex = ret_ex - v_ex
            for step in range(len(obs)):
                cur_lr_alpha = lr_alpha.value()
                cur_lr_beta = lr_beta.value()
            td_map = {
                train_model.X: obs,
                policy_new.X: obs,
                A: actions,
                R_EX: r_ex,
                ADV_EX: advs_ex,
                RET_EX: ret_ex,
                V_MIX: v_mix,
                DIS_V_MIX_LAST: dis_v_mix_last,
                COEF_MAT: coef_mat,
                LR_ALPHA: cur_lr_alpha,
                LR_BETA: cur_lr_beta
            }
            if policy_states is not None:
                td_map[train_model.PS] = policy_states
                td_map[train_model.M] = masks
            return sess.run([entropy, policy_train, intrinsic_train],
                            td_map)[0]

        def save(save_path):
            ps = sess.run(all_params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(all_params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.intrinsic_reward = step_model.intrinsic_reward
        self.init_policy_state = step_model.init_policy_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
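
ret_mix above is computed as COEF_MAT @ r_mix + DIS_V_MIX_LAST. Here is a hedged NumPy sketch of one plausible way a runner could build those two feed values so that each row yields a discounted return within its environment segment, bootstrapped with the last value (the actual runner code is not shown in this example):

import numpy as np

nenvs, nsteps, gamma = 2, 3, 0.99
nbatch = nenvs * nsteps
rng = np.random.default_rng(0)

r_mix = rng.normal(size=nbatch)               # mixed rewards, env-major layout
dones = np.zeros(nbatch, dtype=bool)          # done flags per transition
last_values = rng.normal(size=nenvs)          # V(s_T) per env, for bootstrapping

coef_mat = np.zeros((nbatch, nbatch), dtype=np.float32)
dis_v_mix_last = np.zeros(nbatch, dtype=np.float32)
for i in range(nbatch):
    env = i // nsteps                         # rows stay inside one env's segment
    coef, done_seen = 1.0, False
    for j in range(i, (env + 1) * nsteps):
        coef_mat[i, j] = coef
        if dones[j]:
            done_seen = True
            break
        coef *= gamma
    if not done_seen:
        dis_v_mix_last[i] = coef * last_values[env]

ret_mix = coef_mat @ r_mix + dis_v_mix_last   # mirrors tf.matmul(COEF_MAT, r_mix) + DIS_V_MIX_LAST
print(ret_mix.round(3))
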
Example #4
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs * nsteps,
                                           nsteps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
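
Scheduler(v=lr, nvalues=total_timesteps, schedule='linear') shows up in most of these examples. A minimal re-implementation sketch, assuming baselines-style behaviour (each value() call advances one step and the rate decays linearly toward zero):

class LinearScheduler:
    def __init__(self, v, nvalues):
        self.v, self.nvalues, self.n = v, nvalues, 0.0

    def value(self):
        current = self.v * max(1.0 - self.n / self.nvalues, 0.0)
        self.n += 1.0
        return current

sched = LinearScheduler(v=0.25, nvalues=10)
print([round(sched.value(), 3) for _ in range(5)])   # [0.25, 0.225, 0.2, 0.175, 0.15]
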
Example #5
File: model.py  Project: liziniu/Maze
    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
                 ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule, c, trust_region,
                 alpha, delta, scope, goal_shape):
        self.sess = sess
        self.nenv = nenvs
        self.goal_shape = goal_shape

        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6

        self.scope = scope
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch],
                                    name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch],
                                    name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch],
                                    name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact],
                                     name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")

            step_ob_placeholder = tf.placeholder(ob_space.dtype,
                                                 (nenvs, ) + ob_space.shape,
                                                 "step_ob")
            step_goal_placeholder = tf.placeholder(tf.float32,
                                                   (nenvs, ) + goal_shape,
                                                   "step_goal")
            step_goal_encoded = step_goal_placeholder

            train_ob_placeholder = tf.placeholder(
                ob_space.dtype, (nenvs * (nsteps + 1), ) + ob_space.shape,
                "train_ob")
            train_goal_placeholder = tf.placeholder(
                tf.float32, (nenvs * (nsteps + 1), ) + goal_shape,
                "train_goal")
            train_goal_encoded = train_goal_placeholder
            concat_on_latent = False

            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     observ_placeholder=step_ob_placeholder,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_placeholder,
                                     concat_on_latent=concat_on_latent,
                                     goal_encoded=step_goal_encoded)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      observ_placeholder=train_ob_placeholder,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_placeholder,
                                      concat_on_latent=concat_on_latent,
                                      goal_encoded=train_goal_encoded)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info(
            "========================== {} =============================".
            format(scope))
        for var in params:
            logger.info(var)
        logger.info(
            "========================== {} =============================\n".
            format(scope))

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        # print("========================== Ema =============================")

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            # print(v.name)
            return v

        # print("========================== Ema =============================")

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       observ_placeholder=train_ob_placeholder,
                                       goal_placeholder=train_goal_placeholder,
                                       sess=self.sess,
                                       concat_on_latent=concat_on_latent,
                                       goal_encoded=train_goal_encoded)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to self.train_model, self.polyak_model and self.step_model
        # policy.pi holds the distribution parameters (logits); take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        self.v = v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets
        self.qret = qret = q_retrace(self.R, self.D, q_i, v, rho_i, nenvs,
                                     nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)

        # Goal loss
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            # print("=========================== gards add ==============================")
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]
            # print("=========================== gards add ==============================\n")
            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)
        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        self.run_ops_policy = [
            _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc,
            ev, norm_grads
        ]
        self.names_ops_policy = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]
        self.names_ops_policy = [
            scope + "_" + x for x in self.names_ops_policy
        ]  # scope as prefix

        self.save = functools.partial(save_variables,
                                      sess=self.sess,
                                      variables=params)

        self.initial_state = self.step_model.initial_state
        tf.global_variables_initializer().run(session=self.sess)
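
q_retrace above computes the Retrace targets used as qret. A NumPy sketch of the same backward recursion for a single toy environment (standard ACER formulation; my own illustration, not code from the project):

import numpy as np

gamma = 0.99
rewards = np.array([1.0, 0.0, 1.0])       # R, per step
dones = np.array([0.0, 0.0, 0.0])         # D, episode terminations
q_i = np.array([0.5, 0.4, 0.6])           # Q(s_t, a_t) for the taken actions
v = np.array([0.45, 0.35, 0.55])          # V(s_t) = E_pi[Q(s_t, .)]
v_last = 0.5                              # bootstrap value after the final step
rho_i = np.array([2.0, 0.3, 1.5])         # importance ratios pi/mu for taken actions
rho_bar = np.minimum(1.0, rho_i)          # truncated ratios

qret = v_last
qrets = np.zeros_like(rewards)
for t in reversed(range(len(rewards))):
    qret = rewards[t] + gamma * qret * (1.0 - dones[t])
    qrets[t] = qret
    # correction passed back to the previous step
    qret = rho_bar[t] * (qret - q_i[t]) + v[t]
print(qrets.round(3))
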
Example #6
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0)
        entropy = tf.reduce_sum(cat_entropy(train_model.pi))
        params = find_trainable_variables("model")
        tf.summary.histogram("vf", train_model.vf)
        tf.summary.histogram("R", R)

        if train_model.relaxed:
            pg_loss = tf.constant(0.0)
            oh_A = tf.one_hot(train_model.a0, ac_space.n)

            params = find_trainable_variables("model")
            policy_params = [v for v in params if "pi" in v.name]
            vf_params = [v for v in params if "vf" in v.name]
            entropy_grads = tf.gradients(entropy, policy_params)

            ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t)
            ddiff_grads = tf.gradients(ddiff_loss, policy_params)

            sm = tf.nn.softmax(train_model.pi)
            dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm)
            pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi)
            pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)
            pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)]

            pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)

            cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
            cv_grad_splits = tf.reduce_sum(tf.square(cv_grads))
            vf_loss = cv_grad_splits * vf_coef

            cv_grads = tf.gradients(vf_loss, vf_params)

            policy_grads = []
            for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params):
                grad = -e_grad * ent_coef + p_grad
                policy_grads.append(grad)
            grad_dict = {}

            for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)):
                grad_dict[v] = g

            grads = [grad_dict[v] for v in params]
            print(grads)


        else:
            pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac)
            policy_params = [v for v in params if "pi" in v.name]
            pg_grads = tf.gradients(pg_loss, policy_params)

            vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R))
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            grads = tf.gradients(loss, params)

        grads = list(zip(grads, params))

        ema = tf.train.ExponentialMovingAverage(.99)
        all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0)
        all_policy_grads_sq = tf.square(all_policy_grads)
        apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq])
        em_mean = ema.average(all_policy_grads)
        em_mean_sq = ema.average(all_policy_grads_sq)
        em_var = em_mean_sq - tf.square(em_mean)
        em_log_var = tf.log(em_var + 1e-20)
        mlgv = tf.reduce_mean(em_log_var)

        for g, v in grads:
            print(v.name, g)
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name+"_grad", g)

        self.sum_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(logdir)

        trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999)
        with tf.control_dependencies([apply_mean_op]):
            _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self._step = 0
        def train(obs, states, rewards, masks, u1, u2, values, summary=False):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X:obs, train_model.U1:u1, train_model.U2:u2,
                ADV:advs, R:rewards, LR:cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if summary:
                sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
                self.writer.add_summary(sum_str, self._step)
            else:
                policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
            self._step += 1
            return policy_loss, value_loss, policy_entropy, lv

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
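
The ExponentialMovingAverage block above tracks the mean and second moment of the flattened policy gradients in order to log their average log-variance (mlgv). A standalone NumPy sketch of that statistic (my own illustration with synthetic gradients):

import numpy as np

decay = 0.99
em_mean, em_mean_sq = 0.0, 0.0
rng = np.random.default_rng(0)
for _ in range(1000):
    g = rng.normal(loc=0.1, scale=0.5, size=64)        # flattened policy grads
    em_mean = decay * em_mean + (1 - decay) * g
    em_mean_sq = decay * em_mean_sq + (1 - decay) * g ** 2
em_var = em_mean_sq - em_mean ** 2                     # Var[g] ~ E[g^2] - E[g]^2
mlgv = np.mean(np.log(em_var + 1e-20))
print(mlgv)   # close to log(0.25) ~ -1.39 for this toy distribution
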
Example #7
File: acktr.py  Project: MrGoogol/baselines
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear', is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy


        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
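
The vf_fisher term above uses train_model.vf plus Gaussian noise as a sampled target. A tiny NumPy check (my own illustration) of the idea behind it: with a unit-variance Gaussian output distribution, the gradient of -0.5*(vf - sample)^2 with respect to vf, holding the sample fixed (the stop_gradient), is exactly the sampled noise, i.e. the score that a K-FAC-style Fisher estimate is built from:

import numpy as np

rng = np.random.default_rng(0)
vf = rng.normal(size=5)                  # predicted values
noise = rng.normal(size=5)               # unit Gaussian noise
sample = vf + noise                      # sampled target (sample_net above)
grad_wrt_vf = -(vf - sample)             # d/dvf of -0.5 * (vf - sample)^2, sample held fixed
print(np.allclose(grad_wrt_vf, noise))   # True: the gradient equals the noise / Gaussian score
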
Example #8
class Model(object):
    def __init__(self, sess, policy, dynamics, ob_space, ac_space, nenvs,
                 nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
                 trust_region, alpha, delta, scope, goal_shape, residual):
        self.sess = sess
        self.nenv = nenvs
        self.residual = residual
        self.goal_shape = goal_shape
        self.goal_as_image = goal_as_image = len(goal_shape) == 3
        if self.goal_as_image:
            assert self.goal_shape == ob_space.shape
        else:
            logger.info("normalize goal using RunningMeanStd")
            with tf.variable_scope("RunningMeanStd", reuse=tf.AUTO_REUSE):
                self.goal_rms = RunningMeanStd(epsilon=1e-4,
                                               shape=self.goal_shape)

        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6

        self.dynamics = dynamics

        self.scope = scope
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch],
                                    name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch],
                                    name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch],
                                    name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact],
                                     name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")
            self.V_NEXT = tf.placeholder(tf.float32, [nbatch], name="v_next")

            step_ob_placeholder = tf.placeholder(ob_space.dtype,
                                                 (nenvs, ) + ob_space.shape,
                                                 "step_ob")
            if self.dynamics.dummy:
                step_goal_placeholder, concat_on_latent, step_goal_encoded = None, None, None
            else:
                if goal_as_image:
                    step_goal_placeholder = tf.placeholder(
                        ob_space.dtype, (nenvs, ) + ob_space.shape,
                        "step_goal")
                    concat_on_latent, train_goal_encoded, step_goal_encoded = False, None, None
                else:
                    step_goal_placeholder = tf.placeholder(
                        tf.float32, (nenvs, ) + goal_shape, "step_goal")
                    step_goal_encoded = tf.clip_by_value(
                        (step_goal_placeholder - self.goal_rms.mean) /
                        self.goal_rms.std, -5., 5.)

            train_ob_placeholder = tf.placeholder(
                ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape,
                "train_ob")
            if self.dynamics.dummy:
                train_goal_placeholder, concat_on_latent, train_goal_encoded = None, None, None
            else:
                if goal_as_image:
                    train_goal_placeholder = tf.placeholder(
                        ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape,
                        "train_goal")
                    concat_on_latent, train_goal_encoded = False, None
                else:
                    train_goal_placeholder = tf.placeholder(
                        tf.float32, (nenvs * nsteps, ) + goal_shape,
                        "train_goal")
                    concat_on_latent = True
                    train_goal_encoded = tf.clip_by_value(
                        (train_goal_placeholder - self.goal_rms.mean) /
                        self.goal_rms.std, -5., 5.)
            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     observ_placeholder=step_ob_placeholder,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_placeholder,
                                     concat_on_latent=concat_on_latent,
                                     goal_encoded=step_goal_encoded)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      observ_placeholder=train_ob_placeholder,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_placeholder,
                                      concat_on_latent=concat_on_latent,
                                      goal_encoded=train_goal_encoded)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info(
            "========================== {} =============================".
            format(scope))
        for var in params:
            logger.info(var)
        logger.info(
            "========================== {} =============================\n".
            format(scope))

        logger.info(
            "======================={}: Aux & Dyna =========================".
            format(scope))
        for var in self.dynamics.params:
            logger.info(var)
        logger.info(
            "======================={}: Aux & Dyna =========================\n"
            .format(scope))

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        # print("========================== Ema =============================")

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            # print(v.name)
            return v

        # print("========================== Ema =============================")

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       observ_placeholder=train_ob_placeholder,
                                       goal_placeholder=train_goal_placeholder,
                                       sess=self.sess,
                                       concat_on_latent=concat_on_latent,
                                       goal_encoded=train_goal_encoded)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i

        # action probability distributions according to self.train_model, self.polyak_model and self.step_model
        # policy.pi holds the distribution parameters (logits); take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        v = self.v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * (nsteps)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets
        qret = q_retrace(self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs,
                         nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: This is sum, as expectation wrt f
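        # relu(1 - c / rho) is nonzero only for actions whose importance ratio was
        # truncated above, so this term corrects the bias introduced by the clipping;
        # weighting by f turns the sum over actions into an expectation under pi.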
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)

        # Goal loss
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  # [nenvs * nsteps]
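            # adj = max(0, (k.g - delta) / ||k||^2): the scale by which g must be
            # shifted along k so the linearized KL(f_pol || f) stays within delta.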

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            # print("=========================== gards add ==============================")
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]
            # print("=========================== gards add ==============================\n")
            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _policy_opt_op = trainer.apply_gradients(grads)
        if not self.dynamics.dummy:
            _train_dynamics = trainer.minimize(self.dynamics.loss)
            self.run_ops_dynamics = [
                _train_dynamics,
                self.dynamics.aux_loss,
                self.dynamics.dyna_loss,
            ]
            self.name_ops_dynamics = ["aux_loss", "dyna_loss"]
        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        self.run_ops_policy = [
            _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc,
            ev, norm_grads
        ]
        self.names_ops_policy = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]
        self.names_ops_policy = [
            scope + "_" + x for x in self.names_ops_policy
        ]  # scope as prefix

        self.save = functools.partial(save_variables,
                                      sess=self.sess,
                                      variables=params)

        self.initial_state = self.step_model.initial_state
        tf.global_variables_initializer().run(session=self.sess)

    def train_policy(self,
                     obs,
                     next_obs,
                     actions,
                     rewards,
                     dones,
                     mus,
                     states,
                     masks,
                     steps,
                     goal_obs,
                     verbose=False):
        cur_lr = self.lr.value_steps(steps)
        # 1. calculate v_{t+1} using obs_{t+1} and g_t
        td_map = {self.train_model.X: next_obs}
        if not self.dynamics.dummy:
            assert hasattr(self.train_model, "goals")
            if self.residual:
                td_map[self.train_model.goals] = goal_obs - next_obs
            else:
                td_map[self.train_model.goals] = goal_obs
        v_next = self.sess.run(self.v, feed_dict=td_map)
        # 2. use obs_t, goal_t, v_{t+1} to train policy
        td_map = {
            self.train_model.X: obs,
            self.polyak_model.X: obs,
            self.A: actions,
            self.R: rewards,
            self.D: dones,
            self.MU: mus,
            self.LR: cur_lr,
            self.V_NEXT: v_next
        }
        if not self.dynamics.dummy:
            assert hasattr(self.train_model, "goals")
            assert hasattr(self.polyak_model, "goals")
            if hasattr(self, "goal_rms"):
                self.goal_rms.update(goal_obs)
            if self.residual:
                td_map[self.train_model.goals] = goal_obs - obs
                td_map[self.polyak_model.goals] = goal_obs - obs
            else:
                td_map[self.train_model.goals] = goal_obs
                td_map[self.polyak_model.goals] = goal_obs
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks
            td_map[self.polyak_model.S] = states
            td_map[self.polyak_model.M] = masks
        if verbose:
            names_ops_policy = self.names_ops_policy.copy()
            values_ops_policy = self.sess.run(self.run_ops_policy,
                                              td_map)[1:]  # strip off _train
        else:
            names_ops_policy = self.names_ops_policy.copy(
            )[:8]  # not including trust region
            values_ops_policy = self.sess.run(self.run_ops_policy,
                                              td_map)[1:][:8]

        unimportant_key = ["loss_f", "loss_bc"]
        for name in names_ops_policy.copy():
            for suffix in unimportant_key:
                if name.endswith(suffix):
                    index = names_ops_policy.index(name)
                    names_ops_policy.pop(index)
                    values_ops_policy.pop(index)
                    break

        return names_ops_policy, values_ops_policy

    def train_dynamics(self, obs, actions, next_obs, steps, nb_epoch=1):
        value_ops_dynamics = []
        for epoch in range(nb_epoch):
            cur_lr = self.lr.value_steps(steps)
            td_map = {
                self.dynamics.obs: obs,
                self.dynamics.next_obs: next_obs,
                self.dynamics.ac: actions,
                self.LR: cur_lr
            }
            value = self.sess.run(self.run_ops_dynamics, td_map)[1:]
            value_ops_dynamics.append(value)
        value_ops_dynamics = np.asarray(value_ops_dynamics)
        value_ops_dynamics = list(np.mean(value_ops_dynamics, axis=0))
        return self.name_ops_dynamics.copy(), value_ops_dynamics

    def step(self, observation, **kwargs):
        if self.residual and not self.dynamics.dummy:
            kwargs["goals"] = kwargs["goals"] - observation
        return self.step_model.evaluate(
            [self.step_model.action, self.step_model_p, self.step_model.state],
            observation, **kwargs)
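
The polyak-averaged model built above relies on tf.train.ExponentialMovingAverage together with a variable-scope custom_getter. Below is a minimal, self-contained TF1-style sketch of that pattern; the layer, scope, and variable names are illustrative assumptions, not taken from the example.

import tensorflow as tf

def dense(x, units, name):
    # a tiny layer built from tf.get_variable so the custom getter can intercept it
    with tf.variable_scope(name):
        w = tf.get_variable("w", [int(x.shape[-1]), units])
        b = tf.get_variable("b", [units], initializer=tf.zeros_initializer())
    return tf.matmul(x, w) + b

x = tf.placeholder(tf.float32, [None, 4])
with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
    online_out = dense(x, 2, "fc")

params = tf.trainable_variables("model")
ema = tf.train.ExponentialMovingAverage(0.99)
ema_apply_op = ema.apply(params)  # run this op after every optimizer step

def ema_getter(getter, *args, **kwargs):
    # return the EMA shadow of whatever variable the scope would normally return
    return ema.average(getter(*args, **kwargs))

with tf.variable_scope("model", custom_getter=ema_getter, reuse=True):
    polyak_out = dense(x, 2, "fc")  # same graph structure, averaged weights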
Example #9
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 use_adda,
                 adda_lr,
                 adda_batch,
                 seed,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'
                 ):  # The epsilon and alpha here are the RMSProp parameters

        sess = tf_util.make_session()
        nbatch = nenvs * nsteps  # 16*20 nsteps set in learn()
        print('nbatch defined and size is ', nbatch)

        #A = tf.placeholder(tf.int32, [nbatch])

        R = tf.placeholder(tf.float32, [nbatch])  # This is your TD Target
        LR = tf.placeholder(tf.float32, [])

        #source_array = np.load('/misc/lmbraid18/raob/source_dataset.npy') # (100000, 84, 84, 1)
        #target_array = np.load('/misc/lmbraid18/raob/target_dataset.npy') # (100000, 84, 84, 1)

        print('adda_batch:', adda_batch)
        step_model = policy(
            sess,
            ob_space,
            ac_space,
            adda_batch,
            seed,
            nbatch=nenvs * 1,
            nsteps=1,
            reuse=False,
            use_adda=use_adda
        )  # nbatch = nenvs*nsteps, model for generating data, Take 1 step for each env
        train_model = policy(
            sess,
            ob_space,
            ac_space,
            adda_batch,
            seed,
            nbatch=nenvs * nsteps,
            nsteps=nsteps,
            reuse=True,
            use_adda=use_adda)  # model for training using collected data

        print('Qf:', train_model.Qf.get_shape())
        print('R:', R.get_shape())

        ##########################################################    RL    ###############################################################

        ########### Loss for RL Part ################
        loss = tf.reduce_sum(huber_loss(
            train_model.Qf -
            R))  # This is your TD Error (Prediction (320,) - TD Target (320,))
        #############################################

        ########### Optimizer for RL Part ###########
        params = find_trainable_variables(
            "model")  # Returns a list of variable objects for RL Model
        grads = tf.gradients(
            loss, params
        )  # Calculate gradients of loss wrt params. Returns a list of sum(d_loss/d_param) for each param in params
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads_and_vars = list(zip(
            grads,
            params))  # grads_and_vars is a list of (gradient, variable) pairs
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(
            grads_and_vars
        )  # Returns an operation that applies the specified gradients.
        #############################################

        #####################################################################################################################################

        ############################################################   ADDA   ##############################################################
        if use_adda:

            source_array = np.load('/misc/lmbraid18/raob/source_dataset.npy'
                                   )  # (100000, 84, 84, 1)
            target_array = np.load('/misc/lmbraid18/raob/target_dataset.npy'
                                   )  # (100000, 84, 84, 1)
            print('Size of Datasets: ', len(source_array), len(target_array))
            # Initialize Iterators
            sess.run(train_model.source_iter_op,
                     feed_dict={train_model.dataset_imgs: source_array})
            sess.run(train_model.target_iter_op,
                     feed_dict={train_model.dataset_imgs: target_array})

            ########### Loss for DA Part ###########
            mapping_loss = tf.losses.sparse_softmax_cross_entropy(
                1 - train_model.adversary_labels, train_model.adversary_logits)
            adversary_loss = tf.losses.sparse_softmax_cross_entropy(
                train_model.adversary_labels, train_model.adversary_logits)
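            # ADDA-style objectives: the adversary is trained to tell source from
            # target features, while the mapping (target encoder) is trained on the
            # flipped labels (1 - labels) so that it fools the adversary.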
            #############################################

            adversary_vars = find_trainable_variables(
                "adversary"
            )  # Returns a list of variable objects for Discriminator

            # extract vars used in target encoder for optimizing in DA part
            part_vars_names = ('model/c1/b', 'model/c1/w', 'model/c2/b',
                               'model/c2/w', 'model/c3/b', 'model/c3/w',
                               'model/fc1/b', 'model/fc1/w')
            target_vars = [
                var for var in params if var.name[:-2] in part_vars_names
            ]

            ########### Optimizer for DA Part ###########
            da_lr_ph = tf.placeholder(tf.float32, [])
            #lr_var = tf.Variable(adda_lr, name='learning_rate', trainable=False) # Uncomment for constant LR

            optimizer = tf.train.RMSPropOptimizer(
                da_lr_ph)  # da_lr_ph to lr_var for constant LR
            mapping_step = optimizer.minimize(mapping_loss,
                                              var_list=list(target_vars))
            adversary_step = optimizer.minimize(adversary_loss,
                                                var_list=list(adversary_vars))
            #############################################

            print('########################')
            print(target_vars)
            print('########################')
            print('\n')
            print('########################')
            print(adversary_vars)
            print('########################')
        #####################################################################################################################################

        lr = Scheduler(v=lr, nvalues=total_timesteps,
                       schedule=lrschedule)  # Learning Rate Scheduling
        da_lr = Scheduler(v=adda_lr, nvalues=26e6, schedule=lrschedule)

        def train(obs, rewards, actions, update):
            for step in range(len(obs)):  # len(obs) = 320
                cur_lr = lr.value()

            ########### Run Session for RL Part ###########
            td_map = {
                train_model.X: obs,
                R: rewards,
                LR: cur_lr,
                train_model.A: actions
            }

            action_value_loss, _ = sess.run([loss, _train], td_map)
            #############################################

            ########### Run Session for DA Part ###########
            # run DA losses in a session here. Start with running them after every update step. Later, consider running after every 10 steps
            #if update > 62500: # If u want DA to run after 20e6 steps (20e6//320 = 62500)
            if (update > 125000) and (update % 5 == 0):
                #if update % 5 == 0:
                # Linearly reduce learning rate over RL batch size
                for step in range(len(obs)):
                    cur_adda_lr = da_lr.value()

                # Update adda_lr
                feed_dict = {da_lr_ph: cur_adda_lr}
                mapping_loss_val, adversary_loss_val, _, _ = sess.run([
                    mapping_loss, adversary_loss, mapping_step, adversary_step
                ], feed_dict)

                if update % 3125 == 0:
                    print('After {} Steps, DA LR is:{}'.format(
                        update * 320, cur_adda_lr))
            #############################################

            return action_value_loss, cur_lr  #, cur_adda_lr

        saver = tf.train.Saver(max_to_keep=100)
        part_vars_names = ('model/c1/b', 'model/c1/w', 'model/c2/b',
                           'model/c2/w', 'model/c3/b', 'model/c3/w',
                           'model/fc1/b', 'model/fc1/w')
        #part_vars_names = ('model/c1/b','model/c1/w','model/c2/b','model/c2/w','model/c3/b','model/c3/w')
        part_vars = [var for var in params if var.name[:-2] in part_vars_names]
        #print(part_vars)
        saver_adda = tf.train.Saver(part_vars)

        def save_model(save_step):
            #saver.save(sess, './hg_normal_with_da/MultiTexture/5steps_after_20e6/hg_normal_many_textures_with_da_model',global_step = save_step, write_meta_graph=False)
            #saver.save(sess, './hg_normal_with_da/MultiTexture/Seed 1/hg_normal_many_textures_with_da_model',global_step = save_step, write_meta_graph=False)
            saver.save(
                sess,
                '/misc/lmbraid18/raob/Snapshots_with_DA/Source/Small_High_Frequency_Updates/5steps_after_40e6/linearly_decrease_LR/Seed 2/hg_normal_5steps_40e6_decLR_model',
                global_step=save_step,
                write_meta_graph=False)
            #saver.save(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_normal_target_no_da/Seed 3/hg_normal_target_no_da_model', global_step = save_step, write_meta_graph=False)
        def load_model(snapshot, seed, adda_mode=False):

            # Load the saved parameters of the graph
            #if snapshot == 0:
            #saver.restore(sess, './hg_normal/hg_normal_model')
            #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_multiTexture_no_da/Seed 0/hg_multiTexture_no_da_model-66')
            saver.restore(
                sess,
                '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_normal_no_da/Seed 0/hg_normal_no_da_model-66'
            )
            #saver.restore(sess, './hg_normal_many_textures/hg_normal_many_textures_model')

            #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_with_DA/Source/Small_High_Frequency_Updates/5steps_after_40e6/linearly_decrease_LR/Seed {}/hg_normal_5steps_40e6_decLR_model-{}'.format(seed, snapshot))
            #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_with_DA/MultiTexture/Small High Frequency Updates/5steps_after_20e6/linearly decrease LR/Seed {}/hg_multiTexture_5steps_20e6_decLR_model-{}'.format(seed, snapshot))
            #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_normal_no_da/Seed {}/hg_normal_no_da_model-{}'.format(seed, snapshot))
            #saver.restore(sess, '/misc/lmbraid18/raob/Snapshots_no_DA/Multiple Snapshots/hg_multiTexture_no_da/Seed {}/hg_multiTexture_no_da_model-{}'.format(seed, snapshot))

            #if snapshot > 0 and adda_mode:

            #saver_adda.restore(sess, './adda_doom_DA/hg_multiTexture_snapshots/2e-4/Seed {}/adda_doom_DA-{}'.format(seed, snapshot))
            #saver_adda.restore(sess, './adda_doom_DA/hg_normal_snapshots/Seed {}/adda_doom_DA-{}'.format(seed, snapshot))
            #saver_adda.restore(sess, './adda_doom_DA/hg_normal_many_textures_snapshots/Seed {}/adda_doom_DA-{}'.format(seed, snapshot))
            #saver.restore(sess, './hg_normal_many_textures/hg_normal_many_textures_model')
            #print(sess.run('model/c1/b:0'))

        copy_op = step_model.get_copy_weights_operator()

        def update_target():
            sess.run(copy_op)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.save_model = save_model
        self.load_model = load_model
        self.update_target = update_target
        tf.global_variables_initializer().run(session=sess)

        # Debug helper: inspect the online and target Q-value bias variables.
        def print_var():
            var = [v for v in tf.global_variables() if v.op.name == "model/Qf/b"][0]
            var_tar = [v for v in tf.global_variables() if v.op.name == "target_model/Qf_target/b"][0]
            print(sess.run(var))
            print(sess.run(var_tar))

        self.print_var = print_var
Example #10
0
    def __init__(self, policy, env, nsteps, icm, idf,
                 ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
                 alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        self.idf = idf

        print("This is Icm in Model Init function ", type(icm))


        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        if icm is not None:
            grads = grads + icm.pred_grads_and_vars

        # zip aggregates each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values, next_obs):
            # (optionally also: icm_rewards, cumulative_discounted_icm, new_rew)
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values

            # Alternative advantage definitions kept for reference:
            #   normalized advantage:
            #     m, s = get_mean_and_std(advs)
            #     advs = (advs - m) / (s + 1e-7)
            #   curiosity-based advantage:
            #     icm_adv = icm_rewards - values
            #     advs = (icm_adv - m) / (s + 1e-7)
            #   advs = new_rew - values

            for step in range(len(obs)):
                cur_lr = lr.value()

            if icm is None:
                td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr}
            else:
                td_map = {train_model.X: obs, A: actions, ADV: advs, R: rewards, LR: cur_lr,
                          icm.state_: obs, icm.next_state_: next_obs, icm.action_: actions}  # , icm.R: rewards

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            if icm is None:
                policy_loss, value_loss, policy_entropy, _ = sess.run(
                    [pg_loss, vf_loss, entropy, _train], td_map)
                return policy_loss, value_loss, policy_entropy
            elif self.idf:
                policy_loss, value_loss, policy_entropy, forward_loss, inverse_loss, icm_loss, _ = sess.run(
                    [pg_loss, vf_loss, entropy, icm.forw_loss, icm.inv_loss, icm.icm_loss, _train],
                    td_map)
                return policy_loss, value_loss, policy_entropy, forward_loss, inverse_loss, icm_loss, advs
            else:
                policy_loss, value_loss, policy_entropy, forward_loss, icm_loss, _ = sess.run(
                    [pg_loss, vf_loss, entropy, icm.forw_loss, icm.icm_loss, _train],
                    td_map)
                return policy_loss, value_loss, policy_entropy, forward_loss, 0.0, icm_loss, advs

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #11
0
    def __init__(self,
                 model_template,
                 num_options,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 option_eps=0.001,
                 delib_cost=0.001):

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)

        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        self.sess = sess

        self.rng = np.random.RandomState(0)  # TODO

        nact = ac_space.n
        nbatch = nenvs * nsteps
        nopt = num_options

        self.option_eps = option_eps
        self.action_eps = epsilon

        batch_indexer = tf.range(nbatch)

        print("Building rest of the graph.")

        self.actions = tf.placeholder(shape=[nbatch], dtype=tf.int32)
        self.options = tf.placeholder(shape=[nbatch], dtype=tf.int32)
        self.rewards = tf.placeholder(shape=[nbatch], dtype=tf.float32)
        self.deliberation_costs = tf.placeholder(shape=[nbatch],
                                                 dtype=tf.float32)
        self.lr = tf.placeholder(shape=[], dtype=tf.float32)

        summary = []

        # Networks
        self.step_model = Network(model_template,
                                  nopt,
                                  ob_space,
                                  ac_space,
                                  nenvs,
                                  1,
                                  nstack,
                                  reuse=False)
        self.train_model = Network(model_template,
                                   nopt,
                                   ob_space,
                                   ac_space,
                                   nenvs,
                                   nsteps,
                                   nstack,
                                   reuse=True)

        # Indexers
        self.responsible_options = tf.stack([batch_indexer, self.options],
                                            axis=1)
        self.responsible_actions = tf.stack([batch_indexer, self.actions],
                                            axis=1)
        self.network_indexer = tf.stack([self.options, batch_indexer], axis=1)

        # Q Values OVER options
        self.disconnected_q_vals = tf.stop_gradient(
            self.train_model.q_values_options)

        # Q values of each option that was taken
        self.responsible_opt_q_vals = tf.gather_nd(
            params=self.train_model.q_values_options,
            indices=self.responsible_options
        )  # Extract q values for each option
        self.disconnected_q_vals_option = tf.gather_nd(
            params=self.disconnected_q_vals, indices=self.responsible_options)

        # Termination probability of each option that was taken
        self.terminations = tf.gather_nd(
            params=self.train_model.termination_fn,
            indices=self.responsible_options)

        # Q values for each action that was taken
        relevant_networks = tf.gather_nd(
            params=self.train_model.intra_option_policies,
            indices=self.network_indexer)
        relevant_networks = tf.nn.softmax(relevant_networks, dim=1)

        self.action_values = tf.gather_nd(params=relevant_networks,
                                          indices=self.responsible_actions)

        # Weighted average value
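        # (epsilon-greedy over options: V = (1 - option_eps) * max Q(s, o)
        #  + option_eps * mean Q(s, o); note both reductions here run over the
        #  whole batch rather than per state)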
        self.value = tf.reduce_max(
            self.train_model.q_values_options) * (1 - option_eps) + (
                option_eps * tf.reduce_mean(self.train_model.q_values_options))
        disconnected_value = tf.stop_gradient(self.value)

        # Losses; TODO: Why reduce sum vs reduce mean?
        self.value_loss = vf_coef * tf.reduce_mean(
            vf_coef * 0.5 *
            tf.square(self.rewards - self.responsible_opt_q_vals))
        self.policy_loss = tf.reduce_mean(
            tf.log(self.action_values) *
            (self.rewards - self.disconnected_q_vals_option))
        self.termination_loss = tf.reduce_mean(
            self.terminations *
            ((self.disconnected_q_vals_option - disconnected_value) +
             self.deliberation_costs))

        action_probabilities = self.train_model.intra_option_policies
        self.entropy = ent_coef * tf.reduce_mean(
            action_probabilities * tf.log(action_probabilities))

        self.loss = -self.policy_loss - self.entropy - self.value_loss - self.termination_loss

        # Gradients
        train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                       'model')
        gradients = tf.gradients(self.loss, train_vars)
        grads, grad_norms = tf.clip_by_global_norm(gradients, max_grad_norm)
        grads = list(zip(grads, train_vars))
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr,
                                            decay=alpha,
                                            epsilon=epsilon)
        self.apply_grads = trainer.apply_gradients(grads)

        # Summary
        avg_reward = tf.reduce_mean(self.rewards)

        summary.append(tf.summary.scalar('policy_loss', self.policy_loss))
        summary.append(tf.summary.scalar('value_loss', self.value_loss))
        summary.append(
            tf.summary.scalar('termination_loss', self.termination_loss))
        summary.append(tf.summary.scalar('entropy', self.entropy))
        summary.append(tf.summary.scalar('avg_reward', avg_reward))
        self.summary_op = tf.summary.merge(summary)

        self.print_op = [
            self.policy_loss, self.value_loss, self.termination_loss,
            avg_reward
        ]

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, options, actions, rewards, costs):
            feed_dict = {
                self.train_model.observations: obs,
                self.actions: actions,
                self.options: options,
                self.rewards: rewards,
                self.deliberation_costs: costs
            }

            train_ops = [self.apply_grads, self.summary_op, self.print_op]
            _, summary_proto, print_values = sess.run(train_ops, feed_dict=feed_dict)

            print(print_values)

            return summary_proto

        def setup_tensorflow(sess, writer):
            self.step_model.setup_tensorflow(sess, writer)
            self.train_model.setup_tensorflow(sess, writer)

        self.train = train
        self.setup_tensorflow = setup_tensorflow

        self.initial_state = self.step_model.initial_state
        self.step = self.step_model.step
        self.value = self.step_model.value
        self.update_options = self.step_model.update_options

        tf.global_variables_initializer().run(session=sess)
Example #12
0
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 diverse_r_coef=0.1,
                 gamma=0.99,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):
        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('vfo_model', reuse=tf.AUTO_REUSE):
            step_model = policy(nbatch=nenvs, nsteps=1, sess=sess)
            train_model = policy(nbatch=nbatch, nsteps=nsteps, sess=sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        params = find_trainable_variables('vfo_model')
        print(params)

        # ==============================
        # model-free actor-critic loss
        # ==============================
        with tf.variable_scope('mf_loss'):
            neglogpac = train_model.pd.neglogp(A)
            entropy = tf.reduce_mean(train_model.pd.entropy())

            pg_loss = tf.reduce_mean(ADV * neglogpac)
            vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

            grads = tf.gradients(loss, params)
            if max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        # ==============================
        # diverse options policy loss
        # ==============================
        option_train_ops = []
        option_losses = []
        option_losses_names = []
        option_distil_train_op = None
        with tf.variable_scope('options_loss'):
            diversity_reward = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=train_model.op_z,
                logits=train_model.option_discriminator)
            diversity_reward = tf.check_numerics(
                diversity_reward, 'Check numerics (1): diversity_reward')
            diversity_reward -= tf.log(
                tf.reduce_sum(train_model.prior_op_z * train_model.op_z) +
                1e-6)
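            # With a one-hot op_z this is the DIAYN-style reward
            # log q(z | s) - log p(z): states are rewarded when the discriminator
            # can recover which option generated them.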
            print('d_reward:', diversity_reward.get_shape().as_list())

            intrinsic_reward = tf.multiply(
                train_model.next_pvfs - train_model.pvfs, train_model.op_z)
            intrinsic_reward = tf.reduce_sum(intrinsic_reward, 1)
            print('i_reward:', intrinsic_reward.get_shape().as_list())
            reward = diverse_r_coef * diversity_reward + intrinsic_reward

            with tf.variable_scope('critic'):
                next_vf = tf.reduce_sum(
                    tf.multiply(train_model.next_pvfs, train_model.op_z), 1)
                print('next_vf:', next_vf.get_shape().as_list())
                option_q_y = tf.stop_gradient(reward +
                                              (1 - train_model.dones) * gamma *
                                              next_vf)
                option_q = tf.squeeze(train_model.option_q, 1)
                print('option_q_y:', option_q_y.get_shape().as_list())
                print('option_q:', option_q.get_shape().as_list())

                option_q_loss = 0.5 * tf.reduce_mean(
                    (option_q_y - option_q)**2)

            with tf.variable_scope('actor'):
                log_op_pi_t = train_model.option_pd.logp(A)
                log_target_t = tf.squeeze(train_model.option_q, 1)
                pvf = tf.reduce_sum(
                    tf.multiply(train_model.pvfs, train_model.op_z), 1)
                print('op_pi:', log_op_pi_t.get_shape().as_list())
                print('op_t:', log_target_t.get_shape().as_list())
                print('pvf:', pvf.get_shape().as_list())
                kl_surrogate_loss = tf.reduce_mean(
                    log_op_pi_t *
                    tf.stop_gradient(log_op_pi_t - log_target_t - pvf))

            with tf.variable_scope('discriminator'):
                print('op_z:', train_model.op_z.get_shape().as_list())
                print('op_dis:',
                      train_model.option_discriminator.get_shape().as_list())
                discriminator_loss = tf.reduce_mean(
                    tf.nn.softmax_cross_entropy_with_logits_v2(
                        labels=train_model.op_z,
                        logits=train_model.option_discriminator_logits))

            with tf.variable_scope('distillation'):
                # NOTE: to train distillation, op_z should be feed with q(z|s)
                print('mf_pi:', train_model.pi.get_shape().as_list())
                print('op_pi:', train_model.option_pi.get_shape().as_list())
                distillation_loss = losses.mean_squared_error(
                    tf.stop_gradient(train_model.pi), train_model.option_pi)

        _train_option_q = tf.train.AdamOptimizer(lr).minimize(
            loss=option_q_loss, var_list=params)
        option_train_ops.append(_train_option_q)
        option_losses.append(option_q_loss)
        option_losses_names.append('option_critic')

        _train_option_policy = tf.train.AdamOptimizer(lr).minimize(
            loss=kl_surrogate_loss, var_list=params)
        option_train_ops.append(_train_option_policy)
        option_losses.append(kl_surrogate_loss)
        option_losses_names.append('option_actor')

        _train_option_disc = tf.train.AdamOptimizer(lr).minimize(
            loss=discriminator_loss, var_list=params)
        option_train_ops.append(_train_option_disc)
        option_losses.append(discriminator_loss)
        option_losses_names.append('option_discriminator')

        option_distil_train_op = tf.train.AdamOptimizer(lr).minimize(
            loss=distillation_loss, var_list=params)

        tf.summary.FileWriter(logger.get_dir(), sess.graph)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def train_options(obs, next_obs, states, next_states, masks,
                          next_masks, actions, actions_full, dones, options_z):
            feed = {
                train_model.X: obs,
                train_model.X_next: next_obs,
                A: actions,
                train_model.ac: actions_full,
                train_model.dones: dones,
                train_model.op_z: options_z
            }
            if states is not None:
                feed[train_model.S] = states
                feed[train_model.next_S] = next_states
                feed[train_model.M] = masks
                feed[train_model.next_M] = next_masks

            record_loss_values = []
            for name, loss, train_op in zip(option_losses_names, option_losses,
                                            option_train_ops):
                loss_value, _ = sess.run([loss, train_op], feed)
                record_loss_values.append((name + '_loss', loss_value))

            return record_loss_values

        def distill_mf_to_options(obs, states, masks):
            feed = {train_model.X: obs}
            if states is not None:
                feed[train_model.S] = states
                feed[train_model.M] = masks

            option_ensembles = sess.run(train_model.option_discriminator, feed)
            feed[train_model.op_z] = option_ensembles
            distillation_loss_value, _ = sess.run(
                [distillation_loss, option_distil_train_op], feed)

            return distillation_loss_value

        self.train = train
        self.train_options = train_options
        self.distill_mf_to_options = distill_mf_to_options
        self.train_model = train_model
        self.prior_op_z = train_model.prior_op_z
        self.step_model = step_model
        self.step = step_model.step
        self.option_step = step_model.option_step
        self.option_select = step_model.option_select
        self.selective_option_step = step_model.selective_option_step
        self.value = step_model.value
        self.proto_value = step_model.proto_value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #13
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5,
            lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'):

        '''
        sess = tf.get_default_session()
        nbatch = nenvs*nsteps

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        '''

        # begin diff
        sess = tf.get_default_session()

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, reuse=True)

        L = tf.placeholder(tf.int32, [1])
        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # end diff

        neglogpac = train_model.pd.neglogp(A) # length max_episode_steps
        pg_loss = tf.reduce_mean(tf.slice(ADV * neglogpac, [0], L))
        vf_loss = tf.reduce_mean(tf.slice(mse(tf.squeeze(train_model.vf), R), [0], L))
        entropy = tf.reduce_mean(tf.slice(train_model.pd.entropy(), [0], L))
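        # L holds the actual episode length, so each per-step loss is sliced to the
        # first L entries and padded steps beyond the episode end are ignored.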
        loss = pg_loss-entropy*ent_coef+vf_loss*vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values, length):
            advs = rewards-values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr, L:np.asarray([length])}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run([pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #14
0
File: acktr.py Project: yoniosin/A2C
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs * nsteps,
                                               nsteps,
                                               sess=sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
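        # K-FAC estimates curvature from the log-likelihood of the model's own
        # samples: pg_fisher uses the log-prob of the sampled actions, and vf_fisher
        # treats the value head as a unit-variance Gaussian by adding noise to its
        # own prediction (the "sample_net" target).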

        self.params = params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr,
                VF_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Example #15
0
class Model(object):
    def __init__(self, sess, policy, ob_space, ac_space, nenvs, nsteps,
                 ent_coef, q_coef, gamma, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule, c, trust_region,
                 alpha, delta, scope, load_path, goal_shape):
        self.sess = sess
        self.nenv = nenvs
        self.nsteps = nsteps
        self.goal_shape = goal_shape

        nact = ac_space.n
        nbatch = nenvs * nsteps
        eps = 1e-6

        self.scope = scope
        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self.A = tf.placeholder(tf.int32, [nbatch],
                                    name="action")  # actions
            self.D = tf.placeholder(tf.float32, [nbatch],
                                    name="dones")  # dones
            self.R = tf.placeholder(tf.float32, [nbatch],
                                    name="rewards")  # rewards, not returns
            self.MU = tf.placeholder(tf.float32, [nbatch, nact],
                                     name="mus")  # mu's
            self.LR = tf.placeholder(tf.float32, [], name="lr")
            self.AUX = tf.placeholder(tf.float32, [nbatch], name="aux")

            self.V_NEXT = tf.placeholder(
                tf.float32, [nbatch], name="value_next"
            )  # (by lzn: we revise goal-conditioned next value)

            step_ob_placeholder = tf.placeholder(ob_space.dtype,
                                                 (nenvs, ) + ob_space.shape,
                                                 "step_ob")
            step_goal_placeholder = tf.placeholder(tf.float32,
                                                   (nenvs, ) + goal_shape,
                                                   "step_goal")
            step_goal_encoded = step_goal_placeholder

            train_ob_placeholder = tf.placeholder(
                ob_space.dtype, (nenvs * nsteps, ) + ob_space.shape,
                "train_ob")
            train_goal_placeholder = tf.placeholder(
                tf.float32, (nenvs * nsteps, ) + goal_shape, "train_goal")
            train_goal_encoded = train_goal_placeholder
            concat_on_latent = False
            self.step_model = policy(nbatch=nenvs,
                                     nsteps=1,
                                     observ_placeholder=step_ob_placeholder,
                                     sess=self.sess,
                                     goal_placeholder=step_goal_placeholder,
                                     concat_on_latent=concat_on_latent,
                                     goal_encoded=step_goal_encoded)
            self.train_model = policy(nbatch=nbatch,
                                      nsteps=nsteps,
                                      observ_placeholder=train_ob_placeholder,
                                      sess=self.sess,
                                      goal_placeholder=train_goal_placeholder,
                                      concat_on_latent=concat_on_latent,
                                      goal_encoded=train_goal_encoded)

        variables = find_trainable_variables
        self.params = params = variables(scope)
        logger.info(
            "========================== {} =============================".
            format(scope))
        for var in params:
            logger.info(var)
        logger.info(
            "========================== {} =============================\n".
            format(scope))

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        # print("========================== Ema =============================")

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            # print(v.name)
            return v

        # print("========================== Ema =============================")

        with tf.variable_scope(scope, custom_getter=custom_getter, reuse=True):
            self.polyak_model = policy(nbatch=nbatch,
                                       nsteps=nsteps,
                                       observ_placeholder=train_ob_placeholder,
                                       goal_placeholder=train_goal_placeholder,
                                       sess=self.sess,
                                       concat_on_latent=concat_on_latent,
                                       goal_encoded=train_goal_encoded)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by the action at step i

        # action probability distributions according to self.train_model, self.polyak_model and self.step_model
        # policy.pi holds the distribution parameters (logits); take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(self.train_model.pi)
        polyak_model_p = tf.nn.softmax(self.polyak_model.pi)
        self.step_model_p = tf.nn.softmax(self.step_model.pi)
        # (todo by lizn, use this to calculate next value)
        v = self.v = tf.reduce_sum(train_model_p * self.train_model.q,
                                   axis=-1)  # shape is [nenvs * (nsteps)]

        # strip off last step
        # (todo by lizn, we don't need strip)
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model_p, polyak_model_p, self.train_model.q])
        # f, f_pol, q = map(lambda x: x, [train_model_p, polyak_model_p, self.train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, self.A)
        q_i = get_by_index(q, self.A)

        # Compute ratios for importance truncation
        rho = f / (self.MU + eps)
        rho_i = get_by_index(rho, self.A)

        # Calculate Q_retrace targets
        qret = q_retrace(
            self.R, self.D, q_i, self.V_NEXT, rho_i, nenvs, nsteps,
            gamma)  # (todo by lizn, use new next state value) = q_retrace()
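        # Roughly, q_retrace runs backwards through the rollout:
        #   qret_t = r_t + gamma * (1 - d_t) * qret_{t+1}
        #   with qret_{t+1} first corrected to
        #   min(1, rho_{t+1}) * (qret_{t+1} - q_{t+1}) + v_{t+1}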
        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(self.train_model.pd.entropy(), nenvs, nsteps))

        entropy = tf.reduce_mean(self.AUX * cat_entropy_softmax(f))
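        # per-sample entropy of f, weighted by the AUX placeholder before averaging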

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps,
                  True)  # (todo by lzn: we do not need to strip the last one)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
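        # advantage of every action (not only the sampled one), used for the
        # bias-correction term below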
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  # IMP: This is sum, as expectation wrt f
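        # relu(1 - c/rho) is zero unless rho > c, so this correction only
        # contributes where the importance weight above was truncated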
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)

        # Goal loss
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  # [nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  # [nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  # [nenvs * nsteps]
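            # adj = max(0, (k.g - delta) / ||k||^2); subtracting adj * k from g
            # below enforces the linearised KL constraint k.g <= delta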

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            # print("=========================== gards add ==============================")
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]
            # print("=========================== gards add ==============================\n")
            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        # trainer = tf.train.AdamOptimizer(learning_rate=self.LR)
        _policy_opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_policy_opt_op]):
            _train_policy = tf.group(ema_apply_op)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        self.run_ops_policy = [
            _train_policy, loss, loss_q, entropy, loss_policy, loss_f, loss_bc,
            ev, norm_grads
        ]
        self.names_ops_policy = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            self.run_ops_policy = self.run_ops_policy + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            self.names_ops_policy = self.names_ops_policy + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]
        self.names_ops_policy = [
            scope + "_" + x for x in self.names_ops_policy
        ]  # scope as prefix

        self.save = functools.partial(save_variables,
                                      sess=self.sess,
                                      variables=params)
        self.load = functools.partial(load_variables,
                                      sess=self.sess,
                                      variables=params)

        self.initial_state = self.step_model.initial_state
        if load_path is not None:
            tf.global_variables_initializer().run(session=self.sess)
            logger.info("loading pretrained model from {}".format(load_path))
            self.load(load_path)
        else:
            tf.global_variables_initializer().run(session=self.sess)

    def train_policy(self,
                     obs,
                     next_obs,
                     actions,
                     rewards,
                     dones,
                     mus,
                     states,
                     masks,
                     steps,
                     goal_obs,
                     aux,
                     verbose=False):
        cur_lr = self.lr.value_steps(steps)
        # 1. calculate v_{t+1} using obs_{t+1} and g_t
        td_map = {self.train_model.X: next_obs}
        assert hasattr(self.train_model, "goals")
        td_map[self.train_model.goals] = goal_obs
        v_next = self.sess.run(self.v, feed_dict=td_map)
        # 2. use obs_t, goal_t, v_{t+1} to train policy
        td_map = {
            self.train_model.X: obs,
            self.polyak_model.X: obs,
            self.A: actions,
            self.R: rewards,
            self.D: dones,
            self.MU: mus,
            self.LR: cur_lr,
            self.V_NEXT: v_next,
            self.AUX: aux
        }

        ##########################################
        debug = False
        if debug:
            self._test(obs, next_obs, actions, rewards, dones, mus, goal_obs)
        ############################################
        assert hasattr(self.train_model, "goals")
        assert hasattr(self.polyak_model, "goals")
        if hasattr(self, "goal_rms"):
            self.goal_rms.update(goal_obs)
        td_map[self.train_model.goals] = goal_obs
        td_map[self.polyak_model.goals] = goal_obs
        if states is not None:
            td_map[self.train_model.S] = states
            td_map[self.train_model.M] = masks
            td_map[self.polyak_model.S] = states
            td_map[self.polyak_model.M] = masks
        if verbose:
            names_ops_policy = self.names_ops_policy.copy()
            values_ops_policy = self.sess.run(self.run_ops_policy,
                                              td_map)[1:]  # strip off _train
        else:
            names_ops_policy = self.names_ops_policy.copy(
            )[:8]  # not including trust region
            values_ops_policy = self.sess.run(self.run_ops_policy,
                                              td_map)[1:][:8]

        return names_ops_policy, values_ops_policy

    def step(self, observation, **kwargs):
        return self.step_model.evaluate(
            [self.step_model.action, self.step_model_p, self.step_model.state],
            observation, **kwargs)

    def _test(self, obs, next_obs, actions, rewards, dones, mus, goal_obs):
        _obs, _next_obs, _actions, _dones, _goals, _mus, _rewards = self.generate_fake(
            obs, next_obs, actions, dones, goal_obs, mus, rewards)
        td_map = dict()
        td_map[self.train_model.goals] = _goals
        td_map[self.train_model.X] = _next_obs
        v_next = self.sess.run(self.v, feed_dict=td_map)
        print("v_next", v_next)
        td_map[self.train_model.X] = _obs
        td_map[self.A] = _actions
        td_map[self.R] = _rewards
        td_map[self.MU] = _mus
        td_map[self.D] = _dones
        td_map[self.V_NEXT] = v_next
        print("------td map--------")
        print(td_map)
        print("------td map--------")
        print("-------q_iter--------")
        q_iter = self.sess.run(self.q_iter, feed_dict=td_map)
        print(q_iter)
        print("-------q_iter--------")
        print("-------q_iter_after--------")
        q_iter_after = self.sess.run(self.q_iter, feed_dict=td_map)
        print(q_iter_after)
        print("-------q_iter_after--------")
        print("--------rs---------")
        rs = self.sess.run(self.rs, feed_dict=td_map)
        print(rs)
        print("--------rs---------")
        q_i, rho_i, qret = self.sess.run([self.q_i, self.rho_i, self.qret],
                                         feed_dict=td_map)
        print("q_i", q_i)
        print("rho_i", rho_i)
        print("q_ret", qret)
        assert 0

    def generate_fake(self, obs, next_obs, actions, dones, goals, mus,
                      rewards):
        obs_new = np.random.randn(self.nenv, self.nsteps + 1, *obs.shape[1:])
        _obs = obs_new[:, :-1].reshape((-1, ) + obs.shape[1:])
        _next_obs = obs_new[:, 1:].reshape((-1, ) + next_obs.shape[1:])
        _actions = np.ones_like(actions)
        _dones = dones
        _goals = np.zeros_like(goals)
        _mus = np.random.randn(*mus.shape)
        _mus = _mus / np.sum(_mus, axis=-1, keepdims=True)
        print(self.sess.run(self.params))
        print("obs", obs)
        print("_obs", obs_new)
        _rewards = np.ones_like(rewards)
        return _obs, _next_obs, _actions, _dones, _goals, _mus, _rewards
Example #16
0
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
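        # pg_fisher / vf_fisher are sampling-based surrogate losses whose
        # gradients K-FAC uses to estimate the Fisher information blocks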

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
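            # compute_and_apply_stats maintains the running Fisher-block
            # statistics; apply_gradients then performs the K-FAC preconditioned
            # update (q_runner drives the asynchronous stats computation)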
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Example #17
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 hparams=None):
        assert hparams is not None
        hparams['_vf_coef'] = vf_coef

        # Create the session.
        sess = tf_util.make_session(
            per_process_gpu_memory_fraction=hparams.get('gpu_fraction', 0.25))
        self.sess = sess

        # Copy hparams.
        self.hparams = hparams
        self.nenvs = nenvs
        self.nsteps = nsteps

        self.hparams['batch_size'] = nenvs * nsteps

        # Setup constants.
        nact = ac_space.n
        nbatch = nenvs * nsteps
        self.nbatch = nbatch
        nh, nw, nc = ob_space.shape
        ob_shape_train = (nbatch, nh, nw, nc)
        ob_shape_step = (nenvs, nh, nw, nc)

        # Setup placeholders.
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        TEACHER_C = tf.placeholder(tf.float32, [])
        DROPOUT_STRENGTH = tf.placeholder(tf.float32, [],
                                          name='DROPOUT_STRENGTH')
        self.DROPOUT_STRENGTH = DROPOUT_STRENGTH
        X_train = tf.placeholder(tf.float32, ob_shape_train,
                                 name='Ob_train')  #obs
        X_step = tf.placeholder(tf.float32, ob_shape_step,
                                name='Ob_step')  #obs
        attention_truth = None

        step_hparams = copy.deepcopy(hparams)
        train_hparams = copy.deepcopy(hparams)

        # if self.hparams.get('fixed_dropout_noise'):
        #     self.step_env_random = tf.get_variable(
        #         shape=[nenvs, 7, 7, 1],
        #         name='env_random',
        #         initializer=tf.truncated_normal_initializer(),
        #         trainable=False,
        #     )

        #     self.train_env_random = tf.tile(tf.expand_dims(self.step_env_random, axis=0), multiples=[nsteps, 1, 1, 1, 1])
        #     self.train_env_random = tf.reshape(
        #         tf.transpose(self.train_env_random, perm=[1, 0, 2, 3, 4]),
        #         [nbatch, 7, 7, 1])

        #     step_hparams['_env_random'] = self.step_env_random
        #     train_hparams['_env_random'] = self.train_env_random

        # train_hparams['_dropout_strength'] = DROPOUT_STRENGTH
        # step_hparams['_dropout_strength'] = DROPOUT_STRENGTH

        # Create the models.
        step_model = policy(sess,
                            X_step,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            reuse=False,
                            hparams=step_hparams)
        train_model = policy(sess,
                             X_train,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True,
                             hparams=train_hparams)

        if hparams.get('teacher_ckpt'):
            assert hparams.get('use_fixed_attention') or hparams.get(
                'learn_attention_from_teacher') or hparams.get(
                    'do_joint_training')

            # Create the teacher, so that way we can use its attention weights
            # instead of learning how to do attention on our own.
            # step_teacher = self._create_sfmnet(X_step, reuse=False, is_step_model=True)

            train_teacher = self._create_object_segmentation_net(
                X_train,
                reuse=False,
                is_step_model=False,
                embedding=train_model.original_h
                if hparams['do_joint_training'] else None,
            )
            train_attention_truth, train_attention_mask = self._get_attention_truth(
                train_teacher, is_step_model=False)

            # step_attention_truth = self._get_attention_truth(step_teacher, is_step_model=True)

            # if hparams.get('use_fixed_attention'):
            #     step_hparams['_attention_truth'] = step_attention_truth
            #     train_hparams['_attention_truth'] = train_attention_truth

            # if hparams.get('do_joint_training'):
            #     step_hparams['_teacher_h3'] = step_teacher.conv3
            #     step_hparams['_teacher_h'] = step_teacher.embedding

            #     train_hparams['_teacher_h3'] = train_teacher.conv3
            #     train_hparams['_teacher_h'] = train_teacher.embedding

        # if hparams.get('use_target_model'):
        #     assert not hparams.get('do_joint_training')

        #     target_hparams = copy.copy(train_hparams)
        #     target_hparams['_policy_scope'] = 'target_model'
        #     target_hparams['_src_scope'] = 'model'
        #     target_model = policy(sess, X_step, ob_space, ac_space, nenvs, 1, reuse=False, hparams=target_hparams)
        #     target_model.setup_copy_weights()
        #     self.target_model = target_model

        scaled_images = tf.cast(train_model.X, tf.float32) / 255.
        print('scaled_images shape: {}'.format(scaled_images))

        sfm_base = object_segmentation.ObjectSegmentationBase(
            frames=scaled_images, embedding=train_model.h)
        sfm_hparams = copy.deepcopy(hparams)
        sfm_hparams['batch_size'] = nenvs * nsteps

        tf.summary.image('frame0',
                         tf.expand_dims(train_model.X[..., -2], axis=-1),
                         max_outputs=1)
        tf.summary.image('frame1',
                         tf.expand_dims(train_model.X[..., -1], axis=-1),
                         max_outputs=1)

        # Create the loss function.
        def a2c_loss(pi, vf):
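            # NOTE: the pi/vf arguments are not actually used; the closure reads
            # train_model.pi and train_model.vf directly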
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi, labels=A)
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            entropy = tf.reduce_mean(cat_entropy(train_model.pi))

            # ent_coef_mode = hparams.get('ent_coef_mode', 'default')
            # ent_coef_val = hparams.get('ent_coef_val', ent_coef)

            # if ent_coef_mode == 'default':
            #     actual_ent_coef = ent_coef_val
            # elif ent_coef_mode == 'linear_teacher':
            #     actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C)
            # elif ent_coef_mode == 'additive_teacher':
            #     actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C
            # else:
            #     raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode))

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            return loss, pg_loss, vf_loss, entropy

        loss, pg_loss, vf_loss, entropy = a2c_loss(train_model.pi,
                                                   train_model.vf)

        # if hparams.get('dropout_data_aug_c'):
        #     logged_augs = False
        #     loss_c = 1.0 - hparams['num_dropout_models'] * hparams['dropout_data_aug_c']
        #     assert loss_c >= hparams['dropout_data_aug_c'] - 1e-5
        #     loss = loss_c * loss

        #     for pi_noise, vf_noise in zip(train_model.pi_noises, train_model.vf_noises):
        #         l2, pg2, vf2, entropy2 = a2c_loss(pi_noise, vf_noise)
        #         loss += l2 * hparams['dropout_data_aug_c']

        #         if not logged_augs:
        #             logged_augs = True
        #             tf.summary.scalar('aug_loss', tf.reduce_mean(l2))
        #             tf.summary.scalar('aug_pgloss', tf.reduce_mean(pg2))
        #             tf.summary.scalar('aug_vfloss', tf.reduce_mean(vf2))
        #             tf.summary.scalar('aug_entropyloss', tf.reduce_mean(entropy2))

        #     print("ADDING DROPOUT DATA AUG")

        # if hasattr(train_model, 'noise_loss') and hparams.get('noise_loss_c'):
        #     loss += train_model.noise_loss
        #     print("ADDING NOISE LOSS")

        # tf.summary.image('frame0', tf.expand_dims(train_model.X[..., -2],-1), max_outputs=1)
        # tf.summary.image('frame1', tf.expand_dims(train_model.X[..., -1],-1),  max_outputs=1)

        teacher_loss = 0.0

        if hparams.get('teacher_ckpt') and hparams.get(
                'learn_attention_from_teacher'):
            assert hparams.get('attention_20') or hparams.get(
                'inverted_attention_20')
            # Load in the teacher.
            # teacher = sfmnet.SfmNet(hparams=sfm_hparams, sfm_base=sfm_base, is_teacher_network=True)

            # attention_loss = tf.nn.softmax_cross_entropy_with_logits(
            #     labels=train_attention_truth,
            #     logits=tf.reshape(train_model.attention_logits, [nbatch,-1])
            # )
            # print('attention_loss: {}'.format(attention_loss.get_shape()))
            # print('train_attention_mask: {}'.format(train_attention_mask.get_shape()))
            # attention_loss = attention_loss * train_attention_mask
            # attention_loss = tf.reduce_mean(attention_loss)

            # # for t in [5., 10., 20., 40., 75., 100., 200., 500., 1000.]:
            # #     truth = tf.nn.softmax(coarse_masks / t)
            # #     tf.summary.image('attention_truth_{}'.format(t), tf.reshape(truth, [nbatch, 7, 7, 1]), max_outputs=1)
            # tf.summary.scalar('attention_loss', attention_loss)
            # tf.summary.scalar('attention_teaching', tf.reduce_mean(train_attention_mask))

            # teacher_loss = TEACHER_C * attention_loss

            tf.summary.scalar('teacher_c', TEACHER_C)
            truth, mask = self._get_attention_truth_20(train_teacher,
                                                       is_step_model=False)
            tf.summary.image('attention_20_truth',
                             tf.reshape(truth, [80, 20, 20, 1]),
                             max_outputs=1)

            if hparams.get('attention_20'):
                attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits(
                    labels=truth,
                    logits=tf.reshape(train_model.attention_logits_20,
                                      [-1, 400]))
                attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask)

                tf.summary.scalar('attention_loss_20', attention_loss_20)
                tf.summary.scalar('attention_teaching_20',
                                  tf.reduce_mean(mask))
                teacher_loss += TEACHER_C * attention_loss_20

            if hparams.get('extrapath_attention_20'):
                print("EXTRAPATH ATTENTION!!!")
                attention_loss_20 = tf.nn.softmax_cross_entropy_with_logits(
                    labels=truth,
                    logits=tf.reshape(
                        train_model.extrapath_attention_logits_20, [-1, 400]))
                attention_loss_20 = tf.reduce_mean(attention_loss_20 * mask)

                tf.summary.scalar('attention_loss_20', attention_loss_20)
                tf.summary.scalar('attention_teaching_20',
                                  tf.reduce_mean(mask))
                teacher_loss += (-TEACHER_C) * attention_loss_20

        # if hparams.get('learn_attention_from_pg'):
        #     attention_logits = tf.reshape(train_model.attention_logits, [nbatch, 49])
        #     attention_actions = sample(attention_logits)
        #     attention_actions = tf.stop_gradient(attention_actions)

        #     attention_neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=attention_logits, labels=attention_actions)
        #     attention_pg_loss = tf.reduce_mean(ADV * attention_neglogpac)

        #     tf.summary.scalar('attention_pg_loss', attention_pg_loss)

        #     loss += attention_pg_loss * hparams['learn_attention_from_pg']

        # if hparams.get('teacher_ckpt') and hparams.get('learn_translation_from_teacher'):
        #     with tf.variable_scope("model"):
        #         with tf.variable_scope('object_translation'):
        #             pred_translation = fc(train_model.h, 'obj_t', nh=2*self.hparams['k_obj'], init_scale=1.0)
        #             pred_translation = tf.reshape(pred_translation, (-1, self.hparams['k_obj'], 2))

        #     teacher_translation = tf.stop_gradient(train_teacher.object_translation)
        #     translation_loss = mse(pred_translation, teacher_translation)
        #     translation_loss = tf.reduce_mean(translation_loss)
        #     teacher_loss += TEACHER_C * translation_loss
        #     tf.summary.scalar('translation_loss', translation_loss)

        if hparams['do_joint_training']:
            teacher_loss += tf.reduce_mean(
                train_teacher.transform_loss +
                train_teacher.mask_reg_loss) * TEACHER_C

        if hasattr(train_model, 'attention_logits_20'):
            # Want a low entropy distribution, so that we are focused on only a small part of the image per frame.
            reshaped_logits = tf.reshape(train_model.attention_logits_20,
                                         [-1, 400])
            attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits))
            teacher_loss -= hparams[
                'attention_entropy_c'] * attention_entropy * TEACHER_C

            tf.summary.scalar('attention_entropy', attention_entropy)

        if hasattr(train_model, 'extrapath_attention_logits_20'):
            # Want a low entropy distribution, so that we are focused on only a small part of the image per frame.
            reshaped_logits = tf.reshape(
                train_model.extrapath_attention_logits_20, [-1, 400])
            attention_entropy = tf.reduce_mean(cat_entropy(reshaped_logits))
            teacher_loss -= hparams[
                'attention_entropy_c'] * attention_entropy * TEACHER_C

            tf.summary.scalar('extrapath_attention_entropy', attention_entropy)

        # if hasattr(train_model, 'attention_weights_20'):
        #     # Want this to be high entropy, so we are looking at different parts of the image on different images.
        #     batch_logits = tf.reshape(tf.reduce_sum(train_model.attention_weights_20, axis=0), [1, 400])
        #     attention_entropy = tf.reduce_mean(cat_entropy_softmax(batch_logits))
        #     loss -= hparams['batch_entropy_c'] * attention_entropy
        #     tf.summary.scalar('batch_entropy', attention_entropy)

        # if hparams['do_joint_training'] and False:
        #     assert hparams.get('teacher_ckpt')
        #     teacher_loss += TEACHER_C * train_teacher.total_loss
        # else:
        #     sfm_loss = None

        # if hparams['do_flow_prediction']:
        #     assert hparams.get('teacher_ckpt')
        #     flow_truth_x, flow_truth_y = self._get_flow_truth(train_teacher)
        #     predicted_flow = conv(train_model.flow_base, 'pred_flow', nf=4, rf=1, stride=1, trainable=True)

        #     flow_pred_x = tf.reshape(predicted_flow[..., :2], [-1, 2])
        #     flow_pred_y = tf.reshape(predicted_flow[..., 2:], [-1, 2])

        #     flow_x_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flow_truth_x, logits=flow_pred_x))
        #     flow_y_loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=flow_truth_y, logits=flow_pred_y))
        #     flow_loss = flow_x_loss + flow_y_loss

        #     # flow_error = tf.reduce_mean(mse(flow_truth, predicted_flow))
        #     teacher_loss += TEACHER_C * flow_loss * hparams['flow_error_c']

        #     flow_x_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_x, axis=-1) == flow_truth_x, tf.int32))
        #     flow_y_acc = tf.reduce_mean(tf.cast(tf.argmax(flow_pred_y, axis=-1) == flow_truth_y, tf.int32))

        #     # tf.summary.scalar('flow_error_if_predict_zeros', tf.reduce_mean(0.5 * tf.square(flow_truth)))
        #     tf.summary.scalar('flow_x_loss', flow_x_loss)
        #     tf.summary.scalar('flow_y_loss', flow_y_loss)
        #     tf.summary.scalar('flow_x_acc', flow_x_acc)
        #     tf.summary.scalar('flow_y_acc', flow_y_acc)
        #     # tf.summary.image('predicted_flow_x', tf.expand_dims(predicted_flow[..., 0], axis=-1), max_outputs=1)
        #     # tf.summary.image('predicted_flow_y', tf.expand_dims(predicted_flow[..., 1], axis=-1), max_outputs=1)

        self.train_writer = tf.summary.FileWriter(
            os.path.join(hparams['base_dir'], 'logs',
                         hparams['experiment_name']), sess.graph)
        # TODO(vikgoel): when we don't need the teacher, we should ensure that we don't merge its summaries so that way
        #                we don't need to execute that part of the graph.
        merged_summaries = tf.summary.merge_all()

        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)

        def get_train_op(loss_op):
            params = find_trainable_variables("model")

            # Switch from GATE_NONE to GATE_GRAPH to enhance reproducibility.
            #grads = tf.gradients(loss, params)
            grads_and_params = trainer.compute_gradients(
                loss=loss_op,
                var_list=params,
                gate_gradients=tf.train.RMSPropOptimizer.GATE_GRAPH)
            grads = [x[0] for x in grads_and_params]
            params = [x[1] for x in grads_and_params]

            if max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

            return trainer.apply_gradients(grads)

        _fast_train = get_train_op(loss)
        _teacher_train = get_train_op(loss + teacher_loss)

        params = find_trainable_variables("model")
        print('*' * 20)
        print('chosen trainable variables')
        for p in params:
            print(p.name)
        print('*' * 20)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self.lr = lr

        write_counter = 0

        def train(obs, states, rewards, masks, actions, values):
            nonlocal write_counter

            if lr.n % hparams['target_model_update_frequency'] == 0 and hasattr(
                    self, 'target_model'):
                print('COPYING WEIGHTS INTO TARGET MODEL')
                self.target_model.copy_weights()

            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            # Smooth approximation:
            #teacher_decay_c = hparams['teacher_decay_c']#9.9e-6 # 2.5e-5
            #teacher_c = 1.0 / (teacher_decay_c * lr.n + 1)
            #teacher_c = min(hparams['max_teacher_c'], teacher_c)

            if not hparams['use_extra_path']:
                lerp = float(lr.n) / 1e7
                lerp = min(lerp, 1)
                teacher_c = hparams['max_teacher_c'] * (1. - lerp)
            else:
                teacher_c = 1

            # Linear decay schedule
            # teacher_c = (hparams['teacher_cutoff_step'] - lr.n) / hparams['teacher_cutoff_step']
            # teacher_c = max(teacher_c, 0)

            # # Lower bound on the decay
            # teacher_c = (1 - hparams['teacher_loss_c']) * teacher_c + hparams['teacher_loss_c']

            _train = _fast_train if teacher_c == 0 else _teacher_train

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                TEACHER_C: teacher_c
            }
            # td_map[DROPOUT_STRENGTH] = get_dropout_strength(hparams, lr.n)

            if self.hparams['teacher_ckpt'] and self.hparams[
                    'do_joint_training']:
                td_map[train_teacher.mask_reg_c] = 1

            #if states is not None:
            #    td_map[train_model.S] = states
            #    td_map[train_model.M] = masks

            ops = [pg_loss, vf_loss, entropy, _train]

            # if hparams.get('no_train_a2c'):
            #     ops = ops[:-1]

            if 'attention' in hparams['policy']:
                ops.append(train_model.attention_weights_20)

            write_summaries = hparams.get(
                'teacher_ckpt') or 'attention' in hparams['policy']

            if write_summaries:
                if write_counter % 10 != 0:
                    write_summaries = False
                write_counter += 1

            if write_summaries:
                ops.append(merged_summaries)

            sess_results = sess.run(ops, td_map)

            policy_loss = sess_results[0]
            value_loss = sess_results[1]
            policy_entropy = sess_results[2]

            if write_summaries:
                summary = sess_results[-1]
                self.train_writer.add_summary(summary, lr.n)

            if 'attention' in hparams['policy']:
                attention_output = sess_results[-2 if write_summaries else -1]
                publish_attention_weights(attention_output[:5, ...])

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load

        # Initialize all of the variables in a deterministic order so that each experiment is reproducible.
        global_vars = tf.global_variables()
        global_vars = sorted(global_vars, key=lambda x: x.name)
        for var in global_vars:
            tf.variables_initializer([var]).run(session=sess)
        #tf.global_variables_initializer().run(session=sess)

        if hparams.get('teacher_ckpt'):
            # Load in the teacher AFTER doing the init so we don't overwrite the weights.
            restore_teacher_from_checkpoint(sess, hparams['teacher_ckpt'])
Example #18
0
File: a2c.py Project: hitersyw/baselines
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6),
                 lrschedule='linear', replay_lambda=1, ss_rate=1,
                 replay_loss=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*nsteps

        # If we have replay_loss, create replay buffer and stage buffer
        # Use this to enforce replay loss lower
        if replay_loss is not None:
            self.replay_buffer = [] # holds all past data

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))

        # Introduce replay_loss if given
        if replay_loss == "L2":
            # Replace train_model.pi with whatever is predicted label
            # Replace A with whatever is recorded label
            re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch
        elif replay_loss == "Distillation":
            # Replace y_donor with whatever is recorded label
            # Replace y_acceptor with whatever is predicted label
            re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor)
                                                      * tf.log(y_acceptor),
                                                      reduction_indices=1))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
        if replay_loss is not None:
            loss = loss + replay_lambda*re_loss
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #19
0
    def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear',
            algo='regular', beta=1e-3):

        print('Create Session')
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        nact = ac_space.n
        nbatch = nenvs*master_ts*worker_ts

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo)
        train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo)
        print('model_setting_done')

        #loss construction
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
        pg_loss = pg_loss - entropy * ent_coef
        print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm))
        try:
            if algo == 'regular':
                loss = pg_loss + vf_coef * vf_loss
            elif algo == 'VIB':
                '''
                implement VIB here, apart from the vf_loss and pg_loss, there should be a third loss,
                the kl_loss = ds.kl_divergence(model.encoding, prior), where prior is a Gaussian distribution with mu=0, std=1
                the final loss should be pg_loss + vf_coef * vf_loss + beta*kl_loss
                '''
                prior = ds.Normal(0.0, 1.0)
                kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior))
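                # assumes train_model.encoding is a tf.distributions Normal over
                # the latent code, so the KL to the unit Gaussian prior is analytic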
                loss = pg_loss + vf_coef * vf_loss + beta*kl_loss
                # pass
            else:
                raise Exception('Algorithm does not exist')
        except Exception as e:
            print(e)

        grads, global_norm = grad_clip(loss, max_grad_norm, ['model'])
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(wobs, whs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(whs)):
                cur_lr = lr.value()

            td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.wS] = states
                td_map[train_model.wM] = masks

            '''
            you can add and run additional loss for VIB here for debugging, such as kl_loss
            '''
            tloss, value_loss, policy_loss, policy_entropy, _ = sess.run(
                [loss, vf_loss, pg_loss, entropy, _train],
                feed_dict=td_map
            )
            return tloss, value_loss, policy_loss, policy_entropy

        params = find_trainable_variables("model")
        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.wvalue
        self.get_wh = step_model.get_wh
        self.initial_state = step_model.w_initial_state
        self.train = train
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #20
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack,
                 num_procs, ent_coef, vf_coef, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        eps = 1e-6

        #nadv = ADV / (train_model.ret_rms.std + eps)
        #nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps)

        nadv = (ADV - train_model.ret_rms.mean) / (train_model.ret_rms.std +
                                                   eps)
        nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps)
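        # both the advantage and the return target are standardised with the
        # running return statistics (ret_rms) kept by the model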

        nlogpac = -train_model.pd.logp(A)
        pg_loss = tf.reduce_mean(nadv * nlogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), nr))
        #vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vnorm), nr))

        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        avg_norm_ret = tf.reduce_mean(tf.abs(train_model.ret_rms.mean))
        avg_norm_obs = tf.reduce_mean(tf.abs(train_model.ob_rms.mean))

        def train(obs, states, returns, masks, actions, values):

            advs = returns - values
            #advs = (advs - np.mean(advs)) / (np.std(advs) + eps)
            for step in range(len(obs)):
                cur_lr = lr.value()
            if hasattr(train_model, "ob_rms"):
                train_model.ob_rms.update(
                    sess,
                    obs)  # update running mean/std for observations of policy
            if hasattr(train_model, "ret_rms"):
                train_model.ret_rms.update(
                    sess, returns)  # # update running mean/std for returns
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            ravg_norm_obs, policy_loss, value_loss, policy_entropy, _ = sess.run(
                [avg_norm_obs, pg_loss, vf_loss, entropy, _train], td_map)
            return ravg_norm_obs, policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #21
0
def learn_hoof_a2c(
        network,
        env,
        seed=None,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100,
        load_path=None,  # Baselines default settings till here
        optimiser='RMSProp',
        lr_upper_bound=None,
        ent_upper_bound=None,
        num_lr=None,
        num_ent_coeff=None,
        max_kl=-1.0,  # -1.0 is for no KL constraint
        **network_kwargs):
    '''
    Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. By default None, which means the seed is taken from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # overwrite default params if using HOOF
    if lr_upper_bound is not None:
        lr = 1.0
        lrschedule = 'constant'
    else:
        num_lr = 1

    if ent_upper_bound is None:
        num_ent_coeff = 1

    # Instantiate the model object (that creates step_model and train_model)
    model = HOOF_Model(
        policy=policy,
        env=env,
        nsteps=nsteps,
        optimiser=optimiser,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        total_timesteps=total_timesteps,
        alpha=alpha,
        epsilon=epsilon  # defaults for RMSProp
    )

    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    # for Gaussian policies
    def kl(new_mean, new_sd, old_mean, old_sd):
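        # closed-form KL(old || new) for factorised Gaussian policies, summed
        # over action dimensions and averaged over the batch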
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd**2 +
            (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    if max_kl == -1.0:  # set max kl to a high val in case there is no constraint
        max_kl = 10**8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run(
        )
        epinfobuf.extend(epinfos)
        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)
        for step in range(len(obs)):
            cur_lr = lr.value()

        opt_pol_val = -10**8
        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))
        rand_lr = lr_upper_bound * np.random.rand(
            num_lr) if lr_upper_bound is not None else [cur_lr]
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(
            num_ent_coeff) if ent_upper_bound is not None else [ent_coef]

        for nec in range(num_ent_coeff):
            # reset policy and optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # get grads for loss fn with given entropy coeff
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values,
                rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params
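            # when learning-rate search is enabled, lr was fixed to 1.0 above, so
            # the parameter delta from one optimiser step acts as the gradient
            # direction; scaling it by candidate learning rates below emulates
            # those updates without extra gradient computations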

            # enumerate over different LR
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(
                    obs, actions)
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds,
                                           lik_ratio)
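                # weighted importance sampling estimate of the candidate
                # policy's value from the behaviour rollouts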
                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val

                if (nec == 0
                        and nlr == 0) or (est_pol_val > opt_pol_val
                                          and approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # update policy and rms prop to optimal wts
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink LR search space if too many get rejected
        if lr_upper_bound is not None:
            rejections = np.sum(approx_kl > max_kl) / num_lr
            if rejections > 0.8:
                lr_upper_bound *= 0.8
            if rejections == 0:
                lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frames per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Checks whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("opt_lr", float(opt_lr))
            logger.record_tabular("opt_ent_coeff", float(opt_ent_coeff))
            logger.record_tabular("approx_kl", float(opt_kl))
            if lr_upper_bound is not None:
                logger.record_tabular("rejections", rejections)
                logger.record_tabular("lr_ub", lr_upper_bound)
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
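
For reference, the wis_estimate helper that scores each candidate (learning rate, entropy coefficient) pair is imported elsewhere in the HOOF code and not shown here. A minimal NumPy sketch of a weighted (self-normalized) importance sampling estimate, assuming the flat batch is laid out env-major exactly as the runner above produces it, might look like the following; the reshape order and the epsilon are assumptions, not the repository's implementation.

import numpy as np

def wis_estimate(nenvs, nsteps, undisc_rwds, lik_ratio):
    # Sketch only: weighted importance sampling estimate of the candidate policy's value.
    # undisc_rwds, lik_ratio: flat arrays of length nenvs * nsteps, env-major (assumed).
    rwds = np.asarray(undisc_rwds).reshape(nenvs, nsteps)
    ratios = np.asarray(lik_ratio).reshape(nenvs, nsteps)
    traj_returns = rwds.sum(axis=1)                  # undiscounted return of each trajectory
    traj_weights = ratios.prod(axis=1)               # product of per-step likelihood ratios
    traj_weights = traj_weights / (traj_weights.sum() + 1e-8)  # self-normalize
    return float(np.sum(traj_weights * traj_returns))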
Example #22
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        writer = tf.summary.FileWriter(
            "/tmp/a2c_demo/1")  # Change for SAT: this is to use TensorBoard

        A = tf.placeholder(
            tf.int32, [nbatch])  # Comments by Fei: this must be the action
        ADV = tf.placeholder(
            tf.float32,
            [nbatch])  # Comments by Fei: this must be the advantage
        R = tf.placeholder(
            tf.float32, [nbatch])  # Comments by Fei: this must be the reward
        LR = tf.placeholder(
            tf.float32, [])  # Comments by Fei: this must be the learning rate

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi,
            labels=A)  # Comments by Fei: pi is nbatch * nact
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            # writer.add_graph(sess.graph)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
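
A pattern worth noting across these examples: the loop that repeatedly calls lr.value() inside train advances the learning-rate schedule once per transition in the batch, because a baselines-style Scheduler.value() both returns the current rate and increments an internal counter. A minimal sketch under that assumption, covering only the 'constant' and 'linear' schedules used in these examples (not the library's full implementation):

class Scheduler(object):
    # Sketch of a baselines-style Scheduler (assumed behaviour, not the library code).
    # value() returns the current learning rate and advances an internal step counter,
    # which is why the training code above calls it once per transition in the batch.
    def __init__(self, v, nvalues, schedule):
        self.n = 0.0
        self.v = v
        self.nvalues = nvalues
        self.schedule = schedule  # 'constant' or 'linear'

    def value(self):
        frac = max(1.0 - self.n / self.nvalues, 0.0)
        current = self.v if self.schedule == 'constant' else self.v * frac
        self.n += 1.0
        return current

After the loop, cur_lr holds the learning rate for the last transition consumed, and that value is what gets fed into the LR placeholder.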
Example #23
0
    def __init__(self, policy, p, has_state):
        """
        policy : Internal Policy model such as  SnakeModel.CNNPolicy
        p : Hyperparameters required for training
        """
        sess = tf_util.make_session()
        # TensorFlow model initialization
        step_model = policy(sess=sess,
                            p=p,
                            train_phase=False,
                            has_state=has_state)  # Deploy model settings
        train_model = policy(sess=sess,
                             p=p,
                             train_phase=True,
                             has_state=has_state)  # Training model settings
        saver = tf.train.Saver()

        #Step 2 : Initialize the training parameters
        A = tf.placeholder(tf.int32, [p.N_BATCH])
        ADV = tf.placeholder(tf.float32, [p.N_BATCH])
        R = tf.placeholder(tf.float32, [p.N_BATCH])
        LR = tf.placeholder(tf.float32, [])

        #Step 3 : Define the loss Function
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)  #
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF

        #Step 4 : Define the loss optimizer
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if p.MAX_GRAD_NORM is not None:
            grads, grad_norm = tf.clip_by_global_norm(
                grads, p.MAX_GRAD_NORM
            )  # Clipping the gradients to protect learned weights
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=p.RMS_DECAY,
                                            epsilon=p.EPSILON)
        _train = trainer.apply_gradients(
            grads)  # This is the variable which will be used
        lr = Scheduler(v=p.LEARNING_RATE,
                       nvalues=p.N_TIMESTEPS,
                       schedule=p.LEARNING_RATE_SCHEDULE
                       )  # Learning rate changes linearly or as per arguments

        # Step 5 : Write down the summary parameters to be used
        writer = tf.summary.FileWriter(p.LOG_PATH)  #summary writer

        def train(obs, rewards, masks, actions, values, states):
            """
            obs     : batch x n x m x 1 snake matrix
            rewards : batch x 1 rewards corresponding to the actions taken
            actions : batch x 1 discrete actions taken
            values  : batch x 1 outputs of the value function during training
            """
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                train_model.S: states,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            #ps = sess.run(params)
            #make_path(save_path)
            #joblib.dump(ps, save_path)
            saver.save(sess, save_path)

        def load(load_path):
            #loaded_params = joblib.load(load_path)
            #restores = []
            #for p, loaded_p in zip(params, loaded_params):
            #    restores.append(p.assign(loaded_p))
            #ps = sess.run(restores)
            saver.restore(sess, load_path)

        def add_scalar_summary(tag, value, step):
            summary = tf.Summary(
                value=[tf.Summary.Value(tag=tag, simple_value=value)])
            writer.add_summary(summary, step)

        # Expose the user to closure functions
        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.hidden_value = step_model.hidden_value
        self.initial_state = step_model.initial_state
        self.add_scalar_summary = add_scalar_summary
        self.save = save
        self.load = load
        # Initialize global variables and add tf graph
        tf.global_variables_initializer().run(session=sess)
        writer.add_graph(tf.get_default_graph())  #write graph
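
The add_scalar_summary closure exposed above lets a training loop push arbitrary scalars to the same TensorBoard writer as the graph. A hypothetical usage, where the tag names and the update counter are illustrative only:

# Hypothetical usage inside a training loop (tags and variables are illustrative):
policy_loss, value_loss, policy_entropy = model.train(obs, rewards, masks, actions, values, states)
model.add_scalar_summary('loss/policy', float(policy_loss), update)
model.add_scalar_summary('loss/value', float(value_loss), update)
model.add_scalar_summary('loss/entropy', float(policy_entropy), update)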
Example #24
0
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY1, depth=1024), axis=1)  # use the xy1 labels (mirrors the xy0 block above)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
Example #25
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 lambda_dist=0.01,
                 total_timesteps=None,
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        config = Config()

        act_model = policy(config)
        config.reuse = True
        train_model = policy(config)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.logits, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.logits))

        aux_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.rp_logits, labels=A)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        saver = tf.train.Saver()

        def train(obs, rs, rr, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                train_model.inputs_s: rs,
                train_model.inputs_r: rr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver.save(sess, save_path + 'model.ckpt')

        def load(load_path):
            saver.restore(sess, load_path + 'model.ckpt')

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.act = act_model.act
        self.value = act_model.value
        self.save = save
        self.load = load
Example #26
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack,
                 num_procs, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
                 trust_region, alpha, delta):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])  # actions
        D = tf.placeholder(tf.float32, [nbatch])  # dones
        R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps + 1,
                             nstack,
                             reuse=True)

        params = find_trainable_variables("model")
        print("Params {}".format(len(params)))
        for var in params:
            print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(sess,
                                  ob_space,
                                  ac_space,
                                  nenvs,
                                  nsteps + 1,
                                  nstack,
                                  reuse=True)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by the action at step i
        v = tf.reduce_sum(train_model.pi * train_model.q,
                          axis=-1)  # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model.pi, polyak_model.pi, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  #IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  #[nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [
            _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev,
            norm_grads
        ]
        names_ops = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            run_ops = run_ops + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            names_ops = names_ops + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]

        def train(obs, actions, rewards, dones, mus, states, masks, steps):
            cur_lr = lr.value_steps(steps)
            td_map = {
                train_model.X: obs,
                polyak_model.X: obs,
                A: actions,
                R: rewards,
                D: dones,
                MU: mus,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks
            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        self.train = train
        self.save = save
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
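
For reference, q_retrace builds the Retrace target with a backward recursion over each environment's trajectory, bootstrapping from the value of the final state and carrying a truncated-importance-weight correction back to the previous step. A per-environment NumPy sketch of that recursion follows; the real helper operates on batched TF tensors, so treat this as an illustration rather than the library code.

import numpy as np

def q_retrace_single_env(rewards, dones, q_i, v, rho_i, gamma):
    # rewards, dones, q_i, rho_i: length-T arrays for one environment.
    # v: length T+1 state values (the last entry is the bootstrap value).
    T = len(rewards)
    rho_bar = np.minimum(1.0, rho_i)      # truncated importance weights
    qret = v[-1]                          # bootstrap from the final state value
    targets = np.zeros(T)
    for t in reversed(range(T)):
        qret = rewards[t] + gamma * qret * (1.0 - dones[t])
        targets[t] = qret
        # correction carried back to the previous step
        qret = rho_bar[t] * (qret - q_i[t]) + v[t]
    return targets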
Example #27
0
File: a2c.py  Project: yoniosin/A2C
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 network='cnn',
                 prio_args=None):

        self.prio_args = prio_args
        sess = tf_util.get_session()
        nenvs = self.get_active_envs(env)

        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)
            # our TD evaluating network

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        # TD loss
        # td_loss = losses.mean_squared_error(tf.squeeze(train_model.dt), TD)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        """prio model"""
        with tf.variable_scope('a2c_model_prio', reuse=tf.AUTO_REUSE):
            # prio_model = policy(nbatch, nsteps, sess)
            prio_model = MyNN(env, nbatch, network)

        P_R = tf.placeholder(tf.float32, [nbatch])
        PRIO = tf.placeholder(tf.float32, [nbatch])
        P_LR = tf.placeholder(tf.float32, [])

        # prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), P_R) # Reward
        prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out),
                                                    PRIO)  # TD Error
        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")
        params_prio = find_trainable_variables("a2c_model_prio")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        prio_grads = tf.gradients(prio_model_loss, params_prio)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            prio_grads, prio_grad_norm = tf.clip_by_global_norm(
                prio_grads, max_grad_norm)
        grads = list(zip(grads, params))
        prio_grads = list(zip(prio_grads, params_prio))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        prio_trainer = tf.train.RMSPropOptimizer(learning_rate=P_LR,
                                                 decay=alpha,
                                                 epsilon=epsilon)

        _train = trainer.apply_gradients(grads)
        _prio_train = prio_trainer.apply_gradients(prio_grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)

            prio_loss = 0
            if self.prio_args is not None:
                prio_values = GetValuesForPrio(self.prio_args['prio_type'],
                                               self.prio_args['prio_param'],
                                               advs, rewards)
                prio_td_map = {
                    prio_model.X: obs,
                    P_R: rewards,
                    P_LR: cur_lr,
                    PRIO: prio_values
                }

                prio_loss, _, p_td = sess.run(
                    [prio_model_loss, _prio_train, PRIO], prio_td_map)
                # minibatch arranged as a 1D vector = [[env_1: n1, ..., n_nstep], ..., [env_n_active]]
                # need to take the last value of each env's buffer
                self.prio_score = prio_values[list(
                    filter(lambda x: x % nsteps == (nsteps - 1),
                           range(len(prio_values))))]
            return policy_loss, value_loss, policy_entropy, prio_loss

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.prio_model = prio_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
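
The modulo filter at the end of train above selects the last transition of each environment from the flat batch. Assuming the batch is env-major (each environment's nsteps transitions stored contiguously, as the comment in the code states), an equivalent and arguably clearer way to express the same selection is a reshape-and-slice, shown here on stand-in data:

import numpy as np

nenvs, nsteps = 4, 5                                       # stand-in sizes
prio_values = np.arange(nenvs * nsteps, dtype=np.float32)  # stand-in values

# Equivalent to: prio_values[[x for x in range(len(prio_values)) if x % nsteps == nsteps - 1]]
last_per_env = prio_values.reshape(nenvs, nsteps)[:, -1]
print(last_per_env)  # [ 4.  9. 14. 19.]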
Example #28
0
File: a2c.py  Project: NeteaseFuxiRL/asf
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 param=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            reuse=False,
                            param=param)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True,
                             param=param)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #29
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
                 c, trust_region, alpha, delta, icm ):

        sess = get_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch]) # actions
        D = tf.placeholder(tf.float32, [nbatch]) # dones
        R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape)
        train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape)
        with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

            step_model = policy(nbatch=nenvs , nsteps=1,observ_placeholder=step_ob_placeholder, sess=sess)
            train_model = policy(nbatch=nbatch , nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)


        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        # for var in params:
        #     print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by the action at step i
        # shape is [n_envs * (n_steps + 1)]

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi holds the distribution parameters (logits); take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        # train model policy probability and train model q value
        v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        # distribution_f, f_polyak, q_value
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        # R = rewards, D = dones, v = values; the rest is the same as above
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f)) # f is distribution here 

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True) # v is value here
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v  # v is the state value; same expression whether or not ICM is used
        # Advantage normalization for the ICM case is left commented out:
        # m, s = get_mean_and_std(icm_adv); advs = (icm_adv - m) / (s + 1e-7)

        logf = tf.log(f_i + eps)
        # c is the importance weight clipping (correction) factor
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps) # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
        gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]]*2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (nenvs * nsteps)  # These are trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        if icm is not None:
            # print("with ICM")
            grads = grads + icm.pred_grads_and_vars

        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
        names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
                     'explained_variance', 'norm_grads']

        if icm is not None:
            run_ops = run_ops + [icm.forw_loss, icm.inv_loss, icm.icm_loss]
            names_ops = names_ops + ['icm.forw_loss', 'icm.inv_loss', 'icm.icm_loss']

        if trust_region:
            run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                                 avg_norm_g, avg_norm_k_dot_g, avg_norm_adj]
            names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                                     'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj']


        def train(obs, actions, rewards, dones, mus, states, masks, steps, next_states, icm_actions  ):
            cur_lr = lr.value_steps(steps)
            
            if icm is not None :
                print("with ICM ")
                td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr , 
                 icm.state_:obs, icm.next_state_ : next_states , icm.action_ : icm_actions}
            else :
                td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks

            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def _step(observation, **kwargs):
            return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)



        self.train = train
        self.save = functools.partial(save_variables, sess=sess, variables=params)
        self.train_model = train_model
        self.step_model = step_model
        self._step = _step
        self.step = self.step_model.step

        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
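
The trust_region branch above projects the policy gradient with respect to the policy statistics f before backpropagating it through the network parameters: it shrinks g along k (the gradient of KL(f_pol || f) with respect to f) just enough that the linearized KL increase stays below delta. A NumPy sketch of only that projection, mirroring the g, k, adj ops above and offered as an illustration rather than a drop-in replacement:

import numpy as np

def trust_region_adjust(g, k, delta, eps=1e-6):
    # g, k: arrays of shape [batch, nact]; returns the adjusted gradient, same shape.
    k_dot_g = np.sum(k * g, axis=-1)
    adj = np.maximum(0.0, (k_dot_g - delta) / (np.sum(np.square(k), axis=-1) + eps))
    return g - adj[:, None] * k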
Example #30
0
    def __init__(self, policy, env, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs*nsteps


        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy


        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
Example #31
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_envs,
                 total_timesteps,
                 nprocs=32,
                 n_steps=20,
                 ent_coef=0.01,
                 vf_coef=0.25,
                 vf_fisher_coef=1.0,
                 learning_rate=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lr_schedule='linear'):
        """
        The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

        :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) The observation space
        :param ac_space: (Gym Space) The action space
        :param n_envs: (int) The number of environments
        :param total_timesteps: (int) The total number of timesteps for training the model
        :param nprocs: (int) The number of threads for TensorFlow operations
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) The weight for the entropic loss
        :param vf_coef: (float) The weight for the loss on the value function
        :param vf_fisher_coef: (float) The weight for the fisher loss on the value function
        :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
        :param max_grad_norm: (float) The clipping value for the maximum gradient
        :param kfac_clip: (float) gradient clipping for the Kullback-Leibler divergence
        :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        n_batch = n_envs * n_steps
        action_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        pg_lr_ph = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         n_envs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           n_envs * n_steps,
                                           n_steps,
                                           reuse=True)

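        # sparse_softmax_cross_entropy_with_logits(logits, labels) equals -log softmax(logits)[label],
        # i.e. the negative log-probability of each taken action under the current policy.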
        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=action_ph)
        self.logits = train_model.policy

        # training loss
        pg_loss = tf.reduce_mean(advs_ph * logpac)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
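        # pg_fisher is the mean log-likelihood of the sampled actions (logpac is a negative
        # log-probability, hence the sign flip); its gradients provide the score vectors K-FAC
        # uses to estimate the policy's Fisher block. For the value head, the prediction is
        # treated as the mean of a unit-variance Gaussian: a noise-perturbed copy serves as a
        # sampled target, so the (negated) squared error below provides the matching Fisher statistics.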
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.value_fn + tf.random_normal(
            tf.shape(train_model.value_fn))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

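        # K-FAC preconditions the gradients with a Kronecker-factored approximation of the
        # Fisher; compute_and_apply_stats registers the statistics ops for the joint Fisher
        # loss, and apply_gradients also returns a QueueRunner (stored as self.q_runner) that
        # drives the asynchronous statistics updates.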
        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=pg_lr_ph,
                clip_kl=kfac_clip,
                momentum=0.9,
                kfac_update=1,
                epsilon=0.01,
                stats_decay=0.99,
                async=1,  # NOTE: 'async' is a reserved keyword from Python 3.7 on; newer K-FAC forks rename this argument (e.g. async_)
                cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.learning_rate = Scheduler(initial_value=learning_rate,
                                       n_values=total_timesteps,
                                       schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
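            # The scheduler is stepped once per batch element so the learning rate decays with
            # the number of environment timesteps consumed rather than the number of updates.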
            for _ in range(len(obs)):
                cur_lr = self.learning_rate.value()

            td_map = {
                train_model.obs_ph: obs,
                action_ph: actions,
                advs_ph: advs,
                rewards_ph: rewards,
                pg_lr_ph: cur_lr
            }
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            session_params = sess.run(params)
            joblib.dump(session_params, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Example #32
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(20e6),
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nbatch = nenvs * nsteps

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, discounted_rewards, rewards, masks,
                  prev_actions, actions, values, dones):
            advs = discounted_rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            # Reshape rewards and dones to (nbatch, 1) and flatten the action arrays so every
            # input has nenvs * nsteps as its first dimension (obs already has this shape).
            rews = np.reshape(rewards, (nbatch, 1))
            ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1))
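            # For discrete action spaces, one-hot encode the previous action (-1 is treated as
            # "no previous action": its one-hot row is zeroed) and append it, together with the
            # previous reward and done flag, to the observation; for continuous spaces the raw
            # previous action vector is appended instead.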
            if len(ac_space.shape) == 0:
                prev_actions = np.reshape(prev_actions, (nbatch, ))
                one_hot = np.eye(ac_space.n)[prev_actions]
                for i in range(nbatch):
                    if prev_actions[i] == -1:
                        one_hot[i, :] = np.zeros((ac_space.n, ), dtype=int)  # np.int (an alias of the builtin int) was removed in NumPy 1.24
                x = np.concatenate((obs, one_hot, rews, ds), axis=1)
                actions = np.reshape(actions, (nbatch, ))
            else:
                prev_actions = np.reshape(prev_actions,
                                          (nbatch, ac_space.shape[0]))
                x = np.concatenate((obs, prev_actions, rews, ds), axis=1)
            td_map = {
                train_model.X: x,
                A: actions,
                ADV: advs,
                R: discounted_rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #33
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

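    # script_mask is 0 for the first nscripts scripted environments and 1 for the remaining
    # learner environments; note that this layout assumes nprocs == nenvs so that
    # nprocs * nsteps matches nbatch.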
    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)
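    # Where script_mask == 0, pac_weight is 1, so scripted rows imitate the recorded action at
    # full weight; where script_mask == 1, pac_weight is the policy's own probability of the
    # taken action, down-weighting actions the current policy considers unlikely.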

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)
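    # After the rewrites above, xy0_mask is 1 only where A == 0 and xy1_mask is 1 only where
    # A == 1, so each spatial head is trained only on steps where its branch was selected.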

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)
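    # NOTE: this weighting reuses the XY0 placeholder; by analogy with the xy0 branch above,
    # XY1 may have been intended here.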

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r
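    # For scripted rows the prediction is replaced by TD_TARGET itself, so their value-loss
    # contribution is zero; only the learner environments train the value head.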

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common
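    # The shared trunk ('model/common') appears in the parameter lists of the base, xy0, and
    # xy1 losses, so it receives updates from all three RMSProp optimizers, which are run
    # together in a single session call inside train() below.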

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")