Example #1
        def a2c_loss(pi, vf):
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi, labels=A)
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            entropy = tf.reduce_mean(cat_entropy(train_model.pi))

            # ent_coef_mode = hparams.get('ent_coef_mode', 'default')
            # ent_coef_val = hparams.get('ent_coef_val', ent_coef)

            # if ent_coef_mode == 'default':
            #     actual_ent_coef = ent_coef_val
            # elif ent_coef_mode == 'linear_teacher':
            #     actual_ent_coef = ent_coef_val * TEACHER_C + ent_coef * (1 - TEACHER_C)
            # elif ent_coef_mode == 'additive_teacher':
            #     actual_ent_coef = ent_coef_val + ent_coef_val * TEACHER_C
            # else:
            #     raise Exception('unrecognized ent_coef_mode: {}'.format(ent_coef_mode))

            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            return loss, pg_loss, vf_loss, entropy
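Note: the function above builds the standard A2C objective, loss = pg_loss - ent_coef * entropy + vf_coef * vf_loss, from names in the enclosing scope (train_model, A, ADV, R, ent_coef, vf_coef) rather than its own arguments. A minimal self-contained sketch of the same combination, with illustrative placeholder shapes and coefficient values that are not taken from this project:

    # Minimal sketch of the A2C loss terms (illustrative shapes and coefficients).
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()

    nbatch, nact = 16, 4
    logits = tf.placeholder(tf.float32, [nbatch, nact])  # policy logits
    values = tf.placeholder(tf.float32, [nbatch])        # value predictions
    A = tf.placeholder(tf.int32, [nbatch])               # actions taken
    ADV = tf.placeholder(tf.float32, [nbatch])           # advantage estimates
    R = tf.placeholder(tf.float32, [nbatch])             # returns
    ent_coef, vf_coef = 0.01, 0.5

    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=A)
    pg_loss = tf.reduce_mean(ADV * neglogpac)              # policy-gradient term
    vf_loss = tf.reduce_mean(0.5 * tf.square(values - R))  # value-regression term
    probs = tf.nn.softmax(logits)
    entropy = tf.reduce_mean(-tf.reduce_sum(probs * tf.log(probs + 1e-8), axis=1))
    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef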
Example #2
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack,
                 num_procs, ent_coef, q_coef, e_vf_coef, gamma, max_grad_norm,
                 lr, rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
                 c, trust_region, alpha, delta):
        config = tf.ConfigProto(  # allow_soft_placement=True,
            intra_op_parallelism_threads=num_procs,
            inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])  # actions
        D = tf.placeholder(tf.float32, [nbatch])  # dones
        R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps + 1,
                             nstack,
                             reuse=True)

        # for explore start =================================
        e_ADV = tf.placeholder(tf.float32, [nbatch])
        e_R = tf.placeholder(tf.float32, [nbatch])
        e_pi_logits, e_v = map(lambda var: strip(var, nenvs, nsteps),
                               [train_model.e_pi_logits, train_model.e_v])
        e_neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=e_pi_logits, labels=A)
        e_pg_loss = tf.reduce_mean(e_ADV * e_neglogpac)
        e_vf_loss = tf.reduce_mean(mse(tf.squeeze(e_v), e_R))
        # entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        e_loss = e_pg_loss + e_vf_loss * e_vf_coef
        # e_params = find_trainable_variables("model/explore")
        with tf.variable_scope('model'):
            e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope='model/explore')
        e_grads = tf.gradients(e_loss, e_params)
        if max_grad_norm is not None:
            e_grads, e_grad_norm = tf.clip_by_global_norm(
                e_grads, max_grad_norm)
        # for explore end =================================

        # params = find_trainable_variables("model/acer")
        with tf.variable_scope('model'):
            params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       scope='model/acer')
        print("Params {}".format(len(params)))
        for var in params:
            print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v0 = getter(*args, **kwargs)
            v = ema.average(v0)
            # v = ema.average(getter(*args, **kwargs))
            if v is None:
                return v0
            else:
                print(v.name)
                return v

        with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(sess,
                                  ob_space,
                                  ac_space,
                                  nenvs,
                                  nsteps + 1,
                                  nstack,
                                  reuse=True)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
        v = tf.reduce_sum(train_model.pi * train_model.q,
                          axis=-1)  # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model.pi, polyak_model.pi, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  #IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        # loss = loss_policy + q_coef * loss_q - ent_coef * entropy
        loss = loss_policy + q_coef * loss_q + e_loss

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (k_dot_g - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) + eps))  # [nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust-region-adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)

        # add explore grads
        grads.extend(e_grads)
        params.extend(e_params)

        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [
            _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev,
            norm_grads
        ]
        names_ops = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            run_ops = run_ops + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj, e_pg_loss,
                e_vf_loss
            ]
            names_ops = names_ops + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj',
                'e_pg_loss', 'e_vf_loss'
            ]

        def train(obs, actions, rewards, dones, mus, states, masks, steps,
                  e_returns, e_advs):
            cur_lr = lr.value_steps(steps)
            td_map = {
                train_model.X: obs,
                polyak_model.X: obs,
                A: actions,
                R: rewards,
                D: dones,
                MU: mus,
                LR: cur_lr,
                e_R: e_returns,
                e_ADV: e_advs
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks
            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        self.train = train
        self.save = save
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.e_step = step_model.e_step
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
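Note: q_retrace above computes Retrace targets, Q_ret(t) = r_t + gamma * [rho_bar(t+1) * (Q_ret(t+1) - Q(t+1)) + V(t+1)], with truncated weights rho_bar = min(1, rho) and dones cutting the bootstrap. A small NumPy sketch of that backward recursion, using flat per-environment arrays rather than the helper's actual [nenvs, nsteps] sequence layout:

    # Sketch of the Retrace recursion assumed by q_retrace (illustrative layout).
    import numpy as np

    def retrace_targets(rewards, dones, q_taken, values, rho_taken, gamma=0.99):
        """rewards, dones, q_taken, rho_taken: [nsteps]; values: [nsteps + 1], last entry bootstraps."""
        nsteps = len(rewards)
        rho_bar = np.minimum(1.0, rho_taken)   # truncated importance weights
        qret = values[-1]                      # bootstrap from the final state value
        targets = np.zeros(nsteps)
        for t in reversed(range(nsteps)):
            qret = rewards[t] + gamma * qret * (1.0 - dones[t])
            targets[t] = qret
            # carry the off-policy correction back to the previous step
            qret = rho_bar[t] * (qret - q_taken[t]) + values[t]
        return targets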
Example #3
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 param=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            reuse=False,
                            param=param)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True,
                             param=param)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #4
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 v_mix_coef=0.5,
                 max_grad_norm=0.5,
                 lr_alpha=7e-4,
                 lr_beta=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 r_ex_coef=1.0,
                 r_in_coef=0.0,
                 v_ex_coef=1.0):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch], 'A')
        R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX')
        ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX')
        RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX')
        V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX')
        DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST')
        COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT')
        LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA')
        LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA')

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum(
            train_model.r_in * tf.one_hot(A, nact), axis=1)
        ret_mix = tf.squeeze(
            tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])),
            [1]) + DIS_V_MIX_LAST
        adv_mix = ret_mix - V_MIX

        neglogpac = train_model.pd.neglogp(A)
        pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac)
        v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix),
                                        ret_mix))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss

        policy_params = train_model.policy_params
        policy_grads = tf.gradients(policy_loss, policy_params)
        if max_grad_norm is not None:
            policy_grads, policy_grad_norm = tf.clip_by_global_norm(
                policy_grads, max_grad_norm)
        policy_grads_and_vars = list(zip(policy_grads, policy_params))
        policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA,
                                                   decay=alpha,
                                                   epsilon=epsilon)
        policy_train = policy_trainer.apply_gradients(policy_grads_and_vars)

        rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params]
        policy_params_new = {}
        for grad, rms, var in zip(policy_grads, rmss, policy_params):
            ms = rms + (tf.square(grad) - rms) * (1 - alpha)
            policy_params_new[
                var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon)
        policy_new = train_model.policy_new_fn(policy_params_new, ob_space,
                                               ac_space, nbatch, nsteps)

        neglogpac_new = policy_new.pd.neglogp(A)
        ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new)
        pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new)
        v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX))
        intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss

        intrinsic_params = train_model.intrinsic_params
        intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params)
        if max_grad_norm is not None:
            intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm(
                intrinsic_grads, max_grad_norm)
        intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params))
        intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA,
                                                      decay=alpha,
                                                      epsilon=epsilon)
        intrinsic_train = intrinsic_trainer.apply_gradients(
            intrinsic_grads_and_vars)

        lr_alpha = Scheduler(v=lr_alpha,
                             nvalues=total_timesteps,
                             schedule=lrschedule)
        lr_beta = Scheduler(v=lr_beta,
                            nvalues=total_timesteps,
                            schedule=lrschedule)

        all_params = tf.global_variables()

        def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex,
                  v_mix, dis_v_mix_last, coef_mat):
            advs_ex = ret_ex - v_ex
            for step in range(len(obs)):
                cur_lr_alpha = lr_alpha.value()
                cur_lr_beta = lr_beta.value()
            td_map = {
                train_model.X: obs,
                policy_new.X: obs,
                A: actions,
                R_EX: r_ex,
                ADV_EX: advs_ex,
                RET_EX: ret_ex,
                V_MIX: v_mix,
                DIS_V_MIX_LAST: dis_v_mix_last,
                COEF_MAT: coef_mat,
                LR_ALPHA: cur_lr_alpha,
                LR_BETA: cur_lr_beta
            }
            if policy_states is not None:
                td_map[train_model.PS] = policy_states
                td_map[train_model.M] = masks
            return sess.run([entropy, policy_train, intrinsic_train],
                            td_map)[0]

        def save(save_path):
            ps = sess.run(all_params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(all_params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.intrinsic_reward = step_model.intrinsic_reward
        self.init_policy_state = step_model.init_policy_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
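Note: ret_mix above is COEF_MAT @ r_mix + DIS_V_MIX_LAST, i.e. the caller encodes discounting in an [nbatch, nbatch] coefficient matrix plus a vector holding the discounted bootstrap value of the state after the rollout. An illustrative NumPy construction for a single environment with no episode boundaries (the real caller also zeroes coefficients across dones; the helper name here is hypothetical):

    # Illustrative construction of COEF_MAT / DIS_V_MIX_LAST for one environment, no dones.
    import numpy as np

    def discounted_return_inputs(rewards, last_value, gamma=0.99):
        nsteps = len(rewards)
        coef_mat = np.zeros((nsteps, nsteps))
        for t in range(nsteps):
            for k in range(t, nsteps):
                coef_mat[t, k] = gamma ** (k - t)        # discount of reward k as seen from step t
        dis_v_last = gamma ** (nsteps - np.arange(nsteps)) * last_value
        return coef_mat, dis_v_last

    rewards = np.array([1.0, 0.0, 1.0])
    coef_mat, dis_v_last = discounted_return_inputs(rewards, last_value=0.5)
    returns = coef_mat @ rewards + dis_v_last            # plays the role of ret_mix for this rollout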
Example #5
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0)
        entropy = tf.reduce_sum(cat_entropy(train_model.pi))
        params = find_trainable_variables("model")
        tf.summary.histogram("vf", train_model.vf)
        tf.summary.histogram("R", R)

        if train_model.relaxed:
            pg_loss = tf.constant(0.0)
            oh_A = tf.one_hot(train_model.a0, ac_space.n)

            params = find_trainable_variables("model")
            policy_params = [v for v in params if "pi" in v.name]
            vf_params = [v for v in params if "vf" in v.name]
            entropy_grads = tf.gradients(entropy, policy_params)

            ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t)
            ddiff_grads = tf.gradients(ddiff_loss, policy_params)

            sm = tf.nn.softmax(train_model.pi)
            dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm)
            pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi)
            pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)
            pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)]

            pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)

            cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
            cv_grad_splits = tf.reduce_sum(tf.square(cv_grads))
            vf_loss = cv_grad_splits * vf_coef

            cv_grads = tf.gradients(vf_loss, vf_params)

            policy_grads = []
            for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params):
                grad = -e_grad * ent_coef + p_grad
                policy_grads.append(grad)
            grad_dict = {}

            for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)):
                grad_dict[v] = g

            grads = [grad_dict[v] for v in params]
            print(grads)


        else:
            pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac)
            policy_params = [v for v in params if "pi" in v.name]
            pg_grads = tf.gradients(pg_loss, policy_params)

            vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R))
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            grads = tf.gradients(loss, params)

        grads = list(zip(grads, params))

        ema = tf.train.ExponentialMovingAverage(.99)
        all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0)
        all_policy_grads_sq = tf.square(all_policy_grads)
        apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq])
        em_mean = ema.average(all_policy_grads)
        em_mean_sq = ema.average(all_policy_grads_sq)
        em_var = em_mean_sq - tf.square(em_mean)
        em_log_var = tf.log(em_var + 1e-20)
        mlgv = tf.reduce_mean(em_log_var)

        for g, v in grads:
            print(v.name, g)
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name+"_grad", g)

        self.sum_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(logdir)

        trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999)
        with tf.control_dependencies([apply_mean_op]):
            _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self._step = 0
        def train(obs, states, rewards, masks, u1, u2, values, summary=False):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X:obs, train_model.U1:u1, train_model.U2:u2,
                ADV:advs, R:rewards, LR:cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if summary:
                sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
                self.writer.add_summary(sum_str, self._step)
            else:
                policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
            self._step += 1
            return policy_loss, value_loss, policy_entropy, lv

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
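Note: in the relaxed branch above, dlogp_dpi = oh_A * (1 - sm) + (1 - oh_A) * (-sm) simplifies to onehot(a) - softmax(pi), the gradient of log pi(a|s) with respect to the logits. A small NumPy check of that identity (values are illustrative):

    # Check that oh_A * (1 - sm) + (1 - oh_A) * (-sm) == onehot(a) - softmax(logits),
    # i.e. d log softmax(logits)[a] / d logits.
    import numpy as np

    logits = np.array([1.0, 2.0, 0.5])
    a = 1
    sm = np.exp(logits) / np.exp(logits).sum()
    oh = np.eye(len(logits))[a]
    lhs = oh * (1.0 - sm) + (1.0 - oh) * (-sm)
    assert np.allclose(lhs, oh - sm)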
Example #6
    def __init__(self,
                 optim,
                 policy,
                 ob_dim,
                 ac_dim,
                 num_procs,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 vf_lr=0.001,
                 cv_lr=0.001,
                 cv_num=25,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 logdir=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

        A = tf.placeholder(tf.float32, [None, ac_dim], name="A")
        ADV = tf.placeholder(tf.float32, [None], name="ADV")
        R = tf.placeholder(tf.float32, [None], name="R")

        train_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=False)
        step_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=True)

        params = find_trainable_variables("model")
        tf.summary.histogram("vf", train_model.vf)

        pi_params = [v for v in params if "pi" in v.name]
        vf_params = [v for v in params if "vf" in v.name]

        logpac = train_model.logprob_n
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        pg_loss = -tf.reduce_mean(ADV * logpac)
        tf.summary.scalar("vf_loss", vf_loss)

        if train_model.relaxed:
            ddiff_loss = tf.reduce_mean(train_model.cv)
            ddiff_grads_mean = tf.gradients(ddiff_loss, pi_params)
            ddiff_grads_std = tf.gradients(ddiff_loss, train_model.logstd_1a)

            dlogp_dmean = (A - train_model.mean) / tf.square(
                train_model.std_na)
            dlogp_dstd = -1 / train_model.std_na + 1 / tf.pow(
                train_model.std_na, 3) * tf.square(A - train_model.mean)

            pi_grads_mean = -((tf.expand_dims(ADV, 1) - train_model.cv) *
                              dlogp_dmean) / tf.to_float(tf.shape(ADV)[0])
            pg_grads_mean = tf.gradients(train_model.mean,
                                         pi_params,
                                         grad_ys=pi_grads_mean)
            pg_grads_mean = [
                pg - dg for pg, dg in zip(pg_grads_mean, ddiff_grads_mean)
            ]
            pi_grads_std = -((tf.expand_dims(ADV, 1) - train_model.cv) *
                             dlogp_dstd) / tf.to_float(tf.shape(ADV)[0])
            pg_grads_std = tf.gradients(train_model.std_na,
                                        train_model.logstd_1a,
                                        grad_ys=pi_grads_std)
            pg_grads_std = [
                pg - dg for pg, dg in zip(pg_grads_std, ddiff_grads_std)
            ]

            pg_grads = pg_grads_mean + pg_grads_std

            cv_loss = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
            cv_loss = tf.squeeze(tf.reduce_sum(tf.square(cv_loss)))

            tf.summary.scalar("cv_loss", cv_loss)
            cv_params = [v for v in params if "cv" in v.name]
            cv_grads = tf.gradients(cv_loss, cv_params)
            cv_gradvars = list(zip(cv_grads, cv_params))
        else:
            pg_grads = tf.gradients(pg_loss, pi_params) + tf.gradients(
                pg_loss, train_model.logstd_1a)

        all_policy_grads = tf.concat([tf.reshape(pg, [-1]) for pg in pg_grads],
                                     0)

        # policy gradients
        policy_gradvars = list(
            zip(pg_grads, pi_params + [train_model.logstd_1a]))
        vf_grads = tf.gradients(vf_loss, vf_params)
        vf_gradvars = list(zip(vf_grads, vf_params))

        grads_list = policy_gradvars + vf_gradvars
        if train_model.relaxed: grads_list += cv_gradvars
        for g, v in grads_list:
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name + "_grad", g)

        sum_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir)

        trainer = optim
        _train = trainer.apply_gradients(policy_gradvars)

        _vf_train = train_model.vf_optim.apply_gradients(vf_gradvars)

        self._step = 0

        def get_cv_grads(obs, old_actions, advs, rewards, vf_in, values):
            advs = rewards - values
            td_map = {
                train_model.ob_no: obs,
                train_model.oldac_na: old_actions,
                train_model.X: vf_in,
                A: old_actions,
                ADV: advs,
                R: rewards
            }
            cv_gs = sess.run(cv_grads, td_map)
            return cv_gs

        def update_cv(mean_cv_gs):
            cv_gvs = list(zip(mean_cv_gs, cv_params))
            train_model.cv_optim.apply_gradients(cv_gvs)

        def update_policy_and_value(obs,
                                    old_actions,
                                    advs,
                                    rewards,
                                    vf_in,
                                    values,
                                    summary=False):
            advs = rewards - values
            td_map = {
                train_model.ob_no: obs,
                train_model.oldac_na: old_actions,
                train_model.X: vf_in,
                A: old_actions,
                ADV: advs,
                R: rewards
            }
            for _ in range(25):
                sess.run(
                    _vf_train, {
                        train_model.ob_no: obs,
                        train_model.oldac_na: old_actions,
                        train_model.X: vf_in,
                        A: old_actions,
                        ADV: advs,
                        R: rewards
                    })
            if summary:
                sum_str, policy_loss, value_loss, _ = sess.run(
                    [sum_op, pg_loss, vf_loss, _train], td_map)
                writer.add_summary(sum_str, self._step)
            else:
                policy_loss, value_loss, _ = sess.run(
                    [pg_loss, vf_loss, _train], td_map)
            self._step += 1
            return policy_loss, value_loss

        def get_grads(obs, old_actions, advs, rewards, vf_in, value):
            advs = rewards - value
            td_map = {
                train_model.ob_no: obs,
                train_model.oldac_na: old_actions,
                train_model.X: vf_in,
                A: old_actions,
                ADV: advs,
                R: rewards
            }
            _g = all_policy_grads  # seems to already have happened? / tf.to_float(tf.shape(rewards)[0])
            pg = sess.run(_g, td_map)
            return pg

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.sess = sess
        self.get_cv_grads = get_cv_grads
        self.update_cv = update_cv
        self.update_policy_and_value = update_policy_and_value
        self.train_model = train_model
        self.step_model = step_model
        self.value = train_model.value
        self.get_grads = get_grads
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
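Note: dlogp_dmean and dlogp_dstd above are the score terms of a diagonal Gaussian policy, d log N(a; mean, std) / d mean = (a - mean) / std^2 and d log N(a; mean, std) / d std = -1/std + (a - mean)^2 / std^3. A finite-difference check with illustrative scalar values:

    # Finite-difference check of the Gaussian score terms used above.
    import numpy as np

    def logp(a, mean, std):
        return -0.5 * ((a - mean) / std) ** 2 - np.log(std) - 0.5 * np.log(2 * np.pi)

    a, mean, std, h = 0.3, -0.1, 0.7, 1e-6
    d_mean_num = (logp(a, mean + h, std) - logp(a, mean - h, std)) / (2 * h)
    d_std_num = (logp(a, mean, std + h) - logp(a, mean, std - h)) / (2 * h)
    assert np.isclose(d_mean_num, (a - mean) / std ** 2, atol=1e-5)
    assert np.isclose(d_std_num, -1 / std + (a - mean) ** 2 / std ** 3, atol=1e-5)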
Example #7
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_envs,
                 total_timesteps,
                 nprocs=32,
                 n_steps=20,
                 ent_coef=0.01,
                 vf_coef=0.25,
                 vf_fisher_coef=1.0,
                 learning_rate=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lr_schedule='linear'):
        """
        The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

        :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) The observation space
        :param ac_space: (Gym Space) The action space
        :param n_envs: (int) The number of environments
        :param total_timesteps: (int) The total number of timesteps for training the model
        :param nprocs: (int) The number of threads for TensorFlow operations
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) The weight for the entropy loss
        :param vf_coef: (float) The weight for the loss on the value function
        :param vf_fisher_coef: (float) The weight for the fisher loss on the value function
        :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
        :param max_grad_norm: (float) The clipping value for the maximum gradient
        :param kfac_clip: (float) gradient clipping for the Kullback-Leibler divergence
        :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        n_batch = n_envs * n_steps
        action_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        pg_lr_ph = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         n_envs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           n_envs * n_steps,
                                           n_steps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=action_ph)
        self.logits = train_model.policy

        # training loss
        pg_loss = tf.reduce_mean(advs_ph * logpac)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.value_fn + tf.random_normal(
            tf.shape(train_model.value_fn))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=pg_lr_ph,
                clip_kl=kfac_clip,
                momentum=0.9,
                kfac_update=1,
                epsilon=0.01,
                stats_decay=0.99,
                async=1,
                cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.learning_rate = Scheduler(initial_value=learning_rate,
                                       n_values=total_timesteps,
                                       schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for _ in range(len(obs)):
                cur_lr = self.learning_rate.value()

            td_map = {
                train_model.obs_ph: obs,
                action_ph: actions,
                advs_ph: advs,
                rewards_ph: rewards,
                pg_lr_ph: cur_lr
            }
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            session_params = sess.run(params)
            joblib.dump(session_params, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
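Note: the value-function Fisher loss above treats the critic output as the mean of a unit-variance Gaussian. With sg(.) = tf.stop_gradient and c = vf_fisher_coef, the sampled target is t = sg(v_theta(s) + eps), eps ~ N(0, 1), and since

    log N(t; v_theta(s), 1) = -(t - v_theta(s))^2 / 2 + const
    L_vf_fisher = -c * E[(v_theta(s) - t)^2]

the loss is, up to constants and scaling, the Gaussian log-likelihood of that sampled target; compute_and_apply_stats then uses its gradients to estimate the K-FAC curvature block for the value head.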
Example #8
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 lambda_dist=0.01,
                 total_timesteps=None,
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        config = Config()

        act_model = policy(config)
        config.reuse = True
        train_model = policy(config)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.logits, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.logits))

        aux_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.rp_logits, labels=A)
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        saver = tf.train.Saver()

        def train(obs, rs, rr, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                train_model.inputs_s: rs,
                train_model.inputs_r: rr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver.save(sess, save_path + 'model.ckpt')

        def load(load_path):
            saver.restore(sess, load_path + 'model.ckpt')

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.act = act_model.act
        self.value = act_model.value
        self.save = save
        self.load = load
Example #9
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 nModelsToKeep=5):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save():
            modelfile = os.path.join(
                logger.get_dir(),
                datetime.datetime.now().strftime("model-%Y-%m-%d-%H-%M-%S-%f"))

            ps = sess.run(params)
            joblib.dump(ps, modelfile)
            logger.log('Model saved to %s' % modelfile)

            model_files = sorted(
                fnmatch.filter(os.listdir(logger.get_dir()), "model-*"))
            if len(model_files) > nModelsToKeep:
                for old_file in model_files[0:-nModelsToKeep]:
                    os.remove(os.path.join(logger.get_dir(), old_file))

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)
            logger.log('Model loaded from %s' % load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #10
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_envs,
                 n_steps,
                 ent_coef=0.01,
                 vf_coef=0.25,
                 max_grad_norm=0.5,
                 learning_rate=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lr_schedule='linear'):
        """
        The A2C (Advantage Actor Critic) model class, https://arxiv.org/abs/1602.01783

        :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) Observation space
        :param ac_space: (Gym Space) Action space
        :param n_envs: (int) The number of environments
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) Entropy coefficient for the loss calculation
        :param vf_coef: (float) Value function coefficient for the loss calculation
        :param max_grad_norm: (float) The maximum value for the gradient clipping
        :param learning_rate: (float) The learning rate
        :param alpha: (float) RMS prop optimizer decay
        :param epsilon: (float) RMS prop optimizer epsilon
        :param total_timesteps: (int) The total number of samples
        :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """

        sess = tf_util.make_session()
        n_batch = n_envs * n_steps

        actions_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        learning_rate_ph = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, n_envs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             n_envs * n_steps,
                             n_steps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=actions_ph)
        pg_loss = tf.reduce_mean(advs_ph * neglogpac)
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=learning_rate_ph,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        learning_rate = Scheduler(initial_value=learning_rate,
                                  n_values=total_timesteps,
                                  schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for _ in range(len(obs)):
                cur_lr = learning_rate.value()
            td_map = {
                train_model.obs_ph: obs,
                actions_ph: actions,
                advs_ph: advs,
                rewards_ph: rewards,
                learning_rate_ph: cur_lr
            }
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            parameters = sess.run(params)
            make_path(os.path.dirname(save_path))
            joblib.dump(parameters, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
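Note: in train() above, the runner supplies discounted n-step returns in place of raw rewards, so advs = rewards - values estimates the advantage A(s_t, a_t) = R_t - V(s_t). An illustrative NumPy sketch of that return-and-advantage computation for one environment (the usual A2C runner convention, not code from this class):

    # Illustrative n-step discounted returns and advantages for one environment.
    import numpy as np

    def discount_with_bootstrap(rewards, dones, last_value, gamma=0.99):
        returns = np.zeros(len(rewards))
        running = last_value
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running * (1.0 - dones[t])
            returns[t] = running
        return returns

    rewards = np.array([0.0, 1.0, 0.0])
    dones = np.array([0.0, 0.0, 0.0])
    values = np.array([0.4, 0.6, 0.2])
    returns = discount_with_bootstrap(rewards, dones, last_value=0.3)
    advs = returns - values   # what train() computes as rewards - values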
Example #11
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
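
Because optim.apply_gradients in this K-FAC setup returns both the training op and a queue runner for the asynchronous statistics updates, callers typically have to start that runner before invoking train. A minimal sketch, assuming model.q_runner behaves like a standard tf.train.QueueRunner (or is None), as in the usual ACKTR training loop:

import tensorflow as tf

# Sketch: start the K-FAC queue runner before training; `model` is the object
# defined above and q_runner is assumed to be a tf.train.QueueRunner or None.
coord = tf.train.Coordinator()
enqueue_threads = []
if model.q_runner is not None:
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)

# ... run model.train(...) inside the usual update loop ...

coord.request_stop()
coord.join(enqueue_threads)
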
Ejemplo n.º 12
0
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps,
                 ent_coeff, vf_coeff, max_grad_norm):
        sess = tf.get_default_session()

        #Define placeholders
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        #Create our two models here
        #take one step for each environment
        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)
        #take number of steps * number of environments for total steps
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        #calculate the loss
        #Note: in the future we can add a clipped loss to control the step size of our parameter updates.
        #This can lead to better convergence (as in PPO)
        #Recall that Total Loss =  PolicyGradientLoss - Entropy*EntropyCoeff + Value*ValueCoeff

        #output loss -log(policy)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi,
            labels=actions_,
        )

        #1/n * sum(A(s,a) * -logpi(a|s))
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        #value loss
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))

        #entropy
        entropy = tf.reduce_mean(train_model.pd.entropy())

        #total loss
        loss = pg_loss - (entropy * ent_coeff) + (vf_loss * vf_coeff)

        #Update the parameters using the loss we've just calculated
        #Grab model params
        params = find_trainable_variables("model")

        #Calculate gradients. *We'll want to zip our parameters w/ our gradients
        grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            #Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        #build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_,
                                            decay=0.99,
                                            epsilon=1e-5)
        #Backprop
        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            #here we calculate advantage A(s, a) = R+yV(s') - V(s)
            #Returns = R+yV(S')
            advantages = returns - values

            td_map = {
                train_model.inputs_: states_in,
                actions_: actions,
                advantages_: advantages,
                rewards_:
                returns,  #Recall we bootstrap "real" value since we're learning 1 step at a time. (not episode)
                lr_: lr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            saver = tf.train.Saver()
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
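
The train closure above expects returns that already contain the bootstrap term R + gamma*V(s') mentioned in the comments. A minimal NumPy sketch of that computation; the helper name and gamma value are chosen here for illustration:

import numpy as np

def discounted_returns(rewards, dones, last_value, gamma=0.99):
    # n-step returns bootstrapped with V(s_{t+n}); matches the
    # "Returns = R+yV(S')" comment above. Helper name and gamma are assumptions.
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

# The advantages fed to train() are then returns - values.
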
Ejemplo n.º 13
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 continuous_actions=False,
                 debug=False,
                 numAgents=2,
                 itr=1,
                 particleEnv=False,
                 communication=False):
        self.continuous_actions = continuous_actions
        self.nenvs = nenvs
        print('vf_coef', vf_coef)
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        # print('action space: ', ac_space)
        if particleEnv == False:
            nact = ac_space.n
        elif communication == False:
            nact = ac_space[itr].n
        else:
            nact = ac_space[itr].high - ac_space[itr].low  # modified
        self.nact = nact
        # print('nact: ', nact)
        # print(nact)
        nbatch = nenvs * nsteps
        # print(nbatch)
        # print('batch size: ', nbatch)
        if self.continuous_actions:
            A = tf.placeholder(tf.float32, [nbatch])
        elif particleEnv == False or communication == False:
            A = tf.placeholder(tf.int32, [nbatch])
        else:
            actions_per_agent = 2
            A = tf.placeholder(tf.int32, [actions_per_agent, nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        if particleEnv == False:
            step_model = policy(
                sess,
                ob_space,
                ac_space,
                nenvs,
                1,
                nstack,
                reuse=tf.AUTO_REUSE,
                continuous_actions=continuous_actions)  #, itr=itr)
            train_model = policy(
                sess,
                ob_space,
                ac_space,
                nenvs,
                nsteps,
                nstack,
                reuse=tf.AUTO_REUSE,
                continuous_actions=continuous_actions)  #, itr=itr)
        elif communication == False:
            # print('step model')
            step_model = policy(sess,
                                ob_space,
                                ac_space,
                                nenvs,
                                1,
                                nstack,
                                reuse=False,
                                continuous_actions=continuous_actions,
                                itr=itr,
                                communication=communication)
            # print('train model')
            train_model = policy(sess,
                                 ob_space,
                                 ac_space,
                                 nenvs,
                                 nsteps,
                                 nstack,
                                 reuse=tf.AUTO_REUSE,
                                 continuous_actions=continuous_actions,
                                 itr=itr,
                                 communication=communication)
        else:
            step_model = policy(sess,
                                ob_space,
                                ac_space,
                                nenvs,
                                1,
                                nstack,
                                reuse=tf.AUTO_REUSE,
                                continuous_actions=continuous_actions,
                                itr=itr,
                                communication=communication)
            train_model = policy(sess,
                                 ob_space,
                                 ac_space,
                                 nenvs,
                                 nsteps,
                                 nstack,
                                 reuse=tf.AUTO_REUSE,
                                 continuous_actions=continuous_actions,
                                 itr=itr,
                                 communication=communication)
        # else:
        #     step_model = []
        #     train_model = []
        #     for i in range(numAgents):
        #         step_model.append(policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=tf.AUTO_REUSE, continuous_actions=continuous_actions))
        #         train_model.append(policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True, continuous_actions=continuous_actions))

        # print(train_model)
        if self.continuous_actions:
            neglogpac = tf.log(mse(train_model.mu, A))
        elif particleEnv == False or communication == False:
            # print('A: ', A)
            neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi, labels=A)
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            entropy = tf.reduce_mean(cat_entropy(train_model.pi))
            pg_loss = tf.reduce_mean(ADV * neglogpac)
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        else:
            neglogpac = []
            entropy = []
            pg_loss = []
            loss = []
            vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
            neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi_c, labels=A[0])
            entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_c))
            pg_loss_ = tf.reduce_mean(ADV * neglogpac_)
            entropy.append(entropy_)
            pg_loss.append(pg_loss_)
            loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef)
            neglogpac_ = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.pi_u, labels=A[1])
            entropy_ = tf.reduce_mean(cat_entropy(train_model.pi_u))
            pg_loss_ = tf.reduce_mean(ADV * neglogpac_)
            entropy.append(entropy_)
            pg_loss.append(pg_loss_)
            loss.append(pg_loss_ - entropy_ * ent_coef + vf_loss * vf_coef)

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # if itr == 0:
        # trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = tf.train.AdamOptimizer(
            learning_rate=LR, name=str(itr)
        ).apply_gradients(
            grads
        )  # , decay=alpha, epsilon=epsilon, name=str(itr)).apply_gradients(grads)
        # _train = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon, name=str(itr)).apply_gradients(grads) # Error here

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs,
                  states,
                  rewards,
                  masks,
                  actions,
                  values,
                  debug=False,
                  numAgents=2):
            # print('train rewards and values')
            # print(actions[0])
            # print(actions[1])
            # print(rewards)
            # print(values)
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            # if states != []:
            if train_model.initial_state != []:
                # print(states)
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if debug == True:
                policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run(
                    [pg_loss, vf_loss, entropy, grads, _train], td_map)
                # grad_vals = [(np.min(grad_vals), np.max(grad_vals), np.sum(grad_vals)) for grad_vals in all_grad_vals]
                # print('Policy Gradients: ')
                # print(all_grad_vals[9])
                # print('Value Gradients: ')
                # print(all_grad_vals[11])
                print('Gradient Values: ')
                print(all_grad_vals)
            else:
                policy_loss, value_loss, policy_entropy, _ = sess.run(
                    [pg_loss, vf_loss, entropy, _train], td_map)
            # else:
            # td_map = []
            #     print('Train Model in train')
            #     print(train_model)
            #     for i in range(numAgents):
            #         td_map = {train_model[i].X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            #         if train_model[i].initial_state != []:
            #             print('states')
            #            print(states)
            #            td_map[train_model[i].S] = states
            #            td_map[train_model[i].M] = masks
            #        if debug:
            #            print('point1')
            #            policy_loss, value_loss, policy_entropy, all_grad_vals, _ = sess.run(
            #                [pg_loss, vf_loss, entropy, grads, _train],
            #                td_map
            #            )
            #            print('point2')
            #            grad_vals = [(np.min(grad_vals), np.max(grad_vals), np.sum(grad_vals)) for grad_vals in all_grad_vals]
            #            print('Policy Gradients: ')
            #            print(all_grad_vals[9])
            #            print('Value Gradients: ')
            #            print(all_grad_vals[11])
            #        else:
            #            policy_loss, value_loss, policy_entropy, _ = sess.run(
            #                [pg_loss, vf_loss, entropy, _train],
            #                td_map
            #            )
            # print('Policy Loss: ')
            # print(policy_loss)
            # print('Value Loss: ')
            # print(value_loss)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            #make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        # if numAgents == 1:
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        # else:
        #     self.step = []
        #     self.value = []
        #     self.initial_state = []
        #     for i in range(numAgents):
        #         self.step.append(step_model[i].step)
        #         self.value.append(step_model[i].value)
        #         self.initial_state.append(step_model[i].initial_state)
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
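
For the continuous-action branch above, tf.log(mse(train_model.mu, A)) stands in for -log pi(a|s). A more conventional (and numerically safer) choice is the diagonal-Gaussian negative log-likelihood; the sketch below assumes the policy also exposes a log_std output, which the model above does not define:

import numpy as np
import tensorflow as tf

def gaussian_neglogp(mu, log_std, actions):
    # Negative log-likelihood of a diagonal Gaussian policy N(mu, exp(log_std)^2).
    # `log_std` is an assumed extra policy output; actions has shape [batch, act_dim].
    act_dim = tf.cast(tf.shape(actions)[-1], tf.float32)
    return (0.5 * tf.reduce_sum(tf.square((actions - mu) / tf.exp(log_std)), axis=-1)
            + 0.5 * np.log(2.0 * np.pi) * act_dim
            + tf.reduce_sum(log_std, axis=-1))
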
Ejemplo n.º 14
0
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY1, depth=1024), axis=1)  # XY1 labels for the xy1 head

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
Ejemplo n.º 15
0
    def __init__(self, policy, ob_space, action_space, nenvs, nsteps, ent_coef,
                 vf_coef, max_grad_norm):

        sess = tf.get_default_session()

        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        step_model = policy(sess,
                            ob_space,
                            action_space,
                            nenvs,
                            1,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             action_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=actions_)

        pg_loss = tf.reduce_mean(advantages_ * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))
        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")

        grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_,
                                            decay=0.99,
                                            epsilon=1e-5)

        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            advantages = returns - values

            td_map = {
                train_model.inputs_: states_in,
                actions_: actions,
                advantages_: advantages,
                rewards_: returns,
                lr_: lr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            saver = tf.train.Saver()
            print("Loading " + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Ejemplo n.º 16
0
    def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear',
            algo='regular', beta=1e-3):

        print('Create Session')
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        nact = ac_space.n
        nbatch = nenvs*master_ts*worker_ts

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo)
        train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo)
        print('model_setting_done')

        #loss construction
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
        pg_loss = pg_loss - entropy * ent_coef
        print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm))
        try:
            if algo == 'regular':
                loss = pg_loss + vf_coef * vf_loss
            elif algo == 'VIB':
                '''
                implement VIB here, apart from the vf_loss and pg_loss, there should be a third loss,
                the kl_loss = ds.kl_divergence(model.encoding, prior), where prior is a Gaussian distribution with mu=0, std=1
                the final loss should be pg_loss + vf_coef * vf_loss + beta*kl_loss
                '''
                prior = ds.Normal(0.0, 1.0)
                kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior))
                loss = pg_loss + vf_coef * vf_loss + beta*kl_loss
                # pass
            else:
                raise Exception('Unknown algorithm: {}'.format(algo))
        except Exception as e:
            print(e)

        grads, global_norm = grad_clip(loss, max_grad_norm, ['model'])
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(wobs, whs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(whs)):
                cur_lr = lr.value()

            td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.wS] = states
                td_map[train_model.wM] = masks

            '''
            you can add and run additional loss for VIB here for debugging, such as kl_loss
            '''
            tloss, value_loss, policy_loss, policy_entropy, _ = sess.run(
                [loss, vf_loss, pg_loss, entropy, _train],
                feed_dict=td_map
            )
            return tloss, value_loss, policy_loss, policy_entropy

        params = find_trainable_variables("model")
        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.wvalue
        self.get_wh = step_model.get_wh
        self.initial_state = step_model.w_initial_state
        self.train = train
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
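
The VIB branch above relies on ds.kl_divergence(train_model.encoding, prior). Assuming ds is tensorflow_probability.distributions and the encoding is a diagonal Gaussian (both assumptions about the surrounding policy code), the KL term can be sketched in isolation like this:

import tensorflow as tf
import tensorflow_probability as tfp
ds = tfp.distributions

# Hypothetical shapes: a [batch, latent_dim] diagonal-Gaussian encoding.
mu = tf.placeholder(tf.float32, [None, 64], name="enc_mu")
sigma = tf.placeholder(tf.float32, [None, 64], name="enc_sigma")

encoding = ds.Normal(loc=mu, scale=sigma)
prior = ds.Normal(0.0, 1.0)    # standard normal prior, broadcast per dimension
kl_loss = tf.reduce_mean(ds.kl_divergence(encoding, prior))
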
Ejemplo n.º 17
0
    def __init__(self, policy, p, has_state):
        """
        policy : Internal Policy model such as  SnakeModel.CNNPolicy
        p : Hyperparameters required for training
        """
        sess = tf_util.make_session()
        # Tensorflow model initialization
        step_model = policy(sess=sess,
                            p=p,
                            train_phase=False,
                            has_state=has_state)  # Deploy model settings
        train_model = policy(sess=sess,
                             p=p,
                             train_phase=True,
                             has_state=has_state)  # Training model settings
        saver = tf.train.Saver()

        #Step 2 : Initialize the training parameters
        A = tf.placeholder(tf.int32, [p.N_BATCH])
        ADV = tf.placeholder(tf.float32, [p.N_BATCH])
        R = tf.placeholder(tf.float32, [p.N_BATCH])
        LR = tf.placeholder(tf.float32, [])

        #Step 3 : Define the loss Function
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)  #
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF

        #Step 4 : Define the loss optimizer
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if p.MAX_GRAD_NORM is not None:
            grads, grad_norm = tf.clip_by_global_norm(
                grads, p.MAX_GRAD_NORM
            )  # Clipping the gradients to protect learned weights
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=p.RMS_DECAY,
                                            epsilon=p.EPSILON)
        _train = trainer.apply_gradients(
            grads)  # This is the variable which will be used
        lr = Scheduler(v=p.LEARNING_RATE,
                       nvalues=p.N_TIMESTEPS,
                       schedule=p.LEARNING_RATE_SCHEDULE
                       )  # Learning rate changes linearly or as per arguments

        # Step 5 : Write down the summary parameters to be used
        writer = tf.summary.FileWriter(p.LOG_PATH)  #summary writer

        def train(obs, rewards, masks, actions, values, states):
            """
            obs     : batch x n x m x 1 snake matrix
            rewards : batch x 1 rewards corresponding to action
            actions : batch x 1 discrete action taken
            values  : batch x 1 output of value function during the training process  
            """
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                train_model.S: states,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            #ps = sess.run(params)
            #make_path(save_path)
            #joblib.dump(ps, save_path)
            saver.save(sess, save_path)

        def load(load_path):
            #loaded_params = joblib.load(load_path)
            #restores = []
            #for p, loaded_p in zip(params, loaded_params):
            #    restores.append(p.assign(loaded_p))
            #ps = sess.run(restores)
            saver.restore(sess, load_path)

        def add_scalar_summary(tag, value, step):
            summary = tf.Summary(
                value=[tf.Summary.Value(tag=tag, simple_value=value)])
            writer.add_summary(summary, step)

        # Expose the user to closure functions
        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.hidden_value = step_model.hidden_value
        self.initial_state = step_model.initial_state
        self.add_scalar_summary = add_scalar_summary
        self.save = save
        self.load = load
        # Initialize global variables and add tf graph
        tf.global_variables_initializer().run(session=sess)
        writer.add_graph(tf.get_default_graph())  #write graph
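
The add_scalar_summary closure above wraps a tf.Summary write; a typical (hypothetical) use is to log the per-update losses returned by train to TensorBoard. The runner object, loop bounds, and tag names below are assumptions:

# Hypothetical update loop around the closures exposed above.
for update in range(1, n_updates + 1):
    obs, rewards, masks, actions, values, states = runner.run()   # assumed runner
    policy_loss, value_loss, policy_entropy = model.train(
        obs, rewards, masks, actions, values, states)
    model.add_scalar_summary("loss/policy", policy_loss, update)
    model.add_scalar_summary("loss/value", value_loss, update)
    model.add_scalar_summary("policy/entropy", policy_entropy, update)
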
Ejemplo n.º 18
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Defines step_model function and train_model functions
        # Pass each model a copy of 'sess'
        print("Constructing model... STEP_MODEL & TRAIN_MODEL: constructing step_model policy | " + str(policy))
        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)

        # train_model takes in the mini-batch produced by 5 step_models, NOTE: reuse = true
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        # neglogpac: sparse softmax cross-entropy between the training model's
        # policy logits and the taken actions, i.e. -log pi(a|s)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        print("MAIN: neglocpac = sparse_softmax_cross_entropy_with_logits() inputs: ")
        print("MAIN: train_model_pi: " + str(train_model.pi))
        print("MAIN: labels: " + str(A))

        # var init: policy gradient loss determined by average of all advantage * neglogpac
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # value function loss is mse(tf.squeeze(train_model.vf), R)
        # ^ in english, mse(model value prediction, actual return)
        # mse == mean squared error, defined in a2c/utils.py
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))

        # entropy of policy
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))

        # total loss: policy gradient loss minus the entropy bonus plus the
        # weighted value loss, matching the A3C/A2C objective
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef


        # params gets trainable variables from model (weights of network?)
        params = find_trainable_variables("model")

        # computes gradients (change of weights, or direction of weights) using 'loss' and 'params' above
        # computes 'symbolic derivatives of sum 'loss' w.r.t 'params'
        # from tflow docs: gradients() adds ops to the graph to output the derivatives of 'loss' w.r.t. 'params'
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)

        # TODO: how many gradients are computed here, should be 16
        grads = list(zip(grads, params))
        # RMSProp optimizes learning rate , check thesis notes
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        # RMSProp pushes back new gradients over trainable variables to change weights
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)


        writer = tf.summary.FileWriter("/tmp/helloTensorBoard.txt")
        writer.add_graph(sess.graph)

        # Trains the model,
        # TODO: What is 'masks' input param
        # TODO: How often does train_model (steps thru train_model) get run vs. step_model
        #   A: I think it does a 'train_model' for each mini-batch, which is currently 5 steps
        # Does a sess.run with train_model
        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            # td_map hooks up all inputs for train model?
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            # Policy Loss, Value Loss, and Policy Entropy calculations

            # Propagates losses backwards through the neural network?
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):


            path = logger.get_dir() + "/model.pkl"

            print("Logger dir: " + logger.get_dir())
            print("MODEL SAVED TO : " + str(path))

            ps = sess.run(params)
            #make_path(osp.dirname(save_path))
            joblib.dump(ps, path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
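
One of the TODOs above asks what the masks argument is: for recurrent policies it carries the per-step episode-done flags, and inside the policy the LSTM state is typically zeroed at those boundaries. A minimal sketch of that reset; the helper name is chosen here for illustration:

import numpy as np

def reset_state_on_done(state, masks):
    # state: [nenv, state_dim] recurrent state; masks: 1.0 where the previous
    # step ended an episode, 0.0 otherwise. Multiplying zeroes the carried-over
    # state so hidden information does not leak across episode boundaries.
    return state * (1.0 - np.asarray(masks))[:, None]
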
Ejemplo n.º 19
0
    def __init__(self,
                 policy,
                 ob_space,
                 action_space,
                 nenvs,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm):

        sess = tf.get_default_session()

        # Here we create the placeholders
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        # Here we create our two models:
        # Step_model that is used for sampling
        step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)

        # Train model for training
        train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True)

        """
        Calculate the loss
        Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        """
        # Policy loss
        # Output -log(pi)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)

        # 1/n * sum A(si,ai) * -logpi(ai|si)
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        # Value loss 1/2 SUM [R - V(s)]^2
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf),rewards_))

        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())


        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            # We create the feed dictionary
            td_map = {train_model.inputs_: states_in,
                     actions_: actions,
                     advantages_: advantages, # Use to calculate our policy loss
                     rewards_: returns, # Use as a bootstrap for real value
                     lr_: lr}

            policy_loss, value_loss, policy_entropy, _= sess.run([pg_loss, vf_loss, entropy, _train], td_map)
            
            return policy_loss, value_loss, policy_entropy


        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Ejemplo n.º 20
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Ejemplo n.º 21
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6),
                 lrschedule='linear', replay_lambda=1, ss_rate=1,
                 replay_loss=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*nsteps

        # If we have replay_loss, create a replay buffer and stage buffer
        # and use the extra loss term to keep the replay loss low
        if replay_loss is not None:
            self.replay_buffer = [] # holds all past data

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))

        # Introduce replay_loss if given
        if replay_loss == "L2":
            # Replace train_model.pi with whatever is predicted label
            # Replace A with whatever is recorded label
            re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch
        elif replay_loss == "Distillation":
            # Replace y_donor with whatever is recorded label
            # Replace y_acceptor with whatever is predicted label
            re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor)
                                                      * tf.log(y_acceptor),
                                                      reduction_indices=1))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
        if replay_loss is not None:
            loss = loss + replay_lambda*re_loss
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
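
The distillation branch above references y_donor and y_acceptor, which are not defined in this snippet (the comments say to substitute the recorded and predicted labels). One hedged way to realize that loss is a softened cross-entropy between recorded teacher logits and the current policy logits, sketched here with assumed names:

import tensorflow as tf

def distillation_loss(student_logits, teacher_logits, temperature=1.0):
    # Softened cross-entropy between recorded (teacher) and current (student)
    # policy logits; the function name and temperature are assumptions.
    teacher_probs = tf.nn.softmax(teacher_logits / temperature)
    return tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.stop_gradient(teacher_probs),
            logits=student_logits / temperature))
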
Ejemplo n.º 22
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nenvs,
                                inter_op_parallelism_threads=nenvs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        self.saver = tf.train.Saver(max_to_keep=1000)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(path, steps):
            make_path(path)
            self.saver.save(sess, path + 'model', global_step=steps)

        def load(path, steps):
            self.saver = tf.train.import_meta_graph(path + 'model' + '-' +
                                                    str(steps) + '.meta')
            self.saver.restore(sess, tf.train.latest_checkpoint(path))

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
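
Unlike the joblib-based examples, this model checkpoints through tf.train.Saver with a global_step, so the load helper needs the same step number. A hypothetical round trip; the directory is an assumption, and since path is used as a raw prefix it should end with a separator:

# Hypothetical usage of the Saver-based closures above.
model.save("checkpoints/", 10000)   # writes checkpoints/model-10000.* checkpoint files
model.load("checkpoints/", 10000)   # re-imports the meta graph and restores the latest checkpoint
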
Ejemplo n.º 23
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack,
                 num_procs, ent_coef, vf_coef, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        eps = 1e-6

        #nadv = ADV / (train_model.ret_rms.std + eps)
        #nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps)

        nadv = (ADV - train_model.ret_rms.mean) / (train_model.ret_rms.std +
                                                   eps)
        nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps)

        nlogpac = -train_model.pd.logp(A)
        pg_loss = tf.reduce_mean(nadv * nlogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), nr))
        #vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vnorm), nr))

        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        avg_norm_ret = tf.reduce_mean(tf.abs(train_model.ret_rms.mean))
        avg_norm_obs = tf.reduce_mean(tf.abs(train_model.ob_rms.mean))

        def train(obs, states, returns, masks, actions, values):

            advs = returns - values
            #advs = (advs - np.mean(advs)) / (np.std(advs) + eps)
            for step in range(len(obs)):
                cur_lr = lr.value()
            if hasattr(train_model, "ob_rms"):
                # update running mean/std for the policy's observations
                train_model.ob_rms.update(sess, obs)
            if hasattr(train_model, "ret_rms"):
                # update running mean/std for returns
                train_model.ret_rms.update(sess, returns)
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            ravg_norm_obs, policy_loss, value_loss, policy_entropy, _ = sess.run(
                [avg_norm_obs, pg_loss, vf_loss, entropy, _train], td_map)
            return ravg_norm_obs, policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))  # create the parent directory, not a directory named after the file
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
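
Example #23 leans on ret_rms / ob_rms objects that expose .mean, .std and an update() method; those helpers are not shown on this page. Below is a hedged, pure-NumPy sketch of such a running-statistics tracker (the real one in the example updates TF variables through the session) together with the nadv / nr normalisation it feeds.

import numpy as np

class RunningMeanStd:
    """Hypothetical stand-in for the ret_rms / ob_rms helper assumed above."""
    def __init__(self, eps=1e-6):
        self.mean, self.var, self.count = 0.0, 1.0, eps

    def update(self, x):
        x = np.asarray(x, dtype=np.float64)
        batch_mean, batch_var, batch_count = x.mean(), x.var(), x.size
        delta = batch_mean - self.mean
        total = self.count + batch_count
        new_mean = self.mean + delta * batch_count / total
        m2 = (self.var * self.count + batch_var * batch_count
              + delta ** 2 * self.count * batch_count / total)
        self.mean, self.var, self.count = new_mean, m2 / total, total

    @property
    def std(self):
        return np.sqrt(self.var)

eps = 1e-6
ret_rms = RunningMeanStd()
returns = np.array([1.0, 3.0, 2.0, 4.0])
values = np.array([0.5, 2.5, 2.0, 3.0])
ret_rms.update(returns)                       # like train_model.ret_rms.update(sess, returns)

advs = returns - values
nadv = (advs - ret_rms.mean) / (ret_rms.std + eps)     # normalised advantage
nr = (returns - ret_rms.mean) / (ret_rms.std + eps)    # normalised value target
print(nadv, nr)
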
Example #24
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5,
            lr=7e-4, alpha=0.99, epsilon=1e-5, total_timesteps=int(20e6), lrschedule='linear'):

        '''
        sess = tf.get_default_session()
        nbatch = nenvs*nsteps

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        '''

        # begin diff
        sess = tf.get_default_session()

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, reuse=True)

        L = tf.placeholder(tf.int32, [1])
        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        # end diff

        neglogpac = train_model.pd.neglogp(A) # length max_episode_steps
        pg_loss = tf.reduce_mean(tf.slice(ADV * neglogpac, [0], L))
        vf_loss = tf.reduce_mean(tf.slice(mse(tf.squeeze(train_model.vf), R), [0], L))
        entropy = tf.reduce_mean(tf.slice(train_model.pd.entropy(), [0], L))
        loss = pg_loss-entropy*ent_coef+vf_loss*vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values, length):
            advs = rewards-values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr, L:np.asarray([length])}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run([pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
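
The distinctive part of Example #24 is the extra L placeholder: the rollout is padded to a fixed length and tf.slice(..., [0], L) keeps only the first `length` timesteps in each mean. A small NumPy sketch of that effect (array contents are made up):

import numpy as np

max_episode_steps = 8
length = 5                                   # actual episode length, fed via L
rng = np.random.default_rng(0)
adv = rng.normal(size=max_episode_steps)
neglogpac = rng.random(max_episode_steps)

pg_loss_padded = np.mean(adv * neglogpac)              # would average the padding too
pg_loss_sliced = np.mean((adv * neglogpac)[:length])   # what the sliced graph computes
print(pg_loss_padded, pg_loss_sliced)
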
Example #25
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        writer = tf.summary.FileWriter(
            "/tmp/a2c_demo/1")  # Change for SAT: summary writer for TensorBoard

        A = tf.placeholder(
            tf.int32, [nbatch])  # Comments by Fei: this must be the action
        ADV = tf.placeholder(
            tf.float32,
            [nbatch])  # Comments by Fei: this must be the advantage
        R = tf.placeholder(
            tf.float32, [nbatch])  # Comments by Fei: this must be the reward
        LR = tf.placeholder(
            tf.float32, [])  # Comments by Fei: this must be the learning rate

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi,
            labels=A)  # Comments by Fei: pi is nbatch * nact
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            # writer.add_graph(sess.graph)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))  # create the parent directory, not a directory named after the file
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
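
Example #25 creates a tf.summary.FileWriter but, apart from the commented-out add_graph call, never writes to it. A hedged TF 1.x sketch of how such a writer is typically used to log the graph and a scalar; the placeholder name and log directory are illustrative.

import tensorflow as tf

loss_ph = tf.placeholder(tf.float32, [], name="policy_loss")
summary_op = tf.summary.scalar("policy_loss", loss_ph)

with tf.Session() as sess:
    writer = tf.summary.FileWriter("/tmp/a2c_demo/1", sess.graph)  # also logs the graph
    for update in range(3):
        s = sess.run(summary_op, {loss_ph: 0.1 * update})          # dummy loss value
        writer.add_summary(s, global_step=update)
    writer.close()
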
Example #26
    def build_model(self, max_grad_norm, reuse=tf.AUTO_REUSE):
        # reuse is True when loading a saved model
        # build a 4-layer fully connected net for the actor
        # X=tf.placeholder([-1,6,4])
        states_in = tf.layers.flatten(self.states)
        with tf.variable_scope("model", reuse=reuse):
            a1 = tf.layers.dropout(inputs=tf.layers.dense(
                inputs=states_in, units=64, activation=tf.nn.relu),
                                   rate=0.3)
            self.a2 = tf.layers.dropout(tf.layers.dense(inputs=a1,
                                                        units=128,
                                                        activation=tf.nn.relu),
                                        rate=0.2)
            self.a3 = tf.layers.dropout(tf.layers.dense(inputs=self.a2,
                                                        units=128,
                                                        activation=tf.nn.relu),
                                        rate=0.1)
            self.out = tf.layers.dense(
                inputs=self.a3,
                units=4,
                activation=tf.nn.relu,
                kernel_initializer=tf.orthogonal_initializer(np.sqrt(2)))
            self.value = tf.layers.dense(inputs=self.a3,
                                         units=1,
                                         activation=None)

            #
            # self.pd, self.pi = self.pdtype.pdfromlatent(self.out, init_scale=0.01) # with baselines from openai
            self.pd, self.pi, _ = self.pdtype.proba_distribution_from_latent(
                self.out, self.value, init_scale=0.01
            )  # with stable_baselines see https://stable-baselines.readthedocs.io/en/master/common/distributions.html?highlight=vf%20latent%20vector
            # self.pd, self.pi = self.pdtype.pdfromlatent(self.out, init_scale=0.01)
            self.a0 = self.pd.sample()

        #calculate the loss function
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=self.pi, labels=self.actions)

        # 1/n * sum A(si,ai) * -logpi(ai|si)
        pg_loss = tf.reduce_mean(self.advantages * neglogpac)

        # Value loss 1/2 SUM [R - V(s)]^2
        vf_loss = tf.reduce_mean(mse(tf.squeeze(self.value), self.rewards))

        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(self.pd.entropy())

        self.loss = pg_loss - entropy * self.Entropy_coefficient + vf_loss * self.vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(self.loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=self.lr,
                                            decay=0.99,
                                            epsilon=1e-5)

        # 4. Backpropagation
        self.train_op = trainer.apply_gradients(grads)
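
Several examples on this page clip gradients with tf.clip_by_global_norm before apply_gradients. For reference, a small NumPy sketch of that operation: all gradients are rescaled by one common factor so their joint L2 norm never exceeds max_grad_norm (edge-case handling of the real op is omitted).

import numpy as np

def clip_by_global_norm(grads, max_grad_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = max_grad_norm / max(global_norm, max_grad_norm)   # always <= 1.0
    return [g * scale for g in grads], global_norm

grads = [np.array([3.0, 4.0]), np.array([12.0])]   # global norm = 13
clipped, norm = clip_by_global_norm(grads, max_grad_norm=0.5)
print(norm, clipped)                                # 13.0, gradients scaled by 0.5 / 13
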
Example #27
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs * nsteps,
                                           nsteps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)  # note: despite the name, this is the negative log-probability of A
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            # `async` became a reserved keyword in Python 3.7+, so this keyword
            # argument has to be renamed in the kfac helper to run on newer interpreters.
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
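
In Example #27, optim.apply_gradients returns a queue runner (self.q_runner) for KFAC's asynchronous statistics, but nothing on this page starts it. A hedged wiring sketch of that step; `model`, `runner`, and `total_updates` are assumptions that follow the interface of the class above.

import tensorflow as tf

def run_acktr_updates(model, runner, total_updates):
    """Start the KFAC queue runner, then drive model.train() from rollouts."""
    coord = tf.train.Coordinator()
    threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    try:
        for _ in range(total_updates):
            obs, states, rewards, masks, actions, values = runner.run()  # assumed runner API
            model.train(obs, states, rewards, masks, actions, values)
    finally:
        coord.request_stop()
        coord.join(threads)
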
Example #28
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nscripts=16,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.1,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.001,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 alpha=0.99,
                 epsilon=1e-5):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nsml.bind(sess=sess)
        #nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])

        XY0 = tf.placeholder(tf.int32, [nbatch])
        XY1 = tf.placeholder(tf.int32, [nbatch])

        # ADV == TD_TARGET - values
        ADV = tf.placeholder(tf.float32, [nbatch])
        TD_TARGET = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)

        # Policy 1 : Base Action : train_model.pi label = A

        script_mask = tf.concat([
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
                                axis=0)

        pi = train_model.pi
        pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi,
                                                                   labels=A)
        neglogpac *= tf.stop_gradient(pac_weight)

        inv_A = 1.0 - tf.cast(A, tf.float32)

        xy0_mask = tf.cast(A, tf.float32)
        xy1_mask = tf.cast(A, tf.float32)

        condition0 = tf.equal(xy0_mask, 2)
        xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
        xy0_mask = 1.0 - xy0_mask

        condition1 = tf.equal(xy1_mask, 2)
        xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

        # One hot representation of chosen marine.
        # [batch_size, 2]
        pi_xy0 = train_model.pi_xy0
        pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024),
                                   axis=1)

        logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy0, labels=XY0)
        logpac_xy0 *= tf.stop_gradient(pac_weight)
        logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

        pi_xy1 = train_model.pi_xy1
        pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY1, depth=1024),
                                   axis=1)  # weight by the chosen xy1 coordinate

        # 1D? 2D?
        logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy1, labels=XY1)
        logpac_xy1 *= tf.stop_gradient(pac_weight)
        logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
        pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

        vf_ = tf.squeeze(train_model.vf)

        vf_r = tf.concat([
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
                         axis=0) * TD_TARGET
        vf_masked = vf_ * script_mask + vf_r

        #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

        vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
        entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
        entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
        entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
        entropy = entropy_a + entropy_xy0 + entropy_xy1

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        self.logits = logits = train_model.pi

        # xy0

        self.params_common = params_common = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
        self.params_xy0 = params_xy0 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='model/xy0') + params_common

        train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy0 = grads_xy0 = tf.gradients(
            train_loss_xy0, params_xy0)
        if max_grad_norm is not None:
            grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

        grads_xy0 = list(zip(grads_xy0, params_xy0))
        trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr,
                                                decay=alpha,
                                                epsilon=epsilon)
        _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

        # xy1

        self.params_xy1 = params_xy1 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='model/xy1') + params_common

        train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy1 = grads_xy1 = tf.gradients(
            train_loss_xy1, params_xy1)
        if max_grad_norm is not None:
            grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

        grads_xy1 = list(zip(grads_xy1, params_xy1))
        trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr,
                                                decay=alpha,
                                                epsilon=epsilon)
        _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
            advs = td_targets - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                XY0: xy0,
                XY1: xy1,
                ADV: advs,
                TD_TARGET: td_targets,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _, \
            policy_loss_xy0, policy_entropy_xy0, _, \
            policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train,
                 pg_loss_xy0, entropy_xy0, _train_xy0,
                 pg_loss_xy1, entropy_xy1, _train_xy1],
                td_map)
            return policy_loss, value_loss, policy_entropy, \
                   policy_loss_xy0, policy_entropy_xy0, \
                   policy_loss_xy1, policy_entropy_xy1

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        print("global_variables_initializer start")
        tf.global_variables_initializer().run(session=sess)
        print("global_variables_initializer complete")
Example #29
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(20e6),
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nbatch = nenvs * nsteps

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, discounted_rewards, rewards, masks,
                  prev_actions, actions, values, dones):
            advs = discounted_rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            # reshape actions, rewards, and dones to have first dimension of size nenvs*nsteps, existing second dimension
            # this is already done for obs
            rews = np.reshape(rewards, (nbatch, 1))
            ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1))
            if len(ac_space.shape) == 0:
                prev_actions = np.reshape(prev_actions, (nbatch, ))
                one_hot = np.eye(ac_space.n)[prev_actions]
                for i in range(nbatch):
                    if prev_actions[i] == -1:
                        one_hot[i, :] = 0.0  # no previous action yet: zero the one-hot row
                x = np.concatenate((obs, one_hot, rews, ds), axis=1)
                actions = np.reshape(actions, (nbatch, ))
            else:
                prev_actions = np.reshape(prev_actions,
                                          (nbatch, ac_space.shape[0]))
                x = np.concatenate((obs, prev_actions, rews, ds), axis=1)
            td_map = {
                train_model.X: x,
                A: actions,
                ADV: advs,
                R: discounted_rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
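
The train() above augments each observation with the previous action (one-hot for discrete spaces), the last reward, and the done flag before feeding the recurrent policy. A small NumPy sketch of that concatenation for a discrete action space; shapes and values are made up.

import numpy as np

nbatch, obs_dim, nact = 4, 3, 5
rng = np.random.default_rng(0)
obs = rng.normal(size=(nbatch, obs_dim))
prev_actions = np.array([-1, 2, 0, 4])                # -1 means "no previous action"
rewards = rng.normal(size=(nbatch, 1))
dones = np.array([[0.0], [0.0], [1.0], [0.0]], dtype=np.float32)

one_hot = np.eye(nact)[np.maximum(prev_actions, 0)]
one_hot[prev_actions == -1] = 0.0                     # zero row when there is no previous action
x = np.concatenate((obs, one_hot, rewards, dones), axis=1)
print(x.shape)                                        # (nbatch, obs_dim + nact + 2)
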
    def __init__(self,
                 policy,
                 ob_space,
                 action_space,
                 nenvs,
                 nsteps,
                 ent_coef,
                 vf_coef,
                 max_grad_norm):

        sess = tf.get_default_session()

        # Here we create the placeholders
        actions_ = tf.placeholder(tf.int32, [None], name="actions_")
        advantages_ = tf.placeholder(tf.float32, [None], name="advantages_")
        rewards_ = tf.placeholder(tf.float32, [None], name="rewards_")
        lr_ = tf.placeholder(tf.float32, name="learning_rate_")

        # Here we create our two models:
        # Step_model that is used for sampling
        step_model = policy(sess, ob_space, action_space, nenvs, 1, reuse=False)

        # Train model for training
        train_model = policy(sess, ob_space, action_space, nenvs*nsteps, nsteps, reuse=True)

        """
        Calculate the loss
        Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        """
        # Policy loss
        # Output -log(pi)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=actions_)

        # 1/n * sum A(si,ai) * -logpi(ai|si)
        pg_loss = tf.reduce_mean(advantages_ * neglogpac)

        # Value loss 1/2 SUM [R - V(s)]^2
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), rewards_))

        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())


        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Build our trainer
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr_, decay=0.99, epsilon=1e-5)

        # 4. Backpropagation
        _train = trainer.apply_gradients(grads)

        def train(states_in, actions, returns, values, lr):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # Returns = R + yV(s')
            advantages = returns - values

            # We create the feed dictionary
            td_map = {train_model.inputs_: states_in,
                     actions_: actions,
                     advantages_: advantages, # Use to calculate our policy loss
                     rewards_: returns, # Use as a bootstrap for real value
                     lr_: lr}

            policy_loss, value_loss, policy_entropy, _= sess.run([pg_loss, vf_loss, entropy, _train], td_map)
            
            return policy_loss, value_loss, policy_entropy


        def save(save_path):
            """
            Save the model
            """
            saver = tf.train.Saver()
            saver.save(sess, save_path)

        def load(load_path):
            """
            Load the model
            """
            saver = tf.train.Saver()
            print('Loading ' + load_path)
            saver.restore(sess, load_path)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
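
Every train() on this page expects returns that are already discounted and bootstrapped ("Returns = R + yV(s')" in the comments above); that work happens in the runner, which is not shown here. A hedged NumPy sketch of that computation for a single environment; the function name and gamma are illustrative.

import numpy as np

def discount_with_bootstrap(rewards, dones, last_value, gamma=0.99):
    """Discounted returns for one env, bootstrapping from last_value unless
    the rollout ended on a terminal step."""
    returns = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0 if dones[-1] else last_value
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running * (1.0 - dones[t])
        returns[t] = running
    return returns

rewards = np.array([1.0, 0.0, 0.0, 1.0])
dones = np.array([0.0, 0.0, 0.0, 0.0])
print(discount_with_bootstrap(rewards, dones, last_value=0.5))
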