Code Example #1
    def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
        #X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
        X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+2]) # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
        vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
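        # Perturbing the value prediction with unit Gaussian noise gives a "sampled" output;
        # regressing the prediction onto a stop_gradient of that sample (loss_sampled below)
        # is the surrogate loss K-FAC uses to estimate the Fisher/Gauss-Newton curvature.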
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = U.mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = U.mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
        self._predict = U.function([X], vpred_n)
        optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
                                    clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
                                    async=1, kfac_update=2, cold_iter=50, \
                                    weight_decay_dict=wd_dict, max_grad_norm=None)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
        self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
        U.initialize() # Initialize uninitialized TF variables
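
Example #1 above shows the usage pattern that recurs throughout this page: build an ordinary training loss plus a noise-perturbed "sampled" loss that defines the Fisher metric, pass both to kfac.KfacOptimizer.minimize to obtain an update op and a queue runner, then start the queue runner's threads. The following is a minimal standalone sketch of that pattern, assuming TF1 graph mode and the baselines.acktr kfac module; the placeholder shapes, the tf.layers network, and the hyperparameter values are illustrative assumptions and are not taken from any project listed here.

import numpy as np
import tensorflow as tf
from baselines.acktr import kfac  # assumed module path

ob_dim = 8  # illustrative observation size
X = tf.placeholder(tf.float32, shape=[None, ob_dim])
target = tf.placeholder(tf.float32, shape=[None])

with tf.variable_scope("vf"):
    h = tf.layers.dense(X, 64, activation=tf.nn.elu)
    vpred = tf.layers.dense(h, 1)[:, 0]

# Ordinary regression loss, plus the "sampled" loss (prediction regressed onto a
# noise-perturbed copy of itself) from which K-FAC estimates the Fisher.
loss = tf.reduce_mean(tf.square(vpred - target))
sample = vpred + tf.random_normal(tf.shape(vpred))
loss_sampled = tf.reduce_mean(tf.square(vpred - tf.stop_gradient(sample)))

optim = kfac.KfacOptimizer(learning_rate=0.001, momentum=0.9, clip_kl=0.3,
                           epsilon=0.1, stats_decay=0.95, kfac_update=2,
                           cold_iter=50, max_grad_norm=None)
vf_vars = [v for v in tf.trainable_variables() if "vf" in v.name]
update_op, q_runner = optim.minimize(loss, loss_sampled, var_list=vf_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()
    # Depending on the kfac version and async settings, q_runner may be None.
    threads = (q_runner.create_threads(sess, coord=coord, start=True)
               if q_runner is not None else [])
    obs = np.random.randn(32, ob_dim).astype(np.float32)
    tgt = np.random.randn(32).astype(np.float32)
    sess.run(update_op, {X: obs, target: tgt})
    coord.request_stop()
    coord.join(threads)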
Code Example #2
File: acktr.py  Project: MoritzTaylor/baselines-tf2
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):
        super(Model, self).__init__(name='ACKTRModel')

        nbatch = nenvs * nsteps

        # TODO: PolicyWithValue does this right? Original implementation uses 'nbatch'
        #self.model = step_model = policy(nenvs, 1)
        #self.model2 = train_model = policy(nbatch, nsteps)
        train_model = PolicyWithValue(ac_space,
                                      policy,
                                      value_network=None,
                                      estimate_q=False)

        self.ent_coef = ent_coef
        self.vf_coef = vf_coef
        self.vf_fisher_coef = vf_fisher_coef
        self.kfac_clip = kfac_clip

        self.is_async = is_async
        self.max_grad_norm = max_grad_norm
        self.total_timesteps = total_timesteps

        # TODO: Learning rate schedule and definition of optimizer
        #self.lrschedule = lrschedule
        lrschedule = LinearTimeDecay(initial_learning_rate=lr)  # TODO
        self.optim = kfac.KfacOptimizer(learning_rate=lrschedule, clip_kl=self.kfac_clip, \
                                        momentum=0.9, kfac_update=1, epsilon=0.01, \
                                        stats_decay=0.99, is_async=self.is_async, cold_iter=10,
                                        max_grad_norm=self.max_grad_norm)

        self.train_model = train_model
        #self.step_model = step_model
        self.step = self.train_model.step
        self.value = self.train_model.value
        self.initial_state = self.train_model.initial_state
Code Example #3
File: acktr.py  Project: yoniosin/A2C
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs * nsteps,
                                               nsteps,
                                               sess=sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
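        # pg_fisher_loss is the mean log-likelihood of the sampled actions, so its curvature
        # approximates the policy's Fisher information; vf_fisher_loss treats the value head
        # as a unit-variance Gaussian around a noisy sample of its own prediction. K-FAC
        # builds its preconditioner from the sum of these two terms.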
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr,
                VF_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Code Example #4
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          fname=None):
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    if fname != None and tf.train.checkpoint_exists(fname):
        load_result = U.load_state(fname)
        logger.log("Model loaded from file {}".format(fname))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr != None)
        enqueue_threads.extend(
            qr.create_threads(get_session(), coord=coord, start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Save model every 100 iterations
        if fname != None and (i % 100 == 99):
            U.save_state(fname)
            logger.log("Model saved to file {}".format(fname))
            env.seed()

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
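            # TD(0) residuals discounted by gamma*lam give the GAE(lambda) advantage
            # estimates; the value is bootstrapped with its last prediction unless the
            # path terminated.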
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            U.eval(
                tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            U.eval(
                tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)

        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
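
For context, learn() in Example #4 is normally driven by a small launcher that constructs the policy and the value function under the "pi" and "vf" variable scopes that the code above filters trainable variables by. A rough sketch of such a launcher follows; it assumes the GaussianMlpPolicy and NeuralNetValueFunction classes and module paths of the original OpenAI baselines ACKTR code (forks shown on this page may differ), and the environment name and hyperparameter values are illustrative only.

import gym
import tensorflow as tf
from baselines.acktr.policies import GaussianMlpPolicy        # assumed module path
from baselines.acktr.value_functions import NeuralNetValueFunction  # assumed module path

env = gym.make("Pendulum-v0")
with tf.Session(config=tf.ConfigProto()):
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    # The scope names matter: the code above selects trainable variables whose
    # names contain "pi" (policy) and "vf" (value function).
    with tf.variable_scope("vf"):
        vf = NeuralNetValueFunction(ob_dim, ac_dim)
    with tf.variable_scope("pi"):
        policy = GaussianMlpPolicy(ob_dim, ac_dim)
    # learn() as defined in Example #4 above.
    learn(env, policy=policy, vf=vf, gamma=0.99, lam=0.97,
          timesteps_per_batch=2500, num_timesteps=int(1e6),
          desired_kl=0.002, animate=False)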
Code Example #5
File: acktr_disc.py  Project: yatuzhang/baselines
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Code Example #6
    def __init__(self, policy, ob_space, ac_space, nenvs,
                 expert_nbatch,
                 total_timesteps,
                 nprocs=32, nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5, vf_fisher_coef=1.0, vf_expert_coef=0.5 * 0.0,
                 expert_coeff=1.0,
                 exp_adv_est='reward',
                 lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):

        # create tf stuff
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)

        # the actual model
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        A_EXP = tf.placeholder(tf.int32, [expert_nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        ADV_EXP = tf.placeholder(tf.float32, [expert_nbatch])

        R = tf.placeholder(tf.float32, [nbatch])
        R_EXP = tf.placeholder(tf.float32, [expert_nbatch])

        PG_LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        eval_step_model = policy(sess, ob_space, ac_space, 1, 1, reuse=True)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
        expert_train_model = policy(sess, ob_space, ac_space, expert_nbatch, 1, reuse=True)
        logpac_expert = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=expert_train_model.pi, labels=A_EXP)
        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)

        _, acc = tf.metrics.accuracy(labels=A,
                                     predictions=tf.argmax(train_model.pi, 1))

        ## training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        pg_expert_loss = tf.reduce_mean(ADV_EXP * logpac_expert)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        vf_expert_loss = tf.reduce_mean(mse(tf.squeeze(expert_train_model.vf), R_EXP))
        train_loss = pg_loss + vf_coef * vf_loss + expert_coeff * pg_expert_loss + vf_expert_coef * vf_expert_loss

        self.check = check = tf.add_check_numerics_ops()

        ## Fisher loss construction
        pg_fisher_loss = -tf.reduce_mean(logpac)  # + logpac_expert)
        # pg_expert_fisher_loss = -tf.reduce_mean(logpac_expert)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        vf_fisher_loss = - vf_fisher_coef * tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=PG_LR, clip_kl=kfac_clip,
                momentum=0.9, kfac_update=1, epsilon=0.01,
                stats_decay=0.99, async=1, cold_iter=20, max_grad_norm=max_grad_norm
            )

            # why is this unused?
            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values,
                  expert_obs, expert_rewards, expert_actions, expert_values):
            if exp_adv_est == 'critic':
                expert_advs = np.clip(expert_rewards - expert_values, a_min=0, a_max=None)
            elif exp_adv_est == 'reward':
                expert_advs = expert_rewards
            elif exp_adv_est == 'simple':
                expert_advs = np.ones_like(expert_rewards)
            else:
                raise ValueError("Unknown expert advantage estimator {}".format(exp_adv_est))

            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X:obs,
                expert_train_model.X: expert_obs,
                A_EXP: expert_actions,
                A:actions,
                ADV:advs,
                ADV_EXP: expert_advs,
                R:rewards,
                PG_LR:cur_lr,
                R_EXP: expert_rewards
            }

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy, _, grads_to_check = sess.run(
                [pg_loss, pg_expert_loss, vf_loss, entropy, acc, train_op, grads],
                td_map
            )

            for grad in grads_to_check:
                if np.isnan(grad).any():
                    print("ojojoj grad is nan")

            return policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy

        def save(save_path):
            print("Writing model to {}".format(save_path))
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        def eval_step(obs, eval_type):
            td_map = {eval_step_model.X: [obs]}
            logits = sess.run(eval_step_model.pi, td_map)[0]
            if eval_type == 'argmax':
                act = logits.argmax()
                if np.random.rand() < 0.01:
                    act = ac_space.sample()
                return act
            elif eval_type == 'prob':
                # probs = func(s[None, :, :, :])[0][0]
                x = logits
                e_x = np.exp(x - np.max(x))
                probs = e_x / e_x.sum(axis=0)
                act = np.random.choice(range(probs.shape[-1]), 1, p=probs)[0]
                return act
            else:
                raise ValueError("Unknown eval type {}".format(eval_type))

        self.model = step_model
        self.model2 = train_model
        self.expert_train_model = expert_train_model
        self.vf_fisher = vf_fisher_loss
        self.pg_fisher = pg_fisher_loss
        self.joint_fisher = joint_fisher_loss
        self.params = params
        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.eval_step = eval_step
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
        tf.local_variables_initializer().run(session=sess)
Code Example #7
    def __init__(self, ob_dim, ac_dim):
        """
        Create an MLP policy for a value function

        :param ob_dim: (int) Observation dimension
        :param ac_dim: (int) action dimension
        """
        obs_ph = tf.placeholder(tf.float32,
                                shape=[None, ob_dim * 2 + ac_dim * 2 + 2
                                       ])  # batch of observations
        vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
        wd_dict = {}
        layer_1 = tf.nn.elu(
            dense(obs_ph,
                  64,
                  "h1",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        layer_2 = tf.nn.elu(
            dense(layer_1,
                  64,
                  "h2",
                  weight_init=tf_util.normc_initializer(1.0),
                  bias_init=0,
                  weight_loss_dict=wd_dict))
        vpred_n = dense(layer_2,
                        1,
                        "hfinal",
                        weight_init=tf_util.normc_initializer(1.0),
                        bias_init=0,
                        weight_loss_dict=wd_dict)[:, 0]
        sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
        wd_loss = tf.get_collection("vf_losses", None)
        loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
        loss_sampled = tf.reduce_mean(
            tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))

        self._predict = tf_util.function([obs_ph], vpred_n)

        optim = kfac.KfacOptimizer(learning_rate=0.001,
                                   cold_lr=0.001 * (1 - 0.9),
                                   momentum=0.9,
                                   clip_kl=0.3,
                                   epsilon=0.1,
                                   stats_decay=0.95,
                                   async=1,
                                   kfac_update=2,
                                   cold_iter=50,
                                   weight_decay_dict=wd_dict,
                                   max_grad_norm=None)
        vf_var_list = []
        for var in tf.trainable_variables():
            if "vf" in var.name:
                vf_var_list.append(var)

        update_op, self.q_runner = optim.minimize(loss,
                                                  loss_sampled,
                                                  var_list=vf_var_list)
        self.do_update = tf_util.function([obs_ph, vtarg_n], update_op)  # pylint: disable=E1101
        tf_util.initialize()  # Initialize uninitialized TF variables
Code Example #8
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          fname='./training.ckpt'):

    mean_logger = setup_logger("Mean Logger", "log/episode_mean.txt")

    # print("Filter shape:  ", env.observation_space.shape)
    space = (env.observation_space.shape[0] * 2, )
    obfilter = ZFilter(space)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')  #0.03
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    #changes
    if fname != None and tf.train.checkpoint_exists(fname):
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), fname)
        logger.log("Model loaded from file {}".format(fname))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None, "QR is None"
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    total_reward = float()
    while True:
        print("Timestep Number: %d of %d" % (timesteps_so_far, num_timesteps))
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        #Save model every 100 iterations
        if fname != None and (i % 100 == 0):
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)
            logger.log("Model saved to file {}".format(fname))
            env.seed()

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        terminal_rew = []
        while True:
            path, temp_rew = rollout(env,
                                     policy,
                                     max_pathlength,
                                     animate=(len(paths) == 0 and (i % 10 == 0)
                                              and animate),
                                     obfilter=obfilter)
            paths.append(path)
            terminal_rew.append(np.array(temp_rew))
            n = pathlength(path)
            timesteps_this_batch += n
            if timesteps_this_batch > timesteps_per_batch:
                break
        timesteps_so_far += 1

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        terminal_rew = np.array(terminal_rew)
        rew_mean = np.mean([path.sum() for path in terminal_rew])
        rew_sem = np.std(
            [path.sum() / np.sqrt(len(terminal_rew)) for path in terminal_rew])
        len_mean = np.mean([path.shape[0] for path in terminal_rew])

        # rewList = []
        # for path in paths:
        #     trew = []
        #     rew_i = 0
        #     while True:
        #         trew.append(path["reward"][rew_i])
        #         rew_i += 11
        #         if rew_i > (len(path["reward"])-1):
        #             break
        #     rewList.append( np.array(trew) )
        # rewList = np.array(rewList)

        # rew_mean = np.mean([path.sum() for path in rewList])
        # rew_sem = np.std([path.sum()/np.sqrt(len(rewList)) for path in rewList])
        # len_mean = np.mean([path.shape[0] for path in rewList])

        # rew_mean = np.mean([path["reward"].sum() for path in paths])
        # rew_sem = np.std([path["reward"].sum()/np.sqrt(len(paths)) for path in paths])
        # len_mean = np.mean([pathlength(path) for path in paths])

        total_reward += rew_mean

        logger.record_tabular("EpRewMean", rew_mean)
        logger.record_tabular("EpRewSEM", rew_sem)
        logger.record_tabular("EpLenMean", len_mean)
        logger.record_tabular("TotalRewardMean", total_reward)
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()

        mean_logger.info(
            "Result for episode {}  of {}: Sum: {}, Average: {}, Length: {}".
            format(timesteps_so_far, num_timesteps, rew_mean, rew_sem,
                   len_mean))

        i += 1

    if fname != None:
        os.makedirs(os.path.dirname(fname), exist_ok=True)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), fname)
        logger.log("Model saved to file {}".format(fname))
        env.seed()
    coord.request_stop()
    coord.join(enqueue_threads)
Code Example #9
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          lr=0.03,
          momentum=0.9):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(lr)),
                           name='stepsize')
    stepsize_mul = tf.placeholder(tf.float32, shape=None)
    inputs, loss, loss_sampled = policy.update_info
    inputs = list(inputs)
    inputs.append(stepsize_mul)
    optim = kfac.KfacOptimizer(learning_rate=stepsize * stepsize_mul, cold_lr=stepsize * stepsize_mul *(1-0.9), momentum=momentum, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)

    grads = optim.compute_gradients(loss, pi_var_list)
    grads = [g[0] for g in grads]
    old_var = [
        tf.Variable(initial_value=tf.zeros_like(v)) for v in pi_var_list
    ]
    old_to_new = tf.group(
        *[tf.assign(v, o) for v, o in zip(pi_var_list, old_var)])
    old_from_new = tf.group(
        *[tf.assign(o, v) for v, o in zip(pi_var_list, old_var)])

    do_old_var = U.function([], old_var)
    do_pi_var = U.function([], pi_var_list)
    do_old_from_new = U.function([], old_from_new)
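    # Midpoint-style update: the gradients are evaluated at the current (half-step)
    # parameters, the saved parameters are copied back via old_to_new, and only then
    # are those gradients applied by apply_gradients.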
    with tf.control_dependencies(grads):
        with tf.control_dependencies([old_to_new]):
            midpoint_op, q_runner_mid = optim.apply_gradients(
                list(zip(grads, pi_var_list)))
    do_midpoint = U.function(inputs, midpoint_op)

    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr != None)
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_old_from_new()
        # print(do_old_var())
        do_update(ob_no, action_na, standardized_adv_n, 0.5)
        do_midpoint(ob_no, action_na, standardized_adv_n, 1.0)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        # if kl > desired_kl * 2:
        #     logger.log("kl too high")
        #     tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        # elif kl < desired_kl / 2:
        #     logger.log("kl too low")
        #     tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        # else:
        #     logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
Code Example #10
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          lr=0.03,
          momentum=0.9):
    ob_dim, ac_dim = policy.ob_dim, policy.ac_dim
    dbpi = GaussianMlpPolicy(ob_dim, ac_dim, 'dbp')
    oldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'oe')
    dboldpi = GaussianMlpPolicy(ob_dim, ac_dim, 'doi')
    # with tf.variable_scope('dbp'):
    # with tf.variable_scope('oe'):
    # with tf.variable_scope('doi'):

    pi = policy

    do_std = U.function([], [pi.std_1a, pi.logstd_1a])

    kloldnew = oldpi.pd.kl(pi.pd)
    dbkloldnew = dboldpi.pd.kl(dbpi.pd)
    dist = meankl = tf.reduce_mean(kloldnew)
    dbkl = tf.reduce_mean(dbkloldnew)
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(lr)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info

    var_list = [v for v in tf.global_variables() if "pi" in v.name]
    db_var_list = [v for v in tf.global_variables() if "dbp" in v.name]
    old_var_list = [v for v in tf.global_variables() if "oe" in v.name]
    db_old_var_list = [v for v in tf.global_variables() if "doi" in v.name]
    print(len(var_list), len(db_var_list), len(old_var_list),
          len(db_old_var_list))
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv, newv) in zipsame(old_var_list, var_list)
        ])
    assign_db = U.function(
        [], [],
        updates=[
            tf.assign(db, o) for (db, o) in zipsame(db_var_list, var_list)
        ] + [
            tf.assign(dbold, dbnew)
            for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)
        ])

    assign_old_eq_newr = U.function(
        [], [],
        updates=[
            tf.assign(newv, oldv)
            for (oldv, newv) in zipsame(old_var_list, var_list)
        ])
    # assign_dbr = U.function([], [], updates=
    # [tf.assign(o, db) for (db, o) in zipsame(db_var_list, var_list)] +
    # [tf.assign(dbnew, dbold) for (dbold, dbnew) in zipsame(db_old_var_list, old_var_list)])

    klgrads = tf.gradients(dist, var_list)
    dbklgrads = tf.gradients(dbkl, db_var_list)
    p_grads = [tf.ones_like(v) for v in dbklgrads]

    get_flat = U.GetFlat(var_list)
    get_old_flat = U.GetFlat(old_var_list)
    set_from_flat = U.SetFromFlat(var_list)

    flat_tangent2 = tf.placeholder(dtype=tf.float32,
                                   shape=[None],
                                   name="flat_tan2")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents2 = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents2.append(tf.reshape(flat_tangent2[start:start + sz], shape))
        start += sz
    gvp2 = tf.add_n([
        tf.reduce_sum(g * tangent2)
        for (g, tangent2) in zipsame(dbklgrads, tangents2)
    ])
    gvp2_grads = tf.gradients(gvp2, db_var_list)

    neg_term = tf.add_n([
        tf.reduce_sum(g * tangent2)
        for (g, tangent2) in zipsame(gvp2_grads, tangents2)
    ]) / 2.
    ng1 = tf.gradients(neg_term, db_var_list)
    ng2 = tf.gradients(neg_term, db_old_var_list)

    neg_term_grads = [
        a + b for (a, b) in zip(tf.gradients(neg_term, db_var_list),
                                tf.gradients(neg_term, db_old_var_list))
    ]
    neg_term = neg_term_grads
    # neg_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in neg_term_grads])

    pos_term = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(gvp2_grads, p_grads)
    ])
    pos_term_grads = [
        a + b for (a, b) in zip(tf.gradients(pos_term, db_var_list),
                                tf.gradients(pos_term, db_old_var_list))
    ]
    pos_term_sum = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(pos_term_grads, tangents2)
    ])
    pos_term_grads = tf.gradients(pos_term_sum, p_grads)
    pos_term = pos_term_grads
    # pos_term = tf.concat(axis=0, values=[tf.reshape(v, [U.numel(v)]) for v in pos_term_grads])
    geo_term = [(p - n) * 0.5 for p, n in zip(pos_term, neg_term)]

    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=momentum, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    grads = optim.compute_gradients(loss, var_list=pi_var_list)
    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    geo_term = [g1 + g2[0] for g1, g2 in zip(geo_term, grads)]
    geo_grads = list(zip(geo_term, var_list))
    update_geo_op, q_runner_geo = optim.apply_gradients(geo_grads)
    do_update = U.function(inputs, update_op)
    inputs_tangent = list(inputs) + [flat_tangent2]
    do_update_geo = U.function(inputs_tangent, update_geo_op)
    do_get_geo_term = U.function(inputs_tangent, [ng1, ng2])
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner, q_runner_geo]:
        assert (qr != None)
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        assign_old_eq_new()  # set old parameter values to new parameter values
        assign_db()

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)
        # ft2 = get_flat() - get_old_flat()

        # assign_old_eq_newr() # assign back
        # gnp = do_get_geo_term(ob_no, action_na, standardized_adv_n, ft2)

        # def check_nan(bs):
        #     return [~np.isnan(b).all() for b in bs]

        # print(gnp[0])
        # print('.....asdfasdfadslfkadsjfaksdfalsdkfjaldskf')
        # print(gnp[1])
        # do_update_geo(ob_no, action_na, standardized_adv_n, ft2)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        # if kl > desired_kl * 2:
        #     logger.log("kl too high")
        #     tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)).eval()
        # elif kl < desired_kl / 2:
        #     logger.log("kl too low")
        #     tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)).eval()
        # else:
        #     logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        print(do_std())
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
Code Example #11
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          save_path="./",
          save_after=200,
          load_path=None,
          save_rollouts=False):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert (qr != None)
        enqueue_threads.extend(
            qr.create_threads(U.get_session(), coord=coord, start=True))

    if load_path != None:
        saver = tf.train.Saver()
        saver.restore(U.get_session(), os.path.join(load_path, "model.ckpt"))
        obfilter_path = os.path.join(load_path, "obfilter.pkl")
        with open(obfilter_path, 'rb') as obfilter_input:
            obfilter = pickle.load(obfilter_input)
        print("Loaded Model")
    else:
        # create saver
        saver = tf.train.Saver()

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            #path = rollout(env, policy, max_pathlength, animate=(len(paths)==0 and (i % 10 == 0) and animate), obfilter=obfilter)
            if "jaco" in env.spec.id.lower():
                path = rollout(env,
                               policy,
                               max_pathlength,
                               animate=animate,
                               obfilter=obfilter,
                               save_rollouts=save_rollouts)
                goal_dist = np.linalg.norm(env.env.env.get_body_com("jaco_link_hand") \
                            - env.env.env.get_body_com("target"))
                if goal_dist <= 0.12:
                    print("goal_dist {} ; episode added".format(goal_dist))
                    paths.append(path)
            else:
                path = rollout(env,
                               policy,
                               max_pathlength,
                               animate=animate,
                               obfilter=obfilter,
                               save_rollouts=save_rollouts)

            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        if save_rollouts:
            # save the rollouts
            rollouts_path = os.path.join(load_path, "rollouts-v2.pkl")
            with open(rollouts_path, 'wb') as rollouts_output:
                pickle.dump(paths, rollouts_output, pickle.HIGHEST_PROTOCOL)
            sys.exit()

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        logp_n = np.concatenate([path["logp"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            U.eval(
                tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            U.eval(
                tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)
        if callback:
            callback()
        logger.dump_tabular()
        # save model if necessary
        if i % save_after == 0:
            save(saver, obfilter, save_path)
        i += 1
Code Example #12
File: a2c_kfac_fm.py  Project: goodbyeearth/old_pmm
    def __init__(self,
                 policy,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):

        sess = get_session()
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)
            eval_model = policy(1, 1, sess)

        # A = train_model.pdtype.sample_placeholder([None])
        # A = tf.placeholder(step_model.action.dtype, step_model.action.shape)
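        # Sample a single action from the policy softmax and use its log-probability as
        # the Fisher loss, so K-FAC accumulates curvature statistics for the policy
        # parameters only (the final four variables are excluded in the var_list below).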
        probs = tf.nn.softmax(step_model.pi)
        class_ind = tf.to_int32(tf.multinomial(tf.log(probs), 1)[0][0])
        self.pg_fisher = pg_fisher_loss = tf.log(probs[0, class_ind])

        ##Fisher loss construction
        # self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        # sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        # self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        # self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss

        self.params = params = find_trainable_variables("a2c_model")

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer()

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            stats = optim.compute_and_apply_stats(joint_fisher_loss,
                                                  var_list=params[:-4])

        def compute_fisher(obs):
            # action = action[:, np.newaxis]
            td_map = {step_model.X: obs, step_model.keep_prob: 1.0}

            fisher = sess.run(stats, td_map)
            return fisher

        self.compute_fisher = compute_fisher
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Code example #13
File: acktr_cont.py Project: kkonen/baselines
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002,
          save_model_with_prefix=None,
          restore_model_from_file=None,
          outdir="/tmp/rosrl/experiments/continuous/acktr/"):

    obfilter = ZFilter(env.observation_space.shape)
    # Risto change
    max_pathlength = env.max_episode_steps
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1-0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async_=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()
    """
    Here we add a possibility to resume from a previously saved model if a model file is provided
    """
    if restore_model_from_file:
        saver = tf.train.Saver()
        saver.restore(tf.get_default_session(), restore_model_from_file)
        logger.log("Loaded model from {}".format(restore_model_from_file))

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(tf.get_default_session(),
                              coord=coord,
                              start=True))

    i = 0
    timesteps_so_far = 0

    if save_model_with_prefix:
        # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
        summary_writer = tf.summary.FileWriter(outdir,
                                               graph=tf.get_default_graph())

    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)

        if callback:
            callback()
        logger.dump_tabular()
        """
        Save the model at every itteration
        """
        if save_model_with_prefix:
            if np.mean([path["reward"].sum() for path in paths]) > -50.0:
                # basePath = '/tmp/rosrl/' + str(env.__class__.__name__) +'/acktr/'
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="EpRewMean",
                                     simple_value=np.mean([
                                         path["reward"].sum() for path in paths
                                     ]))
                ])
                summary_writer.add_summary(summary, i)
                if not os.path.exists(outdir):
                    os.makedirs(outdir)
                modelF = outdir + '/' + save_model_with_prefix + "_afterIter_" + str(
                    i) + ".model"
                U.save_state(modelF)
                logger.log("Saved model to file :{}".format(modelF))

        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
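
The advantage-estimation block in the continuous ACKTR examples computes discounted returns as value-function targets and GAE-style advantages from one-step TD residuals. A minimal NumPy sketch of those two steps, assuming common.discount(x, gamma) is the usual discounted cumulative sum (the helper name discount_cumsum below is ours):

import numpy as np

def discount_cumsum(x, gamma):
    # y[t] = x[t] + gamma * x[t+1] + gamma^2 * x[t+2] + ...
    out = np.zeros_like(x, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out

rew = np.array([1.0, 0.0, 1.0])
vpred = np.array([0.5, 0.4, 0.6])
gamma, lam, terminated = 0.99, 0.97, True

vtarg = discount_cumsum(rew, gamma)                             # value-function targets
vpred_ext = np.append(vpred, 0.0 if terminated else vpred[-1])  # bootstrap the last value
delta = rew + gamma * vpred_ext[1:] - vpred_ext[:-1]            # TD residuals
adv = discount_cumsum(delta, gamma * lam)                       # GAE advantages
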
Code example #14
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        #nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])

        SUB3 = tf.placeholder(tf.int32, [nbatch])
        SUB4 = tf.placeholder(tf.int32, [nbatch])
        SUB5 = tf.placeholder(tf.int32, [nbatch])
        SUB6 = tf.placeholder(tf.int32, [nbatch])
        SUB7 = tf.placeholder(tf.int32, [nbatch])
        SUB8 = tf.placeholder(tf.int32, [nbatch])
        SUB9 = tf.placeholder(tf.int32, [nbatch])
        SUB10 = tf.placeholder(tf.int32, [nbatch])
        SUB11 = tf.placeholder(tf.int32, [nbatch])
        SUB12 = tf.placeholder(tf.int32, [nbatch])

        X0 = tf.placeholder(tf.int32, [nbatch])
        Y0 = tf.placeholder(tf.int32, [nbatch])
        X1 = tf.placeholder(tf.int32, [nbatch])
        Y1 = tf.placeholder(tf.int32, [nbatch])
        X2 = tf.placeholder(tf.int32, [nbatch])
        Y2 = tf.placeholder(tf.int32, [nbatch])

        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)



        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub3, labels=SUB3) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub4, labels=SUB4) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub5, labels=SUB5) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub6, labels=SUB6) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub7, labels=SUB7) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub8, labels=SUB8) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub9, labels=SUB9) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub10, labels=SUB10) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub11, labels=SUB11) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub12, labels=SUB12) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x0, labels=X0) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y0, labels=Y0) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x1, labels=X1) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y1, labels=Y1) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x2, labels=X2) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y2, labels=Y2)

        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac) * tf.reduce_mean(ADV)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub3)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub4)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub5)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub6)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub7)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub8)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub9)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub10)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub11)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub12)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_x0)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_y0)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_x1)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_y1)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_x2)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_y2))

        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip, \
                                                    momentum=0.9, kfac_update=1, epsilon=0.01, \
                                                    stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, sub3, sub4, sub5, sub6,
                  sub7, sub8, sub9, sub10, sub11, sub12, x0, y0, x1, y1, x2,
                  y2, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                SUB3: sub3,
                SUB4: sub4,
                SUB5: sub5,
                SUB6: sub6,
                SUB7: sub7,
                SUB8: sub8,
                SUB9: sub9,
                SUB10: sub10,
                SUB11: sub11,
                SUB12: sub12,
                X0: x0,
                Y0: y0,
                X1: x1,
                Y1: y1,
                X2: x2,
                Y2: y2,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            print("policy_loss : ", policy_loss, " value_loss : ", value_loss,
                  " entropy : ", entropy)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        print("global_variables_initializer start")
        tf.global_variables_initializer().run(session=sess)
        print("global_variables_initializer complete")
Code example #15
File: acktr_cont.py Project: james/baselines
def learn(env,
          policy,
          vf,
          gamma,
          lam,
          timesteps_per_batch,
          resume,
          logdir,
          agentName,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):

    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize, cold_lr=stepsize*(1.0 - 0.9), momentum=0.9, kfac_update=2,\
                                epsilon=1e-2, stats_decay=0.99, async=1, cold_iter=1,
                                weight_decay_dict=policy.wd_dict, max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = U.function(inputs, update_op)
    U.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for qr in [q_runner, vf.q_runner]:
        assert qr is not None
        enqueue_threads.extend(
            qr.create_threads(U.get_session(), coord=coord, start=True))

    timesteps_so_far = 0
    saver = tf.train.Saver(max_to_keep=10)
    if resume > 0:
        saver.restore(
            tf.get_default_session(),
            os.path.join(os.path.abspath(logdir),
                         "{}-{}".format(agentName, resume)))
        ob_filter_path = os.path.join(os.path.abspath(logdir),
                                      "{}-{}".format('obfilter', resume))
        with open(ob_filter_path, 'rb') as ob_filter_input:
            obfilter = pickle.load(ob_filter_input)
            print("Loaded observation filter")
    iters_so_far = resume

    print('logdir = ', logdir)
    logF = open(os.path.join(logdir, 'log.txt'), 'a')
    logF2 = open(os.path.join(logdir, 'log_it.txt'), 'a')
    logStats = open(os.path.join(logdir, 'log_stats.txt'), 'a')

    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        save_interval = 5

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0
                                    and (iters_so_far % save_interval == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            n = pathlength(path)
            timesteps_this_batch += n
            timesteps_so_far += n
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = vf.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        vf.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)

        # Adjust stepsize
        kl = policy.compute_kl(ob_no, oldac_dist)
        if kl > desired_kl * 2.0:
            logger.log("kl too high")
            U.eval(
                tf.assign(stepsize, tf.maximum(min_stepsize, stepsize / 1.5)))
        elif kl < desired_kl / 2.0:
            logger.log("kl too low")
            U.eval(
                tf.assign(stepsize, tf.minimum(max_stepsize, stepsize * 1.5)))
        else:
            logger.log("kl just right!")

        rew_mean = np.mean([path["reward"].sum() for path in paths])
        logger.record_tabular("EpRewMean", rew_mean)
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular("EpLenMean",
                              np.mean([pathlength(path) for path in paths]))
        logger.record_tabular("KL", kl)

        logF.write(str(rew_mean) + "\n")
        logF2.write(str(iters_so_far) + "," + str(rew_mean) + "\n")
        #   json.dump(combined_stats, logStats)
        logF.flush()
        logF2.flush()
        #   logStats.flush()

        if save_interval and (iters_so_far % save_interval == 0
                              or iters_so_far == 1):
            saver.save(tf.get_default_session(),
                       os.path.join(logdir, agentName),
                       global_step=iters_so_far)
            ob_filter_path = os.path.join(
                os.path.abspath(logdir),
                "{}-{}".format('obfilter', iters_so_far))
            with open(ob_filter_path, 'wb') as ob_filter_output:
                pickle.dump(obfilter, ob_filter_output,
                            pickle.HIGHEST_PROTOCOL)

        if callback:
            callback()
        logger.dump_tabular()
        iters_so_far += 1

    coord.request_stop()
    coord.join(enqueue_threads)
Code example #16
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_envs,
                 total_timesteps,
                 nprocs=32,
                 n_steps=20,
                 ent_coef=0.01,
                 vf_coef=0.25,
                 vf_fisher_coef=1.0,
                 learning_rate=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lr_schedule='linear'):
        """
        The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

        :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) The observation space
        :param ac_space: (Gym Space) The action space
        :param n_envs: (int) The number of environments
        :param total_timesteps: (int) The total number of timesteps for training the model
        :param nprocs: (int) The number of threads for TensorFlow operations
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) The weight for the entropic loss
        :param vf_coef: (float) The weight for the loss on the value function
        :param vf_fisher_coef: (float) The weight for the fisher loss on the value function
        :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
        :param max_grad_norm: (float) The clipping value for the maximum gradient
        :param kfac_clip: (float) gradient clipping threshold for the Kullback-Leibler divergence
        :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        n_batch = n_envs * n_steps
        action_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        pg_lr_ph = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         n_envs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           n_envs * n_steps,
                                           n_steps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=action_ph)
        self.logits = train_model.policy

        # training loss
        pg_loss = tf.reduce_mean(advs_ph * logpac)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.value_fn + tf.random_normal(
            tf.shape(train_model.value_fn))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=pg_lr_ph,
                clip_kl=kfac_clip,
                momentum=0.9,
                kfac_update=1,
                epsilon=0.01,
                stats_decay=0.99,
                async=1,
                cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.learning_rate = Scheduler(initial_value=learning_rate,
                                       n_values=total_timesteps,
                                       schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for _ in range(len(obs)):
                cur_lr = self.learning_rate.value()

            td_map = {
                train_model.obs_ph: obs,
                action_ph: actions,
                advs_ph: advs,
                rewards_ph: rewards,
                pg_lr_ph: cur_lr
            }
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            session_params = sess.run(params)
            joblib.dump(session_params, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
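
The Scheduler above hands the K-FAC optimizer a learning rate that, under the 'linear' schedule named in the docstring, decays from the initial value toward zero as training consumes total_timesteps. A minimal sketch of that behavior (the class name LinearSchedule is ours, and this approximates the schedule's intent rather than reproducing the library's exact implementation):

class LinearSchedule:
    def __init__(self, initial_value, n_values):
        self.initial_value = initial_value
        self.n_values = n_values
        self.step = 0

    def value(self):
        # The remaining fraction of training shrinks linearly with each call.
        frac = max(1.0 - self.step / self.n_values, 0.0)
        self.step += 1
        return self.initial_value * frac

sched = LinearSchedule(initial_value=0.25, n_values=1_000_000)
cur_lr = sched.value()   # ~0.25 early in training, approaching 0.0 near the end
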
Code example #17
def learn(env,
          policy,
          value_fn,
          gamma,
          lam,
          timesteps_per_batch,
          num_timesteps,
          animate=False,
          callback=None,
          desired_kl=0.002):
    """
    Trains an ACKTR model.

    :param env: (Gym environment) The environment to learn from
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param value_fn: (Object) The value function model to use (MLP, CNN, LSTM, ...)
    :param gamma: (float) The discount value
    :param lam: (float) the GAE (generalized advantage estimation) bias-variance tradeoff factor
    :param timesteps_per_batch: (int) the number of timesteps for each batch
    :param num_timesteps: (int) the total number of timesteps to run
    :param animate: (bool) if render env
    :param callback: (function) called every step, used for logging and saving
    :param desired_kl: (float) the target Kullback-Leibler divergence used to adapt the stepsize
    """
    obfilter = ZFilter(env.observation_space.shape)

    max_pathlength = env.spec.timestep_limit
    stepsize = tf.Variable(initial_value=np.float32(np.array(0.03)),
                           name='stepsize')
    inputs, loss, loss_sampled = policy.update_info
    optim = kfac.KfacOptimizer(learning_rate=stepsize,
                               cold_lr=stepsize * (1 - 0.9),
                               momentum=0.9,
                               kfac_update=2,
                               epsilon=1e-2,
                               stats_decay=0.99,
                               async=1,
                               cold_iter=1,
                               weight_decay_dict=policy.wd_dict,
                               max_grad_norm=None)
    pi_var_list = []
    for var in tf.trainable_variables():
        if "pi" in var.name:
            pi_var_list.append(var)

    update_op, q_runner = optim.minimize(loss,
                                         loss_sampled,
                                         var_list=pi_var_list)
    do_update = tf_util.function(inputs, update_op)
    tf_util.initialize()

    # start queue runners
    enqueue_threads = []
    coord = tf.train.Coordinator()
    for queue_runner in [q_runner, value_fn.q_runner]:
        assert queue_runner is not None
        enqueue_threads.extend(
            queue_runner.create_threads(tf.get_default_session(),
                                        coord=coord,
                                        start=True))

    i = 0
    timesteps_so_far = 0
    while True:
        if timesteps_so_far > num_timesteps:
            break
        logger.log("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            path = rollout(env,
                           policy,
                           max_pathlength,
                           animate=(len(paths) == 0 and (i % 10 == 0)
                                    and animate),
                           obfilter=obfilter)
            paths.append(path)
            timesteps_this_batch += path["reward"].shape[0]
            timesteps_so_far += path["reward"].shape[0]
            if timesteps_this_batch > timesteps_per_batch:
                break

        # Estimate advantage function
        vtargs = []
        advs = []
        for path in paths:
            rew_t = path["reward"]
            return_t = common.discount(rew_t, gamma)
            vtargs.append(return_t)
            vpred_t = value_fn.predict(path)
            vpred_t = np.append(vpred_t,
                                0.0 if path["terminated"] else vpred_t[-1])
            delta_t = rew_t + gamma * vpred_t[1:] - vpred_t[:-1]
            adv_t = common.discount(delta_t, gamma * lam)
            advs.append(adv_t)
        # Update value function
        value_fn.fit(paths, vtargs)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        action_na = np.concatenate([path["action"] for path in paths])
        oldac_dist = np.concatenate([path["action_dist"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)

        # Policy update
        do_update(ob_no, action_na, standardized_adv_n)

        min_stepsize = np.float32(1e-8)
        max_stepsize = np.float32(1e0)
        # Adjust stepsize
        kl_loss = policy.compute_kl(ob_no, oldac_dist)
        if kl_loss > desired_kl * 2:
            logger.log("kl too high")
            tf.assign(stepsize, tf.maximum(min_stepsize,
                                           stepsize / 1.5)).eval()
        elif kl_loss < desired_kl / 2:
            logger.log("kl too low")
            tf.assign(stepsize, tf.minimum(max_stepsize,
                                           stepsize * 1.5)).eval()
        else:
            logger.log("kl just right!")

        logger.record_tabular(
            "EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logger.record_tabular(
            "EpRewSEM",
            np.std([
                path["reward"].sum() / np.sqrt(len(paths)) for path in paths
            ]))
        logger.record_tabular(
            "EpLenMean", np.mean([path["reward"].shape[0] for path in paths]))
        logger.record_tabular("KL", kl_loss)
        if callback:
            callback()
        logger.dump_tabular()
        i += 1

    coord.request_stop()
    coord.join(enqueue_threads)
Code example #18
    def __init__(self, x, y_):

        self.x = x
        input = tf.reshape(self.x, [-1, 28, 28, 1])  # reshape the flat input placeholder to an NHWC image

        with tf.variable_scope('conv1'):
            # convolutional network: two conv/pool blocks followed by two fully connected layers
            W1 = weight_variable([5, 5, 1, 32])
            b1 = bias_variable([32])
            h_conv1 = tf.nn.relu(conv2d(input, W1) + b1)
            h_pool1 = max_pool_2_2((h_conv1))

        with tf.variable_scope('conv2'):
            W2 = weight_variable([5, 5, 32, 64])
            b2 = bias_variable([64])
            h_conv2 = tf.nn.relu(conv2d(h_pool1, W2) + b2)
            h_pool2 = max_pool_2_2(h_conv2)

        with tf.variable_scope('fc1'):
            h_pool2_flatten = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
            W3 = weight_variable([7 * 7 * 64, 1024])
            b3 = bias_variable([1024])
            h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flatten, W3) + b3)

        with tf.variable_scope('output'):
            W4 = weight_variable([1024, 10])
            b4 = bias_variable([10])
            self.y = tf.matmul(h_fc1, W4) + b4

        self.var_list = [W1, b1, W2, b2, W3, b3, W4, b4]

        # in_dim = int(x.get_shape()[1])  # 784 for MNIST
        # out_dim = int(y_.get_shape()[1])  # 10 for MNIST
        #
        # self.x = x  # input placeholder
        #
        # # simple 2-layer network
        # W1 = weight_variable([in_dim, 100])
        # b1 = bias_variable([100])
        #
        # W2 = weight_variable([100, out_dim])
        # b2 = bias_variable([out_dim])
        #
        # h1 = tf.nn.relu(tf.matmul(x, W1) + b1)  # hidden layer
        # self.y = tf.matmul(h1, W2) + b2  # output layer
        #
        # self.var_list = [W1, b1, W2, b2]

        # vanilla single-task loss
        self.cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_,
                                                       logits=self.y))
        self.set_vanilla_loss()

        # performance metrics
        correct_prediction = tf.equal(tf.argmax(self.y, 1), tf.argmax(y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        self.ewc_loss = 0
        # self.star_vars = []
        self.F_accum = []
        self.optim = kfac.KfacOptimizer()