Example #1
    def __init__(self,
                 env: gym.Env,
                 actor: Model,
                 critic: Model,
                 preprocess: Model = None,
                 memory=None,
                 noise: callable = ornstein_uhlenbeck_noise,
                 replay_start=5000,
                 training_repeats=1):

        self.warmup_time = replay_start
        self.training_repeats = training_repeats
        assert isinstance(
            env.action_space,
            spaces.Box), "The environment's action space has to be continuous"

        sa = env.action_space.shape
        so = env.observation_space.shape

        self.memory = memory or ReplayMemory(1000000)
        self.env = env

        if not isinstance(actor, Model):
            actor = Model(actor,
                          optimizer=tf.train.AdamOptimizer(.0001),
                          tracker=tf.train.ExponentialMovingAverage(1 - .001))

        if not isinstance(critic, Model):
            critic = Model(critic,
                           optimizer=tf.train.AdamOptimizer(.001),
                           tracker=tf.train.ExponentialMovingAverage(1 - .001))

        preprocess = preprocess or Model(
            lambda x: x,
            optimizer=tf.train.AdamOptimizer(.001),
            tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

        def act(o: [so], noisy=True):
            with arg_scope([layers.batch_norm], is_training=False):
                s = preprocess(o)
                a = actor(s, noise=noisy)
                a = smart_cond(noisy, lambda: noise(a), lambda: a)
                q = critic(s, a)
                layers.summarize_tensors([s, a, q])
                return a

        self.act = Function(act)

        def train_actor(o: [so]):
            s = preprocess(o)
            a0 = actor(s)
            q = critic(tf.stop_gradient(s),
                       a0)  # stop gradients from critic to preprocessor
            loss = -tf.reduce_mean(q, axis=0)
            return loss

        def train_critic(o: [so], a: [sa], r, t: tf.bool, o2: [so]):
            s = preprocess(o)
            q2 = critic(s, a)
            s2 = preprocess.tracked(o2)  # EMA-tracked target preprocessor
            qt = critic.tracked(s2, actor.tracked(s2))  # target actor and critic
            qtt = tf.where(t, r, r + 0.99 * qt)  # TD target; bootstrap unless terminal
            qtt = tf.stop_gradient(qtt)
            mse = tf.reduce_mean(tf.square(q2 - qtt), axis=0)
            return mse

        def train(o: [so], a: [sa], r, t: tf.bool, o2: [so]):
            al = train_actor(o)
            mse = train_critic(o, a, r, t, o2)
            return actor.minimize(al), critic.minimize(
                mse), preprocess.minimize([mse, al])

        self.train = Function(
            train,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=training_repeats,
            async=True)

        def log_return(r: []):
            layers.summarize_tensor(r, 'Return')

        self.log_return = Function(log_return, async=True)

        self.t = 0
Example #2
    def __init__(self,
                 env: gym.Env,
                 actor: callable,
                 critic: callable,
                 preprocess: Model = None,
                 memory=None,
                 heads=2,
                 replay_start=2000):
        self.replay_start = replay_start

        assert isinstance(env.observation_space,
                          spaces.Box), 'observation space has to be continuous'
        assert isinstance(env.action_space,
                          spaces.Box), 'action space has to be continuous'

        so = env.observation_space.shape
        sa = env.action_space.shape

        self.env = env
        self.memory = memory or ReplayMemory(1000000)
        self.heads = heads
        self.t = 0

        preprocess = preprocess or Model(
            lambda x: x,
            optimizer=tf.train.AdamOptimizer(.001),
            tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

        def actors(x, noise=False):
            actions = [actor(x) for i in range(heads)]
            return actions

        actors = Model(actors,
                       optimizer=tf.train.AdamOptimizer(0.0001),
                       tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

        def critics(x, actions):
            qs = [critic(x, a) for a in actions]
            return qs

        critics = Model(critics,
                        optimizer=tf.train.AdamOptimizer(.001),
                        tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

        def act(o: [so], noise=True):
            with arg_scope([layers.batch_norm], is_training=False):
                s = preprocess(o)
                a = actors(s, noise=noise)
                q = critics(s, a)
                layers.summarize_tensors([s, *a, *q])
                return a

        self.act = Function(act)

        def train_actor(o: [so]):
            s = preprocess(o)
            a0 = actors(s)
            q = critics(tf.stop_gradient(s), a0)
            loss = sum((-tf.reduce_mean(_, axis=0) for _ in q)) / heads
            return loss

        bootstrap = False

        def train_critic(o: [so], a: [sa], r, t: tf.bool, o2: [so],
                         i: tf.int32):
            s = preprocess(o)
            q2 = critics(s, [a for _ in range(heads)])
            s2 = preprocess.tracked(o2)
            qt = critics.tracked(s2, actors.tracked(s2))
            qtt = [tf.where(t, r, r + 0.99 * tf.stop_gradient(_)) for _ in qt]

            # def loss(_i, _q2, _qtt):
            #       sel = tf.equal(i, _i) if bootstrap else tf.fill(tf.shape(i), True)
            #       e = tf.where(sel, tf.square(_q2 - _qtt), tf.zeros_like(_q2))
            #       mse = tf.reduce_sum(e, axis=0) / tf.reduce_sum(tf.cast(sel, tf.float32), axis=0)
            #       return mse

            mse = sum([
                tf.reduce_mean(tf.square(_q2 - _qtt))
                for _q2, _qtt in zip(q2, qtt)
            ]) / heads
            return mse

        def train(o: [so], a: [sa], r, t: tf.bool, o2: [so], i: tf.int32):
            al = train_actor(o)
            mse = train_critic(o, a, r, t, o2, i)
            return actors.minimize(al), critics.minimize(
                mse), preprocess.minimize([mse, al])

        self.train = Function(train,
                              prefetch_fctn=lambda: self.memory.sample_batch(),
                              prefetch_capacity=3,
                              async=True)

        def log_return(r: []):
            layers.summarize_tensor(r, 'Return')

        self.log_return = Function(log_return)
Example #3
class DdpgAgent:
    def __init__(self,
                 env: gym.Env,
                 actor: Model,
                 critic: Model,
                 preprocess: Model = None,
                 memory=None,
                 noise: callable = ornstein_uhlenbeck_noise,
                 replay_start=5000,
                 training_repeats=1):

        self.warmup_time = replay_start
        self.training_repeats = training_repeats
        assert isinstance(
            env.action_space,
            spaces.Box), "The environment's action space has to be continuous"

        sa = env.action_space.shape
        so = env.observation_space.shape

        self.memory = memory or ReplayMemory(1000000)
        self.env = env

        if not isinstance(actor, Model):
            actor = Model(actor,
                          optimizer=tf.train.AdamOptimizer(.0001),
                          tracker=tf.train.ExponentialMovingAverage(1 - .001))

        if not isinstance(critic, Model):
            critic = Model(critic,
                           optimizer=tf.train.AdamOptimizer(.001),
                           tracker=tf.train.ExponentialMovingAverage(1 - .001))

        preprocess = preprocess or Model(
            lambda x: x,
            optimizer=tf.train.AdamOptimizer(.001),
            tracker=tf.train.ExponentialMovingAverage(1 - 0.001))

        def act(o: [so], noisy=True):
            with arg_scope([layers.batch_norm], is_training=False):
                s = preprocess(o)
                a = actor(s, noise=noisy)
                a = smart_cond(noisy, lambda: noise(a), lambda: a)
                q = critic(s, a)
                layers.summarize_tensors([s, a, q])
                return a

        self.act = Function(act)

        def train_actor(o: [so]):
            s = preprocess(o)
            a0 = actor(s)
            q = critic(tf.stop_gradient(s),
                       a0)  # stop gradients from critic to preprocessor
            loss = -tf.reduce_mean(q, axis=0)
            return loss

        def train_critic(o: [so], a: [sa], r, t: tf.bool, o2: [so]):
            s = preprocess(o)
            q2 = critic(s, a)
            s2 = preprocess.tracked(o2)  # EMA-tracked target preprocessor
            qt = critic.tracked(s2, actor.tracked(s2))  # target actor and critic
            qtt = tf.where(t, r, r + 0.99 * qt)  # TD target; bootstrap unless terminal
            qtt = tf.stop_gradient(qtt)
            mse = tf.reduce_mean(tf.square(q2 - qtt), axis=0)
            return mse

        def train(o: [so], a: [sa], r, t: tf.bool, o2: [so]):
            al = train_actor(o)
            mse = train_critic(o, a, r, t, o2)
            return actor.minimize(al), critic.minimize(
                mse), preprocess.minimize([mse, al])

        self.train = Function(
            train,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=training_repeats,
            async=True)

        def log_return(r: []):
            layers.summarize_tensor(r, 'Return')

        self.log_return = Function(log_return, async=True)

        self.t = 0

    def play_episode(self):
        ob = self.env.reset()
        done = False
        R = 0
        # reset local variables (e.g. the noise state)
        self.act.initialize_local()
        while not done:
            # a = act(ob, False) if np.random.rand() > .1 else acsp.sample()
            a = self.act(ob)
            ob2, r, done, info = self.env.step(a)
            self.memory.enqueue(ob, a, r, done)

            ob = ob2
            R += info.get('unwrapped_reward', r)

            debug_training = self.t == 512  # fail fast ;)
            if self.t > self.warmup_time or debug_training:
                for _ in range(1 if debug_training else self.training_repeats):
                    self.train()

            self.t += 1

        self.log_return(R)
        return R, {}
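A minimal usage sketch for this DdpgAgent (not part of the source; environment choice, network sizes, and hyperparameters are assumptions). Plain callables are accepted for actor and critic and get wrapped in Model by the constructor shown above; imports for Model, Function, and DdpgAgent are not shown in the source and are omitted here as well.

import gym
import tensorflow as tf
from tensorflow.contrib import layers

env = gym.make('Pendulum-v0')  # any environment with a Box action space

def actor(x, noise=False):
    # the agent passes a `noise` flag through; OU noise is applied separately
    x = layers.fully_connected(x, 300)
    a = layers.fully_connected(x, env.action_space.shape[0],
                               activation_fn=tf.nn.tanh)
    return a * env.action_space.high  # scale to the action bounds

def critic(x, a):
    x = layers.fully_connected(tf.concat([x, a], axis=1), 300)
    q = layers.fully_connected(x, 1, activation_fn=None)
    return tf.squeeze(q, 1)  # Q-values with shape [batch]

agent = DdpgAgent(env, actor, critic)  # class defined in the example above
for episode in range(1000):
    ret, _ = agent.play_episode()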
Example #4
    def __init__(self,
                 n_actions,
                 observation_shape,
                 q_network: tt.Model,
                 double_dqn=True,
                 replay_start=50000,
                 clip_td=False,
                 logdir="",
                 clip_gradients=10):
        self.logdir = logdir
        self.replay_start = replay_start
        self.n_actions = n_actions
        self.observation_shape = observation_shape
        self.memory = ShardedMemory()
        self.discount = .99
        self.step = 0

        @tt.model(
            tracker=tf.train.ExponentialMovingAverage(
                1 - .0005),  # TODO: replace with original weight freeze
            optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
        def q_network(x):
            x /= 255
            x = layers.conv2d(x, 32, 8, 4)
            x = layers.conv2d(x, 64, 4, 2)
            x = layers.conv2d(x, 64, 3, 1)
            x = layers.flatten(x)

            xv = layers.fully_connected(x, 512)
            val = layers.fully_connected(xv, 1, activation_fn=None)
            # val = tf.squeeze(val, 1)

            xa = layers.fully_connected(x, 512)
            adv = layers.fully_connected(xa,
                                         n_actions,
                                         activation_fn=None)

            q = val + adv - tf.reduce_mean(adv, axis=1, keep_dims=True)
            q = tf.identity(q, name='Q')
            return q, x

        def act(x: [observation_shape]):
            qs = q_network(x)
            a = tf.argmax(qs, axis=1)
            # qm = tf.reduce_max(qs, axis=1)
            return a, qs

        self.act = Function(act)

        def train_step(o: [observation_shape], a: (tf.int32, [[]]), r,
                       t: tf.bool, o2: [observation_shape]):
            q = q_network(o)
            # ac = tf.argmax(q, axis=1)

            # compute targets
            q2 = q_network.tracked(o2)

            if double_dqn:
                a2 = tf.argmax(
                    q_network(o2),
                    axis=1)  # yep, that's really the only difference
            else:
                a2 = tf.argmax(q2, axis=1)

            mask2 = tf.one_hot(a2, n_actions, 1.0, 0.0, axis=1)
            q_target = tf.where(
                t, r, r + self.discount * tf.reduce_sum(q2 * mask2, axis=1))
            q_target = tf.stop_gradient(q_target)

            # compute loss
            mask = tf.one_hot(a, n_actions, 1.0, 0.0, axis=1)
            qs = tf.reduce_sum(q * mask, axis=1, name='q_max')
            td = tf.subtract(q_target, qs, name='td')
            if clip_td:
                td = tf.clip_by_value(td, -.5, .5, name='clipped_td')
            # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
            # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
            loss = tf.reduce_mean(tf.square(td), axis=0, name='mse')

            gav = q_network.compute_gradients(loss)
            if clip_gradients:
                gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
            loss_update = q_network.apply_gradients(gav)

            # logging
            layers.summarize_tensors([
                td, loss, r, o, a,
                tf.subtract(o2, o, name='state_dif'),
                tf.reduce_mean(tf.cast(t, tf.float32), name='frac_terminal'),
                tf.subtract(tf.reduce_max(q, 1, True), q, name='av_advantage')
            ])
            # layers.summarize_tensors(chi.activations())
            # layers.summarize_tensors(chi.gradients())
            return loss_update

        self.train_step = Function(
            train_step,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=10,
            prefetch_threads=3)

        def log_weights():
            v = q_network.trainable_variables()
            # print(f'log weights {v}')

            f = q_network.tracker_variables
            # print(f'log weights EMA {f}')

            difs = []
            for g in v:
                a = q_network.tracker.average(g)
                difs.append(tf.subtract(g, a, name=f'ema/dif{g.name[:-2]}'))

            layers.summarize_tensors(v + f + difs)

        self.log_weights = Function(log_weights, async=True)
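For clarity, an illustrative NumPy restatement (not part of the example) of the target computation above: Double DQN selects the next action with the online network but still evaluates it with the tracked target network, which is the only change relative to vanilla DQN.

import numpy as np

def dqn_targets(r, t, q2_target, q2_online, discount=0.99, double=True):
    # q2_target: target-network Q-values for o2, shape [batch, n_actions]
    # q2_online: online-network Q-values for o2, same shape
    a2 = np.argmax(q2_online if double else q2_target, axis=1)
    q_next = q2_target[np.arange(len(a2)), a2]
    return np.where(t, r, r + discount * q_next)  # r alone for terminal steps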
Example #5
    def __init__(self,
                 env: gym.Env,
                 q_network: tt.Model,
                 memory=None,
                 double_dqn=True,
                 replay_start=50000,
                 annealing_time=1000000):
        self.annealing_time = annealing_time
        self.replay_start = replay_start
        so = env.observation_space.shape

        self.env = env
        self.memory = memory or chi.rl.ReplayMemory(1000000)

        def act(x: [so]):
            qs = q_network(x)
            a = tf.argmax(qs, axis=1)
            # qm = tf.reduce_max(qs, axis=1)
            layers.summarize_tensor(a)
            return a, qs

        self.act = Function(act)

        def train(o: [so], a: (tf.int32, [[]]), r, t: tf.bool, o2: [so]):
            q = q_network(o)
            # ac = tf.argmax(q, axis=1)

            # compute targets
            q2 = q_network.tracked(o2)

            if double_dqn:
                a2 = tf.argmax(
                    q_network(o2),
                    axis=1)  # yep, that's really the only difference
            else:
                a2 = tf.argmax(q2, axis=1)

            mask2 = tf.one_hot(a2, env.action_space.n, 1.0, 0.0, axis=1)
            q_target = tf.where(t, r,
                                r + 0.99 * tf.reduce_sum(q2 * mask2, axis=1))
            q_target = tf.stop_gradient(q_target)

            # compute loss
            mask = tf.one_hot(a, env.action_space.n, 1.0, 0.0, axis=1)
            qs = tf.reduce_sum(q * mask, axis=1, name='q_max')
            td = tf.subtract(q_target, qs, name='td')
            # td = tf.clip_by_value(td, -10, 10)
            # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
            # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
            loss = tf.reduce_mean(tf.square(td), axis=0, name='mse')

            loss = q_network.minimize(loss)

            # logging
            layers.summarize_tensors([
                td, loss, r, o, a,
                tf.subtract(o2, o, name='state_dif'),
                tf.reduce_mean(tf.cast(t, tf.float32), name='frac_terminal'),
                tf.subtract(tf.reduce_max(q, 1, True), q, name='av_advantage')
            ])
            # layers.summarize_tensors(chi.activations())
            # layers.summarize_tensors(chi.gradients())
            return loss

        self.train = Function(
            train,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=3,
            async=True)

        def log_weights():
            v = q_network.trainable_variables()
            # print(f'log weights {v}')

            f = q_network.tracker_variables
            # print(f'log weights EMA {f}')

            difs = []
            for g in v:
                a = q_network.tracker.average(g)
                difs.append(tf.subtract(g, a, name=f'ema/dif{g.name[:-2]}'))

            layers.summarize_tensors(v + f + difs)

        self.log_weights = Function(log_weights, async=True)

        def log_returns(real_return: [], ret: [], qs):
            layers.summarize_tensors(
                [real_return, ret, qs,
                 tf.subtract(ret, qs, name='R-Q')])

        self.log_returns = Function(log_returns, async=True)

        self.t = 0
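For reference, a hedged sketch of a q_network argument that fits this constructor, built with the tt.model decorator pattern from Example #4. The environment, layer sizes, and the DqnAgent class name are illustrative assumptions; `tt` is the same module the examples above use (its import is not shown in the source).

import gym
import tensorflow as tf
from tensorflow.contrib import layers

env = gym.make('CartPole-v0')  # any environment with a discrete action space

@tt.model(tracker=tf.train.ExponentialMovingAverage(1 - .0005),
          optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
def q_network(x):
    x = layers.fully_connected(x, 64)
    x = layers.fully_connected(x, 64)
    return layers.fully_connected(x, env.action_space.n, activation_fn=None)

agent = DqnAgent(env, q_network, replay_start=10000)  # hypothetical class name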
Example #6
    def __init__(self, n_actions, observation_shape, q_network: tt.Model, double_dqn=True,
                 replay_start=50000, clip_td=False, logdir="", clip_gradients=10):
        self.logdir = logdir
        self.replay_start = replay_start
        self.n_actions = n_actions
        self.observation_shape = observation_shape
        self.memory = ShardedMemory()
        self.discount = .99
        self.step = 0

        def act(x: [observation_shape]):
            qs = q_network(x)
            a = tf.argmax(qs, axis=1)
            # qm = tf.reduce_max(qs, axis=1)
            return a, qs

        self.act = Function(act)

        def train_step(o: [observation_shape], a: (tf.int32, [[]]), r, t: tf.bool, o2: [observation_shape]):
            q = q_network(o)
            # ac = tf.argmax(q, axis=1)

            # compute targets
            q2 = q_network.tracked(o2)

            if double_dqn:
                a2 = tf.argmax(q_network(o2), axis=1)  # yep, that's really the only difference
            else:
                a2 = tf.argmax(q2, axis=1)

            mask2 = tf.one_hot(a2, n_actions, 1.0, 0.0, axis=1)
            q_target = tf.where(t, r, r + self.discount * tf.reduce_sum(q2 * mask2, axis=1))
            q_target = tf.stop_gradient(q_target)

            # compute loss
            mask = tf.one_hot(a, n_actions, 1.0, 0.0, axis=1)
            qs = tf.reduce_sum(q * mask, axis=1, name='q_max')
            td = tf.subtract(q_target, qs, name='td')
            if clip_td:
                td = tf.clip_by_value(td, -.5, .5, name='clipped_td')
            # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
            # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
            loss = tf.reduce_mean(tf.square(td), axis=0, name='mse')

            gav = q_network.compute_gradients(loss)
            if clip_gradients:
                gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
            loss_update = q_network.apply_gradients(gav)

            # logging
            layers.summarize_tensors([
                td, loss, r, o, a,
                tf.subtract(o2, o, name='state_dif'),
                tf.reduce_mean(tf.cast(t, tf.float32), name='frac_terminal'),
                tf.subtract(tf.reduce_max(q, 1, True), q, name='av_advantage')
            ])
            # layers.summarize_tensors(chi.activations())
            # layers.summarize_tensors(chi.gradients())
            return loss_update

        self.train_step = Function(
            train_step,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=10,
            prefetch_threads=3)

        def log_weights():
            v = q_network.trainable_variables()
            # print(f'log weights {v}')

            f = q_network.tracker_variables
            # print(f'log weights EMA {f}')

            difs = []
            for g in v:
                a = q_network.tracker.average(g)
                difs.append(tf.subtract(g, a, name=f'ema/dif{g.name[:-2]}'))

            layers.summarize_tensors(v + f + difs)

        self.log_weights = Function(log_weights, async=True)
Example #7
    def __init__(self, n_actions, observation_shape, pp: Model, heads: Model,
                 double_dqn=True, replay_start=50000, logdir=None, clip_gradients=10.):
        self.logdir = logdir
        self.observation_shape = observation_shape
        self.n_actions = n_actions
        self.replay_start = replay_start
        self.n_heads = None

        self.memory = ShardedMemory()

        self.discount = .99
        self.step = 0
        self.n_state = None

        def act(x: [observation_shape]):
            s = pp(x)

            self.n_state = int(s.shape[1])

            qs = heads(s)

            self.n_heads = len(qs)
            return qs, s

        self.act = Function(act)

        @tt.model(optimizer=tf.train.RMSPropOptimizer(6.25e-5, .95, .95, .01))
        def pred(x, a: tf.int32):
            x = tf.concat((x, layers.one_hot_encoding(a, self.n_actions)), axis=1)
            x = layers.fully_connected(x, 100)
            x = layers.fully_connected(x, 50)
            x = layers.fully_connected(x, 50)
            x = layers.fully_connected(x, 100)
            x = layers.fully_connected(x, self.n_state, None)
            return x

        def train_step(o: [observation_shape], a: (tf.int32, [[]]), r, t: tf.bool, o2: [observation_shape]):
            s = pp(o)
            qs = heads(s)

            self.n_heads = len(qs)

            # compute targets
            q2s = heads.tracked(pp.tracked(o2))

            s2 = pp(o2)

            # transition model
            sp = pred(s, a)
            loss_pred = tf.reduce_mean(tf.square(sp-s2))

            if double_dqn:
                a2s = [tf.argmax(_, axis=1) for _ in heads(s2)]
            else:
                a2s = [tf.argmax(_, axis=1) for _ in q2s]

            losses = []
            for a2, q2, q in zip(a2s, q2s, qs):
                mask2 = tf.one_hot(a2, n_actions, 1.0, 0.0, axis=1)
                q_target = tf.stop_gradient(tf.where(t, r, r + self.discount * tf.reduce_sum(q2 * mask2, axis=1)))

                # compute loss
                mask = tf.one_hot(a, n_actions, 1.0, 0.0, axis=1)
                q = tf.reduce_sum(q * mask, axis=1, name='q_max')
                td = tf.subtract(q_target, q, name='td')
                # td = tf.clip_by_value(td, -10, 10)
                # loss = tf.reduce_mean(tf.abs(td), axis=0, name='mae')
                # loss = tf.where(tf.abs(td) < 1.0, 0.5 * tf.square(td), tf.abs(td) - 0.5, name='mse_huber')
                losses.append(tf.reduce_mean(tf.square(td), axis=0, name='mse'))

            loss = tf.add_n(losses)

            gav = heads.compute_gradients(loss)
            if clip_gradients:
                gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
            th = heads.apply_gradients(gav)

            gav = pp.compute_gradients(loss)
            if clip_gradients:
                gav = [(tf.clip_by_norm(g / self.n_heads, clip_gradients), v) for g, v in gav]
            tp = pp.apply_gradients(gav)

            gav = pred.compute_gradients(loss_pred)
            if clip_gradients:
                gav = [(tf.clip_by_norm(g, clip_gradients), v) for g, v in gav]
            tpred = pred.apply_gradients(gav)

            return th, tp, tpred

        self.train_step = Function(
            train_step,
            prefetch_fctn=lambda: self.memory.sample_batch()[:-1],
            prefetch_capacity=5,
            prefetch_threads=3,
            async=False)

        assert self.n_state

        def predict(s: [[self.n_state]], a: (tf.int32, [[]])):
            sp = pred(s, a)
            return sp
        self.predict = Function(predict)
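The training code above treats heads(s) as a list of per-head Q tensors (self.n_heads = len(qs)). A hypothetical sketch of a compatible heads builder (layer sizes and the helper name are assumptions, not taken from the source):

from tensorflow.contrib import layers

def make_heads(n_heads, n_actions):
    # one independent Q-head per bootstrap head, sharing the state embedding s
    def heads(s):
        return [layers.fully_connected(layers.fully_connected(s, 256),
                                       n_actions, activation_fn=None)
                for _ in range(n_heads)]
    return heads

Wrapped in a Model with an ExponentialMovingAverage tracker (as in the earlier examples), heads.tracked(...) then provides the target heads used for the Q-targets above.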