Example #1
import numpy as np
import tensorflow as tf

# Layer, AvgPool, ConvToFullyConnected, FullyConnected, sample, neg_log_prob,
# policy_entropy and epsilon_decay are assumed to come from the surrounding
# project; they are not defined in this snippet.


class LELPPO(Layer):
    def __init__(self, input_shape, pool_shape, nactions, name=None):
        self.input_shape = input_shape
        self.batch_size, self.h, self.w, self.fin = self.input_shape
        self.pool_shape = pool_shape
        self.nactions = nactions
        self.name = name
        self.action_name = self.name + '_action'
        self.value_name = self.name + '_value'
        self.nlp_name = self.name + '_nlp'

        self.pool = AvgPool(size=self.input_shape,
                            ksize=self.pool_shape,
                            strides=self.pool_shape,
                            padding='SAME')

        l2_input_shape = self.pool.output_shape()
        self.conv2fc = ConvToFullyConnected(input_shape=l2_input_shape)

        l3_input_shape = self.conv2fc.output_shape()
        self.actions = FullyConnected(input_shape=l3_input_shape,
                                      size=self.nactions,
                                      init='alexnet',
                                      name=self.name + '_actions')
        self.values = FullyConnected(input_shape=l3_input_shape,
                                     size=1,
                                     init='alexnet',
                                     name=self.name + '_values')

        ####################################################

        self.logits_bias = tf.Variable(np.zeros(shape=(self.batch_size,
                                                       self.nactions)),
                                       dtype=tf.float32)
        self.values_bias = tf.Variable(np.zeros(shape=(self.batch_size, 1)),
                                       dtype=tf.float32)

        # self.actions_model = Model(layers=[l1, l2, actions])
        # self.values_model = Model(layers=[l1, l2, values])

        ####################################################
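
        # Placeholders for rollout data fed back in during the PPO update:
        # advantages, rewards, and the actions, values and negative log-probs
        # recorded when the trajectories were generated by the old policy.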

        self.advantages = tf.placeholder("float", [None])
        self.rewards = tf.placeholder("float", [None])

        self.old_actions = tf.placeholder("int32", [None])
        self.old_values = tf.placeholder("float", [None])
        self.old_nlps = tf.placeholder("float", [None])

        ####################################################

    def get_weights(self):
        return []

    def output_shape(self):
        return self.input_shape

    def num_params(self):
        return 0

    def place_holders(self):
        place_holders_dict = {}
        place_holders_dict[self.name + '_advantages'] = self.advantages
        place_holders_dict[self.name + '_rewards'] = self.rewards
        place_holders_dict[self.name + '_old_actions'] = self.old_actions
        place_holders_dict[self.name + '_old_values'] = self.old_values
        place_holders_dict[self.name + '_old_nlps'] = self.old_nlps
        return place_holders_dict

    ###################################################################

    def forward(self, X):
        return X

    def predict(self, X):
        # [logits, logits_forward] = self.actions_model.forward(X)
        # [values, values_forward] = self.values_model.forward(X)

        pool = self.pool.forward(X)
        conv2fc = self.conv2fc.forward(pool)
        logits = self.actions.forward(conv2fc)
        values = self.values.forward(conv2fc)

        values = tf.reshape(values, (-1, ))
        actions = sample(logits)
        nlps = neg_log_prob(logits, actions)

        # states, rewards, advantages, old_actions, old_values, old_nlps
        cache = {
            self.action_name: actions,
            self.value_name: values,
            self.nlp_name: nlps
        }
        return X, cache

    ###################################################################

    def backward(self, AI, AO, DO):
        return DO

    def gv(self, AI, AO, DO):
        return []

    ###################################################################

    def dfa_backward(self, AI, AO, E, DO):
        return DO

    def dfa_gv(self, AI, AO, E, DO):
        return []

    ###################################################################

    def lel_backward(self, AI, AO, DO, cache):

        pool = self.pool.forward(AI)
        conv2fc = self.conv2fc.forward(pool)
        logits = self.actions.forward(conv2fc)
        values = self.values.forward(conv2fc)

        # [logits, logits_forward] = self.actions_model.forward(AI)
        # [values, values_forward] = self.values_model.forward(AI)

        logits = logits + self.logits_bias
        values = values + self.values_bias
        values = tf.reshape(values, (-1, ))
        nlps = neg_log_prob(logits, self.old_actions)
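
        # PPO clipped-surrogate objective: the ratio pi_new / pi_old is
        # recovered from stored negative log-probs as exp(old_nlp - new_nlp),
        #   policy_loss = -E[ min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A) ]
        # The value head below uses the analogous clipped squared-error loss.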

        ratio = tf.exp(self.old_nlps - nlps)
        ratio = tf.clip_by_value(ratio, 0, 10)
        surr1 = self.advantages * ratio
        surr2 = self.advantages * tf.clip_by_value(ratio, 1 - epsilon_decay,
                                                   1 + epsilon_decay)
        policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

        entropy_loss = -policy_entropy(logits)

        clipped_value_estimate = self.old_values + tf.clip_by_value(
            values - self.old_values, -epsilon_decay, epsilon_decay)
        value_loss_1 = tf.squared_difference(clipped_value_estimate,
                                             self.rewards)
        value_loss_2 = tf.squared_difference(values, self.rewards)
        value_loss = 0.5 * tf.reduce_mean(
            tf.maximum(value_loss_1, value_loss_2))

        ###################################################################

        loss = policy_loss + 0.01 * entropy_loss + 1. * value_loss
        # grads = tf.gradients(self.loss, [self.logits_bias, self.values_bias] + self.params)
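        # Differentiating w.r.t. the zero-initialized logits_bias / values_bias
        # gives dloss/dlogits and dloss/dvalues directly, without touching the
        # weights of the sub-layers; those gradients are then pushed back
        # through the heads manually below.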
        grads = tf.gradients(loss, [self.logits_bias, self.values_bias])

        do_logits = grads[0]
        do_values = grads[1]

        # We never call forward() in LEL until backward; forward() just returns X,
        # which actually works out nicely. Perhaps we don't need a cache at all,
        # since a few cheap redundant computations aren't so bad.

        dlogits = self.actions.backward(conv2fc, logits, do_logits)
        dvalues = self.values.backward(conv2fc, values, do_values)
        dconv2fc = self.conv2fc.backward(pool, conv2fc, dlogits + dvalues)
        dpool = self.pool.backward(AI, pool, dconv2fc)

        return dpool

    def lel_gv(self, AI, AO, DO, cache):
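        # Rebuilds the same PPO loss graph as lel_backward, but returns the
        # gradient-variable pairs for the action and value heads instead of
        # an upstream error signal.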

        pool = self.pool.forward(AI)
        conv2fc = self.conv2fc.forward(pool)
        logits = self.actions.forward(conv2fc)
        values = self.values.forward(conv2fc)

        # [logits, logits_forward] = self.actions_model.forward(AI)
        # [values, values_forward] = self.values_model.forward(AI)

        logits = logits + self.logits_bias
        values = values + self.values_bias
        values = tf.reshape(values, (-1, ))
        nlps = neg_log_prob(logits, self.old_actions)

        ratio = tf.exp(self.old_nlps - nlps)
        ratio = tf.clip_by_value(ratio, 0, 10)
        surr1 = self.advantages * ratio
        surr2 = self.advantages * tf.clip_by_value(ratio, 1 - epsilon_decay,
                                                   1 + epsilon_decay)
        policy_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

        entropy_loss = -policy_entropy(logits)

        clipped_value_estimate = self.old_values + tf.clip_by_value(
            values - self.old_values, -epsilon_decay, epsilon_decay)
        value_loss_1 = tf.squared_difference(clipped_value_estimate,
                                             self.rewards)
        value_loss_2 = tf.squared_difference(values, self.rewards)
        value_loss = 0.5 * tf.reduce_mean(
            tf.maximum(value_loss_1, value_loss_2))

        ###################################################################

        loss = policy_loss + 0.01 * entropy_loss + 1. * value_loss
        # grads = tf.gradients(self.loss, [self.logits_bias, self.values_bias] + self.params)
        grads = tf.gradients(loss, [self.logits_bias, self.values_bias])

        do_logits = grads[0]
        do_values = grads[1]

        # As in lel_backward: forward() just returns X, so the full graph is
        # rebuilt here; a few cheap redundant computations aren't so bad.

        gvs = []
        dlogits = self.actions.gv(conv2fc, logits, do_logits)
        dvalues = self.values.gv(conv2fc, values, do_values)
        # dconv2fc = self.conv2fc.backward(pool, conv2fc, dlogits + dvalues)
        # dpool = self.pool.backward(AI, pool, dconv2fc)

        gvs.extend(dlogits)
        gvs.extend(dvalues)

        return gvs
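
A minimal sketch of how this layer might be wired into a training graph, assuming the constructors above and a TF1-style session; the shapes, hyperparameters and rollout handling below are illustrative assumptions, not part of the original example.

# Hypothetical wiring: a (batch, 84, 84, 32) feature map and 6 discrete actions.
ppo_head = LELPPO(input_shape=[8, 84, 84, 32],
                  pool_shape=[1, 2, 2, 1],
                  nactions=6,
                  name='ppo')

states = tf.placeholder("float", [8, 84, 84, 32])
_, cache = ppo_head.predict(states)        # sampled actions, values, neg-log-probs
feeds = ppo_head.place_holders()           # advantages, rewards, old_* inputs

# lel_gv is assumed to return (gradient, variable) pairs for the two heads.
gvs = ppo_head.lel_gv(states, states, None, cache)
train_op = tf.train.AdamOptimizer(2.5e-4).apply_gradients(gvs)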
Example #2
import tensorflow as tf

# Layer, FullyConnected and Linear are assumed to come from the surrounding
# project; they are not defined in this snippet.


class LELFC(Layer):
    def __init__(self, input_shape, num_classes, name=None):
        self.num_classes = num_classes
        self.input_shape = input_shape
        self.name = name
        '''
        if load:
            weight_dict = np.load(load).item()
            self.B = tf.cast(tf.Variable(weight_dict[self.name]), tf.float32)
        elif std is not None:
            b = np.random.normal(loc=0., scale=std, size=(self.num_classes, self.output_size))
            self.B = tf.cast(tf.Variable(b), tf.float32)
        else:
            # var = 1. / self.output_size
            # std = np.sqrt(var)
            # b = np.random.normal(loc=0., scale=std, size=(self.num_classes, self.output_size))

            b = FeedbackMatrix(size=(self.num_classes, self.output_size), sparse=self.sparse, rank=self.rank)
            self.B = tf.cast(tf.Variable(b), tf.float32) 
        '''

        # THE PROBLEM WAS NEVER THE BIAS ... IT WAS THE FACT WE WEREN'T DIVIDING BY N

        # l0 = FullyConnected(input_shape=input_shape, size=self.input_shape, init='alexnet', activation=Relu(), bias=1., name=self.name)
        self.l0 = FullyConnected(input_shape=input_shape,
                                 size=self.num_classes,
                                 init='alexnet',
                                 activation=Linear(),
                                 bias=0.,
                                 name=self.name)

        # self.B = Model(layers=[l1])

    def get_weights(self):
        # return self.l0.get_weights()
        return []

    def get_feedback(self):
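        # NOTE: self.B is only created by the commented-out initializer above,
        # so this will raise AttributeError unless that code path is restored.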
        return self.B

    def output_shape(self):
        return self.input_shape

    def num_params(self):
        return 0

    def forward(self, X):
        return X

    ###################################################################

    def backward(self, AI, AO, DO):
        return DO

    def gv(self, AI, AO, DO):
        return []

    def train(self, AI, AO, DO):
        return []

    ###################################################################

    def dfa_backward(self, AI, AO, E, DO):
        return DO

    def dfa_gv(self, AI, AO, E, DO):
        return []

    def dfa(self, AI, AO, E, DO):
        return []

    ###################################################################

    # > https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html
    # > https://www.ics.uci.edu/~pjsadows/notes.pdf
    # > https://deepnotes.io/softmax-crossentropy
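    # For a softmax output with cross-entropy loss, the gradient of the loss
    # with respect to the logits S is softmax(S) - Y, which is the ES term
    # computed below.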
    def lel_backward(self, AI, AO, E, DO, Y):
        '''
        S = tf.matmul(AO, tf.transpose(self.B))
        # should be doing cross entropy here.
        # is this right ?
        # just adding softmax ?
        ES = tf.subtract(tf.nn.softmax(S), Y)
        DO = tf.matmul(ES, self.B)
        # (* activation.gradient) and (* AI) occur in the actual layer itself.
        return DO
        '''
        # '''
        S = self.l0.forward(AI)
        ES = tf.subtract(tf.nn.softmax(S), Y)
        DI = self.l0.backward(AI, S, ES)
        # '''

        # DI = self.B.backwards(AI, Y)

        return DI

    def lel_gv(self, AI, AO, E, DO, Y):
        # '''
        S = self.l0.forward(AI)
        ES = tf.subtract(tf.nn.softmax(S), Y)
        gvs = self.l0.gv(AI, S, ES)
        # '''

        # gvs = self.B.gvs(AI, Y)

        return gvs

    def lel(self, AI, AO, E, DO, Y):
        assert (False)
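
A minimal usage sketch under the same assumptions (FullyConnected and Linear come from the surrounding project, Y is a one-hot label batch); the shapes, learning rate and names below are illustrative.

# Hypothetical wiring: a 10-class local head on 128-dimensional features.
head = LELFC(input_shape=128, num_classes=10, name='fc_out')

X = tf.placeholder("float", [None, 128])
Y = tf.placeholder("float", [None, 10])

AO = head.forward(X)                              # identity pass-through
DI = head.lel_backward(X, AO, None, None, Y)      # error signal for earlier layers
gvs = head.lel_gv(X, AO, None, None, Y)           # (gradient, variable) pairs
train_op = tf.train.AdamOptimizer(1e-3).apply_gradients(gvs)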