Example no. 1
    def __init__(self, hidden_units, output_units, activation):
        """
        :param input_shape:
        :param hidden_units:    # of right output units
        :param output_units:    # of top output units
        :param activation:      # of activations
        """

        super().__init__()
        self.hidden_units = hidden_units  # Hidden units
        self.output_units = output_units  # output units
        self.activation = activation  # Activation for hidden state
        self.initialize = True

        # Initialize all params
        self.w = Variable(np.random.normal(
            0, 1, (self.hidden_units, self.hidden_units)),
                          trainable=True)
        self.b = Variable(np.random.normal(0, 1, (1, self.hidden_units)),
                          trainable=True,
                          param_share=True)
        self.c = Variable(np.random.normal(0, 1, (1, self.output_units)),
                          trainable=True,
                          param_share=True)
        self.v = Variable(np.random.normal(
            0, 1, (self.hidden_units, self.output_units)),
                          trainable=True)
Example no. 2
    def update_gradient(self, x: Variable):
        # Depth-first walk over the computation graph: run this node's
        # back-prop first, then recurse into its children.
        if x.back_prop is not None:
            x.back_prop()

        if x.lchild is not None:
            self.update_gradient(x.lchild)

        if x.rchild is not None:
            self.update_gradient(x.rchild)
Example no. 3
def tanh(x: Variable):
    # Shift by the mean so the exponentials stay in a moderate range;
    # the common factor e**(-M) cancels, so the result is still tanh(x).
    M = np.average(x.value)
    output_value = (np.exp(x.value - M) - np.exp(-x.value - M)) / (np.exp(x.value - M) + np.exp(-x.value - M))
    output = Variable(output_value, lchild=x)

    output.back_prop = output.back_tanh
    output.tanh_grad_parser = {'M': M,
                               'xvalue': x.value}
    return output
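Note that the shift by M cancels between numerator and denominator, so the shifted formula is mathematically identical to tanh(x); it only keeps the exponentials in a safer range. A quick standalone NumPy check of that identity (independent of the Variable class above):

import numpy as np

x = np.random.normal(0, 3, (4, 5))
M = np.average(x)
shifted = (np.exp(x - M) - np.exp(-x - M)) / (np.exp(x - M) + np.exp(-x - M))
assert np.allclose(shifted, np.tanh(x))  # the common e**(-M) factor cancels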
Example no. 4
    def forward(self, x: Variable):
        """
        :param x: x [:::::: vocab_size] a one-hot value
        Thing is, we don't really care what's at the front.
        We only need to use the last dimension ( which must be a one-hot) to find its mapping.
        :return:
        """
        self.vocab_size = x.shape[-1]
        if self.initialize:
            self.mapping = Variable(np.random.normal(0, 1, (self.vocab_size, self.embed_size)),
                                    trainable=True)
            self.initialize = False

        # First, find the corresponding word representation
        embedded_word = x.dot(self.mapping)  # n x embed_size
        return embedded_word
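Because the last dimension of x is one-hot, the dot product with the mapping matrix is just a row lookup. A small NumPy sketch of that equivalence (the names below are illustrative, not part of otter):

import numpy as np

vocab_size, embed_size = 6, 4
mapping = np.random.normal(0, 1, (vocab_size, embed_size))
indices = np.array([2, 0, 5])
one_hot = np.eye(vocab_size)[indices]            # (3, vocab_size)

# Multiplying by a one-hot matrix selects the matching rows of the mapping
assert np.allclose(one_hot.dot(mapping), mapping[indices])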
Example no. 5
    def forward(self, X):

        if self.initialize:
            size = X.shape
            self.n = size[0]
            self.x = size[2]
            self.y = size[3]
            self.in_channel = size[1]

            self.x_new = int((self.x - self.kernel_size[0] + 2 * self.padding[0]) / self.stride[0] + 1)
            self.y_new = int((self.y - self.kernel_size[1] + 2 * self.padding[1]) / self.stride[1] + 1)

            self.initialize = False

        # Generate the new matrix
        output = Variable(np.zeros((self.n, self.in_channel, self.x_new, self.y_new)),
                          lchild=X)

        output.mapping = np.zeros((self.n, self.in_channel, self.x_new, self.y_new, 2))

        output.size = [self.n, self.in_channel, self.x_new, self.y_new]

        for image_idx, image in enumerate(X.value):
            for channel_idx in range(self.in_channel):
                for i in range(self.x_new):
                    for j in range(self.y_new):

                        x_start = int(i * self.stride[0])
                        x_end = int(x_start + self.kernel_size[0])
                        y_start = int(j * self.stride[1])
                        y_end = int(y_start + self.kernel_size[1])

                        # Forward-prop
                        clip = image[channel_idx, x_start: x_end, y_start: y_end]
                        output.value[image_idx, channel_idx, i, j] = np.max(clip)

                        # Backward-prop: locate the argmax inside the clip
                        maximum_x = int(np.argmax(clip) / clip.shape[1]) + x_start
                        maximum_y = np.argmax(clip) % clip.shape[1] + y_start

                        # Record the coordinates of the maximum in the mapping
                        output.mapping[image_idx, channel_idx, i, j, 0] = maximum_x
                        output.mapping[image_idx, channel_idx, i, j, 1] = maximum_y

        output.back_prop = output.back_maxpooling2d

        return output
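The argmax coordinates inside each clip can also be recovered with np.unravel_index, which handles non-square kernels without manual division and modulo; a standalone sketch:

import numpy as np

clip = np.random.normal(0, 1, (2, 3))            # one (kernel_h, kernel_w) window
row, col = np.unravel_index(np.argmax(clip), clip.shape)
assert clip[row, col] == np.max(clip)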
Example no. 6
def softmax(x: Variable, axis=1):
    '''
    We subtract the maximum value from x before taking exp()
    to avoid overflow; the constant cancels in the normalisation.
    '''
    M = Variable(x.maximum().value)  # M is treated as a constant
    small_x = x - M
    exp_small_x = small_x.safe_exp()
    inv_sum_exp_small_x = exp_small_x.sum(axis=axis).safe_inv()
    long_inv_sum_exp_small_x = inv_sum_exp_small_x.repeat(x.shape[1], axis=axis)
    output = exp_small_x.multiply(long_inv_sum_exp_small_x)
    return output
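Subtracting the maximum leaves the softmax unchanged because the constant factor cancels in the normalisation. The same idea in plain NumPy (here with a per-row maximum), checked against the naive formula on small inputs:

import numpy as np

x = np.random.normal(0, 1, (5, 4))
m = np.max(x, axis=1, keepdims=True)
exp_shifted = np.exp(x - m)
probs = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

# The naive formula is safe here because x is small; both versions agree
naive = np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
assert np.allclose(probs, naive)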
Example no. 7
    def train_forward(self, x: Variable, h=None):
        """

        :param x: shape: (batch_size, sequence_length, vocab_size)
        :param h:
        :return:
        """
        if self.initialize:
            self.u = Variable(np.random.normal(
                0, 1, (x.shape[1], self.hidden_units)),
                              trainable=True)
            self.initialize = False

        if h is None:
            # The first RNN cell has no previous hidden state, so we initialize one
            h = Variable(
                np.random.normal(0, 1, (x.shape[0], self.hidden_units)))

        xu = x.dot(self.u)
        hw = h.dot(self.w)
        self.a = xu + hw + self.b
        self.h = self.activation(self.a)
        self.o = self.h.dot(self.v) + self.c
        return self.o, self.h
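The cell implements the standard recurrence a_t = x_t u + h_{t-1} w + b, h_t = activation(a_t), o_t = h_t v + c. A shape check in plain NumPy with illustrative dimensions:

import numpy as np

batch, input_units, hidden_units, output_units = 4, 6, 8, 5
x = np.random.normal(0, 1, (batch, input_units))
h_prev = np.random.normal(0, 1, (batch, hidden_units))
u = np.random.normal(0, 1, (input_units, hidden_units))
w = np.random.normal(0, 1, (hidden_units, hidden_units))
v = np.random.normal(0, 1, (hidden_units, output_units))
b = np.zeros((1, hidden_units))
c = np.zeros((1, output_units))

a = x.dot(u) + h_prev.dot(w) + b     # (batch, hidden_units)
h = np.tanh(a)                       # new hidden state
o = h.dot(v) + c                     # (batch, output_units)
assert h.shape == (batch, hidden_units) and o.shape == (batch, output_units)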
Example no. 8
    def update_gradient_with_optimizer(self, x: Variable,
                                       optimizer: Optimizer):
        # print(type(x))

        # Gradient clipping: pull values outside [-threshold, threshold]
        # back to the boundary while keeping their sign
        mask = (x.gradient < GRADIENT_CLIPPING_THRESHOLD).astype(int)
        mask = np.multiply(
            mask, (x.gradient > -GRADIENT_CLIPPING_THRESHOLD).astype(int))
        contra_mask = 1 - mask
        x.gradient = np.multiply(mask, x.gradient) + \
            contra_mask * np.sign(x.gradient) * GRADIENT_CLIPPING_THRESHOLD

        if x.back_prop is not None:
            # x has a recorded back-prop function, i.e. it is not a leaf node
            x.back_prop()

        if x.trainable:
            optimizer.update_once(x)

        if x.lchild is not None:
            self.update_gradient_with_optimizer(x.lchild, optimizer)

        if x.rchild is not None:
            self.update_gradient_with_optimizer(x.rchild, optimizer)
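The mask construction is element-wise clipping: values outside [-threshold, threshold] are pulled back to the boundary with their sign preserved. In plain NumPy this is simply np.clip (the threshold value below is illustrative):

import numpy as np

GRADIENT_CLIPPING_THRESHOLD = 5.0
gradient = np.array([-12.0, -0.3, 0.0, 4.9, 80.0])
clipped = np.clip(gradient, -GRADIENT_CLIPPING_THRESHOLD, GRADIENT_CLIPPING_THRESHOLD)
assert np.array_equal(clipped, np.array([-5.0, -0.3, 0.0, 4.9, 5.0]))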
Example no. 9
    def forward(self, x):
        '''
        :param x: input batch; x.shape[1] must equal self.p
        :return: (output_list, hidden_state)
        '''

        self.n = x.shape[0]
        assert self.p == x.shape[1]

        output_list = []
        # Initialize a hidden param
        self.h = Variable(np.random.normal(0, 1, (self.n, self.hidden_units)))

        for _ in range(self.p):
            y, self.h = self.RNN_cell.train_forward(x, self.h)
            output_list.append(y)

        return output_list, self.h
Example no. 10
import numpy as np
from otter.dam.structure import Variable
from otter.dam.graph import Graph
from otter.layers.language import Embedding

with Graph() as g:
    vocab_size = 100
    embed_size = 10
    max_len = 150
    data_len = 1000

    emb = Embedding(max_len, vocab_size, embed_size)

    x = Variable(np.random.randint(0, vocab_size - 1, (data_len, max_len)))

    embedded = emb.forward(x)

    print(embedded)
Example no. 11
# eyed = a.reshape(n, m, 1) * np.eye(m)
#
# ones = a.reshape(n, m, 1) * np.ones(m)
# inverse_ones = a.reshape(n, 1, m) * np.ones((m, m))
#
# ds = eyed - np.multiply(ones, inverse_ones)
# avg_ds = np.average(ds, axis=0)
#
# print(ones)
# print(inverse_ones)
# print(eyed)
# print(ds)

import numpy as np

from otter.dam.structure import Variable
from otter.dam.graph import Graph
from otter.ops.activation import softmax

a = Variable(np.random.normal(0, 1, (20, 10)))
#
b = softmax(a)
g = Graph()
# # c = a.sum(axis=1)
# # g.set_and_update_gradient(c, np.arange(20).reshape(20,1))
# g.update_gradient(b)
#
# print(a.gradient)
c = Variable(np.random.randint(0, 9, (20, 1)))

sliced = b.slice(c.value.reshape((len(c.value), )), axis=1)

mid1 = sliced.safe_log()
mid2 = mid1.average()
mid3 = mid2.neg()
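The slice → log → average → negate chain is the usual negative log-likelihood: pick the predicted probability of the true class in each row, take the log, average, and flip the sign. The same quantity in plain NumPy (probs and labels are illustrative stand-ins):

import numpy as np

probs = np.abs(np.random.normal(0, 1, (20, 10)))
probs = probs / probs.sum(axis=1, keepdims=True)   # rows sum to one
labels = np.random.randint(0, 9, 20)

picked = probs[np.arange(20), labels]              # probability of the true class
nll = -np.mean(np.log(picked))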
Example no. 12
def ones(shape, dtype):
    return Variable(np.ones(shape, dtype))
Example no. 13
def relu(x: Variable):

    mapping = Variable((x.value > 0).astype(int))
    output = x.multiply(mapping)

    return output
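The mask-and-multiply above is equivalent to the usual max(x, 0) formulation; a quick NumPy check:

import numpy as np

x = np.random.normal(0, 1, (4, 3))
mask = (x > 0).astype(int)
assert np.allclose(x * mask, np.maximum(x, 0))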
Example no. 14
    loss_list = []
    norm1_list = []
    norm2_list = []
    g = Graph()

    iteration = 1000
    batch_size = 1024
    total_epoch = int(n / batch_size)  # number of mini-batches per pass over the training set

    for it_idx in range(iteration):
        print(f"The {it_idx}th iteration.")
        for epoch in tqdm(range(total_epoch)):

            x = x_train[epoch*batch_size: (epoch+1) * batch_size]
            y = y_train[epoch*batch_size: (epoch+1) * batch_size]

            x = Variable(x)
            y = Variable(y)

            a = relu(conv1.forward(x))
            b = relu(conv2.forward(a))
            c = flatten.forward(b)
            d = relu(dense1.forward(c))
            f = dense2.forward(d)

            loss = sparse_categorical_crossentropy_with_softmax(y, f)
            acc = sparse_categorical_accuracy(y, f)

            # optimizer.learning_rate *= 0.99

            g.update_gradient_with_optimizer(loss, optimizer)
            loss_list.append(loss.value)
Example no. 15
def sigmoid(x: Variable):
    return x.neg().safe_exp().add(Variable(np.ones(1))).safe_inv()
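Read right to left, the chain computes 1 / (exp(-x) + 1), i.e. the logistic sigmoid. A NumPy check against the identity sigmoid(x) = (1 + tanh(x/2)) / 2:

import numpy as np

x = np.random.normal(0, 1, (3, 2))
sig = 1.0 / (np.exp(-x) + 1.0)
assert np.allclose(sig, 0.5 * (1.0 + np.tanh(x / 2.0)))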
Example no. 16
    def set_and_update_gradient(self, x: Variable, gradient):
        assert x.gradient.shape == gradient.shape
        x.gradient = gradient
        self.update_gradient(x)
Example no. 17
def zeros(shape, dtype, *args, **kwargs):
    return Variable(np.zeros(shape, dtype), *args, **kwargs)
Example no. 18
        return output_list, self.h


if __name__ == "__main__":

    from otter import Variable
    from otter.dam.graph import Graph
    from otter.ops.activation import softmax
    from otter.optimizer import GradientDescent

    with Graph() as g:
        n = 1000
        p = 64  # Sentence Length
        q = 5  # Prediction Choices
        m = 64  # Embedding Length
        x = Variable(np.random.normal(0, 0.1, (n, p)))
        y = Variable(np.random.randint(0, q - 1, (n, p)))
        layer2 = RNN(input_shape=p,
                     number_of_rnn_cell=p,
                     hidden_units=8,
                     output_units=q,
                     activation=softmax,
                     return_sequence=True,
                     return_state=True)

        output, hidden = layer2.forward(x)
        print(len(output))
        print(output[0].shape)

        optimizer = GradientDescent(0.5)
Example no. 19
    def forward(self, X: Variable):
        """
        :param X: X is a 4d tensor, [batch, channel, row, col]
        # TODO add channel in different places
        :return:
        """
        # print("starting convolution.")

        def idx_three2one(idx, shape):
            new_idx = idx[0] * np.prod(shape[1:]) + idx[1] * shape[2] + idx[2]
            return new_idx

        # Notice that we only need to calculate mapping once for all epochs
        if self.initialize:
            self.n, self.in_channel, self.x, self.y = X.shape

            # We first calculate the new matrix size.
            self.x_new = int((self.x - self.kernel_size[0] + 2 * self.padding[0]) / self.stride[0] + 1)
            self.y_new = int((self.y - self.kernel_size[1] + 2 * self.padding[1]) / self.stride[1] + 1)

            self.old_length = self.in_channel * self.x * self.y
            self.new_length = self.out_channel * self.x_new * self.y_new

            # The index mapping depends only on the layer geometry, so it is
            # computed once; w keeps changing across iterations but is applied
            # through this fixed mapping. b also only needs to be initialized
            # once, now that x_new and y_new are known.
            # Initialize the kernel
            self.w = Variable(np.random.normal(0, 0.01, (self.out_channel, self.in_channel,
                                                      self.kernel_size[0], self.kernel_size[1])),
                              trainable=self.trainable)

            self.b = Variable(np.random.normal(0, 0.01, (1, self.out_channel, self.x_new, self.y_new)),
                              trainable=self.trainable, param_share=True)

            '''
            We build w2mapping once and for all; once the mapping is known,
            forward- and back-prop can reuse it on every pass.
            '''
            self.w2mapping = []

            # Logic 1, without sorting
            for filter_idx in range(self.out_channel):
                for i in range(self.x_new):
                    for j in range(self.y_new):
                        # Index for new matrix
                        mapping_new = idx_three2one((filter_idx, i, j),
                                                    (self.out_channel, self.x_new, self.y_new))
                        x_start = int(i * self.stride[0])
                        y_start = int(j * self.stride[1])
                        for ix in range(self.kernel_size[0]):
                            for jx in range(self.kernel_size[1]):
                                for channel_idx in range(self.in_channel):
                                    # Index for old matrix
                                    mapping_old = idx_three2one((channel_idx, x_start + ix, y_start + jx),
                                                                (self.in_channel, self.x, self.y))
                                    # Record which entry of the mapping comes from which element of w
                                    self.w2mapping.append([(filter_idx, channel_idx, ix, jx),
                                                           (mapping_old, mapping_new)])

            self.initialize = False
        # End Initialize

        input_image_flattened = X.reshape((self.n, self.old_length))

        new_image_flattened = input_image_flattened.sparse_dot_with_mapping(self.w, self.w2mapping,
                                                                            self.old_length,
                                                                            self.new_length)

        output = new_image_flattened.reshape((self.n, self.out_channel,
                                              self.x_new, self.y_new))

        # Add bias if necessary
        if self.bias:
            output1 = output + self.b
            return output1

        return output
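idx_three2one is the standard row-major flattening of a (channel, row, col) index, the same thing np.ravel_multi_index computes; a quick sanity check:

import numpy as np

shape = (3, 5, 7)                                  # (channels, rows, cols)
idx = (2, 4, 6)
flat = idx[0] * np.prod(shape[1:]) + idx[1] * shape[2] + idx[2]
assert flat == np.ravel_multi_index(idx, shape)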
Example no. 20
    def forward(self, X: Variable):
        self.n, self.c, self.x, self.y = X.shape
        output = X.reshape((self.n, self.c * self.x * self.y))
        return output