def __init__(self, hidden_units, output_units, activation):
    """
    :param hidden_units: number of hidden units (passed right, to the next cell)
    :param output_units: number of output units (passed up, to the output)
    :param activation: activation function for the hidden state
    """
    super().__init__()
    self.hidden_units = hidden_units    # Hidden units
    self.output_units = output_units    # Output units
    self.activation = activation        # Activation for the hidden state
    self.initialize = True              # Initialize all params on the first forward pass

    self.w = Variable(np.random.normal(0, 1, (self.hidden_units, self.hidden_units)),
                      trainable=True)
    self.b = Variable(np.random.normal(0, 1, (1, self.hidden_units)),
                      trainable=True, param_share=True)
    self.c = Variable(np.random.normal(0, 1, (1, self.output_units)),
                      trainable=True, param_share=True)
    self.v = Variable(np.random.normal(0, 1, (self.hidden_units, self.output_units)),
                      trainable=True)
def update_gradient(self, x: Variable):
    if x.back_prop is not None:
        x.back_prop()
    if x.lchild is not None:
        self.update_gradient(x.lchild)
    if x.rchild is not None:
        self.update_gradient(x.rchild)
def tanh(x: Variable):
    # Subtract a constant M (the average of x) for numerical stability;
    # the common factor e^{-M} cancels between numerator and denominator.
    M = np.average(x.value)
    output_value = (np.exp(x.value - M) - np.exp(-x.value - M)) \
        / (np.exp(x.value - M) + np.exp(-x.value - M))
    output = Variable(output_value, lchild=x)
    output.back_prop = output.back_tanh
    output.tanh_grad_parser = {'M': M, 'xvalue': x.value}
    return output
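
# A minimal sketch (plain numpy, names are illustrative only) showing that the
# shifted formula used above is mathematically identical to np.tanh: multiplying
# numerator and denominator by e^{-M} leaves the ratio unchanged.
import numpy as np

_x = np.random.normal(0, 3, (4, 5))
_M = np.average(_x)
_shifted = (np.exp(_x - _M) - np.exp(-_x - _M)) / (np.exp(_x - _M) + np.exp(-_x - _M))
assert np.allclose(_shifted, np.tanh(_x))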
def forward(self, x: Variable):
    """
    :param x: [..., vocab_size], a one-hot encoded tensor.
              We don't care about the leading dimensions; only the last
              dimension (which must be one-hot) is used to look up the mapping.
    :return: the embedded representation of x
    """
    self.vocab_size = x.shape[-1]
    if self.initialize:
        self.mapping = Variable(np.random.normal(0, 1, (self.vocab_size, self.embed_size)),
                                trainable=True)
        self.initialize = False

    # First, find the corresponding word representation
    embedded_word = x.dot(self.mapping)   # n x embed_size
    return embedded_word
def forward(self, X):
    if self.initialize:
        size = X.shape
        self.n = size[0]
        self.in_channel = size[1]
        self.x = size[2]
        self.y = size[3]
        self.x_new = int((self.x - self.kernel_size[0] + 2 * self.padding[0]) / self.stride[0] + 1)
        self.y_new = int((self.y - self.kernel_size[1] + 2 * self.padding[1]) / self.stride[1] + 1)
        self.initialize = False

    # Generate the new matrix
    output = Variable(np.zeros((self.n, self.in_channel, self.x_new, self.y_new)), lchild=X)
    output.mapping = np.zeros((self.n, self.in_channel, self.x_new, self.y_new, 2))
    output.size = [self.n, self.in_channel, self.x_new, self.y_new]

    for image_idx, image in enumerate(X.value):
        for channel_idx in range(self.in_channel):
            for i in range(self.x_new):
                for j in range(self.y_new):
                    x_start = int(i * self.stride[0])
                    x_end = int(x_start + self.kernel_size[0])
                    y_start = int(j * self.stride[1])
                    y_end = int(y_start + self.kernel_size[1])

                    # Forward-prop
                    clip = image[channel_idx, x_start: x_end, y_start: y_end]
                    output.value[image_idx, channel_idx, i, j] = np.max(clip)

                    # Backward-prop: recover the (row, col) of the maximum
                    # from the flattened argmax index.
                    maximum_x = np.argmax(clip) // clip.shape[1] + x_start
                    maximum_y = np.argmax(clip) % clip.shape[1] + y_start

                    # Record the coordinates of the maximum in the mapping
                    output.mapping[image_idx, channel_idx, i, j, 0] = maximum_x
                    output.mapping[image_idx, channel_idx, i, j, 1] = maximum_y

    output.back_prop = output.back_maxpooling2d
    return output
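
# A minimal sketch (plain numpy, example values are illustrative) of how the
# backward mapping recovers the position of the maximum inside a pooling window:
# np.unravel_index converts the flattened argmax into (row, col) coordinates,
# which is equivalent to the // and % arithmetic used above.
import numpy as np

_clip = np.array([[1., 9., 3.],
                  [4., 5., 6.]])
_row, _col = np.unravel_index(np.argmax(_clip), _clip.shape)
assert (_row, _col) == (0, 1)
assert _clip[_row, _col] == _clip.max()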
def softmax(x: Variable, axis=1):
    # We subtract the maximum value from x to avoid overflow when doing exp().
    M = Variable(x.maximum().value)          # M must only be a constant
    small_x = x - M
    exp_small_x = small_x.safe_exp()
    inv_sum_exp_small_x = exp_small_x.sum(axis=axis).safe_inv()
    long_inv_sum_exp_small_x = inv_sum_exp_small_x.repeat(x.shape[1], axis=axis)
    output = exp_small_x.multiply(long_inv_sum_exp_small_x)
    return output
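
# A minimal sketch (plain numpy, helper name is illustrative) of why the max
# subtraction is safe: softmax is shift-invariant, so the stabilized form gives
# the same result as the naive one while keeping exp() from overflowing.
import numpy as np

def _np_softmax(x, axis=1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

_x = np.random.normal(0, 10, (3, 5))
_naive = np.exp(_x) / np.exp(_x).sum(axis=1, keepdims=True)
assert np.allclose(_np_softmax(_x), _naive)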
def train_forward(self, x: Variable, h=None):
    """
    :param x: input to the cell, shape (batch_size, input_length)
    :param h: previous hidden state, shape (batch_size, hidden_units), or None
    :return: (output, hidden state) of this cell
    """
    if self.initialize:
        self.u = Variable(np.random.normal(0, 1, (x.shape[1], self.hidden_units)),
                          trainable=True)
        self.initialize = False

    if h is None:
        # In the first RNN cell there is no previous hidden state, so we initialize one.
        h = Variable(np.random.normal(0, 1, (x.shape[0], self.hidden_units)))

    xu = x.dot(self.u)
    hw = h.dot(self.w)
    self.a = xu + hw + self.b              # a_t = x_t U + h_{t-1} W + b
    self.h = self.activation(self.a)       # h_t = activation(a_t)
    self.o = self.h.dot(self.v) + self.c   # o_t = h_t V + c
    return self.o, self.h
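
# A minimal shape sketch (plain numpy, all sizes are made-up) of the recurrence
# the cell implements: a = xU + hW + b, h' = act(a), o = h'V + c. It only checks
# that the parameter shapes used in __init__ and train_forward compose correctly.
import numpy as np

_batch, _input_len, _hidden, _out = 4, 16, 8, 5
_x = np.random.normal(0, 1, (_batch, _input_len))
_h = np.zeros((_batch, _hidden))
_U = np.random.normal(0, 1, (_input_len, _hidden))
_W = np.random.normal(0, 1, (_hidden, _hidden))
_V = np.random.normal(0, 1, (_hidden, _out))
_b, _c = np.zeros((1, _hidden)), np.zeros((1, _out))

_a = _x @ _U + _h @ _W + _b     # (batch, hidden)
_h_new = np.tanh(_a)            # (batch, hidden)
_o = _h_new @ _V + _c           # (batch, out)
assert _h_new.shape == (_batch, _hidden) and _o.shape == (_batch, _out)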
def update_gradient_with_optimizer(self, x: Variable, optimizer: Optimizer):
    # Gradient clipping: keep every element within [-threshold, threshold].
    mask = (x.gradient < GRADIENT_CLIPPING_THRESHOLD).astype(int)
    mask = np.multiply(mask, (x.gradient > -GRADIENT_CLIPPING_THRESHOLD).astype(int))
    contra_mask = 1 - mask
    x.gradient = np.multiply(mask, x.gradient) \
        + contra_mask * np.sign(x.gradient) * GRADIENT_CLIPPING_THRESHOLD

    if x.back_prop is not None:
        # x was produced by an operation, so it carries a backward function.
        x.back_prop()

    if x.trainable:
        optimizer.update_once(x)

    if x.lchild is not None:
        self.update_gradient_with_optimizer(x.lchild, optimizer)
    if x.rchild is not None:
        self.update_gradient_with_optimizer(x.rchild, optimizer)
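
# A minimal sketch (plain numpy, threshold value is illustrative) showing that
# the mask-based clipping above, with the sign term on out-of-range elements,
# is equivalent to a plain element-wise np.clip.
import numpy as np

_T = 5.0
_grad = np.array([-12.0, -3.0, 0.5, 7.0])
_mask = ((_grad < _T) & (_grad > -_T)).astype(int)
_clipped = _mask * _grad + (1 - _mask) * np.sign(_grad) * _T
assert np.allclose(_clipped, np.clip(_grad, -_T, _T))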
def forward(self, x):
    """
    :param x: input sequence, shape (n, p)
    :return: (list of per-step outputs, final hidden state)
    """
    self.n = x.shape[0]
    assert self.p == x.shape[1]

    output_list = []

    # Initialize a hidden state
    self.h = Variable(np.random.normal(0, 1, (self.n, self.hidden_units)))

    for _ in range(self.p):
        y, self.h = self.RNN_cell.train_forward(x, self.h)
        output_list.append(y)

    return output_list, self.h
import numpy as np

from otter.dam.structure import Variable
from otter.dam.graph import Graph
from otter.layers.language import Embedding

with Graph() as g:
    vocab_size = 100
    embed_size = 10
    max_len = 150
    data_len = 1000

    emb = Embedding(max_len, vocab_size, embed_size)
    x = Variable(np.random.randint(0, vocab_size - 1, (data_len, max_len)))
    embedded = emb.forward(x)
    print(embedded)
import numpy as np

from otter.dam.structure import Variable
from otter.dam.graph import Graph
from otter.ops.activation import softmax

# eyed = a.reshape(n, m, 1) * np.eye(m)
# ones = a.reshape(n, m, 1) * np.ones(m)
# inverse_ones = a.reshape(n, 1, m) * np.ones((m, m))
# ds = eyed - np.multiply(ones, inverse_ones)
# avg_ds = np.average(ds, axis=0)
# print(ones)
# print(inverse_ones)
# print(eyed)
# print(ds)

a = Variable(np.random.normal(0, 1, (20, 10)))
b = softmax(a)

g = Graph()
# c = a.sum(axis=1)
# g.set_and_update_gradient(c, np.arange(20).reshape(20, 1))
# g.update_gradient(b)
# print(a.gradient)

c = Variable(np.random.randint(0, 9, (20, 1)))
sliced = b.slice(c.value.reshape((len(c.value),)), axis=1)
mid1 = sliced.safe_log()
mid2 = mid1.average()
mid3 = mid2.neg()
def ones(shape, dtype):
    return Variable(np.ones(shape, dtype))
def relu(x: Variable):
    # Element-wise 0/1 mask: 1 where x > 0; multiplying by it zeroes out the negatives.
    mapping = Variable((x.value > 0).astype(int))
    output = x.multiply(mapping)
    return output
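
# A minimal sketch (plain numpy) of the mask trick used above: multiplying by
# the 0/1 indicator of positive entries is the same as np.maximum(x, 0).
import numpy as np

_x = np.random.normal(0, 1, (3, 4))
_mask = (_x > 0).astype(int)
assert np.allclose(_x * _mask, np.maximum(_x, 0))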
loss_list = []
norm1_list = []
norm2_list = []

g = Graph()
iteration = 1000
batch_size = 1024
total_epoch = int(n / batch_size)

for it_idx in range(iteration):
    print(f"The {it_idx}th iteration.")

    for epoch in tqdm(range(total_epoch)):
        x = x_train[epoch * batch_size: (epoch + 1) * batch_size]
        y = y_train[epoch * batch_size: (epoch + 1) * batch_size]
        x = Variable(x)
        y = Variable(y)

        a = relu(conv1.forward(x))
        b = relu(conv2.forward(a))
        c = flatten.forward(b)
        d = relu(dense1.forward(c))
        f = dense2.forward(d)

        loss = sparse_categorical_crossentropy_with_softmax(y, f)
        acc = sparse_categorical_accuracy(y, f)

        # optimizer.learning_rate *= 0.99
        g.update_gradient_with_optimizer(loss, optimizer)
        loss_list.append(loss.value)
def sigmoid(x: Variable):
    # sigmoid(x) = 1 / (1 + exp(-x)), composed from the Variable primitives.
    return x.neg().safe_exp().add(Variable(np.ones(1))).safe_inv()
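
# A minimal sketch (plain numpy) of the same chain of operations,
# neg -> exp -> add 1 -> reciprocal, checked against the standard formula.
import numpy as np

_x = np.random.normal(0, 1, (3, 4))
_composed = 1.0 / (np.exp(-_x) + np.ones(1))
assert np.allclose(_composed, 1.0 / (1.0 + np.exp(-_x)))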
def set_and_update_gradient(self, x: Variable, gradient):
    assert x.gradient.shape == gradient.shape
    x.gradient = gradient
    self.update_gradient(x)
def zeros(shape, dtype, *args, **kwargs):
    return Variable(np.zeros(shape, dtype), *args, **kwargs)
        return output_list, self.h


if __name__ == "__main__":
    from otter import Variable
    from otter.dam.graph import Graph
    from otter.ops.activation import softmax
    from otter.optimizer import GradientDescent

    with Graph() as g:
        n = 1000
        p = 64      # Sentence length
        q = 5       # Prediction choices
        m = 64      # Embedding length

        x = Variable(np.random.normal(0, 0.1, (n, p)))
        y = Variable(np.random.randint(0, q - 1, (n, p)))

        layer2 = RNN(input_shape=p, number_of_rnn_cell=p,
                     hidden_units=8, output_units=q,
                     activation=softmax,
                     return_sequence=True, return_state=True)

        output, hidden = layer2.forward(x)
        print(len(output))
        print(output[0].shape)

        optimizer = GradientDescent(0.5)
def forward(self, X: Variable):
    """
    :param X: a 4-d tensor, [batch, channel, row, col]
    # TODO add channel in different places
    :return: the convolved (and optionally biased) output
    """

    def idx_three2one(idx, shape):
        # Flatten a (channel, row, col) index into a single row-major index.
        new_idx = idx[0] * np.prod(shape[1:]) + idx[1] * shape[2] + idx[2]
        return new_idx

    # Notice that we only need to calculate the mapping once for all epochs.
    if self.initialize:
        self.n, self.in_channel, self.x, self.y = X.shape

        # We first calculate the new matrix size.
        self.x_new = int((self.x - self.kernel_size[0] + 2 * self.padding[0]) / self.stride[0] + 1)
        self.y_new = int((self.y - self.kernel_size[1] + 2 * self.padding[1]) / self.stride[1] + 1)

        self.old_length = self.in_channel * self.x * self.y
        self.new_length = self.out_channel * self.x_new * self.y_new

        # The mapping must be applied during each iteration, because w changes
        # within each iteration, and w keeps updating at the same time.
        # We only need to initialize b once, with the knowledge of x_new and y_new.

        # Initialize the kernel
        self.w = Variable(np.random.normal(0, 0.01, (self.out_channel, self.in_channel,
                                                     self.kernel_size[0], self.kernel_size[1])),
                          trainable=self.trainable)
        self.b = Variable(np.random.normal(0, 0.01, (1, self.out_channel, self.x_new, self.y_new)),
                          trainable=self.trainable, param_share=True)

        '''
        Now we create w2mapping. The mapping itself only needs to be computed once;
        after we know the mapping, the forward- and back-prop are easy each time.
        '''
        self.w2mapping = []

        # Logic 1, without sorting
        for filter_idx in range(self.out_channel):
            for i in range(self.x_new):
                for j in range(self.y_new):
                    # Index for the new matrix
                    mapping_new = idx_three2one((filter_idx, i, j),
                                                (self.out_channel, self.x_new, self.y_new))
                    x_start = int(i * self.stride[0])
                    y_start = int(j * self.stride[1])

                    for ix in range(self.kernel_size[0]):
                        for jx in range(self.kernel_size[1]):
                            for channel_idx in range(self.in_channel):
                                # Index for the old matrix
                                mapping_old = idx_three2one((channel_idx, x_start + ix, y_start + jx),
                                                            (self.in_channel, self.x, self.y))
                                # Record which element of the mapping matrix comes from which w
                                self.w2mapping.append([(filter_idx, channel_idx, ix, jx),
                                                       (mapping_old, mapping_new)])

        self.initialize = False
        # End initialize

    input_image_flattened = X.reshape((self.n, self.old_length))
    new_image_flattened = input_image_flattened.sparse_dot_with_mapping(self.w, self.w2mapping,
                                                                        self.old_length,
                                                                        self.new_length)
    output = new_image_flattened.reshape((self.n, self.out_channel, self.x_new, self.y_new))

    # Add bias if necessary
    if self.bias:
        output1 = output + self.b
        return output1

    return output
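
# A minimal sketch (helper name and sizes are illustrative) of the output-size
# formula used above for both x_new and y_new: (size - kernel + 2 * padding) / stride + 1.
def conv_output_size(size, kernel, stride, padding):
    return int((size - kernel + 2 * padding) / stride + 1)

# Example: a 28x28 input with a 3x3 kernel, stride 1, no padding -> 26x26 output.
assert conv_output_size(28, 3, 1, 0) == 26
# With padding 1 the spatial size is preserved.
assert conv_output_size(28, 3, 1, 1) == 28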
def forward(self, X: Variable):
    self.n, self.c, self.x, self.y = X.shape
    output = X.reshape((self.n, self.c * self.x * self.y))
    return output