def call(self, x, mask=None):
    b, xb = 0., 0.
    if self.data_format == 'channels_first':
        kernel_sum_axes = [1, 2, 3]
        if self.use_bias:
            b = K.reshape(self.b, (self.filters, 1, 1, 1))
            xb = 1.
    elif self.data_format == 'channels_last':
        kernel_sum_axes = [0, 1, 2]
        if self.use_bias:
            b = K.reshape(self.b, (1, 1, 1, self.filters))
            xb = 1.

    # L2 norm of each filter (bias included), used to normalize the weights.
    Wnorm = K.sqrt(K.sum(K.square(self.W), axis=kernel_sum_axes, keepdims=True)
                   + K.square(b) + K.epsilon())
    # L2 norm of each input patch, obtained by convolving the squared input
    # with the all-ones kernel self.kernel_norm.
    xnorm = K.sqrt(K.conv2d(K.square(x), self.kernel_norm, strides=self.strides,
                            padding=self.padding, data_format=self.data_format,
                            filter_shape=self.kernel_norm_shape) + xb + K.epsilon())

    W = self.W / Wnorm
    output = K.conv2d(x, W, strides=self.strides, padding=self.padding,
                      data_format=self.data_format, filter_shape=self.kernel_shape)

    if K.backend() == 'theano':
        xnorm = K.pattern_broadcast(xnorm, [False, True, False, False])
    output /= xnorm

    if self.use_bias:
        b /= Wnorm
        if self.data_format == 'channels_first':
            b = K.reshape(b, (1, self.filters, 1, 1))
        elif self.data_format == 'channels_last':
            b = K.reshape(b, (1, 1, 1, self.filters))
        else:
            raise ValueError('Invalid data_format:', self.data_format)
        b /= xnorm
        output += b

    output = self.activation(output)
    return output
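# Context for self.kernel_norm used in call() above (an illustrative sketch,
# not part of the original code): it is an all-ones filter with a single output
# channel, so that conv2d(K.square(x), kernel_norm) produces the per-patch sum
# of squares whose square root becomes xnorm. The helper name and the
# channels_last layout below are assumptions.
import numpy as np
from keras import backend as K


def make_kernel_norm(kernel_shape):
    """kernel_shape: (rows, cols, in_channels, filters) for channels_last."""
    kernel_norm_shape = kernel_shape[:-1] + (1,)
    kernel_norm = K.variable(np.ones(kernel_norm_shape), name='kernel_norm')
    return kernel_norm, kernel_norm_shape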
def set_output(self, X, train=False):
    input_shape = (self.batch_size, self.num_lstm)
    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[self.axis]
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[self.axis] = input_shape[self.axis]

    if train:
        # Batch statistics and exponential-moving-average updates.
        m = K.mean(X, axis=reduction_axes)
        broadcast_m = K.reshape(m, broadcast_shape)
        std = K.mean(K.square(X - broadcast_m) + self.epsilon, axis=reduction_axes)
        std = K.sqrt(std)
        broadcast_std = K.reshape(std, broadcast_shape)
        mean_update = self.momentum * self.running_mean + (1 - self.momentum) * m
        std_update = self.momentum * self.running_std + (1 - self.momentum) * std
        self.updates = [(self.running_mean, mean_update),
                        (self.running_std, std_update)]
        X_normed = (X - broadcast_m) / (broadcast_std + self.epsilon)
    else:
        # Inference mode: normalize with the stored running statistics.
        broadcast_m = K.reshape(self.running_mean, broadcast_shape)
        broadcast_std = K.reshape(self.running_std, broadcast_shape)
        X_normed = (X - broadcast_m) / (broadcast_std + self.epsilon)

    out = (K.reshape(self.gamma, broadcast_shape) * X_normed
           + K.reshape(self.beta, broadcast_shape))
    return out
def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    self.updates = [(self.iterations, self.iterations + 1.)]

    t = self.iterations + 1
    beta_2t = K.sqrt(1 - K.pow(self.beta_2, t))
    lr_t = self.lr * beta_2t / (1 - K.pow(self.beta_1, t))

    for p, g, m, v in zip(params, grads, self.m, self.v):
        # beta_1 is decayed over time by the factor self.lda (lambda).
        beta_1t = self.beta_1 * K.pow(self.lda, t - 1)
        m_t = (beta_1t * m) + (1 - beta_1t) * g
        v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon * beta_2t)
        self.updates.append((m, m_t))
        self.updates.append((v, v_t))
        self.updates.append((p, p_t))
    return self.updates
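# One scalar step of the update rule above, written out in plain numpy
# (illustrative values only; lda plays the role of self.lda, the beta_1 decay
# factor, and eps corresponds to self.epsilon).
import numpy as np

lr, beta_1, beta_2, lda, eps = 0.001, 0.9, 0.999, 1. - 1e-8, 1e-8
p, g, m, v, t = 1.0, 0.5, 0.0, 0.0, 1

beta_2t = np.sqrt(1 - beta_2 ** t)
lr_t = lr * beta_2t / (1 - beta_1 ** t)
beta_1t = beta_1 * lda ** (t - 1)
m_t = beta_1t * m + (1 - beta_1t) * g
v_t = beta_2 * v + (1 - beta_2) * g ** 2
p_t = p - lr_t * m_t / (np.sqrt(v_t) + eps * beta_2t)
print(p_t)  # ~0.999, i.e. the parameter moves by roughly one learning rate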
def batchnorm(X, batch_size, hidden_dim, gamma, beta, running_mean, running_std,
              epsilon=1e-10, axis=1, momentum=0.99, train=False):
    X = K.reshape(X, (batch_size, hidden_dim))
    input_shape = (batch_size, hidden_dim)           # e.g. (1, 512)
    reduction_axes = list(range(len(input_shape)))   # [0, 1]
    del reduction_axes[axis]                         # [0]
    broadcast_shape = [1] * len(input_shape)         # [1, 1]
    broadcast_shape[axis] = input_shape[axis]        # [1, 512]

    if train:
        # Per-feature batch mean; K.mean collapses the reduced axes,
        # so m has shape (hidden_dim,).
        m = K.mean(X, axis=reduction_axes)
        broadcast_m = K.reshape(m, broadcast_shape)                      # (1, 512)
        std = K.mean(K.square(X - broadcast_m) + epsilon, axis=reduction_axes)
        std = K.sqrt(std)                                                # batch std
        broadcast_std = K.reshape(std, broadcast_shape)                  # (1, 512)
        mean_update = momentum * running_mean + (1 - momentum) * m
        std_update = momentum * running_std + (1 - momentum) * std
        X_normed = (X - broadcast_m) / (broadcast_std + epsilon)
    else:
        broadcast_m = K.reshape(running_mean, broadcast_shape)
        broadcast_std = K.reshape(running_std, broadcast_shape)
        X_normed = (X - broadcast_m) / (broadcast_std + epsilon)
        # Running statistics are returned unchanged in inference mode.
        mean_update = running_mean
        std_update = running_std

    out = (K.reshape(gamma, broadcast_shape) * X_normed
           + K.reshape(beta, broadcast_shape))
    return out, mean_update, std_update
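# Minimal usage sketch of batchnorm above on dummy data (shapes and values are
# made up; assumes a Keras 1.x/2.x backend where K.variable/K.eval are available).
import numpy as np
from keras import backend as K

batch_size, hidden_dim = 4, 8
X = K.variable(np.random.randn(batch_size, hidden_dim))
gamma = K.variable(np.ones((hidden_dim,)))
beta = K.variable(np.zeros((hidden_dim,)))
running_mean = K.variable(np.zeros((hidden_dim,)))
running_std = K.variable(np.ones((hidden_dim,)))

out, mean_upd, std_upd = batchnorm(X, batch_size, hidden_dim, gamma, beta,
                                   running_mean, running_std, train=True)
print(K.eval(out).shape)  # (4, 8)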
def __call__(self, loss):
    # L1/L2 activity regularization on the layer output, averaged over the batch.
    output = self.layer.get_output(True)
    loss += self.l1 * K.sum(K.mean(K.abs(output), axis=0))
    loss += self.l2 * K.sum(K.mean(K.square(output), axis=0))
    return loss
def __call__(self, loss):
    loss += K.sum(K.abs(self.p)) * self.l1 / 2.
    loss += K.sum(K.square(self.p)) * self.l2 / 2.
    return loss
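# Sanity check of the penalty terms above with plain numpy (illustrative values
# only): l1/2 * sum|p| + l2/2 * sum(p^2).
import numpy as np

p = np.array([1.0, -2.0, 3.0])
l1, l2 = 0.01, 0.001
penalty = np.sum(np.abs(p)) * l1 / 2. + np.sum(np.square(p)) * l2 / 2.
print(penalty)  # 0.037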
def CompileAndUpdate(self, Params):
    self.regularizerS = []
    # for par_name, par_value in Params:
    #     regularizer = regularizers.WeightRegularizer(l1=0., l2=self.options['l2_decay'])
    #     regularizer.set_param(par_value.get_value())
    #     self.regularizerS.append(regularizer)
    weight = self.options['weight']

    # Symbolic inputs: anchor, positive and negative (feature, attribute) pairs.
    fea = T.tensor4(name='input_features', dtype=theano.config.floatX)
    att = T.tensor4(name='input_att', dtype='float32')
    pos_fea = T.tensor4(name='pos_fea', dtype='float32')
    pos_att = T.tensor4(name='pos_att', dtype='float32')
    neg_fea = T.tensor4(name='neg_fea', dtype='float32')
    neg_att = T.tensor4(name='neg_att', dtype='float32')
    TT = [fea, att, pos_fea, pos_att, neg_fea, neg_att]

    # Anchor branch.
    LSTM = lstm_simple(fea, att, self.options, Params)
    LSTMproj = LSTM.set_output()
    LSTMC = ComputeCode(fea, self.options, Params, LSTMproj)
    frame, featurepart = LSTMC.set_output()

    # Positive branch.
    LSTM_pos = lstm_simple(pos_fea, pos_att, self.options, Params)
    LSTMproj_pos = LSTM_pos.set_output()
    LSTMC_pos = ComputeCode(pos_fea, self.options, Params, LSTMproj_pos)
    frame_pos, featurepart_pos = LSTMC_pos.set_output()

    # Negative branch.
    LSTM_neg = lstm_simple(neg_fea, neg_att, self.options, Params)
    LSTMproj_neg = LSTM_neg.set_output()
    LSTMC_neg = ComputeCode(neg_fea, self.options, Params, LSTMproj_neg)
    frame_neg, featurepart_neg = LSTMC_neg.set_output()

    self.params = LSTM.get_Params()
    steps = self.options['steps']
    self.loss_1 = self.loss_2 = self.loss_3 = 0.
    loss = 0.

    # for i in range(self.options['batch_size']):
    AA = K.sigmoid(frame)
    BB = K.sigmoid(frame_pos)
    CC = K.sigmoid(frame_neg)
    Code = AA * featurepart
    Code_pos = BB * featurepart_pos
    Code_neg = CC * featurepart_neg
    Code_ = (Code >= 0).astype('float32')
    Code_pos_ = (Code_pos >= 0).astype('float32')
    Code_neg_ = (Code_neg >= 0).astype('float32')
    # Code = Code / T.sqrt(T.sum(T.sqr(featurepart)))
    # Code_pos = Code_pos / T.sqrt(T.sum(T.sqr(Code_pos)))
    # Code_neg = Code_neg / T.sqrt(T.sum(T.sqr(Code_neg)))

    # Triplet hinge loss on the real-valued codes.
    self.loss_2 = T.max((0, 2. - T.sqrt(T.sum(T.sqr(Code - Code_neg))) / 32.
                         + T.sqrt(T.sum(T.sqr(Code - Code_pos))) / 32.))
    # Triplet hinge loss on the binarized codes, accumulated per frame.
    for i in range(32):
        self.loss_3 += T.max((0, 2. - T.sqrt(T.sum(T.sqr(Code_[0][i] - Code_neg_[0][i])))
                              + T.sqrt(T.sum(T.sqr(Code_[0][i] - Code_pos_[0][i])))))
    loss = self.loss_2 + 0.1 * self.loss_3

    # L2 weight decay on all parameters.
    for par in Params.values():
        loss += K.sum(K.square(par)) * self.options['l2_decay'] / 2.

    # def Regularize(Params):
    #     for par_name, par_value in Params:
    #         Params[par_name] += self.options['l2_decay'] * K.sum(K.mean(K.square(par_value.get_value()), axis=0))
    #     return Params
    # Params = Regularize(Params)
    # opt = optimizer.Adam(self.params, lr=self.options['lrate'])
    # updates = opt.get_updates(self.params, loss)
    # train_graph = theano.function([fea, att, pos_fea, pos_att, neg_fea, neg_att], loss, on_unused_input='warn', allow_input_downcast=True)
    # self.test_graph = theano.function([fea, att, pos_fea, pos_att, neg_fea, neg_att], loss, on_unused_input='warn')

    my_H_last = Code
    encoder = theano.function([fea, att, pos_fea, pos_att, neg_fea, neg_att],
                              my_H_last, on_unused_input='ignore',
                              allow_input_downcast=True)
    return loss, fea, att, pos_fea, pos_att, neg_fea, neg_att, Code, encoder
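# The core of self.loss_2 above is a margin-based triplet loss on the codes; a
# plain-numpy illustration with toy values (the code length 32 and the margin
# of 2 follow the snippet, the random vectors are made up):
import numpy as np

anchor, positive, negative = np.random.randn(3, 32)
d_pos = np.sqrt(np.sum((anchor - positive) ** 2)) / 32.
d_neg = np.sqrt(np.sum((anchor - negative) ** 2)) / 32.
loss_2 = max(0., 2. - d_neg + d_pos)  # zero once d_neg exceeds d_pos by the margin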
def step(self, cell_p, hid_p, mean_p, std_p):
    # Soft attention over the 10 attribute vectors, conditioned on the previous hidden state.
    embed = T.reshape(T.dot(self.attribute[:, 0], self.params['W_ctx_3']),
                      [self.batch_size, 10])
    hidP = T.dot(hid_p, self.params['W_ctx_2'])                                  # (25, 10)
    embedd = T.repeat(self.params['W_ctx_1'], self.batch_size, 0) * T.tanh(
        embed + hidP + T.repeat(self.params['b_ctx'], self.batch_size, 0))       # (25, 10)
    alpha_base = T.reshape(T.exp(embedd), [self.batch_size, 10, 1])              # (25, 10, 1)
    alpha_base = alpha_base / alpha_base.sum()
    att = T.reshape(self.attribute[:, 0], [self.batch_size, 10, self.att_frame])
    ctx = (alpha_base * att
           / T.reshape(alpha_base.sum(axis=1), [self.batch_size, 1, 1])).sum(axis=1)  # (25, 300)
    ctx = T.reshape(ctx, [self.batch_size, self.att_frame])
    # ctx += T.dot(hid_p, self.params['W_att']) + T.repeat(self.params['b_att'], self.batch_size, 0)

    # Input and recurrent contributions to all gates, computed in one block.
    input_to = T.dot(ctx, self.params['W_in']) + T.repeat(self.params['b'],
                                                          self.batch_size, 0)   # (25, 2048)
    # input_to_i = T.dot(ctx, self.params['W_in_i']) + T.repeat(self.params['b_i'], self.batch_size, 0)
    # input_to_f = T.dot(ctx, self.params['W_in_f']) + T.repeat(self.params['b_f'], self.batch_size, 0)
    # input_to_o = T.dot(ctx, self.params['W_in_o']) + T.repeat(self.params['b_o'], self.batch_size, 0)
    # input_to_c = T.dot(ctx, self.params['W_in_c']) + T.repeat(self.params['b_c'], self.batch_size, 0)
    gate = input_to + T.dot(hid_p, self.params['W_hid'])
    # gate_i = input_to_i + T.dot(hid_p, self.params['W_hid_i'])
    # gate_f = input_to_f + T.dot(hid_p, self.params['W_hid_f'])
    # gate_o = input_to_o + T.dot(hid_p, self.params['W_hid_o'])
    # gate_c = input_to_c + T.dot(hid_p, self.params['W_hid_c'])

    # Apply nonlinearities; W_cell holds the peephole weights.
    ingate = T.nnet.sigmoid(self._slice(gate, 0, self.hidden_dim)
                            + cell_p * T.repeat(self.params['W_cell'][0], self.batch_size, 0))
    forgetgate = T.nnet.sigmoid(self._slice(gate, 1, self.hidden_dim)
                                + cell_p * T.repeat(self.params['W_cell'][1], self.batch_size, 0))
    cell_input = T.tanh(self._slice(gate, 2, self.hidden_dim))

    # Compute new cell value.
    cell = forgetgate * cell_p + ingate * cell_input

    # Batch normalization of the cell state.
    input_shape = (self.batch_size, self.hidden_dim)            # (1, 512)
    cell = K.reshape(cell, input_shape)
    reduction_axes = list(range(len(input_shape)))              # [0, 1]
    del reduction_axes[self.axis_bn]                            # [0]
    broadcast_shape = [1] * len(input_shape)                    # [1, 1]
    broadcast_shape[self.axis_bn] = input_shape[self.axis_bn]   # [1, 512]
    # m = K.mean(cell, axis=reduction_axes)
    m = K.mean(cell, axis=0)
    broadcast_m = K.reshape(m, [1, self.hidden_dim])            # (1, 512)
    # broadcast_m = m
    std = K.mean(K.square(cell - broadcast_m) + self.epsilon, axis=reduction_axes)
    std = K.sqrt(std)                                           # batch std of the cell state
    broadcast_std = K.reshape(std, broadcast_shape)             # (1, 512)
    mean_update = self.momentum * mean_p + (1 - self.momentum) * m     # (1, 512)
    std_update = self.momentum * std_p + (1 - self.momentum) * std     # (1, 512)
    cell_normed = (cell - broadcast_m) / (broadcast_std + self.epsilon)
    cell_bn = (K.reshape(self.params['gamma'], broadcast_shape) * cell_normed
               + K.reshape(self.params['beta'], broadcast_shape))       # (1, 512)
    # cell_bn, mean, std = batchnorm(cell, self.batch_size, self.hidden_dim,
    #                                self.params['gamma'], self.params['beta'],
    #                                mean_p, std_p, train=True)

    outgate = T.nnet.sigmoid(self._slice(gate, 3, self.hidden_dim)
                             + cell_bn * T.repeat(self.params['W_cell'][2], self.batch_size, 0))

    # Compute new hidden unit activation.
    hid = outgate * T.tanh(cell_bn)

    return (T.reshape(cell_bn, [self.batch_size, self.hidden_dim]),
            T.reshape(hid, [self.batch_size, self.hidden_dim]),
            mean_update, std_update)
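# Assumed usage of step (not shown in this snippet): in a Theano LSTM of this
# style the step function is typically iterated with theano.scan, carrying the
# cell state, hidden state and batch-norm statistics as recurrent outputs.
# The initial values (cell0, hid0, mean0, std0) and the step count below are
# hypothetical names used only for illustration.
#
# outputs, scan_updates = theano.scan(
#     fn=self.step,
#     outputs_info=[cell0, hid0, mean0, std0],
#     n_steps=self.options['steps'])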