def get_updates(self, params, gparams):
    """Build Adam update rules for `params` given gradients `gparams`.

    Returns a list of (shared_variable, new_value) pairs suitable for a
    Theano-style `updates` argument: first/second-moment updates, the
    parameter updates, and the iteration-counter increment.
    """
    # BUG FIX: previously the moment accumulators were re-created on every
    # call, silently wiping the optimizer state if get_updates() was called
    # more than once.  Initialize them only once (same guard pattern as the
    # other optimizers in this file).  getattr() also covers the case where
    # __init__ never created the attributes.
    if not getattr(self, '_ms_', None):
        self._ms_ = []
        self._vs_ = []
        for param in params:
            self._ms_.append(K.shared(np.zeros_like(param.get_value())))
            self._vs_.append(K.shared(np.zeros_like(param.get_value())))
    updates = []
    t = self._iter_ + 1
    # Bias-corrected step size (efficient form of Adam's alpha_t; see
    # Kingma & Ba, "Adam", Algorithm 1).
    alpha_t = self._alpha_ * (K.sqrt(1. - K.power(self._beta2_, t)) /
                              (1. - K.power(self._beta1_, t)))
    for p, g, m, v in zip(params, gparams, self._ms_, self._vs_):
        m_new = self._beta1_ * m + (1. - self._beta1_) * g
        updates.append((m, m_new))
        v_new = self._beta2_ * v + (1. - self._beta2_) * K.sqr(g)
        updates.append((v, v_new))
        p_new = p - alpha_t * m_new / (K.sqrt(v_new) + self._eps_)
        updates.append((p, p_new))
    updates.append((self._iter_, self._iter_ + 1))
    return updates
def call(self, x, mask=None):
    """Cosine-normalized 2-D convolution.

    Both the kernel (together with the bias) and each input patch are
    L2-normalized before the convolution, so the raw response is a cosine
    similarity; the result is then passed through `self.activation`.
    """
    # b / xb stay 0. when use_bias is False, so the bias terms below vanish.
    b, xb = 0., 0.
    if self.data_format == 'channels_first':
        kernel_sum_axes = [1, 2, 3]
        if self.use_bias:
            b = K.reshape(self.b, (self.filters, 1, 1, 1))
            xb = 1.
    elif self.data_format == 'channels_last':
        kernel_sum_axes = [0, 1, 2]
        if self.use_bias:
            b = K.reshape(self.b, (1, 1, 1, self.filters))
            xb = 1.
    # NOTE(review): an unrecognized data_format only raises ValueError later
    # (inside the use_bias branch); before that, kernel_sum_axes would be
    # unbound here — confirm data_format is validated upstream.
    # ||W||: per-filter kernel norm, bias folded in as one extra coordinate.
    Wnorm = K.sqrt(
        K.sum(K.square(self.W), axis=kernel_sum_axes, keepdims=True) +
        K.square(b) + K.epsilon())
    # ||x||: per-patch input norm, computed by convolving x^2 with an
    # all-ones kernel (self.kernel_norm); xb adds the bias coordinate.
    xnorm = K.sqrt(
        K.conv2d(K.square(x), self.kernel_norm, strides=self.strides,
                 padding=self.padding, data_format=self.data_format,
                 filter_shape=self.kernel_norm_shape) + xb + K.epsilon())
    W = self.W / Wnorm
    output = K.conv2d(x, W, strides=self.strides, padding=self.padding,
                      data_format=self.data_format,
                      filter_shape=self.kernel_shape)
    if K.backend() == 'theano':
        # Theano needs the channel axis explicitly marked broadcastable
        # before the element-wise division below.
        xnorm = K.pattern_broadcast(xnorm, [False, True, False, False])
    output /= xnorm
    if self.use_bias:
        # Bias is normalized by the same factors as the main response.
        b /= Wnorm
        if self.data_format == 'channels_first':
            b = K.reshape(b, (1, self.filters, 1, 1))
        elif self.data_format == 'channels_last':
            b = K.reshape(b, (1, 1, 1, self.filters))
        else:
            raise ValueError('Invalid data_format:', self.data_format)
        b /= xnorm
        output += b
    output = self.activation(output)
    return output
def get_updates(self, params, gparams):
    """Build Adam update rules with explicit bias correction.

    Lazily creates the moment accumulators on first call, then returns
    (shared_variable, new_value) pairs for the parameters, both moment
    estimates, and the epoch counter.
    """
    if not self._ms:
        for param in params:
            self._ms += [K.shared(np.zeros_like(param.get_value()))]
            self._vs += [K.shared(np.zeros_like(param.get_value()))]
    update_params = []
    update_ms = []
    update_vs = []
    # BUG FIX: the original used Python-2-only `xrange`, which raises
    # NameError on Python 3.  Iterate the four aligned sequences with zip
    # instead of indexing.
    for p, g, m, v in zip(params, gparams, self._ms, self._vs):
        m_new = self._beta1 * m + (1 - self._beta1) * g
        v_new = self._beta2 * v + (1 - self._beta2) * g**2
        # Bias-corrected moment estimates (Kingma & Ba, Algorithm 1).
        m_unbias = m_new / (1 - K.power(self._beta1, self._epoch))
        v_unbias = v_new / (1 - K.power(self._beta2, self._epoch))
        param_new = p - self._alpha * m_unbias / (K.sqrt(v_unbias) + self._eps)
        update_ms += [(m, m_new)]
        update_vs += [(v, v_new)]
        update_params += [(p, param_new)]
    update_epoch = [(self._epoch, self._epoch + 1.)]
    updates = update_params + update_ms + update_vs + update_epoch
    return updates
def set_output(self, X, train=False):
    """Batch-normalize `X` over the feature axis of an LSTM layer.

    In training mode the batch statistics are used and exponential
    moving-average updates for the running mean/std are recorded in
    `self.updates`; in inference mode the stored running statistics
    are used instead.  Returns gamma * normalized + beta.
    """
    shape = (self.batch_size, self.num_lstm)
    # Average over every axis except the feature axis.
    mean_axes = [ax for ax in range(len(shape)) if ax != self.axis]
    bcast = [1] * len(shape)
    bcast[self.axis] = shape[self.axis]
    if train:
        batch_mean = K.mean(X, axis=mean_axes)
        mu = K.reshape(batch_mean, bcast)
        # epsilon is added inside the mean before the sqrt, matching the
        # layer's original formulation.
        batch_std = K.sqrt(
            K.mean(K.square(X - mu) + self.epsilon, axis=mean_axes))
        sigma = K.reshape(batch_std, bcast)
        # Exponential-moving-average updates for the running statistics.
        self.updates = [
            (self.running_mean,
             self.momentum * self.running_mean +
             (1 - self.momentum) * batch_mean),
            (self.running_std,
             self.momentum * self.running_std +
             (1 - self.momentum) * batch_std),
        ]
        normed = (X - mu) / (sigma + self.epsilon)
    else:
        mu = K.reshape(self.running_mean, bcast)
        sigma = K.reshape(self.running_std, bcast)
        normed = (X - mu) / (sigma + self.epsilon)
    return K.reshape(self.gamma, bcast) * normed + K.reshape(self.beta, bcast)
def set_output(self, X, train=False):
    """Batch-normalize `X` over the LSTM feature axis.

    train=True: normalize with batch statistics and record
    moving-average updates for the running mean/std in `self.updates`.
    train=False: normalize with the stored running statistics.
    Returns gamma * normalized + beta.
    """
    input_shape = (self.batch_size, self.num_lstm)
    # Reduce over every axis except the feature axis `self.axis`.
    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[self.axis]
    # Shape that broadcasts the per-feature statistics back over X.
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[self.axis] = input_shape[self.axis]
    if train:
        m = K.mean(X, axis=reduction_axes)
        brodcast_m = K.reshape(m, broadcast_shape)
        # epsilon is added inside the mean, before the sqrt.
        std = K.mean(K.square(X - brodcast_m) + self.epsilon,
                     axis=reduction_axes)
        std = K.sqrt(std)
        brodcast_std = K.reshape(std, broadcast_shape)
        # Exponential moving averages of the batch statistics.
        mean_update = self.momentum * self.running_mean + (1-self.momentum) * m
        std_update = self.momentum * self.running_std + (1-self.momentum) * std
        self.updates = [(self.running_mean, mean_update),
                        (self.running_std, std_update)]
        X_normed = (X - brodcast_m) / (brodcast_std + self.epsilon)
    else:
        brodcast_m = K.reshape(self.running_mean, broadcast_shape)
        brodcast_std = K.reshape(self.running_std, broadcast_shape)
        X_normed = ((X - brodcast_m) / (brodcast_std + self.epsilon))
    out = K.reshape(self.gamma, broadcast_shape) * X_normed + K.reshape(
        self.beta, broadcast_shape)
    return out
def get_updates(self, params, loss):
    """Adam updates with a per-step decay of beta_1 via `self.lda`.

    Computes gradients of `loss` w.r.t. `params` and returns
    (shared_variable, new_value) pairs for the iteration counter, the
    moment estimates `self.m`/`self.v`, and the parameters.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [(self.iterations, self.iterations+1.)]
    t = self.iterations + 1
    # Bias-correction factors; beta_2t also rescales epsilon below.
    beta_2t = K.sqrt(1 - K.pow(self.beta_2, t))
    lr_t = self.lr * beta_2t / (1 - K.pow(self.beta_1, t))
    for p, g, m, v in zip(params, grads, self.m, self.v):
        # beta_1 decayed by lda^(t-1), as in the original Adam paper's
        # lambda schedule.
        beta_1t = self.beta_1 * K.pow(self.lda, t-1)
        m_t = (beta_1t * m) + (1 - beta_1t) * g
        v_t = (self.beta_2 * v) + (1 - self.beta_2) * K.square(g)
        p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon * beta_2t)
        self.updates.append((m, m_t))
        self.updates.append((v, v_t))
        self.updates.append((p, p_t))
    return self.updates
def get_updates(self, params, loss):
    """Adam updates with a per-step decay of beta_1 via `self.lda`.

    Computes gradients of `loss` w.r.t. `params` and returns
    (shared_variable, new_value) pairs for the iteration counter,
    the moment estimates `self.m`/`self.v`, and the parameters.
    """
    grads = self.get_gradients(loss, params)
    self.updates = [(self.iterations, self.iterations + 1.)]
    step = self.iterations + 1
    # Bias-correction factors shared by every parameter this step;
    # corr2 also rescales epsilon in the denominator below.
    corr2 = K.sqrt(1 - K.pow(self.beta_2, step))
    lr_t = self.lr * corr2 / (1 - K.pow(self.beta_1, step))
    # beta_1 decayed by lda^(t-1) — loop-invariant, computed once.
    decayed_b1 = self.beta_1 * K.pow(self.lda, step - 1)
    for param, grad, m_prev, v_prev in zip(params, grads, self.m, self.v):
        m_t = decayed_b1 * m_prev + (1 - decayed_b1) * grad
        v_t = self.beta_2 * v_prev + (1 - self.beta_2) * K.square(grad)
        new_param = param - lr_t * m_t / (K.sqrt(v_t) + self.epsilon * corr2)
        self.updates += [(m_prev, m_t), (v_prev, v_t), (param, new_param)]
    return self.updates
def get_updates(self, params, gparams):
    """Build Adadelta update rules (Zeiler 2012).

    Maintains running averages of squared gradients and squared parameter
    deltas, and returns (shared_variable, new_value) pairs for both
    accumulators and the parameters.
    """
    # BUG FIX: previously both accumulator lists were re-created on every
    # call, wiping the optimizer state if get_updates() was called more
    # than once.  Initialize lazily instead (same pattern as the other
    # optimizers in this file); getattr() also covers the case where
    # __init__ never created the attributes.
    if not getattr(self, '_accumulators_', None):
        self._accumulators_ = []
        self._delta_accumulators_ = []
        for param in params:
            self._accumulators_.append(
                K.shared(np.zeros_like(param.get_value())))
            self._delta_accumulators_.append(
                K.shared(np.zeros_like(param.get_value())))
    updates = []
    for p, g, a, d_a in zip(params, gparams,
                            self._accumulators_, self._delta_accumulators_):
        # Running average of squared gradients.
        a_new = self._rou_ * a + (1. - self._rou_) * K.sqr(g)
        updates.append((a, a_new))
        # Step scaled by RMS(delta x)_{t-1} / RMS(g)_t.
        p_delta = -g * K.sqrt(d_a + self._eps_) / K.sqrt(a_new + self._eps_)
        p_new = p + p_delta
        updates.append((p, p_new))
        # Running average of squared parameter deltas.
        d_a_new = self._rou_ * d_a + (1. - self._rou_) * K.sqr(p_delta)
        updates.append((d_a, d_a_new))
    return updates
def get_updates(self, params, gparams):
    """Build Adagrad update rules.

    Accumulates squared gradients per parameter and returns
    (shared_variable, new_value) pairs for the accumulators and the
    parameters.
    """
    # BUG FIX: previously the accumulator list was re-created on every
    # call, wiping the optimizer state if get_updates() was called more
    # than once.  Initialize lazily instead, matching the guarded Adagrad
    # variant elsewhere in this file; getattr() also covers the case where
    # __init__ never created the attribute.
    if not getattr(self, '_accumulators_', None):
        self._accumulators_ = []
        for param in params:
            self._accumulators_.append(
                K.shared(np.zeros_like(K.get_value(param))))
    updates = []
    for p, g, a in zip(params, gparams, self._accumulators_):
        a_new = a + K.sqr(g)
        p_new = p - self._lr_ * g / (K.sqrt(a_new) + self._eps_)
        updates.append((a, a_new))
        updates.append((p, p_new))
    return updates
def get_updates(self, params, gparams):
    """Build Adagrad update rules.

    Lazily creates the squared-gradient accumulators `self._Gs` on first
    call, then returns (shared_variable, new_value) pairs for the
    parameters followed by the accumulators.
    """
    if len(self._Gs) == 0:
        for param in params:
            self._Gs.append(K.shared(np.zeros_like(K.get_value(param))))
    update_params = []
    update_Gs = []
    # BUG FIX: the original used Python-2-only `xrange`, which raises
    # NameError on Python 3.  Iterate the aligned sequences with zip.
    for p, g, G in zip(params, gparams, self._Gs):
        G_new = G + g**2
        update_Gs.append((G, G_new))
        update_params.append(
            (p, p - self._lr * g / K.sqrt(G_new + self._eps)))
    return update_params + update_Gs
def batchnorm(X, batch_size, hidden_dim, gamma, beta, running_mean,
              running_std, epsilon=1e-10, axis=1, momentum=0.99, train=False):
    """Batch-normalize `X` reshaped to (batch_size, hidden_dim).

    train=True: normalize with batch statistics and return
    moving-average updates for the running mean/std.
    train=False: normalize with the supplied running statistics; the
    running statistics are returned unchanged.

    Returns (out, mean_update, std_update) where
    out = gamma * normalized + beta.
    """
    X = K.reshape(X, (batch_size, hidden_dim))
    input_shape = (batch_size, hidden_dim)  # e.g. (1, 512)
    # Reduce over every axis except the feature axis `axis`.
    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[axis]
    # Shape that broadcasts the per-feature statistics back over X.
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[axis] = input_shape[axis]
    if train:
        m = K.mean(X, axis=reduction_axes)
        brodcast_m = K.reshape(m, broadcast_shape)
        # epsilon is added inside the mean, before the sqrt.
        std = K.mean(K.square(X - brodcast_m) + epsilon, axis=reduction_axes)
        std = K.sqrt(std)
        brodcast_std = K.reshape(std, broadcast_shape)
        # Exponential moving averages of the batch statistics.
        mean_update = momentum * running_mean + (1 - momentum) * m
        std_update = momentum * running_std + (1 - momentum) * std
        X_normed = (X - brodcast_m) / (brodcast_std + epsilon)
    else:
        brodcast_m = K.reshape(running_mean, broadcast_shape)
        brodcast_std = K.reshape(running_std, broadcast_shape)
        X_normed = ((X - brodcast_m) / (brodcast_std + epsilon))
        # BUG FIX: mean_update/std_update were previously unbound in this
        # branch, so calling with train=False raised NameError at the
        # return statement.  In inference mode the running statistics are
        # simply passed through unchanged.
        mean_update = running_mean
        std_update = running_std
    out = K.reshape(gamma, broadcast_shape) * X_normed + K.reshape(
        beta, broadcast_shape)
    return out, mean_update, std_update
def test_knn_cpu(algorithm, dist):
    """Exercise dgl.nn.KNNGraph / SegmentedKNNGraph on CPU.

    Builds k-NN graphs from random points under euclidean or cosine
    distance and checks the in-edges of every node against a brute-force
    top-k over the pairwise distance matrix `d`; also covers k larger
    than the point count (warning), k == 0 and empty input (errors).
    """
    x = th.randn(8, 3).to(F.cpu())
    kg = dgl.nn.KNNGraph(3)
    if dist == 'euclidean':
        d = th.cdist(x, x).to(F.cpu())
    else:
        # Cosine distance: shift points off the origin, then 1 - cos sim
        # of the L2-normalized points.
        x = x + th.randn(1).item()
        tmp_x = x / (1e-5 + F.sqrt(F.sum(x * x, dim=1, keepdims=True)))
        d = 1 - F.matmul(tmp_x, tmp_x.T).to(F.cpu())

    def check_knn(g, x, start, end, k):
        # Each node in [start, end) must receive edges from exactly the
        # k nearest points of its own segment (closure over `d`).
        assert g.device == x.device
        for v in range(start, end):
            src, _ = g.in_edges(v)
            src = set(src.numpy())
            i = v - start
            src_ans = set(
                th.topk(d[start:end, start:end][i], k,
                        largest=False)[1].numpy() + start)
            assert src == src_ans

    # check knn with 2d input
    g = kg(x, algorithm, dist)
    check_knn(g, x, 0, 8, 3)
    # check knn with 3d input (batched as 2 segments of 4 points)
    g = kg(x.view(2, 4, 3), algorithm, dist)
    check_knn(g, x, 0, 4, 3)
    check_knn(g, x, 4, 8, 3)
    # check segmented knn
    kg = dgl.nn.SegmentedKNNGraph(3)
    g = kg(x, [3, 5], algorithm, dist)
    check_knn(g, x, 0, 3, 3)
    check_knn(g, x, 3, 8, 3)
    # check k > num_points: should warn and fall back to k = segment size
    kg = dgl.nn.KNNGraph(10)
    with pytest.warns(DGLWarning):
        g = kg(x, algorithm, dist)
    check_knn(g, x, 0, 8, 8)
    with pytest.warns(DGLWarning):
        g = kg(x.view(2, 4, 3), algorithm, dist)
    check_knn(g, x, 0, 4, 4)
    check_knn(g, x, 4, 8, 4)
    kg = dgl.nn.SegmentedKNNGraph(5)
    with pytest.warns(DGLWarning):
        g = kg(x, [3, 5], algorithm, dist)
    check_knn(g, x, 0, 3, 3)
    check_knn(g, x, 3, 8, 3)
    # check k == 0: invalid, must raise
    kg = dgl.nn.KNNGraph(0)
    with pytest.raises(DGLError):
        g = kg(x, algorithm, dist)
    kg = dgl.nn.SegmentedKNNGraph(0)
    with pytest.raises(DGLError):
        g = kg(x, [3, 5], algorithm, dist)
    # check empty input: must raise
    x_empty = th.tensor([])
    kg = dgl.nn.KNNGraph(3)
    with pytest.raises(DGLError):
        g = kg(x_empty, algorithm, dist)
    kg = dgl.nn.SegmentedKNNGraph(3)
    with pytest.raises(DGLError):
        g = kg(x_empty, [3, 5], algorithm, dist)
def step(self, cell_p, hid_p, mean_p, std_p):
    """One recurrence step of an attention LSTM with batch-normalized cell.

    Takes the previous cell state, hidden state and running batchnorm
    mean/std, and returns (cell_bn, hid, mean_update, std_update).
    Shape comments below are from the original author (e.g. "(25, 10)");
    they assume batch_size == 25 — TODO confirm.
    """
    # Attention over the 10 attribute slots, conditioned on hid_p.
    embed = T.reshape(T.dot(self.attribute[:, 0], self.params['W_ctx_3']),
                      [self.batch_size, 10])
    hidP = T.dot(hid_p, self.params['W_ctx_2'])  # (25, 10)
    embedd = T.repeat(self.params['W_ctx_1'], self.batch_size, 0) * T.tanh(
        embed + hidP + T.repeat(self.params['b_ctx'], self.batch_size, 0))  # (25, 10)
    alpha_base = T.reshape(T.exp(embedd), [self.batch_size, 10, 1])  # (25, 10, 1)
    # NOTE(review): alpha_base is normalized by its grand total here and
    # then again per-row inside the ctx expression below — looks redundant
    # (a softmax would need only the per-row sum); verify intent.
    alpha_base = alpha_base / alpha_base.sum()
    att = T.reshape(self.attribute[:, 0],
                    [self.batch_size, 10, self.att_frame])
    # Attention-weighted context vector over the attribute frames.
    ctx = (alpha_base * att / T.reshape(alpha_base.sum(axis=1),
                                        [self.batch_size, 1, 1])).sum(axis=1)  # (25, 300)
    ctx = T.reshape(ctx, [self.batch_size, self.att_frame])
    # ctx += T.dot(hid_p, self.params['W_att']) + T.repeat(self.params['b_att'], self.batch_size, 0)
    # Fused input projection for all four LSTM gates (i, f, c, o).
    input_to = T.dot(ctx, self.params['W_in']) + T.repeat(
        self.params['b'], self.batch_size, 0)  # (25, 2048)
    # input_to_i = T.dot(ctx, self.params['W_in_i']) + T.repeat(self.params['b_i'], self.batch_size, 0)
    # input_to_f = T.dot(ctx, self.params['W_in_f']) + T.repeat(self.params['b_f'], self.batch_size, 0)
    # input_to_o = T.dot(ctx, self.params['W_in_o']) + T.repeat(self.params['b_o'], self.batch_size, 0)
    # input_to_c = T.dot(ctx, self.params['W_in_c']) + T.repeat(self.params['b_c'], self.batch_size, 0)
    gate = input_to + T.dot(hid_p, self.params['W_hid'])
    # gate_i = input_to_i + T.dot(hid_p, self.params['W_hid_i'])
    # gate_f = input_to_f + T.dot(hid_p, self.params['W_hid_f'])
    # gate_o = input_to_o + T.dot(hid_p, self.params['W_hid_o'])
    # gate_c = input_to_c + T.dot(hid_p, self.params['W_hid_c'])
    # Apply nonlinearities; W_cell[0..2] are peephole weights on the cell.
    ingate = T.nnet.sigmoid(
        self._slice(gate, 0, self.hidden_dim) +
        cell_p * T.repeat(self.params['W_cell'][0], self.batch_size, 0))
    forgetgate = T.nnet.sigmoid(
        self._slice(gate, 1, self.hidden_dim) +
        cell_p * T.repeat(self.params['W_cell'][1], self.batch_size, 0))
    cell_input = T.tanh(self._slice(gate, 2, self.hidden_dim))
    # Compute new cell value
    cell = forgetgate * cell_p + ingate * cell_input
    # BatchNormalization applied to the new cell state (inline version of
    # the standalone `batchnorm` helper in this file, train-mode only).
    input_shape = (self.batch_size, self.hidden_dim)  # (1, 512)
    cell = K.reshape(cell, input_shape)
    reduction_axes = list(range(len(input_shape)))  # [0, 1]
    del reduction_axes[self.axis_bn]  # [0]
    broadcast_shape = [1] * len(input_shape)  # [1, 1]
    broadcast_shape[self.axis_bn] = input_shape[self.axis_bn]  # [1, 512]
    # m = K.mean(cell, axis=reduction_axes)  # m.shape = (1, 512), note that if matrix is 1-d then mean function will return one number even if axis=0
    m = K.mean(cell, axis=0)
    brodcast_m = K.reshape(m, [1, self.hidden_dim])  # m.shape = (1, 512)
    # brodcast_m = m
    # epsilon is added inside the mean, before the sqrt.
    std = K.mean(K.square(cell - brodcast_m) + self.epsilon,
                 axis=reduction_axes)  # batchnormed m(m**2)
    std = K.sqrt(std)  # batchnormed m, (1, 512)
    brodcast_std = K.reshape(std, broadcast_shape)  # (1, 512)
    # Exponential moving averages of the batch statistics, returned to the
    # scan loop so they persist across steps.
    mean_update = self.momentum * mean_p + (1 - self.momentum) * m  # (1, 512)
    std_update = self.momentum * std_p + (1 - self.momentum) * std  # (1, 512)
    cell_normed = (cell - brodcast_m) / (brodcast_std + self.epsilon)  # (1, 512)
    cell_bn = K.reshape(
        self.params['gamma'], broadcast_shape) * cell_normed + K.reshape(
            self.params['beta'], broadcast_shape)  # (1, 512)
    # cell_bn, mean, std = batchnorm(cell, self.batch_size, self.hidden_dim, self.params['gamma'], self.params['beta'], mean_p, std_p, train=True)
    # Output gate peeps at the batch-normalized cell (W_cell[2]).
    outgate = T.nnet.sigmoid(
        self._slice(gate, 3, self.hidden_dim) +
        cell_bn * T.repeat(self.params['W_cell'][2], self.batch_size, 0))
    # Compute new hidden unit activation
    hid = outgate * T.tanh(cell_bn)
    return T.reshape(
        cell_bn, [self.batch_size, self.hidden_dim]), T.reshape(
            hid, [self.batch_size, self.hidden_dim]), mean_update, std_update