def test_topk_nodes():
    # test#1: basic
    g0 = dgl.DGLGraph(nx.path_graph(14))
    feat0 = F.randn((g0.number_of_nodes(), 10))
    g0.ndata['x'] = feat0
    # to test the case where k > number of nodes.
    dgl.topk_nodes(g0, 'x', 20, idx=-1)
    # test correctness
    val, indices = dgl.topk_nodes(g0, 'x', 5, idx=-1)
    ground_truth = F.reshape(
        F.argsort(F.slice_axis(feat0, -1, 9, 10), 0, True)[:5], (5,))
    assert F.allclose(ground_truth, indices)
    g0.ndata.pop('x')

    # test#2: batched graph
    g1 = dgl.DGLGraph(nx.path_graph(12))
    feat1 = F.randn((g1.number_of_nodes(), 10))
    bg = dgl.batch([g0, g1])
    bg.ndata['x'] = F.cat([feat0, feat1], 0)
    # to test the case where k > number of nodes.
    dgl.topk_nodes(bg, 'x', 16, idx=1)
    # test correctness
    val, indices = dgl.topk_nodes(bg, 'x', 6, descending=False, idx=0)
    ground_truth_0 = F.reshape(
        F.argsort(F.slice_axis(feat0, -1, 0, 1), 0, False)[:6], (6,))
    ground_truth_1 = F.reshape(
        F.argsort(F.slice_axis(feat1, -1, 0, 1), 0, False)[:6], (6,))
    ground_truth = F.stack([ground_truth_0, ground_truth_1], 0)
    assert F.allclose(ground_truth, indices)

    # test idx=None
    val, indices = dgl.topk_nodes(bg, 'x', 6, descending=True)
    assert F.allclose(
        val, F.stack([F.topk(feat0, 6, 0), F.topk(feat1, 6, 0)], 0))
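
# A NumPy sketch of the ground truth computed in the test above: topk_nodes
# with idx=j ranks nodes by the j-th feature column and returns the top-k
# values and node indices.  This is an independent illustration, not DGL code.
import numpy as np

feat = np.random.randn(14, 10)
k, idx = 5, -1
order = np.argsort(feat[:, idx])[::-1]        # descending order
print(order[:k], feat[order[:k], idx])        # top-k indices and values
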
def test_edge_softmax(g, norm_by, shp, idtype):
    g = g.astype(idtype).to(F.ctx())
    edata = F.tensor(np.random.rand(g.number_of_edges(), *shp))
    e1 = F.attach_grad(F.clone(edata))

    with F.record_grad():
        score1 = edge_softmax(g, e1, norm_by=norm_by)
        F.backward(F.reduce_sum(score1))
        grad_edata = F.grad(e1)

    with F.record_grad():
        e2 = F.attach_grad(F.clone(edata))
        # This reshape assumes the test graph is complete bipartite, so the
        # edge scores can be laid out on a (num_src, num_dst) grid and the
        # grouped softmax becomes an ordinary softmax along one axis.
        e2_2d = F.reshape(
            e2,
            (g.number_of_src_nodes(), g.number_of_dst_nodes(), *e2.shape[1:]))
        if norm_by == 'src':
            score2 = F.softmax(e2_2d, 1)
            score2 = F.reshape(score2, (-1, *e2.shape[1:]))
        if norm_by == 'dst':
            score2 = F.softmax(e2_2d, 0)
            score2 = F.reshape(score2, (-1, *e2.shape[1:]))
        assert F.allclose(score1, score2)
        print('forward passed')

        F.backward(F.reduce_sum(score2))
        assert F.allclose(F.grad(e2), grad_edata)
        print('backward passed')
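
# A minimal NumPy reference for the semantics edge_softmax is tested against:
# with norm_by='dst', scores on edges pointing to the same destination node
# are normalized together.  This is an independent sketch, not DGL code; the
# edge list below is invented for illustration.
import numpy as np

def edge_softmax_ref(dst, scores):
    """Softmax over edge scores, grouped by destination node id."""
    out = np.empty_like(scores)
    for node in np.unique(dst):
        mask = dst == node
        e = np.exp(scores[mask] - scores[mask].max())  # stabilized softmax
        out[mask] = e / e.sum()
    return out

# Three edges into node 0 and one edge into node 1.
dst = np.array([0, 0, 0, 1])
print(edge_softmax_ref(dst, np.array([1.0, 2.0, 3.0, 0.5])))
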
def step(self, cell_p, hid_p):
    embed = T.reshape(T.dot(self.attribute[:, 0], self.params['W_ctx_3']),
                      [self.batch_size, 10])
    hidP = T.dot(hid_p, self.params['W_ctx_2'])  # (25, 10)
    embedd = T.repeat(self.params['W_ctx_1'], self.batch_size, 0) * T.tanh(
        embed + hidP +
        T.repeat(self.params['b_ctx'], self.batch_size, 0))  # (25, 10)
    alpha_base = T.reshape(T.exp(embedd),
                           [self.batch_size, 10, 1])  # (25, 10, 1)
    alpha_base = alpha_base / alpha_base.sum()
    att = T.reshape(self.attribute[:, 0],
                    [self.batch_size, 10, self.att_frame])
    ctx = (alpha_base * att / T.reshape(
        alpha_base.sum(axis=1), [self.batch_size, 1, 1])).sum(axis=1)  # (25, 300)
    ctx = T.reshape(ctx, [self.batch_size, self.att_frame])
    # ctx += T.dot(hid_p, self.params['W_att']) + T.repeat(self.params['b_att'], self.batch_size, 0)

    input_to = T.dot(ctx, self.params['W_in']) + T.repeat(
        self.params['b'], self.batch_size, 0)  # (25, 2048)
    gate = input_to + T.dot(hid_p, self.params['W_hid'])

    # Apply nonlinearities
    ingate = T.nnet.sigmoid(
        self._slice(gate, 0, self.hidden_dim) +
        cell_p * T.repeat(self.params['W_cell'][0], self.batch_size, 0))
    forgetgate = T.nnet.sigmoid(
        self._slice(gate, 1, self.hidden_dim) +
        cell_p * T.repeat(self.params['W_cell'][1], self.batch_size, 0))
    cell_input = T.tanh(self._slice(gate, 2, self.hidden_dim))

    # Compute new cell value
    cell = forgetgate * cell_p + ingate * cell_input

    # BatchNormalization
    # broadcast_m = K.reshape(mean_p, broadcast_shape)
    # broadcast_std = K.reshape(std_p, broadcast_shape)
    # cell_normed = ((cell - broadcast_m) /
    #                (broadcast_std + self.epsilon))
    broadcast_shape = [self.batch_size, self.hidden_dim]
    cell_bn = K.reshape(self.params['gamma'], broadcast_shape) * cell + \
        K.reshape(self.params['beta'], broadcast_shape)  # (1, 512)

    outgate = T.nnet.sigmoid(
        self._slice(gate, 3, self.hidden_dim) +
        cell_bn * T.repeat(self.params['W_cell'][2], self.batch_size, 0))

    # Compute new hidden unit activation
    hid = outgate * T.tanh(cell_bn)
    return T.reshape(cell_bn, [self.batch_size, self.hidden_dim]), T.reshape(
        hid, [self.batch_size, self.hidden_dim])
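
# A NumPy sketch of the attention pooling above.  The code folds the
# normalization into two divisions (a global one and a per-row one), but the
# global factor cancels, so the effect is an ordinary per-example softmax
# over the 10 attribute slots.  Shapes here are illustrative only.
import numpy as np

batch, slots, att_frame = 25, 10, 300
scores = np.random.randn(batch, slots)
att = np.random.randn(batch, slots, att_frame)

e = np.exp(scores - scores.max(axis=1, keepdims=True))
alpha = e / e.sum(axis=1, keepdims=True)      # (25, 10) attention weights
ctx = (alpha[:, :, None] * att).sum(axis=1)   # (25, 300) pooled context
print(ctx.shape)
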
def __call__(self, edges):
    sdata = edges.src[self.src_field]
    edata = edges.data[self.edge_field]
    # Due to the different broadcasting semantics of different backends,
    # we need to broadcast the sdata and edata to be of the same rank.
    rank = max(F.ndim(sdata), F.ndim(edata))
    sshape = F.shape(sdata)
    eshape = F.shape(edata)
    sdata = F.reshape(sdata, sshape + (1,) * (rank - F.ndim(sdata)))
    edata = F.reshape(edata, eshape + (1,) * (rank - F.ndim(edata)))
    ret = self.mul_op(sdata, edata)
    return {self.out_field: ret}
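
# A NumPy sketch of the rank-padding trick above: appending trailing
# singleton dimensions lets tensors of different rank broadcast predictably
# (NumPy's default alignment is from the trailing axes, which would fail
# here).  The shapes below are invented for illustration.
import numpy as np

sdata = np.ones((4, 3))        # e.g. per-source features, rank 2
edata = np.ones((4, 3, 5))     # e.g. per-edge features, rank 3
rank = max(sdata.ndim, edata.ndim)
sdata = sdata.reshape(sdata.shape + (1,) * (rank - sdata.ndim))  # (4, 3, 1)
print((sdata * edata).shape)   # (4, 3, 5)
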
def call(self, x, mask=None):
    b, xb = 0., 0.
    if self.data_format == 'channels_first':
        kernel_sum_axes = [1, 2, 3]
        if self.use_bias:
            b = K.reshape(self.b, (self.filters, 1, 1, 1))
            xb = 1.
    elif self.data_format == 'channels_last':
        kernel_sum_axes = [0, 1, 2]
        if self.use_bias:
            b = K.reshape(self.b, (1, 1, 1, self.filters))
            xb = 1.

    # Norm of each kernel (plus bias) and norm of each input patch; the
    # patch norms are computed with a convolution by an all-ones kernel.
    Wnorm = K.sqrt(
        K.sum(K.square(self.W), axis=kernel_sum_axes, keepdims=True) +
        K.square(b) + K.epsilon())
    xnorm = K.sqrt(
        K.conv2d(K.square(x),
                 self.kernel_norm,
                 strides=self.strides,
                 padding=self.padding,
                 data_format=self.data_format,
                 filter_shape=self.kernel_norm_shape) + xb + K.epsilon())

    W = self.W / Wnorm
    output = K.conv2d(x,
                      W,
                      strides=self.strides,
                      padding=self.padding,
                      data_format=self.data_format,
                      filter_shape=self.kernel_shape)
    if K.backend() == 'theano':
        xnorm = K.pattern_broadcast(xnorm, [False, True, False, False])
    output /= xnorm

    if self.use_bias:
        b /= Wnorm
        if self.data_format == 'channels_first':
            b = K.reshape(b, (1, self.filters, 1, 1))
        elif self.data_format == 'channels_last':
            b = K.reshape(b, (1, 1, 1, self.filters))
        else:
            raise ValueError('Invalid data_format:', self.data_format)
        b /= xnorm
        output += b

    output = self.activation(output)
    return output
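
# A NumPy sketch of the cosine normalization implemented above, reduced to a
# single receptive field: dividing the dot product by both the kernel norm
# and the patch norm yields the cosine similarity between them.  Shapes are
# invented for illustration.
import numpy as np

patch = np.random.rand(3, 3, 16)   # one input patch
W = np.random.rand(3, 3, 16)       # one filter
eps = 1e-7
cos = (patch * W).sum() / (np.linalg.norm(W) * np.linalg.norm(patch) + eps)
print(cos)  # always in [-1, 1]
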
def get_smallest_eigenpair(hes_val, eigvec_shape):
    """
    Get the smallest eigenvalue and its corresponding eigenvector
    of the input hes_val.
    """
    assert len(hes_val.shape) == 2 * len(eigvec_shape)
    assert np.array_equal(eigvec_shape, hes_val.shape[:len(eigvec_shape)])
    assert np.array_equal(eigvec_shape, hes_val.shape[len(eigvec_shape):])

    # get the eigenvector of the hessian matrix
    hes_val_mat = T.reshape(hes_val, (np.prod(eigvec_shape), -1))
    eigvals, eigvecs = T.eigh(hes_val_mat)
    # index for smallest eigenvalue
    idx = T.argmin(eigvals)
    eig_val = eigvals[idx]
    eigvec = T.reshape(eigvecs[:, idx], eigvec_shape)
    return eig_val, eigvec
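
# A NumPy sketch of the same smallest-eigenpair computation, assuming a
# symmetric Hessian reshaped to a square matrix.  np.linalg.eigh returns
# eigenvalues in ascending order, so index 0 is the smallest.
import numpy as np

eigvec_shape = (2, 3)
n = int(np.prod(eigvec_shape))
A = np.random.rand(n, n)
hes = ((A + A.T) / 2).reshape(eigvec_shape + eigvec_shape)  # symmetric

hes_mat = hes.reshape(n, n)
eigvals, eigvecs = np.linalg.eigh(hes_mat)
eig_val, eigvec = eigvals[0], eigvecs[:, 0].reshape(eigvec_shape)
print(eig_val, eigvec.shape)
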
def set_output(self, train=False):
    [X_H, X_M] = self.get_input(train=train)
    assert hasattr(self, 'input_frame')
    [cell, hid] = self.step(self.input_frame, X_M, X_H, train)
    self.output = [hid, cell]
    self.output_frame = T.dot(hid, self.lstmpar.W_output) + K.reshape(
        self.lstmpar.b_output, [1, self.output_dim])
def step(self, cell_previous, hid_previous, train):
    ingate = T.dot(hid_previous, self.lstmpar.W_hid_to_ingate) + K.reshape(
        self.lstmpar.b_ingate, [1, self.num_lstm])
    forgetgate = T.dot(hid_previous,
                       self.lstmpar.W_hid_to_forgetgate) + K.reshape(
                           self.lstmpar.b_forgetgate, [1, self.num_lstm])
    cell_input = T.dot(hid_previous, self.lstmpar.W_hid_to_cell) + K.reshape(
        self.lstmpar.b_cell, [1, self.num_lstm])
    outgate = T.dot(hid_previous, self.lstmpar.W_hid_to_outgate) + K.reshape(
        self.lstmpar.b_outgate, [1, self.num_lstm])

    # Compute peephole connections
    ingate += cell_previous * K.reshape(self.lstmpar.W_cell_to_ingate,
                                        [1, self.num_lstm])
    forgetgate += cell_previous * K.reshape(
        self.lstmpar.W_cell_to_forgetgate, [1, self.num_lstm])

    # Apply nonlinearities
    ingate = K.sigmoid(ingate)
    forgetgate = K.sigmoid(forgetgate)
    cell_input = K.tanh(cell_input)

    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input
    cell_bn = self.bn.set_output(cell, train=train)

    outgate += cell_bn * K.reshape(self.lstmpar.W_cell_to_outgate,
                                   [1, self.num_lstm])
    outgate = K.sigmoid(outgate)

    # Compute new hidden unit activation
    if self.use_th:
        hid = outgate * K.tanh(cell_bn)
    else:
        hid = outgate * cell_bn
    return [cell_bn, hid]
def test_broadcast(idtype, g):
    g = g.astype(idtype).to(F.ctx())
    gfeat = F.randn((g.batch_size, 3))

    # Test.0: broadcast_nodes
    g.ndata['h'] = dgl.broadcast_nodes(g, gfeat)
    subg = dgl.unbatch(g)
    for i, sg in enumerate(subg):
        assert F.allclose(
            sg.ndata['h'],
            F.repeat(F.reshape(gfeat[i], (1, 3)),
                     sg.number_of_nodes(),
                     dim=0))

    # Test.1: broadcast_edges
    g.edata['h'] = dgl.broadcast_edges(g, gfeat)
    subg = dgl.unbatch(g)
    for i, sg in enumerate(subg):
        assert F.allclose(
            sg.edata['h'],
            F.repeat(F.reshape(gfeat[i], (1, 3)),
                     sg.number_of_edges(),
                     dim=0))
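
# A NumPy sketch of what broadcast_nodes does per graph in a batch: each
# graph-level feature row is repeated once per node of its graph.  The node
# counts below are invented for illustration.
import numpy as np

gfeat = np.arange(6, dtype=float).reshape(2, 3)   # one row per graph
num_nodes = [4, 2]                                # nodes per graph
ndata = np.concatenate(
    [np.repeat(gfeat[i:i + 1], n, axis=0) for i, n in enumerate(num_nodes)])
print(ndata.shape)  # (6, 3): every node carries its graph's feature
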
def call(self, x, **kwargs):
    debug_print("call")
    # filters = K.zeros(shape=(N_filt, Filt_dim))

    min_freq = 50.0
    min_band = 50.0
    # Learned cutoff frequencies, constrained to stay positive.
    filt_beg_freq = K.abs(self.filt_b1) + min_freq / self.freq_scale
    filt_end_freq = filt_beg_freq + (K.abs(self.filt_band) +
                                     min_band / self.freq_scale)

    # Hamming window.
    n = np.linspace(0, self.Filt_dim, self.Filt_dim)
    window = 0.54 - 0.46 * K.cos(2 * math.pi * n / self.Filt_dim)
    window = K.cast(window, "float32")
    window = K.variable(window)
    t_right_linspace = np.linspace(1, (self.Filt_dim - 1) / 2,
                                   int((self.Filt_dim - 1) / 2))
    t_right = K.variable(t_right_linspace / self.fs)

    # Compute the filters: each band-pass is the difference of two windowed
    # sinc low-pass filters.  Note the loop must use the local tensors
    # computed above (filt_beg_freq, filt_end_freq, t_right, window), not
    # attributes on self, which are never assigned.
    output_list = []
    for i in range(self.N_filt):
        low_pass1 = (2 * filt_beg_freq[i] *
                     sinc(filt_beg_freq[i] * self.freq_scale, t_right))
        low_pass2 = (2 * filt_end_freq[i] *
                     sinc(filt_end_freq[i] * self.freq_scale, t_right))
        band_pass = low_pass2 - low_pass1
        band_pass = band_pass / K.max(band_pass)
        output_list.append(band_pass * window)
    filters = K.stack(output_list)  # (80, 251)
    filters = K.transpose(filters)  # (251, 80)
    # (251, 1, 80): TF expects (filter_width, in_channels, out_channels),
    # whereas PyTorch expects (out_channels, in_channels, filter_width).
    filters = K.reshape(filters, (self.Filt_dim, 1, self.N_filt))
    """Given an input tensor of shape [batch, in_width, in_channels] if
    data_format is "NWC", or [batch, in_channels, in_width] if data_format
    is "NCW", and a filter / kernel tensor of shape
    [filter_width, in_channels, out_channels], this op reshapes the
    arguments to pass them to conv2d to perform the equivalent convolution
    operation.  Internally, this op reshapes the input tensors and invokes
    tf.nn.conv2d.  For example, if data_format does not start with "NC", a
    tensor of shape [batch, in_width, in_channels] is reshaped to
    [batch, 1, in_width, in_channels], and the filter is reshaped to
    [1, filter_width, in_channels, out_channels].  The result is then
    reshaped back to [batch, out_width, out_channels] (where out_width is a
    function of the stride and padding as in conv2d) and returned to the
    caller.
    """

    # Do the convolution.
    debug_print("call")
    debug_print(" x", x)
    debug_print(" filters", filters)
    out = K.conv1d(x, kernel=filters)
    debug_print(" out", out)

    return out
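
# A NumPy sketch of the band-pass construction above, with fixed (not
# learned) cutoff frequencies: an ideal band-pass impulse response is the
# difference of two sinc low-passes, tapered by a Hamming window.  The
# values here are invented for illustration.
import numpy as np

filt_dim, fs = 251, 16000.0
t = np.arange(-(filt_dim // 2), filt_dim // 2 + 1) / fs  # symmetric taps
window = np.hamming(filt_dim)

def lowpass(fc, t):
    # Ideal low-pass impulse response 2*fc*sinc(2*fc*t);
    # np.sinc already includes the factor of pi.
    return 2 * fc * np.sinc(2 * fc * t)

f_lo, f_hi = 300.0, 3000.0                 # band edges in Hz
band_pass = lowpass(f_hi, t) - lowpass(f_lo, t)
band_pass = band_pass / band_pass.max() * window
print(band_pass.shape)  # (251,)
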
def set_output(self, X, train=False):
    input_shape = (self.batch_size, self.num_lstm)
    reduction_axes = list(range(len(input_shape)))
    del reduction_axes[self.axis]
    broadcast_shape = [1] * len(input_shape)
    broadcast_shape[self.axis] = input_shape[self.axis]

    if train:
        m = K.mean(X, axis=reduction_axes)
        broadcast_m = K.reshape(m, broadcast_shape)
        std = K.mean(K.square(X - broadcast_m) + self.epsilon,
                     axis=reduction_axes)
        std = K.sqrt(std)
        broadcast_std = K.reshape(std, broadcast_shape)
        # Exponential moving averages of the batch statistics.
        mean_update = self.momentum * self.running_mean + (
            1 - self.momentum) * m
        std_update = self.momentum * self.running_std + (
            1 - self.momentum) * std
        self.updates = [(self.running_mean, mean_update),
                        (self.running_std, std_update)]
        X_normed = (X - broadcast_m) / (broadcast_std + self.epsilon)
    else:
        broadcast_m = K.reshape(self.running_mean, broadcast_shape)
        broadcast_std = K.reshape(self.running_std, broadcast_shape)
        X_normed = (X - broadcast_m) / (broadcast_std + self.epsilon)

    out = K.reshape(self.gamma, broadcast_shape) * X_normed + K.reshape(
        self.beta, broadcast_shape)
    return out
def step(self, input_n, cell_previous, hid_previous, train):
    input_to_in = T.dot(input_n, self.lstmpar.W_in_to_ingate) + K.reshape(
        self.lstmpar.b_ingate, [1, self.num_lstm])
    input_to_forget = T.dot(input_n,
                            self.lstmpar.W_in_to_forgetgate) + K.reshape(
                                self.lstmpar.b_forgetgate,
                                [1, self.num_lstm])
    input_to_cell = T.dot(input_n, self.lstmpar.W_in_to_cell) + K.reshape(
        self.lstmpar.b_cell, [1, self.num_lstm])
    input_to_out = T.dot(input_n, self.lstmpar.W_in_to_outgate) + K.reshape(
        self.lstmpar.b_outgate, [1, self.num_lstm])

    ingate = input_to_in + T.dot(hid_previous, self.lstmpar.W_hid_to_ingate)
    forgetgate = input_to_forget + T.dot(hid_previous,
                                         self.lstmpar.W_hid_to_forgetgate)
    cell_input = input_to_cell + T.dot(hid_previous,
                                       self.lstmpar.W_hid_to_cell)
    outgate = input_to_out + T.dot(hid_previous,
                                   self.lstmpar.W_hid_to_outgate)

    # Compute peephole connections
    ingate += cell_previous * K.reshape(self.lstmpar.W_cell_to_ingate,
                                        [1, self.num_lstm])
    forgetgate += cell_previous * K.reshape(
        self.lstmpar.W_cell_to_forgetgate, [1, self.num_lstm])

    # Apply nonlinearities
    ingate = K.sigmoid(ingate)
    forgetgate = K.sigmoid(forgetgate)
    cell_input = K.tanh(cell_input)

    # Compute new cell value
    cell = forgetgate * cell_previous + ingate * cell_input
    cell_bn = self.bn.set_output(cell, train=train)

    outgate += cell_bn * K.reshape(self.lstmpar.W_cell_to_outgate,
                                   [1, self.num_lstm])
    outgate = K.sigmoid(outgate)

    # Compute new hidden unit activation
    hid = outgate * cell_bn
    return [cell_bn, hid]
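
# A NumPy sketch of a single peephole-LSTM step like the one above, minus
# the batch normalization on the cell state.  All shapes and parameter
# names here are invented for illustration.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x, c_prev, h_prev, W_x, W_h, b, w_peep):
    # W_x: (D, 4H), W_h: (H, 4H), b: (4H,), w_peep: (3, H)
    H = c_prev.shape[-1]
    gates = x @ W_x + h_prev @ W_h + b
    i = sigmoid(gates[:, 0 * H:1 * H] + c_prev * w_peep[0])  # peephole
    f = sigmoid(gates[:, 1 * H:2 * H] + c_prev * w_peep[1])  # peephole
    g = np.tanh(gates[:, 2 * H:3 * H])
    c = f * c_prev + i * g
    o = sigmoid(gates[:, 3 * H:4 * H] + c * w_peep[2])  # peeks at new cell
    return c, o * np.tanh(c)

B, D, H = 2, 5, 4
c, h = lstm_step(np.ones((B, D)), np.zeros((B, H)), np.zeros((B, H)),
                 np.random.rand(D, 4 * H), np.random.rand(H, 4 * H),
                 np.zeros(4 * H), np.random.rand(3, H))
print(c.shape, h.shape)  # (2, 4) (2, 4)
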
def _matvec(self, vec):
    dg = self.dmrg_graph
    in_data = T.reshape(vec, self.eigvec_shape)
    self.feed_dict.update({dg.vnodes[self.index]: in_data})

    if self.initial_matvec:
        if self.index == 0:
            reset_graph = True
            evicted_inputs = []
        else:
            reset_graph = False
            evicted_inputs = [
                dg.mps_inputs[self.index - 1], dg.vnodes[self.index]
            ]
        self.initial_matvec = False
    else:
        reset_graph = False
        evicted_inputs = [dg.vnodes[self.index]]

    out_data, = self.executor.run(feed_dict=self.feed_dict,
                                  reset_graph=reset_graph,
                                  evicted_inputs=evicted_inputs,
                                  out_nodes=[dg.hvps[self.index]])
    return out_data.ravel()
def batchnorm(X,
              batch_size,
              hidden_dim,
              gamma,
              beta,
              running_mean,
              running_std,
              epsilon=1e-10,
              axis=1,
              momentum=0.99,
              train=False):
    X = K.reshape(X, (batch_size, hidden_dim))
    input_shape = (batch_size, hidden_dim)  # (1, 512)
    reduction_axes = list(range(len(input_shape)))  # [0, 1]
    del reduction_axes[axis]  # [0]
    broadcast_shape = [1] * len(input_shape)  # [1, 1]
    broadcast_shape[axis] = input_shape[axis]  # [1, 512]

    if train:
        # Note: if the matrix is 1-d, mean() returns a scalar even with
        # axis=0.
        m = K.mean(X, axis=reduction_axes)  # (1, 512)
        broadcast_m = K.reshape(m, broadcast_shape)  # (1, 512)
        std = K.mean(K.square(X - broadcast_m) + epsilon,
                     axis=reduction_axes)
        std = K.sqrt(std)  # batch std, (1, 512)
        broadcast_std = K.reshape(std, broadcast_shape)  # (1, 512)
        mean_update = momentum * running_mean + (1 - momentum) * m  # (1, 512)
        std_update = momentum * running_std + (1 - momentum) * std  # (1, 512)
        X_normed = (X - broadcast_m) / (broadcast_std + epsilon)  # (1, 512)
    else:
        broadcast_m = K.reshape(running_mean, broadcast_shape)
        broadcast_std = K.reshape(running_std, broadcast_shape)
        X_normed = (X - broadcast_m) / (broadcast_std + epsilon)
        # At inference time the running statistics are left unchanged;
        # without these assignments the return below would raise a
        # NameError.
        mean_update, std_update = running_mean, running_std

    out = K.reshape(gamma, broadcast_shape) * X_normed + K.reshape(
        beta, broadcast_shape)  # (1, 512)
    return out, mean_update, std_update
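
# A NumPy sketch of the train-time path above: normalize with batch
# statistics and update the running statistics with an exponential moving
# average.  Shapes and the momentum value are illustrative only.
import numpy as np

def batchnorm_train(X, gamma, beta, run_mean, run_std,
                    momentum=0.99, eps=1e-10):
    m = X.mean(axis=0)
    std = np.sqrt(((X - m) ** 2 + eps).mean(axis=0))
    X_normed = (X - m) / (std + eps)
    new_mean = momentum * run_mean + (1 - momentum) * m
    new_std = momentum * run_std + (1 - momentum) * std
    return gamma * X_normed + beta, new_mean, new_std

X = np.random.randn(25, 512)
out, rm, rs = batchnorm_train(X, np.ones(512), np.zeros(512),
                              np.zeros(512), np.ones(512))
print(out.mean(), out.std())  # approximately 0 and 1
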
def dmrg_local_update(intermediate, eigvec, max_mps_rank):
    """
    Perform local update for DMRG.

    Parameters
    ----------
    intermediate: the input einsum node. Its inputs are two mps sites.
    eigvec: the eigenvector to get the low rank decomposition.
    max_mps_rank: maximum mps tensor rank.
    """
    # parse intermediate strings
    inputs = intermediate.inputs
    assert len(inputs) == 2
    # Here input names are formatted as A{i}.
    index_input_0 = int(inputs[0].name[1:])
    index_input_1 = int(inputs[1].name[1:])

    in_subs, out_subs, _ = _parse_einsum_input(
        (intermediate.einsum_subscripts, *intermediate.inputs))
    if index_input_0 > index_input_1:
        # right site appears first
        right_subs, left_subs = in_subs.split(',')
    else:
        left_subs, right_subs = in_subs.split(',')

    map_subs_indices = dict(
        zip(out_subs, list(range(len(intermediate.shape)))))

    contract_char, = list(set(left_subs) - set(out_subs))

    left_uncontract_chars = list(set(left_subs) - set(contract_char))
    right_uncontract_chars = list(set(right_subs) - set(contract_char))

    left_indices = [map_subs_indices[char] for char in left_uncontract_chars]
    right_indices = [
        map_subs_indices[char] for char in right_uncontract_chars
    ]

    left_uncontract_str = "".join(left_uncontract_chars)
    right_uncontract_str = "".join(right_uncontract_chars)

    #############################################################
    # svd decomposition to get updated sites
    eigvec_shape = intermediate.shape
    eigvec_mat = T.transpose(eigvec, left_indices + right_indices)
    eigvec_mat = T.reshape(
        eigvec_mat, (np.prod([eigvec_shape[i] for i in left_indices]), -1))

    U, s, VT = T.svd(eigvec_mat)
    rank = min([max_mps_rank, eigvec_mat.shape[0], eigvec_mat.shape[1]])
    U, s, VT = U[:, :rank], s[:rank], VT[:rank, :]
    VT = T.diag(s) @ VT

    U = T.reshape(U, [eigvec_shape[i] for i in left_indices] + [rank])
    VT = T.reshape(VT, [rank] + [eigvec_shape[i] for i in right_indices])

    left = T.einsum(f"{left_uncontract_str}{contract_char}->{left_subs}", U)
    right = T.einsum(f"{contract_char}{right_uncontract_str}->{right_subs}",
                     VT)
    return left, right
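
# A NumPy sketch of the truncated-SVD split at the heart of the local
# update: a two-site tensor is matricized, factored, truncated to the
# maximum bond rank, and reshaped back into two sites.  The shapes below
# are invented for illustration.
import numpy as np

d, r_l, r_r, max_rank = 4, 3, 3, 2
theta = np.random.rand(r_l, d, d, r_r)           # two-site "eigenvector"

mat = theta.reshape(r_l * d, d * r_r)            # group left/right legs
U, s, VT = np.linalg.svd(mat, full_matrices=False)
rank = min(max_rank, *mat.shape)
U, s, VT = U[:, :rank], s[:rank], VT[:rank, :]
left = U.reshape(r_l, d, rank)                   # new left site
right = (np.diag(s) @ VT).reshape(rank, d, r_r)  # new right site (absorbs s)
print(left.shape, right.shape)
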
def dmrg_shared_exec_iterative_solve(mpo_tensors,
                                     init_mps_tensors,
                                     max_mps_rank,
                                     num_iter=1,
                                     sequence='R'):
    """
    Perform DMRG iterations with shared execution and iterative solve.
    """
    if sequence != "R":
        raise NotImplementedError

    num = len(mpo_tensors)
    size = mpo_tensors[0].shape[1]
    mpo_ranks = [mpo_tensors[i].shape[0] for i in range(1, len(mpo_tensors))]

    mps_tensors = copy.deepcopy(init_mps_tensors)
    mps_ranks = [mps_tensors[i].shape[0] for i in range(1, len(mps_tensors))]

    dg = DmrgImplicitUpdateGraph.create(num, mpo_ranks, mps_ranks, size)
    for i, hvp in enumerate(dg.hvps):
        dg.hvps[i] = simplify(hvp)
        assert isinstance(hvp, ad.EinsumNode)
    dg.hvps = generate_sequential_optimal_tree(dg.hvps, dg.mps_inputs)

    executor_hvps = ad.Executor(dg.hvps)
    executor_intermediates = ad.Executor(dg.intermediates)

    # sequence is R
    for iter in range(num_iter):
        mps_tensors = gauge_transform_mps(mps_tensors, right=True)
        mps_ranks = [
            mps_tensors[i].shape[0] for i in range(1, len(mps_tensors))
        ]

        for i in range(num - 1):
            dg.update_graph(num, mpo_ranks, mps_ranks, size)

            feed_dict = dict(zip(dg.mpo_inputs, mpo_tensors))
            feed_dict.update(dict(zip(dg.mps_inputs, mps_tensors)))

            intermediate, = executor_intermediates.run(
                feed_dict=feed_dict, out_nodes=[dg.intermediates[i]])

            # Calculate the eigenvector using the implicit solver.
            # Note: This only supports NumPy datatype.
            # TODO: Add a general Lanczos solver that adapts to all the
            # backends.
            operator = DMRGLinearOperator(dg, executor_hvps, i, feed_dict)
            # Reference:
            # https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.eigsh.html
            eig_vals, eigvecs = spla.eigsh(operator,
                                           k=1,
                                           ncv=4,
                                           tol=1e-3,
                                           which='SA',
                                           v0=intermediate.ravel())
            eig_val, eigvec = eig_vals[0], eigvecs[:, 0]
            eigvec = T.reshape(eigvec, dg.intermediates[i].shape)

            # Update the two sites of mps
            mps_tensors[i], mps_tensors[i + 1] = dmrg_local_update(
                dg.intermediates[i], eigvec, max_mps_rank)

            # update the rank
            mps_ranks[i] = mps_tensors[i + 1].shape[0]
            print(f'At site {i}, the smallest eigenvalue is: {eig_val}')

        print(f'At iteration {iter} the smallest eigenvalue is: {eig_val}')
    return mps_tensors, eig_val
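
# A SciPy sketch of the matrix-free eigensolve used above: eigsh only needs
# a matrix-vector product, so a LinearOperator stands in for an explicitly
# formed matrix.  The dense symmetric matrix here is a toy stand-in for the
# DMRG Hessian-vector product.
import numpy as np
import scipy.sparse.linalg as spla

n = 64
A = np.random.rand(n, n)
A = (A + A.T) / 2                    # symmetric stand-in operator

op = spla.LinearOperator((n, n), matvec=lambda v: A @ v, dtype=A.dtype)
eig_vals, eigvecs = spla.eigsh(op, k=1, which='SA')  # smallest algebraic
print(eig_vals[0], np.linalg.eigvalsh(A)[0])         # should agree
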
def set_output(self, train=False):
    [X_H, X_M] = self.get_input(train=train)
    [cell, hid] = self.step(X_M, X_H, train)
    self.output = [hid, cell]
    self.output_frame = T.dot(hid, self.lstmpar.W_output) + K.reshape(
        self.lstmpar.b_output, [1, self.dim_frame])
def step(self, cell_p, hid_p, mean_p, std_p):
    embed = T.reshape(T.dot(self.attribute[:, 0], self.params['W_ctx_3']),
                      [self.batch_size, 10])
    hidP = T.dot(hid_p, self.params['W_ctx_2'])  # (25, 10)
    embedd = T.repeat(self.params['W_ctx_1'], self.batch_size, 0) * T.tanh(
        embed + hidP +
        T.repeat(self.params['b_ctx'], self.batch_size, 0))  # (25, 10)
    alpha_base = T.reshape(T.exp(embedd),
                           [self.batch_size, 10, 1])  # (25, 10, 1)
    alpha_base = alpha_base / alpha_base.sum()
    att = T.reshape(self.attribute[:, 0],
                    [self.batch_size, 10, self.att_frame])
    ctx = (alpha_base * att / T.reshape(
        alpha_base.sum(axis=1), [self.batch_size, 1, 1])).sum(axis=1)  # (25, 300)
    ctx = T.reshape(ctx, [self.batch_size, self.att_frame])
    # ctx += T.dot(hid_p, self.params['W_att']) + T.repeat(self.params['b_att'], self.batch_size, 0)

    input_to = T.dot(ctx, self.params['W_in']) + T.repeat(
        self.params['b'], self.batch_size, 0)  # (25, 2048)
    # input_to_i = T.dot(ctx, self.params['W_in_i']) + T.repeat(self.params['b_i'], self.batch_size, 0)
    # input_to_f = T.dot(ctx, self.params['W_in_f']) + T.repeat(self.params['b_f'], self.batch_size, 0)
    # input_to_o = T.dot(ctx, self.params['W_in_o']) + T.repeat(self.params['b_o'], self.batch_size, 0)
    # input_to_c = T.dot(ctx, self.params['W_in_c']) + T.repeat(self.params['b_c'], self.batch_size, 0)

    gate = input_to + T.dot(hid_p, self.params['W_hid'])
    # gate_i = input_to_i + T.dot(hid_p, self.params['W_hid_i'])
    # gate_f = input_to_f + T.dot(hid_p, self.params['W_hid_f'])
    # gate_o = input_to_o + T.dot(hid_p, self.params['W_hid_o'])
    # gate_c = input_to_c + T.dot(hid_p, self.params['W_hid_c'])

    # Apply nonlinearities
    ingate = T.nnet.sigmoid(
        self._slice(gate, 0, self.hidden_dim) +
        cell_p * T.repeat(self.params['W_cell'][0], self.batch_size, 0))
    forgetgate = T.nnet.sigmoid(
        self._slice(gate, 1, self.hidden_dim) +
        cell_p * T.repeat(self.params['W_cell'][1], self.batch_size, 0))
    cell_input = T.tanh(self._slice(gate, 2, self.hidden_dim))

    # Compute new cell value
    cell = forgetgate * cell_p + ingate * cell_input

    # BatchNormalization
    input_shape = (self.batch_size, self.hidden_dim)  # (1, 512)
    cell = K.reshape(cell, input_shape)
    reduction_axes = list(range(len(input_shape)))  # [0, 1]
    del reduction_axes[self.axis_bn]  # [0]
    broadcast_shape = [1] * len(input_shape)  # [1, 1]
    broadcast_shape[self.axis_bn] = input_shape[self.axis_bn]  # [1, 512]
    # Note: if the matrix is 1-d, mean() returns a scalar even with axis=0.
    m = K.mean(cell, axis=0)
    broadcast_m = K.reshape(m, [1, self.hidden_dim])  # (1, 512)
    std = K.mean(K.square(cell - broadcast_m) + self.epsilon,
                 axis=reduction_axes)
    std = K.sqrt(std)  # batch std, (1, 512)
    broadcast_std = K.reshape(std, broadcast_shape)  # (1, 512)
    mean_update = self.momentum * mean_p + (1 - self.momentum) * m  # (1, 512)
    std_update = self.momentum * std_p + (1 - self.momentum) * std  # (1, 512)
    cell_normed = (cell - broadcast_m) / (broadcast_std + self.epsilon)  # (1, 512)
    cell_bn = K.reshape(
        self.params['gamma'], broadcast_shape) * cell_normed + K.reshape(
            self.params['beta'], broadcast_shape)  # (1, 512)
    # cell_bn, mean, std = batchnorm(cell, self.batch_size, self.hidden_dim,
    #                                self.params['gamma'], self.params['beta'],
    #                                mean_p, std_p, train=True)

    outgate = T.nnet.sigmoid(
        self._slice(gate, 3, self.hidden_dim) +
        cell_bn * T.repeat(self.params['W_cell'][2], self.batch_size, 0))

    # Compute new hidden unit activation
    hid = outgate * T.tanh(cell_bn)
    return T.reshape(cell_bn, [self.batch_size, self.hidden_dim]), T.reshape(
        hid, [self.batch_size, self.hidden_dim]), mean_update, std_update