def test_bprop_vector(self):
    # Runs the same RowSlicingBlock -> SoftmaxCeBlock forward/backward pass
    # with quagga.processor_type set to 'gpu' and then to 'cpu', using a
    # single column of row indexes, and checks with np.allclose that W plus
    # its backward matrix is the same for both.
    outcomes = []
    for _ in xrange(self.N):
        # NOTE: the order of the draws from self.rng is kept exactly as is,
        # since it decides which random values every quantity receives.
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        column_shape = (batch_size, 1)
        row_idxs = self.rng.randint(embd_dim, size=column_shape)
        row_idxs = row_idxs.astype(np.int32)
        true_labels = self.rng.randint(output_dim, size=column_shape)
        true_labels = true_labels.astype(np.int32)
        device_id = 0

        updated_W = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type

            # Wrap the host arrays for the currently selected processor.
            idxs_connector = Connector(Matrix.from_npa(row_idxs))
            labels_connector = Connector(Matrix.from_npa(true_labels))
            W_connector = Connector(Matrix.from_npa(W), device_id)
            slicing_block = RowSlicingBlock(W_connector, idxs_connector)
            loss_block = SoftmaxCeBlock(slicing_block.output,
                                        labels_connector)

            # Forward pass.
            W_connector.fprop()
            idxs_connector.fprop()
            slicing_block.fprop()
            loss_block.fprop()

            # Backward pass, in reverse order of the forward one.
            loss_block.bprop()
            slicing_block.bprop()

            # Add the accumulated backward matrix to W and fetch the result.
            W_connector.add(Context(), W_connector.backward_matrix)
            updated_W[processor_type] = W_connector.to_host()

        is_close = np.allclose(updated_W['gpu'], updated_W['cpu'])
        outcomes.append(is_close)

    # Every iteration has to produce matching gpu and cpu results.
    self.assertEqual(sum(outcomes), len(outcomes))
def test_theano_bprop_matrix(self):
    # Compares the W obtained after a quagga ('gpu') backward pass through
    # RowSlicingBlock and a SequencerBlock of SoftmaxCeBlock with the W
    # produced by the theano reference layers (RowSlicingLayer and
    # SequentialSoftmaxLayer), using np.allclose with atol=1e-5.
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        if i == 0:
            # The first iteration always uses the full sequence length.
            sequence_len = max_input_sequence_len
        else:
            # random_integers(low, high) requires low <= high (assuming
            # self.rng is a numpy RandomState). The maximum length can be
            # drawn as 1, in which case a fixed lower bound of 2 raises
            # ValueError, so the bound is clamped to the maximum. For any
            # maximum >= 2 the draw is exactly the same as before.
            min_sequence_len = min(2, max_input_sequence_len)
            sequence_len = self.rng.random_integers(min_sequence_len,
                                                    max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(
            embd_dim, size=(batch_size, max_input_sequence_len))
        row_idxs = row_idxs.astype(np.int32)
        # One (batch_size, 1) column of labels per possible time step.
        true_labels = [
            self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
            for _ in xrange(max_input_sequence_len)]
        device_id = 0

        # quagga side.
        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qtrue_labels = List(
            [Connector(Matrix.from_npa(e)) for e in true_labels],
            qrow_idxs.ncols)
        qW = Connector(Matrix.from_npa(W), device_id)
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        seq_sce_block = SequencerBlock(
            block_class=SoftmaxCeBlock,
            params=[],
            sequences=[row_slicing_block.output, qtrue_labels])

        qW.fprop()
        # Limit the indexes to the sequence length chosen for this
        # iteration before propagating them forward.
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        seq_sce_block.fprop()
        seq_sce_block.bprop()
        row_slicing_block.bprop()
        # Add the accumulated backward matrix to W.
        qW.add(Context(), qW.backward_matrix)

        # theano side: build a function that adds dL/dW to the layer's W.
        th_row_idxs = T.imatrix()
        th_true_labels = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, row_slicing_layer.W)
        fun = theano.function(
            [th_row_idxs, th_true_labels],
            updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
        # NOTE(review): theano gets the full row_idxs while the labels are
        # cut to sequence_len -- presumably get_loss only uses the first
        # sequence_len steps; confirm against SequentialSoftmaxLayer.
        fun(row_idxs, np.hstack(true_labels[:sequence_len]))

        r.append(np.allclose(qW.to_host(),
                             row_slicing_layer.W.get_value(),
                             atol=1e-5))

    # Every iteration has to produce matching quagga and theano results.
    self.assertEqual(sum(r), len(r))
def test_theano_bprop_vector(self):
    # Compares the W obtained after a quagga ('gpu') backward pass through
    # RowSlicingBlock and SoftmaxCeBlock with the W produced by the theano
    # reference layers (RowSlicingLayer and SoftmaxLayer) for a single
    # column of row indexes, using np.allclose.
    outcomes = []
    for _ in xrange(self.N):
        # NOTE: the order of the draws from self.rng is kept exactly as is,
        # since it decides which random values every quantity receives.
        embd_dim = self.rng.random_integers(10000)
        batch_size, output_dim = self.rng.random_integers(2000, size=2)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        column_shape = (batch_size, 1)
        row_idxs = self.rng.randint(embd_dim, size=column_shape)
        row_idxs = row_idxs.astype(np.int32)
        true_labels = self.rng.randint(output_dim, size=column_shape)
        true_labels = true_labels.astype(np.int32)
        device_id = 0

        # quagga side.
        quagga.processor_type = 'gpu'
        idxs_connector = Connector(Matrix.from_npa(row_idxs))
        W_connector = Connector(Matrix.from_npa(W), device_id)
        labels_connector = Connector(Matrix.from_npa(true_labels))
        slicing_block = RowSlicingBlock(W_connector, idxs_connector)
        loss_block = SoftmaxCeBlock(slicing_block.output, labels_connector)

        # Forward pass.
        labels_connector.fprop()
        W_connector.fprop()
        idxs_connector.fprop()
        slicing_block.fprop()
        loss_block.fprop()

        # Backward pass, in reverse order of the forward one.
        loss_block.bprop()
        slicing_block.bprop()

        # Add the accumulated backward matrix to W.
        W_connector.add(Context(), W_connector.backward_matrix)

        # theano side: build a function that adds dL/dW to the layer's W.
        th_row_idxs = T.ivector()
        th_true_labels = T.ivector()
        row_slicing_layer = RowSlicingLayer(W)
        th_output = row_slicing_layer.get_output_expr(th_row_idxs)
        th_loss = SoftmaxLayer.get_loss(th_output, th_true_labels)
        th_grad = T.grad(th_loss, row_slicing_layer.W)
        update_W = theano.function(
            [th_row_idxs, th_true_labels],
            updates=[(row_slicing_layer.W, row_slicing_layer.W + th_grad)])
        # The theano inputs are vectors, so the only column is passed.
        update_W(row_idxs[:, 0], true_labels[:, 0])

        is_close = np.allclose(W_connector.to_host(),
                               row_slicing_layer.W.get_value())
        outcomes.append(is_close)

    # Every iteration has to produce matching quagga and theano results.
    self.assertEqual(sum(outcomes), len(outcomes))
def test_bprop_matrix(self):
    # Runs the same RowSlicingBlock -> SequencerBlock(SoftmaxCeBlock)
    # forward/backward pass with quagga.processor_type set to 'gpu' and then
    # to 'cpu', using a matrix of row indexes, and checks with np.allclose
    # that W plus its backward matrix is the same for both.
    outcomes = []
    for i in xrange(self.N):
        # NOTE: the order of the draws from self.rng is kept exactly as is,
        # since it decides which random values every quantity receives.
        max_input_sequence_len = self.rng.random_integers(500)
        if i == 0:
            # The first iteration always uses the full sequence length.
            sequence_len = max_input_sequence_len
        else:
            sequence_len = self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(
            embd_dim, size=(batch_size, max_input_sequence_len))
        row_idxs = row_idxs.astype(np.int32)
        # One (batch_size, 1) column of labels per possible time step.
        true_labels = [
            self.rng.randint(output_dim, size=(batch_size, 1)).astype(np.int32)
            for _ in xrange(max_input_sequence_len)]
        device_id = 0

        updated_W = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type

            # Wrap the host arrays for the currently selected processor.
            idxs_connector = Connector(Matrix.from_npa(row_idxs))
            label_connectors = [Connector(Matrix.from_npa(labels))
                                for labels in true_labels]
            labels_list = List(label_connectors, idxs_connector.ncols)
            W_connector = Connector(Matrix.from_npa(W), device_id)
            slicing_block = RowSlicingBlock(W_connector, idxs_connector)
            seq_loss_block = SequencerBlock(
                block_class=SoftmaxCeBlock,
                params=[],
                sequences=[slicing_block.output, labels_list])

            # Forward pass.
            W_connector.fprop()
            # Limit the indexes to the sequence length chosen for this
            # iteration before propagating them forward.
            idxs_connector.ncols = sequence_len
            idxs_connector.fprop()
            slicing_block.fprop()
            seq_loss_block.fprop()

            # Backward pass, in reverse order of the forward one.
            seq_loss_block.bprop()
            slicing_block.bprop()

            # Add the accumulated backward matrix to W and fetch the result.
            W_connector.add(Context(), W_connector.backward_matrix)
            updated_W[processor_type] = W_connector.to_host()

        is_close = np.allclose(updated_W['gpu'], updated_W['cpu'])
        outcomes.append(is_close)

    # Every iteration has to produce matching gpu and cpu results.
    self.assertEqual(sum(outcomes), len(outcomes))
class DotBlock(object):
    """
    Computes the matrix product of the input ``x`` and the weights ``W``
    (``x * W``) and, when a bias ``b`` is supplied, adds it to the result.

    Parameters
    ----------
    W :
        Weight matrix. It has to expose ``bpropagable`` and
        ``register_usage`` (presumably a ``Connector`` wrapping a GpuMatrix
        or a CpuMatrix -- confirm against the callers).
    b :
        Bias matrix (one of its dimensions equals 1, so it can be viewed as
        a vector). A falsy value, e.g. ``None``, disables the bias. Same
        interface as ``W``.
    x :
        Block's input. Same interface as ``W``.
    device_id : int
        Defines the device's id on which the computation will take place.

    Notes
    -----
    The gradient attributes ``dL_dW``, ``dL_db`` and ``dL_dx`` are created
    only for the arguments that are ``bpropagable``. ``fprop`` and ``bprop``
    test for the presence of these attributes (and of ``b``) with
    ``hasattr``.
    """

    def __init__(self, W, b, x, device_id=None):
        self.f_context = Context(device_id)
        # Use the device the forward context resolved to from here on.
        device_id = self.f_context.device_id

        # Weights.
        if not W.bpropagable:
            self.W = W.register_usage(device_id)
        else:
            self.W, self.dL_dW = W.register_usage(device_id, device_id)

        # Optional bias.
        if b and b.bpropagable:
            self.b, self.dL_db = b.register_usage(device_id, device_id)
            # Column of ones that bprop uses to compute the bias gradient
            # as ones.T * dL_doutput.
            self.ones = Matrix.empty(x.nrows, 1, self.b.dtype, device_id)
            self.ones.sync_fill(1.0)
        elif b:
            self.b = b.register_usage(device_id)

        # Input.
        if not x.bpropagable:
            self.x = x.register_usage(device_id)
        else:
            self.x, self.dL_dx = x.register_usage(device_id, device_id)

        result = Matrix.empty(x.nrows, self.W.ncols, device_id=device_id)

        # The block takes part in learning as soon as at least one of its
        # arguments receives a gradient.
        gradient_names = ('dL_dW', 'dL_db', 'dL_dx')
        self.learning = any(hasattr(self, name) for name in gradient_names)
        if not self.learning:
            self.output = Connector(result)
        else:
            self.b_context = Context(device_id)
            self.output = Connector(result, device_id)

    def fprop(self):
        """
        Writes ``x * W`` into the output, adds ``b`` when the block has a
        bias, and then calls ``fprop`` on the output.
        """
        context = self.f_context
        self.output.assign_dot(context, self.x, self.W)
        if hasattr(self, 'b'):
            self.output.add(context, self.b)
        self.output.fprop()

    def bprop(self):
        """
        Adds the gradient contributions of this block to every gradient
        matrix it has. Does nothing when the block is not learning.
        """
        if not self.learning:
            return

        output_grad = self.output.backward_matrix
        context = self.b_context

        # dL/dW = x.T * dL_doutput
        if hasattr(self, 'dL_dW'):
            self.dL_dW.add_dot(context, self.x, output_grad, 'T')

        # TODO(sergii): replace this modification with reduction kernel
        # along axis=0
        # dL/db = 1.T * dL_doutput
        if hasattr(self, 'dL_db'):
            self.dL_db.add_dot(context, self.ones, output_grad, 'T')

        # dL/dx = dL_doutput * W.T
        if hasattr(self, 'dL_dx'):
            self.dL_dx.add_dot(context, output_grad, self.W, 'N', 'T')