def __init__(self, train_data, valid_data, batch_size, word_dropout_prob,
             device_id):
    self.train_data = HomogeneousDataIterator(train_data, batch_size,
                                              randomize=True, infinite=True)
    self.valid_data = HomogeneousDataIterator(valid_data, batch_size)
    self.train_data_iterator = iter(self.train_data)
    self.valid_data_iterator = iter(self.valid_data)
    self.word_keep_prob = 1.0 - word_dropout_prob
    self.rnd = RandomState(47571)
    # `word_to_idx` is not a parameter here; it is assumed to be a
    # module-level vocabulary mapping
    self.unk_idx = word_to_idx['<UNK>']
    self.context = Context(device_id)
    c = Counter([len(line) for line in chain(train_data, valid_data)])
    print c.most_common()
    max_len = max([len(line) for line in chain(train_data, valid_data)])
    self.enc_x = Connector(Matrix.empty(batch_size, max_len, 'int',
                                        device_id))
    self.enc_lengths = Matrix.empty(self.enc_x.nrows, 1, 'int', device_id)
    self._enc_mask = Matrix.empty(self.enc_x.nrows, self.enc_x.ncols,
                                  'float', device_id)
    self.enc_mask = List([Connector(self._enc_mask[:, i])
                          for i in xrange(max_len)],
                         self.enc_x.ncols)
    self.dec_x = Connector(Matrix.empty(batch_size, max_len + 1, 'int',
                                        device_id))
    self._dec_y = Matrix.empty(batch_size, max_len + 1, 'int', device_id)
    self.dec_y = List([Connector(self._dec_y[:, i])
                       for i in xrange(max_len + 1)],
                      self._dec_y.ncols)
    self.dec_lengths = Matrix.empty(self.dec_x.nrows, 1, 'int', device_id)
    self._dec_mask = Matrix.empty(self.dec_x.nrows, self.dec_x.ncols,
                                  'float', device_id)
    self.dec_mask = List([Connector(self._dec_mask[:, i])
                          for i in xrange(max_len + 1)],
                         self.dec_x.ncols)
    self.blocking_contexts = None
    self.training_mode = True
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        state = self.rng.get_state()

        quagga.processor_type = 'gpu'
        x_gpu = List([Connector(Matrix.from_npa(e)) for e in x])
        smean_pooling_block_gpu = SequentialMeanPoolingBlock(x_gpu)
        x_gpu.set_length(sequence_len)
        smean_pooling_block_gpu.fprop()
        output_gpu = smean_pooling_block_gpu.output.to_host()

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        x_cpu = List([Connector(Matrix.from_npa(e)) for e in x])
        smean_pooling_block_cpu = SequentialMeanPoolingBlock(x_cpu)
        x_cpu.set_length(sequence_len)
        smean_pooling_block_cpu.fprop()
        output_cpu = smean_pooling_block_cpu.output.to_host()

        r.append(np.allclose(output_gpu, output_cpu))
    self.assertEqual(sum(r), self.N)
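# A note on the pattern used throughout these backend tests: the NumPy RNG
# state is checkpointed with get_state() and rewound with set_state() so that
# any randomness drawn *after* the checkpoint (e.g. the injected upstream
# gradients in the bprop tests below) is identical for the 'gpu' and 'cpu'
# passes. A minimal, quagga-free sketch of the idea; `fprop_bprop` is a
# hypothetical stand-in for a backend-dependent forward/backward pass:
import numpy as np

def backends_agree(fprop_bprop, n_trials=10, seed=42):
    rng = np.random.RandomState(seed)
    ok = []
    for _ in xrange(n_trials):
        x = rng.rand(8, 16).astype(np.float32)
        state = rng.get_state()                 # checkpoint the RNG
        g = rng.rand(8, 16).astype(np.float32)  # upstream gradient (pass 1)
        dx_a = fprop_bprop(x, g, backend='gpu')
        rng.set_state(state)                    # rewind, so pass 2 draws
        g = rng.rand(8, 16).astype(np.float32)  # the very same gradient
        dx_b = fprop_bprop(x, g, backend='cpu')
        ok.append(np.allclose(dx_a, dx_b))
    return all(ok)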
def __init__(self, data, char_to_idx, batch_size, x_device_id, y_device_id):
    self.data = HomogeneousDataIterator(data, char_to_idx, batch_size,
                                        True, True)
    self.data_iterator = iter(self.data)
    self.x_context = Context(x_device_id)
    self.y_context = Context(y_device_id)
    max_len = max(len(sub_line) for sub_line in data)
    print max_len
    self.x = Connector(Matrix.empty(batch_size, max_len - 1, 'int',
                                    x_device_id))
    self._y = Matrix.empty(batch_size, max_len - 1, 'int', y_device_id)
    self.y = List([Connector(self._y[:, i]) for i in xrange(max_len - 1)],
                  self.x.ncols)
    self.lengths = Matrix.empty(self.x.nrows, 1, 'int', x_device_id)
    self._mask = Matrix.empty(self.x.nrows, self.x.ncols, 'float',
                              x_device_id)
    # `_mask` has max_len - 1 columns (it matches `x`), so the slices must
    # stop there; iterating up to max_len would run one column past the end
    self.mask = List([Connector(self._mask[:, i])
                      for i in xrange(max_len - 1)],
                     self.x.ncols)
    self.blocking_contexts = None
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        state = self.rng.get_state()

        quagga.processor_type = 'gpu'
        context = Context()
        x_gpu = List([Connector(Matrix.from_npa(e), context, context)
                      for e in x])
        smean_pooling_block_gpu = SequentialMeanPoolingBlock(x_gpu)
        x_gpu.set_length(sequence_len)
        _, dL_doutput = smean_pooling_block_gpu.output.register_usage(
            context, context)
        smean_pooling_block_gpu.fprop()
        random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
        Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
        smean_pooling_block_gpu.bprop()
        dL_dmatrices_gpu = [e.backward_matrix.to_host() for e in x_gpu]

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        context = Context()
        x_cpu = List([Connector(Matrix.from_npa(e), context, context)
                      for e in x])
        smean_pooling_block_cpu = SequentialMeanPoolingBlock(x_cpu)
        x_cpu.set_length(sequence_len)
        _, dL_doutput = smean_pooling_block_cpu.output.register_usage(
            context, context)
        smean_pooling_block_cpu.fprop()
        random_matrix = self.rng.rand(dL_doutput.nrows, dL_doutput.ncols)
        Matrix.from_npa(random_matrix, 'float').copy_to(context, dL_doutput)
        smean_pooling_block_cpu.bprop()
        dL_dmatrices_cpu = [e.backward_matrix.to_host() for e in x_cpu]

        for dL_dmatrix_gpu, dL_dmatrix_cpu in izip(dL_dmatrices_gpu,
                                                   dL_dmatrices_cpu):
            if not np.allclose(dL_dmatrix_gpu, dL_dmatrix_cpu):
                r.append(False)
                break
        else:
            r.append(True)
    self.assertEqual(sum(r), self.N)
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim_x, dim_y = self.rng.random_integers(1500, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        state = self.rng.get_state()

        quagga.processor_type = 'gpu'
        x_gpu = List([Connector(Matrix.from_npa(e)) for e in x])
        y_gpu = List([Connector(Matrix.from_npa(e)) for e in y])
        seq_hstack_block_gpu = SequentialHorizontalStackBlock(x_gpu, y_gpu)
        x_gpu.length = sequence_len
        y_gpu.length = sequence_len
        seq_hstack_block_gpu.fprop()
        output_sequence_gpu = seq_hstack_block_gpu.output.to_host()

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        x_cpu = List([Connector(Matrix.from_npa(e)) for e in x])
        y_cpu = List([Connector(Matrix.from_npa(e)) for e in y])
        seq_hstack_block_cpu = SequentialHorizontalStackBlock(x_cpu, y_cpu)
        x_cpu.length = sequence_len
        y_cpu.length = sequence_len
        seq_hstack_block_cpu.fprop()
        output_sequence_cpu = seq_hstack_block_cpu.output.to_host()

        for out_gpu, out_cpu in izip(output_sequence_gpu,
                                     output_sequence_cpu):
            if not np.allclose(out_gpu, out_cpu):
                r.append(False)
                break
        else:
            r.append(True)
    self.assertEqual(sum(r), self.N)
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        # pin the gpu passes of this test to device 1
        from quagga.cuda import cudart
        cudart.cuda_set_device(1)

        qoutput = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qW = Connector(Matrix.from_npa(W))
                    qb = Connector(Matrix.from_npa(b)) if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    qoutput[processor_type] = seq_dot_block.output.to_host()
                for output_gpu, output_cpu in izip(qoutput['gpu'],
                                                   qoutput['cpu']):
                    if not np.allclose(output_gpu, output_cpu, atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def __init__(self, x_sequence, y_sequence, device_id=None):
    """
    TODO
    """
    # TODO add during hsplit otherwise wrong accumulation of gradients
    if all(e.bpropagable for e in chain(x_sequence, y_sequence)):
        learning = True
    elif all(not e.bpropagable for e in chain(x_sequence, y_sequence)):
        learning = False
    else:
        raise ValueError('All elements should be bpropagable or '
                         'non-bpropagable. Mixed state is not allowed!')
    x_ncols = x_sequence[0].ncols
    y_ncols = y_sequence[0].ncols
    dtype = x_sequence[0].dtype
    for x, y in izip(x_sequence, y_sequence):
        if x.ncols != x_ncols or y.ncols != y_ncols:
            raise ValueError('All matrices in the sequence should have '
                             'the same number of columns!')
        if x.nrows != y.nrows:
            raise ValueError("Can't stack matrices in sequence with "
                             "different number of rows!")
        if x.dtype != dtype or y.dtype != dtype:
            raise ValueError("Can't stack matrices with different dtypes!")
    self.context = Context(device_id)
    device_id = self.context.device_id
    if learning:
        self.x_sequence, self.dL_dx_sequences = \
            izip(*x_sequence.register_usage(device_id, device_id))
        self.y_sequence, self.dL_dy_sequences = \
            izip(*y_sequence.register_usage(device_id, device_id))
        self.dL_dx_sequences = List(self.dL_dx_sequences, x_sequence.length)
        self.dL_dy_sequences = List(self.dL_dy_sequences, y_sequence.length)
    else:
        self.x_sequence = x_sequence.register_usage(device_id)
        self.y_sequence = y_sequence.register_usage(device_id)
    self.x_sequence = List(self.x_sequence, x_sequence.length)
    self.y_sequence = List(self.y_sequence, y_sequence.length)
    output = []
    for _ in xrange(x_sequence.length):
        matrix = Matrix.empty(x_sequence[0].nrows, x_ncols + y_ncols,
                              dtype, device_id)
        output.append(Connector(matrix, device_id))
    self.output = List(output, x_sequence.length)
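# For reference, the forward/backward semantics of the block above reduce to
# a per-timestep horizontal concatenation and the matching column split. A
# NumPy sketch (lists of (batch_size, ncols) arrays in and out), not the
# block's actual device code:
import numpy as np

def sequential_hstack_fprop(x_sequence, y_sequence):
    return [np.hstack((x_t, y_t))
            for x_t, y_t in zip(x_sequence, y_sequence)]

def sequential_hstack_bprop(dL_doutput_sequence, x_ncols):
    # the incoming gradient splits back into the x and y column blocks
    dL_dx = [g[:, :x_ncols] for g in dL_doutput_sequence]
    dL_dy = [g[:, x_ncols:] for g in dL_doutput_sequence]
    return dL_dx, dL_dy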
def test_theano_fprop(self):
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)

        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qW = Connector(Matrix.from_npa(W))
                qb = Connector(Matrix.from_npa(b)) if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                qoutput = seq_dot_block.output.to_host()

                seq_dot_layer = SequentialDotLayer(W,
                                                   b if with_bias else None,
                                                   reverse)
                th_x = T.ftensor3()
                get_th_output = theano.function(
                    [th_x], seq_dot_layer.get_output_expr(th_x))
                th_output = get_th_output(np.dstack(x[:sequence_len]))

                for t in xrange(th_output.shape[0]):
                    if not np.allclose(qoutput[t], th_output[t]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_bprop_matrix(self):
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(2, max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(
            embd_dim,
            size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim,
                                        size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        device_id = 0

        quagga.processor_type = 'gpu'
        qrow_idxs = Connector(Matrix.from_npa(row_idxs))
        qtrue_labels = List([Connector(Matrix.from_npa(e))
                             for e in true_labels], qrow_idxs.ncols)
        qW = Connector(Matrix.from_npa(W), device_id)
        row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
        seq_sce_block = SequencerBlock(
            block_class=SoftmaxCeBlock,
            params=[],
            sequences=[row_slicing_block.output, qtrue_labels])
        qW.fprop()
        qrow_idxs.ncols = sequence_len
        qrow_idxs.fprop()
        row_slicing_block.fprop()
        seq_sce_block.fprop()
        seq_sce_block.bprop()
        row_slicing_block.bprop()
        qW.add(Context(), qW.backward_matrix)

        th_row_idxs = T.imatrix()
        th_true_labels = T.imatrix()
        row_slicing_layer = RowSlicingLayer(W)
        toutput = row_slicing_layer.get_output_expr(th_row_idxs)
        loss = SequentialSoftmaxLayer.get_loss(toutput, th_true_labels)
        dL_dW = T.grad(loss, row_slicing_layer.W)
        fun = theano.function(
            [th_row_idxs, th_true_labels],
            updates=[(row_slicing_layer.W, row_slicing_layer.W + dL_dW)])
        fun(row_idxs, np.hstack(true_labels[:sequence_len]))

        r.append(np.allclose(qW.to_host(),
                             row_slicing_layer.W.get_value(),
                             atol=1e-5))
    self.assertEqual(sum(r), len(r))
def __init__(self, block_class, params, sequences, output_names=None,
             prev_names=None, paddings=None, reverse=False, device_id=None):
    context = Context(device_id)
    device_id = context.device_id
    self.reverse = reverse
    self.prev_names = prev_names
    if prev_names and reverse:
        self.temp_prev = []
        self.dL_dtemp_prev = []
        self.k = None
    self._length = sequences[0]._length
    self.blocks = []
    output_names = output_names if output_names else []
    outputs = [[] for _ in output_names]
    for k in xrange(self._length):
        k = self._length.value - 1 - k if reverse else k
        args = params + [s[k] for s in sequences]
        if prev_names:
            if k == (self._length.value - 1 if reverse else 0):
                prevs = paddings
            else:
                prev_block = self.blocks[-1]
                prevs = [getattr(prev_block, name) for name in prev_names]
            args += prevs
        try:
            self.blocks.append(block_class(*args, device_id=device_id))
        except TypeError:
            self.blocks.append(block_class(*args))
        for i, output_name in enumerate(output_names):
            outputs[i].append(getattr(self.blocks[-1], output_name))
    for output_name, output in izip(output_names, outputs):
        output = output[::-1] if reverse else output
        output = List(output, self._length)
        setattr(self, output_name, output)
    if hasattr(self.blocks[0], 'calculate_loss') and \
            hasattr(self.blocks[0], 'loss'):
        def calculate_loss(context):
            context.wait(*[self.blocks[i].context
                           for i in xrange(self._length)])
            for i in xrange(self._length):
                self.blocks[i].calculate_loss(context)
        self.calculate_loss = calculate_loss
    self.context = context
    SequencerBlock.loss = property(
        lambda self: [self.blocks[i].loss
                      for i in xrange(self._length)])
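# A minimal pure-Python sketch of the unrolling the constructor above
# performs (assuming a hypothetical `make_step(x_t, prev_h)` that builds one
# step block exposing `.h`); it shows how `reverse` changes both the visit
# order and which end of the sequence receives the padding state:
def unroll(make_step, xs, h_padding, reverse=False):
    order = range(len(xs) - 1, -1, -1) if reverse else range(len(xs))
    blocks = []
    prev_h = h_padding                  # the boundary step sees the padding
    for k in order:
        block = make_step(xs[k], prev_h)
        blocks.append(block)
        prev_h = block.h                # later steps see their predecessor
    hs = [block.h for block in blocks]
    return hs[::-1] if reverse else hs  # hand back in time order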
def __init__(self, ptb_train, ptb_valid, batch_size, sentence_max_len,
             device_id):
    self.blocking_contexts = None
    self.context = Context(device_id)
    device_id = self.context.device_id
    self.train_offsets = HomogeneousDataGenerator(ptb_train, batch_size,
                                                  sentence_max_len,
                                                  randomize=True,
                                                  infinite=True)
    self.valid_offsets = HomogeneousDataGenerator(ptb_valid, batch_size,
                                                  sentence_max_len)
    train_sentences = np.array([self.train_offsets.flatten_sentences])
    valid_sentences = np.array([self.valid_offsets.flatten_sentences])
    self.train_sents = Matrix.from_npa(train_sentences, 'int', device_id)
    self.valid_sents = Matrix.from_npa(valid_sentences, 'int', device_id)
    self._sent_lengths = np.empty((batch_size, 1), dtype=np.int32,
                                  order='F')
    self.sent_lengths = Matrix.from_npa(self._sent_lengths,
                                        device_id=device_id)
    sentence_batch = Matrix.empty(batch_size, sentence_max_len, 'int',
                                  device_id)
    self.sentence_batch = Connector(sentence_batch, self.context)
    self.sentence_batch.sync_fill(0)
    self._mask = Matrix.empty(sentence_batch.nrows,
                              self.sentence_batch.ncols, 'float', device_id)
    self.mask = List([Connector(self._mask[:, i])
                      for i in xrange(sentence_max_len)],
                     self.sentence_batch.ncols)
    self.train_offsets_iterator = iter(self.train_offsets)
    self.valid_offsets_iterator = iter(self.valid_offsets)
    self.training_mode = True
def test_bprop_matrix(self):
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        embd_dim = self.rng.random_integers(10000)
        batch_size = self.rng.random_integers(500)
        output_dim = self.rng.random_integers(2000)
        W = self.get_orthogonal_matrix(embd_dim, output_dim)
        row_idxs = self.rng.randint(
            embd_dim,
            size=(batch_size, max_input_sequence_len)).astype(np.int32)
        true_labels = [self.rng.randint(output_dim,
                                        size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        device_id = 0

        output = {}
        for processor_type in ['gpu', 'cpu']:
            quagga.processor_type = processor_type
            qrow_idxs = Connector(Matrix.from_npa(row_idxs))
            qtrue_labels = List([Connector(Matrix.from_npa(e))
                                 for e in true_labels], qrow_idxs.ncols)
            qW = Connector(Matrix.from_npa(W), device_id)
            row_slicing_block = RowSlicingBlock(qW, qrow_idxs)
            seq_sce_block = SequencerBlock(
                block_class=SoftmaxCeBlock,
                params=[],
                sequences=[row_slicing_block.output, qtrue_labels])
            qW.fprop()
            qrow_idxs.ncols = sequence_len
            qrow_idxs.fprop()
            row_slicing_block.fprop()
            seq_sce_block.fprop()
            seq_sce_block.bprop()
            row_slicing_block.bprop()
            qW.add(Context(), qW.backward_matrix)
            output[processor_type] = qW.to_host()
        r.append(np.allclose(output['gpu'], output['cpu']))
    self.assertEqual(sum(r), len(r))
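# The RowSlicingBlock gradient that the tests above cross-check is an
# embedding-style scatter-add: each selected row of W accumulates the output
# gradient of every timestep/batch entry that picked it. A NumPy sketch of
# that accumulation (not quagga's actual kernel):
import numpy as np

def row_slicing_bprop(row_idxs, dL_doutput, embd_dim):
    # row_idxs: int array of selected rows; dL_doutput: matching gradients
    # with shape row_idxs.shape + (output_dim,)
    output_dim = dL_doutput.shape[-1]
    dL_dW = np.zeros((embd_dim, output_dim), dtype=dL_doutput.dtype)
    np.add.at(dL_dW, row_idxs.ravel(),
              dL_doutput.reshape(-1, output_dim))  # repeats accumulate
    return dL_dW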
def test_theano_grad(self):
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(300)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(128)
        input_dim, hidden_dim, class_num = \
            self.rng.random_integers(1500, size=3)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(class_num,
                                        size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(
            np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))
        lr_W = self.get_orthogonal_matrix(hidden_dim, class_num)
        lr_b = self.rng.rand(1, class_num).astype(dtype=np.float32)
        device_id = 0

        for reverse in [False, True]:
            for with_mask in [False, True]:
                for learn_initial_states in [False, True]:
                    # quagga model
                    context = Context()
                    qx = List([Connector(Matrix.from_npa(e), device_id)
                               for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e))
                                         for e in true_labels], qx.length)
                    qmask = Matrix.empty(batch_size, qx.length, 'float')
                    qmask_list = [Connector(qmask[:, i])
                                  for i in xrange(qmask.ncols)]
                    qmask = Connector(qmask)
                    qh_0 = Connector(Matrix.from_npa(h_0),
                                     device_id if learn_initial_states
                                     else None)
                    qc_0 = Connector(Matrix.from_npa(c_0),
                                     device_id if learn_initial_states
                                     else None)
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qR = Connector(Matrix.from_npa(R), device_id)
                    qlr_W = Connector(Matrix.from_npa(lr_W), device_id)
                    qlr_b = Connector(Matrix.from_npa(lr_b), device_id)
                    lstm = SequencerBlock(
                        block_class=LstmBlock,
                        params=[qW, qR],
                        sequences=[qx, qmask_list if with_mask
                                   else [None] * len(qx)],
                        output_names=['h'],
                        prev_names=['c', 'h'],
                        paddings=[qc_0, qh_0],
                        reverse=reverse)
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qlr_W, qlr_b],
                                                   sequences=[lstm.h],
                                                   output_names=['output'])
                    seq_sce_block = SequencerBlock(
                        block_class=SoftmaxCeBlock,
                        params=[],
                        sequences=[seq_dot_block.output, qtrue_labels,
                                   qmask_list if with_mask
                                   else [None] * len(qx)])
                    qx.length = sequence_len
                    for e in qx:
                        e.fprop()
                    for e in qtrue_labels:
                        e.fprop()
                    qmask.assign_npa(context, mask)
                    qmask.fprop()
                    qlr_W.fprop()
                    qlr_b.fprop()
                    qh_0.fprop()
                    qc_0.fprop()
                    qW.fprop()
                    qR.fprop()
                    lstm.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    lstm.bprop()
                    quagga_grads = [qlr_b.backward_matrix.to_host(),
                                    qlr_W.backward_matrix.to_host(),
                                    qW.backward_matrix.to_host(),
                                    qR.backward_matrix.to_host()]
                    if learn_initial_states:
                        quagga_grads.append(qc_0.backward_matrix.to_host())
                        quagga_grads.append(qh_0.backward_matrix.to_host())
                    quagga_grads.append([e.backward_matrix.to_host()
                                         for e in qx])
                    del qx
                    del qlr_b
                    del qlr_W
                    del qW
                    del qR
                    del qmask
                    del lstm
                    del seq_dot_block
                    del seq_sce_block

                    # theano model
                    th_x = T.ftensor3()
                    th_true_labels = T.imatrix()
                    th_mask = T.fmatrix()
                    lstm_layer = LstmLayer(W, R, c_0, h_0, reverse=reverse)
                    th_h = lstm_layer.get_output_expr(
                        th_x, th_mask if with_mask else None)
                    seq_softmax_layer = SequentialSoftmaxLayer(lr_W, lr_b,
                                                               reverse)
                    loss = seq_softmax_layer.get_loss(
                        th_h, th_true_labels,
                        th_mask if with_mask else None)
                    wrt = [seq_softmax_layer.b, seq_softmax_layer.W,
                           lstm_layer.W, lstm_layer.R]
                    if learn_initial_states:
                        wrt.append(lstm_layer.c0)
                        wrt.append(lstm_layer.h0)
                    wrt.append(th_x)
                    grads = T.grad(loss, wrt)
                    if with_mask:
                        get_theano_grads = theano.function(
                            [th_x, th_true_labels, th_mask], grads)
                        theano_grads = get_theano_grads(
                            np.dstack(x[:sequence_len]),
                            np.hstack(true_labels[:sequence_len]),
                            mask[:, :sequence_len])
                    else:
                        get_theano_grads = theano.function(
                            [th_x, th_true_labels], grads)
                        theano_grads = get_theano_grads(
                            np.dstack(x[:sequence_len]),
                            np.hstack(true_labels[:sequence_len]))

                    for quagga_grad, theano_grad in izip(quagga_grads[:-1],
                                                         theano_grads[:-1]):
                        r.append(np.allclose(quagga_grad, theano_grad,
                                             atol=1e-6))
                    for t in xrange(theano_grads[-1].shape[-1]):
                        if not np.allclose(quagga_grads[-1][t],
                                           theano_grads[-1][..., t],
                                           atol=1e-6):
                            r.append(False)
                            break
                    else:
                        r.append(True)
    self.assertEqual(sum(r), len(r))
    sce_dot_block_W={
        'init': Orthogonal(1024, len(vocab)),
        'device_id': 0
    },
    sce_dot_block_b={
        'init': Constant(1, len(vocab)),
        'device_id': 0
    })
data_block = PtbMiniBatchesGenerator(ptb_train, ptb_valid, batch_size=64,
                                     sentence_max_len=100, device_id=0)
seq_embd_block = RowSlicingBlock(p['embd_W'], data_block.sentence_batch)
# drop the last timestep: the final token has no next-token target
output = List(seq_embd_block.output[:-1], seq_embd_block.output.length - 1)
c_fwd_repeat_block = RepeatBlock(p['lstm_fwd_c0'],
                                 data_block.sentence_batch.nrows,
                                 axis=0, device_id=0)
h_fwd_repeat_block = RepeatBlock(p['lstm_fwd_h0'],
                                 data_block.sentence_batch.nrows,
                                 axis=0, device_id=0)
fwd_lstm_block = SequencerBlock(
    block_class=LstmBlock,
    params=[p['lstm_fwd_W'], p['lstm_fwd_R'], 0.5],
    sequences=[output, data_block.mask],
    output_names=['h'],
    prev_names=['c', 'h'],
    paddings=[c_fwd_repeat_block.output, h_fwd_repeat_block.output],
def test_theano_grad(self):
    class SequentialMeanPoolingLayer(object):
        def get_output_expr(self, input_sequence):
            return T.mean(input_sequence, axis=2)

    class LogisticRegressionLayer(object):
        def __init__(self, W_init, b_init):
            self.W = theano.shared(value=W_init())
            self.b = theano.shared(value=b_init())

        def get_output_expr(self, input_expr):
            return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(512)
        dim = self.rng.random_integers(1500)
        x = [self.rng.rand(batch_size, dim).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        # binary labels in {0, 1} for the sigmoid cross-entropy
        true_labels = self.rng.randint(
            2, size=(batch_size, 1)).astype(dtype=np.float32)
        W_init = self.get_orthogonal_initializer(dim, 1)
        b_init = lambda: self.rng.rand(1, 1).astype(dtype=np.float32)

        # Theano model
        state = self.rng.get_state()
        th_x = T.ftensor3()
        th_true_labels = T.fmatrix()
        smp_layer = SequentialMeanPoolingLayer()
        lr_layer = LogisticRegressionLayer(W_init, lambda: b_init()[0])
        probs = lr_layer.get_output_expr(smp_layer.get_output_expr(th_x))
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        grad_x = T.grad(loss, wrt=th_x)
        get_grad_x = theano.function([th_x, th_true_labels], grad_x)

        # quagga model
        self.rng.set_state(state)
        context = Context()
        x = List([Connector(Matrix.from_npa(e), context, context)
                  for e in x])
        true_labels = Connector(Matrix.from_npa(true_labels))
        smp_block = SequentialMeanPoolingBlock(x)
        dot_block = DotBlock(W_init, b_init, smp_block.output)
        sce_block = SigmoidCeBlock(dot_block.output, true_labels)
        x.set_length(sequence_len)
        smp_block.fprop()
        dot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        dot_block.bprop()
        smp_block.bprop()
        dL_dx = [e.backward_matrix.to_host() for e in x]

        dL_dx_th = get_grad_x(np.dstack([e.to_host() for e in x]),
                              true_labels.to_host())
        for t in xrange(dL_dx_th.shape[-1]):
            if not np.allclose(dL_dx[t], dL_dx_th[..., t]):
                r.append(False)
                break
        else:
            r.append(True)
    self.assertEqual(sum(r), self.N)
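# The mean-pooling gradient exercised above has a simple closed form: with
# output = (1 / T) * sum_t x_t over the first T = sequence_len timesteps,
# dL/dx_t = dL/doutput / T for every contributing t (and zero beyond T).
# A NumPy statement of that fact:
import numpy as np

def mean_pooling_bprop(dL_doutput, sequence_len):
    # one (batch_size, dim) gradient slice per contributing timestep
    return [dL_doutput / sequence_len for _ in xrange(sequence_len)]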
def test_theano_grad(self):
    class AttentionLayer(object):
        def __init__(self, u, mask=None):
            self.u = theano.shared(value=u)
            self.mask = mask

        def get_output_expr(self, input_expr):
            input_expr = input_expr.dimshuffle(0, 2, 1)
            pre_a = T.dot(input_expr, self.u)[:, :, 0]
            if self.mask is not None:
                # push masked-out positions to -FLT_MAX before the softmax
                pre_a = self.mask * pre_a - \
                    (1 - self.mask) * 3.402823466e+38
            a = T.nnet.softmax(pre_a)[:, :, np.newaxis]
            return T.sum(a * input_expr, axis=1)

    class LogisticRegressionLayer(object):
        def __init__(self, W, b):
            self.W = theano.shared(value=W)
            if b is not None:
                self.b = theano.shared(value=b[0])

        def get_output_expr(self, input_expr):
            if hasattr(self, 'b'):
                return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)
            return T.nnet.sigmoid(T.dot(input_expr, self.W))

    r = []
    for i in xrange(self.N):
        batch_size = self.rng.random_integers(500)
        x_dim = self.rng.random_integers(3000)
        n_ts = self.rng.random_integers(100)
        x = [self.rng.rand(batch_size, x_dim).astype(np.float32)
             for _ in xrange(n_ts)]
        u = self.get_orthogonal_matrix(x_dim, 1)
        lr_dot_W = self.get_orthogonal_matrix(x_dim, 1)
        lr_dot_b = self.rng.rand(1, 1).astype(np.float32) \
            if self.rng.randint(2) else None
        true_labels = self.rng.randint(
            2, size=(batch_size, 1)).astype(np.float32)
        mask = self.rng.randint(2, size=(batch_size, n_ts)).astype(
            np.float32) if self.rng.randint(2) else None
        device_id = 0

        # Theano model
        state = self.rng.get_state()
        th_x = T.ftensor3()
        th_mask = T.fmatrix() if mask is not None else None
        th_true_labels = T.fmatrix()
        attnt_layer = AttentionLayer(u, th_mask)
        lr_layer = LogisticRegressionLayer(lr_dot_W, lr_dot_b)
        probs = th_x
        for layer in [attnt_layer, lr_layer]:
            probs = layer.get_output_expr(probs)
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        params = [lr_layer.W, attnt_layer.u, th_x]
        if hasattr(lr_layer, 'b'):
            params.append(lr_layer.b)
        th_grads = T.grad(loss, wrt=params)
        get_theano_grads = theano.function(
            [th_x, th_true_labels] + ([th_mask] if mask is not None
                                      else []),
            th_grads)
        th_grads = get_theano_grads(
            *([np.dstack(x), true_labels] +
              ([mask] if mask is not None else [])))

        # quagga model
        self.rng.set_state(state)
        x = List([Connector(Matrix.from_npa(e), device_id) for e in x])
        u = Connector(Matrix.from_npa(u), device_id)
        lr_dot_W = Connector(Matrix.from_npa(lr_dot_W), device_id)
        lr_dot_b = Connector(Matrix.from_npa(lr_dot_b), device_id) \
            if lr_dot_b is not None else lr_dot_b
        true_labels = Connector(Matrix.from_npa(true_labels))
        if mask is not None:
            mask = Connector(Matrix.from_npa(mask))
        attnt_block = AttentionBlock(x, u, mask)
        lrdot_block = DotBlock(lr_dot_W, lr_dot_b, attnt_block.output)
        sce_block = SigmoidCeBlock(lrdot_block.output, true_labels)
        x.fprop()
        true_labels.fprop()
        u.fprop()
        lr_dot_W.fprop()
        if lr_dot_b:
            lr_dot_b.fprop()
        attnt_block.fprop()
        lrdot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        lrdot_block.bprop()
        attnt_block.bprop()
        q_grads = [lr_dot_W.backward_matrix.to_host(),
                   u.backward_matrix.to_host(),
                   np.dstack([e.backward_matrix.to_host() for e in x])]
        if lr_dot_b:
            q_grads.append(lr_dot_b.backward_matrix.to_host())

        for th_grad, q_grad in izip(th_grads, q_grads):
            r.append(np.allclose(th_grad, q_grad, atol=1.e-7))
    self.assertEqual(sum(r), len(r))
def test_theano_bprop(self):
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim,
                                        size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0

        for reverse in [False, True]:
            for with_bias in [False, True]:
                qx = List([Connector(Matrix.from_npa(e), device_id)
                           for e in x])
                qtrue_labels = List([Connector(Matrix.from_npa(e))
                                     for e in true_labels], len(qx))
                qW = Connector(Matrix.from_npa(W), device_id)
                qb = Connector(Matrix.from_npa(b), device_id) \
                    if with_bias else None
                seq_dot_block = SequencerBlock(block_class=DotBlock,
                                               params=[qW, qb],
                                               sequences=[qx],
                                               output_names=['output'],
                                               reverse=reverse)
                seq_sce_block = SequencerBlock(
                    block_class=SoftmaxCeBlock,
                    params=[],
                    sequences=[seq_dot_block.output, qtrue_labels],
                    reverse=reverse)
                qx.length = sequence_len
                qx.fprop()
                qtrue_labels.fprop()
                qW.fprop()
                if qb:
                    qb.fprop()
                seq_dot_block.fprop()
                seq_sce_block.fprop()
                seq_sce_block.bprop()
                seq_dot_block.bprop()
                quagga_grads = [qW.backward_matrix.to_host()]
                if with_bias:
                    quagga_grads.append(qb.backward_matrix.to_host())
                quagga_grads.append([e.backward_matrix.to_host()
                                     for e in qx])

                seq_dot_layer = SequentialDotLayer(W,
                                                   b if with_bias else None,
                                                   reverse)
                seq_sce_layer = SequentialSoftmaxLayer()
                th_x = T.ftensor3()
                th_true_labels = T.imatrix()
                loss = seq_sce_layer.get_loss(
                    seq_dot_layer.get_output_expr(th_x), th_true_labels)
                wrt = [seq_dot_layer.W]
                if with_bias:
                    wrt.append(seq_dot_layer.b)
                wrt.append(th_x)
                grads = T.grad(loss, wrt)
                get_theano_grads = theano.function([th_x, th_true_labels],
                                                   grads)
                theano_grads = get_theano_grads(
                    np.dstack(x[:sequence_len]),
                    np.hstack(true_labels[:sequence_len]))

                for quagga_grad, theano_grad in izip(quagga_grads[:-1],
                                                     theano_grads[:-1]):
                    r.append(np.allclose(quagga_grad, theano_grad,
                                         atol=1e-5))
                for t in xrange(theano_grads[-1].shape[-1]):
                    if not np.allclose(quagga_grads[-1][t],
                                       theano_grads[-1][..., t],
                                       atol=1e-5):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(2,
                                        size=(batch_size, 1)).astype(np.float32)
                       for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(
            np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))
        lr_W = self.get_orthogonal_matrix(hidden_dim, 1)
        lr_b = self.rng.rand(1, 1).astype(dtype=np.float32)
        device_id = 0

        quagga_grads = {}
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for learn_initial_states in [False, True]:
                    for processor_type in ['gpu', 'cpu']:
                        quagga.processor_type = processor_type
                        context = Context()
                        qx = List([Connector(Matrix.from_npa(e), device_id)
                                   for e in x])
                        qtrue_labels = List([Connector(Matrix.from_npa(e))
                                             for e in true_labels],
                                            len(qx))
                        qmask = Matrix.empty(batch_size, len(qx))
                        qh_0 = Connector(Matrix.from_npa(h_0),
                                         device_id if learn_initial_states
                                         else None)
                        qc_0 = Connector(Matrix.from_npa(c_0),
                                         device_id if learn_initial_states
                                         else None)
                        qW = Connector(Matrix.from_npa(W), device_id)
                        qR = Connector(Matrix.from_npa(R), device_id)
                        qlr_W = Connector(Matrix.from_npa(lr_W), device_id)
                        qlr_b = Connector(Matrix.from_npa(lr_b), device_id)
                        sequences = [qx]
                        if with_mask:
                            sequences.append(List([Connector(qmask[:, i])
                                                   for i in xrange(len(qx))],
                                                  len(qx)))
                            qmask.assign_npa(context, mask)
                            qmask = sequences[-1]
                        else:
                            sequences.append([None] * len(qx))
                        lstm = SequencerBlock(block_class=LstmBlock,
                                              params=[qW, qR],
                                              sequences=sequences,
                                              output_names=['h'],
                                              prev_names=['c', 'h'],
                                              paddings=[qc_0, qh_0],
                                              reverse=reverse)
                        seq_dot_block = SequencerBlock(
                            block_class=DotBlock,
                            params=[qlr_W, qlr_b],
                            sequences=[lstm.h],
                            output_names=['output'])
                        seq_sce_block = SequencerBlock(
                            block_class=SigmoidCeBlock,
                            params=[],
                            sequences=[seq_dot_block.output, qtrue_labels] +
                                      ([qmask] if with_mask else []))
                        qx.length = sequence_len
                        qx.fprop()
                        qtrue_labels.fprop()
                        if with_mask:
                            qmask.fprop()
                        qlr_W.fprop()
                        qlr_b.fprop()
                        qh_0.fprop()
                        qc_0.fprop()
                        qW.fprop()
                        qR.fprop()
                        lstm.fprop()
                        seq_dot_block.fprop()
                        seq_sce_block.fprop()
                        seq_sce_block.bprop()
                        seq_dot_block.bprop()
                        lstm.bprop()
                        quagga_grads[processor_type] = [
                            qlr_b.backward_matrix.to_host(),
                            qlr_W.backward_matrix.to_host(),
                            qW.backward_matrix.to_host(),
                            qR.backward_matrix.to_host()]
                        if learn_initial_states:
                            quagga_grads[processor_type].append(
                                qc_0.backward_matrix.to_host())
                            quagga_grads[processor_type].append(
                                qh_0.backward_matrix.to_host())
                        quagga_grads[processor_type].extend(
                            e.backward_matrix.to_host() for e in qx)
                    for grad_gpu, grad_cpu in izip(quagga_grads['gpu'],
                                                   quagga_grads['cpu']):
                        r.append(np.allclose(grad_gpu, grad_cpu,
                                             atol=1e-6))
    self.assertEqual(sum(r), len(r))
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        true_labels = [self.rng.randint(hidden_dim,
                                        size=(batch_size, 1)).astype(np.int32)
                       for _ in xrange(max_input_sequence_len)]
        W = self.get_orthogonal_matrix(input_dim, hidden_dim)
        b = self.rng.rand(1, hidden_dim).astype(np.float32)
        device_id = 0

        quagga_grads = {}
        for reverse in [False, True]:
            for with_bias in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    qx = List([Connector(Matrix.from_npa(e), device_id)
                               for e in x])
                    qtrue_labels = List([Connector(Matrix.from_npa(e))
                                         for e in true_labels], len(qx))
                    qW = Connector(Matrix.from_npa(W), device_id)
                    qb = Connector(Matrix.from_npa(b), device_id) \
                        if with_bias else None
                    seq_dot_block = SequencerBlock(block_class=DotBlock,
                                                   params=[qW, qb],
                                                   sequences=[qx],
                                                   output_names=['output'],
                                                   reverse=reverse)
                    seq_sce_block = SequencerBlock(
                        block_class=SoftmaxCeBlock,
                        params=[],
                        sequences=[seq_dot_block.output, qtrue_labels],
                        reverse=reverse)
                    qx.length = sequence_len
                    qx.fprop()
                    qtrue_labels.fprop()
                    qW.fprop()
                    if qb:
                        qb.fprop()
                    seq_dot_block.fprop()
                    seq_sce_block.fprop()
                    seq_sce_block.bprop()
                    seq_dot_block.bprop()
                    quagga_grads[processor_type] = \
                        [qW.backward_matrix.to_host()]
                    if with_bias:
                        quagga_grads[processor_type].append(
                            qb.backward_matrix.to_host())
                    quagga_grads[processor_type].extend(
                        e.backward_matrix.to_host() for e in qx)
                for grad_gpu, grad_cpu in izip(quagga_grads['gpu'],
                                               quagga_grads['cpu']):
                    r.append(np.allclose(grad_gpu, grad_cpu, atol=1e-5))
    self.assertEqual(sum(r), len(r))
def test_theano_fprop(self):
    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(
            np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))

        for reverse in [False, True]:
            for with_mask in [False, True]:
                context = Context()
                qx = List([Connector(Matrix.from_npa(e)) for e in x])
                qmask = Connector(Matrix.empty(batch_size, len(qx),
                                               'float'))
                qh_0 = Connector(Matrix.from_npa(h_0))
                qc_0 = Connector(Matrix.from_npa(c_0))
                qW = Connector(Matrix.from_npa(W))
                qR = Connector(Matrix.from_npa(R))
                lstm = SequencerBlock(block_class=LstmBlock,
                                      params=[qW, qR],
                                      sequences=[qx] + ([qmask] if with_mask
                                                        else []),
                                      output_names=['h'],
                                      prev_names=['c', 'h'],
                                      paddings=[qc_0, qh_0],
                                      reverse=reverse)
                qx.length = sequence_len
                for e in qx:
                    e.fprop()
                qmask.assign_npa(context, mask)
                qmask.fprop()
                qh_0.fprop()
                qc_0.fprop()
                qW.fprop()
                qR.fprop()
                lstm.fprop()
                q_h = lstm.h.to_host()

                th_x = T.ftensor3()
                lstm_layer = LstmLayer(W, R, c_0, h_0, reverse)
                if with_mask:
                    th_mask = T.fmatrix()
                    get_th_h = theano.function(
                        [th_x, th_mask],
                        lstm_layer.get_output_expr(th_x, th_mask))
                    th_h = get_th_h(np.dstack(x[:sequence_len]),
                                    mask[:, :sequence_len])
                else:
                    get_th_h = theano.function(
                        [th_x], lstm_layer.get_output_expr(th_x))
                    th_h = get_th_h(np.dstack(x[:sequence_len]))

                for t in xrange(th_h.shape[0]):
                    if not np.allclose(q_h[t], th_h[t]):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
def test_theano_grad(self):
    device_id = 0

    class SequentialHorizontalStackLayer(object):
        def get_output_expr(self, x_sequence, y_sequence):
            return T.concatenate((x_sequence, y_sequence), axis=1)

    class SequentialMeanPoolingLayer(object):
        def get_output_expr(self, input_sequence):
            return T.mean(input_sequence, axis=2)

    class LogisticRegressionLayer(object):
        def __init__(self, W_init, b_init):
            self.W = theano.shared(value=W_init())
            self.b = theano.shared(value=b_init())

        def get_output_expr(self, input_expr):
            return T.nnet.sigmoid(T.dot(input_expr, self.W) + self.b)

    quagga.processor_type = 'gpu'
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        dim_x, dim_y = self.rng.random_integers(1280, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        # binary labels in {0, 1} for the sigmoid cross-entropy
        true_labels = self.rng.randint(
            2, size=(batch_size, 1)).astype(dtype=np.float32)
        W_init = self.get_orthogonal_initializer(dim_x + dim_y, 1)
        b_init = lambda: self.rng.rand(1, 1).astype(dtype=np.float32)

        # Theano model
        state = self.rng.get_state()
        th_x = T.ftensor3()
        th_y = T.ftensor3()
        th_true_labels = T.fmatrix()
        shs_layer = SequentialHorizontalStackLayer()
        smp_layer = SequentialMeanPoolingLayer()
        lr_layer = LogisticRegressionLayer(W_init, lambda: b_init()[0])
        probs = shs_layer.get_output_expr(th_x, th_y)
        probs = lr_layer.get_output_expr(smp_layer.get_output_expr(probs))
        loss = T.mean(T.nnet.binary_crossentropy(probs, th_true_labels))
        grads = T.grad(loss, wrt=[th_x, th_y])
        get_grads = theano.function([th_x, th_y, th_true_labels], grads)
        dL_dx_sequence_th, dL_dy_sequence_th = get_grads(
            np.dstack(x[:sequence_len]), np.dstack(y[:sequence_len]),
            true_labels)

        # quagga model
        self.rng.set_state(state)
        W = Connector(Matrix.from_npa(W_init(), device_id=device_id),
                      device_id)
        b = Connector(Matrix.from_npa(b_init(), device_id=device_id),
                      device_id)
        x = List([Connector(Matrix.from_npa(e), device_id) for e in x])
        y = List([Connector(Matrix.from_npa(e), device_id) for e in y])
        true_labels = Connector(Matrix.from_npa(true_labels))
        shs_block = SequentialHorizontalStackBlock(x, y)
        smp_block = SequentialMeanPoolingBlock(shs_block.output)
        dot_block = DotBlock(W, b, smp_block.output)
        sce_block = SigmoidCeBlock(dot_block.output, true_labels)
        x.length = sequence_len
        y.length = sequence_len
        shs_block.fprop()
        smp_block.fprop()
        dot_block.fprop()
        sce_block.fprop()
        sce_block.bprop()
        dot_block.bprop()
        smp_block.bprop()
        shs_block.bprop()
        dL_dx_sequence = [e.backward_matrix.to_host() for e in x]
        dL_dy_sequence = [e.backward_matrix.to_host() for e in y]

        for t in xrange(dL_dx_sequence_th.shape[-1]):
            if not np.allclose(dL_dx_sequence[t],
                               dL_dx_sequence_th[..., t], atol=1.e-6):
                r.append(False)
                break
        else:
            r.append(True)
        for t in xrange(dL_dy_sequence_th.shape[-1]):
            if not np.allclose(dL_dy_sequence[t],
                               dL_dy_sequence_th[..., t], atol=1.e-6):
                r.append(False)
                break
        else:
            r.append(True)
    self.assertEqual(sum(r), self.N * 2)
def test_fprop(self):
    """
    compare `fprop` results for cpu and gpu backends
    """
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        input_dim, hidden_dim = self.rng.random_integers(1500, size=2)
        x = [self.rng.randn(batch_size, input_dim).astype(np.float32)
             for _ in xrange(max_input_sequence_len)]
        mask = (self.rng.rand(batch_size, sequence_len) < 0.8).astype(
            np.float32)
        h_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        c_0 = self.rng.randn(batch_size, hidden_dim).astype(np.float32)
        W_z = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_i = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_f = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W_o = self.get_orthogonal_matrix(input_dim, hidden_dim)
        W = np.hstack((W_z, W_i, W_f, W_o))
        R_z = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_i = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_f = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R_o = self.get_orthogonal_matrix(hidden_dim, hidden_dim)
        R = np.hstack((R_z, R_i, R_f, R_o))

        qh = {}
        for reverse in [False, True]:
            for with_mask in [False, True]:
                for processor_type in ['gpu', 'cpu']:
                    quagga.processor_type = processor_type
                    context = Context()
                    qx = List([Connector(Matrix.from_npa(e)) for e in x])
                    qmask = Matrix.empty(batch_size, len(qx), 'float')
                    qh_0 = Connector(Matrix.from_npa(h_0))
                    qc_0 = Connector(Matrix.from_npa(c_0))
                    qW = Connector(Matrix.from_npa(W))
                    qR = Connector(Matrix.from_npa(R))
                    sequences = [qx]
                    if with_mask:
                        sequences.append(List([Connector(qmask[:, i])
                                               for i in xrange(len(qx))],
                                              len(qx)))
                        qmask.assign_npa(context, mask)
                        qmask = sequences[-1]
                    else:
                        sequences.append([None] * len(qx))
                    lstm = SequencerBlock(block_class=LstmBlock,
                                          params=[qW, qR],
                                          sequences=sequences,
                                          output_names=['h'],
                                          prev_names=['c', 'h'],
                                          paddings=[qc_0, qh_0],
                                          reverse=reverse)
                    qx.length = sequence_len
                    if with_mask:
                        qmask.fprop()
                    qx.fprop()
                    qh_0.fprop()
                    qc_0.fprop()
                    qW.fprop()
                    qR.fprop()
                    lstm.fprop()
                    qh[processor_type] = lstm.h.to_host()
                for h_gpu, h_cpu in izip(qh['gpu'], qh['cpu']):
                    if not np.allclose(h_gpu, h_cpu, rtol=1e-7, atol=1e-3):
                        r.append(False)
                        break
                else:
                    r.append(True)
    self.assertEqual(sum(r), len(r))
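# For reference, the stacked parameter layout W = [W_z W_i W_f W_o] and
# R = [R_z R_i R_f R_o] used in these LSTM tests suggests the standard
# (Graves-style) cell; a NumPy sketch of one unmasked step, not LstmBlock's
# actual implementation:
import numpy as np

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

def lstm_step(x_t, h_prev, c_prev, W, R):
    pre = np.dot(x_t, W) + np.dot(h_prev, R)      # (batch, 4 * hidden_dim)
    z, i, f, o = np.split(pre, 4, axis=1)
    z = np.tanh(z)                                # block input
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)  # input/forget/output gates
    c_t = i * z + f * c_prev                      # new cell state
    h_t = o * np.tanh(c_t)                        # new hidden state
    return h_t, c_t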
def test_bprop(self):
    """
    compare `bprop` results for cpu and gpu backends
    """
    device_id = 0
    r = []
    for i in xrange(self.N):
        max_input_sequence_len = self.rng.random_integers(500)
        sequence_len = max_input_sequence_len if i == 0 else \
            self.rng.random_integers(max_input_sequence_len)
        batch_size = self.rng.random_integers(256)
        dim_x, dim_y = self.rng.random_integers(1280, size=2)
        x = [self.rng.rand(batch_size, dim_x).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        y = [self.rng.rand(batch_size, dim_y).astype(dtype=np.float32)
             for _ in xrange(max_input_sequence_len)]
        state = self.rng.get_state()

        quagga.processor_type = 'gpu'
        context = Context()
        x_gpu = List([Connector(Matrix.from_npa(e), device_id) for e in x])
        y_gpu = List([Connector(Matrix.from_npa(e), device_id) for e in y])
        seq_hstack_block_gpu = SequentialHorizontalStackBlock(x_gpu, y_gpu)
        x_gpu.length = sequence_len
        y_gpu.length = sequence_len
        _, dL_doutput_sequence = izip(
            *seq_hstack_block_gpu.output.register_usage(device_id,
                                                        device_id))
        seq_hstack_block_gpu.fprop()
        for dL_doutput in dL_doutput_sequence:
            random_matrix = self.rng.rand(dL_doutput.nrows,
                                          dL_doutput.ncols)
            dL_doutput.assign(context, Matrix.from_npa(random_matrix,
                                                       'float'))
        seq_hstack_block_gpu.bprop()
        dL_dx_matrices_gpu = [e.backward_matrix.to_host() for e in x_gpu]
        dL_dy_matrices_gpu = [e.backward_matrix.to_host() for e in y_gpu]

        self.rng.set_state(state)
        quagga.processor_type = 'cpu'
        context = Context()
        x_cpu = List([Connector(Matrix.from_npa(e), device_id) for e in x])
        y_cpu = List([Connector(Matrix.from_npa(e), device_id) for e in y])
        seq_hstack_block_cpu = SequentialHorizontalStackBlock(x_cpu, y_cpu)
        x_cpu.length = sequence_len
        y_cpu.length = sequence_len
        _, dL_doutput_sequence = izip(
            *seq_hstack_block_cpu.output.register_usage(device_id,
                                                        device_id))
        seq_hstack_block_cpu.fprop()
        for dL_doutput in dL_doutput_sequence:
            random_matrix = self.rng.rand(dL_doutput.nrows,
                                          dL_doutput.ncols)
            dL_doutput.assign(context, Matrix.from_npa(random_matrix,
                                                       'float'))
        seq_hstack_block_cpu.bprop()
        dL_dx_matrices_cpu = [e.backward_matrix.to_host() for e in x_cpu]
        dL_dy_matrices_cpu = [e.backward_matrix.to_host() for e in y_cpu]

        for dL_dx_gpu, dL_dx_cpu in izip(dL_dx_matrices_gpu,
                                         dL_dx_matrices_cpu):
            if not np.allclose(dL_dx_gpu, dL_dx_cpu):
                r.append(False)
                break
        else:
            r.append(True)
        for dL_dy_gpu, dL_dy_cpu in izip(dL_dy_matrices_gpu,
                                         dL_dy_matrices_cpu):
            if not np.allclose(dL_dy_gpu, dL_dy_cpu):
                r.append(False)
                break
        else:
            r.append(True)
        del x_gpu
        del y_gpu
        del seq_hstack_block_gpu
        del dL_dx_matrices_gpu
        del dL_dy_matrices_gpu
    self.assertEqual(sum(r), self.N * 2)