def test_dot_infershape(self):
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    self._compile_and_check([W, h, iIdx, b, oIdx],
                            [sparse_block_dot(W, h, iIdx, b, oIdx)],
                            self.gemv_data(),
                            self.gemv_class)
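# The tests in this file rely on helpers such as gemv_data() and
# gemv_numpy() from the test class. A minimal sketch of what a data
# generator like gemv_data() might return is given below; the block counts
# and sizes are illustrative assumptions, only the array shapes (the ones
# sparse_block_dot expects) matter.
import numpy

def _gemv_data_sketch():
    nInBlock, nOutBlock = 8, 7      # number of input / output blocks
    inSize, outSize = 6, 5          # size of each input / output block
    inWin, outWin, batch = 4, 3, 2  # active blocks per sample, batch size
    W = numpy.random.randn(nInBlock, nOutBlock, inSize, outSize).astype('float32')
    h = numpy.random.randn(batch, inWin, inSize).astype('float32')
    iIdx = numpy.vstack([numpy.random.permutation(nInBlock)[:inWin]
                         for _ in range(batch)]).astype('int32')
    b = numpy.random.randn(nOutBlock, outSize).astype('float32')
    oIdx = numpy.vstack([numpy.random.permutation(nOutBlock)[:outWin]
                         for _ in range(batch)]).astype('int32')
    return W, h, iIdx, b, oIdx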
def test_blocksparse_gpu_gemv_opt():
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx], o, mode=mode_with_gpu)

    # Counting the GpuSparseBlockGemv nodes is more robust than checking a
    # fixed position in the toposorted graph.
    assert sum(1 for n in f.maker.fgraph.apply_nodes
               if isinstance(n.op, GpuSparseBlockGemv)) == 1
def test_blocksparse_gpu_outer_opt():
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx],
                        [o, tensor.grad(o.sum(), wrt=W)],
                        mode=mode_with_gpu)

    assert isinstance(f.maker.fgraph.toposort()[-2].op, GpuSparseBlockOuter)
def test_blocksparse_inplace_gemv_opt():
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx], o)

    if theano.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
def test_blocksparse_inplace_outer_opt():
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx],
                        [o, tensor.grad(o.sum(), wrt=W)])

    if theano.config.mode == "FAST_COMPILE":
        assert not f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer)
    else:
        assert f.maker.fgraph.toposort()[-1].op.inplace
        assert check_stack_trace(f, ops_to_check=sparse_block_outer_inplace)
def test_sparseblockdot(self):
    # Compares the numpy version of sparseblockgemv to sparse_block_dot.
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    o = sparse_block_dot(W, h, iIdx, b, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()

    th_out = f(W_val, h_val, iIdx_val, b_val, oIdx_val)
    ref_out = self.gemv_numpy(b_val.take(oIdx_val, axis=0),
                              W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(ref_out, th_out)
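# A minimal sketch of the numpy reference that gemv_numpy implements,
# assuming the usual block-sparse gemv semantics,
# o[b, j] = bias[oIdx[b, j]] + sum_i dot(h[b, i], W[iIdx[b, i], oIdx[b, j]]);
# the helper on the real test class may differ in details.
import numpy

def _gemv_numpy_sketch(o, W, h, iIdx, oIdx):
    # o arrives pre-filled with the gathered biases, i.e.
    # b.take(oIdx, axis=0), of shape (batch, outputWindow, outputSize).
    for b in range(o.shape[0]):
        for j in range(o.shape[1]):
            for i in range(h.shape[1]):
                o[b, j, :] += numpy.dot(h[b, i], W[iIdx[b, i], oIdx[b, j]])
    return o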
def metaop(b, h, W):
    return sparse_block_dot(W, h, iIdx, b, oIdx)
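# metaop captures iIdx and oIdx from the enclosing test scope, so only the
# dense inputs (b, h, W) are differentiated. A sketch of how such a closure
# is typically handed to Theano's numeric gradient checker (the *_val
# arrays are assumed to come from gemv_data()):
#
#     from theano.tests import unittest_tools as utt
#     utt.verify_grad(metaop, [b_val, h_val, W_val])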
def h_softmax(x, batch_size, n_outputs, n_classes, n_outputs_per_class,
              W1, b1, W2, b2, target=None):
    """
    Two-level hierarchical softmax.

    The architecture is composed of two softmax layers: the first predicts
    the class of the input x, while the second predicts the output of the
    input x within the predicted class. More explanations can be found in
    the original paper [1]_.

    If target is specified, it will only compute the outputs of the
    corresponding targets. Otherwise, if target is None, it will compute
    all the outputs. The outputs are grouped in the same order as they are
    initially defined.

    .. versionadded:: 0.7.1

    Parameters
    ----------
    x: tensor of shape (batch_size, number of features)
        the minibatch input of the two-layer hierarchical softmax.
    batch_size: int
        the size of the minibatch input x.
    n_outputs: int
        the number of outputs.
    n_classes: int
        the number of classes of the two-layer hierarchical softmax. It
        corresponds to the number of outputs of the first softmax. See note
        at the end.
    n_outputs_per_class: int
        the number of outputs per class. See note at the end.
    W1: tensor of shape (number of features of the input x, n_classes)
        the weight matrix of the first softmax, which maps the input x to
        the probabilities of the classes.
    b1: tensor of shape (n_classes,)
        the bias vector of the first softmax layer.
    W2: tensor of shape (n_classes, number of features of the input x,
        n_outputs_per_class)
        the weight matrix of the second softmax, which maps the input x to
        the probabilities of the outputs.
    b2: tensor of shape (n_classes, n_outputs_per_class)
        the bias vector of the second softmax layer.
    target: tensor of shape either (batch_size,) or (batch_size, 1)
        (optional, default None)
        contains the indices of the targets for the minibatch input x. For
        each input, the function computes the output for its corresponding
        target. If target is None, then all the outputs are computed for
        each input.

    Returns
    -------
    output_probs: tensor of shape (batch_size, n_outputs) or (batch_size, 1)
        Output of the two-layer hierarchical softmax for input x. If target
        is not specified (None), then all the outputs are computed and the
        returned tensor has shape (batch_size, n_outputs). Otherwise, when
        target is specified, only the corresponding outputs are computed and
        the returned tensor has thus shape (batch_size, 1).

    Notes
    -----
    The product of n_outputs_per_class and n_classes has to be greater than
    or equal to n_outputs. If it is strictly greater, then the irrelevant
    outputs will be ignored.

    n_outputs_per_class and n_classes have to be the same as the
    corresponding dimensions of the tensors of W1, b1, W2 and b2.

    The most computationally efficient configuration is when
    n_outputs_per_class and n_classes are equal to the square root of
    n_outputs.

    References
    ----------
    .. [1] J. Goodman, "Classes for Fast Maximum Entropy Training,"
        ICASSP, 2001, <http://arxiv.org/abs/cs/0108006>.
""" # First softmax that computes the probabilities of belonging to each class class_probs = theano.tensor.nnet.softmax(T.dot(x, W1) + b1) if target is None: # Computes the probabilites of all the outputs # Second softmax that computes the output probabilities activations = T.tensordot(x, W2, (1, 1)) + b2 output_probs = theano.tensor.nnet.softmax( activations.reshape((-1, n_outputs_per_class))) output_probs = output_probs.reshape((batch_size, n_classes, -1)) output_probs = class_probs.dimshuffle(0, 1, 'x') * output_probs output_probs = output_probs.reshape((batch_size, -1)) # output_probs.shape[1] is n_classes * n_outputs_per_class, which might # be greater than n_outputs, so we ignore the potential irrelevant # outputs with the next line: output_probs = output_probs[:, :n_outputs] else: # Computes the probabilities of the outputs specified by the targets target = target.flatten() # Classes to which belong each target target_classes = target // n_outputs_per_class # Outputs to which belong each target inside a class target_outputs_in_class = target % n_outputs_per_class # Second softmax that computes the output probabilities activations = sparse_block_dot(W2.dimshuffle('x', 0, 1, 2), x.dimshuffle(0, 'x', 1), T.zeros((batch_size, 1), dtype='int32'), b2, target_classes.dimshuffle(0, 'x')) output_probs = theano.tensor.nnet.softmax(activations.dimshuffle(0, 2)) target_class_probs = class_probs[T.arange(batch_size), target_classes] output_probs = output_probs[T.arange(batch_size), target_outputs_in_class] output_probs = target_class_probs * output_probs return output_probs