Example #1
    def _log_prob_gradient(self, grad_output):
        """
        Parameters
        ----------
        grad_output

        Returns
        -------
        input_grad
        target_grad
        transition_grad
        """

        losses = self.auxiliary_data

        # Compute the gradients for each example:
        def seq_grad(b):
            gtn.backward(losses[b])

        # Compute gradients in parallel over the batch:
        gtn.parallel_for(seq_grad, range(len(losses)))

        transition_grad = self._transition_fst.grad().weights_to_numpy()
        duration_grad = self._duration_fst.grad().weights_to_numpy()

        input_grad = None
        target_grad = None
        transition_grad = torch.from_numpy(transition_grad) * grad_output.cpu()
        duration_grad = torch.from_numpy(duration_grad) * grad_output.cpu()

        return input_grad, target_grad, transition_grad, duration_grad
Example #2
    def viterbi(self, outputs):
        B, T, C = outputs.shape
        assert C == self.N, "Wrong number of classes in output."

        predictions = [None] * B

        def process(b):
            # create emission graph
            g_emissions = gtn.linear_graph(T, C, False)
            cpu_data = outputs[b].cpu().contiguous()
            g_emissions.set_weights(cpu_data.data_ptr())

            # create transition graph
            g_transitions = utils.ASGLossFunction.create_transitions_graph(
                self.transitions)
            g_path = gtn.viterbi_path(gtn.intersect(g_emissions,
                                                    g_transitions))
            prediction = g_path.labels_to_list()

            collapsed_prediction = [p for p, _ in groupby(prediction)]
            if self.garbage_idx is not None:
                # remove garbage tokens
                collapsed_prediction = [
                    p for p in collapsed_prediction if p != self.garbage_idx
                ]
            predictions[b] = utils.unpack_replabels(collapsed_prediction,
                                                    self.num_replabels)

        gtn.parallel_for(process, range(B))
        return [torch.IntTensor(p) for p in predictions]
Example #3
    def viterbi(self, outputs):
        B, T, C = outputs.shape

        if self.transitions is not None:
            cpu_data = self.transition_params.cpu().contiguous()
            self.transitions.set_weights(cpu_data.data_ptr())
            self.transitions.calc_grad = False

        self.tokens.arc_sort()

        paths = [None] * B
        def process(b):
            emissions = gtn.linear_graph(T, C, False)
            cpu_data = outputs[b].cpu().contiguous()
            emissions.set_weights(cpu_data.data_ptr())
            if self.transitions is not None:
                full_graph = gtn.intersect(emissions, self.transitions)
            else:
                full_graph = emissions

            # Find the best path and remove back-off arcs:
            path = gtn.remove(gtn.viterbi_path(full_graph))
            # Left compose the viterbi path with the "alignment to token"
            # transducer to get the outputs:
            path = gtn.compose(path, self.tokens)

            # When there are ambiguous paths (allow_repeats is true), we take
            # the shortest:
            path = gtn.viterbi_path(path)
            path = gtn.remove(gtn.project_output(path))
            paths[b] = path.labels_to_list()

        gtn.parallel_for(process, range(B))
        predictions = [torch.IntTensor(path) for path in paths]
        return predictions
Example #4
    def backward(ctx, grad_output):
        # CTX_GRAPHS is the module-level cache populated by the matching
        # forward pass (see Example #7 below):
        output_graphs, input_graphs, kernels = CTX_GRAPHS
        B, T, C = ctx.input_shape
        kernel_size = ctx.kernel_size
        stride = ctx.stride
        input_grad = torch.zeros((B, T, C))
        deltas = grad_output.cpu().numpy()

        def process(b):
            for t, window in enumerate(output_graphs[b]):
                for c, out in enumerate(window):
                    delta = make_scalar_graph(deltas[b, t, c])
                    gtn.backward(out, delta)
                grad = (input_graphs[b][t].grad().weights_to_numpy().reshape(
                    kernel_size, -1))
                input_grad[b, t * stride:t * stride + kernel_size] += grad

        gtn.parallel_for(process, range(B))

        if ctx.needs_input_grad[4]:
            kernel_grads = [k.grad().weights_to_numpy() for k in kernels]
            kernel_grads = np.concatenate(kernel_grads)
            kernel_grads = torch.from_numpy(kernel_grads).to(
                grad_output.device)
        else:
            kernel_grads = None
        return (
            input_grad.to(grad_output.device),
            None,  # kernels
            None,  # kernel_size
            None,  # stride
            kernel_grads,
            None,  # viterbi
        )
Example #5
    def argmax(self, inputs):
        seq_fsts = self.seq_fst()

        arc_scores = self.scores_to_arc(inputs)

        device = arc_scores.device
        arc_scores = arc_scores.cpu()

        batch_size, num_samples, num_classes = arc_scores.shape

        best_paths = [None] * batch_size

        def pred_seq(batch_index):
            obs_fst = linearFstFromArray(arc_scores[batch_index].reshape(
                num_samples, -1))

            # Compose each sequence fst individually: it seems like composition
            # only works for lattices
            denom_fst = obs_fst
            for seq_fst in seq_fsts:
                denom_fst = gtn.compose(denom_fst, seq_fst)

            viterbi_path = gtn.viterbi_path(denom_fst)
            best_paths[batch_index] = gtn.remove(
                gtn.project_output(viterbi_path))

        gtn.parallel_for(pred_seq, range(batch_size))

        best_paths = torch.tensor(
            [self._getOutputString(p) for p in best_paths]).to(device)
        return best_paths
Example #6
    def backward(ctx, grad_output):
        """Backward computation.

        :param torch.tensor grad_output: backward passed gradient value
        :return: cumulative gradient output
        :rtype: (torch.Tensor, None, None, None)
        """
        losses, scales, emissions_graphs, in_shape, ilens = ctx.auxiliary_data
        B, T, C = in_shape
        input_grad = torch.zeros((B, T, C))

        def process(b):
            T = ilens[b]
            gtn.backward(losses[b], False)
            emissions = emissions_graphs[b]
            grad = emissions.grad().weights_to_numpy()
            input_grad[b][:T] = torch.from_numpy(grad).view(1, T,
                                                            C) * scales[b]

        gtn.parallel_for(process, range(B))

        if grad_output.is_cuda:
            input_grad = input_grad.cuda()
        input_grad *= grad_output / B

        return (
            input_grad,
            None,  # targets
            None,  # ilens
            None,  # blank_idx
            None,  # reduction
        )
Example #7
    def forward(ctx,
                inputs,
                kernels,
                kernel_size,
                stride,
                kernel_params=None,
                viterbi=False):
        B, T, C = inputs.shape
        if T < kernel_size:
            # Padding should be done outside of this function:
            raise ValueError(
                f"Input ({T}) too short for kernel ({kernel_size})")
        cpu_inputs = inputs.cpu()
        output_graphs = [[] for _ in range(B)]
        input_graphs = [[] for _ in range(B)]

        if kernel_params is not None:
            cpu_data = kernel_params.cpu().contiguous()
            s = 0
            for kernel in kernels:
                na = kernel.num_arcs()
                data_ptr = cpu_data[s:s + na].data_ptr()
                s += na
                kernel.set_weights(data_ptr)
                kernel.calc_grad = kernel_params.requires_grad
                kernel.zero_grad()

        def process(b):
            for t in range(0, T - kernel_size + 1, stride):
                input_graph = gtn.linear_graph(kernel_size, C,
                                               inputs.requires_grad)
                window = cpu_inputs[b, t:t + kernel_size, :].contiguous()
                input_graph.set_weights(window.data_ptr())
                if viterbi:
                    window_outputs = [
                        gtn.viterbi_score(gtn.intersect(input_graph, kernel))
                        for kernel in kernels
                    ]
                else:
                    window_outputs = [
                        gtn.forward_score(gtn.intersect(input_graph, kernel))
                        for kernel in kernels
                    ]
                output_graphs[b].append(window_outputs)

                # Save for backward:
                if input_graph.calc_grad:
                    input_graphs[b].append(input_graph)

        gtn.parallel_for(process, range(B))

        # Stash the graphs in a module-level global so the backward pass
        # (Example #4 above) can retrieve them:
        global CTX_GRAPHS
        CTX_GRAPHS = (output_graphs, input_graphs, kernels)
        ctx.input_shape = inputs.shape
        ctx.kernel_size = kernel_size
        ctx.stride = stride
        outputs = [[[o.item() for o in window] for window in example]
                   for example in output_graphs]
        return torch.tensor(outputs).to(inputs.device)
Example #8
    def forward(ctx, inputs, transitions, targets, reduction="none"):
        B, T, C = inputs.shape
        losses = [None] * B
        scales = [None] * B
        emissions_graphs = [None] * B
        transitions_graphs = [None] * B

        calc_trans_grad = transitions.requires_grad
        transitions = transitions.cpu()  # avoid multiple cuda -> cpu copies

        def process(b):
            # create emission graph
            g_emissions = gtn.linear_graph(T, C, inputs.requires_grad)
            cpu_data = inputs[b].cpu().contiguous()
            g_emissions.set_weights(cpu_data.data_ptr())

            # create transition graph
            g_transitions = ASGLossFunction.create_transitions_graph(
                transitions, calc_trans_grad)

            # create force align criterion graph
            g_fal = ASGLossFunction.create_force_align_graph(targets[b])

            # compose the graphs
            g_fal_fwd = gtn.forward_score(
                gtn.intersect(gtn.intersect(g_fal, g_transitions),
                              g_emissions))
            g_fcc_fwd = gtn.forward_score(
                gtn.intersect(g_emissions, g_transitions))
            g_loss = gtn.subtract(g_fcc_fwd, g_fal_fwd)
            scale = 1.0
            if reduction == "mean":
                L = len(targets[b])
                scale = 1.0 / L if L > 0 else scale
            elif reduction != "none":
                raise ValueError("invalid value for reduction '" +
                                 str(reduction) + "'")

            # Save for backward:
            losses[b] = g_loss
            scales[b] = scale
            emissions_graphs[b] = g_emissions
            transitions_graphs[b] = g_transitions

        gtn.parallel_for(process, range(B))

        ctx.auxiliary_data = (
            losses,
            scales,
            emissions_graphs,
            transitions_graphs,
            inputs.shape,
        )
        loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
        return torch.mean(loss.cuda() if inputs.is_cuda else loss)
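A minimal usage sketch for the ASG forward above, assuming it is the static forward of the ASGLossFunction autograd Function named in its body (its matching backward is Example #11); the shapes, targets, and reduction below are illustrative only:

    import torch

    # Hypothetical driver: frame scores and a (C + 1, C) transition matrix, the
    # shape implied by the backward in Example #11.
    B, T, C = 2, 50, 10
    inputs = torch.randn(B, T, C, requires_grad=True)
    transitions = torch.zeros(C + 1, C, requires_grad=True)
    targets = [[1, 2, 3], [4, 5]]  # per-example label sequences

    loss = ASGLossFunction.apply(inputs, transitions, targets, "mean")
    loss.backward()  # populates inputs.grad and transitions.grad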
Example #9
    def backward(ctx, grad_output):
        losses, emissions_graphs, in_shape = ctx.auxiliary_data
        B, T, C = in_shape
        input_grad = torch.empty((B, T, C))

        # Compute the gradients for each example:
        def backward_single(b):
            gtn.backward(losses[b])
            emissions = emissions_graphs[b]
            grad = emissions.grad().weights_to_numpy()
            input_grad[b] = torch.from_numpy(grad).view(1, T, C)

        # Compute gradients in parallel over the batch:
        gtn.parallel_for(backward_single, range(B))

        return input_grad.to(grad_output.device), None
Example #10
    def forward(ctx, log_probs, targets, ilens, blank_idx=0, reduction="none"):
        """Forward computation.

        :param torch.tensor log_probs: batched log softmax probabilities (B, Tmax, oDim)
        :param list targets: batched target sequences, list of lists
        :param int blank_idx: index of blank token
        :return: ctc loss value
        :rtype: torch.Tensor
        """
        B, _, C = log_probs.shape
        losses = [None] * B
        scales = [None] * B
        emissions_graphs = [None] * B

        def process(b):
            # create emission graph
            T = ilens[b]
            g_emissions = gtn.linear_graph(T, C, log_probs.requires_grad)
            cpu_data = log_probs[b][:T].cpu().contiguous()
            g_emissions.set_weights(cpu_data.data_ptr())

            # create criterion graph
            g_criterion = GTNCTCLossFunction.create_ctc_graph(
                targets[b], blank_idx)
            # compose the graphs
            g_loss = gtn.negate(
                gtn.forward_score(gtn.intersect(g_emissions, g_criterion)))

            scale = 1.0
            if reduction == "mean":
                L = len(targets[b])
                scale = 1.0 / L if L > 0 else scale
            elif reduction != "none":
                raise ValueError("invalid value for reduction '" +
                                 str(reduction) + "'")

            # Save for backward:
            losses[b] = g_loss
            scales[b] = scale
            emissions_graphs[b] = g_emissions

        gtn.parallel_for(process, range(B))

        ctx.auxiliary_data = (losses, scales, emissions_graphs,
                              log_probs.shape, ilens)
        loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
        return torch.mean(loss.cuda() if log_probs.is_cuda else loss)
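A minimal usage sketch for the padded-batch CTC forward above, assuming it is the static forward of the GTNCTCLossFunction class named in its body (its matching backward is Example #6); shapes, lengths, and targets are illustrative only:

    import torch

    # Hypothetical call: ilens holds the true length of each padded sequence, so
    # only the first ilens[b] frames of example b enter the emission graph.
    B, Tmax, C = 2, 80, 40
    logits = torch.randn(B, Tmax, C, requires_grad=True)
    log_probs = torch.log_softmax(logits, dim=-1)
    targets = [[7, 3, 12], [5, 9]]  # per-example label sequences
    ilens = [80, 64]                # valid lengths before padding

    loss = GTNCTCLossFunction.apply(log_probs, targets, ilens, 0, "mean")  # blank_idx=0
    loss.backward()  # gradients flow back through log_softmax into logits.grad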
Example #11
    def backward(ctx, grad_output):
        (
            losses,
            scales,
            emissions_graphs,
            transitions_graphs,
            in_shape,
        ) = ctx.auxiliary_data
        B, T, C = in_shape
        input_grad = transitions_grad = None
        if ctx.needs_input_grad[0]:
            input_grad = torch.empty((B, T, C))
        if ctx.needs_input_grad[1]:
            transitions_grad = torch.empty((B, C + 1, C))

        def process(b):
            gtn.backward(losses[b], False)
            emissions = emissions_graphs[b]
            transitions = transitions_graphs[b]
            if input_grad is not None:
                grad = emissions.grad().weights_to_numpy()
                input_grad[b] = torch.from_numpy(grad).view(1, T,
                                                            C) * scales[b]
            if transitions_grad is not None:
                grad = transitions.grad().weights_to_numpy()
                transitions_grad[b] = (
                    torch.from_numpy(grad).view(1, C + 1, C) * scales[b])

        gtn.parallel_for(process, range(B))
        if input_grad is not None:
            if grad_output.is_cuda:
                input_grad = input_grad.cuda()
            input_grad *= grad_output / B
        if transitions_grad is not None:
            if grad_output.is_cuda:
                transitions_grad = transitions_grad.cuda()

            transitions_grad = torch.mean(transitions_grad, 0) * grad_output
        return (
            input_grad,
            transitions_grad,
            None,  # target
            None,  # reduction
        )
Example #12
    def _log_prob(self, inputs, targets, transition_params, duration_params):
        seq_fsts = self.seq_fst(transition_params=transition_params,
                                duration_params=duration_params)

        device = inputs.device
        arc_scores = self.scores_to_arc(inputs)
        arc_labels = self.labels_to_arc(targets)

        arc_scores = arc_scores.cpu()
        arc_labels = arc_labels.cpu()

        batch_size, num_samples, num_classes = arc_scores.shape

        losses = [None] * batch_size
        obs_fsts = [None] * batch_size

        def seq_loss(batch_index):
            obs_fst = linearFstFromArray(arc_scores[batch_index].reshape(
                num_samples, -1))
            gt_fst = fromSequence(arc_labels[batch_index])

            # Compose each sequence fst individually: it seems like composition
            # only works for lattices
            denom_fst = obs_fst
            for seq_fst in seq_fsts:
                denom_fst = gtn.compose(denom_fst, seq_fst)
                denom_fst = gtn.project_output(denom_fst)

            num_fst = gtn.compose(denom_fst, gt_fst)

            loss = gtn.subtract(gtn.forward_score(num_fst),
                                gtn.forward_score(denom_fst))

            losses[batch_index] = loss
            obs_fsts[batch_index] = obs_fst

        gtn.parallel_for(seq_loss, range(batch_size))

        self.auxiliary_data = losses

        losses = torch.tensor([lp.item() for lp in losses]).to(device)

        return losses
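Examples #1 and #12 read like the two halves of a custom autograd pair: _log_prob builds and scores the FSTs, while _log_prob_gradient reads the accumulated gradients back out. One rough way to glue them together with torch.autograd.Function is sketched below; the LogProbFunction wrapper and its wiring are assumptions, not part of the original code:

    import torch

    class LogProbFunction(torch.autograd.Function):  # hypothetical wrapper
        @staticmethod
        def forward(ctx, module, inputs, targets, transition_params, duration_params):
            # `module` is the object providing _log_prob / _log_prob_gradient above;
            # it stashes the per-example loss graphs in self.auxiliary_data.
            ctx.module = module
            return module._log_prob(inputs, targets, transition_params, duration_params)

        @staticmethod
        def backward(ctx, grad_output):
            input_grad, target_grad, transition_grad, duration_grad = (
                ctx.module._log_prob_gradient(grad_output))
            # One gradient per forward argument; the module itself gets None.
            return None, input_grad, target_grad, transition_grad, duration_grad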
Example #13
    def test_parallel_func(self):
        B = 3
        inputs1 = [gtn.scalar_graph(k) for k in [1.0, 2.0, 3.0]]
        inputs2 = [gtn.scalar_graph(k) for k in [1.0, 2.0, 3.0]]

        out = [None] * B

        def process(b):
            out[b] = gtn.add(gtn.add(inputs1[b], inputs1[b]),
                             gtn.negate(inputs2[b]))

        gtn.parallel_for(process, range(B))

        expected = []
        for b in range(B):
            expected.append(
                gtn.add(gtn.add(inputs1[b], inputs1[b]),
                        gtn.negate(inputs2[b])))

        self.assertEqual(len(out), len(expected))
        for i in range(len(expected)):
            self.assertTrue(gtn.equal(out[i], expected[i]))
Example #14
    def forward(ctx, inputs, targets):
        B, T, C = inputs.shape
        losses = [None] * B
        emissions_graphs = [None] * B

        # Move data to the host:
        device = inputs.device
        inputs = inputs.cpu()
        targets = targets.cpu()

        # Compute the loss for the b-th example:
        def forward_single(b):
            emissions = gtn.linear_graph(T, C, inputs.requires_grad)
            data = inputs[b].contiguous()
            emissions.set_weights(data.data_ptr())

            target = GTNLossFunction.make_target_graph(targets[b])

            # Score the target:
            target_score = gtn.forward_score(gtn.intersect(target, emissions))

            # Normalization term:
            norm = gtn.forward_score(emissions)

            # Compute the loss:
            loss = gtn.subtract(norm, target_score)

            # Save state for backward:
            losses[b] = loss
            emissions_graphs[b] = emissions

        # Compute the loss in parallel over the batch:
        gtn.parallel_for(forward_single, range(B))

        ctx.auxiliary_data = (losses, emissions_graphs, inputs.shape)

        # Collect the losses into a torch tensor and move them back to the device:
        return torch.tensor([l.item() for l in losses]).to(device)
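A minimal usage sketch for the forward above, assuming it is the static forward of the GTNLossFunction class named in its body (its matching backward is Example #9); shapes and targets are illustrative only:

    import torch

    # Hypothetical call: the forward returns one unreduced loss per example.
    B, T, C = 2, 20, 8
    inputs = torch.randn(B, T, C, requires_grad=True)  # per-frame scores
    targets = torch.tensor([[1, 2, 3], [4, 5, 6]])     # fixed-length label sequences

    losses = GTNLossFunction.apply(inputs, targets)    # shape (B,)
    losses.sum().backward()                            # populates inputs.grad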
Example #15
    def backward(ctx, grad_output) -> Tuple:
        losses, emissions_graphs, transitions = ctx.graphs
        scales = ctx.scales

        B, T, C = ctx.input_shape
        calc_emissions = ctx.needs_input_grad[0]
        input_grad = torch.empty((B, T, C)) if calc_emissions else None

        def process(b: int) -> None:
            scale = make_scalar_graph(scales[b])
            gtn.backward(losses[b], scale)
            emissions = emissions_graphs[b]
            if calc_emissions:
                grad = emissions.grad().weights_to_numpy()
                input_grad[b] = torch.tensor(grad).view(1, T, C)

        gtn.parallel_for(process, range(B))

        if calc_emissions:
            input_grad = input_grad.to(grad_output.device)
            input_grad *= grad_output / B

        if ctx.needs_input_grad[4]:
            grad = transitions.grad().weights_to_numpy()
            transition_grad = torch.tensor(grad).to(grad_output.device)
            transition_grad *= grad_output / B
        else:
            transition_grad = None

        return (
            input_grad,
            None,  # target
            None,  # tokens
            None,  # lexicon
            transition_grad,  # transition params
            None,  # transitions graph
            None,  # reduction
        )
Example #16
    def forward(ctx, log_probs, targets, blank_idx=0, reduction="none"):
        B, T, C = log_probs.shape
        losses = [None] * B
        scales = [None] * B
        emissions_graphs = [None] * B

        def process(b):
            # create emission graph
            g_emissions = gtn.linear_graph(T, C, log_probs.requires_grad)
            cpu_data = log_probs[b].cpu().contiguous()
            g_emissions.set_weights(cpu_data.data_ptr())

            # create criterion graph
            g_criterion = CTCLossFunction.create_ctc_graph(
                targets[b], blank_idx)
            # compose the graphs
            g_loss = gtn.negate(
                gtn.forward_score(gtn.intersect(g_emissions, g_criterion)))

            scale = 1.0
            if reduction == "mean":
                L = len(targets[b])
                scale = 1.0 / L if L > 0 else scale
            elif reduction != "none":
                raise ValueError("invalid value for reduction '" +
                                 str(reduction) + "'")

            # Save for backward:
            losses[b] = g_loss
            scales[b] = scale
            emissions_graphs[b] = g_emissions

        gtn.parallel_for(process, range(B))

        ctx.auxiliary_data = (losses, scales, emissions_graphs,
                              log_probs.shape)
        loss = torch.tensor([losses[b].item() * scales[b] for b in range(B)])
        return torch.mean(loss.cuda() if log_probs.is_cuda else loss)
Example #17
    def backward(ctx, grad_output):
        losses, scales, emissions_graphs, in_shape = ctx.auxiliary_data
        B, T, C = in_shape
        input_grad = torch.empty((B, T, C))

        def process(b):
            gtn.backward(losses[b], False)
            emissions = emissions_graphs[b]
            grad = emissions.grad().weights_to_numpy()
            input_grad[b] = torch.from_numpy(grad).view(1, T, C) * scales[b]

        gtn.parallel_for(process, range(B))

        if grad_output.is_cuda:
            input_grad = input_grad.cuda()
        input_grad *= grad_output / B

        return (
            input_grad,
            None,  # targets
            None,  # blank_idx
            None,  # reduction
        )
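Examples #16 and #17 form a complete forward/backward pair. A minimal usage sketch, assuming they are the static methods of the CTCLossFunction class referenced in the forward; the shapes and targets below are illustrative only:

    import torch

    # Hypothetical end-to-end call through the standard autograd Function entry point.
    B, T, C = 2, 60, 30
    logits = torch.randn(B, T, C, requires_grad=True)
    log_probs = torch.log_softmax(logits, dim=-1)
    targets = [[4, 9, 2], [11, 6]]  # per-example label sequences

    loss = CTCLossFunction.apply(log_probs, targets, 0, "mean")  # blank_idx=0
    loss.backward()  # the backward above scales by grad_output / B and fills logits.grad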
Example #18
    def forward(
        ctx,
        inputs,
        targets,
        tokens,
        lexicon,
        transition_params=None,
        transitions=None,
        reduction="none",
    ):
        B, T, C = inputs.shape
        losses = [None] * B
        emissions_graphs = [None] * B
        if transitions is not None:
            if transition_params is None:
                raise ValueError("Specified transitions, but not transition params.")
            cpu_data = transition_params.cpu().contiguous()
            transitions.set_weights(cpu_data.data_ptr())
            transitions.calc_grad = transition_params.requires_grad
            transitions.zero_grad()

        def process(b):
            # Create emissions graph:
            emissions = gtn.linear_graph(T, C, inputs.requires_grad)
            cpu_data = inputs[b].cpu().contiguous()
            emissions.set_weights(cpu_data.data_ptr())
            target = make_chain_graph(targets[b])
            target.arc_sort(True)

            # Create token to grapheme decomposition graph
            tokens_target = gtn.remove(gtn.project_output(gtn.compose(target, lexicon)))
            tokens_target.arc_sort()

            # Create alignment graph:
            alignments = gtn.project_input(
                gtn.remove(gtn.compose(tokens, tokens_target))
            )
            alignments.arc_sort()

            # Add transition scores:
            if transitions is not None:
                alignments = gtn.intersect(transitions, alignments)
                alignments.arc_sort()

            loss = gtn.forward_score(gtn.intersect(emissions, alignments))

            # Normalize if needed:
            if transitions is not None:
                norm = gtn.forward_score(gtn.intersect(emissions, transitions))
                loss = gtn.subtract(loss, norm)

            losses[b] = gtn.negate(loss)

            # Save for backward:
            if emissions.calc_grad:
                emissions_graphs[b] = emissions

        gtn.parallel_for(process, range(B))

        ctx.graphs = (losses, emissions_graphs, transitions)
        ctx.input_shape = inputs.shape

        # Optionally reduce by target length:
        if reduction == "mean":
            scales = [(1 / len(t) if len(t) > 0 else 1.0) for t in targets]
        else:
            scales = [1.0] * B
        ctx.scales = scales

        loss = torch.tensor([l.item() * s for l, s in zip(losses, scales)])
        return torch.mean(loss.to(inputs.device))
Example #19
    def indexed_func():
        # Run `process` over the batch indices in parallel; `process` and `B`
        # are assumed to be defined in the enclosing scope.
        gtn.parallel_for(process, range(B))