Example #1
    def __call__(self, xs, labels):  # B, T, F, 2048, labels: B, T, F, 12
        xs = F.transpose(xs, (0, 2, 1, 3))  # B, F, T, 2048
        orig_labels = labels
        labels = F.transpose(labels, (0, 2, 1, 3))  # B, F, T, 12
        mini_batch, frame_node, T, _ = xs.shape
        xs = xs.reshape(xs.shape[0] * xs.shape[1], xs.shape[2], xs.shape[3])
        labels = labels.reshape(labels.shape[0] * labels.shape[1],
                                labels.shape[2], labels.shape[3])
        xs = list(F.separate(xs, axis=0))  # list of T, 2048
        labels = list(F.separate(labels, axis=0))  # list of T, 12
        output = F.stack(self.label_dep_rnn(xs, labels))  # B * F, T, 12
        output = output.reshape(mini_batch, frame_node, T, -1)
        output = F.transpose(output, (0, 2, 1, 3))  # B, T, F, D
        output = output.reshape(-1, self.class_num)  # B * T * F, 12
        orig_labels = orig_labels.reshape(-1, self.class_num)
        assert output.shape == orig_labels.shape
        pick_index, accuracy_pick_index = self.get_loss_index(
            output, orig_labels)
        loss = F.sigmoid_cross_entropy(
            output[list(pick_index[0]), list(pick_index[1])],
            orig_labels[list(pick_index[0]), list(pick_index[1])])
        accuracy = F.binary_accuracy(
            output[list(accuracy_pick_index[0]),
                   list(accuracy_pick_index[1])],
            orig_labels[list(accuracy_pick_index[0]),
                        list(accuracy_pick_index[1])])

        return loss, accuracy
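
Note: a minimal, self-contained sketch of the F.separate pattern used above, splitting a batched array into per-sequence variables before an RNN (shapes and names are illustrative, not taken from the original project):

import numpy as np
import chainer.functions as F

xs = np.random.rand(6, 10, 2048).astype(np.float32)  # (B * F, T, 2048)
seqs = list(F.separate(xs, axis=0))                   # B * F variables, each (T, 2048)
assert len(seqs) == 6 and seqs[0].shape == (10, 2048)
restacked = F.stack(seqs)                             # back to (B * F, T, 2048)
assert restacked.shape == xs.shape
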
Example #2
    def calc_accuracy(self, predictions, labels):
        batch_predictions = predictions
        # concat all individual predictions and slice for each time step
        batch_predictions = F.concat([F.expand_dims(p, axis=2) for p in batch_predictions], axis=2)

        t = F.reshape(labels, (1, self.args.timesteps, -1))

        accuracies = []
        with cuda.get_device_from_array(batch_predictions.data):
            for prediction, label in zip(F.separate(batch_predictions, axis=0), F.separate(t, axis=2)):
                classification = F.softmax(prediction, axis=2)
                classification = classification.data
                classification = self.xp.argmax(classification, axis=2)
                # classification = self.xp.transpose(classification, (1, 0))

                words = self.strip_prediction(classification)
                labels = self.strip_prediction(label.data)

                for word, label in zip(words, labels):
                    word = "".join(map(self.label_to_char, word))
                    label = "".join(map(self.label_to_char, label))
                    if word == label:
                        self.num_correct_words += 1
                    self.num_words += 1

        return word, label
Example #3
File: ima.py Project: nuric/softuni
def seq_rnn_embed(vxs, exs, rnn_layer, initial_state=None, reverse=False):
    """Embed given sequences using rnn."""
    # vxs.shape == (..., S)
    # exs.shape == (..., S, E)
    # initial_state == (..., E)
    assert vxs.shape == exs.shape[:-1], \
        "Sequence embedding dimensions do not match."
    lengths = np.sum(vxs != 0, -1).flatten()  # (X,)
    seqs = F.reshape(exs, (-1, ) + exs.shape[-2:])  # (X, S, E)
    if reverse:
        toembed = [
            F.flip(s[..., :l, :], -2)
            for s, l in zip(F.separate(seqs, 0), lengths) if l != 0
        ]  # Y x [(S1, E), (S2, E), ...]
    else:
        toembed = [
            s[..., :l, :] for s, l in zip(F.separate(seqs, 0), lengths)
            if l != 0
        ]  # Y x [(S1, E), (S2, E), ...]
    if initial_state is not None:
        initial_state = F.reshape(initial_state, (-1, EMBED))  # (X, E)
        initial_state = initial_state[None,
                                      np.flatnonzero(lengths)]  # (1, Y, E)
    hs, ys = rnn_layer(initial_state,
                       toembed)  # (1, Y, E), Y x [(S1, 2*E), (S2, 2*E), ...]
    hs = hs[0]  # (Y, E)
    if hs.shape[0] == lengths.size:
        hs = F.reshape(hs, vxs.shape[:-1] + (EMBED, ))  # (..., E)
        return hs
    # Add zero values back to match original shape
    embeds = np.zeros((lengths.size, EMBED), dtype=np.float32)  # (X, E)
    idxs = np.nonzero(lengths)  # (Y,)
    embeds = F.scatter_add(embeds, idxs, hs)  # (X, E)
    embeds = F.reshape(embeds, vxs.shape[:-1] + (EMBED, ))  # (..., E)
    return embeds  # (..., E)
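
Note: a minimal sketch of the length-masking idea in seq_rnn_embed above; per-row lengths come from the zero-padded id array and empty rows are dropped after F.separate (EMBED and the toy shapes are assumptions for illustration):

import numpy as np
import chainer.functions as F

EMBED = 4
vxs = np.array([[3, 5, 0], [0, 0, 0]], dtype=np.int32)    # (X, S) zero-padded token ids
exs = np.random.rand(2, 3, EMBED).astype(np.float32)      # (X, S, E) token embeddings
lengths = np.sum(vxs != 0, -1)                            # (X,) -> [2, 0]
toembed = [s[:l, :] for s, l in zip(F.separate(exs, 0), lengths) if l != 0]
assert len(toembed) == 1 and toembed[0].shape == (2, EMBED)
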
Example #4
    def draw_bboxes(self, bboxes, image):
        draw = ImageDraw.Draw(image)
        for boxes, colour in zip(F.separate(bboxes, axis=0), self.colours):
            num_boxes = boxes.shape[0]

            for i, bbox in enumerate(F.separate(boxes, axis=0)):
                # render all intermediate results with lower alpha as the others
                fill_colour = colour
                if i < num_boxes - 1:
                    if not self.render_intermediate_bboxes:
                        continue
                    fill_colour += '88'

                bbox.data[...] = (bbox.data[...] + 1) / 2
                bbox.data[0, :] *= self.image_size.width
                bbox.data[1, :] *= self.image_size.height

                x = self.xp.clip(bbox.data[0, :].reshape(self.out_size), 0,
                                 self.image_size.width)
                y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0,
                                 self.image_size.height)

                top_left = (x[0, 0], y[0, 0])
                top_right = (x[0, -1], y[0, -1])
                bottom_left = (x[-1, 0], y[-1, 0])
                bottom_right = (x[-1, -1], y[-1, -1])

                corners = [top_left, top_right, bottom_right, bottom_left]
                next_corners = corners[1:] + [corners[0]]

                for first_corner, next_corner in zip(corners, next_corners):
                    draw.line([first_corner, next_corner],
                              fill=fill_colour,
                              width=3)
Example #5
    def __call__(self, *inputs):
        images, labels = inputs[:2]
        with cuda.Device(self.device):
            _, bboxes = self.link(images)

            bboxes = cuda.to_cpu(bboxes.data)
            labels = cuda.to_cpu(labels)

            xp = cuda.get_array_module(bboxes)

            bboxes = self.extract_corners(bboxes)
            bboxes = self.scale_bboxes(bboxes, Size._make(images.shape[-2:]))

            ious = bbox_iou(bboxes.data.copy(), xp.squeeze(labels))[xp.eye(len(bboxes)).astype(xp.bool)]
            mean_iou = ious.mean()

            reporter.report({'mean_iou': mean_iou})

            pred_bboxes = [bbox.data[xp.newaxis, ...].astype(xp.int32) for bbox in F.separate(bboxes, axis=0)]
            pred_scores = xp.ones((len(bboxes), 1))
            pred_labels = xp.zeros_like(pred_scores)

            gt_bboxes = [bbox.data[...] for bbox in F.separate(labels, axis=0)]
            gt_labels = xp.zeros_like(pred_scores)

            result = chainercv.evaluations.eval_detection_voc(
                pred_bboxes,
                pred_labels,
                pred_scores,
                gt_bboxes,
                gt_labels
            )

            reporter.report({'map': result['map']})
            reporter.report({'ap/sheep': result['ap'][0]})
Example #6
    def draw_bboxes(self, bboxes, image):
        draw = ImageDraw.Draw(image)
        for i, sub_box in enumerate(F.separate(bboxes, axis=1)):
            for bbox, colour in zip(F.separate(sub_box, axis=0), self.colours):
                bbox.data[...] = (bbox.data[...] + 1) / 2
                bbox.data[0, :] *= self.image_size.width
                bbox.data[1, :] *= self.image_size.height

                x = self.xp.clip(
                    bbox.data[0, :].reshape(self.out_size), 0,
                    self.image_size.width) + i * self.image_size.width
                y = self.xp.clip(bbox.data[1, :].reshape(self.out_size), 0,
                                 self.image_size.height)

                top_left = (x[0, 0], y[0, 0])
                top_right = (x[0, -1], y[0, -1])
                bottom_left = (x[-1, 0], y[-1, 0])
                bottom_right = (x[-1, -1], y[-1, -1])

                corners = [top_left, top_right, bottom_right, bottom_left]
                next_corners = corners[1:] + [corners[0]]

                for first_corner, next_corner in zip(corners, next_corners):
                    draw.line([first_corner, next_corner],
                              fill=colour,
                              width=3)
Example #7
    def calc_loss(self, x, t):
        batch_predictions, _, grids = x
        self.xp = cuda.get_array_module(batch_predictions[0], t)

        # reshape labels
        batch_size = t.shape[0]
        t = F.reshape(t, (batch_size, self.num_timesteps, -1))

        # reshape grids
        grid_shape = grids.shape
        if self.uses_original_data:
            grids = F.reshape(grids, (
                self.num_timesteps,
                batch_size,
                4,
            ) + grid_shape[1:])
        else:
            grids = F.reshape(grids, (
                self.num_timesteps,
                batch_size,
                1,
            ) + grid_shape[1:])
        losses = []

        # with cuda.get_device_from_array(grids.data):
        #     grid_list = F.separate(F.reshape(grids, (self.num_timesteps, -1,) + grids.shape[3:]), axis=0)
        #     overlap_losses = []
        #     for grid_1, grid_2 in itertools.combinations(grid_list, 2):
        #         overlap_losses.append(self.calc_iou_loss(grid_1, grid_2))
        #     losses.append(sum(overlap_losses) / max(len(overlap_losses), 1))

        loss_weights = [1, 1.25, 2, 1.25]
        for i, (predictions, grid,
                labels) in enumerate(zip(batch_predictions,
                                         F.separate(grids, axis=0),
                                         F.separate(t, axis=1)),
                                     start=1):
            with cuda.get_device_from_array(
                    getattr(predictions, 'data', predictions[0].data)):
                # adapt ctc weight depending on current prediction position and labels
                # if all labels are blank, we want this weight to be full weight!
                overall_loss_weight = loss_weights[i - 1]
                loss = self.calc_actual_loss(predictions, grid, labels)
                # label_lengths = self.get_label_lengths(labels)

                for sub_grid in F.separate(grid, axis=1):
                    width, height = self.get_bbox_side_lengths(sub_grid)
                    loss += self.area_loss_factor * self.calc_area_loss(
                        width, height)
                    loss += self.aspect_ratio_loss_factor * self.calc_aspect_ratio_loss(
                        width, height)
                    loss += self.calc_direction_loss(sub_grid)
                    loss += self.calc_height_loss(height)
                loss *= overall_loss_weight
                losses.append(loss)

        return sum(losses) / len(losses)
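
Note: the loop above pairs entries from two tensors split along different axes; a minimal sketch of that zip-over-F.separate pattern (toy shapes, not the original grid layout):

import numpy as np
import chainer.functions as F

num_timesteps, batch_size = 3, 2
grids = np.zeros((num_timesteps, batch_size, 1, 2, 4, 4), dtype=np.float32)
t = np.zeros((batch_size, num_timesteps, 5), dtype=np.int32)
for grid, labels in zip(F.separate(grids, axis=0), F.separate(t, axis=1)):
    assert grid.shape == (batch_size, 1, 2, 4, 4)
    assert labels.shape == (batch_size, 5)
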
Example #8
    def forward(self, xs, labels):  # xs shape = (batch, T, F, D)
        '''
        :param xs: appearance features of all boxes across all frames, shape (batch, T, F, D)
        :param labels: ground-truth labels of all boxes across all frames
        :return: spatial and/or temporal output, depending on the configured edge modes
        '''
        xp = chainer.cuda.get_array_module(xs.data)
        batch_size = xs.shape[0]
        T = xs.shape[1]
        frame_node = xs.shape[2]
        assert frame_node == self.frame_node_num
        dim = xs.shape[-1]
        orig_labels = labels
        # first frame node_id ==> other frame node_id in same corresponding box
        if self.spatial_edge_mode != SpatialEdgeMode.no_edge:
            if self.spatial_sequence_type == SpatialSequenceType.in_frame:
                input_space = F.separate(F.reshape(xs, shape=(batch_size * T, self.frame_node_num, dim)), axis=0) # batch x T, F, D
                labels = F.separate(F.reshape(labels, shape=(batch_size * T, self.frame_node_num, labels.shape[-1])), axis=0) # batch x T, F, D
            elif self.spatial_sequence_type == SpatialSequenceType.cross_time:
                input_space = F.separate(F.reshape(xs, shape=(batch_size, T * self.frame_node_num, dim)), axis=0)  # batch ,T x F, D
                labels = F.separate(F.reshape(labels, shape=(batch_size, T * self.frame_node_num, labels.shape[-1])),
                                    axis=0)  # batch, T x F, D

            if self.spatial_edge_mode == SpatialEdgeMode.rnn:
                _, _, space_out = self.space_bi_lstm(hx=None, cx=None, xs=list(input_space))
                space_out = F.stack(space_out) # B, T, D
                space_out = F.reshape(space_out, (-1, self.space_mid_size))
                space_out = self.space_output(space_out)

            elif self.spatial_edge_mode == SpatialEdgeMode.ld_rnn or self.spatial_edge_mode == SpatialEdgeMode.bi_ld_rnn:
                space_out = self.space_module(list(input_space), list(labels))
            elif self.spatial_edge_mode == SpatialEdgeMode.no_edge:
                space_out = self.space_module(F.stack(input_space))

            space_out = F.stack(space_out)  # batch * T, F, D
            space_out = F.reshape(space_out, (batch_size, T, frame_node, self.out_size))
        else:
            input_space = F.reshape(xs, shape=(-1, self.in_size))
            space_out = self.space_module(input_space)
            space_out = F.reshape(space_out, (batch_size, T, frame_node, self.out_size))

        temporal_out_dict = self.temporal_node_recurrent_forward(xs, orig_labels)
        # shape = F, B, T, mid_size
        temporal_out = F.stack([node_out_ for _, node_out_ in sorted(temporal_out_dict.items(),
                                                                 key=lambda e: int(e[0]))])
        temporal_out = F.transpose(temporal_out, (1,2,0,3))  # shape = (B,T,F,D)

        if self.spatial_edge_mode == SpatialEdgeMode.no_edge and self.temporal_edge_mode != TemporalEdgeMode.no_temporal:
            return temporal_out
        elif self.temporal_edge_mode == TemporalEdgeMode.no_temporal and self.spatial_edge_mode != SpatialEdgeMode.no_edge:
            return space_out
        elif self.temporal_edge_mode == TemporalEdgeMode.no_temporal and self.spatial_edge_mode == SpatialEdgeMode.no_edge:
            return temporal_out
        elif self.temporal_edge_mode != TemporalEdgeMode.no_temporal and self.spatial_edge_mode != SpatialEdgeMode.no_edge:
            final_out = space_out * temporal_out
            return final_out
Example #9
    def decode_prediction(self, prediction):
        words = []
        for box in F.separate(prediction, axis=1):
            word = [
                F.argmax(F.softmax(character), axis=1)
                for character in F.separate(box, axis=1)
            ]
            words.append(F.stack(word, axis=1))

        return F.stack(words, axis=1)
Example #10
    def forward(self, xs):
        space_output = None
        temporal_output = None
        if self.temporal_edge_mode != TemporalEdgeMode.no_temporal:
            temporal_input = F.transpose(xs, axes=(0, 2, 1, 3))  # B, F, T, D
            assert temporal_input.shape[1] == config.BOX_NUM[self.database]
            all_temporal_output = []
            for idx, temporal_input_each_box in enumerate(
                    F.separate(temporal_input,
                               axis=1)):  # B,F,T,D =>F list of B, T, D
                # temporal_input_each_box : list of (T,D)
                temporal_input_each_box = list(
                    F.separate(temporal_input_each_box,
                               axis=0))  # list of (T,D)
                _, _, temporal_output = getattr(
                    self,
                    "temporal_lstm_{}".format(idx))(None, None,
                                                    temporal_input_each_box)
                temporal_output = F.stack(temporal_output)  # B,T,D
                all_temporal_output.append(temporal_output)
            all_temporal_output = F.stack(all_temporal_output,
                                          axis=1)  # B,F,T,D
            temporal_output = F.transpose(all_temporal_output,
                                          axes=(0, 2, 1, 3))  # B,T,F,D

        if self.spatial_edge_mode != SpatialEdgeMode.no_edge:  # B,T,F,D
            minibatch, T, frame_box, _ = xs.shape  # B,T,F,D
            space_input = F.reshape(xs,
                                    shape=(xs.shape[0] * xs.shape[1],
                                           xs.shape[2],
                                           xs.shape[3]))  # B*T,F,D
            space_input = list(F.separate(space_input, axis=0))  # list of F,D
            _, _, space_output = self.space_fc_lstm(
                None, None, space_input)  # B*T, F, 1024
            # B, T, F, D
            space_output = F.stack(space_output)  # B*T, F, 1024
            space_output = F.reshape(space_output,
                                     shape=(minibatch, T, frame_box, -1))

        if self.temporal_edge_mode != TemporalEdgeMode.no_temporal and self.spatial_edge_mode != SpatialEdgeMode.no_edge:
            assert space_output.shape == temporal_output.shape
            fusion_output = F.concat([space_output, temporal_output], axis=3)

        elif self.spatial_edge_mode != SpatialEdgeMode.no_edge:
            fusion_output = space_output
        elif self.temporal_edge_mode != TemporalEdgeMode.no_temporal:
            fusion_output = temporal_output
        fc_input = F.reshape(
            fusion_output,
            shape=(fusion_output.shape[0] * fusion_output.shape[1] *
                   fusion_output.shape[2], -1))
        score = self.score_fc(fc_input)
        return F.reshape(score,
                         shape=(fusion_output.shape[0], fusion_output.shape[1],
                                fusion_output.shape[2], -1))
Example #11
def loss_information(enc, x):
    p_logit = enc(x)
    p = F.sigmoid(p_logit)
    p_ave = F.sum(p, axis=0) / x.data.shape[0]

    cond_ent = F.sum(-p * F.log(p + 1e-8) -
                     (1 - p) * F.log(1 - p + 1e-8)) / p.data.shape[0]
    marg_ent = F.sum(-p_ave * F.log(p_ave + 1e-8) -
                     (1 - p_ave) * F.log(1 - p_ave + 1e-8))

    p_ave = F.reshape(p_ave, (1, len(p_ave.data)))

    p_ave_separated = F.separate(p_ave, axis=1)
    p_separated = F.separate(F.expand_dims(p, axis=2), axis=1)

    p_ave_list_i = []
    p_ave_list_j = []

    p_list_i = []
    p_list_j = []

    for i in range(n_bit - 1):
        p_ave_list_i.extend(list(p_ave_separated[i + 1:]))
        p_list_i.extend(list(p_separated[i + 1:]))

        p_ave_list_j.extend([p_ave_separated[i] for n in range(n_bit - i - 1)])
        p_list_j.extend([p_separated[i] for n in range(n_bit - i - 1)])

    p_ave_pair_i = F.expand_dims(F.concat(tuple(p_ave_list_i), axis=0), axis=1)
    p_ave_pair_j = F.expand_dims(F.concat(tuple(p_ave_list_j), axis=0), axis=1)

    p_pair_i = F.expand_dims(F.concat(tuple(p_list_i), axis=1), axis=2)
    p_pair_j = F.expand_dims(F.concat(tuple(p_list_j), axis=1), axis=2)

    p_pair_stacked_i = F.concat(
        (p_pair_i, 1 - p_pair_i, p_pair_i, 1 - p_pair_i), axis=2)
    p_pair_stacked_j = F.concat(
        (p_pair_j, p_pair_j, 1 - p_pair_j, 1 - p_pair_j), axis=2)

    p_ave_pair_stacked_i = F.concat(
        (p_ave_pair_i, 1 - p_ave_pair_i, p_ave_pair_i, 1 - p_ave_pair_i),
        axis=1)
    p_ave_pair_stacked_j = F.concat(
        (p_ave_pair_j, p_ave_pair_j, 1 - p_ave_pair_j, 1 - p_ave_pair_j),
        axis=1)

    p_product = F.sum(p_pair_stacked_i * p_pair_stacked_j, axis=0) / len(
        p.data)
    p_ave_product = p_ave_pair_stacked_i * p_ave_pair_stacked_j
    pairwise_mi = 2 * F.sum(p_product * F.log(
        (p_product + 1e-8) / (p_ave_product + 1e-8)))

    return cond_ent, marg_ent, pairwise_mi
Example #12
    def calc_loss(self, predictions, labels):
        recognition_losses = []
        assert predictions.shape[1] == labels.shape[1], \
            "Number of boxes is not equal in predictions and labels"
        for box, box_labels in zip(F.separate(predictions, axis=1),
                                   F.separate(labels, axis=1)):
            assert box.shape[1] == box_labels.shape[1], \
                "Number of predicted chars is not equal to number of chars in label"
            box_losses = [
                F.softmax_cross_entropy(char, char_label, reduce="no")
                for char, char_label in zip(F.separate(box, axis=1),
                                            F.separate(box_labels, axis=1))
            ]
            recognition_losses.append(F.stack(box_losses))
        return F.mean(F.stack(recognition_losses))
Example #13
def test_ctc_loss():
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    import warpctc_pytorch

    from espnet.nets.e2e_asr_th import pad_list

    n_out = 7
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = [
        numpy.random.rand(il, n_out).astype(numpy.float32)
        for il in input_length
    ]
    np_target = [
        numpy.random.randint(0, n_out, size=ol, dtype=numpy.int32)
        for ol in label_length
    ]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(ch_pred, ch_target, 0,
                                                      input_length,
                                                      label_length).data

    th_pred = pad_list([torch.from_numpy(x) for x in np_pred],
                       0.0).transpose(0, 1)
    th_target = torch.from_numpy(numpy.concatenate(np_target))
    th_ilen = torch.from_numpy(input_length)
    th_olen = torch.from_numpy(label_length)
    th_loss = warpctc_pytorch.CTCLoss(size_average=True)(
        th_pred, th_target, th_ilen, th_olen).data.numpy()[0]
    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
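
Note: a minimal sketch of the ch_pred construction above; F.pad_sequence pads the variable-length predictions to a common length and F.separate along the time axis yields the time-major list expected by connectionist_temporal_classification (toy sizes):

import numpy as np
import chainer.functions as F

n_out = 7
np_pred = [np.random.rand(il, n_out).astype(np.float32) for il in (5, 3)]
padded = F.pad_sequence(np_pred)       # (batch=2, max_len=5, n_out)
ch_pred = F.separate(padded, axis=-2)  # tuple of 5 variables, each (2, n_out)
assert len(ch_pred) == 5 and ch_pred[0].shape == (2, n_out)
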
Example #14
def arr2list(arr, length=None):
    xs = F.separate(F.swapaxes(arr, 1, 2))

    if length is not None:
        assert len(xs) == len(length)
        xs = [x[:l] for x, l in zip(xs, length)]
    return xs
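
Note: a hedged usage sketch for arr2list above; for an input of shape (B, D, T) it yields a list of per-sample (T, D) sequences, optionally truncated to the true lengths (toy shapes, not from the original project):

import numpy as np

arr = np.random.rand(2, 8, 5).astype(np.float32)  # (B, D, T)
xs = arr2list(arr, length=[5, 3])                 # uses arr2list defined above
assert len(xs) == 2
assert xs[0].shape == (5, 8) and xs[1].shape == (3, 8)
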
Example #15
    def forward(self, equery, vmemory, ememory, mask, iteration=0):
        """Compute an attention over memory given the query."""
        # equery.shape == (..., E)
        # vmemory.shape == (..., Ms, M)
        # ememory.shape == (..., Ms, E)
        # mask.shape == (..., Ms)
        # Setup memory embedding
        eq = F.repeat(equery[..., None, :], vmemory.shape[-2], -2)  # (..., Ms, E)
        # Compute content based attention
        merged = F.concat(
            [eq, ememory, eq * ememory, F.squared_difference(eq, ememory)],
            -1)  # (..., Ms, 4*E)
        inter = self.att_linear(merged, n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, E)
        inter = F.tanh(inter)  # (..., Ms, E)
        inter = F.dropout(inter, DROPOUT)  # (..., Ms, E)
        # Split into sentences
        lengths = np.sum(np.any((vmemory != 0), -1), -1)  # (...,)
        mems = [s[..., :l, :] for s, l in zip(F.separate(inter, 0), lengths)]  # B x [(M1, E), (M2, E), ...]
        _, bimems = self.att_birnn(None, mems)  # B x [(M1, 2*E), (M2, 2*E), ...]
        bimems = F.pad_sequence(bimems)  # (..., Ms, 2*E)
        att = self.att_score(bimems, n_batch_axes=len(vmemory.shape) - 1)  # (..., Ms, 1)
        att = F.squeeze(att, -1)  # (..., Ms)
        if mask is not None:
            att += mask * MINUS_INF  # (..., Ms)
        return att
Example #16
def seq_rnn_embed(vxs, exs, birnn, return_seqs=False):
    """Embed given sequences using rnn."""
    # vxs.shape == (..., S)
    # exs.shape == (..., S, E)
    assert vxs.shape == exs.shape[:-1], \
        "Sequence embedding dimensions do not match."
    lengths = np.sum(vxs != 0, -1).flatten()  # (X,)
    seqs = F.reshape(exs, (-1, ) + exs.shape[-2:])  # (X, S, E)
    toembed = [
        s[..., :l, :] for s, l in zip(F.separate(seqs, 0), lengths) if l != 0
    ]  # Y x [(S1, E), (S2, E), ...]
    hs, ys = birnn(None, toembed)  # (2, Y, E), Y x [(S1, 2*E), (S2, 2*E), ...]
    if return_seqs:
        ys = F.pad_sequence(ys)  # (Y, S, 2*E)
        ys = F.reshape(ys, ys.shape[:-1] + (2, EMBED))  # (Y, S, 2, E)
        ys = F.mean(ys, -2)  # (Y, S, E)
        if ys.shape[0] == lengths.size:
            ys = F.reshape(ys, exs.shape)  # (..., S, E)
            return ys
        embeds = np.zeros((lengths.size, vxs.shape[-1], EMBED),
                          dtype=np.float32)  # (X, S, E)
        idxs = np.nonzero(lengths)  # (Y,)
        embeds = F.scatter_add(embeds, idxs, ys)  # (X, S, E)
        embeds = F.reshape(embeds, exs.shape)  # (..., S, E)
        return embeds  # (..., S, E)
    hs = F.mean(hs, 0)  # (Y, E)
    if hs.shape[0] == lengths.size:
        hs = F.reshape(hs, vxs.shape[:-1] + (EMBED, ))  # (..., E)
        return hs
    # Add zero values back to match original shape
    embeds = np.zeros((lengths.size, EMBED), dtype=np.float32)  # (X, E)
    idxs = np.nonzero(lengths)  # (Y,)
    embeds = F.scatter_add(embeds, idxs, hs)  # (X, E)
    embeds = F.reshape(embeds, vxs.shape[:-1] + (EMBED, ))  # (..., E)
    return embeds  # (..., E)
Example #17
    def translate(self, hxs, max_length=100):
        batch_size, _, _ = hxs.shape
        compute_context = self.attention(hxs)
        c = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
        h = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))

        ys = self.xp.full(batch_size, tokens['<SOS>'], np.int32)

        results = []
        for _ in range(max_length):
            eys = self.embed_y(ys)

            context = compute_context(h)
            concatenated = F.concat([eys, context])

            c, h = self.lstm(c, h, concatenated)
            concatenated = F.concat([concatenated, h])

            logit = self.w(self.maxout(concatenated))
            y = F.reshape(F.argmax(logit, axis=1), (batch_size, ))

            results.append(y)

        results = F.separate(F.transpose(F.vstack(results)), axis=0)

        outs = []
        for y in results:
            inds = np.argwhere(y == tokens['<EOS>'])
            if len(inds) > 0:
                y = y[:inds[0, 0]]
            outs.append(y)

        return outs
Example #18
    def translate(self, hxs, max_length):
        """Generate target sentences given hidden states of source sentences.

        Args:
            hxs: Hidden states for source sequences.

        Returns:
            ys: Generated sequences.

        """
        batch_size, _, _ = hxs.shape
        compute_context = self.attention(hxs)
        c = Variable(self.xp.zeros((batch_size, self.n_units), 'f'))
        h = F.broadcast_to(self.bos_state, ((batch_size, self.n_units)))
        # first character's embedding
        previous_embedding = self.embed_y(
            Variable(self.xp.full((batch_size, ), EOS, 'i')))

        results = []
        for _ in range(max_length):
            context = compute_context(h)
            concatenated = F.concat((previous_embedding, context))

            c, h = self.lstm(c, h, concatenated)
            concatenated = F.concat((concatenated, h))

            logit = self.w(self.maxout(concatenated))
            y = F.reshape(F.argmax(logit, axis=1), (batch_size, ))

            results.append(y)
            previous_embedding = self.embed_y(y)

        results = F.separate(F.transpose(F.vstack(results)), axis=0)
        ys = [get_subsequence_before_eos(result.data) for result in results]
        return ys
Example #19
    def __call__(self, input_tensor,
                 cur_state):  # input_tensor and cur_state is B,F,C,H,W
        h_cur, c_cur = cur_state  # B, F, C, H, W
        mini_batch, frame_box_num, channel, height, width = input_tensor.shape
        combined = F.concat(
            [input_tensor, h_cur],
            axis=2)  # concatenate along channel axis. B, F, C+hidden_dim, H, W
        assert frame_box_num == self.group_num
        combined = F.reshape(combined,
                             shape=(mini_batch,
                                    frame_box_num * combined.shape[2], height,
                                    width))  # B, F * (C+hidden_dim), H, W
        conv_output = self.conv(combined)  # B, F * 4 * hidden_dim, H, W

        conv_output = F.reshape(
            conv_output,
            shape=(mini_batch, self.group_num, 4, self.hidden_dim, self.height,
                   self.width))  # B, F, 4, hidden_dim, H, W
        cc_i, cc_f, cc_o, cc_g = F.separate(conv_output,
                                            axis=2)  # B, F, hidden_dim, H, W
        i = F.sigmoid(cc_i)
        f = F.sigmoid(cc_f)
        o = F.sigmoid(cc_o)
        g = F.tanh(cc_g)
        c_next = f * c_cur + i * g
        h_next = o * F.tanh(c_next)
        return h_next, c_next
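
Note: a minimal sketch of the gate split above; F.separate along the gate axis returns the four pre-activation maps in one call (toy sizes, names assumed):

import numpy as np
import chainer.functions as F

conv_output = np.random.rand(2, 3, 4, 8, 5, 5).astype(np.float32)  # B, F, 4, hidden_dim, H, W
cc_i, cc_f, cc_o, cc_g = F.separate(conv_output, axis=2)            # each (B, F, hidden_dim, H, W)
assert cc_i.shape == (2, 3, 8, 5, 5)
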
Example #20
    def forward(self, xs, n_speakers, activation=None):
        ilens = [x.shape[0] for x in xs]
        # xs: (B, T, F)
        xs = F.pad_sequence(xs, padding=-1)
        pad_shape = xs.shape
        # emb: (B*T, E)
        emb = self.enc(xs)
        # emb: (B, T, F)
        emb = F.separate(emb.reshape(pad_shape[0], pad_shape[1], -1), axis=0)
        emb = [F.get_item(e, slice(0, ilen)) for e, ilen in zip(emb, ilens)]
        emb2 = [cp.random.permutation(e) for e in emb]

        # get name: main-                 num_speakers=n_speakers, to_train=1
        #           validation/main-      num_speakers=n_speaker,  to_train=0
        #           validation_1/main-    num_speakers=None,       to_train=0
        name = reporter.get_current_reporter()._observer_names[id(self)]
        num_speakers = None if name == "validation_1/main" else n_speakers
        to_train = 1 if name == 'main' else 0
        # h_0: (1, B, F)
        # c_0: (1, B, F)
        h_0, c_0 = self.encoder(emb2)
        # A: (B, n_spk, F)
        # P: (B, n_spk, 1)
        A, P = self.decoder(h_0,
                            c_0,
                            n_speakers=num_speakers,
                            to_train=to_train)
        # yhat: (B, T, n_spk)
        ys = [F.matmul(e, a.T) for a, e in zip(A, emb)]

        return ys, P
Example #21
    def __call__(self, rois):
        batch_size, num_bboxes, num_channels, height, width = rois.shape
        rois = F.reshape(rois, (-1, num_channels, height, width))

        # if not chainer.config.user_text_recognition_grayscale_input:
        #     # convert data to grayscale
        #     assert rois.shape[1] == 3, "rois are not in RGB, can not convert them to grayscale"
        #     r, g, b = F.separate(rois, axis=1)
        #     grey = 0.299 * r + 0.587 * g + 0.114 * b
        #     rois = F.stack([grey, grey, grey], axis=1)

        h = self.feature_extractor(rois)
        _, num_channels, feature_height, feature_width = h.shape
        h = F.average_pooling_2d(h, (feature_height, feature_width))

        h = F.reshape(h, (batch_size, num_bboxes, num_channels, -1))

        all_predictions = []
        for box in F.separate(h, axis=1):
            # box_predictions = [self.classifier(self.lstm(box)) for _ in range(self.num_chars)]
            box_predictions = [
                self.classifier(box) for _ in range(self.num_chars)
            ]
            all_predictions.append(F.stack(box_predictions, axis=1))

        # return shape: batch_size, num_bboxes, num_chars, num_classes
        return F.stack(all_predictions, axis=2)
Example #22
    def prob(self, x):
        assert x.shape[1] == len(self.distributions)
        prob_all = 1
        for value, distribution in zip(list(F.separate(x, axis=1)),
                                       self.distributions):
            prob_all *= distribution.prob(value)
        return prob_all
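
Note: prob above walks the columns of an (N, K) batch one distribution at a time; a minimal sketch of the column split it relies on (toy shapes):

import numpy as np
import chainer.functions as F

x = np.random.rand(4, 3).astype(np.float32)  # (N, K): K independent dimensions
columns = list(F.separate(x, axis=1))        # K variables, each of shape (N,)
assert len(columns) == 3 and columns[0].shape == (4,)
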
Example #23
    def attend(self, encoded_features):
        self.out_lstm.reset_state()
        transformed_encoded_features = F.concat([
            F.expand_dims(self.transform_encoded_features(feature), axis=1)
            for feature in encoded_features
        ],
                                                axis=1)
        concat_encoded_features = F.concat(
            [F.expand_dims(e, axis=1) for e in encoded_features], axis=1)

        lstm_output = self.xp.zeros_like(encoded_features[0])
        outputs = []
        for _ in range(self.num_labels):
            transformed_lstm_output = self.transform_out_lstm_feature(
                lstm_output)
            attended_feats = []
            for transformed_encoded_feature in F.separate(
                    transformed_encoded_features, axis=1):
                attended_feat = transformed_encoded_feature + transformed_lstm_output
                attended_feat = F.tanh(attended_feat)
                attended_feats.append(
                    self.generate_attended_feat(attended_feat))

            attended_feats = F.concat(attended_feats, axis=1)
            alphas = F.softmax(attended_feats, axis=1)

            lstm_input_feature = F.batch_matmul(alphas,
                                                concat_encoded_features,
                                                transa=True)
            lstm_input_feature = F.squeeze(lstm_input_feature, axis=1)
            lstm_output = self.out_lstm(lstm_input_feature)
            outputs.append(lstm_output)
        return outputs
Example #24
    def __call__(self, hs, ys):
        """CTC forward.

        Args:
            hs (list of chainer.Variable | N-dimension array): Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array): Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.

        """
        self.loss = None
        ilens = [x.shape[0] for x in hs]
        olens = [x.shape[0] for x in ys]

        # zero padding for hs
        y_hat = linear_tensor(self.ctc_lo, F.dropout(
            F.pad_sequence(hs), ratio=self.dropout_rate))
        y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

        # zero padding for ys
        y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

        # get length info
        input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
        logging.info(self.__class__.__name__ + ' input lengths:  ' + str(input_length.data))
        logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

        # get ctc loss
        self.loss = F.connectionist_temporal_classification(
            y_hat, y_true, 0, input_length, label_length)
        logging.info('ctc loss:' + str(self.loss.data))

        return self.loss
Example #25
    def __call__(self, hs, ys):
        '''CTC forward

        :param hs:
        :param ys:
        :return:
        '''
        self.loss = None
        ilens = [x.shape[0] for x in hs]
        olens = [x.shape[0] for x in ys]

        # zero padding for hs
        y_hat = linear_tensor(self.ctc_lo, F.dropout(
            F.pad_sequence(hs), ratio=self.dropout_rate))
        y_hat = F.separate(y_hat, axis=1)  # ilen list of batch x hdim

        # zero padding for ys
        y_true = F.pad_sequence(ys, padding=-1)  # batch x olen

        # get length info
        input_length = chainer.Variable(self.xp.array(ilens, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(olens, dtype=np.int32))
        logging.info(self.__class__.__name__ + ' input lengths:  ' + str(input_length.data))
        logging.info(self.__class__.__name__ + ' output lengths: ' + str(label_length.data))

        # get ctc loss
        self.loss = F.connectionist_temporal_classification(
            y_hat, y_true, 0, input_length, label_length)
        logging.info('ctc loss:' + str(self.loss.data))

        return self.loss
Example #26
    def translate(
            self,
            encoded: Variable,
            max_length: int = 100
    ) -> List[ndarray]:
        sentence_count = encoded.shape[0]

        self.setup(encoded)
        cell, state, previous_words = self.get_initial_states(sentence_count)

        result = []
        for _ in range(max_length):
            cell, state, context, concatenated = \
                self.advance_one_step(cell, state, previous_words)
            logit, state = self.compute_logit(concatenated, state, context)

            output_id = F.reshape(F.argmax(logit, axis=1), (sentence_count,))
            result.append(output_id)

            previous_words = output_id

        # Remove words after <EOS>
        outputs = F.separate(F.transpose(F.vstack(result)), axis=0)
        assert len(outputs) == sentence_count
        output_sentences = []
        for output in outputs:
            assert output.shape == (max_length,)
            indexes = np.argwhere(output.data == EOS)
            if len(indexes) > 0:
                output = output[:indexes[0, 0] + 1]
            output_sentences.append(output.data)

        return output_sentences
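
Note: a minimal sketch of the result reshaping above; the T per-step id vectors are stacked, transposed to batch-major, and split into per-sentence id sequences (toy sizes):

import numpy as np
import chainer.functions as F

batch_size, max_length = 3, 4
result = [np.full((batch_size,), t, dtype=np.int32) for t in range(max_length)]
outputs = F.separate(F.transpose(F.vstack(result)), axis=0)  # batch_size variables, each (max_length,)
assert len(outputs) == batch_size and outputs[0].shape == (max_length,)
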
Example #27
    def calc_accuracy(self, x, t):
        batch_predictions, _, _ = x
        self.xp = cuda.get_array_module(batch_predictions[0], t)
        batch_size = t.shape[0]
        t = F.reshape(t, (batch_size, self.num_timesteps, -1))
        accuracies = []

        for predictions, labels in zip(batch_predictions, F.separate(t,
                                                                     axis=1)):
            if isinstance(predictions, list):
                predictions = F.concat(
                    [F.expand_dims(p, axis=0) for p in predictions], axis=0)
            with cuda.get_device_from_array(predictions.data):

                classification = F.softmax(predictions, axis=2)
                classification = classification.data
                classification = self.xp.argmax(classification, axis=2)
                classification = self.xp.transpose(classification, (1, 0))

                words = self.strip_prediction(classification)
                labels = self.strip_prediction(labels.data)

                num_correct_words = 0
                for word, label in zip(words, labels):
                    word = "".join(map(self.label_to_char, word))
                    label = "".join(map(self.label_to_char, label))
                    if word == label:
                        num_correct_words += 1

                accuracy = num_correct_words / len(labels)
                accuracies.append(accuracy)

        overall_accuracy = sum(accuracies) / max(len(accuracies), 1)
        self.scale_area_loss_factor(overall_accuracy)
        return overall_accuracy
Example #28
def _call_1layer(net: NStepRNNBase, hidden: Optional[ArrayLike],
                 input: ArrayLike):
    if hidden is not None:
        hidden = hidden[np.newaxis]
    _, hidden = net(hx=hidden, xs=F.separate(input, axis=0))
    hidden = F.stack(hidden, axis=0)
    return hidden
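
Note: a hedged usage sketch for _call_1layer above, assuming an NStepGRU link (any two-output NStep link fits; the second return value the helper keeps is the per-sequence output list, which it stacks back into a batch):

import numpy as np
import chainer.links as L

rnn = L.NStepGRU(n_layers=1, in_size=4, out_size=6, dropout=0.0)
batch = np.random.rand(2, 5, 4).astype(np.float32)  # (B, T, in_size)
out = _call_1layer(rnn, None, batch)                # stacked per-sequence outputs, (B, T, out_size)
assert out.shape == (2, 5, 6)
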
Example #29
def test_ctc_loss():
    pytest.importorskip("torch")
    pytest.importorskip("warpctc_pytorch")
    import torch
    from warpctc_pytorch import CTCLoss

    from e2e_asr_attctc_th import pad_list

    n_out = 7
    n_batch = 3
    input_length = numpy.array([11, 17, 15], dtype=numpy.int32)
    label_length = numpy.array([4, 2, 3], dtype=numpy.int32)
    np_pred = [numpy.random.rand(il, n_out).astype(
        numpy.float32) for il in input_length]
    np_target = [numpy.random.randint(
        0, n_out, size=ol, dtype=numpy.int32) for ol in label_length]

    # NOTE: np_pred[i] seems to be transposed and used axis=-1 in e2e_asr_attctc.py
    ch_pred = F.separate(F.pad_sequence(np_pred), axis=-2)
    ch_target = F.pad_sequence(np_target, padding=-1)
    ch_loss = F.connectionist_temporal_classification(
        ch_pred, ch_target, 0, input_length, label_length).data

    th_pred = pad_list([torch.autograd.Variable(torch.from_numpy(x))
                        for x in np_pred]).transpose(0, 1)
    th_target = torch.autograd.Variable(
        torch.from_numpy(numpy.concatenate(np_target)))
    th_ilen = torch.autograd.Variable(torch.from_numpy(input_length))
    th_olen = torch.autograd.Variable(torch.from_numpy(label_length))
    # NOTE: warpctc_pytorch.CTCLoss does not normalize itself by batch-size while chainer's default setting does
    th_loss = (CTCLoss()(th_pred, th_target, th_ilen,
                         th_olen) / n_batch).data.numpy()[0]
    numpy.testing.assert_allclose(th_loss, ch_loss, 0.05)
Example #30
    def __call__(self, images, localizations):
        points = F.spatial_transformer_grid(localizations, self.target_shape)
        rois = F.spatial_transformer_sampler(images, points)

        # h = self.data_bn(rois)
        h = F.relu(self.bn0(self.conv0(rois)))
        h = F.average_pooling_2d(h, 2, stride=2)

        h = self.rs1(h)
        h = self.rs2(h)
        h = F.max_pooling_2d(h, 2, stride=2)
        h = self.rs3(h)
        self.vis_anchor = h

        h = F.average_pooling_2d(h, 5, stride=1)

        h = F.relu(self.fc1(h))

        # for each timestep of the localization net do the 'classification'
        h = F.reshape(h, (self.num_timesteps * 2 + 1, -1, self.fc1.out_size))
        overall_predictions = []
        for timestep in F.separate(h, axis=0):
            # go 2x num_labels plus 1 timesteps because of ctc loss
            lstm_predictions = []
            self.lstm.reset_state()
            for _ in range(self.num_labels):
                lstm_prediction = self.lstm(timestep)
                classified = self.classifier(lstm_prediction)
                lstm_predictions.append(classified)
            overall_predictions.append(lstm_predictions)

        return overall_predictions, rois, points
Example #31
    def __call__(self, h):
        # type: (chainer.Variable) -> chainer.Variable
        xp = cuda.get_array_module(h)
        mb, node, ch = h.shape  # type: int, int, int
        if self.q_star is None:
            self.q_star = [
                xp.zeros((1, self.in_channels * 2)).astype('f')
                for _ in range(mb)
            ]
        self.hx, self.cx, q = self.lstm_layer(self.hx, self.cx, self.q_star)
        # self.hx: (mb, mb, ch)
        # self.cx: (mb, mb, ch)
        # q: List[(1, ch) * mb]
        q = functions.stack(q)  # q: (mb, 1, ch)
        q_ = functions.transpose(q, axes=(0, 2, 1))  # q_: (mb, ch, 1)
        e = functions.matmul(h, q_)  # e: (mb, node, 1)
        a = functions.softmax(e)  # a: (mb, node, 1)
        a = functions.broadcast_to(a, h.shape)  # a: (mb, node, ch)
        r = functions.sum((a * h), axis=1, keepdims=True)  # r: (mb, 1, ch)
        q_star_ = functions.concat((q, r), axis=2)  # q_star_: (mb, 1, ch*2)
        self.q_star = functions.separate(q_star_)
        return functions.reshape(q_star_, (mb, ch * 2))
Example #32
    def __call__(self, input_ids, input_mask, token_type_ids):
        final_hidden = self.bert.get_sequence_output(
            input_ids,
            input_mask,
            token_type_ids)
        batch_size = final_hidden.shape[0]
        seq_length = final_hidden.shape[1]
        hidden_size = final_hidden.shape[2]

        final_hidden_matrix = F.reshape(
            final_hidden, [batch_size * seq_length, hidden_size])

        logits = self.output(final_hidden_matrix)

        logits = F.reshape(logits, [batch_size, seq_length, 2])
        logits = logits - (1 - input_mask[:, :, None]) * 1000.  # ignore pads
        logits = F.transpose(logits, [2, 0, 1])

        unstacked_logits = F.separate(logits, axis=0)

        (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])
        return (start_logits, end_logits)
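
Note: a minimal sketch of the final unstacking step above; after transposing to (2, batch, seq_len), F.separate along axis 0 yields the start and end logit matrices (toy sizes):

import numpy as np
import chainer.functions as F

logits = np.random.rand(2, 3, 7).astype(np.float32)    # (2, batch_size, seq_length)
start_logits, end_logits = F.separate(logits, axis=0)  # each (batch_size, seq_length)
assert start_logits.shape == (3, 7) and end_logits.shape == (3, 7)
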
Example #33
 def forward(self, inputs, device):
     x, = inputs
     return functions.separate(x, self.axis)
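
Note: for reference, F.separate itself returns a tuple of Variables whose length equals the size of the split axis; a minimal check:

import numpy as np
import chainer.functions as F

x = np.arange(12, dtype=np.float32).reshape(3, 4)
ys = F.separate(x, axis=1)
assert isinstance(ys, tuple) and len(ys) == 4
assert ys[0].shape == (3,)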