Beispiel #1
0
    def apply(self, example, is_train=False, stats=None, **kwargs):
        """Expand word-level source features to one value per subword.

        Examples without a ``"src_feats"`` entry are returned untouched.
        Otherwise each feature list in ``example["src_feats"]`` is replaced
        by a list aligned with ``example["src"]``: every subword receives
        the feature value stored at the word index that the subword-to-word
        mapping assigns to it. When ``self.prior_tokenization`` is falsy,
        subwords containing no word characters receive ``"<null>"`` instead.
        If no subword is iterated, the feature lists are left as they were.

        Args:
            example: dict holding the ``"src"`` tokens and, optionally,
                ``"src_feats"`` (feature name -> per-word values).
                ``"src_original"`` is read only in joiner mode when
                ``self.prior_tokenization`` is truthy.
            is_train: not used by this method.
            stats: not used by this method.
            **kwargs: not used by this method.

        Returns:
            The same ``example`` object, with ``"src_feats"`` updated in
            place.
        """
        if "src_feats" not in example:
            # Nothing to infer for this example.
            return example

        if self.reversible_tokenization == "joiner":
            original_src = (
                example["src_original"] if self.prior_tokenization else None
            )
            word_to_subword_mapping = subword_map_by_joiner(
                example["src"], original_subwords=original_src)
        else:  # Spacer
            word_to_subword_mapping = subword_map_by_spacer(example["src"])

        src_feats = example["src_feats"]
        inferred_feats = defaultdict(list)
        for subword, word_id in zip(example["src"], word_to_subword_mapping):
            # The punctuation test depends only on the subword, so it is
            # evaluated once here rather than once per feature.
            punctuation_only = (
                not re.sub(r'(\W)+', '', subword).strip()
                and not self.prior_tokenization
            )
            for feat_name, feat_values in src_feats.items():
                inferred_feats[feat_name].append(
                    "<null>" if punctuation_only else feat_values[word_id]
                )

        # Overwrite only the features for which values were inferred.
        src_feats.update(inferred_feats)

        return example
 def test_subword_group_joiner(self):
     """Check ``subword_map_by_joiner`` on tokens with '■' markers.

     Each token is listed next to the index the mapping is expected to
     give it; in this data every '■'-marked token shares an index with
     the token next to it.
     """
     token_and_index = [
         ('however', 0), ('■,', 0),
         ('according', 1),
         ('to', 2),
         ('the', 3),
         ('logs', 4), ('■,', 4),
         ('she', 5),
         ('is', 6),
         ('hard', 7), ('■-■', 7), ('working', 7), ('■.', 7),
     ]
     tokens = [token for token, _ in token_and_index]
     expected = [index for _, index in token_and_index]

     self.assertEqual(subword_map_by_joiner(tokens), expected)
 def test_subword_group_naive(self):
     """Check the mapping when no token carries a '■' marker.

     The expected result is the identity mapping: the i-th token maps to
     index i.
     """
     tokens = [
         'however', ',',
         'according', 'to', 'the', 'logs', ',',
         'she', 'is', 'hard', '-', 'working', '.',
     ]
     # 13 tokens -> [0, 1, ..., 12]
     expected = list(range(len(tokens)))

     mapping = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
     self.assertEqual(mapping, expected)
 def test_subword_group_joiner_with_case_markup_advanced(self):
     """Check the mapping when case-markup tokens sit inside split words.

     Each token is listed next to its expected index; in this data the
     '⦅mrk_…⦆' tokens share an index with the neighbouring subwords.
     """
     token_and_index = [
         ('⦅mrk_case_modifier_C⦆', 0), ('dummy', 0),
         ('text', 1),
         ('⦅mrk_case_modifier_C⦆', 2), ('1■', 2), ('h■', 2), ('k', 2),
         ('⦅mrk_begin_case_region_U⦆', 3), ('th■', 3),
         ('⦅mrk_end_case_region_U⦆', 3), ('n', 3),
         ('more', 4),
         ('dummy', 5),
         ('text', 6),
     ]
     tokens = [token for token, _ in token_and_index]
     expected = [index for _, index in token_and_index]

     mapping = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
     self.assertEqual(mapping, expected)
 def test_subword_group_joiner_with_case_markup(self):
     """Check the mapping on a sentence mixing '■' and case-markup tokens.

     Each token is listed next to its expected index; in this data both
     the '■'-marked tokens and the '⦅mrk_…⦆' tokens share an index with
     an adjacent token.
     """
     token_and_index = [
         ('⦅mrk_case_modifier_C⦆', 0), ('however', 0), ('■,', 0),
         ('according', 1),
         ('to', 2),
         ('the', 3),
         ('logs', 4), ('■,', 4),
         ('⦅mrk_begin_case_region_U⦆', 5), ('she', 5),
         ('is', 6),
         ('hard', 7), ('■-■', 7), ('working', 7),
         ('⦅mrk_end_case_region_U⦆', 7), ('■.', 7),
     ]
     tokens = [token for token, _ in token_and_index]
     expected = [index for _, index in token_and_index]

     mapping = subword_map_by_joiner(tokens, marker=SubwordMarker.JOINER)
     self.assertEqual(mapping, expected)
 def test_subword_group_joiner_prior_tokenization(self):
     """Check the mapping when ``original_subwords`` is also supplied.

     Each subword is listed next to its expected index. The expected
     indices run from 0 to 10, matching the 11 entries of the original
     token list passed alongside.
     """
     token_and_index = [
         ('⦅mrk_case_modifier_C⦆', 0), ('how■', 0), ('ever', 0),
         ('■,', 1),
         ('according', 2),
         ('to', 3),
         ('the', 4),
         ('logs', 5),
         ('■,', 6),
         ('⦅mrk_begin_case_region_U⦆', 7), ('she', 7),
         ('is', 8),
         ('hard', 9), ('■-■', 9), ('working', 9),
         ('⦅mrk_end_case_region_U⦆', 9),
         ('■.', 10),
     ]
     tokens = [token for token, _ in token_and_index]
     expected = [index for _, index in token_and_index]

     original_tokens = [
         'However', '■,',
         'according', 'to', 'the', 'logs', '■,',
         'SHE', 'IS', 'HARD-WORKING', '■.',
     ]

     mapping = subword_map_by_joiner(
         tokens,
         marker=SubwordMarker.JOINER,
         original_subwords=original_tokens,
     )
     self.assertEqual(mapping, expected)