Beispiel #1
0
 def test_identity_unmasking(self):
     """Check that identity unmasking yields the unmasked segment for each test case."""
     m = Masker('identity')
     for unmasked, masked, mapping in self.test_cases_identity_masking:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.unmask_segment(unmasked, masked, mapping),
             unmasked,
             "Identity masker must restore masks correctly given a translated segment and a mapping"
         )
Beispiel #2
0
 def test_identity_force_mask_translation(self):
     """Check the output of force_mask_translation for each identity test case."""
     m = Masker('identity')
     for masked, forced in self.test_cases_identity_force_mask_translation:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.force_mask_translation(masked),
             forced,
             "Identity masking must replace mask tokens with Moses forced translation directives"
         )
Beispiel #3
0
 def test_identity_masking_do_not_escape_option(self):
     """Check that a masker created with escape=False leaves reserved characters as they are."""
     m = Masker('identity', escape=False)
     segment = "the ships & hung < in the > [ sky ] ."
     # The masked segment (first element of the result) must equal the input.
     self.assertEqual(
         m.mask_segment(segment)[0],
         "the ships & hung < in the > [ sky ] .",
         "Masker must not escape characters reserved in Moses if requested explicitly"
     )
Beispiel #4
0
 def test_identity_masking_return_mapping(self):
     """Check the mapping (second element of mask_segment's result) for each test case."""
     m = Masker('identity')
     for unmasked, masked, mapping in self.test_cases_identity_masking:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.mask_segment(unmasked)[1],
             mapping,
             "Identity masker must return a list of mappings (mask_token, original_content)."
         )
Beispiel #5
0
 def test_identity_masking(self):
     """Check the masked segment (first element of mask_segment's result) for each test case."""
     m = Masker('identity')
     for unmasked, masked, mapping in self.test_cases_identity_masking:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.mask_segment(unmasked)[0],
             masked,
             "Identity masking must replace matching tokens with unique masks"
         )
Beispiel #6
0
 def test_alignment_mapping(self):
     """Compare the mapping produced by alignment masking with each expected mapping."""
     masker = Masker('alignment')
     for source, _masked, expected in self.test_cases_alignment_masking:
         # Only the mapping (second element of the result) is checked here.
         recorded = masker.mask_segment(source)[1]
         self.assertTrue(
             _mappings_equal(recorded, expected),
             "Alignment masking must correctly record the mapping between mask tokens and their original content"
         )
Beispiel #7
0
 def test_alignment_force_mask_translation(self):
     """Check the output of force_mask_translation for each alignment test case."""
     m = Masker('alignment')
     for masked, forced in self.test_cases_alignment_force_mask_translation:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.force_mask_translation(masked),
             forced,
             "Alignment masking must replace mask tokens with forced translation directives if requested"
         )
Beispiel #8
0
 def test_alignment_masking(self):
     """Check the masked segment (first element of mask_segment's result) for each test case."""
     m = Masker('alignment')
     for unmasked, masked, mapping in self.test_cases_alignment_masking:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.mask_segment(unmasked)[0],
             masked,
             "Alignment masking must insert mask tokens in the correct places"
         )
Beispiel #9
0
 def test_alignment_unmasking(self):
     """Check unmask_segment with an alignment against the expected result of each test case."""
     m = Masker('alignment')
     for source, target, mapping, alignment, final_result in self.test_cases_alignment_unmasking:
         # assertEqual reports both values on failure, unlike assertTrue(a == b)
         self.assertEqual(
             m.unmask_segment(source, target, mapping, alignment),
             final_result,
             "Alignment masking must restore markup in translated text based on the source segment, target segment, mapping and alignment"
         )
Beispiel #10
0
 def __init__(self, xml_strategy):
     '''
     Stores the markup strategy and creates the helper object it needs.
     @param xml_strategy the markup strategy; a Reinserter is created for
         XML_STRIP and XML_STRIP_REINSERT, a Masker for XML_MASK, and
         no helper object for any other value
     '''
     self._xml_strategy = xml_strategy
     if xml_strategy in (XML_STRIP, XML_STRIP_REINSERT):
         reinserter_strategy = XML_STRATEGIES_DEFAULTS[xml_strategy]
         self._reinserter = Reinserter(reinserter_strategy,
                                       force_all=FORCE_REINSERT_ALL)
         return
     if xml_strategy == XML_MASK:
         masker_options = {
             'strategy': XML_STRATEGIES_DEFAULTS[xml_strategy],
             'escape': True,
             'force_all': FORCE_REINSERT_ALL,
             'remove_all': REMOVE_ALL_MASKS,
         }
         self._masker = Masker(**masker_options)
Beispiel #11
0
    def test_identity_mask_tokens(self):
        """Check token-level identity masking and the mapping it returns."""
        m = Masker('identity')
        # The address must be a single whitespace-free token in the input and
        # must reappear verbatim in the expected mapping, so define it once.
        email = 'user@example.com'
        tokens = ('Email me at ' + email +
                  ' or <a> http://www.statmt.org </a>').split(" ")
        result, mapping = m.mask_tokens(tokens)

        # assertEqual reports both lists on failure, unlike assertTrue(a == b)
        self.assertEqual(
            result, [
                'Email', 'me', 'at', '__email_0__', 'or', '__xml_0__',
                '__url_0__', '__xml_1__'
            ], "Identity masking did not mask a list of tokens correctly")
        self.assertTrue(
            _mappings_equal(mapping, [('__url_0__', 'http://www.statmt.org'),
                                      ('__xml_0__', '<a>'),
                                      ('__xml_1__', '</a>'),
                                      ('__email_0__', email)]),
            "Identity masking did not return the correct mapping for a list of input tokens"
        )
Beispiel #12
0
 def test_identity_masking_escape_characters(self):
     """Check that a masker created with default options escapes reserved characters."""
     m = Masker('identity')
     # assertEqual reports both strings on failure, unlike assertTrue(a == b)
     self.assertEqual(
         m.mask_segment("the ships & hung < in the > [ sky ] .")[0],
         "the ships &amp; hung &lt; in the &gt; &#91; sky &#93; .",
         "Masker must escape characters reserved in Moses by default")
Beispiel #13
0
class XmlProcessor(object):
    '''
    Process XML markup properly before training, and before
        and after translation.

    Depending on the strategy given to the constructor, markup is
        stripped (and optionally reinserted after translation),
        replaced with mask tokens, or passed through unchanged.
    '''
    def __init__(self, xml_strategy):
        '''
        Stores the markup strategy and creates the helper object it needs.
        @param xml_strategy the markup strategy; a Reinserter is created for
            XML_STRIP and XML_STRIP_REINSERT, a Masker for XML_MASK, and
            no helper object for any other value
        '''
        self._xml_strategy = xml_strategy
        if self._xml_strategy in (XML_STRIP, XML_STRIP_REINSERT):
            self._reinserter = Reinserter(
                XML_STRATEGIES_DEFAULTS[self._xml_strategy],
                force_all=FORCE_REINSERT_ALL)
        elif self._xml_strategy == XML_MASK:
            self._masker = Masker(
                strategy=XML_STRATEGIES_DEFAULTS[self._xml_strategy],
                escape=True,
                force_all=FORCE_REINSERT_ALL,
                remove_all=REMOVE_ALL_MASKS)

    def _strip_markup(self, segment, keep_escaped_markup=True):
        '''
        Removes all XML markup from a segment and normalizes
            whitespace between tokens before returning.
        @param segment the string from which XML markup
            should be removed
        @param keep_escaped_markup if True (the default), the text that
            remains after removing unescaped markup is XML-escaped again,
            so markup that was escaped in the original segment is kept;
            if False and the remaining text contains '<', tag-like spans
            are removed instead and the text is not escaped again
        @return the segment without markup, with runs of spaces collapsed,
            after cleaner.escape_special_chars() has been applied
        '''
        # unescaped markup
        try:
            tree = etree.fromstring('<root>' + segment + '</root>')
            segment = etree.tostring(tree, encoding='unicode', method='text')
        except Exception:
            # malformed fragment, fall back strategy: split on anything that
            # looks like a tag and keep only the pieces in between.
            # `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are not swallowed here.
            tokens = []
            for token in re.split(r"(<[^<>]+>)", segment):
                if not re.match(r"<[^<>]+>", token):
                    tokens.extend(token.split(" "))
            segment = " ".join(tokens)
        # markup that was escaped in the original segment, now surfaced
        if '<' in segment and not keep_escaped_markup:
            segment = re.sub(r'<[^>]*>', '', segment)
        else:
            segment = xml.sax.saxutils.escape(segment)
        # normalize whitespace
        segment = re.sub(r' +', ' ', segment).strip()
        return cleaner.escape_special_chars(segment)

    def _mask_markup(self, segment):
        '''
        Replaces XML markup with mask tokens.
        @param segment the segment to be masked
        @return the masked segment and the mapping
            between mask tokens and original content
        '''
        return self._masker.mask_segment(segment)

    def _unmask_markup(self,
                       masked_source_segment,
                       target_segment,
                       mapping,
                       alignment=None):
        '''
        When a mask token is found, reinsert the original
            XML markup content.
        @param masked_source_segment a source language segment with mask tokens
        @param target_segment a translation with mask tokens
        @param mapping the mapping between mask tokens
            and the original content
        @param alignment word alignment between the source and
            target segment (optional)
        @return the result of the masker's unmask_segment()
        '''
        return self._masker.unmask_segment(masked_source_segment,
                                           target_segment, mapping, alignment)

    def _reinsert_markup(self, source_segment, target_segment):
        '''
        Reinserts XML markup in a segment where markup was
            stripped before translation.
        @param source_segment the original segment in the source language
            before XML markup was removed, but after markup-aware tokenization
        @param target_segment a TranslatedSegment object, containing a translation
            without markup, segmentation and alignment information
        @return the result of the reinserter's reinsert_markup()
        '''
        return self._reinserter.reinsert_markup(source_segment,
                                                target_segment.translation,
                                                target_segment.segmentation,
                                                target_segment.alignment)

    def force_mask_translation(self, segment):
        '''
        Enforces the translation of mask tokens.
        Delegates to the masker, which is only created by the
            constructor for the XML_MASK strategy.
        @param segment the segment passed on to the masker
        '''
        return self._masker.force_mask_translation(segment)

    # Exposed methods
    def preprocess_markup(self, segment):
        '''
        Strips or masks XML markup before translation, depending
            on the markup strategy.
        @param segment the segment to preprocess
        @return a pair of the processed segment and either None (strip and
            pass-through strategies) or the mask mapping (mask strategy);
            for any other strategy value nothing is returned (None)
        '''
        if self._xml_strategy in (XML_STRIP, XML_STRIP_REINSERT):
            return self._strip_markup(segment), None
        elif self._xml_strategy == XML_MASK:
            return self._mask_markup(segment)
        elif self._xml_strategy == XML_PASS_THROUGH:
            return segment, None  # then return segment unchanged

    def postprocess_markup(self,
                           source_segment,
                           target_segment,
                           mapping=None,
                           masked_source_segment=None):
        '''
        Unmasks or restores XML markup after translation, depending
            on the markup strategy.
        @param source_segment the source segment, used only when markup
            is reinserted (XML_STRIP_REINSERT)
        @param target_segment the translated segment object; its
            translation, segmentation and alignment attributes are read
        @param mapping the mask mapping, used only for XML_MASK
        @param masked_source_segment the masked source segment,
            used only for XML_MASK
        @return the postprocessed translation; for any strategy value
            not handled below nothing is returned (None)
        '''
        if self._xml_strategy == XML_STRIP_REINSERT:
            return self._reinsert_markup(source_segment, target_segment)
        elif self._xml_strategy == XML_STRIP:
            # in this case, do nothing
            # TODO: remove markup from the translation, if any?
            return target_segment.translation
        elif self._xml_strategy == XML_MASK:
            return self._unmask_markup(masked_source_segment,
                                       target_segment.translation, mapping,
                                       target_segment.alignment)
        elif self._xml_strategy == XML_PASS_THROUGH:
            return target_segment.translation  # then return segment unchanged
Beispiel #14
0
 def _load_masker(self):
     """Create a Masker for the configured masking strategy and add it to the components."""
     masker = Masker(self._masking_strategy)
     self._masker = masker
     self._components.append(masker)