Ejemplo n.º 1
0
class TestSymmetry(unittest.TestCase):
    """Exercise ``check_symmetry`` against small hand-built LGRs."""

    def setUp(self):
        """Create an empty LGR and capture 'lgr.validate' log output."""
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        validate_logger = logging.getLogger('lgr.validate')
        validate_logger.addHandler(ch)
        # The logger is process-wide: detach the handler when the test ends,
        # otherwise every test leaves one more handler attached to it.
        self.addCleanup(validate_logger.removeHandler, ch)

    def test_empty_lgr(self):
        """An empty LGR passes, logs nothing and reports nothing."""
        success, result = check_symmetry(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
        self.assertTrue(success)
        self.assertDictEqual(result, {'description': 'Testing symmetry',
                                      'repertoire': []})

    def test_no_symmetric_in_repertoire(self):
        """A variant that is not itself a code point of the LGR is reported."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        success, result = check_symmetry(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         "CP U+0061: Variant U+0062 is not in repertoire.\n")
        self.assertFalse(success)
        self.assertDictEqual(result, {'description': 'Testing symmetry',
                                      'repertoire': [{'char': self.lgr.get_char([0x0061]),
                                                      'variant': self.lgr.get_variant([0x0061], (0x0062, ))[0],
                                                      'type': 'not-in-repertoire'}]})

    def test_no_symmetric_in_variants(self):
        """U+0062 is in the LGR but lacks the reverse variant to U+0061."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        success, result = check_symmetry(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         'CP U+0062 should have CP U+0061 in its variants.\n')
        self.assertFalse(success)
        self.assertDictEqual(result, {'description': 'Testing symmetry',
                                      'repertoire': [{'char': self.lgr.get_variant([0x0061], (0x0062, ))[0],
                                                      'variant': self.lgr.get_char([0x0061]),
                                                      'type': 'missing'}]})

    def test_symmetry_ok(self):
        """Variants declared in both directions pass without any log."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0061])
        success, result = check_symmetry(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
        self.assertTrue(success)
        self.assertDictEqual(result, {'description': 'Testing symmetry',
                                      'repertoire': []})
Ejemplo n.º 2
0
class TestTransitivity(unittest.TestCase):
    """Exercise ``check_transitivity`` and the report it returns."""

    def setUp(self):
        """Create an empty LGR and capture 'lgr.validate' log output."""
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        validate_logger = logging.getLogger('lgr.validate')
        validate_logger.addHandler(ch)
        # The logger is process-wide: detach the handler when the test ends,
        # otherwise every test leaves one more handler attached to it.
        self.addCleanup(validate_logger.removeHandler, ch)

    def test_empty_lgr(self):
        """An empty LGR passes, logs nothing and reports nothing."""
        success, result = check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
        self.assertTrue(success)
        self.assertDictEqual(result, {'description': 'Testing transitivity',
                                      'repertoire': []})

    def test_no_variants(self):
        """Code points without variants cannot break transitivity."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        success, result = check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
        self.assertTrue(success)
        self.assertDictEqual(result, {'description': 'Testing transitivity',
                                      'repertoire': []})

    def test_no_transitivity(self):
        """a -> b and b -> c without a -> c is reported against U+0061."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])
        success, result = check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         'CP U+0061 should have CP U+0063 in its variants.\n')
        self.assertFalse(success)
        self.assertDictEqual(result, {'description': 'Testing transitivity',
                                      'repertoire': [{'char': self.lgr.get_char([0x0061]),
                                                      'variant': self.lgr.get_variant([0x0062], (0x0063, ))[0]}]})

    def test_transitivity_ok(self):
        """a -> b, b -> c and a -> c together pass without any log."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_variant([0x0061], [0x0063])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])
        success, result = check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
        self.assertTrue(success)
        self.assertDictEqual(result, {'description': 'Testing transitivity',
                                      'repertoire': []})
Ejemplo n.º 3
0
class TestConditionalVariants(unittest.TestCase):
    """Exercise ``check_conditional_variants`` through its log output."""

    def setUp(self):
        """Create an empty LGR and capture 'lgr.validate' log output."""
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        validate_logger = logging.getLogger('lgr.validate')
        validate_logger.addHandler(ch)
        # The logger is process-wide: detach the handler when the test ends,
        # otherwise every test leaves one more handler attached to it.
        self.addCleanup(validate_logger.removeHandler, ch)

    def test_empty_lgr(self):
        """An empty LGR produces no log."""
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_variants(self):
        """Code points without variants produce no log."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_rule(self):
        """A "when" attribute naming an undeclared rule is logged."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         "CP U+0061: Variant 'U+0062' \"when\" attribute "
                         "'when-rule' is not an existing rule name.\n")

    def test_conditional_ok(self):
        """Once the rule name is in ``lgr.rules`` nothing is logged."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        self.lgr.rules.append("when-rule")
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
Ejemplo n.º 4
0
class TestTransitivity(unittest.TestCase):
    """Exercise ``check_transitivity`` through its log output only."""

    def setUp(self):
        """Create an empty LGR and capture 'lgr.validate' log output."""
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        validate_logger = logging.getLogger('lgr.validate')
        validate_logger.addHandler(ch)
        # The logger is process-wide: detach the handler when the test ends,
        # otherwise every test leaves one more handler attached to it.
        self.addCleanup(validate_logger.removeHandler, ch)

    def test_empty_lgr(self):
        """An empty LGR produces no log."""
        check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_variants(self):
        """Code points without variants produce no log."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_transitivity(self):
        """a -> b and b -> c without a -> c is logged against U+0061."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])
        check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         'CP U+0061 should have CP U+0063 in its variants.\n')

    def test_transitivity_ok(self):
        """a -> b, b -> c and a -> c together produce no log."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_variant([0x0061], [0x0063])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])
        check_transitivity(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
Ejemplo n.º 5
0
def rebuild_lgr(lgr, options):
    """
    Rebuild an LGR with given parameters.

    A new LGR is created from a deep copy of the source metadata and
    reference manager, then every range, code point and variant of the
    source repertoire is inserted into it. An LGRException raised by an
    insertion attempt is logged and recorded in the returned report.

    options argument can contain:
        * unicode_version: The target Unicode version to be used
          when rebuilding the LGR. If missing or None, use the current one.
        * validating_repertoire: The validating repertoire used
          for checking code points.
        * unidb: Munidata's Unicode database. If None, skip Unicode checks.

    :param LGR lgr: The LGR to rebuild.
    :param dict options: Dictionary of options to the validation function.
    :return: A ``(True, result)`` tuple; the first item is always True.
             ``result`` is a dict with:
                 * 'description': text describing the rebuild;
                 * 'repertoire': dict keyed by char, each value holding the
                   optional keys 'errors' (list), 'warnings' (list) and
                   'variants' (dict keyed by variant, lists of exceptions);
                 * 'generic': only present when the target Unicode version
                   differs from the one of the Unicode database.
    """
    # Local import to prevent import cycles
    from lgr.core import LGR

    # An explicit None means "keep the LGR's current version", exactly like
    # a missing key.
    unicode_version = options.get('unicode_version', None)
    if unicode_version is None:
        unicode_version = lgr.metadata.unicode_version
    validating_repertoire = options.get('validating_repertoire', None)

    description = "Rebuilding LGR with Unicode version {}".format(
        unicode_version)
    if validating_repertoire is not None:
        description += " and validating repertoire '{}'".format(
            validating_repertoire)
    result = {
        'description': description,
        'repertoire': {}  # XXX: Cannot use defaultdict because of django...
    }

    logger.info(
        "Rebuilding LGR '%s' with Unicode version %s "
        "and Validating Repertoire '%s'", lgr, unicode_version,
        validating_repertoire)

    unidb = options.get('unidb', None)
    if unidb is not None:
        unidb_version = unidb.get_unicode_version()
        if unidb_version != unicode_version:
            result['generic'] = "Target Unicode version {} " \
                                "differs from UnicodeDatabase {}".format(unicode_version,
                                                                         unidb_version)
            logger.warning(
                "Target Unicode version %s differs "
                "from UnicodeDatabase %s", unicode_version, unidb_version)

    # For now, simply copy the metadata and references of the source LGR
    target_metadata = copy.deepcopy(lgr.metadata)
    target_metadata.unicode_version = unicode_version
    target_reference_manager = copy.deepcopy(lgr.reference_manager)

    target_lgr = LGR(name=lgr.name,
                     metadata=target_metadata,
                     reference_manager=target_reference_manager,
                     unicode_database=unidb)

    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # A range is only added when none of its code points was
            # reported by check_range or found outside of the script.
            range_ok = True
            for cp, status in target_lgr.check_range(char.first_cp,
                                                     char.last_cp,
                                                     validating_repertoire):
                if status is not None:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'errors', []).append(status)
                    range_ok = False
                in_script, _ = lgr.cp_in_script([cp])
                if not in_script:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'warnings', []).append(CharNotInScript(cp))
                    range_ok = False

            if not range_ok:
                continue

            try:
                target_lgr.add_range(
                    char.first_cp,
                    char.last_cp,
                    comment=char.comment,
                    ref=char.references,
                    tag=char.tags,
                    when=char.when,
                    not_when=char.not_when,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=False)
            except LGRException as exc:
                result['repertoire'].setdefault(char,
                                                {}).setdefault('errors',
                                                               []).append(exc)
                logger.error("Cannot add range '%s-%s'",
                             format_cp(char.first_cp), format_cp(char.last_cp))
            # Ranges are fully handled here; the code below is for
            # single code points / sequences only.
            continue

        # Unlike for ranges, an out-of-script code point is only a warning:
        # insertion is still attempted.
        in_script, _ = lgr.cp_in_script(char.cp)
        if not in_script:
            result['repertoire'].setdefault(char, {}).setdefault(
                'warnings', []).append(CharNotInScript(char.cp))
        # Insert code point
        try:
            target_lgr.add_cp(char.cp,
                              comment=char.comment,
                              ref=char.references,
                              tag=char.tags,
                              when=char.when,
                              not_when=char.not_when,
                              validating_repertoire=validating_repertoire,
                              override_repertoire=False)
        except LGRException as exc:
            result['repertoire'].setdefault(char,
                                            {}).setdefault('errors',
                                                           []).append(exc)
            logger.error("Cannot add code point '%s'", format_cp(char.cp))
            if not isinstance(exc, CharInvalidIdnaProperty
                              ):  # Cannot include non-IDNA valid code points
                # The error is recorded above; retry in force mode, without
                # the validating repertoire.
                target_lgr.add_cp(char.cp,
                                  comment=char.comment,
                                  ref=char.references,
                                  tag=char.tags,
                                  when=char.when,
                                  not_when=char.not_when,
                                  force=True)

        # Create variants
        for var in char.get_variants():
            try:
                target_lgr.add_variant(
                    char.cp,
                    variant_cp=var.cp,
                    variant_type=var.type,
                    when=var.when,
                    not_when=var.not_when,
                    comment=var.comment,
                    ref=var.references,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=True)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault(
                    'variants', {}).setdefault(var, []).append(exc)
                logger.error("Cannot add variant '%s' to code point '%s'",
                             format_cp(var.cp), format_cp(char.cp))
                if not isinstance(
                        exc, CharInvalidIdnaProperty
                ):  # Cannot include non-IDNA valid code points
                    # Same policy as for code points: record, then retry in
                    # force mode.
                    target_lgr.add_variant(char.cp,
                                           variant_cp=var.cp,
                                           variant_type=var.type,
                                           when=var.when,
                                           not_when=var.not_when,
                                           comment=var.comment,
                                           ref=var.references,
                                           force=True)

    logger.info("Rebuilding LGR '%s' done", lgr)

    return True, result
Ejemplo n.º 6
0
class RFC4290Parser(LGRParser):
    """
    Parser building an LGR from a line-based text table.

    Each useful line starts with a code point, optionally followed by ``|``
    and a ``:``-separated list of variants; anything after ``#`` is a comment.
    """

    def unicode_version(self):
        """Return an empty string: the format carries no Unicode version."""
        # No Unicode version defined in file
        return ""

    def validate_document(self, schema=None):
        """Return True; ``schema`` is ignored as no validation is done."""
        # No validation of document done for now
        return True

    def parse_document(self):
        """
        Parse ``self.source`` into a new LGR.

        ``self.source`` may be a file-like object (anything with ``read``) or
        a path, which is then opened as UTF-8 text.

        :return: The LGR built from the document.
        """
        if not self.filename and isinstance(self.source, str):
            self.filename = os.path.basename(self.source)

        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        Errors on a line are logged; parsing then goes on with the rest of
        the document.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            if not line or line[0] == '#':
                # Blank line or full-line comment
                continue
            if UNICODE_CODEPOINT_RE.match(line) is None:
                # Line is not starting with a valid unicode code point, skip
                continue

            # Remove comments and split base character from variant(s)
            char_variant = line.split('#')[0].split('|')
            char = char_variant[0]

            try:
                codepoints = parse_char(char)
                self._lgr.add_cp(codepoints)
            except ValueError:
                logger.error("Invalid character '%s' at line %d", char,
                             line_num)
                # Without a valid base code point the variants of this line
                # cannot be attached to anything: going on would use an
                # unbound name, or the code point of a previous line.
                continue
            except LGRException as exc:
                # codepoints is valid here, so variants are still processed.
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            # Handle variants, if any
            if len(char_variant) > 1:
                for var in char_variant[1].split(':'):
                    try:
                        var_codepoints = parse_char(var)
                        self._lgr.add_variant(codepoints, var_codepoints)
                    except ValueError:
                        logger.error("Invalid variant '%s' at line %d", var,
                                     line_num)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' to code point '%s' at line %d: %s",
                            format_cp(var_codepoints), format_cp(codepoints),
                            line_num, exc)
Ejemplo n.º 7
0
class RFC3743Parser(LGRParser):
    """
    Parser building an LGR from a line-based text table.

    Besides reference and version lines, each useful line starts with a code
    point and holds up to three ``;``-separated fields: the base character,
    its preferred variants and its other variants.
    """

    def unicode_version(self):
        """Return an empty string: the format carries no Unicode version."""
        # No Unicode version defined in file
        return ""

    def validate_document(self, schema=None):
        """Return an empty string; ``schema`` is ignored as no validation is done."""
        # No validation of document done for now
        return ""

    def parse_document(self):
        """
        Parse ``self.source`` into a new LGR.

        ``self.source`` may be a file-like object (anything with ``read``) or
        a path, which is then opened as UTF-8 text.

        :return: The LGR built from the document.
        """
        if not self.filename and isinstance(self.source, str):
            self.filename = os.path.basename(self.source)

        self._lgr = LGR(name=self.filename)

        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        Errors on a line are logged; parsing then goes on with the rest of
        the document.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            if not line or line[0] == '#':
                # Blank line or full-line comment
                continue

            reference = REFERENCE_RE.match(line)
            if reference is not None:
                ref_id = reference.group('ref_id')
                value = reference.group('value')
                comment = reference.group('comment')
                try:
                    self._lgr.add_reference(value,
                                            ref_id=ref_id,
                                            comment=comment)
                except LGRException:
                    logger.error("Invalid reference '%s' on line %d", line,
                                 line_num)
                continue

            version = VERSION_RE.match(line)
            if version is not None:
                version_no = version.group('version_no')
                date = version.group('date')
                comment = version.group('comment')

                try:
                    self._lgr.metadata.version = Version(version_no,
                                                         comment=comment)
                    self._lgr.metadata.date = date
                except LGRException:
                    logger.error("Invalid version '%s' on line %d", line,
                                 line_num)
                continue

            if UNICODE_CODEPOINT_RE.match(line) is None:
                logger.debug("Skipping non-parsable line %d:\n%s", line_num,
                             line)
                # Line is not starting with a valid unicode code point, skip
                continue

            # Split base character from variant(s)
            char_variant = line.split(';')
            char = char_variant[0]

            try:
                # Exactly one (code points, references) pair is expected for
                # the base character; anything else raises ValueError too.
                [(codepoints, references)] = parse_char(char)
                self._lgr.add_cp(codepoints, ref=references)
            except ValueError:
                logger.error("Invalid character '%s' at line %d", char,
                             line_num)
                # Without a valid base code point the variants of this line
                # cannot be attached to anything: going on would use an
                # unbound name, or the code point of a previous line.
                continue
            except LGRException as exc:
                # codepoints is valid here, so variants are still processed.
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            if len(char_variant) > 1:
                preferred_variants = char_variant[1].strip()
                if len(preferred_variants
                       ) > 0 and preferred_variants[0] != '#':
                    # From RFC7940, Section 7.3. Recommended Disposition Values:
                    # activated  The resulting string should be activated for use.  (This
                    # is the same as a Preferred Variant [RFC3743].)
                    var_type = "activated"
                    self.insert_variant(line_num, codepoints,
                                        preferred_variants, var_type)

            if len(char_variant) > 2:
                variants = char_variant[2].strip()
                if len(variants) > 0 and variants[0] != '#':
                    self.insert_variant(line_num, codepoints, variants)

    def insert_variant(self, line_num, codepoints, var, var_type=None):
        """
        Add every variant described by ``var`` to the code point.

        :param line_num: Line number, only used in log messages.
        :param codepoints: Code point (sequence) receiving the variants.
        :param var: Text of the variant field, handed to ``parse_char``.
        :param var_type: Type set on the added variants, if any.
        """
        try:
            variants = parse_char(var)
        except ValueError:
            logger.error("Invalid variant '%s' at line %d", var, line_num)
            return

        for (var_codepoints, references) in variants:
            try:
                self._lgr.add_variant(codepoints,
                                      var_codepoints,
                                      ref=references,
                                      variant_type=var_type)
            except LGRException as exc:
                logger.error(
                    "Cannot add variant '%s' to code point '%s' at line %d: %s",
                    format_cp(var_codepoints), format_cp(codepoints), line_num,
                    exc)
Ejemplo n.º 8
0
class XMLParser(LGRParser):
    # Keep content intact, so do not strip CDATA section
    # (used in the <meta>/<description> element).
    # Do not resolve entities.
    # Skip comment, as we do not care.
    PARSER_OPTIONS = {
        'resolve_entities': False,
        'strip_cdata': False,
        'remove_comments': True
    }

    def __init__(self, *args, **kwargs):
        """
        Set up the parser.

        ``force_mode`` (keyword only, True when not given) is consumed here;
        all other arguments are handed to the base class unchanged.
        """
        # Take the option out of kwargs so the base class never sees it.
        force_mode = kwargs.pop('force_mode', True)

        super(XMLParser, self).__init__(*args, **kwargs)
        self.force_mode = force_mode
        self.rfc7940_checks = LGRFormatTestResults()

    def validate_document(self, rng_schema_path):
        """
        Validate the XML source against a RelaxNG schema.

        The 'schema' check is marked as tested in ``self.rfc7940_checks``,
        and as an error when validation fails.

        :param rng_schema_path: Location of the RelaxNG schema.
        :return: None when the document is valid, otherwise the validator's
                 error log (or a fixed message when that log is empty).
        """
        # Build the RelaxNG validator, then parse the XML file
        relaxng = etree.RelaxNG(file=rng_schema_path)
        document = etree.parse(self.source,
                               parser=etree.XMLParser(**self.PARSER_OPTIONS))

        logger.debug("Validating document '%s' with RNG '%s'", self.source,
                     rng_schema_path)

        if relaxng.validate(document):
            error_log = None
        else:
            logger.warning("Validation of document '%s' failed", self.source)
            self.rfc7940_checks.error('schema')
            error_log = relaxng.error_log
            if len(error_log) == 0:
                # Bug in LXML, see https://bugs.launchpad.net/lxml/+bug/1526522
                error_log = "CANNOT VALIDATE XML"

        self.rfc7940_checks.tested('schema')
        return error_log

    def unicode_version(self):
        """
        Read the Unicode version from the <meta> element of the source.

        The LGR built while doing so is dropped, and a seekable source is
        rewound afterwards.

        :return: The ``unicode_version`` found in the LGR metadata.
        """
        logger.debug("Get unicode version from meta")
        # Restrict parsing to the "meta" element; comments are of no interest.
        meta_events = etree.iterparse(self.source,
                                      tag=META_TAG,
                                      **self.PARSER_OPTIONS)
        # Presumably sets self._lgr from the <meta> element (helper not
        # shown here).
        self._fast_iter(meta_events)
        version = self._lgr.metadata.unicode_version
        self._lgr = None

        # Parsing may have left the file position at the end of the
        # document: go back to the beginning.
        source = self.source
        if hasattr(source, "seek"):
            source.seek(0)
        return version

    def parse_document(self):
        """
        Parse the whole XML source.

        A seekable source is rewound afterwards and the 'parse_xml' check is
        marked as tested.

        :return: ``self._lgr`` as it is once parsing is over.
        """
        logger.debug('Start parsing of file: %s', self.filename)

        # PARSER_OPTIONS keeps CDATA sections (used in the
        # <meta>/<description> element), leaves entities unresolved and
        # drops comments.
        events = etree.iterparse(self.source, **self.PARSER_OPTIONS)
        self._fast_iter(events)

        # Parsing may have left the file position at the end of the
        # document: go back to the beginning.
        source = self.source
        if hasattr(source, "seek"):
            source.seek(0)

        self.rfc7940_checks.tested('parse_xml')
        return self._lgr

    def _process_meta(self, elem):
        """
        Process the <meta> element of an LGR XML file.

        A Metadata object and a ReferenceManager are filled from the children
        of ``elem``, then a new LGR made from them is stored in ``self._lgr``.
        An unknown child is logged and flagged as a 'parse_xml' error.

        :param elem: The <meta> element; each child is cleared once handled.
        """
        metadata = Metadata(self.rfc7940_checks)
        reference_manager = ReferenceManager()
        # Children whose text is simply handed to a Metadata setter
        MAPPER = {
            DATE_TAG:
            lambda d: metadata.set_date(d, force=self.force_mode),
            VALIDITY_START_TAG:
            lambda d: metadata.set_validity_start(d, force=self.force_mode),
            VALIDITY_END_TAG:
            lambda d: metadata.set_validity_end(d, force=self.force_mode),
            UNICODE_VERSION_TAG:
            lambda d: metadata.set_unicode_version(d, force=self.force_mode),
        }
        unicode_version_tag_found = False
        for child in elem:
            tag = child.tag
            logger.debug("Got '%s' element", tag)
            if tag in MAPPER:
                MAPPER[tag](child.text)
                if tag == UNICODE_VERSION_TAG:
                    unicode_version_tag_found = True
            elif tag == VERSION_TAG:
                metadata.version = Version(child.text,
                                           child.get('comment', None))
            elif tag == LANGUAGE_TAG:
                metadata.add_language(child.text, force=self.force_mode)
            elif tag == SCOPE_TAG:
                metadata.scopes.append(
                    Scope(child.text, child.get('type', None)))
            elif tag == DESCRIPTION_TAG:
                # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449
                # For now, manually replace CRLF with LF
                # The text of an empty element is None: treat it as an empty
                # description rather than failing on None.replace().
                description_text = child.text or ''
                metadata.description = Description(
                    description_text.replace('\r\n', '\n'),
                    child.get('type', None))
            elif tag == REFERENCES_TAG:
                for reference in child:
                    value = reference.text
                    # Don't convert it to an int since ref_id may be a string
                    ref_id = reference.get('id')
                    comment = reference.get('comment', None)
                    reference_manager.add_reference(value,
                                                    comment=comment,
                                                    ref_id=ref_id)
            else:
                logger.warning("Unhandled '%s' element in <meta> section", tag)
                self.rfc7940_checks.error('parse_xml')
            # Clean-up the element (and the <reference> children processed
            # above, if any) now that it has been handled.
            child.clear()

        self.rfc7940_checks.add_test_result('explicit_unicode_version',
                                            unicode_version_tag_found)
        self._lgr = LGR(name=self.filename,
                        metadata=metadata,
                        reference_manager=reference_manager,
                        unicode_database=self._unicode_database)

    def _process_data(self, elem):
        """
        Process the <data> element of an LGR XML file.

        Every "char" child is added to ``self._lgr`` together with its
        variants, and every "range" child is added as a range. An
        LGRException is logged and recorded in ``self.rfc7940_checks``; it is
        re-raised unless ``self.force_mode`` is set.

        :param elem: The <data> element; each child is cleared once handled.
        """

        # It is RECOMMENDED to list all "char" elements in ascending order of
        # the "cp" attribute. The below variable is used when verifying that.
        previous_codepoint = []

        for child in elem:
            # Attributes read the same way for "char" and "range" elements
            comment = child.get('comment', None)
            when = child.get('when', None)
            not_when = child.get('not-when', None)

            # Handle references
            ref = string_to_list(child.get('ref', ''))

            # Handle tags
            tag = string_to_list(child.get('tag', ''))

            if child.tag == CHAR_TAG:
                # "cp" holds whitespace-separated hexadecimal values
                codepoint = [int(c, 16) for c in child.get('cp').split()]

                # Lists compare lexicographically, element by element
                if codepoint <= previous_codepoint:
                    # True when codepoint equals the previous one or is a
                    # prefix of it
                    if previous_codepoint[0:len(codepoint)] == codepoint:
                        # Not clear what order is to be recommended here
                        self.rfc7940_checks.error(
                            'char_strict_ascending_order')
                    else:
                        logger.warning(
                            "cp attribute not in ascending order: '%s'",
                            child.get('cp'))
                        self.rfc7940_checks.error('char_ascending_order')
                previous_codepoint = codepoint

                try:
                    self._lgr.add_cp(codepoint,
                                     comment=comment,
                                     ref=ref,
                                     tag=tag,
                                     when=when,
                                     not_when=not_when,
                                     force=self.force_mode)
                except LGRException as exc:
                    logger.error("Cannot add code point '%s': %s",
                                 format_cp(codepoint), exc)
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    if not self.force_mode:
                        raise

                # Variants of char
                for variant in child.iter(VARIANT_TAG):
                    var_codepoint = [
                        int(c, 16) for c in variant.get('cp').split()
                    ]
                    # From here on, when/not_when/comment/ref hold the
                    # variant's values instead of the char's.
                    when = variant.get('when', None)
                    not_when = variant.get('not-when', None)
                    variant_type = variant.get('type', None)
                    comment = variant.get('comment', None)

                    # Handle references
                    ref = string_to_list(variant.get('ref', ''))

                    try:
                        self._lgr.add_variant(codepoint,
                                              var_codepoint,
                                              variant_type=variant_type,
                                              when=when,
                                              not_when=not_when,
                                              comment=comment,
                                              ref=ref,
                                              force=self.force_mode)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' "
                            "to code point '%s': %s", format_cp(var_codepoint),
                            format_cp(codepoint), exc)
                        self.rfc7940_checks.error('parse_xml')
                        self.rfc7940_checks.error('codepoint_valid')
                        if not self.force_mode:
                            raise
            elif child.tag == RANGE_TAG:
                # Range bounds are hexadecimal values
                first_cp = int(child.get('first-cp'), 16)
                last_cp = int(child.get('last-cp'), 16)

                try:
                    self._lgr.add_range(first_cp,
                                        last_cp,
                                        comment=comment,
                                        ref=ref,
                                        tag=tag,
                                        when=when,
                                        not_when=not_when,
                                        force=self.force_mode)
                except LGRException as exc:
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    logger.error("Cannot add range '%s-%s': %s",
                                 format_cp(first_cp), format_cp(last_cp), exc)
                    if not self.force_mode:
                        raise

            # Clean-up the element now that it has been handled
            child.clear()

        # Both ordering checks have now been run over the whole element
        self.rfc7940_checks.tested('char_ascending_order')
        self.rfc7940_checks.tested('char_strict_ascending_order')

    def _process_rules(self, elem):
        """
        Process the <rules> element of an LGR XML file.

        Every class (or class combinator), rule and action child is parsed
        and registered on the LGR being built. The text serialization of the
        element, taken after `drop_ns`, is appended to the matching `*_xml`
        list of the LGR. Any other child is logged and recorded as a
        'parse_xml' error.

        :param elem: The <rules> element.
        """
        lgr = self._lgr
        for node in elem:
            node_tag = node.tag
            if node_tag == CLASS_TAG or node_tag in COMBINATOR_TAGS:
                lgr.add_class(self._parse_class(node), force=self.force_mode)
                xml_store = lgr.classes_xml
            elif node_tag == RULE_TAG:
                lgr.add_rule(self._parse_rule(node), force=self.force_mode)
                xml_store = lgr.rules_xml
            elif node_tag == ACTION_TAG:
                lgr.add_action(self._parse_action(node), force=self.force_mode)
                xml_store = lgr.actions_xml
            else:
                logger.warning("Unhandled '%s' element in <rules> section",
                               node_tag)
                self.rfc7940_checks.error("parse_xml")
                node.clear()
                continue

            # Only the "text" version of the element is kept; whatever
            # drop_ns() returned is cleared once it has been serialized.
            stripped = drop_ns(node)
            xml_store.append(etree.tostring(stripped, encoding=text_type))
            stripped.clear()

    def _parse_rule(self, elem):
        """
        Build a Rule object from a <rule> element.

        The 'name', 'comment', 'ref' and 'by-ref' attributes are read from
        the element; each child element is then handed to
        `_parse_rule_helper` together with the new rule.

        :param elem: The <rule> element.
        :return: The rule object created.
        """
        name = elem.get('name')
        comment = elem.get('comment')
        references = string_to_list(elem.get('ref', ''))
        by_ref = elem.get('by-ref')

        parsed = Rule(name=name,
                      comment=comment,
                      ref=references,
                      by_ref=by_ref)
        for node in elem:
            self._parse_rule_helper(node, parsed)
        return parsed

    def _parse_rule_helper(self, child, rule):
        """
        Convert one child of a <rule> element into a matcher.

        This function is to be called on children of a top-level <rule>; it
        calls itself for the children of <choice>, <look-ahead> and
        <look-behind> style containers, passing the container as `rule`.

        An unrecognised tag is logged and recorded as a 'parse_xml' error;
        nothing is added to `rule` in that case.

        :param child: Child element of a top-level <rule> element.
        :param rule: The object to add the parsed content to.
        """
        tag = child.tag
        comment = child.get('comment')
        count = child.get('count')

        if tag == ANCHOR_TAG:
            matcher = AnchorMatcher(comment=comment)
        elif tag == END_TAG:
            matcher = EndMatcher(comment=comment)
        elif tag == START_TAG:
            matcher = StartMatcher(comment=comment)
        elif tag == ANY_TAG:
            matcher = AnyMatcher(comment=comment, count=count)
        elif tag == CHAR_TAG:
            code_points = cp_or_sequence_from_class(child)
            matcher = CharMatcher(code_points, comment=comment, count=count)
        elif tag == CHOICE_TAG:
            matcher = ChoiceMatcher(comment=comment, count=count)
            for nested in child:
                self._parse_rule_helper(nested, matcher)
        elif tag == LOOKAHEAD_TAG:
            matcher = LookAheadMatcher(comment=comment)
            for nested in child:
                self._parse_rule_helper(nested, matcher)
        elif tag == LOOKBEHIND_TAG:
            matcher = LookBehindMatcher(comment=comment)
            for nested in child:
                self._parse_rule_helper(nested, matcher)
        elif tag == RULE_TAG:
            nested_rule = self._parse_rule(child)
            matcher = RuleMatcher(nested_rule, comment=comment, count=count)
        elif tag == CLASS_TAG or tag in COMBINATOR_TAGS:
            parsed_class = self._parse_class(child)
            matcher = ClassMatcher(parsed_class, comment=comment, count=count)
        else:
            logger.warning("Unhandled '%s' element in <rule> object", tag)
            self.rfc7940_checks.error('parse_xml')
            return

        # Containers are attached only after their own children were parsed.
        rule.add_child(matcher)

    def _parse_action(self, elem):
        """
        Build an Action object from an <action> element.

        The 'ref', 'any-variant', 'all-variants' and 'only-variants'
        attributes go through `string_to_list` (an absent attribute is
        passed as an empty string); the other attributes are forwarded as
        read, with None standing for an absent attribute.

        :param elem: The <action> element.
        :return: The action object created.
        """
        def as_list(attribute):
            return string_to_list(elem.get(attribute, ''))

        options = {
            'comment': elem.get('comment'),
            'match': elem.get('match'),
            'not_match': elem.get('not-match'),
            'any_variant': as_list('any-variant'),
            'all_variants': as_list('all-variants'),
            'only_variants': as_list('only-variants'),
            'ref': as_list('ref'),
        }
        return Action(elem.get('disp'), **options)

    def _parse_class(self, elem):
        """
        Parse a <class> element or a class combinator element.

        A <class> element yields a Class whose code points come either from
        the element text (when it has no child) or from its children. A
        combinator element yields the matching combinator class, populated
        by parsing its children recursively.

        An element with any other tag is logged and recorded as a
        'parse_xml' error. When such an element is found among the children
        of a combinator it is skipped.

        :param elem: The element to parse.
        :return: The Class object created, or None if the element's tag is
                 not handled.
        """
        tag = elem.tag
        name = elem.get('name', None)
        comment = elem.get('comment', None)

        if tag == CLASS_TAG:
            cls = Class(name=name,
                        comment=comment,
                        ref=string_to_list(elem.get('ref', '')),
                        from_tag=elem.get('from-tag', None),
                        unicode_property=elem.get('property', None),
                        by_ref=elem.get('by-ref', None))
            if len(elem) == 0 and elem.text:
                # No child, code point(s) defined in text
                cls.add_codepoint(cp_or_sequence_from_class(elem))
            for child in elem:
                cls.add_codepoint(cp_or_sequence_from_class(child))
            return cls

        if tag in COMBINATOR_TAGS:
            MAPPING = {
                UNION_TAG: UnionClass,
                COMPLEMENT_TAG: ComplementClass,
                INTERSECTION_TAG: IntersectionClass,
                DIFFERENCE_TAG: DifferenceClass,
                SYM_DIFFERENCE_TAG: SymmetricDifferenceClass
            }
            cls = MAPPING[tag](name=name, comment=comment)
            # TODO: ensure number of children
            for child in elem:
                member = self._parse_class(child)
                # An unhandled child has already been reported by the
                # recursive call; do not attach a None operand.
                if member is not None:
                    cls.add_child(member)
            return cls

        # Previously this path fell through to `return cls` with `cls`
        # never assigned, turning the warning into an UnboundLocalError.
        logger.warning("Unhandled '%s' element in <class> object", tag)
        self.rfc7940_checks.error('parse_xml')
        return None

    def _fast_iter(self, context):
        """
        Iterator used to incrementally parse the XML file.

        The <meta>, <data> and <rules> elements are dispatched to their
        `_process_*` method and cleared afterwards; other elements are
        ignored and left untouched.

        :param context: Iterable of (event, element) pairs; the event is
                        not used.
        """
        metadata_added = False
        for _, elem in context:
            tag = elem.tag
            if tag == META_TAG:
                logger.debug("Got 'meta' element")
                self._process_meta(elem)
                # Remember it so the fallback below is not triggered.
                metadata_added = True
            elif tag == DATA_TAG:
                if not metadata_added:
                    # The optional "meta" element is not present since it
                    # must precede the required data element.
                    # However, we still have to call _process_meta.
                    # (The check used to compare the element itself, not
                    # its tag, to DATA_TAG and therefore never fired.)
                    self._process_meta({})
                    metadata_added = True
                logger.debug("Got 'data' element")
                self._process_data(elem)
            elif tag == RULES_TAG:
                logger.debug("Got 'rules' element")
                self._process_rules(elem)
            else:
                continue
            # Clean-up memory
            elem.clear()
        del context
Ejemplo n.º 9
0
class TestPopulate(unittest.TestCase):
    """Check the log output and resulting variants of populate_lgr()."""

    def setUp(self):
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.INFO)
        logger = logging.getLogger('lgr.populate')
        logger.addHandler(ch)
        logger.setLevel(logging.INFO)
        # The logger outlives the test case: detach the handler again so
        # handlers do not pile up on it from one test to the next.
        self.addCleanup(logger.removeHandler, ch)

    def test_no_symmetric_in_repertoire(self):
        # The variant is not in the repertoire at all.
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add missing code point 'U+0062' in LGR as it is a variant of 'U+0061'\n"
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n",
            log_content)
        self.assertIn(0x0062, self.lgr.repertoire)
        new_variant = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, )],
                         [c.cp for c in new_variant.get_variants()])

    def test_no_symmetric_in_repertoire_twice(self):
        # The same missing code point is a variant of two code points.
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_variant([0x0061], [0x0063])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0061])
        self.lgr.add_variant([0x0062], [0x0063])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add missing code point 'U+0063' in LGR as it is a variant of 'U+0061'\n"
            "Add code point 'U+0061' as variant of 'U+0063' for symmetry\n"
            "Add code point 'U+0062' as variant of 'U+0063' for symmetry\n",
            log_content)
        self.assertIn(0x0063, self.lgr.repertoire)
        new_variant = self.lgr.get_char([0x0063])
        self.assertEqual([(0x0061, ), (0x0062, )],
                         [c.cp for c in new_variant.get_variants()])

    def test_no_symmetric_in_variants(self):
        # The variant is in the repertoire but lacks the reverse mapping.
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n",
            log_content)
        cp = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, )], [c.cp for c in cp.get_variants()])

    def test_no_transitivity(self):
        # a -> b and b -> c are defined; everything else must be added.
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])
        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n"
            "Add code point 'U+0062' as variant of 'U+0063' for symmetry\n"
            "Add code point 'U+0063' as variant of 'U+0061' for transitivity with 'U+0062'\n"
            "Add code point 'U+0061' as variant of 'U+0063' for transitivity with 'U+0062'\n",
            log_content)
        cp = self.lgr.get_char([0x0061])
        self.assertEqual([(0x0062, ), (0x0063, )],
                         [c.cp for c in cp.get_variants()])
        cp = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, ), (0x0063, )],
                         [c.cp for c in cp.get_variants()])
        cp = self.lgr.get_char([0x0063])
        self.assertEqual([(0x0061, ), (0x0062, )],
                         [c.cp for c in cp.get_variants()])
Ejemplo n.º 10
0
class TestLGRCore(unittest.TestCase):
    """Tests of the LGR object API: code points, ranges, variants, labels."""

    def setUp(self):
        unidb = IDNADatabase('6.3.0')
        self.lgr = LGR(unicode_database=unidb)

    # -- Adding and deleting code points ----------------------------------

    def test_add_single_cp_list(self):
        self.lgr.add_cp([0x0061])
        self.assertIn(0x0061, self.lgr.repertoire)

    def test_add_single_cp_int(self):
        self.lgr.add_cp(0x0061)
        self.assertIn(0x0061, self.lgr.repertoire)

    def test_add_cp_sequence(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)

    def test_add_multiple_cp_sequences(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertIn([0x0061, 0x0062, 0x0063], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)
        self.assertNotIn(0x0063, self.lgr.repertoire)

    def test_add_cp_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, [0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, 0x0061)

    def test_add_cp_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_cp, [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_cp_validation_override(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.lgr.add_cp([0x0062],
                        validating_repertoire=validation_lgr,
                        override_repertoire=True)
        self.assertIn(0x0062, self.lgr.repertoire)

    def test_del_single_cp_list(self):
        self.lgr.add_cp(0x0061)
        self.lgr.del_cp([0x0061])
        self.assertNotIn(0x0061, self.lgr.repertoire)

    def test_del_single_cp_int(self):
        self.lgr.add_cp([0x0061])
        self.lgr.del_cp(0x0061)
        self.assertNotIn(0x0061, self.lgr.repertoire)

    def test_del_cp_sequence(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.del_cp([0x0061, 0x0062])
        self.assertEqual(len(self.lgr.repertoire), 0)

    def test_del_cp_sequence_with_cp(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0061)
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0062)
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)

    def test_add_cp_when_not_when(self):
        self.lgr.add_cp([0x0061], when='w1')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0062], when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0062])

        self.lgr.add_cp([0x0062], not_when='nw2')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0063], when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0063])

    # -- Ranges -----------------------------------------------------------

    def test_add_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIn(cp, self.lgr.repertoire)

    def test_add_range_in_repertoire(self):
        self.lgr.add_range(0x0061, 0x007A)
        self.assertRaises(CharAlreadyExists, self.lgr.add_range, 0x0061,
                          0x007A)

    def test_add_range_validation(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A + 1):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0061,
                           0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range,
                          0x00F8,
                          0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_range_validation_with_range(self):
        validation_lgr = LGR()
        validation_lgr.add_range(0x0061, 0x007A)
        self.lgr.add_range(0x0061,
                           0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range,
                          0x00F8,
                          0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_range_validation_override(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0031,
                           0x0032,
                           validating_repertoire=validation_lgr,
                           override_repertoire=True)
        self.assertIn(0x0031, self.lgr.repertoire)

    def test_add_range_when_not_when(self):
        self.lgr.add_range(0x0061, 0x0065, when='w1')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x0066, 0x007A, when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x0066)
        self.assertEqual(the_exception.last_cp, 0x007A)

        self.lgr.add_range(0x0066, 0x007A, not_when='nw2')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x01BD, 0x01C3, when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x01BD)
        self.assertEqual(the_exception.last_cp, 0x01C3)

    def test_expand_ranges(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)
        self.lgr.add_range(0x01BD, 0x01C3)
        for cp in range(0x01BD, 0x01C3 + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_ranges()
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)
        for cp in range(0x01BD, 0x01C3 + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    def test_expand_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    # -- Variants ---------------------------------------------------------

    def test_add_variant_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.assertRaises(VariantAlreadyExists, self.lgr.add_variant, [0x0061],
                          [0x0030])

    def test_add_variant_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        self.assertRaises(NotInRepertoire,
                          self.lgr.add_variant, [0x0061], [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_variant_when_not_when(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], when='w1')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031], when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

        self.lgr.add_variant([0x0061], [0x0030], not_when='nw2')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031], when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

    def test_del_cp_validation_override(self):
        # NOTE(review): despite its name this test exercises add_variant()
        # with override_repertoire=True; nothing is deleted here.
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        self.lgr.add_variant([0x0061], [0x0062],
                             validating_repertoire=validation_lgr,
                             override_repertoire=True)
        self.assertIn((0x0062, ), self.lgr.repertoire[0x0061]._variants)

    def test_get_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])

        variants = self.lgr.get_variants([0x0061])
        self.assertIsInstance(variants, types.GeneratorType)

        variant_list = list(variants)

        self.assertEqual(len(variant_list), 1)

    # -- Bulk helpers -----------------------------------------------------

    def test_check_range_no_modification(self):
        self.lgr.check_range(0x0060, 0x007F)

        self.assertEqual(len(self.lgr.repertoire), 0)

    def test_check_range(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x007A])

        codepoints = self.lgr.check_range(0x0060, 0x007F)

        for result in codepoints:
            cp = result[0]
            prop = result[1]
            if cp == 0x0060 or cp >= 0x007B:
                self.assertIsInstance(prop, CharInvalidIdnaProperty)
            elif cp == 0x0061 or cp == 0x007A:
                self.assertIsInstance(prop, CharAlreadyExists)
            else:
                self.assertIsNone(prop)

    def test_add_codepoints(self):
        self.lgr.add_codepoints(list(range(0x0061, 0x007A + 1)) +
                                [0x0107] + [0x0137, 0x0138])

        expected_output = [
            RangeChar(0x0061, 0x0061, 0x007A),
            Char(0x0107),
            RangeChar(0x0137, 0x0137, 0x0138)
        ]

        self.assertEqual(expected_output, list(self.lgr.repertoire))

    # -- Tags and references ----------------------------------------------

    def test_tags_on_codepoint(self):
        self.lgr.add_cp([0x0061], tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0062], tag=['t1', 't1'])

        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    def test_tags_on_codepoint_sequence(self):
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0061, 0x0062], tag=['t1'])

        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.SEQUENCE_NO_TAG)

    def test_tags_on_range(self):
        self.lgr.add_range(0x0061, 0x0062, tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_range(0x0063, 0x0064, tag=['t1', 't1'])

        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    def test_list_types(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], variant_type='BLOCK')
        self.lgr.add_variant([0x0061], [0x0031], variant_type='VALID')
        self.lgr.add_variant([0x0061], [0x0032], variant_type='BLOCK')

        # assertEqual, not the assertEquals alias removed in Python 3.12.
        self.assertEqual(self.lgr.types, set(['BLOCK', 'VALID']))

    def test_del_reference(self):
        ref_id_1 = self.lgr.add_reference("Test - 1")
        ref_id_2 = self.lgr.add_reference("Test - 2")

        self.lgr.add_cp([0x0061], ref=[ref_id_1])
        self.lgr.add_cp([0x0062], ref=[ref_id_1, ref_id_2])

        self.lgr.del_reference(ref_id_1)

        self.assertNotIn(ref_id_1, self.lgr.reference_manager)
        self.assertEqual(self.lgr.get_char([0x0061]).references, [])
        self.assertEqual(self.lgr.get_char([0x0062]).references, [ref_id_2])

    def test_add_cp_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_cp([0x0061], ref=[ref_id, ref_id])

        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    def test_add_range_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_range(0x0061, 0x0062, ref=[ref_id, ref_id])

        the_exception = cm.exception
        self.assertEqual(the_exception.cp, 0x0061)

    def test_add_variant_duplicate_reference(self):
        self.lgr.add_cp([0x0061])
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_variant([0x0061], [0x0062], ref=[ref_id, ref_id])

        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    # -- Label variants and eligibility -----------------------------------

    def test_generate_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])
        self.lgr.add_cp([0x0064])

        self.lgr.add_variant([0x0061], [0x0070], variant_type="type0")
        self.lgr.add_variant([0x0062], [0x0071], variant_type="type1")
        self.lgr.add_variant([0x0062], [0x0072], variant_type="type2")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([], list(self.lgr._generate_label_variants([0x0063])))
        self.assertEqual(
            [], list(self.lgr._generate_label_variants([0x0063, 0x0064])))
        self.assertEqual(
            set([((0x0071, 0x0063), frozenset(['type1']), False),
                 ((0x0072, 0x0063), frozenset(['type2']), False)]),
            set(self.lgr._generate_label_variants([0x0062, 0x0063])))
        self.assertEqual(
            set([
                ((0x0061, 0x0062), frozenset(), False),
                ((0x0061, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0071), frozenset(['type0', 'type1']), True),
                ((0x0070, 0x0072), frozenset(['type0', 'type2']), True),
            ]), set(self.lgr._generate_label_variants([0x0061, 0x0062])))
        self.assertEqual(
            set([
                ((0x0061, 0x0062, 0x0062), frozenset(), False),
                ((0x0061, 0x0062, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0062, 0x0072), frozenset(['type2']), False),
                ((0x0061, 0x0071, 0x0062), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0072), frozenset(['type1',
                                                      'type2']), False),
                ((0x0061, 0x0072, 0x0062), frozenset(['type2']), False),
                ((0x0061, 0x0072, 0x0071), frozenset(['type1',
                                                      'type2']), False),
                ((0x0061, 0x0072, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0062, 0x0071), frozenset(['type0',
                                                      'type1']), False),
                ((0x0070, 0x0062, 0x0072), frozenset(['type0',
                                                      'type2']), False),
                ((0x0070, 0x0071, 0x0062), frozenset(['type0',
                                                      'type1']), False),
                ((0x0070, 0x0071, 0x0071), frozenset(['type0',
                                                      'type1']), True),
                ((0x0070, 0x0071, 0x0072),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0062), frozenset(['type0',
                                                      'type2']), False),
                ((0x0070, 0x0072, 0x0071),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0072), frozenset(['type0',
                                                      'type2']), True),
            ]), set(self.lgr._generate_label_variants([0x0061, 0x0062,
                                                       0x0062])))

    def test_generate_variants_reflexive(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])

        self.lgr.add_variant([0x0062], [0x0062], variant_type="reflexive")
        self.lgr.add_variant([0x0063], [0x0070], variant_type="type")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([], list(self.lgr._generate_label_variants([0x0061])))
        self.assertEqual([((0x0062, ), frozenset(['reflexive']), True)],
                         list(self.lgr._generate_label_variants([0x0062])))
        self.assertEqual(
            set([
                ((0x0062, 0x0063), frozenset(['reflexive']), False),
                ((0x0062, 0x0070), frozenset(['reflexive', 'type']), True),
            ]), set(self.lgr._generate_label_variants([0x0062, 0x0063])))

    def test_label_simple(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062, 0x0063])
        self.lgr.add_range(0x0064, 0x0068)

        valid_labels = ([0x0061], [0x0062, 0x0063], [0x0064], [0x0068],
                        [0x0061, 0x0064], [0x0061, 0x0062, 0x0063,
                                           0x0064], [0x0062, 0x0063, 0x0068])
        # Each entry: (label, expected second element, expected third
        # element) of the tuple returned for a label that is not eligible.
        invalid_labels = (([0x0060], [], [0x0060]), ([0x0069], [], [0x0069]),
                          ([0x0062], [], [0x0062]), ([0x0063], [], [0x0063]),
                          ([0x0061, 0x0062], [0x0061], [0x0062]))

        for label in valid_labels:
            self.assertEqual((True, label, []),
                             self.lgr._test_preliminary_eligibility(label))
        for (label, label_part, not_in_lgr) in invalid_labels:
            self.assertEqual((False, label_part, not_in_lgr),
                             self.lgr._test_preliminary_eligibility(label))

    def test_label_eligibility_multiple_choices(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.lgr.add_cp([0x0064])

        self.assertEqual(self.lgr._test_preliminary_eligibility([0x0062]),
                         (False, [], [0x0062]))
        self.assertEqual(
            self.lgr._test_preliminary_eligibility(
                [0x0061, 0x0062, 0x0063, 0x0064]),
            (True, [0x0061, 0x0062, 0x0063, 0x0064], []))

    def test_label_delayed_eligibilty(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'block')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'invalid')
        self.lgr.add_cp([0x0063, 0x0064])
        self.lgr.add_variant([0x0063, 0x0064], [0x0063, 0x0064], 'invalid')

        self.assertEqual(self.lgr._test_label_disposition([0x0062]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0063, 0x0064]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0061, 0x0062]),
                         ('invalid', 0))

    def test_label_length(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'disp')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'disp')

        self.assertEqual(PROTOCOL_LABEL_MAX_LENGTH,
                         self.lgr.max_label_length())

        for i in range(80):
            self.lgr.add_variant([0x0062], [0x074D + i], 'disp')

        # 41: mean number of variants per character
        self.assertEqual(int(math.log(MAX_NUMBER_GENERATED_VARIANTS, 41)),
                         self.lgr.max_label_length())
Ejemplo n.º 11
0
class TestStats(unittest.TestCase):
    """Exercise ``compute_stats`` on LGRs with various contents."""

    # Statistics expected for an empty LGR; each test overrides only the
    # entries it expects to differ.
    STATS = {
        'codepoint_number': 0,

        'range_number': 0,
        'largest_range': None,
        'largest_range_len': 0,

        'sequence_number': 0,
        'largest_sequence': None,
        'largest_sequence_len': 0,

        'codepoints_with_variants': 0,
        'mapping_number': 0,
        'variants_by_type': {},
        'largest_variant_set': 0,

        'average_variants': 0,

        'codepoints_by_tag': {},

        'rule_number': 0
    }

    def setUp(self):
        """Start every test from a freshly created LGR."""
        self.lgr = LGR()

    def _assert_stats(self, result, **overrides):
        """Compare *result* with the baseline STATS updated by *overrides*."""
        expected_stats = dict(self.STATS, **overrides)
        self.assertDictEqual(result, {'description': 'Generate stats',
                                      'stats': expected_stats})

    def test_empty_lgr(self):
        """An empty LGR yields the baseline statistics."""
        _, result = compute_stats(self.lgr, {})
        self._assert_stats(result)

    def test_lgr_chars(self):
        """Two code points, one of them tagged 'test'."""
        self.lgr.add_cp(0x0061)
        self.lgr.add_cp(0x0062, tag=['test'])
        _, result = compute_stats(self.lgr, {})
        self._assert_stats(result,
                           codepoint_number=2,
                           codepoints_by_tag={'test': 1})

    def test_lgr_ranges(self):
        """Two ranges; the five code point one is expected as the largest."""
        self.lgr.add_range(0x0061, 0x0065)
        self.lgr.add_range(0x0066, 0x0068)
        _, result = compute_stats(self.lgr, {})
        self._assert_stats(result,
                           codepoint_number=8,
                           range_number=2,
                           largest_range=RangeChar(0x0061, 0x0061, 0x0065),
                           largest_range_len=5)

    def test_lgr_sequence(self):
        """Two sequences; the three code point one is expected as the largest."""
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.lgr.add_cp([0x0061, 0x0062])
        _, result = compute_stats(self.lgr, {})
        longest = CharSequence(cp_or_sequence=(0x0061, 0x0062, 0x0063))
        self._assert_stats(result,
                           codepoint_number=2,
                           sequence_number=2,
                           largest_sequence=longest,
                           largest_sequence_len=3)

    def test_lgr_variants(self):
        """Three code points carrying four variant mappings in total."""
        for cp in (0x0061, 0x0062, 0x0063):
            self.lgr.add_cp(cp)
        self.lgr.add_variant(0x0061, 0x0062)
        self.lgr.add_variant(0x0061, 0x0063)
        self.lgr.add_variant(0x0062, 0x0061)
        self.lgr.add_variant(0x0063, 0x0061, variant_type='blocked')
        _, result = compute_stats(self.lgr, {})
        self._assert_stats(result,
                           codepoint_number=3,
                           codepoints_with_variants=3,
                           mapping_number=4,
                           variants_by_type={None: 3, 'blocked': 1},
                           largest_variant_set=3,
                           average_variants=round(4 / 3, 1))

    def test_lgr_rules(self):
        """Two rules are counted in 'rule_number'."""
        for rule_name in ('rule1', 'rule2'):
            self.lgr.add_rule(Rule(name=rule_name))
        _, result = compute_stats(self.lgr, {})
        self._assert_stats(result, rule_number=2)
Ejemplo n.º 12
0
class TestRebuildLGR(unittest.TestCase):
    """Exercise ``rebuild_lgr`` and the report dictionary it returns."""

    # Unicode version the tests expect in the report when none is set.
    DEFAULT_UNICODE_VERSION = '6.3.0'

    def setUp(self):
        """Start every test from a freshly created LGR."""
        self.lgr = LGR()

    def _default_description(self):
        """Return the description expected for the default Unicode version."""
        return 'Rebuilding LGR with Unicode version {}'.format(
            self.DEFAULT_UNICODE_VERSION)

    def _rebuild_with_default_unidb(self):
        """Attach a default-version IDNA database to the LGR and rebuild it."""
        unidb = IDNADatabase(self.DEFAULT_UNICODE_VERSION)
        self.lgr.unicode_database = unidb
        return rebuild_lgr(self.lgr, {'unidb': unidb})

    def test_empty_lgr(self):
        """An empty LGR produces an empty repertoire report."""
        _, result = rebuild_lgr(self.lgr, {})
        expected = {'description': self._default_description(),
                    'repertoire': {}}
        self.assertDictEqual(result, expected)

    def test_lgr_non_default_unicode(self):
        """The Unicode version set in the metadata shows up in the description."""
        self.lgr.metadata.set_unicode_version('6.2.0')
        _, result = rebuild_lgr(self.lgr, {})
        expected = {'description': 'Rebuilding LGR with Unicode version 6.2.0',
                    'repertoire': {}}
        self.assertDictEqual(result, expected)

    def test_lgr_validating_repertoire(self):
        """A validating repertoire is mentioned in the description."""
        validating_repertoire = LGR(name='validating')
        options = {'validating_repertoire': validating_repertoire}
        _, result = rebuild_lgr(self.lgr, options)
        description = ("Rebuilding LGR with Unicode version {} "
                       "and validating repertoire '{}'").format(
                           self.DEFAULT_UNICODE_VERSION, validating_repertoire)
        self.assertDictEqual(result, {'description': description,
                                      'repertoire': {}})

    def test_lgr_unidb_same_unicode(self):
        """A database matching the default version adds nothing to the report."""
        unidb = IDNADatabase('6.3.0')
        _, result = rebuild_lgr(self.lgr, {'unidb': unidb})
        expected = {'description': self._default_description(),
                    'repertoire': {}}
        self.assertDictEqual(result, expected)

    def test_lgr_unidb_different_unicode(self):
        """A database with another version adds a 'generic' message."""
        unidb = IDNADatabase('6.2.0')
        _, result = rebuild_lgr(self.lgr, {'unidb': unidb})
        mismatch = "Target Unicode version {} differs from UnicodeDatabase {}".format(
            self.DEFAULT_UNICODE_VERSION, '6.2.0')
        expected = {'description': self._default_description(),
                    'generic': mismatch,
                    'repertoire': {}}
        self.assertDictEqual(result, expected)

    def test_lgr_wrong_range_char(self):
        """A range starting at U+0060 is reported with one IDNA property error."""
        self.lgr.add_range(0x0060, 0x0063, force=True)
        bad_range = RangeChar(0x0060, 0x0060, 0x0063)
        _, result = self._rebuild_with_default_unidb()
        reported = result.get('repertoire', {})
        errors = reported.get(bad_range, {'errors': []})['errors']
        self.assertEqual(len(errors), 1)
        self.assertIsInstance(errors[0], CharInvalidIdnaProperty)
        expected = {'description': self._default_description(),
                    'repertoire': {bad_range: {'errors': errors}}}
        self.assertDictEqual(result, expected)

    def test_lgr_wrong_char(self):
        """U+0060 is reported with one IDNA property error."""
        self.lgr.add_cp(0x0060)
        char = self.lgr.get_char([0x0060])
        _, result = self._rebuild_with_default_unidb()
        reported = result.get('repertoire', {})
        errors = reported.get(char, {'errors': []})['errors']
        self.assertEqual(len(errors), 1)
        self.assertIsInstance(errors[0], CharInvalidIdnaProperty)
        expected = {'description': self._default_description(),
                    'repertoire': {char: {'errors': errors}}}
        self.assertDictEqual(result, expected)

    def test_lgr_wrong_variant(self):
        """A U+0060 variant of U+0061 is reported with one IDNA property error."""
        self.lgr.add_cp(0x0061)
        self.lgr.add_variant(0x0061, 0x0060)
        char = self.lgr.get_char([0x0061])
        var = char.get_variant((0x0060, ))[0]
        _, result = self._rebuild_with_default_unidb()
        reported_variants = result.get('repertoire', {}).get(char, {}).get('variants', {})
        errors = reported_variants.get(var, [])
        self.assertEqual(len(errors), 1)
        self.assertIsInstance(errors[0], CharInvalidIdnaProperty)
        expected = {'description': self._default_description(),
                    'repertoire': {char: {'variants': {var: errors}}}}
        self.assertDictEqual(result, expected)

    def test_lgr_ok(self):
        """A range plus two mutually variant code points yields no errors."""
        self.lgr.add_range(0x0061, 0x0063, force=True)
        self.lgr.add_cp(0x0064)
        self.lgr.add_cp(0x0065)
        self.lgr.add_variant(0x0064, 0x0065)
        self.lgr.add_variant(0x0065, 0x0064)
        _, result = self._rebuild_with_default_unidb()
        expected = {'description': self._default_description(),
                    'repertoire': {}}
        self.assertDictEqual(result, expected)
Ejemplo n.º 13
0
class TestConditionalVariants(unittest.TestCase):

    def setUp(self):
        """Create an empty LGR and capture the 'lgr.validate' log output.

        The capturing handler is removed again once the test is over, so
        handlers do not accumulate on the shared logger from one test to
        the next.
        """
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        validation_logger = logging.getLogger('lgr.validate')
        validation_logger.addHandler(ch)
        # The logger is global: detach the handler when the test finishes.
        self.addCleanup(validation_logger.removeHandler, ch)

    def test_empty_lgr(self):
        """An empty LGR passes the check without logging anything."""
        success, result = check_conditional_variants(self.lgr, {})
        logged = self.log_output.getvalue()
        self.assertEqual(len(logged), 0)
        self.assertTrue(success)
        expected = {'description': 'Testing conditional variants',
                    'repertoire': []}
        self.assertDictEqual(result, expected)

    def test_no_variants(self):
        """Code points without variants pass the check without logging."""
        for cp in (0x0061, 0x0062):
            self.lgr.add_cp([cp])
        success, result = check_conditional_variants(self.lgr, {})
        logged = self.log_output.getvalue()
        self.assertEqual(len(logged), 0)
        self.assertTrue(success)
        expected = {'description': 'Testing conditional variants',
                    'repertoire': []}
        self.assertDictEqual(result, expected)

    def test_no_rule_when(self):
        """A variant whose "when" names an unknown rule is logged and reported."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        success, result = check_conditional_variants(self.lgr, {})

        logged = self.log_output.getvalue()
        expected_log = ("CP U+0061: Variant 'U+0062' \"when\" attribute "
                        "'when-rule' is not an existing rule name.\n")
        self.assertGreater(len(logged), 0)
        self.assertEqual(logged, expected_log)
        self.assertFalse(success)

        var = self.lgr.get_variant([0x0061], (0x0062, ))[0]
        failure = {'char': self.lgr.get_char([0x0061]),
                   'variant': var,
                   'rule_type': 'when',
                   'rule': var.when}
        self.assertDictEqual(result, {'description': 'Testing conditional variants',
                                      'repertoire': [failure]})

    def test_no_rule_not_when(self):
        """A variant whose "not-when" names an unknown rule is logged and reported."""
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], not_when="not-when-rule")
        success, result = check_conditional_variants(self.lgr, {})

        logged = self.log_output.getvalue()
        expected_log = ("CP U+0061: Variant 'U+0062' \"not-when\" attribute "
                        "'not-when-rule' is not an existing rule name.\n")
        self.assertGreater(len(logged), 0)
        self.assertEqual(logged, expected_log)
        self.assertFalse(success)

        var = self.lgr.get_variant([0x0061], (0x0062, ))[0]
        failure = {'char': self.lgr.get_char([0x0061]),
                   'variant': var,
                   'rule_type': 'not-when',
                   'rule': var.not_when}
        self.assertDictEqual(result, {'description': 'Testing conditional variants',
                                      'repertoire': [failure]})

    def test_no_rule_when_not_when(self):
        """Unknown "when" and "not-when" rules on one variant are both reported.

        The variant is added with ``force=True`` and carries both
        attributes; the "when" failure is expected first, then "not-when".
        """
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062],
                             when="when-rule",
                             not_when="not-when-rule",
                             force=True)
        success, result = check_conditional_variants(self.lgr, {})

        logged = self.log_output.getvalue()
        expected_log = ("CP U+0061: Variant 'U+0062' \"when\" attribute "
                        "'when-rule' is not an existing rule name.\n"
                        "CP U+0061: Variant 'U+0062' \"not-when\" attribute "
                        "'not-when-rule' is not an existing rule name.\n")
        self.assertGreater(len(logged), 0)
        self.assertEqual(logged, expected_log)
        self.assertFalse(success)

        var = self.lgr.get_variant([0x0061], (0x0062, ))[0]
        when_failure = {'char': self.lgr.get_char([0x0061]),
                        'variant': var,
                        'rule_type': 'when',
                        'rule': var.when}
        not_when_failure = {'char': self.lgr.get_char([0x0061]),
                            'variant': var,
                            'rule_type': 'not-when',
                            'rule': var.not_when}
        self.assertDictEqual(result, {'description': 'Testing conditional variants',
                                      'repertoire': [when_failure,
                                                     not_when_failure]})

    def test_conditional_when_ok(self):
        """A "when" attribute naming a registered rule passes silently."""
        rule_name = "when-rule"
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when=rule_name)
        self.lgr.rules.append(rule_name)
        success, result = check_conditional_variants(self.lgr, {})
        logged = self.log_output.getvalue()
        self.assertEqual(len(logged), 0)
        self.assertTrue(success)
        expected = {'description': 'Testing conditional variants',
                    'repertoire': []}
        self.assertDictEqual(result, expected)

    def test_conditional_not_when_ok(self):
        """A "not-when" attribute naming a registered rule passes silently."""
        rule_name = "not-when-rule"
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], not_when=rule_name)
        self.lgr.rules.append(rule_name)
        success, result = check_conditional_variants(self.lgr, {})
        logged = self.log_output.getvalue()
        self.assertEqual(len(logged), 0)
        self.assertTrue(success)
        expected = {'description': 'Testing conditional variants',
                    'repertoire': []}
        self.assertDictEqual(result, expected)