def test_add_range_validation_with_range(self):
    # The validating repertoire is itself defined with a range.
    reference_lgr = LGR()
    reference_lgr.add_range(0x0061, 0x007A)

    # Range fully covered by the validating repertoire: accepted.
    self.lgr.add_range(0x0061, 0x007A,
                       validating_repertoire=reference_lgr,
                       override_repertoire=False)

    # Range outside of the validating repertoire, no override: refused.
    with self.assertRaises(NotInRepertoire):
        self.lgr.add_range(0x00F8, 0x00FF,
                           validating_repertoire=reference_lgr,
                           override_repertoire=False)
def make_idna_repertoire(version):
    """
    Build a repertoire out of the IDNA tables registry.

    The registry for the requested version is fetched and parsed, every
    record whose property is PVALID, CONTEXTO or CONTEXTJ is inserted in a
    new LGR, and the LGR is printed on stdout in its XML format.

    Input:
        * version: The unicode version to use (dotted string).
    """
    from lgr.core import LGR
    from lgr.parser.xml_serializer import serialize_lgr_xml

    lgr = LGR('idna2008-%s' % version)

    idna_url = IDNATABLES_URL.format(version=version)
    logger.debug("Fetching and parsing '%s'", idna_url)
    registry = etree.parse(idna_url)

    # Doubled braces are kept as literal '{' and '}' by str.format
    namespace = "{{{0}}}".format(IDNATABLES_NS)

    # Registries up to 6.0.0 carry the version in their identifier
    version_parts = [int(part) for part in version.split('.')]
    if version_parts <= [6, 0, 0]:
        registry_id = "idna-tables-{}-properties".format(version)
    else:
        registry_id = "idna-tables-properties"

    record_xpath = '{0}registry[@id="{1}"]/{0}record'.format(namespace,
                                                             registry_id)
    allowed_properties = ('PVALID', 'CONTEXTO', 'CONTEXTJ')

    for record in registry.findall(record_xpath):
        codepoint = record.find(CODEPOINT_TAG).text
        prop = record.find(PROPERTY_TAG).text

        if prop in allowed_properties:
            if codepoint.find('-') > 0:
                # "XXXX-YYYY": the record describes a range
                bounds = [int(bound, 16) for bound in codepoint.split('-')]
                first_cp, last_cp = bounds
                lgr.add_range(first_cp, last_cp)
            else:
                # The record describes a single code point
                lgr.add_cp(int(codepoint, 16))

    print(serialize_lgr_xml(lgr,
                            pretty_print=True,
                            encoding='unicode',
                            xml_declaration=False))
def rebuild_lgr(lgr, options):
    """
    Rebuild an LGR with given parameters.

    Every element of the source repertoire is re-inserted in a fresh LGR,
    and the problems met on the way are collected instead of being raised.

    options argument can contain:
        * unicode_version: The target Unicode version to be used
          when rebuilding the LGR. If the key is absent, the version of the
          source LGR metadata is used.
        * validating_repertoire: The validating repertoire used
          for checking code points.
        * unidb: Munidata's Unicode database. If None, skip Unicode checks.

    :param LGR lgr: The LGR to rebuild.
    :param dict options: Dictionary of options to the validation function.
    :return: Tuple ``(True, result)``. ``result`` holds a 'description', an
             optional 'generic' message (Unicode version mismatch) and a
             'repertoire' dict mapping each problematic char to its
             'errors', 'warnings' and per-variant 'variants' lists.
    """
    # Local import to prevent import cycles
    from lgr.core import LGR

    unicode_version = options.get('unicode_version',
                                  lgr.metadata.unicode_version)
    validating_repertoire = options.get('validating_repertoire', None)

    description = "Rebuilding LGR with Unicode version {}".format(
        unicode_version)
    if validating_repertoire is not None:
        description += " and validating repertoire '{}'".format(
            validating_repertoire)

    result = {
        'description': description,
        'repertoire': {}  # XXX: Cannot use defaultdict because of django...
    }

    def report(char, kind, issue):
        # Append `issue` to the `kind` ('errors'/'warnings') list of `char`.
        result['repertoire'].setdefault(char, {}).setdefault(
            kind, []).append(issue)

    logger.info(
        "Rebuilding LGR '%s' with Unicode version %s "
        "and Validating Repertoire '%s'", lgr, unicode_version,
        validating_repertoire)

    unidb = options.get('unidb', None)
    if unidb is not None:
        unidb_version = unidb.get_unicode_version()
        if unidb_version != unicode_version:
            result['generic'] = "Target Unicode version {} " \
                                "differs from UnicodeDatabase {}".format(unicode_version,
                                                                         unidb_version)
            logger.warning(
                "Target Unicode version %s differs "
                "from UnicodeDatabase %s", unicode_version, unidb_version)

    # For now, simply copy the metadata and references of the source LGR
    target_metadata = copy.deepcopy(lgr.metadata)
    target_metadata.unicode_version = unicode_version
    target_reference_manager = copy.deepcopy(lgr.reference_manager)

    target_lgr = LGR(name=lgr.name,
                     metadata=target_metadata,
                     reference_manager=target_reference_manager,
                     unicode_database=unidb)

    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # A range is only inserted when every code point it covers is
            # free of errors and belongs to the script.
            range_ok = True
            for cp, status in target_lgr.check_range(char.first_cp,
                                                     char.last_cp,
                                                     validating_repertoire):
                if status is not None:
                    report(char, 'errors', status)
                    range_ok = False
                in_script, _ = lgr.cp_in_script([cp])
                if not in_script:
                    report(char, 'warnings', CharNotInScript(cp))
                    range_ok = False

            if not range_ok:
                continue

            try:
                target_lgr.add_range(
                    char.first_cp,
                    char.last_cp,
                    comment=char.comment,
                    ref=char.references,
                    tag=char.tags,
                    when=char.when,
                    not_when=char.not_when,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=False)
            except LGRException as exc:
                report(char, 'errors', exc)
                logger.error("Cannot add range '%s-%s'",
                             format_cp(char.first_cp),
                             format_cp(char.last_cp))
            # Ranges carry no variants here: go to the next char
            continue

        in_script, _ = lgr.cp_in_script(char.cp)
        if not in_script:
            report(char, 'warnings', CharNotInScript(char.cp))

        # Insert code point
        try:
            target_lgr.add_cp(char.cp,
                              comment=char.comment,
                              ref=char.references,
                              tag=char.tags,
                              when=char.when,
                              not_when=char.not_when,
                              validating_repertoire=validating_repertoire,
                              override_repertoire=False)
        except LGRException as exc:
            report(char, 'errors', exc)
            logger.error("Cannot add code point '%s'", format_cp(char.cp))
            if not isinstance(exc, CharInvalidIdnaProperty):
                # Cannot include non-IDNA valid code points; any other
                # failure is reported and the code point is forced in.
                target_lgr.add_cp(char.cp,
                                  comment=char.comment,
                                  ref=char.references,
                                  tag=char.tags,
                                  when=char.when,
                                  not_when=char.not_when,
                                  force=True)

        # Create variants
        for var in char.get_variants():
            try:
                target_lgr.add_variant(
                    char.cp,
                    variant_cp=var.cp,
                    variant_type=var.type,
                    when=var.when,
                    not_when=var.not_when,
                    comment=var.comment,
                    ref=var.references,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=True)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault(
                    'variants', {}).setdefault(var, []).append(exc)
                logger.error("Cannot add variant '%s' to code point '%s'",
                             format_cp(var.cp), format_cp(char.cp))
                if not isinstance(exc, CharInvalidIdnaProperty):
                    # Cannot include non-IDNA valid code points
                    target_lgr.add_variant(char.cp,
                                           variant_cp=var.cp,
                                           variant_type=var.type,
                                           when=var.when,
                                           not_when=var.not_when,
                                           comment=var.comment,
                                           ref=var.references,
                                           force=True)

    logger.info("Rebuilding LGR '%s' done", lgr)

    return True, result
class XMLParser(LGRParser):
    """
    Parser for the XML representation of an LGR.

    The document is read incrementally and its <meta>, <data> and <rules>
    sections are turned into an LGR object. Format issues met on the way
    are recorded in ``rfc7940_checks``.
    """

    # Keep content intact, so do not strip CDATA section
    # (used in the <meta>/<description> element).
    # Do not resolve entities.
    # Skip comment, as we do not care.
    PARSER_OPTIONS = {
        'resolve_entities': False,
        'strip_cdata': False,
        'remove_comments': True
    }

    def __init__(self, *args, **kwargs):
        """
        :param force_mode: Keyword-only; forwarded as ``force`` to the LGR
                           insertion calls. When False, insertion errors in
                           the <data> section are re-raised. Default: True.

        Other arguments are passed untouched to the parent parser.
        """
        # The parent constructor does not receive 'force_mode'
        force_mode = kwargs.pop('force_mode', True)
        super(XMLParser, self).__init__(*args, **kwargs)
        self.force_mode = force_mode
        self.rfc7940_checks = LGRFormatTestResults()

    def _rewind_source(self):
        """
        Set the source back to its start when it is a seekable object.
        """
        # FD is now potentially at the end of the documents,
        # set it back to start
        if hasattr(self.source, "seek"):
            self.source.seek(0)

    def validate_document(self, rng_schema_path):
        """
        Validate the source document against a RelaxNG schema.

        :param rng_schema_path: Path of the RelaxNG schema.
        :return: None when the document is valid, otherwise the schema error
                 log (or a fixed message when that log is empty).
        """
        # Construct the RelaxNG validator
        schema = etree.RelaxNG(file=rng_schema_path)

        # Parse the XML file
        parser = etree.XMLParser(**self.PARSER_OPTIONS)
        doc = etree.parse(self.source, parser=parser)

        # Same as the other entry points: leave the source ready to be
        # read again.
        self._rewind_source()

        logger.debug("Validating document '%s' with RNG '%s'", self.source,
                     rng_schema_path)

        error_log = None
        if not schema.validate(doc):
            logger.warning("Validation of document '%s' failed", self.source)
            self.rfc7940_checks.error('schema')
            error_log = schema.error_log
            if len(error_log) == 0:
                # Bug in LXML, see https://bugs.launchpad.net/lxml/+bug/1526522
                error_log = "CANNOT VALIDATE XML"

        self.rfc7940_checks.tested('schema')
        return error_log

    def unicode_version(self):
        """
        Read the Unicode version declared in the <meta> element.

        :return: The ``unicode_version`` of the parsed metadata.
        """
        logger.debug("Get unicode version from meta")

        # Only parse the "meta" element
        # Skip comment, as we do not care.
        context = etree.iterparse(self.source,
                                  tag=META_TAG,
                                  **self.PARSER_OPTIONS)
        self._fast_iter(context)
        unicode_version = self._lgr.metadata.unicode_version
        # The LGR built here only served to read the metadata
        self._lgr = None

        self._rewind_source()

        return unicode_version

    def parse_document(self):
        """
        Parse the whole source document.

        :return: The LGR built from the document.
        """
        logger.debug('Start parsing of file: %s', self.filename)

        # Keep content intact, so do not strip CDATA section
        # (used in the <meta>/<description> element).
        # Do not resolve entities.
        # Skip comment, as we do not care.
        context = etree.iterparse(self.source, **self.PARSER_OPTIONS)
        self._fast_iter(context)

        self._rewind_source()

        self.rfc7940_checks.tested('parse_xml')
        return self._lgr

    def _process_meta(self, elem):
        """
        Process the <meta> element of an LGR XML file.

        Creates ``self._lgr`` from the metadata and references found.

        :param elem: The <meta> element, or any empty iterable when the
                     document has no <meta> element.
        """
        metadata = Metadata(self.rfc7940_checks)
        reference_manager = ReferenceManager()

        # Tags whose text is handed directly to a Metadata setter
        MAPPER = {
            DATE_TAG:
            lambda d: metadata.set_date(d, force=self.force_mode),
            VALIDITY_START_TAG:
            lambda d: metadata.set_validity_start(d, force=self.force_mode),
            VALIDITY_END_TAG:
            lambda d: metadata.set_validity_end(d, force=self.force_mode),
            UNICODE_VERSION_TAG:
            lambda d: metadata.set_unicode_version(d, force=self.force_mode),
        }

        unicode_version_tag_found = False
        for child in elem:
            tag = child.tag
            logger.debug("Got '%s' element", tag)
            if tag in MAPPER:
                MAPPER[tag](child.text)
                if tag == UNICODE_VERSION_TAG:
                    unicode_version_tag_found = True
            elif tag == VERSION_TAG:
                metadata.version = Version(child.text,
                                           child.get('comment', None))
            elif tag == LANGUAGE_TAG:
                metadata.add_language(child.text, force=self.force_mode)
            elif tag == SCOPE_TAG:
                metadata.scopes.append(
                    Scope(child.text, child.get('type', None)))
            elif tag == DESCRIPTION_TAG:
                # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449
                # For now, manually replace CRLF with LF
                # An empty <description/> has no text at all (None).
                description = (child.text or '').replace('\r\n', '\n')
                metadata.description = Description(description,
                                                   child.get('type', None))
            elif tag == REFERENCES_TAG:
                for reference in child:
                    value = reference.text
                    # Don't convert it to an int since ref_id may be a string
                    ref_id = reference.get('id')
                    comment = reference.get('comment', None)
                    reference_manager.add_reference(value,
                                                    comment=comment,
                                                    ref_id=ref_id)
                # Since we have processed <reference> elements here, let's clean-up
                child.clear()
            else:
                logger.warning("Unhandled '%s' element in <meta> section",
                               tag)
                self.rfc7940_checks.error('parse_xml')
            child.clear()

        self.rfc7940_checks.add_test_result('explicit_unicode_version',
                                            unicode_version_tag_found)
        self._lgr = LGR(name=self.filename,
                        metadata=metadata,
                        reference_manager=reference_manager,
                        unicode_database=self._unicode_database)

    def _process_data(self, elem):
        """
        Process the <data> element of an LGR XML file.

        Inserts code points, their variants and ranges in ``self._lgr``.
        Insertion errors are logged and recorded; they are re-raised when
        ``force_mode`` is False.

        :param elem: The <data> element.
        """
        # It is RECOMMENDED to list all "char" elements in ascending order of
        # the "cp" attribute. The below variable is used when verifying that.
        previous_codepoint = []

        for child in elem:
            comment = child.get('comment', None)
            when = child.get('when', None)
            not_when = child.get('not-when', None)

            # Handle references
            ref = string_to_list(child.get('ref', ''))

            # Handle tags
            tag = string_to_list(child.get('tag', ''))

            if child.tag == CHAR_TAG:
                codepoint = [int(c, 16) for c in child.get('cp').split()]

                if codepoint <= previous_codepoint:
                    if previous_codepoint[0:len(codepoint)] == codepoint:
                        # Not clear what order is to be recommended here
                        self.rfc7940_checks.error(
                            'char_strict_ascending_order')
                    else:
                        logger.warning(
                            "cp attribute not in ascending order: '%s'",
                            child.get('cp'))
                        self.rfc7940_checks.error('char_ascending_order')
                previous_codepoint = codepoint

                try:
                    self._lgr.add_cp(codepoint,
                                     comment=comment,
                                     ref=ref,
                                     tag=tag,
                                     when=when,
                                     not_when=not_when,
                                     force=self.force_mode)
                except LGRException as exc:
                    logger.error("Cannot add code point '%s': %s",
                                 format_cp(codepoint), exc)
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    if not self.force_mode:
                        raise

                # Variants of char
                for variant in child.iter(VARIANT_TAG):
                    var_codepoint = [
                        int(c, 16) for c in variant.get('cp').split()
                    ]
                    var_when = variant.get('when', None)
                    var_not_when = variant.get('not-when', None)
                    variant_type = variant.get('type', None)
                    var_comment = variant.get('comment', None)

                    # Handle references
                    var_ref = string_to_list(variant.get('ref', ''))

                    try:
                        self._lgr.add_variant(codepoint,
                                              var_codepoint,
                                              variant_type=variant_type,
                                              when=var_when,
                                              not_when=var_not_when,
                                              comment=var_comment,
                                              ref=var_ref,
                                              force=self.force_mode)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' "
                            "to code point '%s': %s",
                            format_cp(var_codepoint), format_cp(codepoint),
                            exc)
                        self.rfc7940_checks.error('parse_xml')
                        self.rfc7940_checks.error('codepoint_valid')
                        if not self.force_mode:
                            raise
            elif child.tag == RANGE_TAG:
                first_cp = int(child.get('first-cp'), 16)
                last_cp = int(child.get('last-cp'), 16)

                try:
                    self._lgr.add_range(first_cp,
                                        last_cp,
                                        comment=comment,
                                        ref=ref,
                                        tag=tag,
                                        when=when,
                                        not_when=not_when,
                                        force=self.force_mode)
                except LGRException as exc:
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    logger.error("Cannot add range '%s-%s': %s",
                                 format_cp(first_cp), format_cp(last_cp), exc)
                    if not self.force_mode:
                        raise

            child.clear()

        self.rfc7940_checks.tested('char_ascending_order')
        self.rfc7940_checks.tested('char_strict_ascending_order')

    def _process_rules(self, elem):
        """
        Process the <rules> element of an LGR XML file.

        Classes, rules and actions are added to ``self._lgr`` both as
        objects and as namespace-less XML text.

        :param elem: The <rules> element.
        """
        # Keep "text" version of the rules since we don't do anything with them.
        for child in elem:
            if child.tag in COMBINATOR_TAGS + (CLASS_TAG, ):
                cls = self._parse_class(child)
                self._lgr.add_class(cls, force=self.force_mode)
                child = drop_ns(child)
                self._lgr.classes_xml.append(
                    etree.tostring(child, encoding=text_type))
            elif child.tag == RULE_TAG:
                rule = self._parse_rule(child)
                self._lgr.add_rule(rule, force=self.force_mode)
                child = drop_ns(child)
                self._lgr.rules_xml.append(
                    etree.tostring(child, encoding=text_type))
            elif child.tag == ACTION_TAG:
                action = self._parse_action(child)
                self._lgr.add_action(action, force=self.force_mode)
                child = drop_ns(child)
                self._lgr.actions_xml.append(
                    etree.tostring(child, encoding=text_type))
            else:
                logger.warning("Unhandled '%s' element in <rules> section",
                               child.tag)
                self.rfc7940_checks.error("parse_xml")
            child.clear()

    def _parse_rule(self, elem):
        """
        Parse a <rule> element.

        :param elem: The <rule> element.
        :return: The rule object created.
        """
        rule = Rule(name=elem.get('name', None),
                    comment=elem.get('comment', None),
                    ref=string_to_list(elem.get('ref', '')),
                    by_ref=elem.get('by-ref', None))
        for child in elem:
            self._parse_rule_helper(child, rule)

        return rule

    def _parse_rule_helper(self, child, rule):
        """
        Helper to parse the content of a <rule> element.

        This function is to be called on children of a top-level <rule>.

        :param child: Child element of a top-level <rule> element.
        :param rule: The top-level rule element to add the content to.
        """
        tag = child.tag
        comment = child.get('comment', None)
        count = child.get('count', None)
        if tag == ANCHOR_TAG:
            rule.add_child(AnchorMatcher(comment=comment))
        elif tag == ANY_TAG:
            rule.add_child(AnyMatcher(comment=comment, count=count))
        elif tag == CHAR_TAG:
            rule.add_child(
                CharMatcher(cp_or_sequence_from_class(child),
                            comment=comment,
                            count=count))
        elif tag == CHOICE_TAG:
            choice = ChoiceMatcher(comment=comment, count=count)
            for matcher in child:
                self._parse_rule_helper(matcher, choice)
            rule.add_child(choice)
        elif tag == END_TAG:
            rule.add_child(EndMatcher(comment=comment))
        elif tag == LOOKAHEAD_TAG:
            look_ahead = LookAheadMatcher(comment=comment)
            for matcher in child:
                self._parse_rule_helper(matcher, look_ahead)
            rule.add_child(look_ahead)
        elif tag == LOOKBEHIND_TAG:
            look_behind = LookBehindMatcher(comment=comment)
            for matcher in child:
                self._parse_rule_helper(matcher, look_behind)
            rule.add_child(look_behind)
        elif tag == START_TAG:
            rule.add_child(StartMatcher(comment=comment))
        elif tag == RULE_TAG:
            child_rule = self._parse_rule(child)
            rule.add_child(
                RuleMatcher(child_rule, comment=comment, count=count))
        elif tag == CLASS_TAG or tag in COMBINATOR_TAGS:
            rule.add_child(
                ClassMatcher(self._parse_class(child),
                             comment=comment,
                             count=count))
        else:
            logger.warning("Unhandled '%s' element in <rule> object", tag)
            self.rfc7940_checks.error('parse_xml')

    def _parse_action(self, elem):
        """
        Parse an <action> element.

        :param elem: The <action> element.
        :return: The action object created.
        """
        disp = elem.get('disp')
        comment = elem.get('comment', None)
        match = elem.get('match', None)
        not_match = elem.get('not-match', None)
        any_variant = string_to_list(elem.get('any-variant', ''))
        all_variants = string_to_list(elem.get('all-variants', ''))
        only_variants = string_to_list(elem.get('only-variants', ''))

        return Action(disp,
                      comment=comment,
                      ref=string_to_list(elem.get('ref', '')),
                      match=match,
                      not_match=not_match,
                      any_variant=any_variant,
                      all_variants=all_variants,
                      only_variants=only_variants)

    def _parse_class(self, elem):
        """
        Parse a <class> element or a set-operator (combinator) element.

        :param elem: The element to parse.
        :return: The Class object created, or None when the element is
                 neither a <class> nor a combinator (the issue is logged and
                 recorded in ``rfc7940_checks``).
        """
        tag = elem.tag
        name = elem.get('name', None)
        comment = elem.get('comment', None)

        if tag == CLASS_TAG:
            cls = Class(name=name,
                        comment=comment,
                        ref=string_to_list(elem.get('ref', '')),
                        from_tag=elem.get('from-tag', None),
                        unicode_property=elem.get('property', None),
                        by_ref=elem.get('by-ref', None))
            if len(elem) == 0 and elem.text:
                # No child, code point(s) defined in text
                cls.add_codepoint(cp_or_sequence_from_class(elem))
            for child in elem:
                cls.add_codepoint(cp_or_sequence_from_class(child))
        elif tag in COMBINATOR_TAGS:
            MAPPING = {
                UNION_TAG: UnionClass,
                COMPLEMENT_TAG: ComplementClass,
                INTERSECTION_TAG: IntersectionClass,
                DIFFERENCE_TAG: DifferenceClass,
                SYM_DIFFERENCE_TAG: SymmetricDifferenceClass
            }
            cls = MAPPING[tag](name=name, comment=comment)
            # TODO: ensure number of children
            for child in elem:
                child_cls = self._parse_class(child)
                if child_cls is not None:
                    # Unhandled children have already been reported
                    cls.add_child(child_cls)
        else:
            logger.warning("Unhandled '%s' element in <class> object", tag)
            self.rfc7940_checks.error('parse_xml')
            # Nothing was built: do not fall through to an unbound `cls`
            return None

        return cls

    def _fast_iter(self, context):
        """
        Iterator used to incrementally parse the XML file.

        Dispatches the <meta>, <data> and <rules> elements to their
        processing method and frees each of them once processed.

        :param context: The iterparse context yielding (event, element).
        """
        metadata_added = False
        for _, elem in context:
            tag = elem.tag
            if tag == META_TAG:
                logger.debug("Got 'meta' element")
                self._process_meta(elem)
                metadata_added = True
            elif tag == DATA_TAG:
                if not metadata_added:
                    # The optional "meta" element is not present since it must
                    # precede the required data element.
                    # However, we still have to call _process_meta
                    # as it is the method creating the LGR object.
                    self._process_meta({})
                    metadata_added = True
                logger.debug("Got 'data' element")
                self._process_data(elem)
            elif tag == RULES_TAG:
                logger.debug("Got 'rules' element")
                self._process_rules(elem)
            else:
                continue
            # Clean-up memory
            elem.clear()
        del context
class TestLGRCore(unittest.TestCase):
    """
    Tests of the core LGR object: code points, sequences, ranges, variants,
    tags, references and label handling.
    """

    def setUp(self):
        # Every test starts with an empty LGR bound to the 6.3.0 IDNA database
        unidb = IDNADatabase('6.3.0')
        self.lgr = LGR(unicode_database=unidb)

    def test_add_single_cp_list(self):
        self.lgr.add_cp([0x0061])
        self.assertIn(0x0061, self.lgr.repertoire)

    def test_add_single_cp_int(self):
        self.lgr.add_cp(0x0061)
        self.assertIn(0x0061, self.lgr.repertoire)

    def test_add_cp_sequence(self):
        # A sequence does not bring its individual code points in
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)

    def test_add_multiple_cp_sequences(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)
        self.assertIn([0x0061, 0x0062, 0x0063], self.lgr.repertoire)
        self.assertNotIn(0x0061, self.lgr.repertoire)
        self.assertNotIn(0x0062, self.lgr.repertoire)
        self.assertNotIn(0x0063, self.lgr.repertoire)

    def test_add_cp_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, [0x0061])
        self.assertRaises(CharAlreadyExists, self.lgr.add_cp, 0x0061)

    def test_add_cp_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_cp, [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_cp_validation_override(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061],
                        validating_repertoire=validation_lgr,
                        override_repertoire=False)
        self.lgr.add_cp([0x0062],
                        validating_repertoire=validation_lgr,
                        override_repertoire=True)
        self.assertIn(0x0062, self.lgr.repertoire)

    def test_del_single_cp_list(self):
        self.lgr.add_cp(0x0061)
        self.lgr.del_cp([0x0061])
        self.assertNotIn(0x0061, self.lgr.repertoire)

    def test_del_single_cp_int(self):
        self.lgr.add_cp([0x0061])
        self.lgr.del_cp(0x0061)
        self.assertNotIn(0x0061, self.lgr.repertoire)

    def test_del_cp_sequence(self):
        self.lgr.add_cp([0x0061, 0x0062])
        self.lgr.del_cp([0x0061, 0x0062])
        self.assertEqual(len(self.lgr.repertoire), 0)

    def test_del_cp_sequence_with_cp(self):
        # Members of a sequence cannot be deleted on their own
        self.lgr.add_cp([0x0061, 0x0062])
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0061)
        self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0062)
        self.assertIn([0x0061, 0x0062], self.lgr.repertoire)

    def test_add_cp_when_not_when(self):
        # 'when' and 'not_when' are mutually exclusive
        self.lgr.add_cp([0x0061], when='w1')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0062], when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0062])

        self.lgr.add_cp([0x0062], not_when='nw2')
        with self.assertRaises(CharInvalidContextRule) as cm:
            self.lgr.add_cp([0x0063], when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0063])

    def test_add_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIn(cp, self.lgr.repertoire)

    def test_add_range_in_repertoire(self):
        self.lgr.add_range(0x0061, 0x007A)
        self.assertRaises(CharAlreadyExists, self.lgr.add_range, 0x0061,
                          0x007A)

    def test_add_range_validation(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A + 1):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0061,
                           0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range,
                          0x00F8,
                          0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_range_validation_with_range(self):
        validation_lgr = LGR()
        validation_lgr.add_range(0x0061, 0x007A)
        self.lgr.add_range(0x0061,
                           0x007A,
                           validating_repertoire=validation_lgr,
                           override_repertoire=False)
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_range,
                          0x00F8,
                          0x00FF,
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_range_validation_override(self):
        validation_lgr = LGR()
        for cp in range(0x0061, 0x007A):
            validation_lgr.add_cp(cp)
        self.lgr.add_range(0x0031,
                           0x0032,
                           validating_repertoire=validation_lgr,
                           override_repertoire=True)
        self.assertIn(0x0031, self.lgr.repertoire)

    def test_add_range_when_not_when(self):
        self.lgr.add_range(0x0061, 0x0065, when='w1')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x0066, 0x007A, when='w2', not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x0066)
        self.assertEqual(the_exception.last_cp, 0x007A)

        self.lgr.add_range(0x0066, 0x007A, not_when='nw2')
        with self.assertRaises(RangeInvalidContextRule) as cm:
            self.lgr.add_range(0x01BD, 0x01C3, when='w3', not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.first_cp, 0x01BD)
        self.assertEqual(the_exception.last_cp, 0x01C3)

    def test_expand_ranges(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)
        self.lgr.add_range(0x01BD, 0x01C3)
        for cp in range(0x01BD, 0x01C3 + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_ranges()
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)
        for cp in range(0x01BD, 0x01C3 + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    def test_expand_range(self):
        self.lgr.add_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            self.assertIsInstance(self.lgr.get_char(cp), RangeChar)

        self.lgr.expand_range(0x0061, 0x007A)
        for cp in range(0x0061, 0x007A + 1):
            char = self.lgr.get_char(cp)
            self.assertIsInstance(char, Char)
            self.assertNotIsInstance(char, RangeChar)

    def test_add_variant_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.assertRaises(VariantAlreadyExists, self.lgr.add_variant,
                          [0x0061], [0x0030])

    def test_add_variant_validation(self):
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.assertRaises(NotInRepertoire,
                          self.lgr.add_variant, [0x0061], [0x0062],
                          validating_repertoire=validation_lgr,
                          override_repertoire=False)

    def test_add_variant_when_not_when(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], when='w1')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031],
                                 when='w2',
                                 not_when='nw1')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

        self.lgr.add_variant([0x0061], [0x0030], not_when='nw2')
        with self.assertRaises(VariantInvalidContextRule) as cm:
            self.lgr.add_variant([0x0061], [0x0031],
                                 when='w3',
                                 not_when='nw3')
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])
        self.assertEqual(the_exception.variant, [0x0031])

    def test_del_cp_validation_override(self):
        # Despite its name, this checks that a variant outside of the
        # validating repertoire is accepted when override is requested.
        validation_lgr = LGR()
        validation_lgr.add_cp([0x0061])
        validation_lgr.add_cp([0x0030])

        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        self.lgr.add_variant([0x0061], [0x0062],
                             validating_repertoire=validation_lgr,
                             override_repertoire=True)
        self.assertIn((0x0062, ), self.lgr.repertoire[0x0061]._variants)

    def test_get_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030])
        variants = self.lgr.get_variants([0x0061])
        self.assertIsInstance(variants, types.GeneratorType)
        variant_list = list(variants)
        self.assertEqual(len(variant_list), 1)

    def test_check_range_no_modification(self):
        # Consume the result so that the check really runs, whether
        # check_range is lazy or not.
        list(self.lgr.check_range(0x0060, 0x007F))
        self.assertEqual(len(self.lgr.repertoire), 0)

    def test_check_range(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x007A])
        codepoints = self.lgr.check_range(0x0060, 0x007F)
        for result in codepoints:
            cp = result[0]
            prop = result[1]
            if cp == 0x060 or cp >= 0x007B:
                self.assertIsInstance(prop, CharInvalidIdnaProperty)
            elif cp == 0x0061 or cp == 0x007A:
                self.assertIsInstance(prop, CharAlreadyExists)
            else:
                self.assertIsNone(prop)

    def test_add_codepoints(self):
        # Contiguous code points are expected to be merged into ranges
        self.lgr.add_codepoints([c for c in range(0x0061, 0x007A + 1)] +
                                [0x0107] + [0x0137, 0x0138])
        expected_output = [
            RangeChar(0x061, 0x0061, 0x007A),
            Char(0x0107),
            RangeChar(0x0137, 0x0137, 0x0138)
        ]
        self.assertEqual(expected_output, list(self.lgr.repertoire))

    def test_tags_on_codepoint(self):
        self.lgr.add_cp([0x0061], tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0062], tag=['t1', 't1'])
        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    def test_tags_on_codepoint_sequence(self):
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_cp([0x0061, 0x0062], tag=['t1'])
        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.SEQUENCE_NO_TAG)

    def test_tags_on_range(self):
        self.lgr.add_range(0x0061, 0x0062, tag=['t1', 't2'])
        with self.assertRaises(LGRFormatException) as cm:
            self.lgr.add_range(0x0063, 0x0064, tag=['t1', 't1'])
        the_exception = cm.exception
        self.assertEqual(the_exception.reason,
                         LGRFormatException.LGRFormatReason.DUPLICATE_TAG)

    def test_list_types(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0030], variant_type='BLOCK')
        self.lgr.add_variant([0x0061], [0x0031], variant_type='VALID')
        self.lgr.add_variant([0x0061], [0x0032], variant_type='BLOCK')
        self.assertEqual(self.lgr.types, set(['BLOCK', 'VALID']))

    def test_del_reference(self):
        ref_id_1 = self.lgr.add_reference("Test - 1")
        ref_id_2 = self.lgr.add_reference("Test - 2")
        self.lgr.add_cp([0x0061], ref=[ref_id_1])
        self.lgr.add_cp([0x0062], ref=[ref_id_1, ref_id_2])

        self.lgr.del_reference(ref_id_1)

        self.assertNotIn(ref_id_1, self.lgr.reference_manager)
        self.assertEqual(self.lgr.get_char([0x0061]).references, [])
        self.assertEqual(self.lgr.get_char([0x0062]).references, [ref_id_2])

    def test_add_cp_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_cp([0x0061], ref=[ref_id, ref_id])
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    def test_add_range_duplicate_reference(self):
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_range(0x0061, 0x0062, ref=[ref_id, ref_id])
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, 0x0061)

    def test_add_variant_duplicate_reference(self):
        self.lgr.add_cp([0x0061])
        ref_id = self.lgr.add_reference("Test - 1")
        with self.assertRaises(DuplicateReference) as cm:
            self.lgr.add_variant([0x0061], [0x0062], ref=[ref_id, ref_id])
        the_exception = cm.exception
        self.assertEqual(the_exception.cp, [0x0061])

    def test_generate_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])
        self.lgr.add_cp([0x0064])
        self.lgr.add_variant([0x0061], [0x0070], variant_type="type0")
        self.lgr.add_variant([0x0062], [0x0071], variant_type="type1")
        self.lgr.add_variant([0x0062], [0x0072], variant_type="type2")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([],
                         list(self.lgr._generate_label_variants([0x0063])))
        self.assertEqual(
            [], list(self.lgr._generate_label_variants([0x0063, 0x0064])))
        self.assertEqual(
            set([((0x0071, 0x0063), frozenset(['type1']), False),
                 ((0x0072, 0x0063), frozenset(['type2']), False)]),
            set(self.lgr._generate_label_variants([0x0062, 0x0063])))
        self.assertEqual(
            set([
                ((0x0061, 0x0062), frozenset(), False),
                ((0x0061, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0071), frozenset(['type0', 'type1']), True),
                ((0x0070, 0x0072), frozenset(['type0', 'type2']), True),
            ]), set(self.lgr._generate_label_variants([0x0061, 0x0062])))
        self.assertEqual(
            set([
                ((0x0061, 0x0062, 0x0062), frozenset(), False),
                ((0x0061, 0x0062, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0062, 0x0072), frozenset(['type2']), False),
                ((0x0061, 0x0071, 0x0062), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0071), frozenset(['type1']), False),
                ((0x0061, 0x0071, 0x0072), frozenset(['type1',
                                                      'type2']), False),
                ((0x0061, 0x0072, 0x0062), frozenset(['type2']), False),
                ((0x0061, 0x0072, 0x0071), frozenset(['type1',
                                                      'type2']), False),
                ((0x0061, 0x0072, 0x0072), frozenset(['type2']), False),
                ((0x0070, 0x0062, 0x0062), frozenset(['type0']), False),
                ((0x0070, 0x0062, 0x0071), frozenset(['type0',
                                                      'type1']), False),
                ((0x0070, 0x0062, 0x0072), frozenset(['type0',
                                                      'type2']), False),
                ((0x0070, 0x0071, 0x0062), frozenset(['type0',
                                                      'type1']), False),
                ((0x0070, 0x0071, 0x0071), frozenset(['type0',
                                                      'type1']), True),
                ((0x0070, 0x0071, 0x0072),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0062), frozenset(['type0',
                                                      'type2']), False),
                ((0x0070, 0x0072, 0x0071),
                 frozenset(['type0', 'type1', 'type2']), True),
                ((0x0070, 0x0072, 0x0072), frozenset(['type0',
                                                      'type2']), True),
            ]),
            set(self.lgr._generate_label_variants([0x0061, 0x0062, 0x0062])))

    def test_generate_variants_reflexive(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        self.lgr.add_cp([0x0063])
        self.lgr.add_variant([0x0062], [0x0062], variant_type="reflexive")
        self.lgr.add_variant([0x0063], [0x0070], variant_type="type")

        self.assertEqual([], list(self.lgr._generate_label_variants([])))
        self.assertEqual([],
                         list(self.lgr._generate_label_variants([0x0061])))
        self.assertEqual([((0x0062, ), frozenset(['reflexive']), True)],
                         list(self.lgr._generate_label_variants([0x0062])))
        self.assertEqual(
            set([
                ((0x0062, 0x0063), frozenset(['reflexive']), False),
                ((0x0062, 0x0070), frozenset(['reflexive', 'type']), True),
            ]), set(self.lgr._generate_label_variants([0x0062, 0x0063])))

    def test_label_simple(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062, 0x0063])
        self.lgr.add_range(0x0064, 0x0068)

        valid_labels = ([0x0061], [0x0062, 0x0063], [0x0064], [0x0068],
                        [0x0061, 0x0064], [0x0061, 0x0062, 0x0063, 0x0064],
                        [0x0062, 0x0063, 0x0068])
        # (label, valid leading part, code points not in the LGR)
        invalid_labels = (([0x0060], [], [0x0060]),
                          ([0x0069], [], [0x0069]),
                          ([0x0062], [], [0x0062]),
                          ([0x0063], [], [0x0063]),
                          ([0x0061, 0x0062], [0x0061], [0x0062]))

        for label in valid_labels:
            self.assertEqual((True, label, []),
                             self.lgr._test_preliminary_eligibility(label))
        for (label, label_part, not_in_lgr) in invalid_labels:
            self.assertEqual((False, label_part, not_in_lgr),
                             self.lgr._test_preliminary_eligibility(label))

    def test_label_eligibility_multiple_choices(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.lgr.add_cp([0x0064])

        self.assertEqual(self.lgr._test_preliminary_eligibility([0x0062]),
                         (False, [], [0x0062]))
        self.assertEqual(
            self.lgr._test_preliminary_eligibility(
                [0x0061, 0x0062, 0x0063, 0x0064]),
            (True, [0x0061, 0x0062, 0x0063, 0x0064], []))

    def test_label_delayed_eligibilty(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'block')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'invalid')
        self.lgr.add_cp([0x0063, 0x0064])
        self.lgr.add_variant([0x0063, 0x0064], [0x0063, 0x0064], 'invalid')

        self.assertEqual(self.lgr._test_label_disposition([0x0062]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0063, 0x0064]),
                         ('invalid', 0))
        self.assertEqual(self.lgr._test_label_disposition([0x0061, 0x0062]),
                         ('invalid', 0))

    def test_label_length(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0061], 'disp')
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0062], 'disp')
        self.assertEqual(PROTOCOL_LABEL_MAX_LENGTH,
                         self.lgr.max_label_length())

        for i in range(80):
            self.lgr.add_variant([0x0062], [0x074D + i], 'disp')

        # 41: mean number of variants per character
        self.assertEqual(int(math.log(MAX_NUMBER_GENERATED_VARIANTS, 41)),
                         self.lgr.max_label_length())
class TestStats(unittest.TestCase):
    """Exercise compute_stats on small, hand-built LGRs."""

    # Statistics expected for an LGR with nothing in it. Tests start from a
    # copy of this mapping and replace only the entries their fixture affects.
    STATS = {
        'codepoint_number': 0,
        'range_number': 0,
        'largest_range': None,
        'largest_range_len': 0,
        'sequence_number': 0,
        'largest_sequence': None,
        'largest_sequence_len': 0,
        'codepoints_with_variants': 0,
        'mapping_number': 0,
        'variants_by_type': {},
        'largest_variant_set': 0,
        'average_variants': 0,
        'codepoints_by_tag': {},
        'rule_number': 0
    }

    def setUp(self):
        """Start every test from a fresh, empty LGR."""
        self.lgr = LGR()

    def _expected_report(self, changes=None):
        """Build the expected result dict: baseline stats plus *changes*."""
        stats = dict(self.STATS)
        if changes:
            stats.update(changes)
        return {'description': 'Generate stats', 'stats': stats}

    def test_empty_lgr(self):
        """An empty LGR reports the baseline statistics."""
        _ignored, report = compute_stats(self.lgr, {})
        self.assertDictEqual(report, self._expected_report())

    def test_lgr_chars(self):
        """Single code points are counted, and tagged ones counted per tag."""
        self.lgr.add_cp(0x0061)
        self.lgr.add_cp(0x0062, tag=['test'])

        _ignored, report = compute_stats(self.lgr, {})

        self.assertDictEqual(report, self._expected_report({
            'codepoint_number': 2,
            'codepoints_by_tag': {'test': 1},
        }))

    def test_lgr_ranges(self):
        """Ranges add to the code point count and the largest is reported."""
        self.lgr.add_range(0x0061, 0x0065)
        self.lgr.add_range(0x0066, 0x0068)

        _ignored, report = compute_stats(self.lgr, {})

        self.assertDictEqual(report, self._expected_report({
            'codepoint_number': 8,
            'range_number': 2,
            'largest_range': RangeChar(0x0061, 0x0061, 0x0065),
            'largest_range_len': 5,
        }))

    def test_lgr_sequence(self):
        """Sequences are counted and the longest one is reported."""
        self.lgr.add_cp([0x0061, 0x0062, 0x0063])
        self.lgr.add_cp([0x0061, 0x0062])

        _ignored, report = compute_stats(self.lgr, {})

        longest = CharSequence(cp_or_sequence=(0x0061, 0x0062, 0x0063))
        self.assertDictEqual(report, self._expected_report({
            'codepoint_number': 2,
            'sequence_number': 2,
            'largest_sequence': longest,
            'largest_sequence_len': 3,
        }))

    def test_lgr_variants(self):
        """Variant mappings are counted overall, per type and on average."""
        for code_point in (0x0061, 0x0062, 0x0063):
            self.lgr.add_cp(code_point)
        self.lgr.add_variant(0x0061, 0x0062)
        self.lgr.add_variant(0x0061, 0x0063)
        self.lgr.add_variant(0x0062, 0x0061)
        self.lgr.add_variant(0x0063, 0x0061, variant_type='blocked')

        _ignored, report = compute_stats(self.lgr, {})

        self.assertDictEqual(report, self._expected_report({
            'codepoint_number': 3,
            'codepoints_with_variants': 3,
            'mapping_number': 4,
            # Mappings added without a variant_type are keyed by None.
            'variants_by_type': {None: 3, 'blocked': 1},
            'largest_variant_set': 3,
            'average_variants': round(4 / 3, 1),
        }))

    def test_lgr_rules(self):
        """Rules added to the LGR are counted."""
        for rule_name in ('rule1', 'rule2'):
            self.lgr.add_rule(Rule(name=rule_name))

        _ignored, report = compute_stats(self.lgr, {})

        self.assertDictEqual(report,
                             self._expected_report({'rule_number': 2}))
class TestRebuildLGR(unittest.TestCase):
    """Exercise rebuild_lgr and the result dictionary it returns."""

    # Unicode version the tests expect a freshly created LGR to report.
    DEFAULT_UNICODE_VERSION = '6.3.0'

    def setUp(self):
        """Start every test from a fresh, empty LGR."""
        self.lgr = LGR()

    def _default_description(self):
        """Return the description expected for the default Unicode version."""
        return 'Rebuilding LGR with Unicode version {}'.format(
            self.DEFAULT_UNICODE_VERSION)

    def _rebuild_with_default_unidb(self):
        """Attach a default-version IDNADatabase to the LGR and rebuild it.

        Returns the second element of what rebuild_lgr returns.
        """
        unidb = IDNADatabase(self.DEFAULT_UNICODE_VERSION)
        self.lgr.unicode_database = unidb
        _ignored, result = rebuild_lgr(self.lgr, {'unidb': unidb})
        return result

    def test_empty_lgr(self):
        """An empty LGR rebuilds with an empty repertoire report."""
        _ignored, result = rebuild_lgr(self.lgr, {})
        self.assertDictEqual(result, {
            'description': self._default_description(),
            'repertoire': {},
        })

    def test_lgr_non_default_unicode(self):
        """The description reflects the LGR's own Unicode version."""
        self.lgr.metadata.set_unicode_version('6.2.0')
        _ignored, result = rebuild_lgr(self.lgr, {})
        self.assertDictEqual(result, {
            'description': 'Rebuilding LGR with Unicode version 6.2.0',
            'repertoire': {},
        })

    def test_lgr_validating_repertoire(self):
        """The description mentions the validating repertoire when given."""
        validating_repertoire = LGR(name='validating')
        options = {'validating_repertoire': validating_repertoire}
        _ignored, result = rebuild_lgr(self.lgr, options)

        description = ("Rebuilding LGR with Unicode version {} "
                       "and validating repertoire '{}'").format(
            self.DEFAULT_UNICODE_VERSION, validating_repertoire)
        self.assertDictEqual(result, {
            'description': description,
            'repertoire': {},
        })

    def test_lgr_unidb_same_unicode(self):
        """A database of the same version adds no 'generic' entry."""
        unidb = IDNADatabase('6.3.0')
        _ignored, result = rebuild_lgr(self.lgr, {'unidb': unidb})
        self.assertDictEqual(result, {
            'description': self._default_description(),
            'repertoire': {},
        })

    def test_lgr_unidb_different_unicode(self):
        """A database of another version is reported under 'generic'."""
        unidb = IDNADatabase('6.2.0')
        _ignored, result = rebuild_lgr(self.lgr, {'unidb': unidb})

        mismatch = ("Target Unicode version {} differs from "
                    "UnicodeDatabase {}").format(
            self.DEFAULT_UNICODE_VERSION, '6.2.0')
        self.assertDictEqual(result, {
            'description': self._default_description(),
            'generic': mismatch,
            'repertoire': {},
        })

    def test_lgr_wrong_range_char(self):
        """The forced range 0x0060-0x0063 is reported with one IDNA error."""
        self.lgr.add_range(0x0060, 0x0063, force=True)
        r = RangeChar(0x0060, 0x0060, 0x0063)

        result = self._rebuild_with_default_unidb()

        repertoire_report = result.get('repertoire', {})
        errors = repertoire_report.get(r, {'errors': []})['errors']
        self.assertEqual(len(errors), 1)
        self.assertIsInstance(errors[0], CharInvalidIdnaProperty)
        self.assertDictEqual(result, {
            'description': self._default_description(),
            'repertoire': {r: {'errors': errors}},
        })

    def test_lgr_wrong_char(self):
        """The code point 0x0060 is reported with one IDNA error."""
        self.lgr.add_cp(0x0060)
        char = self.lgr.get_char([0x0060])

        result = self._rebuild_with_default_unidb()

        repertoire_report = result.get('repertoire', {})
        errors = repertoire_report.get(char, {'errors': []})['errors']
        self.assertEqual(len(errors), 1)
        self.assertIsInstance(errors[0], CharInvalidIdnaProperty)
        self.assertDictEqual(result, {
            'description': self._default_description(),
            'repertoire': {char: {'errors': errors}},
        })

    def test_lgr_wrong_variant(self):
        """The variant 0x0060 of 0x0061 is reported with one IDNA error."""
        self.lgr.add_cp(0x0061)
        self.lgr.add_variant(0x0061, 0x0060)
        char = self.lgr.get_char([0x0061])
        var = char.get_variant((0x0060, ))[0]

        result = self._rebuild_with_default_unidb()

        char_report = result.get('repertoire', {}).get(char, {})
        errors = char_report.get('variants', {}).get(var, [])
        self.assertEqual(len(errors), 1)
        self.assertIsInstance(errors[0], CharInvalidIdnaProperty)
        self.assertDictEqual(result, {
            'description': self._default_description(),
            'repertoire': {char: {'variants': {var: errors}}},
        })

    def test_lgr_ok(self):
        """This range plus two mutually mapped code points yield no errors."""
        self.lgr.add_range(0x0061, 0x0063, force=True)
        for code_point in (0x0064, 0x0065):
            self.lgr.add_cp(code_point)
        self.lgr.add_variant(0x0064, 0x0065)
        self.lgr.add_variant(0x0065, 0x0064)

        result = self._rebuild_with_default_unidb()

        self.assertDictEqual(result, {
            'description': self._default_description(),
            'repertoire': {},
        })