def create(cls, name, unicode_version, validating_repertoire_name):
    metadata = Metadata()
    metadata.version = Version('1')
    metadata.set_unicode_version(unicode_version)
    lgr = LGR(name, metadata=metadata)
    lgr.unicode_database = unidb.manager.get_db_by_version(unicode_version)
    validating_repertoire = get_by_name(validating_repertoire_name) if validating_repertoire_name else None
    lgr_info = cls(name, lgr=lgr, validating_repertoire=validating_repertoire)
    return lgr_info
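
# Hedged usage sketch for the factory method above. The enclosing class (an
# LGR "info"-style wrapper), the `unidb.manager` helper and `get_by_name()`
# come from the original context and are not defined in this excerpt; the
# name `LGRInfo` below is a hypothetical stand-in for that class.
# lgr_info = LGRInfo.create('example-lgr', '6.3.0', None)
# print(lgr_info.lgr.metadata.unicode_version)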
class LineParser(LGRParser):

    def unicode_version(self):
        # No Unicode version defined for now
        return ""

    def validate_document(self, schema):
        # No validation for now
        return True

    def parse_document(self):
        self._lgr = LGR(name=self.filename)
        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        :param rule_file: Content of the rule, as a file-like object.
        """
        line_num = 0
        for line in rule_file:
            line_num += 1
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == '#':
                continue

            codepoints = []
            for cp in UNICODE_CODEPOINT_RE.finditer(line):
                try:
                    codepoints.append(int(cp.group(1), 16))
                except ValueError:
                    logger.error("Invalid code point '%s' at line %d",
                                 cp.group(1), line_num)
            try:
                self._lgr.add_cp(codepoints)
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)
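
# Hedged usage sketch for LineParser. The LGRParser base constructor is assumed
# (from its use above) to accept a path or file-like object as `source`;
# 'codepoints.txt' is a hypothetical input file listing one code point per line.
def _example_line_parse(path='codepoints.txt'):
    parser = LineParser(path)
    return parser.parse_document()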
def test_lgr_validating_repertoire(self):
    validating_repertoire = LGR(name='validating')
    __, result = rebuild_lgr(self.lgr,
                             {'validating_repertoire': validating_repertoire})
    self.assertDictEqual(result,
                         {'description': "Rebuilding LGR with Unicode version {} "
                                         "and validating repertoire '{}'".format(
                                             self.DEFAULT_UNICODE_VERSION,
                                             validating_repertoire),
                          'repertoire': {}})
def intersect_lgrs(lgr1, lgr2):
    """
    Compute the intersection of 2 LGRs and return a valid LGR.

    Note: Ranges have to be expanded before calling this function.

    :param lgr1: First LGR.
    :param lgr2: Second LGR.
    :return: New LGR: intersection of two inputs.
    """
    name = 'Intersection of %s and %s' % (lgr1.name, lgr2.name)

    lgr1.expand_ranges()
    lgr2.expand_ranges()

    # Note: We need to create a copy (copy.deepcopy) for some elements
    # otherwise they could reference the original objects.
    metadata = copy.deepcopy(intersect_metadata(lgr1.metadata, lgr2.metadata))
    lgr = LGR(name=name, metadata=metadata)

    # No need to copy references, they are new objects
    references = intersect_reference_manager(lgr1.reference_manager,
                                             lgr2.reference_manager)
    lgr.reference_manager = references

    first_cps = {c.cp for c in lgr1.repertoire}
    second_cps = {c.cp for c in lgr2.repertoire}

    # No need to copy char, they are new objects
    for cp in set.intersection(first_cps, second_cps):
        char1 = lgr1.get_char(cp)
        char2 = lgr2.get_char(cp)
        intersect_char(lgr, char1, char2)

    (actions, actions_xml) = intersect_actions(lgr1, lgr2)
    lgr.actions = copy.deepcopy(actions)
    lgr.actions_xml = actions_xml

    (rules, rules_xml) = intersect_rules(lgr1, lgr2)
    lgr.rules = copy.deepcopy(rules)
    lgr.rules_xml = rules_xml

    (classes, classes_xml) = intersect_classes(lgr1, lgr2)
    lgr.classes = copy.deepcopy(classes)
    lgr.classes_xml = classes_xml

    return lgr
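
# Hedged usage sketch for intersect_lgrs(): two tiny LGRs sharing a single code
# point. This assumes LGR() instances with default metadata are acceptable to
# the intersect_* helpers used above.
def _example_intersect():
    lgr_a = LGR(name='A')
    lgr_a.add_cp([0x0061])
    lgr_a.add_cp([0x0062])
    lgr_b = LGR(name='B')
    lgr_b.add_cp([0x0061])
    common = intersect_lgrs(lgr_a, lgr_b)
    # Only U+0061 belongs to both repertoires.
    assert 0x0061 in common.repertoire
    return common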
class TestXmlValidity(unittest.TestCase):

    def setUp(self):
        self.lgr = LGR()

    def test_no_validation(self):
        success, result = check_xml_validity(self.lgr, {})
        self.assertTrue(success)
        self.assertDictEqual(result, {})

    def test_invalid_xml_lgr(self):
        self.lgr.add_cp(0x0061, when='#when')
        success, result = check_xml_validity(self.lgr,
                                             {'rng_filepath': os.path.join(RESOURCE_DIR, 'lgr.rng')})
        self.assertIn('validation_result', result)
        validation_result = result['validation_result']
        self.assertFalse(success)
        self.assertDictEqual(result,
                             {'description': 'Testing XML validity using RNG',
                              'rng_result': False,
                              'validation_result': validation_result})
class TestConditionalVariants(unittest.TestCase):

    def setUp(self):
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.DEBUG)
        logging.getLogger('lgr.validate').addHandler(ch)

    def test_empty_lgr(self):
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_cp([0x0062])
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)

    def test_no_rule(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertGreater(len(log_content), 0)
        self.assertEqual(log_content,
                         "CP U+0061: Variant 'U+0062' \"when\" attribute "
                         "'when-rule' is not an existing rule name.\n")

    def test_conditional_ok(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062], when="when-rule")
        self.lgr.rules.append("when-rule")
        check_conditional_variants(self.lgr, {})
        log_content = self.log_output.getvalue()
        self.assertEqual(len(log_content), 0)
def test_merge_chars(self):
    merged_lgr = LGR()
    # Need to merge references first - OK since tested in previous test
    reference_mapping = {}
    merge_references(self.lgr_1, 'fr', merged_lgr, reference_mapping)
    merge_references(self.lgr_2, 'und-Khmer', merged_lgr, reference_mapping)

    merge_chars(self.lgr_1, 'fr', merged_lgr, reference_mapping, [])
    # Simple variant changed to blocked
    cp = merged_lgr.get_char(0x0041)
    self.assertIn('1', cp.references)
    variants = list(cp.get_variants())
    self.assertEqual(len(variants), 1)
    var = variants[0]
    self.assertEqual(var.cp, (0x0061, ))
    self.assertEqual(var.type, 'blocked')

    # Complete merge
    merge_chars(self.lgr_2, 'und-Khmer', merged_lgr, reference_mapping, [])
    self._test_merged_chars(merged_lgr)
def test_merge_actions(self):
    merged_lgr = LGR()

    lgr = LGR()
    lgr.add_action(Action(match='rule-name', disp='invalid'))
    lgr.actions_xml.append("""<action disp="invalid" match="rule-name"/>""")
    merge_actions(lgr, 'fr', merged_lgr, {})
    self.assertEqual(len(merged_lgr.actions), 1)
    self.assertEqual(len(merged_lgr.actions_xml), 1)
    self.assertEqual(merged_lgr.actions[0].match, 'fr-rule-name')

    # Default action should not be merged
    lgr = LGR()
    lgr.add_action(Action(disp='invalid', comment="Default action for invalid",
                          any_variant=['invalid']))
    lgr.actions_xml.append("""<action disp="invalid" match="rule-name"/>""")
    merge_actions(lgr, 'fr', merged_lgr, {})
    self.assertEqual(len(merged_lgr.actions), 1)
    self.assertEqual(len(merged_lgr.actions_xml), 1)
    self.assertEqual(merged_lgr.actions[0].match, 'fr-rule-name')
def test_merge_references(self):
    merged_lgr = LGR()
    reference_mapping = {}

    merge_references(self.lgr_1, 'fr', merged_lgr, reference_mapping)
    self.assertEqual(len(reference_mapping), 1)
    self.assertIn('fr', reference_mapping)
    self.assertEqual(reference_mapping['fr'], {})

    merge_references(self.lgr_2, 'und-Khmer', merged_lgr, reference_mapping)
    self.assertEqual(len(reference_mapping), 2)
    self.assertIn('und-Khmer', reference_mapping)
    self.assertEqual(reference_mapping['und-Khmer'], {
        '0': '3',
        '1': '4',  # Generated
    })
def merge_lgr_set(lgr_set, name):
    """
    Merge LGRs from a set.

    :param lgr_set: The list of LGRs in the set
    :param name: Merged LGR name
    :return: New LGR (merge of LGR set)
    """
    logger.debug("Merge %s", name)

    # Order LGRs: 'und-' (undetermined language) scripts sort last
    lgr_set.sort(key=lambda x: get_script(x).replace('und-', 'zzz'))

    # Ensure all Unicode versions in the set are consistent
    unicode_version = OrderedDict().fromkeys(lgr.metadata.unicode_version
                                             for lgr in lgr_set)
    if len(unicode_version) > 1:
        logger.warning("Different Unicode versions in set: %s",
                       unicode_version.keys())

    ref_mapping = {}
    metadata = copy.deepcopy(merge_metadata(lgr_set))
    merged_lgr = LGR(name=name, metadata=metadata)
    previous_scripts = []
    for lgr in lgr_set:
        script = get_script(lgr)
        lgr.expand_ranges()

        merge_references(lgr, script, merged_lgr, ref_mapping)
        merge_chars(lgr, script, merged_lgr, ref_mapping, previous_scripts)
        merge_actions(lgr, script, merged_lgr, ref_mapping)
        merge_rules(lgr, script, merged_lgr, ref_mapping)
        merge_classes(lgr, script, merged_lgr, ref_mapping)
        previous_scripts.append(script)

    # XXX As the created merged_lgr is not a valid Python LGR object,
    # we have to serialize it/parse it to get a valid object.
    merged_lgr_xml = BytesIO(serialize_lgr_xml(merged_lgr))
    lgr_parser = XMLParser(source=merged_lgr_xml, filename=name)
    return lgr_parser.parse_document()
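
# Hedged usage sketch for merge_lgr_set(): get_script() relies on each LGR
# declaring its language/script in the metadata, so the arguments below are
# assumed to be fully parsed per-script LGRs (e.g. one French, one Khmer).
def _example_merge_set(lgr_fr, lgr_khmer):
    merged = merge_lgr_set([lgr_fr, lgr_khmer], 'example-merged-set')
    return merged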
def make_idna_repertoire(version):
    """
    Make a repertoire from IDNA tables.

    Parse IDNA table registry, convert it to an LGR XML format,
    and output it on stdout.

    Input:
        * version: The unicode version to use.
    """
    from lgr.core import LGR
    from lgr.parser.xml_serializer import serialize_lgr_xml

    lgr = LGR('idna2008-%s' % version)

    idna_url = IDNATABLES_URL.format(version=version)
    logger.debug("Fetching and parsing '%s'", idna_url)
    registry = etree.parse(idna_url)

    # To keep '{}' when string-formatting
    namespace = "{{{0}}}".format(IDNATABLES_NS)

    registry_id = "idna-tables-properties"
    if list(map(int, version.split('.'))) <= [6, 0, 0]:
        registry_id = "idna-tables-{}-properties".format(version)

    record_xpath = '{0}registry[@id="{1}"]/{0}record'.format(namespace,
                                                             registry_id)

    for record in registry.findall(record_xpath):
        codepoint = record.find(CODEPOINT_TAG).text
        prop = record.find(PROPERTY_TAG).text

        if prop not in ['PVALID', 'CONTEXTO', 'CONTEXTJ']:
            continue

        if codepoint.find('-') > 0:
            # Codepoint is a range
            (first_cp, last_cp) = [int(c, 16) for c in codepoint.split('-')]
            lgr.add_range(first_cp, last_cp)
        else:
            # Single codepoint
            lgr.add_cp(int(codepoint, 16))

    lgr_root = serialize_lgr_xml(lgr, pretty_print=True, encoding='unicode',
                                 xml_declaration=False)
    print(lgr_root)
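
# Usage sketch for make_idna_repertoire(): fetches the IDNA tables for the
# given Unicode version from the IANA registry (network access required) and
# prints the generated LGR XML on stdout.
def _example_idna_repertoire():
    make_idna_repertoire('6.3.0')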
def rebuild_lgr(lgr, options):
    """
    Rebuild an LGR with given parameters.

    options argument can contain:
        * unicode_version: The target Unicode version to be used
          when rebuilding the LGR. If None is given, use the current one.
        * validating_repertoire: The validating repertoire used
          for checking code points.
        * unidb: Munidata's Unicode database. If None, skip Unicode checks.

    :param LGR lgr: The LGR to rebuild.
    :param dict options: Dictionary of options to the validation function.
    """
    # Local import to prevent import cycles
    from lgr.core import LGR

    unicode_version = options.get('unicode_version',
                                  lgr.metadata.unicode_version)
    validating_repertoire = options.get('validating_repertoire', None)

    description = "Rebuilding LGR with Unicode version {}".format(unicode_version)
    if validating_repertoire is not None:
        description += " and validating repertoire '{}'".format(validating_repertoire)
    result = {
        'description': description,
        'repertoire': {}  # XXX: Cannot use defaultdict because of django...
    }

    logger.info("Rebuilding LGR '%s' with Unicode version %s "
                "and Validating Repertoire '%s'",
                lgr, unicode_version, validating_repertoire)

    unidb = options.get('unidb', None)
    if unidb is not None:
        unidb_version = unidb.get_unicode_version()
        if unidb_version != unicode_version:
            result['generic'] = "Target Unicode version {} " \
                                "differs from UnicodeDatabase {}".format(unicode_version,
                                                                         unidb_version)
            logger.warning("Target Unicode version %s differs "
                           "from UnicodeDatabase %s",
                           unicode_version, unidb_version)

    # For now, simply copy the metadata and references of the source LGR
    target_metadata = copy.deepcopy(lgr.metadata)
    target_metadata.unicode_version = unicode_version
    target_reference_manager = copy.deepcopy(lgr.reference_manager)

    target_lgr = LGR(name=lgr.name,
                     metadata=target_metadata,
                     reference_manager=target_reference_manager,
                     unicode_database=unidb)

    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            range_ok = True
            for cp, status in target_lgr.check_range(char.first_cp, char.last_cp,
                                                     validating_repertoire):
                if status is not None:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'errors', []).append(status)
                    range_ok = False
                in_script, _ = lgr.cp_in_script([cp])
                if not in_script:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'warnings', []).append(CharNotInScript(cp))
                    range_ok = False

            if not range_ok:
                continue

            try:
                target_lgr.add_range(char.first_cp, char.last_cp,
                                     comment=char.comment,
                                     ref=char.references,
                                     tag=char.tags,
                                     when=char.when, not_when=char.not_when,
                                     validating_repertoire=validating_repertoire,
                                     override_repertoire=False)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault('errors', []).append(exc)
                logger.error("Cannot add range '%s-%s'",
                             format_cp(char.first_cp),
                             format_cp(char.last_cp))
            continue

        in_script, _ = lgr.cp_in_script(char.cp)
        if not in_script:
            result['repertoire'].setdefault(char, {}).setdefault(
                'warnings', []).append(CharNotInScript(char.cp))

        # Insert code point
        try:
            target_lgr.add_cp(char.cp,
                              comment=char.comment,
                              ref=char.references,
                              tag=char.tags,
                              when=char.when, not_when=char.not_when,
                              validating_repertoire=validating_repertoire,
                              override_repertoire=False)
        except LGRException as exc:
            result['repertoire'].setdefault(char, {}).setdefault('errors', []).append(exc)
            logger.error("Cannot add code point '%s'", format_cp(char.cp))
            if not isinstance(exc, CharInvalidIdnaProperty):  # Cannot include non-IDNA valid code points
                target_lgr.add_cp(char.cp,
                                  comment=char.comment,
                                  ref=char.references,
                                  tag=char.tags,
                                  when=char.when, not_when=char.not_when,
                                  force=True)

        # Create variants
        for var in char.get_variants():
            try:
                target_lgr.add_variant(char.cp,
                                       variant_cp=var.cp,
                                       variant_type=var.type,
                                       when=var.when, not_when=var.not_when,
                                       comment=var.comment,
                                       ref=var.references,
                                       validating_repertoire=validating_repertoire,
                                       override_repertoire=True)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault(
                    'variants', {}).setdefault(var, []).append(exc)
                logger.error("Cannot add variant '%s' to code point '%s'",
                             format_cp(var.cp), format_cp(char.cp))
                if not isinstance(exc, CharInvalidIdnaProperty):  # Cannot include non-IDNA valid code points
                    target_lgr.add_variant(char.cp,
                                           variant_cp=var.cp,
                                           variant_type=var.type,
                                           when=var.when, not_when=var.not_when,
                                           comment=var.comment,
                                           ref=var.references,
                                           force=True)

    logger.info("Rebuilding LGR '%s' done", lgr)

    return True, result
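
# Hedged usage sketch for rebuild_lgr(): the option keys are the ones documented
# in the docstring above; omitting 'unidb' skips the Unicode property checks.
def _example_rebuild(lgr, validating_repertoire):
    success, result = rebuild_lgr(lgr, {'unicode_version': '6.3.0',
                                        'validating_repertoire': validating_repertoire})
    return success, result['description']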
class RFC3743Parser(LGRParser): def unicode_version(self): # No Unicode version defined in file return "" def validate_document(self, schema=None): # No validation of document done for now return "" def parse_document(self): if not self.filename and isinstance(self.source, str): self.filename = os.path.basename(self.source) self._lgr = LGR(name=self.filename) logger.debug('Start parsing of file: %s', self.filename) if hasattr(self.source, "read"): self._parse_doc(self.source) else: with io.open(self.source, 'r', encoding='utf-8') as rule_file: self._parse_doc(rule_file) return self._lgr def _parse_doc(self, rule_file): """ Actual parsing of document. :param rule_file: Content of the rule, as a file-like object. """ line_num = 0 for line in rule_file: line_num += 1 line = line.strip() if len(line) == 0: continue if line[0] == '#': continue reference = REFERENCE_RE.match(line) if reference is not None: ref_id = reference.group('ref_id') value = reference.group('value') comment = reference.group('comment') try: self._lgr.add_reference(value, ref_id=ref_id, comment=comment) except LGRException: logger.error("Invalid reference '%s' on line %d", line, line_num) continue version = VERSION_RE.match(line) if version is not None: version_no = version.group('version_no') date = version.group('date') comment = version.group('comment') try: self._lgr.metadata.version = Version(version_no, comment=comment) self._lgr.metadata.date = date except LGRException: logger.error("Invalid version '%s' on line %d", line, line_num) continue if UNICODE_CODEPOINT_RE.match(line) is None: logger.debug("Skipping non-parsable line %d:\n%s", line_num, line) # Line is not starting with a valid unicode code point, skip continue # Split base character from variant(s) char_variant = line.split(';') char = char_variant[0] try: [(codepoints, references)] = parse_char(char) self._lgr.add_cp(codepoints, ref=references) except ValueError: logger.error("Invalid character '%s' at line %d", char, line_num) except LGRException as exc: logger.error("Cannot add code point '%s' at line %d: %s", format_cp(codepoints), line_num, exc) if len(char_variant) > 1: preferred_variants = char_variant[1].strip() if len(preferred_variants ) > 0 and preferred_variants[0] != '#': # From RFC7940, Section 7.3. Recommended Disposition Values: # activated The resulting string should be activated for use. (This # is the same as a Preferred Variant [RFC3743].) var_type = "activated" self.insert_variant(line_num, codepoints, preferred_variants, var_type) if len(char_variant) > 2: variants = char_variant[2].strip() if len(variants) > 0 and variants[0] != '#': self.insert_variant(line_num, codepoints, variants) def insert_variant(self, line_num, codepoints, var, var_type=None): try: variants = parse_char(var) except ValueError: logger.error("Invalid variant '%s' at line %d", var, line_num) return for (var_codepoints, references) in variants: try: self._lgr.add_variant(codepoints, var_codepoints, ref=references, variant_type=var_type) except LGRException as exc: logger.error( "Cannot add variant '%s' to code point '%s' at line %d: %s", format_cp(var_codepoints), format_cp(codepoints), line_num, exc)
class RFC4290Parser(LGRParser):

    def unicode_version(self):
        # No Unicode version defined in file
        return ""

    def validate_document(self, schema=None):
        # No validation of document done for now
        return True

    def parse_document(self):
        if not self.filename and isinstance(self.source, str):
            self.filename = os.path.basename(self.source)

        self._lgr = LGR(name=self.filename)
        logger.debug('Start parsing of file: %s', self.filename)

        if hasattr(self.source, "read"):
            self._parse_doc(self.source)
        else:
            with io.open(self.source, 'r', encoding='utf-8') as rule_file:
                self._parse_doc(rule_file)

        return self._lgr

    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        :param rule_file: Content of the rule, as a file-like object.
        """
        line_num = 0
        for line in rule_file:
            line_num += 1
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == '#':
                continue
            if UNICODE_CODEPOINT_RE.match(line) is None:
                # Line is not starting with a valid unicode code point, skip
                continue

            # Remove comments and split base character from variant(s)
            char_variant = line.split('#')[0].split('|')
            char = char_variant[0]
            try:
                codepoints = parse_char(char)
                self._lgr.add_cp(codepoints)
            except ValueError:
                logger.error("Invalid character '%s' at line %d",
                             char, line_num)
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            # Handle variants, if any
            if len(char_variant) > 1:
                variants = char_variant[1].split(':')
                for var in variants:
                    try:
                        var_codepoints = parse_char(var)
                        self._lgr.add_variant(codepoints, var_codepoints)
                    except ValueError:
                        logger.error("Invalid variant '%s' at line %d",
                                     var, line_num)
                    except LGRException as exc:
                        logger.error("Cannot add variant '%s' to code point '%s' at line %d: %s",
                                     format_cp(var_codepoints), format_cp(codepoints),
                                     line_num, exc)
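
# Hedged usage sketch for RFC4290Parser: as above, the base constructor is
# assumed to accept a path as `source`; 'table.txt' is a hypothetical RFC
# 4290-style table (e.g. lines of the form "0061|0062:0063").
def _example_rfc4290(path='table.txt'):
    parser = RFC4290Parser(path)
    return parser.parse_document()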
class TestLGRCore(unittest.TestCase): def setUp(self): unidb = IDNADatabase('6.3.0') self.lgr = LGR(unicode_database=unidb) def test_add_single_cp_list(self): self.lgr.add_cp([0x0061]) self.assertIn(0x0061, self.lgr.repertoire) def test_add_single_cp_int(self): self.lgr.add_cp(0x0061) self.assertIn(0x0061, self.lgr.repertoire) def test_add_cp_sequence(self): self.lgr.add_cp([0x0061, 0x0062]) self.assertIn([0x0061, 0x0062], self.lgr.repertoire) self.assertNotIn(0x0061, self.lgr.repertoire) self.assertNotIn(0x0062, self.lgr.repertoire) def test_add_multiple_cp_sequences(self): self.lgr.add_cp([0x0061, 0x0062]) self.lgr.add_cp([0x0061, 0x0062, 0x0063]) self.assertIn([0x0061, 0x0062], self.lgr.repertoire) self.assertIn([0x0061, 0x0062, 0x0063], self.lgr.repertoire) self.assertNotIn(0x0061, self.lgr.repertoire) self.assertNotIn(0x0062, self.lgr.repertoire) self.assertNotIn(0x0063, self.lgr.repertoire) def test_add_cp_in_repertoire(self): self.lgr.add_cp([0x0061]) self.assertRaises(CharAlreadyExists, self.lgr.add_cp, [0x0061]) self.assertRaises(CharAlreadyExists, self.lgr.add_cp, 0x0061) def test_add_cp_validation(self): validation_lgr = LGR() validation_lgr.add_cp([0x0061]) self.lgr.add_cp([0x0061], validating_repertoire=validation_lgr, override_repertoire=False) self.assertRaises(NotInRepertoire, self.lgr.add_cp, [0x0062], validating_repertoire=validation_lgr, override_repertoire=False) def test_add_cp_validation_override(self): validation_lgr = LGR() validation_lgr.add_cp([0x0061]) self.lgr.add_cp([0x0061], validating_repertoire=validation_lgr, override_repertoire=False) self.lgr.add_cp([0x0062], validating_repertoire=validation_lgr, override_repertoire=True) self.assertIn(0x0062, self.lgr.repertoire) def test_del_single_cp_list(self): self.lgr.add_cp(0x0061) self.lgr.del_cp([0x0061]) self.assertNotIn(0x0061, self.lgr.repertoire) def test_del_single_cp_int(self): self.lgr.add_cp([0x0061]) self.lgr.del_cp(0x0061) self.assertNotIn(0x0061, self.lgr.repertoire) def test_del_cp_sequence(self): self.lgr.add_cp([0x0061, 0x0062]) self.lgr.del_cp([0x0061, 0x0062]) self.assertEqual(len(self.lgr.repertoire), 0) def test_del_cp_sequence_with_cp(self): self.lgr.add_cp([0x0061, 0x0062]) self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0061) self.assertRaises(NotInLGR, self.lgr.del_cp, 0x0062) self.assertIn([0x0061, 0x0062], self.lgr.repertoire) def test_add_cp_when_not_when(self): self.lgr.add_cp([0x0061], when='w1') with self.assertRaises(CharInvalidContextRule) as cm: self.lgr.add_cp([0x0062], when='w2', not_when='nw1') the_exception = cm.exception self.assertEqual(the_exception.cp, [0x0062]) self.lgr.add_cp([0x0062], not_when='nw2') with self.assertRaises(CharInvalidContextRule) as cm: self.lgr.add_cp([0x0063], when='w3', not_when='nw3') the_exception = cm.exception self.assertEqual(the_exception.cp, [0x0063]) def test_add_range(self): self.lgr.add_range(0x0061, 0x007A) for cp in range(0x0061, 0x007A + 1): self.assertIn(cp, self.lgr.repertoire) def test_add_range_in_repertoire(self): self.lgr.add_range(0x0061, 0x007A) self.assertRaises(CharAlreadyExists, self.lgr.add_range, 0x0061, 0x007A) def test_add_range_validation(self): validation_lgr = LGR() for cp in range(0x0061, 0x007A + 1): validation_lgr.add_cp(cp) self.lgr.add_range(0x0061, 0x007A, validating_repertoire=validation_lgr, override_repertoire=False) self.assertRaises(NotInRepertoire, self.lgr.add_range, 0x00F8, 0x00FF, validating_repertoire=validation_lgr, override_repertoire=False) def test_add_range_validation_with_range(self): validation_lgr = 
LGR() validation_lgr.add_range(0x0061, 0x007A) self.lgr.add_range(0x0061, 0x007A, validating_repertoire=validation_lgr, override_repertoire=False) self.assertRaises(NotInRepertoire, self.lgr.add_range, 0x00F8, 0x00FF, validating_repertoire=validation_lgr, override_repertoire=False) def test_add_range_validation_override(self): validation_lgr = LGR() for cp in range(0x0061, 0x007A): validation_lgr.add_cp(cp) self.lgr.add_range(0x0031, 0x0032, validating_repertoire=validation_lgr, override_repertoire=True) self.assertIn(0x0031, self.lgr.repertoire) def test_add_range_when_not_when(self): self.lgr.add_range(0x0061, 0x0065, when='w1') with self.assertRaises(RangeInvalidContextRule) as cm: self.lgr.add_range(0x0066, 0x007A, when='w2', not_when='nw1') the_exception = cm.exception self.assertEqual(the_exception.first_cp, 0x0066) self.assertEqual(the_exception.last_cp, 0x007A) self.lgr.add_range(0x0066, 0x007A, not_when='nw2') with self.assertRaises(RangeInvalidContextRule) as cm: self.lgr.add_range(0x01BD, 0x01C3, when='w3', not_when='nw3') the_exception = cm.exception self.assertEqual(the_exception.first_cp, 0x01BD) self.assertEqual(the_exception.last_cp, 0x01C3) def test_expand_ranges(self): self.lgr.add_range(0x0061, 0x007A) for cp in range(0x0061, 0x007A + 1): self.assertIsInstance(self.lgr.get_char(cp), RangeChar) self.lgr.add_range(0x01BD, 0x01C3) for cp in range(0x01BD, 0x01C3 + 1): self.assertIsInstance(self.lgr.get_char(cp), RangeChar) self.lgr.expand_ranges() for cp in range(0x0061, 0x007A + 1): char = self.lgr.get_char(cp) self.assertIsInstance(char, Char) self.assertNotIsInstance(char, RangeChar) for cp in range(0x01BD, 0x01C3 + 1): char = self.lgr.get_char(cp) self.assertIsInstance(char, Char) self.assertNotIsInstance(char, RangeChar) def test_expand_range(self): self.lgr.add_range(0x0061, 0x007A) for cp in range(0x0061, 0x007A + 1): self.assertIsInstance(self.lgr.get_char(cp), RangeChar) self.lgr.expand_range(0x0061, 0x007A) for cp in range(0x0061, 0x007A + 1): char = self.lgr.get_char(cp) self.assertIsInstance(char, Char) self.assertNotIsInstance(char, RangeChar) def test_add_variant_in_repertoire(self): self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0030]) self.assertRaises(VariantAlreadyExists, self.lgr.add_variant, [0x0061], [0x0030]) def test_add_variant_validation(self): validation_lgr = LGR() validation_lgr.add_cp([0x0061]) validation_lgr.add_cp([0x0030]) self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0030]) self.assertRaises(NotInRepertoire, self.lgr.add_variant, [0x0061], [0x0062], validating_repertoire=validation_lgr, override_repertoire=False) def test_add_variant_when_not_when(self): self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0030], when='w1') with self.assertRaises(VariantInvalidContextRule) as cm: self.lgr.add_variant([0x0061], [0x0031], when='w2', not_when='nw1') the_exception = cm.exception self.assertEqual(the_exception.cp, [0x0061]) self.assertEqual(the_exception.variant, [0x0031]) self.lgr.add_variant([0x0061], [0x0030], not_when='nw2') with self.assertRaises(VariantInvalidContextRule) as cm: self.lgr.add_variant([0x0061], [0x0031], when='w3', not_when='nw3') the_exception = cm.exception self.assertEqual(the_exception.cp, [0x0061]) self.assertEqual(the_exception.variant, [0x0031]) def test_del_cp_validation_override(self): validation_lgr = LGR() validation_lgr.add_cp([0x0061]) validation_lgr.add_cp([0x0030]) self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0030]) self.lgr.add_variant([0x0061], [0x0062], 
validating_repertoire=validation_lgr, override_repertoire=True) self.assertIn((0x0062, ), self.lgr.repertoire[0x0061]._variants) def test_get_variants(self): self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0030]) variants = self.lgr.get_variants([0x0061]) self.assertIsInstance(variants, types.GeneratorType) variant_list = list(variants) self.assertEqual(len(variant_list), 1) def test_check_range_no_modification(self): self.lgr.check_range(0x0060, 0x007F) self.assertEqual(len(self.lgr.repertoire), 0) def test_check_range(self): self.lgr.add_cp([0x0061]) self.lgr.add_cp([0x007A]) codepoints = self.lgr.check_range(0x0060, 0x007F) for result in codepoints: cp = result[0] prop = result[1] if cp == 0x060 or cp >= 0x007B: self.assertIsInstance(prop, CharInvalidIdnaProperty) elif cp == 0x0061 or cp == 0x007A: self.assertIsInstance(prop, CharAlreadyExists) else: self.assertIsNone(prop) def test_add_codepoints(self): self.lgr.add_codepoints([c for c in range(0x0061, 0x007A + 1)] + [0x0107] + [0x0137, 0x0138]) expected_output = [ RangeChar(0x061, 0x0061, 0x007A), Char(0x0107), RangeChar(0x0137, 0x0137, 0x0138) ] self.assertEqual(expected_output, list(self.lgr.repertoire)) def test_tags_on_codepoint(self): self.lgr.add_cp([0x0061], tag=['t1', 't2']) with self.assertRaises(LGRFormatException) as cm: self.lgr.add_cp([0x0062], tag=['t1', 't1']) the_exception = cm.exception self.assertEqual(the_exception.reason, LGRFormatException.LGRFormatReason.DUPLICATE_TAG) def test_tags_on_codepoint_sequence(self): with self.assertRaises(LGRFormatException) as cm: self.lgr.add_cp([0x0061, 0x0062], tag=['t1']) the_exception = cm.exception self.assertEqual(the_exception.reason, LGRFormatException.LGRFormatReason.SEQUENCE_NO_TAG) def test_tags_on_range(self): self.lgr.add_range(0x0061, 0x0062, tag=['t1', 't2']) with self.assertRaises(LGRFormatException) as cm: self.lgr.add_range(0x0063, 0x0064, tag=['t1', 't1']) the_exception = cm.exception self.assertEqual(the_exception.reason, LGRFormatException.LGRFormatReason.DUPLICATE_TAG) def test_list_types(self): self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0030], variant_type='BLOCK') self.lgr.add_variant([0x0061], [0x0031], variant_type='VALID') self.lgr.add_variant([0x0061], [0x0032], variant_type='BLOCK') self.assertEquals(self.lgr.types, set(['BLOCK', 'VALID'])) def test_del_reference(self): ref_id_1 = self.lgr.add_reference("Test - 1") ref_id_2 = self.lgr.add_reference("Test - 2") self.lgr.add_cp([0x0061], ref=[ref_id_1]) self.lgr.add_cp([0x0062], ref=[ref_id_1, ref_id_2]) self.lgr.del_reference(ref_id_1) self.assertNotIn(ref_id_1, self.lgr.reference_manager) self.assertEquals(self.lgr.get_char([0x0061]).references, []) self.assertEquals(self.lgr.get_char([0x0062]).references, [ref_id_2]) def test_add_cp_duplicate_reference(self): ref_id = self.lgr.add_reference("Test - 1") with self.assertRaises(DuplicateReference) as cm: self.lgr.add_cp([0x0061], ref=[ref_id, ref_id]) the_exception = cm.exception self.assertEqual(the_exception.cp, [0x0061]) def test_add_range_duplicate_reference(self): ref_id = self.lgr.add_reference("Test - 1") with self.assertRaises(DuplicateReference) as cm: self.lgr.add_range(0x0061, 0x0062, ref=[ref_id, ref_id]) the_exception = cm.exception self.assertEqual(the_exception.cp, 0x0061) def test_add_variant_duplicate_reference(self): self.lgr.add_cp([0x0061]) ref_id = self.lgr.add_reference("Test - 1") with self.assertRaises(DuplicateReference) as cm: self.lgr.add_variant([0x0061], [0x0062], ref=[ref_id, ref_id]) the_exception 
= cm.exception self.assertEqual(the_exception.cp, [0x0061]) def test_generate_variants(self): self.lgr.add_cp([0x0061]) self.lgr.add_cp([0x0062]) self.lgr.add_cp([0x0063]) self.lgr.add_cp([0x0064]) self.lgr.add_variant([0x0061], [0x0070], variant_type="type0") self.lgr.add_variant([0x0062], [0x0071], variant_type="type1") self.lgr.add_variant([0x0062], [0x0072], variant_type="type2") self.assertEqual([], list(self.lgr._generate_label_variants([]))) self.assertEqual([], list(self.lgr._generate_label_variants([0x0063]))) self.assertEqual( [], list(self.lgr._generate_label_variants([0x0063, 0x0064]))) self.assertEqual( set([((0x0071, 0x0063), frozenset(['type1']), False), ((0x0072, 0x0063), frozenset(['type2']), False)]), set(self.lgr._generate_label_variants([0x0062, 0x0063]))) self.assertEqual( set([ ((0x0061, 0x0062), frozenset(), False), ((0x0061, 0x0071), frozenset(['type1']), False), ((0x0061, 0x0072), frozenset(['type2']), False), ((0x0070, 0x0062), frozenset(['type0']), False), ((0x0070, 0x0071), frozenset(['type0', 'type1']), True), ((0x0070, 0x0072), frozenset(['type0', 'type2']), True), ]), set(self.lgr._generate_label_variants([0x0061, 0x0062]))) self.assertEqual( set([ ((0x0061, 0x0062, 0x0062), frozenset(), False), ((0x0061, 0x0062, 0x0071), frozenset(['type1']), False), ((0x0061, 0x0062, 0x0072), frozenset(['type2']), False), ((0x0061, 0x0071, 0x0062), frozenset(['type1']), False), ((0x0061, 0x0071, 0x0071), frozenset(['type1']), False), ((0x0061, 0x0071, 0x0072), frozenset(['type1', 'type2']), False), ((0x0061, 0x0072, 0x0062), frozenset(['type2']), False), ((0x0061, 0x0072, 0x0071), frozenset(['type1', 'type2']), False), ((0x0061, 0x0072, 0x0072), frozenset(['type2']), False), ((0x0070, 0x0062, 0x0062), frozenset(['type0']), False), ((0x0070, 0x0062, 0x0071), frozenset(['type0', 'type1']), False), ((0x0070, 0x0062, 0x0072), frozenset(['type0', 'type2']), False), ((0x0070, 0x0071, 0x0062), frozenset(['type0', 'type1']), False), ((0x0070, 0x0071, 0x0071), frozenset(['type0', 'type1']), True), ((0x0070, 0x0071, 0x0072), frozenset(['type0', 'type1', 'type2']), True), ((0x0070, 0x0072, 0x0062), frozenset(['type0', 'type2']), False), ((0x0070, 0x0072, 0x0071), frozenset(['type0', 'type1', 'type2']), True), ((0x0070, 0x0072, 0x0072), frozenset(['type0', 'type2']), True), ]), set(self.lgr._generate_label_variants([0x0061, 0x0062, 0x0062]))) def test_generate_variants_reflexive(self): self.lgr.add_cp([0x0061]) self.lgr.add_cp([0x0062]) self.lgr.add_cp([0x0063]) self.lgr.add_variant([0x0062], [0x0062], variant_type="reflexive") self.lgr.add_variant([0x0063], [0x0070], variant_type="type") self.assertEqual([], list(self.lgr._generate_label_variants([]))) self.assertEqual([], list(self.lgr._generate_label_variants([0x0061]))) self.assertEqual([((0x0062, ), frozenset(['reflexive']), True)], list(self.lgr._generate_label_variants([0x0062]))) self.assertEqual( set([ ((0x0062, 0x0063), frozenset(['reflexive']), False), ((0x0062, 0x0070), frozenset(['reflexive', 'type']), True), ]), set(self.lgr._generate_label_variants([0x0062, 0x0063]))) def test_label_simple(self): self.lgr.add_cp([0x0061]) self.lgr.add_cp([0x0062, 0x0063]) self.lgr.add_range(0x0064, 0x0068) valid_labels = ([0x0061], [0x0062, 0x0063], [0x0064], [0x0068], [0x0061, 0x0064], [0x0061, 0x0062, 0x0063, 0x0064], [0x0062, 0x0063, 0x0068]) invalid_labels = (([0x0060], [], [0x0060]), ([0x0069], [], [0x0069]), ([0x0062], [], [0x0062]), ([0x0063], [], [0x0063]), ([0x0061, 0x0062], [0x0061], [0x0062])) for label in valid_labels: 
self.assertEqual((True, label, []), self.lgr._test_preliminary_eligibility(label)) for (label, label_part, not_in_lgr) in invalid_labels: self.assertEqual((False, label_part, not_in_lgr), self.lgr._test_preliminary_eligibility(label)) def test_label_eligibility_multiple_choices(self): self.lgr.add_cp([0x0061]) self.lgr.add_cp([0x0061, 0x0062, 0x0063]) self.lgr.add_cp([0x0064]) self.assertEqual(self.lgr._test_preliminary_eligibility([0x0062]), (False, [], [0x0062])) self.assertEqual( self.lgr._test_preliminary_eligibility( [0x0061, 0x0062, 0x0063, 0x0064]), (True, [0x0061, 0x0062, 0x0063, 0x0064], [])) def test_label_delayed_eligibilty(self): self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0061], 'block') self.lgr.add_cp([0x0062]) self.lgr.add_variant([0x0062], [0x0062], 'invalid') self.lgr.add_cp([0x0063, 0x0064]) self.lgr.add_variant([0x0063, 0x0064], [0x0063, 0x0064], 'invalid') self.assertEqual(self.lgr._test_label_disposition([0x0062]), ('invalid', 0)) self.assertEqual(self.lgr._test_label_disposition([0x0063, 0x0064]), ('invalid', 0)) self.assertEqual(self.lgr._test_label_disposition([0x0061, 0x0062]), ('invalid', 0)) def test_label_length(self): self.lgr.add_cp([0x0061]) self.lgr.add_variant([0x0061], [0x0061], 'disp') self.lgr.add_cp([0x0062]) self.lgr.add_variant([0x0062], [0x0062], 'disp') self.assertEqual(PROTOCOL_LABEL_MAX_LENGTH, self.lgr.max_label_length()) for i in range(80): self.lgr.add_variant([0x0062], [0x074D + i], 'disp') # 41: mean number of variants per character self.assertEqual(int(math.log(MAX_NUMBER_GENERATED_VARIANTS, 41)), self.lgr.max_label_length())
def setUp(self):
    self.lgr = LGR()
    self.root = etree.Element('lgr', nsmap=NSMAP)
class XMLParser(LGRParser): # Keep content intact, so do not strip CDATA section # (used in the <meta>/<description> element). # Do not resolve entities. # Skip comment, as we do not care. PARSER_OPTIONS = { 'resolve_entities': False, 'strip_cdata': False, 'remove_comments': True } def __init__(self, *args, **kwargs): if 'force_mode' in kwargs: force_mode = kwargs['force_mode'] del kwargs['force_mode'] else: force_mode = True super(XMLParser, self).__init__(*args, **kwargs) self.force_mode = force_mode self.rfc7940_checks = LGRFormatTestResults() def validate_document(self, rng_schema_path): # Construct the RelaxNG validator schema = etree.RelaxNG(file=rng_schema_path) # Parse the XML file parser = etree.XMLParser(**self.PARSER_OPTIONS) doc = etree.parse(self.source, parser=parser) logger.debug("Validating document '%s' with RNG '%s'", self.source, rng_schema_path) error_log = None if not schema.validate(doc): logger.warning("Validation of document '%s' failed", self.source) self.rfc7940_checks.error('schema') error_log = schema.error_log if len(error_log) == 0: # Bug in LXML, see https://bugs.launchpad.net/lxml/+bug/1526522 error_log = "CANNOT VALIDATE XML" self.rfc7940_checks.tested('schema') return error_log def unicode_version(self): logger.debug("Get unicode version from meta") # Only parse the "meta" element # Skip comment, as we do not care. context = etree.iterparse(self.source, tag=META_TAG, **self.PARSER_OPTIONS) self._fast_iter(context) unicode_version = self._lgr.metadata.unicode_version self._lgr = None # FD is now potentially at the end of the documents, # set it back to start if hasattr(self.source, "seek"): self.source.seek(0) return unicode_version def parse_document(self): logger.debug('Start parsing of file: %s', self.filename) # Keep content intact, so do not strip CDATA section # (used in the <meta>/<description> element). # Do not resolve entities. # Skip comment, as we do not care. context = etree.iterparse(self.source, **self.PARSER_OPTIONS) self._fast_iter(context) # FD is now potentially at the end of the documents, # set it back to start if hasattr(self.source, "seek"): self.source.seek(0) self.rfc7940_checks.tested('parse_xml') return self._lgr def _process_meta(self, elem): """ Process the <meta> element of an LGR XML file. 
""" metadata = Metadata(self.rfc7940_checks) reference_manager = ReferenceManager() MAPPER = { DATE_TAG: lambda d: metadata.set_date(d, force=self.force_mode), VALIDITY_START_TAG: lambda d: metadata.set_validity_start(d, force=self.force_mode), VALIDITY_END_TAG: lambda d: metadata.set_validity_end(d, force=self.force_mode), UNICODE_VERSION_TAG: lambda d: metadata.set_unicode_version(d, force=self.force_mode), } unicode_version_tag_found = False for child in elem: tag = child.tag logger.debug("Got '%s' element", tag) if tag in MAPPER: MAPPER[tag](child.text) if tag == UNICODE_VERSION_TAG: unicode_version_tag_found = True elif tag == VERSION_TAG: metadata.version = Version(child.text, child.get('comment', None)) elif tag == LANGUAGE_TAG: metadata.add_language(child.text, force=self.force_mode) elif tag == SCOPE_TAG: metadata.scopes.append( Scope(child.text, child.get('type', None))) elif tag == DESCRIPTION_TAG: # Seems to be an issue with CDATA/iterparse: https://bugs.launchpad.net/lxml/+bug/1788449 # For now, manually replace CRLF with LF metadata.description = Description( child.text.replace('\r\n', '\n'), child.get('type', None)) elif tag == REFERENCES_TAG: for reference in child: value = reference.text # Don't convert it to an int since ref_id may be a string ref_id = reference.get('id') comment = reference.get('comment', None) reference_manager.add_reference(value, comment=comment, ref_id=ref_id) # Since we have processed <reference> elements here, let's clean-up child.clear() else: logger.warning("Unhandled '%s' element in <meta> section", tag) self.rfc7940_checks.error('parse_xml') child.clear() self.rfc7940_checks.add_test_result('explicit_unicode_version', unicode_version_tag_found) self._lgr = LGR(name=self.filename, metadata=metadata, reference_manager=reference_manager, unicode_database=self._unicode_database) def _process_data(self, elem): """ Process the <data> element of an LGR XML file. """ # It is RECOMMENDED to list all "char" elements in ascending order of # the "cp" attribute. The below variable is used when verifying that. 
previous_codepoint = [] for child in elem: comment = child.get('comment', None) when = child.get('when', None) not_when = child.get('not-when', None) # Handle references ref = string_to_list(child.get('ref', '')) # Handle tags tag = string_to_list(child.get('tag', '')) if child.tag == CHAR_TAG: codepoint = [int(c, 16) for c in child.get('cp').split()] if codepoint <= previous_codepoint: if previous_codepoint[0:len(codepoint)] == codepoint: # Not clear what order is to be recommended here self.rfc7940_checks.error( 'char_strict_ascending_order') else: logger.warning( "cp attribute not in ascending order: '%s'", child.get('cp')) self.rfc7940_checks.error('char_ascending_order') previous_codepoint = codepoint try: self._lgr.add_cp(codepoint, comment=comment, ref=ref, tag=tag, when=when, not_when=not_when, force=self.force_mode) except LGRException as exc: logger.error("Cannot add code point '%s': %s", format_cp(codepoint), exc) self.rfc7940_checks.error('parse_xml') self.rfc7940_checks.error('codepoint_valid') if not self.force_mode: raise # Variants of char for variant in child.iter(VARIANT_TAG): var_codepoint = [ int(c, 16) for c in variant.get('cp').split() ] when = variant.get('when', None) not_when = variant.get('not-when', None) variant_type = variant.get('type', None) comment = variant.get('comment', None) # Handle references ref = string_to_list(variant.get('ref', '')) try: self._lgr.add_variant(codepoint, var_codepoint, variant_type=variant_type, when=when, not_when=not_when, comment=comment, ref=ref, force=self.force_mode) except LGRException as exc: logger.error( "Cannot add variant '%s' " "to code point '%s': %s", format_cp(var_codepoint), format_cp(codepoint), exc) self.rfc7940_checks.error('parse_xml') self.rfc7940_checks.error('codepoint_valid') if not self.force_mode: raise elif child.tag == RANGE_TAG: first_cp = int(child.get('first-cp'), 16) last_cp = int(child.get('last-cp'), 16) try: self._lgr.add_range(first_cp, last_cp, comment=comment, ref=ref, tag=tag, when=when, not_when=not_when, force=self.force_mode) except LGRException as exc: self.rfc7940_checks.error('parse_xml') self.rfc7940_checks.error('codepoint_valid') logger.error("Cannot add range '%s-%s': %s", format_cp(first_cp), format_cp(last_cp), exc) if not self.force_mode: raise child.clear() self.rfc7940_checks.tested('char_ascending_order') self.rfc7940_checks.tested('char_strict_ascending_order') def _process_rules(self, elem): """ Process the <rules> element of an LGR XML file. """ # Keep "text" version of the rules since we don't do anything with them. for child in elem: if child.tag in COMBINATOR_TAGS + (CLASS_TAG, ): cls = self._parse_class(child) self._lgr.add_class(cls, force=self.force_mode) child = drop_ns(child) self._lgr.classes_xml.append( etree.tostring(child, encoding=text_type)) elif child.tag == RULE_TAG: rule = self._parse_rule(child) self._lgr.add_rule(rule, force=self.force_mode) child = drop_ns(child) self._lgr.rules_xml.append( etree.tostring(child, encoding=text_type)) elif child.tag == ACTION_TAG: action = self._parse_action(child) self._lgr.add_action(action, force=self.force_mode) child = drop_ns(child) self._lgr.actions_xml.append( etree.tostring(child, encoding=text_type)) else: logger.warning("Unhandled '%s' element in <rules> section", child.tag) self.rfc7940_checks.error("parse_xml") child.clear() def _parse_rule(self, elem): """ Parse a <rule> element. :return: The rule object created. 
""" rule = Rule(name=elem.get('name', None), comment=elem.get('comment', None), ref=string_to_list(elem.get('ref', '')), by_ref=elem.get('by-ref', None)) for child in elem: self._parse_rule_helper(child, rule) return rule def _parse_rule_helper(self, child, rule): """ Helper to parse the content of a <rule> element. This function is to be called on children of a top-level <rule>. :param child: Child element of a top-level <rule> element. :param rule: The top-level rule element to add the content to. """ tag = child.tag comment = child.get('comment', None) count = child.get('count', None) if tag == ANCHOR_TAG: rule.add_child(AnchorMatcher(comment=comment)) elif tag == ANY_TAG: rule.add_child(AnyMatcher(comment=comment, count=count)) elif tag == CHAR_TAG: rule.add_child( CharMatcher(cp_or_sequence_from_class(child), comment=comment, count=count)) elif tag == CHOICE_TAG: choice = ChoiceMatcher(comment=comment, count=count) for matcher in child: self._parse_rule_helper(matcher, choice) rule.add_child(choice) elif tag == END_TAG: rule.add_child(EndMatcher(comment=comment)) elif tag == LOOKAHEAD_TAG: look_ahead = LookAheadMatcher(comment=comment) for matcher in child: self._parse_rule_helper(matcher, look_ahead) rule.add_child(look_ahead) elif tag == LOOKBEHIND_TAG: look_behind = LookBehindMatcher(comment=comment) for matcher in child: self._parse_rule_helper(matcher, look_behind) rule.add_child(look_behind) elif tag == START_TAG: rule.add_child(StartMatcher(comment=comment)) elif tag == RULE_TAG: child_rule = self._parse_rule(child) rule.add_child( RuleMatcher(child_rule, comment=comment, count=count)) elif tag == CLASS_TAG or tag in COMBINATOR_TAGS: rule.add_child( ClassMatcher(self._parse_class(child), comment=comment, count=count)) else: logger.warning("Unhandled '%s' element in <rule> object", tag) self.rfc7940_checks.error('parse_xml') def _parse_action(self, elem): """ Parse an <action> element. :return: The action object created. """ disp = elem.get('disp') comment = elem.get('comment', None) match = elem.get('match', None) not_match = elem.get('not-match', None) any_variant = string_to_list(elem.get('any-variant', '')) all_variants = string_to_list(elem.get('all-variants', '')) only_variants = string_to_list(elem.get('only-variants', '')) return Action(disp, comment=comment, ref=string_to_list(elem.get('ref', '')), match=match, not_match=not_match, any_variant=any_variant, all_variants=all_variants, only_variants=only_variants) def _parse_class(self, elem): """ Parse an <class> element. :return: The Class object created. 
""" tag = elem.tag name = elem.get('name', None) comment = elem.get('comment', None) if tag == CLASS_TAG: cls = Class(name=name, comment=comment, ref=string_to_list(elem.get('ref', '')), from_tag=elem.get('from-tag', None), unicode_property=elem.get('property', None), by_ref=elem.get('by-ref', None)) if len(elem) == 0 and elem.text: # No child, code point(s) defined in text cls.add_codepoint(cp_or_sequence_from_class(elem)) for child in elem: cls.add_codepoint(cp_or_sequence_from_class(child)) elif tag in COMBINATOR_TAGS: MAPPING = { UNION_TAG: UnionClass, COMPLEMENT_TAG: ComplementClass, INTERSECTION_TAG: IntersectionClass, DIFFERENCE_TAG: DifferenceClass, SYM_DIFFERENCE_TAG: SymmetricDifferenceClass } cls = MAPPING[tag](name=name, comment=comment) # TODO: ensure number of children for child in elem: cls.add_child(self._parse_class(child)) else: logger.warning("Unhandled '%s' element in <class> object", tag) self.rfc7940_checks.error('parse_xml') return cls def _fast_iter(self, context): """ Iterator used to incrementally parse the XML file. """ metadata_added = False for _, elem in context: if not metadata_added and elem == DATA_TAG: # The optional "meta" element is not present since it must # preceed the required data element. # However, we still have to call _process_meta self._process_meta({}) metadata_added = True if elem.tag == META_TAG: logger.debug("Got 'meta' element") self._process_meta(elem) elif elem.tag == DATA_TAG: logger.debug("Got 'data' element") self._process_data(elem) elif elem.tag == RULES_TAG: logger.debug("Got 'rules' element") self._process_rules(elem) else: continue # Clean-up memory elem.clear() del context
def union_lgrs(lgr1, lgr2):
    """
    Compute the union of 2 LGRs and return a valid LGR.

    Note: Ranges have to be expanded before calling this function.

    :param lgr1: First LGR.
    :param lgr2: Second LGR.
    :return: New LGR: union of two inputs.
    """
    name = 'Union of %s and %s' % (lgr1.name, lgr2.name)
    logger.debug("Union of %s", name)

    lgr1.expand_ranges()
    lgr2.expand_ranges()

    # Note: We need to create a copy (copy.deepcopy) for some elements
    # otherwise they could reference the original objects.
    metadata = copy.deepcopy(union_metadata(lgr1.metadata, lgr2.metadata))
    lgr = LGR(name=name, metadata=metadata)

    # No need to copy references, they are new objects
    references = union_reference_manager(lgr1.reference_manager,
                                         lgr2.reference_manager)
    lgr.reference_manager = references

    first_cps = {c.cp for c in lgr1.repertoire}
    second_cps = {c.cp for c in lgr2.repertoire}

    # No need to copy char, they are new objects
    # Compute union of all common code points
    for cp in set.intersection(first_cps, second_cps):
        char1 = lgr1.get_char(cp)
        char2 = lgr2.get_char(cp)
        union_char(lgr, char1, char2)

    # Append all other code points
    for cp in set.difference(first_cps, second_cps):
        char = lgr1.get_char(cp)
        lgr.add_cp(char.cp,
                   comment=char.comment,
                   #ref=char.references,
                   tag=char.tags,
                   when=char.when,
                   not_when=char.not_when)
    for cp in set.difference(second_cps, first_cps):
        char = lgr2.get_char(cp)
        lgr.add_cp(char.cp,
                   comment=char.comment,
                   #ref=char.references,
                   tag=char.tags,
                   when=char.when,
                   not_when=char.not_when)

    (actions, actions_xml) = union_actions(lgr1, lgr2)
    lgr.actions = copy.deepcopy(actions)
    lgr.actions_xml = actions_xml

    (rules, rules_xml) = union_rules(lgr1, lgr2)
    lgr.rules = copy.deepcopy(rules)
    lgr.rules_xml = rules_xml

    (classes, classes_xml) = union_classes(lgr1, lgr2)
    lgr.classes = copy.deepcopy(classes)
    lgr.classes_xml = classes_xml

    return lgr
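
# Hedged usage sketch for union_lgrs(), mirroring the intersection example:
# code points from either input appear in the result. Again assumes default
# metadata is acceptable to the union_* helpers.
def _example_union():
    lgr_a = LGR(name='A')
    lgr_a.add_cp([0x0061])
    lgr_b = LGR(name='B')
    lgr_b.add_cp([0x0062])
    merged = union_lgrs(lgr_a, lgr_b)
    assert 0x0061 in merged.repertoire
    assert 0x0062 in merged.repertoire
    return merged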
class TestPopulate(unittest.TestCase):

    def setUp(self):
        self.lgr = LGR()
        # Configure log system to redirect validation logs to local attribute
        self.log_output = StringIO()
        ch = logging.StreamHandler(self.log_output)
        ch.setLevel(logging.INFO)
        logger = logging.getLogger('lgr.populate')
        logger.addHandler(ch)
        logger.setLevel(logging.INFO)

    def test_no_symmetric_in_repertoire(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])

        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add missing code point 'U+0062' in LGR as it is a variant of 'U+0061'\n"
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n",
            log_content)
        self.assertIn(0x0062, self.lgr.repertoire)
        new_variant = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, )],
                         [c.cp for c in new_variant.get_variants()])

    def test_no_symmetric_in_repertoire_twice(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_variant([0x0061], [0x0063])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0061])
        self.lgr.add_variant([0x0062], [0x0063])

        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add missing code point 'U+0063' in LGR as it is a variant of 'U+0061'\n"
            "Add code point 'U+0061' as variant of 'U+0063' for symmetry\n"
            "Add code point 'U+0062' as variant of 'U+0063' for symmetry\n",
            log_content)
        self.assertIn(0x0063, self.lgr.repertoire)
        new_variant = self.lgr.get_char([0x0063])
        self.assertEqual([(0x0061, ), (0x0062, )],
                         [c.cp for c in new_variant.get_variants()])

    def test_no_symmetric_in_variants(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])

        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n",
            log_content)
        cp = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, )], [c.cp for c in cp.get_variants()])

    def test_no_transitivity(self):
        self.lgr.add_cp([0x0061])
        self.lgr.add_variant([0x0061], [0x0062])
        self.lgr.add_cp([0x0062])
        self.lgr.add_variant([0x0062], [0x0063])
        self.lgr.add_cp([0x0063])

        populate_lgr(self.lgr)
        log_content = self.log_output.getvalue()
        self.assertEqual(
            "Add code point 'U+0061' as variant of 'U+0062' for symmetry\n"
            "Add code point 'U+0062' as variant of 'U+0063' for symmetry\n"
            "Add code point 'U+0063' as variant of 'U+0061' for transitivity with 'U+0062'\n"
            "Add code point 'U+0061' as variant of 'U+0063' for transitivity with 'U+0062'\n",
            log_content)
        cp = self.lgr.get_char([0x0061])
        self.assertEqual([(0x0062, ), (0x0063, )],
                         [c.cp for c in cp.get_variants()])
        cp = self.lgr.get_char([0x0062])
        self.assertEqual([(0x0061, ), (0x0063, )],
                         [c.cp for c in cp.get_variants()])
        cp = self.lgr.get_char([0x0063])
        self.assertEqual([(0x0061, ), (0x0062, )],
                         [c.cp for c in cp.get_variants()])
def test_merge_rules(self):
    merged_lgr = LGR()

    lgr = LGR()
    rule = Rule(name='rule-name')
    anonymous_class = UnionClass()
    anonymous_class.add_child(Class(codepoints=[0x0061]))
    anonymous_class.add_child(Class(codepoints=[0x0062]))
    rule.add_child(ClassMatcher(anonymous_class))
    rule_xml = """
<rule name="rule-name">
  <union>
    <class>0x0061</class>
    <class>0x0062</class>
  </union>
</rule>
"""
    lgr.add_rule(rule)
    lgr.rules_xml.append(rule_xml)

    merge_rules(lgr, 'fr', merged_lgr, {})
    self.assertEqual(len(merged_lgr.rules), 1)
    self.assertEqual(len(merged_lgr.rules_xml), 1)
    self.assertEqual(merged_lgr.rules[0], 'fr-rule-name')

    # Merging is idempotent
    merge_rules(lgr, 'fr', merged_lgr, {})
    self.assertEqual(len(merged_lgr.rules), 1)
    self.assertEqual(len(merged_lgr.rules_xml), 1)
    self.assertEqual(merged_lgr.rules[0], 'fr-rule-name')

    # Not with different script
    merge_rules(lgr, 'en', merged_lgr, {})
    self.assertEqual(len(merged_lgr.rules), 2)
    self.assertEqual(len(merged_lgr.rules_xml), 2)
    self.assertEqual(merged_lgr.rules[1], 'en-rule-name')

    # Nor with MSR2
    lgr = LGR()
    rule = Rule(name='leading-combining-mark')
    rule.add_child(StartMatcher())
    anonymous_class = UnionClass()
    anonymous_class.add_child(Class(unicode_property="gc:Mn"))
    anonymous_class.add_child(Class(unicode_property="gc:Mc"))
    lgr.add_rule(rule)
    lgr.rules_xml.append("""
<rule name="leading-combining-mark" comment="WLE Rule1: default WLE rule matching labels with leading combining marks ⍟">
  <start />
  <union>
    <class property="gc:Mn" />
    <class property="gc:Mc" />
  </union>
</rule>
""")

    merge_rules(lgr, 'fr', merged_lgr, {})
    self.assertEqual(len(merged_lgr.rules), 3)
    self.assertEqual(len(merged_lgr.rules_xml), 3)
    self.assertEqual(merged_lgr.rules[2], 'Common-leading-combining-mark')

    merge_rules(lgr, 'fr', merged_lgr, {})
    self.assertEqual(len(merged_lgr.rules), 3)
    self.assertEqual(len(merged_lgr.rules_xml), 3)
    self.assertEqual(merged_lgr.rules[2], 'Common-leading-combining-mark')