def check_conditional_variants(lgr, options):
    """
    Verify that variant "when"/"not-when" attributes name existing rules.

    Every variant of every non-range character is inspected; an attribute
    which is set but cannot be found in ``lgr.rules`` is reported through a
    warning.

    :param LGR lgr: The LGR to check.
    :param options: Dictionary of options to the validation function - unused.
    :return: Always True, problems are only reported through the logger.
    """
    logger.info("Testing conditional variants")

    for char in lgr.repertoire:
        if not isinstance(char, RangeChar):  # Ranges have no variants
            for variant in char.get_variants():
                when_rule = variant.when
                not_when_rule = variant.not_when

                if not (when_rule is None or when_rule in lgr.rules):
                    logger.warning(
                        "CP %s: Variant '%s' \"when\" attribute "
                        "'%s' is not an existing rule name.",
                        format_cp(char.cp), format_cp(variant.cp), when_rule)

                if not (not_when_rule is None or not_when_rule in lgr.rules):
                    logger.warning(
                        "CP %s: Variant '%s' \"not-when\" attribute "
                        "'%s' is not an existing rule name.",
                        format_cp(char.cp), format_cp(variant.cp),
                        not_when_rule)

    logger.info("Conditional variants test done")
    return True
def check_symmetry(lgr, options):
    """
    Verify that variant relations are symmetric.

    Whenever B is listed as a variant of A, A is expected to be listed as a
    variant of B. Only the presence of the code point is compared; the other
    variant properties (type, when, not-when) are ignored.

    :param lgr: The LGR to be tested.
    :param options: Dictionary of options to the validation function - unused.
    :return: Always True, problems are only reported through the logger.
    """
    logger.info("Testing symmetry")

    for origin in lgr.repertoire:
        if isinstance(origin, RangeChar):
            # Ranges have no variants
            continue

        for variant in origin.get_variants():
            if variant.cp not in lgr.repertoire:
                # The variant itself is missing from the repertoire
                logger.warning('CP %s: Variant %s is not in repertoire.',
                               format_cp(origin.cp), format_cp(variant.cp))
                continue

            # The variant is part of the repertoire: look for the original
            # character among the variant's own variants
            reverse_cps = [var.cp for var in lgr.get_variants(variant.cp)]
            if origin.cp not in reverse_cps:
                logger.warning('CP %s should have CP %s in its variants.',
                               format_cp(variant.cp), format_cp(origin.cp))

    logger.info("Symmetry test done")
    return True
def add_range(self, first_cp, last_cp, comment=None, ref=None, tag=None,
              when=None, not_when=None, skip_check=False):
    """
    Add a range of characters to the LGR.

    One ``RangeChar`` is inserted per code point of the range. If inserting
    any of them fails, the code points already inserted by this call are
    removed again before the error is propagated, so that a failed call
    does not leave a partial range behind.

    :param first_cp: First code point of the range.
    :param last_cp: Last code point of the range.
    :param comment: Comment associated to the range.
    :param ref: List of references associated to the range.
    :param tag: List of tags associated to the range.
    :param when: Condition to be satisfied by the code point.
    :param not_when: Condition to not be satisfied by the codepoint.
    :param skip_check: If True, skips checking for overlapping ranges.
                       Invalid use of this parameter may leave the
                       dictionary in an inconsistent state!
    :raises RangeAlreadyExists: If input already exists in dictionary.

    >>> cd = Repertoire()
    >>> cd.add_range(0x002A, 0x0030)
    >>> 0x02A in cd
    True
    >>> c = cd[0x002A]
    >>> isinstance(c, RangeChar)
    True
    >>> c.first_cp == 0x002A and c.last_cp == 0x0030
    True
    >>> cd.add_range(0x002A, 0x0030) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    RangeAlreadyExists:
    """
    assert first_cp < last_cp, "range must be defined in order"

    if not skip_check and self._check_range_overlap(first_cp, last_cp):
        logger.error("Range '%s - %s' already exists",
                     format_cp(first_cp), format_cp(last_cp))
        raise RangeAlreadyExists(first_cp, last_cp)

    inserted = []
    completed = False
    try:
        for cp in range(first_cp, last_cp + 1):
            char = RangeChar(cp, first_cp, last_cp,
                             comment=comment, ref=ref, tag=tag,
                             when=when, not_when=not_when)
            self._add_char(char)
            inserted.append(char)
        completed = True
    finally:
        if not completed:
            # Roll back only what this call inserted; the pending exception
            # keeps propagating once the clean-up is done.
            for char in reversed(inserted):
                self._del_char(char)

    # Register the range once all of its code points are in place
    self.ranges.append((first_cp, last_cp))
def _parse_doc(self, rule_file):
    """
    Actual parsing of document.

    Each relevant line has the form ``<char>|<variant>:<variant>...``,
    optionally followed by a ``#`` comment. Empty lines, comment lines and
    lines which do not start with a Unicode code point are ignored.

    A line whose base character cannot be parsed is skipped entirely, so
    that its variants are never attached to the character of a previous
    line.

    :param rule_file: Content of the rule, as a file-like object.
    """
    for line_num, line in enumerate(rule_file, 1):
        line = line.strip()
        if not line or line[0] == '#':
            continue

        if UNICODE_CODEPOINT_RE.match(line) is None:
            # Line is not starting with a valid unicode code point, skip
            continue

        # Remove comments and split base character from variant(s)
        char_variant = line.split('#')[0].split('|')
        char = char_variant[0]

        try:
            codepoints = parse_char(char)
            self._lgr.add_cp(codepoints)
        except ValueError:
            logger.error("Invalid character '%s' at line %d",
                         char, line_num)
            # No usable base character: there is nothing the variants of
            # this line could be attached to.
            continue
        except LGRException as exc:
            # Variants are still processed below: the code point was parsed
            # correctly even though it could not be added.
            logger.error("Cannot add code point '%s' at line %d: %s",
                         format_cp(codepoints), line_num, exc)

        # Handle variants, if any
        if len(char_variant) > 1:
            for var in char_variant[1].split(':'):
                try:
                    var_codepoints = parse_char(var)
                    self._lgr.add_variant(codepoints, var_codepoints)
                except ValueError:
                    logger.error("Invalid variant '%s' at line %d",
                                 var, line_num)
                except LGRException as exc:
                    logger.error(
                        "Cannot add variant '%s' to code point '%s' at line %d: %s",
                        format_cp(var_codepoints), format_cp(codepoints),
                        line_num, exc)
def _parse_doc(self, rule_file):
    """
    Actual parsing of document.

    All the code points found on a line (as matched by
    ``UNICODE_CODEPOINT_RE``) are added to the LGR as one code point or
    sequence. Empty lines, comment lines and lines on which no code point
    could be read are ignored.

    :param rule_file: Content of the rule, as a file-like object.
    """
    for line_num, line in enumerate(rule_file, 1):
        line = line.strip()
        if not line or line[0] == '#':
            continue

        codepoints = []
        for match in UNICODE_CODEPOINT_RE.finditer(line):
            try:
                codepoints.append(int(match.group(1), 16))
            except ValueError:
                # Report the offending text, not the match object
                logger.error("Invalid code point '%s' at line %d",
                             match.group(1), line_num)

        if not codepoints:
            # Nothing usable on this line: do not submit an empty sequence
            logger.debug("Skipping line %d: no code point found", line_num)
            continue

        try:
            self._lgr.add_cp(codepoints)
        except LGRException as exc:
            logger.error("Cannot add code point '%s' at line %d: %s",
                         format_cp(codepoints), line_num, exc)
def del_char(self, cp_or_sequence):
    """
    Remove a character from the LGR.

    :param cp_or_sequence: code point or code point sequence to delete.
    :raises NotInLGR: If the code point does not exist.

    >>> cd = Repertoire()
    >>> _ = cd.add_char([0x002A])
    >>> 0x002A in cd
    True
    >>> cd.del_char([0x002A])
    >>> 0x002A in cd
    False
    >>> cd.del_char([0x002B]) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    NotInLGR:
    """
    assert len(cp_or_sequence), "there should be at least one char"

    removed = self._del_char(CharBase.from_cp_or_sequence(cp_or_sequence))
    if removed:
        return

    # Nothing was removed: the character is unknown
    logger.error("Code point '%s' does not exist",
                 format_cp(cp_or_sequence))
    raise NotInLGR(cp_or_sequence)
def get_char(self, cp_or_sequence):
    """
    Retrieve the char object stored for a code point or sequence.

    :param cp_or_sequence: Code point or sequence of the character to get.
    :return: The stored character object equal to the requested one.
    :raises NotInLGR: If the code point does not exist.

    >>> cd = Repertoire()
    >>> char = cd.add_char([0x002A])
    >>> c = cd.get_char([0x002A])
    >>> c is char
    True
    """
    assert len(cp_or_sequence), "there should be at least one char"

    wanted = CharBase.from_cp_or_sequence(cp_or_sequence)
    key = wanted.as_index()
    if key not in self._chardict:
        raise NotInLGR(cp_or_sequence)

    # Several characters may share the same index: pick the matching one
    candidates = self._chardict[key]
    try:
        return candidates[candidates.index(wanted)]
    except ValueError:
        logger.error("Code point '%s' does not exist",
                     format_cp(cp_or_sequence))
        raise NotInLGR(cp_or_sequence)
def check_conditional_variants(lgr, options):
    """
    Verify that variant "when"/"not-when" attributes name existing rules.

    Every variant of every non-range character is inspected; an attribute
    which is set but cannot be found in ``lgr.rules`` is logged as a warning
    and recorded in the result.

    :param LGR lgr: The LGR to check.
    :param options: Dictionary of options to the validation function - unused.
    :return: Tuple (success, result). ``success`` is False as soon as one
             unknown rule name was found; ``result`` is a dictionary with a
             'description' and the list of findings under 'repertoire'.
    """
    logger.info("Testing conditional variants")

    findings = []
    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # Ranges have no variants
            continue

        for variant in char.get_variants():
            when_rule = variant.when
            not_when_rule = variant.not_when

            if not (when_rule is None or when_rule in lgr.rules):
                logger.warning(
                    "CP %s: Variant '%s' \"when\" attribute "
                    "'%s' is not an existing rule name.",
                    format_cp(char.cp), format_cp(variant.cp), when_rule)
                findings.append({
                    'char': char,
                    'variant': variant,
                    'rule_type': 'when',
                    'rule': when_rule
                })

            if not (not_when_rule is None or not_when_rule in lgr.rules):
                logger.warning(
                    "CP %s: Variant '%s' \"not-when\" attribute "
                    "'%s' is not an existing rule name.",
                    format_cp(char.cp), format_cp(variant.cp), not_when_rule)
                findings.append({
                    'char': char,
                    'variant': variant,
                    'rule_type': 'not-when',
                    'rule': not_when_rule
                })

    logger.info("Conditional variants test done")

    # The check succeeds exactly when nothing was recorded
    report = {
        'description': 'Testing conditional variants',
        'repertoire': findings
    }
    return not findings, report
def _compare(labels, label1_indexes, label2_indexes):
    """
    Generate a textual comparison of labels between two sets of indexes.

    For each label, the dispositions of entries sharing the same code points
    are compared, then the entries only present on one side are listed.

    :param labels: Mapping of each label to a pair (index1, index2), the keys
                   used to look the label up in the two index dictionaries.
    :param label1_indexes: Index dictionary of the first LGR.
    :param label2_indexes: Index dictionary of the second LGR.
    :return: Generator of output chunks (strings and ``MD`` markers).
    :raises InvalidSymmetry: If an added or removed entry holds no rules for
                             the label being compared.
    """
    for label in labels:
        cp_display = format_cp(tuple([ord(c) for c in label]))
        (index1, index2) = labels[label]
        bidi_label = "%s'%s'%s" % (LRI, label, PDI)
        yield "\n## Comparison on label {label} [{cp}]\n".format(
            label=bidi_label, cp=cp_display)

        yield "\n### Test dispositions: ###\n"
        entries2 = label2_indexes[index2]
        entries1 = label1_indexes[index1]

        # change in disposition
        unchanged = True
        for entry2 in entries2:
            entry2_cp = entry2['cp']
            for entry1 in entries1:
                if entry1['cp'] != entry2_cp:
                    continue
                (same, report) = _compare_labels(label, entry1, entry2)
                unchanged &= same
                if report:
                    yield MD
                    yield report
                    yield MD

        if unchanged:
            yield MD
            yield 'No changes in disposition.'
            yield MD

        # change in number of variants
        yield "\n### Test number of variants: ###\n"
        added = _get_new_variants(entries2, entries1)
        removed = _get_new_variants(entries1, entries2)

        for new_entry in added:
            yield MD
            yield "New {cat} in LGR2:\n{label} [{cp}]\n".format(
                cat=new_entry['cat'],
                label=new_entry['bidi'],
                cp=new_entry['cp_out'])
            try:
                yield "\nRules for LGR2:\n{}".format(new_entry['rules'][label])
            except KeyError:
                raise InvalidSymmetry()
            yield MD

        for old_entry in removed:
            yield MD
            yield "Removed {cat} in LGR2:\n{label} [{cp}]\n".format(
                cat=old_entry['cat'],
                label=old_entry['bidi'],
                cp=old_entry['cp_out'])
            try:
                yield "\nRules for LGR1:\n{}".format(old_entry['rules'][label])
            except KeyError:
                raise InvalidSymmetry()
            yield MD

        if not (len(added) or len(removed)):
            yield MD
            yield 'No changes in number of variants.'
            yield MD
def check_transitivity(lgr, options):
    """
    Verify that variant relations are transitive.

    Whenever B is a variant of A and C a variant of B, C is expected to be a
    variant of A as well. Only the presence of the code point is compared;
    the other variant properties (type, when, not-when) are ignored.

    Note: This test assumes the LGR is symmetric.

    :param LGR lgr: The LGR to check.
    :param options: Dictionary of options to the validation function - unused.
    :return: Tuple (success, result). ``success`` is True if transitivity is
             achieved, False otherwise; ``result`` is a dictionary with a
             'description' and the missing relations under 'repertoire'.
    """
    success = True
    logger.info("Testing transitivity")
    missing = []

    for a in lgr.repertoire:
        if isinstance(a, RangeChar):
            # Ranges have no variants
            continue

        # get_variants() returns a generator, keep a reusable copy
        first_level = list(a.get_variants())
        first_level_cps = [var.cp for var in first_level]
        logger.debug("A: '%s'", format_cp(a.cp))

        for b in first_level:
            logger.debug("A: '%s' - B: '%s'",
                         format_cp(a.cp), format_cp(b.cp))
            try:
                b_variants = lgr.get_variants(b.cp)
            except NotInLGR:
                logger.error("Code point '%s' not in LGR", format_cp(b.cp))
                success = False
                continue

            # B is defined in the repertoire (symmetry is checked first).
            # Walk its variants, leaving out the original code point.
            second_level = [var for var in b_variants if var.cp != a.cp]
            for c in second_level:
                logger.debug("A: '%s' - B: '%s' - C: '%s'",
                             format_cp(a.cp), format_cp(b.cp),
                             format_cp(c.cp))
                if c.cp in first_level_cps:
                    continue
                success = False
                logger.warning("CP %s should have CP %s in its variants.",
                               format_cp(a.cp), format_cp(c.cp))
                lgr.notify_error('basic_transitivity')
                missing.append({'char': a, 'variant': c})

    logger.info("Transitivity test done")
    lgr.notify_tested('basic_transitivity')

    result = {'description': 'Testing transitivity', 'repertoire': missing}
    return success, result
def display_variant(variant):
    """
    Build a one-line, human readable description of a variant.

    :param variant: The variant to display.
    :return: String listing the code point, type, when, not-when and comment
             of the variant.
    """
    template = ("Variant {}: type={} - when={} - not-when={} - comment={}")
    fields = (format_cp(variant.cp),
              variant.type,
              variant.when,
              variant.not_when,
              variant.comment)
    return template.format(*fields)
def insert_variant(self, line_num, codepoints, var, var_type=None):
    """
    Parse a variant definition and attach the result to a code point.

    Parsing and insertion errors are logged, they are never propagated.

    :param line_num: Number of the line being parsed, used in log messages.
    :param codepoints: Code point(s) of the character receiving the variants.
    :param var: Text of the variant definition, handed to ``parse_char``.
    :param var_type: Type given to the inserted variants, if any.
    """
    try:
        parsed = parse_char(var)
    except ValueError:
        logger.error("Invalid variant '%s' at line %d", var, line_num)
    else:
        for (var_codepoints, references) in parsed:
            try:
                self._lgr.add_variant(codepoints,
                                      var_codepoints,
                                      ref=references,
                                      variant_type=var_type)
            except LGRException as exc:
                logger.error(
                    "Cannot add variant '%s' to code point '%s' at line %d: %s",
                    format_cp(var_codepoints), format_cp(codepoints),
                    line_num, exc)
def check_label(lgr, label, invalid, test):
    """
    Validate a label against an LGR and report its variants.

    Details go to the logger; the validation result, the disposition and one
    line per variant are emitted through ``write_output``.

    :param lgr: The LGR used to test the label.
    :param label: The label to test.
    :param invalid: Passed as ``include_invalid`` when computing the
                    disposition summary of the variants.
    :param test: Passed unchanged to ``write_output``.
    """
    from lgr.utils import format_cp

    def hex_list(cps):
        # Space separated, 4-digit uppercase hexadecimal code points
        return u' '.join(u"{:04X}".format(cp) for cp in cps)

    def triggered_action(idx):
        # Split an action index into the kind of action and its own index.
        # NOTE(review): an index equal to len(lgr.actions) is reported as a
        # regular action - confirm against how default actions are numbered.
        action_count = len(lgr.actions)
        if idx > action_count:
            return "DefaultAction", idx - action_count
        return "Action", idx

    label_cp = tuple([ord(c) for c in label])
    label_display = hex_list(label_cp)

    logger.info("- Code points: %s", label_display)

    (eligible, label_part, not_in_lgr, disp,
     action_idx, logs) = lgr.test_label_eligible(label_cp)
    logger.info("- Eligible: %s", eligible)
    logger.info("- Disposition: %s", disp)

    action_name, actual_index = triggered_action(action_idx)
    logger.info("- Action triggered: %s[%d]", action_name, actual_index)
    logger.info("- Logs: %s", logs)

    write_output(
        u"Validation: {} ({}): Result: {}".format(
            label, label_display, "valid" if eligible else "INVALID"), test)

    if not eligible:
        logger.info("- Valid code points from label: %s",
                    hex_list(label_part))
        logger.info("- Invalid code points from label: %s",
                    hex_list(not_in_lgr))
        return

    write_output(
        u"Disposition: {} ({}): Result: {} due to {}[{}]".format(
            label, label_display, disp, action_name, actual_index), test)

    summary, labels = lgr.compute_label_disposition_summary(
        label_cp, include_invalid=invalid)
    logger.info("Summary: %s", summary)

    for (variant_cp, var_disp, action_idx, disp_set, logs) in labels:
        variant_u = ''.join([unichr(c) for c in variant_cp])
        variant_display = hex_list(variant_cp)
        logger.info("\tVariant '%s'", variant_u)
        logger.info("\t- Code points: %s", format_cp(variant_cp))
        logger.info("\t- Disposition: '%s'", var_disp)

        action_name, actual_index = triggered_action(action_idx)
        logger.info("\t- Action triggered: %s[%d]",
                    action_name, actual_index)

        disp_set_display = '{%s}' % ','.join(disp_set)
        write_output(
            u"Variant: ({}): [{}] ==> {} due to {}[{}]".format(
                variant_display, disp_set_display, var_disp,
                action_name, actual_index), test)
        logger.info("\t- Logs: %s", logs)
def populate_lgr(lgr): """ Populate an LGR with missing variants, and fix symmetry and transitivity :param lgr: The LGR to be populated. :return: Result of checks and summary as a string """ # not in LGR variants for a in lgr.repertoire: for b in a.get_variants(): try: lgr.get_variants(b.cp) except NotInLGR: logger.info("Add missing code point '{}' in LGR as it is a variant of '{}'".format( format_cp(b.cp), format_cp(a.cp))) lgr.add_cp(b.cp) # add current code point as variant for missing code point logger.info("Add code point '{}' as variant of '{}' for symmetry".format(format_cp(a.cp), format_cp(b.cp))) lgr.add_variant(b.cp, a.cp, variant_type='blocked') while not check_symmetry(lgr, None)[0] or not check_transitivity(lgr, None)[0]: # symmetry for a in lgr.repertoire: for b in a.get_variants(): # Variant is defined in repertoire # let's see if the original character is in its # variants if a.cp not in [var.cp for var in lgr.get_variants(b.cp)]: logger.info("Add code point '{}' as variant of '{}' for symmetry".format(format_cp(a.cp), format_cp(b.cp))) lgr.add_variant(b.cp, a.cp, variant_type='blocked') # transitivity for a in lgr.repertoire: for b in a.get_variants(): for c in [var for var in lgr.get_variants(b.cp) if var.cp != a.cp]: # Iterate through all second-level variants # which are not the original code point if c.cp not in [var.cp for var in a.get_variants()]: logger.info("Add code point '{}' as variant of '{}' for transitivity with '{}'".format( format_cp(c.cp), format_cp(a.cp), format_cp(b.cp))) lgr.add_variant(a.cp, c.cp, variant_type='blocked')
def find_variants_to_block(lgr, label_ref, label):
    """
    Report the variants of a label which are also variants of a reference.

    Each variant of ``label`` that also appears among the variants generated
    for ``label_ref`` is written out, with its disposition set and current
    disposition, as a variant that should be blocked.

    :param lgr: The LGR used to generate variants and dispositions.
    :param label_ref: The reference label, as code points.
    :param label: The label whose variants are examined, as code points.
    """
    reference_variants = [
        cp for (cp, _, _) in lgr._generate_label_variants(label_ref)
    ]

    dispositions = lgr.compute_label_disposition(label)
    for (variant_cp, disp, _, _, disp_set, _) in dispositions:
        if variant_cp not in reference_variants:
            continue
        message = ("Variant '%s' [%s] with disposition set '%s' "
                   "should be blocked (current disposition :%s)")
        write_output(message % (cp_to_ulabel(variant_cp),
                                format_cp(variant_cp),
                                disp_set,
                                disp))
def cross_script_variants(lgr, labels_input):
    """
    Generate a report of the cross-script variants of labels.

    Invalid input labels are reported with their error, labels which are not
    eligible in the LGR are silently skipped.

    :param lgr: The LGR to use for variant generation.
    :param labels_input: The file containing the labels
    :return: Generator of report lines.
    :raises Exception: If the LGR has no metadata or no Unicode database
                       attached (raised when the generator is first
                       iterated).
    """
    if lgr.metadata is None:
        logger.error("Cannot generate cross-scripts variants "
                     "for LGR without metadata")
        raise Exception
    if lgr.unicode_database is None:
        logger.error("Cannot generate cross-scripts variants "
                     "for LGR without unicode database attached")
        raise Exception

    found = False
    for label, valid, error in read_labels(labels_input,
                                           lgr.unicode_database):
        if not valid:
            yield "Input label {}: {}\n".format(label, error)
            continue

        label_cp = tuple([ord(c) for c in label])
        eligible, _, _, _, _, _ = lgr.test_label_eligible(label_cp)
        if not eligible:
            continue

        header_sent = False
        for variant, disp, script_mapping in _generate_variants(lgr,
                                                                label_cp):
            if not header_sent:
                # Only display input label if it has x-variants
                yield "Input label {} ({}) has cross-script variants:\n".format(
                    format_cp(label_cp), label)
                header_sent = True
                found = True

            yield "\t- Cross-variant {} ({}), disposition {}:\n".format(
                format_cp(variant), cp_to_ulabel(variant), disp)

            script_lines = [
                "{} ({}): {}\n".format(format_cp(c), cp_to_ulabel(c), s)
                for c, s in script_mapping.items()
            ]
            yield '\t\t+ ' + '\t\t+ '.join(script_lines)

    if not found:
        yield 'No cross-script variants for input!'
def check_label(lgr, label, generate_variants=False):
    """
    Test a label against an LGR and write the outcome.

    :param lgr: The LGR used to test the label.
    :param label: The label to test.
    :param generate_variants: If True, the variants of an eligible label are
                              written out with their disposition.
    """
    from lgr.utils import format_cp

    label_cp = tuple([ord(c) for c in label])
    write_output("Label: %s [%s]" % (label, format_cp(label_cp)))

    (eligible, label_part,
     not_in_lgr, disp, _, _) = lgr.test_label_eligible(label_cp)
    write_output("- Eligible: %s" % eligible)
    write_output("- Disposition: %s" % disp)

    if not eligible:
        valid_part = u' '.join(u"{:04X}".format(cp) for cp in label_part)
        write_output("- Valid code points from label: %s" % valid_part)
        invalid_part = u' '.join(u"{:04X}".format(cp) for cp in not_in_lgr)
        write_output("- Invalid code points from label: %s" % invalid_part)
        return

    if not generate_variants:
        return

    # The summary part of the result is not used here
    _, variants = lgr.compute_label_disposition_summary(label_cp)
    for (variant_cp, var_disp, _, _, _) in variants:
        variant_u = ''.join([unichr(c) for c in variant_cp])
        write_output("\tVariant %s [%s]" % (variant_u,
                                             format_cp(variant_cp)))
        write_output("\t- Disposition: '%s'" % var_disp)
def del_range(self, first_cp, last_cp):
    """
    Remove a range of characters from the LGR.

    Note: The range MUST be exactly one that was added before, partial
    sub-ranges cannot be deleted!

    :param first_cp: First code point of the range.
    :param last_cp: Last code point of the range.
    :raises NotInLGR: If the range does not exist.

    >>> cd = Repertoire()
    >>> cd.add_range(0x002A, 0x0030)
    >>> cd.del_range(0x002A, 0x0030)
    >>> 0x002A in cd
    False
    >>> cd.del_range(0x002A, 0x0030) # doctest: +IGNORE_EXCEPTION_DETAIL
    Traceback (most recent call last):
    ...
    NotInLGR:
    """
    assert first_cp < last_cp, "range must be defined in order"

    bounds = (first_cp, last_cp)
    if bounds not in self.ranges:
        logger.error("Range '%s - %s' does not exist",
                     format_cp(first_cp), format_cp(last_cp))
        raise NotInLGR(first_cp)

    for cp in range(first_cp, last_cp + 1):
        if self._del_char(RangeChar(cp, first_cp, last_cp)):
            continue
        # TODO: clean-up range on error
        # This should only happen if range insertion failed
        # -> inconsistent state for now
        logger.critical("Range '%s - %s' is missing code point %s",
                        format_cp(first_cp), format_cp(last_cp),
                        format_cp(cp))
        raise NotInLGR(cp)

    # All code points are gone, forget the range itself
    self.ranges.remove(bounds)
def add_variant(self, cp_or_sequence, variant_type=None, when=None,
                not_when=None, comment=None, ref=None):
    """
    Attach a variant to this char.

    :param cp_or_sequence: Code point or code point sequence of the variant.
    :param variant_type: Type for the variant, if any.
    :param when: Condition to be satisfied for the variant to exist.
    :param not_when: Condition that must not be satisfied for the variant
                     to exist.
    :param comment: Optional comment for the variant.
    :param ref: List of references associated to the code point.
    :raises VariantAlreadyExists: If variant already exists for character.

    >>> c = CharBase.from_cp_or_sequence([1])
    >>> c.add_variant([10], 'BLOCKED')
    >>> (10, ) in c._variants
    True
    >>> c._variants[(10,)][0].type == text_type('BLOCKED')
    True
    """
    assert len(cp_or_sequence), "there should be at least one char"

    candidate = Variant(cp_or_sequence,
                        variant_type=variant_type,
                        when=when, not_when=not_when,
                        comment=comment, ref=ref)

    # Variants are grouped by the tuple of their code points
    key = tuple(cp_or_sequence)
    duplicate = (key in self._variants
                 and candidate in set(self._variants[key]))
    if not duplicate:
        self._variants.setdefault(key, []).append(candidate)
        return

    logger.error("%r: Variant '%s' already exists",
                 self, format_cp(cp_or_sequence))
    raise VariantAlreadyExists(self.cp, candidate.cp)
def rebuild_lgr(lgr, options):
    """
    Rebuild an LGR with given parameters.

    The repertoire of the source LGR is replayed into a freshly created
    target LGR; every problem met on the way is collected in the result.
    The target LGR is local to this function and is not part of the
    returned value.

    options argument can contain:
        * unicode_version: The target Unicode version to be used
          when rebuilding the LGR. If None is given, use the current one.
        * validating_repertoire: The validating repertoire used
          for checking code points.
        * unidb: Munidata's Unicode database. If None, skip Unicode checks.

    :param LGR lgr: The LGR to rebuild.
    :param dict options: Dictionary of options to the validation function.
    :return: Tuple (True, result). ``result`` holds a 'description', an
             optional 'generic' message on Unicode version mismatch, and a
             'repertoire' dictionary mapping each problematic char to its
             'errors', 'warnings' and per-variant 'variants' errors.
    """
    # Local import to prevent import cycles
    from lgr.core import LGR

    unicode_version = options.get('unicode_version',
                                  lgr.metadata.unicode_version)
    validating_repertoire = options.get('validating_repertoire', None)

    description = "Rebuilding LGR with Unicode version {}".format(
        unicode_version)
    if validating_repertoire is not None:
        description += " and validating repertoire '{}'".format(
            validating_repertoire)
    result = {
        'description': description,
        'repertoire': {}  # XXX: Cannot use defaultdict because of django...
    }

    logger.info(
        "Rebuilding LGR '%s' with Unicode version %s "
        "and Validating Repertoire '%s'", lgr, unicode_version,
        validating_repertoire)

    unidb = options.get('unidb', None)
    if unidb is not None:
        # Report (but tolerate) a database which is not at the target version
        unidb_version = unidb.get_unicode_version()
        if unidb_version != unicode_version:
            result['generic'] = "Target Unicode version {} " \
                                "differs from UnicodeDatabase {}".format(unicode_version, unidb_version)
            logger.warning(
                "Target Unicode version %s differs "
                "from UnicodeDatabase %s", unicode_version, unidb_version)

    # For now, simply copy the metadata and references of the source LGR
    target_metadata = copy.deepcopy(lgr.metadata)
    target_metadata.unicode_version = unicode_version
    target_reference_manager = copy.deepcopy(lgr.reference_manager)

    target_lgr = LGR(name=lgr.name,
                     metadata=target_metadata,
                     reference_manager=target_reference_manager,
                     unicode_database=unidb)

    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # A range is only inserted if each of its code points passes
            # both the range check and the script check
            range_ok = True
            for cp, status in target_lgr.check_range(char.first_cp,
                                                     char.last_cp,
                                                     validating_repertoire):
                if status is not None:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'errors', []).append(status)
                    range_ok = False
                in_script, _ = lgr.cp_in_script([cp])
                if not in_script:
                    result['repertoire'].setdefault(char, {}).setdefault(
                        'warnings', []).append(CharNotInScript(cp))
                    range_ok = False

            if not range_ok:
                continue

            try:
                target_lgr.add_range(
                    char.first_cp,
                    char.last_cp,
                    comment=char.comment,
                    ref=char.references,
                    tag=char.tags,
                    when=char.when,
                    not_when=char.not_when,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=False)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault('errors', []).append(exc)
                logger.error("Cannot add range '%s-%s'",
                             format_cp(char.first_cp),
                             format_cp(char.last_cp))
            # Ranges are fully handled here, the code below is for
            # single code points and sequences
            continue

        in_script, _ = lgr.cp_in_script(char.cp)
        if not in_script:
            result['repertoire'].setdefault(char, {}).setdefault(
                'warnings', []).append(CharNotInScript(char.cp))

        # Insert code point
        try:
            target_lgr.add_cp(char.cp,
                              comment=char.comment,
                              ref=char.references,
                              tag=char.tags,
                              when=char.when,
                              not_when=char.not_when,
                              validating_repertoire=validating_repertoire,
                              override_repertoire=False)
        except LGRException as exc:
            result['repertoire'].setdefault(char, {}).setdefault('errors', []).append(exc)
            logger.error("Cannot add code point '%s'", format_cp(char.cp))
            if not isinstance(exc, CharInvalidIdnaProperty):  # Cannot include non-IDNA valid code points
                # The error is recorded above; the code point is then
                # inserted anyway with force=True
                target_lgr.add_cp(char.cp,
                                  comment=char.comment,
                                  ref=char.references,
                                  tag=char.tags,
                                  when=char.when,
                                  not_when=char.not_when,
                                  force=True)

        # Create variants
        for var in char.get_variants():
            try:
                target_lgr.add_variant(
                    char.cp,
                    variant_cp=var.cp,
                    variant_type=var.type,
                    when=var.when,
                    not_when=var.not_when,
                    comment=var.comment,
                    ref=var.references,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=True)
            except LGRException as exc:
                result['repertoire'].setdefault(char, {}).setdefault(
                    'variants', {}).setdefault(var, []).append(exc)
                logger.error("Cannot add variant '%s' to code point '%s'",
                             format_cp(var.cp), format_cp(char.cp))
                if not isinstance(exc, CharInvalidIdnaProperty):  # Cannot include non-IDNA valid code points
                    # Same policy as for code points: record, then force
                    target_lgr.add_variant(char.cp,
                                           variant_cp=var.cp,
                                           variant_type=var.type,
                                           when=var.when,
                                           not_when=var.not_when,
                                           comment=var.comment,
                                           ref=var.references,
                                           force=True)

    logger.info("Rebuilding LGR '%s done", lgr)

    return True, result
def _parse_doc(self, rule_file):
    """
    Actual parsing of document.

    Three kinds of lines are recognised, in this order:

    * reference lines (``REFERENCE_RE``), added to the LGR references;
    * version lines (``VERSION_RE``), stored in the LGR metadata;
    * code point lines of the form
      ``<char>;<preferred variants>;<variants>``.

    Empty lines, comment lines and any other line are ignored. A code point
    line whose base character cannot be parsed is skipped entirely, so that
    its variants are never attached to the character of a previous line.

    :param rule_file: Content of the rule, as a file-like object.
    """
    for line_num, line in enumerate(rule_file, 1):
        line = line.strip()
        if not line or line[0] == '#':
            continue

        reference = REFERENCE_RE.match(line)
        if reference is not None:
            ref_id = reference.group('ref_id')
            value = reference.group('value')
            comment = reference.group('comment')
            try:
                self._lgr.add_reference(value,
                                        ref_id=ref_id,
                                        comment=comment)
            except LGRException:
                logger.error("Invalid reference '%s' on line %d",
                             line, line_num)
            continue

        version = VERSION_RE.match(line)
        if version is not None:
            version_no = version.group('version_no')
            date = version.group('date')
            comment = version.group('comment')
            try:
                self._lgr.metadata.version = Version(version_no,
                                                     comment=comment)
                self._lgr.metadata.date = date
            except LGRException:
                logger.error("Invalid version '%s' on line %d",
                             line, line_num)
            continue

        if UNICODE_CODEPOINT_RE.match(line) is None:
            logger.debug("Skipping non-parsable line %d:\n%s",
                         line_num, line)
            # Line is not starting with a valid unicode code point, skip
            continue

        # Split base character from variant(s)
        char_variant = line.split(';')
        char = char_variant[0]

        try:
            # Exactly one (code points, references) pair is expected; any
            # other count fails the unpacking with a ValueError
            [(codepoints, references)] = parse_char(char)
            self._lgr.add_cp(codepoints, ref=references)
        except ValueError:
            logger.error("Invalid character '%s' at line %d",
                         char, line_num)
            # No usable base character: there is nothing the variants of
            # this line could be attached to.
            continue
        except LGRException as exc:
            # Variants are still processed below: the code point was parsed
            # correctly even though it could not be added.
            logger.error("Cannot add code point '%s' at line %d: %s",
                         format_cp(codepoints), line_num, exc)

        if len(char_variant) > 1:
            preferred_variants = char_variant[1].strip()
            if len(preferred_variants) > 0 and preferred_variants[0] != '#':
                # From RFC7940, Section 7.3. Recommended Disposition Values:
                # activated  The resulting string should be activated for
                # use. (This is the same as a Preferred Variant [RFC3743].)
                var_type = "activated"
                self.insert_variant(line_num, codepoints,
                                    preferred_variants, var_type)

        if len(char_variant) > 2:
            variants = char_variant[2].strip()
            if len(variants) > 0 and variants[0] != '#':
                self.insert_variant(line_num, codepoints, variants)
def compute_stats(lgr, options):
    """
    Compute statistics for an LGR and log them as a textual report.

    The report covers code points, ranges, sequences, variants (total,
    average per code point having variants, count per type), code points
    per tag and the number of rules.

    :param lgr: The LGR to use.
    :param options: Not used.
    :return: Always True; the report is emitted through the logger.
    """
    stats = {
        'codepoint_number': 0,
        'range_number': 0,
        'largest_range': '',
        'largest_range_len': 0,
        'sequence_number': 0,
        'largest_sequence': '',
        'largest_sequence_len': 0,
        'codepoints_with_variants': 0,
        'variant_number': 0,
        'variants_by_type': {},
        'average_variants': 0,
        'codepoints_by_tag': {},
        'rule_number': len(lgr.rules),
    }

    by_tag = stats['codepoints_by_tag']
    by_type = stats['variants_by_type']

    for char in lgr.repertoire:
        # Range len set to 1 by default (for single code point and sequences)
        range_len = 1

        if isinstance(char, RangeChar):
            range_len = char.last_cp - char.first_cp + 1
            stats['codepoint_number'] += range_len
            stats['range_number'] += 1
            if range_len > stats['largest_range_len']:
                stats['largest_range_len'] = range_len
                stats['largest_range'] = format_cp(char.cp)
        elif isinstance(char, CharSequence):
            stats['codepoint_number'] += 1
            stats['sequence_number'] += 1
            sequence_len = len(char.cp)
            if sequence_len > stats['largest_sequence_len']:
                stats['largest_sequence_len'] = sequence_len
                stats['largest_sequence'] = format_cp(char.cp)
        elif isinstance(char, Char):
            stats['codepoint_number'] += 1

        # A range counts for all of its code points in each of its tags
        for t in char.tags:
            by_tag[t] = by_tag.get(t, 0) + range_len

        variants = list(char.get_variants())
        variants_len = len(variants)
        stats['variant_number'] += variants_len

        if variants_len > 0:
            stats['codepoints_with_variants'] += 1

        for var in variants:
            by_type[var.type] = by_type.get(var.type, 0) + 1

    if stats['codepoints_with_variants'] != 0:
        stats['average_variants'] = \
            stats['variant_number'] / stats['codepoints_with_variants']

    # General summary
    output = """
General summary:
\tNumber of code points: {codepoint_number}.
\tNumber of ranges: {range_number}.
\tLargest range: {largest_range} (length: {largest_range_len}).
\tNumber of sequences: {sequence_number}.
\tLargest sequence: {largest_sequence} (length: {largest_sequence_len}).
""".format(**stats)

    # Variants
    output += """
Variants:
\tTotal number of variants: {variant_number}.
\tAverage number of variants per code point: {average_variants}.
""".format(**stats)

    # items() is available on both Python 2 and Python 3 dictionaries,
    # unlike iteritems() which only exists on Python 2
    for (variant_type, number) in by_type.items():
        output += "\tNumber of variants for type '{0}': {1}.\n"\
            .format(variant_type, number)

    # Tags
    output += """
Tags:
"""
    for (tag_name, number) in by_tag.items():
        output += "\tNumber of code points for tag '{0}': {1}.\n"\
            .format(tag_name, number)

    # Rules summary
    output += "\nRules:\n"
    output += "\tNumber of rules defined: {0}.\n".format(stats['rule_number'])

    logger.info(output)

    return True
def _generate_indexes(lgr, labels, keep=False, quiet=False):
    """
    Build the index -> labels mapping for a list of labels.

    Every label is first grouped under the index label computed by the LGR.
    Then, for each group that is kept, the variants of every primary label
    are computed and merged into the group.

    :param lgr: The current LGR
    :param labels: The list of labels, as a list of U-Labels.
    :param keep: Do we keep labels without collision in the output
    :param quiet: If True, do not collect rule log.
    :return: (label_indexes, not_in_lgr), with:
              - label_indexes: the dictionary containing the primary labels
                               and their variants (with various information)
                               for each index.
              - not_in_lgr: List of labels that do not pass preliminary
                            eligibility testing.
    """
    label_indexes = {}
    not_in_lgr = []

    # Pass 1: group the primary labels by index label
    for label in labels:
        label_cp = tuple(ord(c) for c in label)
        try:
            label_index = lgr.generate_index_label(label_cp)
        except NotInLGR:
            not_in_lgr.append(label_cp)
            continue

        label_cp_out = format_cp(label_cp)
        primary_entry = {
            'label': label,
            'bidi': "%s'%s'%s" % (LRI, label, PDI),
            'cat': PRIMARY,
            'cp': label_cp,
            'cp_out': label_cp_out,
            'disp': {label: '-'},
            'rules': {label: '-'},
            'action_idx': {label: '-'}
        }
        label_indexes.setdefault(label_index, []).append(primary_entry)

    # Pass 2: iterate over a snapshot, since groups are deleted from and
    # appended to label_indexes while we walk through the primaries.
    snapshot = deepcopy(label_indexes)
    for (label_index, primaries) in snapshot.items():
        # only get variants for collided labels (if not keep)
        if not keep and len(primaries) < 2:
            del label_indexes[label_index]
            continue

        for primary in primaries:
            label_cp = primary['cp']
            label = primary['label']
            dispositions = lgr.compute_label_disposition(
                label_cp,
                include_invalid=True,
                collect_log=not quiet)
            for (variant_cp, variant_disp, variant_invalid_parts,
                 action_idx, _, log) in dispositions:
                variant = cp_to_ulabel(variant_cp)
                stripped_log = log.strip()
                log = '' if quiet else stripped_log
                variant_cp_out = format_cp(variant_cp)

                # Is this variant already recorded under the index?
                existing = [entry for entry in label_indexes[label_index]
                            if entry['label'] == variant]
                if len(existing) >= 1:
                    # Known label: record the outcome relative to this primary
                    assert len(existing) == 1
                    known = existing[0]
                    known['disp'][label] = variant_disp
                    known['rules'][label] = log
                    known['action_idx'][label] = action_idx
                    continue

                label_indexes[label_index].append({
                    'label': variant,
                    'bidi': "%s'%s'%s" % (LRI, variant, PDI),
                    'cat': VARIANT,
                    'cp': variant_cp,
                    'cp_out': variant_cp_out,
                    'disp': {label: variant_disp},
                    'rules': {label: log},
                    'action_idx': {label: action_idx}
                })

    return label_indexes, not_in_lgr
def main():
    """
    Command-line entry point of the LGR collision tool.

    Parses the LGR XML file given on the command line, computes the index
    label of every label of the reference set file, then reads labels from
    standard input (one per line) and reports, for each of them, whether it
    collides with a reference label (same index label).

    The ``--libs`` value is split on '#' into three parts: library path,
    i18n library path and library version.

    Labels (from the reference set or from standard input) for which no
    index label can be computed (``NotInLGR``) are skipped.
    """
    parser = argparse.ArgumentParser(description='LGR Collision')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='be verbose')
    parser.add_argument('-g', '--generate', action='store_true',
                        help='Generate variants')
    parser.add_argument('-l', '--libs', metavar='LIBS',
                        help='ICU libraries', required=True)
    parser.add_argument('-s', '--set', metavar='SET FILE',
                        help='Filepath to the set of reference labels',
                        required=True)
    parser.add_argument('xml', metavar='XML')

    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level)

    lgr_parser = XMLParser(args.xml)

    libpath, i18n_libpath, libver = args.libs.split('#')
    manager = UnicodeDataVersionManager()
    unidb = manager.register(None, libpath, i18n_libpath, libver)

    lgr_parser.unicode_database = unidb

    lgr = lgr_parser.parse_document()
    if lgr is None:
        logger.error("Error while parsing LGR file.")
        logger.error("Please check compliance with RNG.")
        return

    ref_label_indexes = {}

    # Compute index label for set or reference labels
    with io.open(args.set, 'r', encoding='utf-8') as ref_set:
        for ref_label in ref_set:
            label_cp = tuple(ord(c) for c in ref_label.strip())
            try:
                label_index = compute_label_index(lgr, label_cp)
            except NotInLGR:
                continue
            ref_label_indexes[label_index] = label_cp

    # Deal with input
    for label in get_stdin().read().splitlines():
        write_output("Check label '%s'" % label)
        label_cp = tuple(ord(c) for c in label)
        label_disp = format_cp(label_cp)
        try:
            label_index = compute_label_index(lgr, label_cp)
        except NotInLGR:
            # Same tolerance as for the reference set: without an index
            # label there is nothing to look up, so report and go on with
            # the remaining input instead of aborting the whole run.
            write_output("Label '%s' [%s] is not in the LGR"
                         % (label, label_disp))
            continue

        if label_index in ref_label_indexes:
            ref_label_cp = ref_label_indexes[label_index]
            ref_label_disp = format_cp(ref_label_cp)
            ref_label_u = cp_to_ulabel(ref_label_cp)

            write_output("Collision for label '%s' [%s] with '%s' [%s]"
                         % (label, label_disp, ref_label_u, ref_label_disp))
            if args.generate:
                find_variants_to_block(lgr, ref_label_cp, label_cp)
        else:
            write_output("No collision for label %s [%s]"
                         % (label, label_disp))
def matches(self, label, rules_lookup, classes_lookup, unicode_database,
            anchor=None, index=0):
    """
    Tell whether this rule matches a label.

    The rule pattern is retrieved, the anchor (if any) is substituted into
    it, and the resulting regular expression is searched in the label. When
    an anchor is given, a match starting after ``index`` is rejected.

    :param label: Label to test, as a sequence of code points.
    :param rules_lookup: Dictionary of defined rules in the LGR
                         to use for by-ref rules.
    :param classes_lookup: Dictionary of defined classes in the LGR
                           to use for by-ref classes.
    :param unicode_database: The Unicode Database.
    :param anchor: Optional anchor to use for look-around rules.
    :param index: If anchor is used, its index (0-based).
    :return: True if label is matched by the rule, False otherwise.
    :raises RuleError: If the pattern cannot be retrieved or compiled.
    """
    anchor_out = format_cp(anchor) if anchor else anchor
    rule_logger.debug(
        "Test match on %s for label '%s' with anchor '%s' (%d)",
        self, format_cp(label), anchor_out, index)

    try:
        pattern = self.get_pattern(rules_lookup, classes_lookup,
                                   unicode_database)
    except (re.error, PICUException) as re_exc:
        rule_logger.error('Cannot get pattern for rule %s: %s',
                          self, re_exc)
        raise RuleError(self.name, re_exc)

    if len(pattern) == 0:
        # Pattern is empty, nothing will match
        rule_logger.debug('Empty pattern')
        return False

    anchored = anchor is not None

    if anchored:
        if '%(anchor)s' not in pattern:
            rule_logger.debug('Not a parameterized context rule')
            # Pattern is not a parameterized context-rule, so set index to 0
            index = 0

        # Format anchor - Can be a sequence.
        # Use old-style formatting, see note in matcher.AnchorMatcher
        escaped_anchor = ''.join('\\x{{{:X}}}'.format(c) for c in anchor)
        pattern = pattern % {'anchor': escaped_anchor}

    rule_logger.debug("Pattern for rule %s: '%s'", self, pattern)

    try:
        regex = unicode_database.compile_regex(pattern)
    except (re.error, PICUException) as re_exc:
        rule_logger.error('Cannot compile regex: %s', re_exc)
        raise RuleError(self.name, re_exc)

    rule_logger.debug("Index: %d", index)

    # Convert label to U-format to be used in regex
    label_u = cp_to_ulabel(label)

    # Look for match. It is important to use "search" and not "match"
    # here, since a rule may not match at the beginning of a label.
    result = regex.search(label_u, index=index)
    rule_logger.debug("Result of match: %s", result)

    if result is None:
        return False

    if not anchored:
        return True

    match_index = result.start()
    rule_logger.debug('Match index: %d - Index: %d', match_index, index)
    if match_index > index:
        rule_logger.debug('Match found after index, invalid')
        return False

    return True
def diff_lgrs(lgr1, lgr2):
    """
    Compare 2 LGRs and return the comparison as text.

    Ranges of both LGRs are expanded (in place) before the comparison.
    The report covers, in order: metadata, reference manager, repertoire,
    code points common to both repertoires, then actions, rules and classes.

    :param lgr1: First LGR.
    :param lgr2: Second LGR.
    :return: Text comparison.
    """
    lgr1.expand_ranges()
    lgr2.expand_ranges()

    parts = []

    parts.append(diff_metadata(lgr1.metadata, lgr2.metadata))
    parts.append(diff_reference_manager(lgr1.reference_manager,
                                        lgr2.reference_manager))

    parts.append("""
** Compare repertoire **
""")

    first_cps = {c.cp for c in lgr1.repertoire}
    second_cps = {c.cp for c in lgr2.repertoire}

    parts.append(compare_things(
        first_cps, second_cps, "repertoire", True,
        format_fct=lambda c: " ".join(map(format_cp, c)),
        show_same=False))

    parts.append("""
** Compare common code points in repertoire **
""")

    # Detailed comparison of the code points present in both LGRs
    for cp in first_cps.intersection(second_cps):
        char1 = lgr1.get_char(cp)
        char2 = lgr2.get_char(cp)
        parts.append("""
Compare code point {}""".format(format_cp(cp)))
        parts.append(diff_char(char1, char2))

    parts.append("""
** Compare WLE **
""")

    parts.append(diff_actions(lgr1, lgr2))
    parts.append(diff_rules(lgr1, lgr2))
    parts.append(diff_classes(lgr1, lgr2))

    return "".join(parts)
def _process_data(self, elem):
    """
    Process the <data> element of an LGR XML file.

    Each "char" child is added to the LGR together with its variants, and
    each "range" child is added as a range. Failures are logged and
    recorded in the RFC 7940 checks; they are re-raised unless force mode
    is enabled. Every child is cleared once processed.

    :param elem: The <data> element, iterated to get its children.
    """
    checks = self.rfc7940_checks

    # It is RECOMMENDED to list all "char" elements in ascending order of
    # the "cp" attribute. This holds the code point of the previous "char".
    last_seen = []

    for child in elem:
        comment = child.get('comment')
        when = child.get('when')
        not_when = child.get('not-when')

        # Handle references
        ref = string_to_list(child.get('ref', ''))

        # Handle tags
        tag = string_to_list(child.get('tag', ''))

        if child.tag == CHAR_TAG:
            codepoint = [int(c, 16) for c in child.get('cp').split()]

            if codepoint <= last_seen:
                is_prefix = last_seen[:len(codepoint)] == codepoint
                if not is_prefix:
                    logger.warning(
                        "cp attribute not in ascending order: '%s'",
                        child.get('cp'))
                    checks.error('char_ascending_order')
                else:
                    # Not clear what order is to be recommended here
                    checks.error('char_strict_ascending_order')
            last_seen = codepoint

            try:
                self._lgr.add_cp(codepoint,
                                 comment=comment, ref=ref,
                                 tag=tag,
                                 when=when, not_when=not_when,
                                 force=self.force_mode)
            except LGRException as exc:
                logger.error("Cannot add code point '%s': %s",
                             format_cp(codepoint), exc)
                checks.error('parse_xml')
                checks.error('codepoint_valid')
                if not self.force_mode:
                    raise

            # Variants of char
            for variant in child.iter(VARIANT_TAG):
                var_codepoint = [int(c, 16)
                                 for c in variant.get('cp').split()]
                var_when = variant.get('when')
                var_not_when = variant.get('not-when')
                variant_type = variant.get('type')
                var_comment = variant.get('comment')

                # Handle references
                var_ref = string_to_list(variant.get('ref', ''))

                try:
                    self._lgr.add_variant(codepoint, var_codepoint,
                                          variant_type=variant_type,
                                          when=var_when,
                                          not_when=var_not_when,
                                          comment=var_comment,
                                          ref=var_ref,
                                          force=self.force_mode)
                except LGRException as exc:
                    logger.error("Cannot add variant '%s' "
                                 "to code point '%s': %s",
                                 format_cp(var_codepoint),
                                 format_cp(codepoint), exc)
                    checks.error('parse_xml')
                    checks.error('codepoint_valid')
                    if not self.force_mode:
                        raise
        elif child.tag == RANGE_TAG:
            first_cp = int(child.get('first-cp'), 16)
            last_cp = int(child.get('last-cp'), 16)

            try:
                self._lgr.add_range(first_cp, last_cp,
                                    comment=comment, ref=ref,
                                    tag=tag,
                                    when=when, not_when=not_when,
                                    force=self.force_mode)
            except LGRException as exc:
                checks.error('parse_xml')
                checks.error('codepoint_valid')
                logger.error("Cannot add range '%s-%s': %s",
                             format_cp(first_cp), format_cp(last_cp), exc)
                if not self.force_mode:
                    raise

        child.clear()

    checks.tested('char_ascending_order')
    checks.tested('char_strict_ascending_order')
def check_symmetry(lgr, options):
    """
    Check that all variants are defined in a symmetric way.

    If B is a variant of A, then A must be a variant of B ("basic"
    symmetry). On top of that, at least one of the reverse mappings must
    carry the same "when" and "not-when" attributes ("strict" symmetry).
    Every problem found is logged, notified to the LGR and recorded in the
    returned result.

    :param lgr: The LGR to be tested.
    :param options: Dictionary of options to the validation function - unused.
    :return: (success, result), with:
              - success: True if LGR symmetry is achieved, False otherwise.
              - result: dictionary with a 'description' and the list of
                        problems found under 'repertoire'.
    """
    logger.info("Testing symmetry")

    success = True
    problems = []
    result = {'description': 'Testing symmetry', 'repertoire': problems}

    for a in lgr.repertoire:
        if isinstance(a, RangeChar):
            # Range have no variants
            continue

        for b in a.get_variants():
            if b.cp not in lgr.repertoire:
                # Variant is not defined in repertoire
                logger.warning('CP %s: Variant %s is not in repertoire.',
                               format_cp(a.cp), format_cp(b.cp))
                lgr.notify_error('basic_symmetry')
                problems.append({
                    'char': a,
                    'variant': b,
                    'type': 'not-in-repertoire'
                })
                success = False
                continue

            # Variant is defined in repertoire: the original character
            # has to be listed among its own variants
            reverse_cps = [var.cp for var in lgr.get_variants(b.cp)]
            if a.cp not in reverse_cps:
                success = False
                logger.warning('CP %s should have CP %s in its variants.',
                               format_cp(b.cp), format_cp(a.cp))
                lgr.notify_error('basic_symmetry')
                problems.append({
                    'char': b,
                    'variant': a,
                    'type': 'missing'
                })
                continue

            # Now let's check if the reverse mappings agree in their
            # "when" or "not-when" attributes
            strict = any(c.cp == a.cp
                         and c.when == b.when
                         and c.not_when == b.not_when
                         for c in lgr.get_variants(b.cp))
            if not strict:
                success = False
                lgr.notify_error('strict_symmetry')
                logger.warning(
                    'CP %s should have CP %s in its strict variants.',
                    format_cp(b.cp), format_cp(a.cp))
                problems.append({
                    'char': b,
                    'variant': a,
                    'type': 'missing'
                })

    logger.info("Symmetry test done")
    lgr.notify_tested('basic_symmetry')
    lgr.notify_tested('strict_symmetry')

    return success, result
def apply(self, label, disp_set, only_variants,
          rules_lookup, classes_lookup, unicode_database):
    """
    Apply an action to a label.

    The action is triggered when both its rule condition ("match" or
    "not-match", if any) and its variant condition ("any-variant",
    "all-variants" or "only-variants", if any) are satisfied. A missing
    condition counts as satisfied.

    :param label: The label to process, as a sequence of code points.
    :param disp_set: Set of dispositions used to generate the label.
    :param only_variants: True if label only contains code point from
                          variant mapping.
    :param rules_lookup: Dictionary of defined rules in the LGR.
    :param classes_lookup: Dictionary of defined classes in the LGR.
    :param unicode_database: The Unicode Database used to process rules.
    :return: The final label disposition,
             None if no action applies to the label.
    :raises RuleError: If rule is invalid.
    """
    # RFC7940, section 8.3.
    # Determining a Disposition for a Label or Variant Label - Step 2
    rule_logger.debug("Applying action %s on label '%s' "
                      "with disposition set '%s'",
                      self, format_cp(label), disp_set)

    # --- Rule condition (first and second bullets) ---
    rule_matched = True
    if self.match is not None:
        when_rule = rules_lookup[self.match]
        rule_matched = when_rule.matches(label, rules_lookup,
                                         classes_lookup, unicode_database)
        rule_logger.info('Action %s: when rule matched: %s',
                         self, rule_matched)
    elif self.not_match is not None:
        not_when_rule = rules_lookup[self.not_match]
        rule_matched = not not_when_rule.matches(label, rules_lookup,
                                                 classes_lookup,
                                                 unicode_database)
        rule_logger.info('Action %s: not-when rule matched: %s',
                         self, rule_matched)

    # --- Variant condition (third to fifth bullets) ---
    variant_matched = True
    if self.any_variant is not None:
        # Any single match may trigger an action that contains
        # an "any-variant" attribute
        common = self.any_variant & disp_set
        variant_matched = len(common) > 0
        rule_logger.info('Action %s: any-variant matched: %s',
                         self, variant_matched)
    elif self.all_variants is not None:
        # For an "all-variants" attribute, the variant type of all variant
        # code points must be one of the listed types; an empty
        # disposition set never matches.
        variant_matched = (len(disp_set) > 0
                           and disp_set.issubset(self.all_variants))
        rule_logger.info('Action %s: all-variants matched: %s',
                         self, variant_matched)
    elif self.only_variants is not None:
        # Same as "all-variants", with one more requirement: the label
        # only contains code points generated from variant mappings
        # (including reflexive mappings).
        variant_matched = (only_variants
                           and len(disp_set) > 0
                           and disp_set.issubset(self.only_variants))
        rule_logger.info('Action %s: only-variants matched: %s',
                         self, variant_matched)

    # Last bullet: both flags default to True when no condition is set
    if not (rule_matched and variant_matched):
        rule_logger.info('Action %s not triggered', self)
        return None

    rule_logger.info('Action %s triggered, disposition: %s',
                     self, self.disp)
    return self.disp
def check_label(lgr, label, generate_variants=False, merged_lgr=None,
                set_labels=None):
    """
    Test a label against an LGR and write the outcome with write_output.

    The eligibility and disposition of the label are always written. For an
    eligible label, collisions with the LGR set labels are reported when
    both ``merged_lgr`` and ``set_labels`` are provided, and variants are
    listed when ``generate_variants`` is set. For a non-eligible label, the
    valid and invalid code points are written instead.

    :param lgr: The LGR used to test the label.
    :param label: The label to test, as a U-Label.
    :param generate_variants: If True, list the variants of the label.
    :param merged_lgr: LGR used to look for collisions with the set labels.
    :param set_labels: List of the LGR set labels.
    """
    from lgr.utils import format_cp
    label_cp = tuple(ord(c) for c in label)

    write_output("\nLabel: %s [%s]" % (label, format_cp(label_cp)))

    (eligible, label_parts, label_invalid_parts,
     disp, _, _) = lgr.test_label_eligible(label_cp)
    write_output("\tEligible: %s" % eligible)
    write_output("\tDisposition: %s" % disp)

    if not eligible:
        write_output("- Valid code points from label: %s" %
                     u' '.join(u"{:04X}".format(cp) for cp in label_parts))
        if label_invalid_parts:
            write_output("- Invalid code points from label: {}".format(
                ' '.join("{:04X} ({})".format(
                    cp,
                    "not in repertoire" if rules is None else ','.join(rules))
                    for cp, rules in label_invalid_parts)))
        return

    if merged_lgr and set_labels:
        write_output("Collisions:")
        if label in set_labels:
            write_output("Labels is in the LGR set labels")
        else:
            indexes = get_collisions(merged_lgr, set_labels + [label],
                                     quiet=True)
            if len(indexes) > 1:
                # there should be one collision except if set labels are
                # not checked
                logger.error('More than one collision, '
                             'please check your LGR set labels')
                return
            elif len(indexes) > 0:
                # Exactly one index at this point: take its labels
                collisions = next(iter(indexes.values()))
                collision = None
                collide_with = []
                # retrieve label in collision list
                for col in collisions:
                    if col['label'] == label:
                        collision = col
                    if col['label'] in set_labels:
                        collide_with.append(col)

                if not collision:
                    # this should not happen except if set labels are
                    # not checked
                    logger.error('Cannot retrieve label in collisions, '
                                 'please check your LGR set labels')
                    return

                if len(collide_with) != 1:
                    # The two literals are concatenated: the separating
                    # space has to be part of one of them.
                    logger.error('Collision with more than one label '
                                 'in the LGR set labels, '
                                 'please check your LGR set labels')
                    return

                write_output("Label collides with LGR set label '%s'"
                             % collide_with[0]['label'])
            else:
                write_output('\tNone')

    if generate_variants:
        write_output("Variants:")
        # Only the per-variant entries are needed, not the summary
        _, labels = lgr.compute_label_disposition_summary(label_cp)
        for (variant_cp, var_disp, _, _, _) in labels:
            variant_u = cp_to_ulabel(variant_cp)
            write_output("\tVariant %s [%s]"
                         % (variant_u, format_cp(variant_cp)))
            write_output("\t- Disposition: '%s'" % var_disp)