def check_conditional_variants(lgr, options):
    """
    Verify the "when"/"not-when" attributes of every variant.

    Each attribute that is set must be the name of an entry of
    ``lgr.rules``; a warning is logged for every one that is not.
    Range characters are skipped.

    :param LGR lgr: The LGR to check.
    :param options: Dictionary of options to the validation function - unused.
    :return: Always True; problems are only reported through the logger.
    """
    logger.info("Testing conditional variants")
    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # Range have no variants
            continue
        for var in char.get_variants():
            # Pair each conditional attribute with its warning template so
            # both are validated by the same code path.
            conditions = (
                (var.when,
                 "CP %s: Variant '%s' \"when\" attribute "
                 "'%s' is not an existing rule name."),
                (var.not_when,
                 "CP %s: Variant '%s' \"not-when\" attribute "
                 "'%s' is not an existing rule name."),
            )
            for rule_name, template in conditions:
                if rule_name is None or rule_name in lgr.rules:
                    continue
                logger.warning(template, format_cp(char.cp),
                               format_cp(var.cp), rule_name)

    logger.info("Conditional variants test done")

    return True
Exemple #2
0
def check_symmetry(lgr, options):
    """
    Verify that variant relations are declared in both directions.

    For every variant B of a character A, B has to be part of the
    repertoire and has to list A among its own variants. Each violation is
    logged as a warning.

    Only the presence of the code point is compared; the other variant
    properties (type, when, not-when) are not looked at.

    :param lgr: The LGR to be tested.
    :param options: Dictionary of options to the validation function - unused.
    :return: Always True; problems are only reported through the logger.
    """
    logger.info("Testing symmetry")
    for a in lgr.repertoire:
        if isinstance(a, RangeChar):
            # Range have no variants
            continue

        for b in a.get_variants():
            if b.cp in lgr.repertoire:
                # B exists: make sure A is one of the variants of B.
                reverse_cps = [var.cp for var in lgr.get_variants(b.cp)]
                if a.cp not in reverse_cps:
                    logger.warning('CP %s should have CP %s in its variants.',
                                   format_cp(b.cp), format_cp(a.cp))
            else:
                # B is not defined at all, nothing more to compare.
                logger.warning('CP %s: Variant %s is not in repertoire.',
                               format_cp(a.cp), format_cp(b.cp))
    logger.info("Symmetry test done")

    return True
Exemple #3
0
    def add_range(self,
                  first_cp,
                  last_cp,
                  comment=None,
                  ref=None,
                  tag=None,
                  when=None,
                  not_when=None,
                  skip_check=False):
        """
        Register every code point of a range in the repertoire.

        One ``RangeChar`` is created per code point between ``first_cp``
        and ``last_cp`` (both included), all sharing the same properties.
        The ``(first_cp, last_cp)`` pair is then appended to ``self.ranges``.

        :param first_cp: First code point of the range.
        :param last_cp: Last code point of the range.
        :param comment: Comment associated to the range.
        :param ref: List of references associated to the range.
        :param tag: List of tags associated to the range.
        :param when: Condition to be satisfied by the code point.
        :param not_when: Condition to not be satisfied by the codepoint.
        :param skip_check: If True, skips checking for overlapping ranges.
                           Invalid use of this parameter may leave
                           the dictionary in an inconsistent state!
        :raises RangeAlreadyExists: If input already exists in dictionary.

        >>> cd = Repertoire()
        >>> cd.add_range(0x002A, 0x0030)
        >>> 0x02A in cd
        True
        >>> c = cd[0x002A]
        >>> isinstance(c, RangeChar)
        True
        >>> c.first_cp == 0x002A and c.last_cp == 0x0030
        True
        >>> cd.add_range(0x002A, 0x0030) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
        ...
        RangeAlreadyExists:
        """
        assert first_cp < last_cp, "range must be defined in order"

        overlaps = (not skip_check
                    and self._check_range_overlap(first_cp, last_cp))
        if overlaps:
            logger.error("Range '%s - %s' already exists", format_cp(first_cp),
                         format_cp(last_cp))
            raise RangeAlreadyExists(first_cp, last_cp)

        # Properties shared by every character of the range.
        shared = dict(comment=comment,
                      ref=ref,
                      tag=tag,
                      when=when,
                      not_when=not_when)
        for cp in range(first_cp, last_cp + 1):
            # TODO: clean-up range on error.
            self._add_char(RangeChar(cp, first_cp, last_cp, **shared))

        # Remember the bounds of the range that was just added.
        self.ranges.append((first_cp, last_cp))
Exemple #4
0
    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.
    
        :param rule_file: Content of the rule, as a file-like object.
        """
        line_num = 0
        for line in rule_file:
            line_num += 1

            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == '#':
                continue
            if UNICODE_CODEPOINT_RE.match(line) is None:
                # Line is not starting with a valid unicode code point, skip
                continue

            # Remove comments and split base character from variant(s)
            char_variant = line.split('#')[0].split('|')
            char = char_variant[0]

            try:
                codepoints = parse_char(char)
                self._lgr.add_cp(codepoints)
            except ValueError:
                logger.error("Invalid character '%s' at line %d", char,
                             line_num)
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            # Handle variants, if any
            if len(char_variant) > 1:
                variants = char_variant[1].split(':')

                for var in variants:
                    try:
                        var_codepoints = parse_char(var)
                        self._lgr.add_variant(codepoints, var_codepoints)
                    except ValueError:
                        logger.error("Invalid variant '%s' at line %d", var,
                                     line_num)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' to code point '%s' at line %d: %s",
                            format_cp(var_codepoints), format_cp(codepoints),
                            line_num, exc)
Exemple #5
0
    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        :param rule_file: Content of the rule, as a file-like object.
        """
        line_num = 0
        for line in rule_file:
            line_num += 1

            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == '#':
                continue

            codepoints = []
            for cp in UNICODE_CODEPOINT_RE.finditer(line):
                try:
                    codepoints.append(int(cp.group(1), 16))
                except ValueError:
                    logger.error("Invalid code point '%s' at line %d", cp,
                                 line_num)

            try:
                self._lgr.add_cp(codepoints)
            except LGRException as exc:
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)
Exemple #6
0
    def del_char(self, cp_or_sequence):
        """
        Remove a character from the repertoire.

        :param cp_or_sequence: code point or code point sequence to delete.
        :raises NotInLGR: If the code point does not exist.

        >>> cd = Repertoire()
        >>> _ = cd.add_char([0x002A])
        >>> 0x002A in cd
        True
        >>> cd.del_char([0x002A])
        >>> 0x002A in cd
        False
        >>> cd.del_char([0x002B]) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
        ...
        NotInLGR:
        """
        assert len(cp_or_sequence), "there should be at least one char"

        removed = self._del_char(CharBase.from_cp_or_sequence(cp_or_sequence))
        if removed:
            return

        # _del_char() reported that nothing was deleted.
        logger.error("Code point '%s' does not exist",
                     format_cp(cp_or_sequence))
        raise NotInLGR(cp_or_sequence)
Exemple #7
0
    def get_char(self, cp_or_sequence):
        """
        Return the char object stored for a code point or sequence.

        The lookup key is the index of a ``CharBase`` built from the input;
        the stored object equal to that char is returned.

        :param cp_or_sequence: Code point or sequence of the character
                              to get.
        :raises NotInLGR: If the code point does not exist.

        >>> cd = Repertoire()
        >>> char = cd.add_char([0x002A])
        >>> c = cd.get_char([0x002A])
        >>> c is char
        True
        """
        assert len(cp_or_sequence), "there should be at least one char"

        wanted = CharBase.from_cp_or_sequence(cp_or_sequence)
        key = wanted.as_index()
        if key not in self._chardict:
            raise NotInLGR(cp_or_sequence)

        bucket = self._chardict[key]
        try:
            position = bucket.index(wanted)
        except ValueError:
            # The index is known but no stored char equals the wanted one.
            logger.error("Code point '%s' does not exist",
                         format_cp(cp_or_sequence))
            raise NotInLGR(cp_or_sequence)

        return bucket[position]
Exemple #8
0
def check_conditional_variants(lgr, options):
    """
    Verify the "when"/"not-when" attributes of every variant.

    Each attribute that is set must be the name of an entry of
    ``lgr.rules``. Every offending attribute is logged as a warning and
    recorded in the returned result. Range characters are skipped.

    :param LGR lgr: The LGR to check.
    :param options: Dictionary of options to the validation function - unused.
    :return: Tuple ``(success, result)``: ``success`` is False as soon as
             one attribute is invalid, ``result`` is a dictionary holding a
             description and the list of problems under 'repertoire'.
    """
    logger.info("Testing conditional variants")
    success = True
    result = {'description': 'Testing conditional variants', 'repertoire': []}
    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # Range have no variants
            continue
        for var in char.get_variants():
            # (rule type, rule name, warning template) for both attributes,
            # so they are validated by the same code path.
            conditions = (
                ('when', var.when,
                 "CP %s: Variant '%s' \"when\" attribute "
                 "'%s' is not an existing rule name."),
                ('not-when', var.not_when,
                 "CP %s: Variant '%s' \"not-when\" attribute "
                 "'%s' is not an existing rule name."),
            )
            for rule_type, rule_name, template in conditions:
                if rule_name is None or rule_name in lgr.rules:
                    continue
                logger.warning(template, format_cp(char.cp),
                               format_cp(var.cp), rule_name)
                success = False
                result['repertoire'].append({
                    'char': char,
                    'variant': var,
                    'rule_type': rule_type,
                    'rule': rule_name
                })

    logger.info("Conditional variants test done")

    return success, result
Exemple #9
0
def _compare(labels, label1_indexes, label2_indexes):
    for label in labels:
        label_cp_out = format_cp(tuple([ord(c) for c in label]))
        (index1, index2) = labels[label]
        yield "\n## Comparison on label " \
              "{label} [{cp}]\n".format(label="%s'%s'%s" % (LRI, label, PDI),
                                        cp=label_cp_out)
        yield "\n### Test dispositions: ###\n"
        labels2 = label2_indexes[index2]
        labels1 = label1_indexes[index1]
        # change in disposition
        ret = True
        for label2 in labels2:
            label2_cp = label2['cp']
            for label1 in labels1:
                label1_cp = label1['cp']
                if label1_cp == label2_cp:
                    (res, out) = _compare_labels(label, label1, label2)
                    ret &= res
                    if out:
                        yield MD
                        yield out
                        yield MD
        if ret:
            yield MD
            yield 'No changes in disposition.'
            yield MD

        # change in number of variants
        yield "\n### Test number of variants: ###\n"
        added = _get_new_variants(labels2, labels1)
        removed = _get_new_variants(labels1, labels2)
        for add in added:
            yield MD
            yield "New {cat} in LGR2:\n" \
                  "{label} [{cp}]\n".format(cat=add['cat'],
                                            label=add['bidi'],
                                            cp=add['cp_out'])
            try:
                yield "\nRules for LGR2:\n{}".format(add['rules'][label])
            except KeyError:
                raise InvalidSymmetry()
            yield MD
        for remove in removed:
            yield MD
            yield "Removed {cat} in LGR2:\n" \
                  "{label} [{cp}]\n".format(cat=remove['cat'],
                                            label=remove['bidi'],
                                            cp=remove['cp_out'])
            try:
                yield "\nRules for LGR1:\n{}".format(remove['rules'][label])
            except KeyError:
                raise InvalidSymmetry()
            yield MD

        if len(added) == 0 and len(removed) == 0:
            yield MD
            yield 'No changes in number of variants.'
            yield MD
Exemple #10
0
def check_transitivity(lgr, options):
    """
    Check that all variants are defined in a transitive way.

    If B is a variant of A and C a variant of B, then C must be a variant of A.

    We only check the presence of the code point in the variants, not that all
    properties are identical (type, when, not-when).

    Note: This test assumes the LGR is symmetric.

    :param LGR lgr: The LGR to check.
    :param options: Dictionary of options to the validation function - unused.
    :return: Tuple ``(success, result)``: ``success`` is True if LGR
             transitivity is achieved, False otherwise; ``result`` is a
             dictionary holding a description and, under 'repertoire', the
             list of missing ``{'char': A, 'variant': C}`` relations.
    """
    success = True
    logger.info("Testing transitivity")
    result = {'description': 'Testing transitivity', 'repertoire': []}

    for a in lgr.repertoire:
        if isinstance(a, RangeChar):
            # Range have no variants
            continue

        a_variants = list(a.get_variants())  # get_variants() returns generator
        # Code points of A's variants do not change during the check:
        # compute them once instead of once per second-level variant.
        a_variant_cps = [var.cp for var in a_variants]
        logger.debug("A: '%s'", format_cp(a.cp))
        for b in a_variants:
            logger.debug("A: '%s' - B: '%s'", format_cp(a.cp), format_cp(b.cp))
            try:
                variants = lgr.get_variants(b.cp)
            except NotInLGR:
                logger.error("Code point '%s' not in LGR", format_cp(b.cp))
                success = False
                continue
            # Variant is defined in repertoire
            # (we have checked for symmetry first)
            for c in [var for var in variants if var.cp != a.cp]:
                logger.debug("A: '%s' - B: '%s' - C: '%s'", format_cp(a.cp),
                             format_cp(b.cp), format_cp(c.cp))
                # Iterate through all second-level variants
                # which are not the original code point
                if c.cp not in a_variant_cps:
                    success = False
                    logger.warning("CP %s should have CP %s in its variants.",
                                   format_cp(a.cp), format_cp(c.cp))
                    lgr.notify_error('basic_transitivity')
                    result['repertoire'].append({'char': a, 'variant': c})
    logger.info("Transitivity test done")
    lgr.notify_tested('basic_transitivity')

    return success, result
Exemple #11
0
def display_variant(variant):
    """
    Build a one-line, human readable description of a variant.

    :param variant: The variant to display.
    :return: String with the formatted code point, type, when, not-when
             and comment of the variant.
    """
    template = "Variant {}: type={} - when={} - not-when={} - comment={}"
    fields = (format_cp(variant.cp), variant.type, variant.when,
              variant.not_when, variant.comment)
    return template.format(*fields)
Exemple #12
0
    def insert_variant(self, line_num, codepoints, var, var_type=None):
        """
        Parse a variant definition and add the result to the LGR.

        ``var`` is parsed with ``parse_char``; each
        ``(code points, references)`` pair it yields is added as a variant
        of ``codepoints``. Failures are logged, never raised.

        :param line_num: Line number, only used in log messages.
        :param codepoints: Code point(s) the variants belong to.
        :param var: Text of the variant(s) to parse.
        :param var_type: Type given to the added variants, if any.
        """
        try:
            parsed = parse_char(var)
        except ValueError:
            logger.error("Invalid variant '%s' at line %d", var, line_num)
            return

        for var_codepoints, references in parsed:
            extra = dict(ref=references, variant_type=var_type)
            try:
                self._lgr.add_variant(codepoints, var_codepoints, **extra)
            except LGRException as exc:
                logger.error(
                    "Cannot add variant '%s' to code point '%s' at line %d: %s",
                    format_cp(var_codepoints), format_cp(codepoints), line_num,
                    exc)
Exemple #13
0
def check_label(lgr, label, invalid, test):
    """
    Test a label against an LGR and report the outcome.

    Details go to the logger; the validation line and, for an eligible
    label, the disposition of the label and of each entry of its
    disposition summary go through ``write_output``.

    :param lgr: The LGR used to test the label.
    :param label: The label to test.
    :param invalid: Passed as ``include_invalid`` when computing the
                    disposition summary.
    :param test: Passed through as second argument of ``write_output``.
    """
    from lgr.utils import format_cp

    def as_hex(code_points):
        # Space separated, 4-digit upper case hexadecimal code points.
        return u' '.join(u"{:04X}".format(cp) for cp in code_points)

    def triggered_action(idx):
        # Indexes above the number of LGR actions are reported as default
        # actions, with the number of LGR actions subtracted.
        if idx > len(lgr.actions):
            return "DefaultAction", idx - len(lgr.actions)
        return "Action", idx

    label_cp = tuple(ord(c) for c in label)
    label_display = as_hex(label_cp)

    logger.info("- Code points: %s", label_display)

    (eligible, label_part, not_in_lgr, disp, action_idx,
     logs) = lgr.test_label_eligible(label_cp)
    logger.info("- Eligible: %s", eligible)
    logger.info("- Disposition: %s", disp)
    action_name, actual_index = triggered_action(action_idx)
    logger.info("- Action triggered: %s[%d]", action_name, actual_index)
    logger.info("- Logs: %s", logs)

    write_output(
        u"Validation: {} ({}): Result: {}".format(
            label, label_display, "valid" if eligible else "INVALID"), test)

    if not eligible:
        logger.info("- Valid code points from label: %s", as_hex(label_part))
        logger.info("- Invalid code points from label: %s",
                    as_hex(not_in_lgr))
        return

    write_output(
        u"Disposition: {} ({}): Result: {} due to {}[{}]".format(
            label, label_display, disp, action_name, actual_index), test)

    summary, labels = lgr.compute_label_disposition_summary(
        label_cp, include_invalid=invalid)
    logger.info("Summary: %s", summary)
    for (variant_cp, var_disp, action_idx, disp_set, logs) in labels:
        variant_u = ''.join([unichr(c) for c in variant_cp])
        variant_display = as_hex(variant_cp)
        logger.info("\tVariant '%s'", variant_u)
        logger.info("\t- Code points: %s", format_cp(variant_cp))
        logger.info("\t- Disposition: '%s'", var_disp)

        action_name, actual_index = triggered_action(action_idx)
        logger.info("\t- Action triggered: %s[%d]", action_name,
                    actual_index)
        disp_set_display = '{%s}' % ','.join(disp_set)
        write_output(
            u"Variant: ({}): [{}] ==> {} due to {}[{}]".format(
                variant_display, disp_set_display, var_disp, action_name,
                actual_index), test)

        logger.info("\t- Logs: %s", logs)
Exemple #14
0
def populate_lgr(lgr):
    """
    Populate an LGR with missing variants, and fix symmetry and transitivity.

    First, every variant code point that is missing from the LGR is added,
    with the referencing code point as its 'blocked' variant. Then missing
    symmetric and transitive variants are added as 'blocked' until both
    ``check_symmetry`` and ``check_transitivity`` succeed.

    The LGR is modified in place.

    :param lgr: The LGR to be populated.
    :return: None.
    """
    # The loops below add code points and variants to the collections they
    # walk through (get_variants() returns a generator), so they iterate
    # over snapshots. What a pass does not see is handled by the next one.

    # not in LGR variants
    for a in list(lgr.repertoire):
        for b in list(a.get_variants()):
            try:
                lgr.get_variants(b.cp)
            except NotInLGR:
                logger.info("Add missing code point '{}' in LGR as it is a variant of '{}'".format(
                    format_cp(b.cp), format_cp(a.cp)))
                lgr.add_cp(b.cp)
                # add current code point as variant for missing code point
                logger.info("Add code point '{}' as variant of '{}' for symmetry".format(format_cp(a.cp),
                                                                                         format_cp(b.cp)))
                lgr.add_variant(b.cp, a.cp, variant_type='blocked')

    while not check_symmetry(lgr, None)[0] or not check_transitivity(lgr, None)[0]:
        # symmetry
        for a in lgr.repertoire:
            for b in list(a.get_variants()):
                # Variant is defined in repertoire
                # let's see if the original character is in its
                # variants
                if a.cp not in [var.cp for var in lgr.get_variants(b.cp)]:
                    logger.info("Add code point '{}' as variant of '{}' for symmetry".format(format_cp(a.cp),
                                                                                             format_cp(b.cp)))
                    lgr.add_variant(b.cp, a.cp, variant_type='blocked')

        # transitivity
        for a in lgr.repertoire:
            for b in list(a.get_variants()):
                for c in [var for var in lgr.get_variants(b.cp) if var.cp != a.cp]:
                    # Iterate through all second-level variants
                    # which are not the original code point
                    # (A's variants are re-read as they grow in this loop)
                    if c.cp not in [var.cp for var in a.get_variants()]:
                        logger.info("Add code point '{}' as variant of '{}' for transitivity with '{}'".format(
                            format_cp(c.cp), format_cp(a.cp), format_cp(b.cp)))
                        lgr.add_variant(a.cp, c.cp, variant_type='blocked')
Exemple #15
0
def find_variants_to_block(lgr, label_ref, label):
    """
    Report the variants of a label that are also variants of a reference.

    Each variant from the disposition computation of ``label`` that is
    also generated for ``label_ref`` is written out as a variant that
    should be blocked.

    :param lgr: The LGR used to generate variants and dispositions.
    :param label_ref: The reference label.
    :param label: The label whose variants are checked.
    """
    var_ref = [variant for variant, _, _
               in lgr._generate_label_variants(label_ref)]

    for variant_cp, disp, _, _, disp_set, _ in \
            lgr.compute_label_disposition(label):
        if variant_cp not in var_ref:
            continue
        variant_u = cp_to_ulabel(variant_cp)
        message = ("Variant '%s' [%s] with disposition set '%s' "
                   "should be blocked (current disposition :%s)" %
                   (variant_u, format_cp(variant_cp), disp_set, disp))
        write_output(message)
def cross_script_variants(lgr, labels_input):
    """
    Generate a text report of the cross-script variants of labels.

    Invalid input labels are reported with their error. Labels that are
    not eligible for the LGR are silently skipped. For the other ones, the
    label is reported only if it has at least one cross-script variant,
    followed by each variant and its script mapping.

    :param lgr: The LGR to use for variant generation.
    :param labels_input: The file containing the labels
    :raises Exception: If the LGR has no metadata or no unicode database.
    """
    if lgr.metadata is None:
        logger.error("Cannot generate cross-scripts variants "
                     "for LGR without metadata")
        raise Exception
    if lgr.unicode_database is None:
        logger.error("Cannot generate cross-scripts variants "
                     "for LGR without unicode database attached")
        raise Exception

    found = False
    for label, valid, error in read_labels(labels_input, lgr.unicode_database):
        if not valid:
            yield "Input label {}: {}\n".format(label, error)
            continue

        label_cp = tuple(ord(c) for c in label)
        result, _, _, _, _, _ = lgr.test_label_eligible(label_cp)
        if not result:
            continue

        x_variants = _generate_variants(lgr, label_cp)
        for position, (variant, disp, script_mapping) in enumerate(x_variants):
            if position == 0:
                # Only display input label if it has x-variants
                yield "Input label {} ({}) has cross-script variants:\n".format(
                    format_cp(label_cp), label)
                found = True
            yield "\t- Cross-variant {} ({}), disposition {}:\n".format(
                format_cp(variant), cp_to_ulabel(variant), disp)
            mapping_lines = [
                "{} ({}): {}\n".format(format_cp(c), cp_to_ulabel(c), s)
                for c, s in script_mapping.items()
            ]
            yield '\t\t+ ' + '\t\t+ '.join(mapping_lines)

    if not found:
        yield 'No cross-script variants for input!'
Exemple #17
0
def check_label(lgr, label, generate_variants=False):
    """
    Test a label against an LGR and write out the outcome.

    The eligibility and disposition of the label are always written. An
    eligible label additionally gets the disposition of each entry of its
    disposition summary when ``generate_variants`` is set; a non-eligible
    one gets its valid and invalid code points.

    :param lgr: The LGR used to test the label.
    :param label: The label to test.
    :param generate_variants: Whether the variants of an eligible label
                              are written out.
    """
    from lgr.utils import format_cp

    def as_hex(code_points):
        # Space separated, 4-digit upper case hexadecimal code points.
        return u' '.join(u"{:04X}".format(cp) for cp in code_points)

    label_cp = tuple(ord(c) for c in label)

    write_output("Label: %s [%s]" % (label, format_cp(label_cp)))

    (eligible, label_part, not_in_lgr, disp, _, _) = lgr.test_label_eligible(label_cp)
    write_output("- Eligible: %s" % eligible)
    write_output("- Disposition: %s" % disp)

    if not eligible:
        write_output("- Valid code points from label: %s" % as_hex(label_part))
        write_output("- Invalid code points from label: %s" % as_hex(not_in_lgr))
        return

    if not generate_variants:
        return

    _, labels = lgr.compute_label_disposition_summary(label_cp)
    for (variant_cp, var_disp, _, _, _) in labels:
        variant_u = ''.join(unichr(c) for c in variant_cp)
        write_output("\tVariant %s [%s]" % (variant_u, format_cp(variant_cp)))
        write_output("\t- Disposition: '%s'" % var_disp)
Exemple #18
0
    def del_range(self, first_cp, last_cp):
        """
        Remove a whole range of characters from the repertoire.

        Note: The bounds MUST be those of a range that was added as such,
        deleting partial sub-ranges is not possible!

        :param first_cp: First code point of the range.
        :param last_cp: Last code point of the range.
        :raises NotInLGR: If the range does not exist, or if one of its
                          code points is missing.

        >>> cd = Repertoire()
        >>> cd.add_range(0x002A, 0x0030)
        >>> cd.del_range(0x002A, 0x0030)
        >>> 0x002A in cd
        False
        >>> cd.del_range(0x002A, 0x0030) # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
        ...
        NotInLGR:
        """
        assert first_cp < last_cp, "range must be defined in order"

        bounds = (first_cp, last_cp)
        if bounds not in self.ranges:
            logger.error("Range '%s - %s' does not exist", format_cp(first_cp),
                         format_cp(last_cp))
            raise NotInLGR(first_cp)

        for cp in range(first_cp, last_cp + 1):
            if self._del_char(RangeChar(cp, first_cp, last_cp)):
                continue
            # TODO: clean-up range on error
            # This should only happen if range insertion failed
            # -> inconsistent state for now
            logger.critical("Range '%s - %s' is missing code point %s",
                            format_cp(first_cp), format_cp(last_cp),
                            format_cp(cp))
            raise NotInLGR(cp)

        # Every code point is gone, forget the bounds of the range.
        self.ranges.remove(bounds)
Exemple #19
0
    def add_variant(self,
                    cp_or_sequence,
                    variant_type=None,
                    when=None,
                    not_when=None,
                    comment=None,
                    ref=None):
        """
        Attach a new variant to this char.

        Variants are stored per code point sequence; an equal variant
        cannot be stored twice for the same sequence.

        :param cp_or_sequence: Code point or code point sequence of the variant.
        :param variant_type: Type for the variant, if any.
        :param when: Condition to be satisfied for the variant to exist.
        :param not_when: Condition that must not be satisfied for the variant
                         to exist.
        :param comment: Optional comment for the variant.
        :param ref: List of references associated to the code point.
        :raises VariantAlreadyExists: If variant already exists for character.

        >>> c = CharBase.from_cp_or_sequence([1])
        >>> c.add_variant([10], 'BLOCKED')
        >>> (10, ) in c._variants
        True
        >>> c._variants[(10,)][0].type == text_type('BLOCKED')
        True
        """
        assert len(cp_or_sequence), "there should be at least one char"

        properties = dict(variant_type=variant_type,
                          when=when,
                          not_when=not_when,
                          comment=comment,
                          ref=ref)
        var = Variant(cp_or_sequence, **properties)

        idx = tuple(cp_or_sequence)
        duplicate = idx in self._variants and var in set(self._variants[idx])
        if duplicate:
            logger.error("%r: Variant '%s' already exists", self,
                         format_cp(cp_or_sequence))
            raise VariantAlreadyExists(self.cp, var.cp)

        self._variants.setdefault(idx, []).append(var)
Exemple #20
0
def rebuild_lgr(lgr, options):
    """
    Rebuild an LGR with given parameters.

    Every range, code point and variant of the source LGR is inserted into
    a freshly created LGR. Problems met along the way are logged and
    collected in the returned result dictionary; they do not abort the
    rebuild.

    options argument can contain:
        * unicode_version: The target Unicode version to be used
          when rebuilding the LGR. When the key is absent, the version
          stored in the source LGR metadata is used.
        * validating_repertoire: The validating repertoire used
          for checking code points.
        * unidb: Munidata's Unicode database. If None, skip Unicode checks.

    :param LGR lgr: The LGR to rebuild.
    :param dict options: Dictionary of options to the validation function.
    :return: Tuple ``(True, result)``. ``result`` contains:
               - 'description': text describing the rebuild parameters,
               - 'repertoire': per-char dictionary with optional 'errors',
                 'warnings' and 'variants' entries,
               - 'generic': only present when the version reported by unidb
                 differs from the target Unicode version.
    """
    # Local import to prevent import cycles
    from lgr.core import LGR

    unicode_version = options.get('unicode_version',
                                  lgr.metadata.unicode_version)
    validating_repertoire = options.get('validating_repertoire', None)

    description = "Rebuilding LGR with Unicode version {}".format(
        unicode_version)
    if validating_repertoire is not None:
        description += " and validating repertoire '{}'".format(
            validating_repertoire)
    result = {
        'description': description,
        'repertoire': {}  # XXX: Cannot use defaultdict because of django...
    }

    def report(reported_char, category, item):
        # Append item to result['repertoire'][reported_char][category],
        # creating the intermediate containers on first use.
        result['repertoire'].setdefault(reported_char, {}).setdefault(
            category, []).append(item)

    logger.info(
        "Rebuilding LGR '%s' with Unicode version %s "
        "and Validating Repertoire '%s'", lgr, unicode_version,
        validating_repertoire)

    unidb = options.get('unidb', None)
    if unidb is not None:
        unidb_version = unidb.get_unicode_version()
        if unidb_version != unicode_version:
            result['generic'] = "Target Unicode version {} " \
                                "differs from UnicodeDatabase {}".format(unicode_version,
                                                                         unidb_version)
            logger.warning(
                "Target Unicode version %s differs "
                "from UnicodeDatabase %s", unicode_version, unidb_version)

    # For now, simply copy the metadata and references of the source LGR
    target_metadata = copy.deepcopy(lgr.metadata)
    target_metadata.unicode_version = unicode_version
    target_reference_manager = copy.deepcopy(lgr.reference_manager)

    target_lgr = LGR(name=lgr.name,
                     metadata=target_metadata,
                     reference_manager=target_reference_manager,
                     unicode_database=unidb)

    for char in lgr.repertoire:
        if isinstance(char, RangeChar):
            # A range is only inserted when each of its code points is fine
            range_ok = True
            for cp, status in target_lgr.check_range(char.first_cp,
                                                     char.last_cp,
                                                     validating_repertoire):
                if status is not None:
                    report(char, 'errors', status)
                    range_ok = False
                in_script, _ = lgr.cp_in_script([cp])
                if not in_script:
                    report(char, 'warnings', CharNotInScript(cp))
                    range_ok = False

            if not range_ok:
                continue

            try:
                target_lgr.add_range(
                    char.first_cp,
                    char.last_cp,
                    comment=char.comment,
                    ref=char.references,
                    tag=char.tags,
                    when=char.when,
                    not_when=char.not_when,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=False)
            except LGRException as exc:
                report(char, 'errors', exc)
                logger.error("Cannot add range '%s-%s'",
                             format_cp(char.first_cp), format_cp(char.last_cp))
            # Range have no variants
            continue

        in_script, _ = lgr.cp_in_script(char.cp)
        if not in_script:
            report(char, 'warnings', CharNotInScript(char.cp))
        # Insert code point
        try:
            target_lgr.add_cp(char.cp,
                              comment=char.comment,
                              ref=char.references,
                              tag=char.tags,
                              when=char.when,
                              not_when=char.not_when,
                              validating_repertoire=validating_repertoire,
                              override_repertoire=False)
        except LGRException as exc:
            report(char, 'errors', exc)
            logger.error("Cannot add code point '%s'", format_cp(char.cp))
            if not isinstance(exc, CharInvalidIdnaProperty
                              ):  # Cannot include non-IDNA valid code points
                # The error has been recorded: insert the code point anyway
                target_lgr.add_cp(char.cp,
                                  comment=char.comment,
                                  ref=char.references,
                                  tag=char.tags,
                                  when=char.when,
                                  not_when=char.not_when,
                                  force=True)

        # Create variants
        for var in char.get_variants():
            try:
                target_lgr.add_variant(
                    char.cp,
                    variant_cp=var.cp,
                    variant_type=var.type,
                    when=var.when,
                    not_when=var.not_when,
                    comment=var.comment,
                    ref=var.references,
                    validating_repertoire=validating_repertoire,
                    override_repertoire=True)
            except LGRException as exc:
                # Variant errors are grouped per variant under 'variants'
                result['repertoire'].setdefault(char, {}).setdefault(
                    'variants', {}).setdefault(var, []).append(exc)
                logger.error("Cannot add variant '%s' to code point '%s'",
                             format_cp(var.cp), format_cp(char.cp))
                if not isinstance(
                        exc, CharInvalidIdnaProperty
                ):  # Cannot include non-IDNA valid code points
                    # The error has been recorded: insert the variant anyway
                    target_lgr.add_variant(char.cp,
                                           variant_cp=var.cp,
                                           variant_type=var.type,
                                           when=var.when,
                                           not_when=var.not_when,
                                           comment=var.comment,
                                           ref=var.references,
                                           force=True)

    logger.info("Rebuilding LGR '%s' done", lgr)

    return True, result
Exemple #21
0
    def _parse_doc(self, rule_file):
        """
        Actual parsing of document.

        The file is processed line by line. Blank lines and lines starting
        with '#' are ignored. A line is then tried, in this order, as a
        reference definition, a version definition and a code point
        definition. A code point line is split on ';': the first field is
        the character, the second its preferred variants (inserted with the
        "activated" type) and the third its other variants.

        Invalid lines are logged and skipped; they do not stop the parsing.

        :param rule_file: Content of the rule, as a file-like object.
        """
        for line_num, line in enumerate(rule_file, 1):
            line = line.strip()
            if not line:
                continue
            if line[0] == '#':
                continue

            reference = REFERENCE_RE.match(line)
            if reference is not None:
                ref_id = reference.group('ref_id')
                value = reference.group('value')
                comment = reference.group('comment')
                try:
                    self._lgr.add_reference(value,
                                            ref_id=ref_id,
                                            comment=comment)
                except LGRException:
                    logger.error("Invalid reference '%s' on line %d", line,
                                 line_num)
                continue

            version = VERSION_RE.match(line)
            if version is not None:
                version_no = version.group('version_no')
                date = version.group('date')
                comment = version.group('comment')

                try:
                    self._lgr.metadata.version = Version(version_no,
                                                         comment=comment)
                    self._lgr.metadata.date = date
                except LGRException:
                    logger.error("Invalid version '%s' on line %d", line,
                                 line_num)
                continue

            if UNICODE_CODEPOINT_RE.match(line) is None:
                logger.debug("Skipping non-parsable line %d:\n%s", line_num,
                             line)
                # Line is not starting with a valid unicode code point, skip
                continue

            # Split base character from variant(s)
            char_variant = line.split(';')
            char = char_variant[0]

            try:
                [(codepoints, references)] = parse_char(char)
                self._lgr.add_cp(codepoints, ref=references)
            except ValueError:
                logger.error("Invalid character '%s' at line %d", char,
                             line_num)
                # Without a parsed character, 'codepoints' is either unbound
                # or left over from a previous line: the variants of this
                # line cannot be attached to anything, skip the line.
                continue
            except LGRException as exc:
                # 'codepoints' is bound here: only add_cp raises LGRException
                # inside this handler's scope once parsing has succeeded.
                logger.error("Cannot add code point '%s' at line %d: %s",
                             format_cp(codepoints), line_num, exc)

            if len(char_variant) > 1:
                preferred_variants = char_variant[1].strip()
                if len(preferred_variants
                       ) > 0 and preferred_variants[0] != '#':
                    # From RFC7940, Section 7.3. Recommended Disposition Values:
                    # activated  The resulting string should be activated for use.  (This
                    # is the same as a Preferred Variant [RFC3743].)
                    var_type = "activated"
                    self.insert_variant(line_num, codepoints,
                                        preferred_variants, var_type)

            if len(char_variant) > 2:
                variants = char_variant[2].strip()
                if len(variants) > 0 and variants[0] != '#':
                    self.insert_variant(line_num, codepoints, variants)
Exemple #22
0
def compute_stats(lgr, options):
    """
    Compute statistics for an LGR.

    The statistics (number of code points, ranges, sequences, variants,
    tags and rules) are formatted as a text report which is sent to the
    module logger at INFO level.

    :param lgr: The LGR to use.
    :param options: Not used.
    :return: Always True.
    """

    stats = {
        'codepoint_number': 0,
        'range_number': 0,
        'largest_range': '',
        'largest_range_len': 0,
        'sequence_number': 0,
        'largest_sequence': '',
        'largest_sequence_len': 0,
        'codepoints_with_variants': 0,
        'variant_number': 0,
        'variants_by_type': {},
        'average_variants': 0,
        'codepoints_by_tag': {},
        'rule_number': len(lgr.rules),
    }
    by_tag = stats['codepoints_by_tag']
    by_type = stats['variants_by_type']

    for char in lgr.repertoire:

        # Range len set to 1 by default (for single code point and sequences)
        range_len = 1
        if isinstance(char, RangeChar):
            range_len = char.last_cp - char.first_cp + 1
            stats['codepoint_number'] += range_len
            stats['range_number'] += 1
            if range_len > stats['largest_range_len']:
                stats['largest_range_len'] = range_len
                stats['largest_range'] = format_cp(char.cp)
        elif isinstance(char, CharSequence):
            # A sequence counts as a single repertoire entry
            stats['codepoint_number'] += 1
            stats['sequence_number'] += 1
            sequence_len = len(char.cp)
            if sequence_len > stats['largest_sequence_len']:
                stats['largest_sequence_len'] = sequence_len
                stats['largest_sequence'] = format_cp(char.cp)
        elif isinstance(char, Char):
            stats['codepoint_number'] += 1

        # A tagged range weighs as many code points as it contains
        for t in char.tags:
            by_tag[t] = by_tag.get(t, 0) + range_len

        variants = list(char.get_variants())
        variants_len = len(variants)
        stats['variant_number'] += variants_len
        if variants_len > 0:
            stats['codepoints_with_variants'] += 1

        for var in variants:
            by_type[var.type] = by_type.get(var.type, 0) + 1

    if stats['codepoints_with_variants'] != 0:
        stats['average_variants'] = \
            stats['variant_number'] / stats['codepoints_with_variants']

    # General summary
    output = """
General summary:
\tNumber of code points: {codepoint_number}.

\tNumber of ranges: {range_number}.
\tLargest range: {largest_range} (length: {largest_range_len}).

\tNumber of sequences: {sequence_number}.
\tLargest sequence: {largest_sequence} (length: {largest_sequence_len}).
""".format(**stats)

    # Variants
    output += """
Variants:
\tTotal number of variants: {variant_number}.
\tAverage number of variants per code point: {average_variants}.

""".format(**stats)

    # dict.items() exists on both Python 2 and Python 3, whereas
    # dict.iteritems() was removed in Python 3.
    for (variant_type, number) in by_type.items():
        output += "\tNumber of variants for type '{0}': {1}.\n"\
                .format(variant_type, number)

    # Tags
    output += """
Tags:
"""
    for (tag_name, number) in by_tag.items():
        output += "\tNumber of code points for tag '{0}': {1}.\n"\
                .format(tag_name, number)

    # Rules summary
    output += "\nRules:\n"
    output += "\tNumber of rules defined: {0}.\n".format(stats['rule_number'])

    logger.info(output)

    return True
Exemple #23
0
def _generate_indexes(lgr, labels, keep=False, quiet=False):
    """
    Generate indexes based on labels provided in the list

    :param lgr: The current LGR
    :param labels: The list of labels, as a list of U-Labels.
    :param keep: Do we keep labels without collision in the output
    :param quiet: If True, do not collect rule log.

    :return: (label_indexes, not_in_lgr), with:
              - label_indexes: the dictionary containing the primary labels
                               and their variants (with various information) for each index.
              - not_in_lgr: List of labels that do not pass preliminary eligibility testing.
  """

    label_indexes = {}
    not_in_lgr = []
    # Get the indexes and variants for all labels
    for label in labels:
        label_cp = tuple([ord(c) for c in label])
        try:
            label_index = lgr.generate_index_label(label_cp)
        except NotInLGR:
            not_in_lgr.append(label_cp)
            continue

        label_cp_out = format_cp(label_cp)
        if label_index not in label_indexes:
            label_indexes[label_index] = []
        label_indexes[label_index].append({
            'label':
            label,
            'bidi':
            "%s'%s'%s" % (LRI, label, PDI),
            'cat':
            PRIMARY,
            'cp':
            label_cp,
            'cp_out':
            label_cp_out,
            'disp': {
                label: '-'
            },
            'rules': {
                label: '-'
            },
            'action_idx': {
                label: '-'
            }
        })

    for (label_index, primaries) in deepcopy(label_indexes).items():
        # only get variants for collided labels (if not keep)
        if len(primaries) < 2 and not keep:
            del label_indexes[label_index]
            continue
        for primary in primaries:
            label_cp = primary['cp']
            label = primary['label']
            for (variant_cp, variant_disp, variant_invalid_parts, action_idx,
                 _,
                 log) in lgr.compute_label_disposition(label_cp,
                                                       include_invalid=True,
                                                       collect_log=not quiet):
                variant = cp_to_ulabel(variant_cp)
                log = log.strip()
                if quiet:
                    log = ''
                variant_cp_out = format_cp(variant_cp)
                # search if variant is already in our dict, then add or
                # update it
                existing = [
                    var for var in label_indexes[label_index]
                    if var['label'] == variant
                ]
                if len(existing) < 1:
                    label_indexes[label_index].append({
                        'label':
                        variant,
                        'bidi':
                        "%s'%s'%s" % (LRI, variant, PDI),
                        'cat':
                        VARIANT,
                        'cp':
                        variant_cp,
                        'cp_out':
                        variant_cp_out,
                        'disp': {
                            label: variant_disp
                        },
                        'rules': {
                            label: log
                        },
                        'action_idx': {
                            label: action_idx
                        }
                    })
                else:
                    assert len(existing) == 1
                    existing[0]['disp'][label] = variant_disp
                    existing[0]['rules'][label] = log
                    existing[0]['action_idx'][label] = action_idx

    return label_indexes, not_in_lgr
Exemple #24
0
def main():
    """
    Command line entry point: detect collisions with a set of labels.

    The LGR is parsed from the XML file given on the command line, and the
    index label of every label of the reference set file is computed.
    Labels are then read from standard input, one per line: a label whose
    index label equals the one of a reference label is reported as a
    collision. With --generate, find_variants_to_block is called for each
    collision.

    Labels for which no index label can be computed (NotInLGR) are skipped
    in the reference set and reported for the input labels.
    """
    parser = argparse.ArgumentParser(description='LGR Collision')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='be verbose')
    parser.add_argument('-g',
                        '--generate',
                        action='store_true',
                        help='Generate variants')
    parser.add_argument('-l',
                        '--libs',
                        metavar='LIBS',
                        help='ICU libraries',
                        required=True)
    parser.add_argument('-s',
                        '--set',
                        metavar='SET FILE',
                        help='Filepath to the set of reference labels',
                        required=True)
    parser.add_argument('xml', metavar='XML')

    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level)

    lgr_parser = XMLParser(args.xml)

    # --libs is expected as '<libpath>#<i18n_libpath>#<libver>'
    libpath, i18n_libpath, libver = args.libs.split('#')
    manager = UnicodeDataVersionManager()
    unidb = manager.register(None, libpath, i18n_libpath, libver)

    lgr_parser.unicode_database = unidb

    lgr = lgr_parser.parse_document()
    if lgr is None:
        logger.error("Error while parsing LGR file.")
        logger.error("Please check compliance with RNG.")
        return

    ref_label_indexes = {}

    # Compute index label for set or reference labels
    with io.open(args.set, 'r', encoding='utf-8') as ref_set:
        for ref_label in ref_set:
            label_cp = tuple([ord(c) for c in ref_label.strip()])
            try:
                label_index = compute_label_index(lgr, label_cp)
            except NotInLGR:
                continue
            ref_label_indexes[label_index] = label_cp

    # Deal with input
    for label in get_stdin().read().splitlines():
        write_output("Check label '%s'" % label)
        label_cp = tuple([ord(c) for c in label])
        label_disp = format_cp(label_cp)
        try:
            label_index = compute_label_index(lgr, label_cp)
        except NotInLGR:
            # Same tolerance as for the reference set: one ineligible label
            # must not abort the processing of the remaining ones.
            write_output("Label '%s' [%s] is not in LGR" %
                         (label, label_disp))
            continue

        if label_index in ref_label_indexes:
            ref_label_cp = ref_label_indexes[label_index]
            ref_label_disp = format_cp(ref_label_cp)
            ref_label_u = cp_to_ulabel(ref_label_cp)

            write_output("Collision for label '%s' [%s] with '%s' [%s]" %
                         (label, label_disp, ref_label_u, ref_label_disp))
            if args.generate:
                find_variants_to_block(lgr, ref_label_cp, label_cp)
        else:
            write_output("No collision for label %s [%s]" %
                         (label, label_disp))
Exemple #25
0
    def matches(self,
                label,
                rules_lookup,
                classes_lookup,
                unicode_database,
                anchor=None,
                index=0):
        """
        Tell whether this rule matches a label.

        The rule pattern is retrieved, formatted with the anchor when one is
        given, compiled by the Unicode database, then searched in the label
        starting at the given index.

        :param label: Label to test, as a sequence of code points.
        :param rules_lookup: Dictionary of defined rules in the LGR to use
                             for by-ref rules.
        :param classes_lookup: Dictionary of defined classes in the LGR to use
                               for by-ref classes.
        :param unicode_database: The Unicode Database.
        :param anchor: Optional anchor to use for look-around rules.
        :param index: If anchor is used, its index (0-based).
        :return: True if label is matched by the rule, False otherwise.
        :raises RuleError: If the pattern cannot be retrieved or compiled.
        """
        label_repr = format_cp(label)
        anchor_repr = format_cp(anchor) if anchor else anchor
        rule_logger.debug(
            "Test match on %s for label '%s' with anchor '%s' (%d)", self,
            label_repr, anchor_repr, index)

        try:
            pattern = self.get_pattern(rules_lookup, classes_lookup,
                                       unicode_database)
        except (re.error, PICUException) as re_exc:
            rule_logger.error('Cannot get pattern for rule %s: %s', self,
                              re_exc)
            raise RuleError(self.name, re_exc)

        if len(pattern) == 0:
            # Pattern is empty, nothing will match
            rule_logger.debug('Empty pattern')
            return False

        has_anchor = anchor is not None
        if has_anchor:
            if '%(anchor)s' not in pattern:
                rule_logger.debug('Not a parameterized context rule')
                # Pattern is not a parameterized context-rule, so set index to 0
                index = 0
            # Format anchor - Can be a sequence.
            # Use old-style formatting, see note in matcher.AnchorMatcher
            escaped_anchor = ''.join(
                '\\x{{{:X}}}'.format(c) for c in anchor)
            pattern = pattern % {'anchor': escaped_anchor}

        rule_logger.debug("Pattern for rule %s: '%s'", self, pattern)
        try:
            regex = unicode_database.compile_regex(pattern)
        except (re.error, PICUException) as re_exc:
            rule_logger.error('Cannot compile regex: %s', re_exc)
            raise RuleError(self.name, re_exc)

        rule_logger.debug("Index: %d", index)

        # Convert label to U-format to be used in regex
        label_u = cp_to_ulabel(label)

        # Look for match. It is important to use "search" and not "match"
        # here, since a rule may not match at the beginning of a label.
        result = regex.search(label_u, index=index)
        rule_logger.debug("Result of match: %s", result)
        if result is None:
            return False

        if not has_anchor:
            return True

        # With an anchor, a match starting after the index is rejected
        match_index = result.start()
        rule_logger.debug('Match index: %d - Index: %d', match_index,
                          index)
        if match_index > index:
            rule_logger.debug('Match found after index, invalid')
            return False
        return True
Exemple #26
0
def diff_lgrs(lgr1, lgr2):
    """
    Compare 2 LGRs.

    Ranges of both LGRs are expanded first (which modifies the given LGRs).
    The report then covers, in this order: metadata, references, repertoire
    membership, the code points present in both repertoires, and finally
    actions, rules and classes.

    :param lgr1: First LGR.
    :param lgr2: Second LGR.
    :return: Text comparison.
    """
    lgr1.expand_ranges()
    lgr2.expand_ranges()

    # Report fragments, joined at the end
    sections = [
        diff_metadata(lgr1.metadata, lgr2.metadata),
        diff_reference_manager(lgr1.reference_manager,
                               lgr2.reference_manager),
        "\n\n\n** Compare repertoire **\n",
    ]

    first_cps = {c.cp for c in lgr1.repertoire}
    second_cps = {c.cp for c in lgr2.repertoire}

    sections.append(
        compare_things(first_cps,
                       second_cps,
                       "repertoire",
                       True,
                       format_fct=lambda c: " ".join(map(format_cp, c)),
                       show_same=False))

    sections.append("\n\n\n** Compare common code points in repertoire **\n")

    for cp in first_cps.intersection(second_cps):
        char1 = lgr1.get_char(cp)
        char2 = lgr2.get_char(cp)

        sections.append("\n\nCompare code point {}".format(format_cp(cp)))
        sections.append(diff_char(char1, char2))

    sections.append("\n\n\n** Compare WLE **\n")

    sections.append(diff_actions(lgr1, lgr2))
    sections.append(diff_rules(lgr1, lgr2))
    sections.append(diff_classes(lgr1, lgr2))

    return "".join(sections)
Exemple #27
0
    def _process_data(self, elem):
        """
        Process the <data> element of an LGR XML file.

        Each child is handled according to its tag: a char element is added
        as a code point together with its variants, a range element is added
        as a range. Insertion errors are logged and recorded in the RFC 7940
        checks; they are re-raised unless force mode is enabled. Each child
        is cleared once processed.

        :param elem: The <data> element, iterated to get its children.
        """

        # It is RECOMMENDED to list all "char" elements in ascending order of
        # the "cp" attribute. The below variable is used when verifying that.
        last_codepoint = []

        for child in elem:
            # Attributes shared by char and range elements
            comment = child.get('comment', None)
            when = child.get('when', None)
            not_when = child.get('not-when', None)
            ref = string_to_list(child.get('ref', ''))
            tag = string_to_list(child.get('tag', ''))

            if child.tag == CHAR_TAG:
                codepoint = [int(c, 16) for c in child.get('cp').split()]

                if codepoint <= last_codepoint:
                    if last_codepoint[:len(codepoint)] == codepoint:
                        # Not clear what order is to be recommended here
                        self.rfc7940_checks.error(
                            'char_strict_ascending_order')
                    else:
                        logger.warning(
                            "cp attribute not in ascending order: '%s'",
                            child.get('cp'))
                        self.rfc7940_checks.error('char_ascending_order')
                last_codepoint = codepoint

                try:
                    self._lgr.add_cp(codepoint,
                                     comment=comment,
                                     ref=ref,
                                     tag=tag,
                                     when=when,
                                     not_when=not_when,
                                     force=self.force_mode)
                except LGRException as exc:
                    logger.error("Cannot add code point '%s': %s",
                                 format_cp(codepoint), exc)
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    if not self.force_mode:
                        raise

                # Variants of char
                for variant in child.iter(VARIANT_TAG):
                    var_codepoint = [
                        int(c, 16) for c in variant.get('cp').split()
                    ]
                    var_when = variant.get('when', None)
                    var_not_when = variant.get('not-when', None)
                    var_type = variant.get('type', None)
                    var_comment = variant.get('comment', None)
                    var_ref = string_to_list(variant.get('ref', ''))

                    try:
                        self._lgr.add_variant(codepoint,
                                              var_codepoint,
                                              variant_type=var_type,
                                              when=var_when,
                                              not_when=var_not_when,
                                              comment=var_comment,
                                              ref=var_ref,
                                              force=self.force_mode)
                    except LGRException as exc:
                        logger.error(
                            "Cannot add variant '%s' "
                            "to code point '%s': %s", format_cp(var_codepoint),
                            format_cp(codepoint), exc)
                        self.rfc7940_checks.error('parse_xml')
                        self.rfc7940_checks.error('codepoint_valid')
                        if not self.force_mode:
                            raise
            elif child.tag == RANGE_TAG:
                first_cp = int(child.get('first-cp'), 16)
                last_cp = int(child.get('last-cp'), 16)

                try:
                    self._lgr.add_range(first_cp,
                                        last_cp,
                                        comment=comment,
                                        ref=ref,
                                        tag=tag,
                                        when=when,
                                        not_when=not_when,
                                        force=self.force_mode)
                except LGRException as exc:
                    self.rfc7940_checks.error('parse_xml')
                    self.rfc7940_checks.error('codepoint_valid')
                    logger.error("Cannot add range '%s-%s': %s",
                                 format_cp(first_cp), format_cp(last_cp), exc)
                    if not self.force_mode:
                        raise

            child.clear()

        self.rfc7940_checks.tested('char_ascending_order')
        self.rfc7940_checks.tested('char_strict_ascending_order')
Exemple #28
0
def check_symmetry(lgr, options):
    """
    Check that all variants are defined in a symmetric way.

    If B is a variant of A, then A must be a variant of B.

    For every variant mapping A -> B of a non-range character, three
    problems are detected. Each one is logged as a warning, notified on the
    LGR and recorded in the 'repertoire' list of the result:

    - B is not in the repertoire: 'basic_symmetry' error, recorded with
      type 'not-in-repertoire' ('char' is A, 'variant' is B).
    - B does not list A in its variants: 'basic_symmetry' error, recorded
      with type 'missing' ('char' is B, 'variant' is A).
    - B lists A, but no B -> A mapping has the same "when" and "not-when"
      values as the A -> B mapping: 'strict_symmetry' error, also recorded
      with type 'missing' ('char' is B, 'variant' is A).

    The variant type is not compared.

    :param lgr: The LGR to be tested.
    :param options: Dictionary of options to the validation function - unused.
    :return: A (success, result) tuple. success is True if no symmetry
             problem was found, False otherwise. result is a dictionary
             with a 'description' string and a 'repertoire' list holding
             one entry per problem found.
    """
    logger.info("Testing symmetry")

    success = True
    result = {'description': 'Testing symmetry', 'repertoire': []}
    for a in lgr.repertoire:
        if isinstance(a, RangeChar):
            # Range have no variants
            continue

        for b in a.get_variants():
            if b.cp not in lgr.repertoire:
                # Variant is not defined in repertoire
                logger.warning('CP %s: Variant %s is not in repertoire.',
                               format_cp(a.cp), format_cp(b.cp))
                lgr.notify_error('basic_symmetry')
                result['repertoire'].append({
                    'char': a,
                    'variant': b,
                    'type': 'not-in-repertoire'
                })
                success = False
                continue

            # Variant is defined in repertoire: fetch its mappings back to
            # the original character once, and reuse them for both the
            # basic and the strict check.
            reverse_mappings = [c for c in lgr.get_variants(b.cp)
                                if c.cp == a.cp]
            if not reverse_mappings:
                success = False
                logger.warning('CP %s should have CP %s in its variants.',
                               format_cp(b.cp), format_cp(a.cp))
                lgr.notify_error('basic_symmetry')
                result['repertoire'].append({
                    'char': b,
                    'variant': a,
                    'type': 'missing'
                })
                continue

            # Now let's check if the reverse mappings agree in their
            # "when" or "not-when" attributes: one agreeing mapping is enough
            if not any(c.when == b.when and c.not_when == b.not_when
                       for c in reverse_mappings):
                success = False
                lgr.notify_error('strict_symmetry')
                logger.warning(
                    'CP %s should have CP %s in its strict variants.',
                    format_cp(b.cp), format_cp(a.cp))
                result['repertoire'].append({
                    'char': b,
                    'variant': a,
                    'type': 'missing'
                })
    logger.info("Symmetry test done")
    lgr.notify_tested('basic_symmetry')
    lgr.notify_tested('strict_symmetry')

    return success, result
Exemple #29
0
    def apply(self, label, disp_set, only_variants, rules_lookup,
              classes_lookup, unicode_database):
        """
        Evaluate this action against a label.

        The action triggers when its rule condition ("match" or
        "not-match") and its variant condition ("any-variant",
        "all-variants" or "only-variants") both hold. A condition which is
        not set on the action is considered satisfied.

        :param label: The label to process, as a sequence of code points.
        :param disp_set: Set of dispositions used to generate the label.
        :param only_variants: True if label only contains code point
                              from variant mapping.
        :param rules_lookup: Dictionary of defined rules in the LGR.
        :param classes_lookup: Dictionary of defined classes in the LGR.
        :param unicode_database: The Unicode Database used to process rules.
        :return: The disposition of the action if it is triggered,
                 None otherwise.
        :raises RuleError: If rule is invalid (not raised here directly,
                           expected to come from the rule matching).
        """
        # Implements step 2 of RFC7940, section 8.3.
        # (Determining a Disposition for a Label or Variant Label)
        rule_logger.debug(
            "Applying action %s on label '%s' with disposition set '%s'",
            self, format_cp(label), disp_set)

        # Bullets 1 and 2: rule condition. "match" is looked at first,
        # "not-match" is only considered when "match" is not set.
        rule_ok = True
        if self.match is not None:
            rule_ok = rules_lookup[self.match].matches(
                label, rules_lookup, classes_lookup, unicode_database)
            rule_logger.info('Action %s: when rule matched: %s', self,
                             rule_ok)
        elif self.not_match is not None:
            rule_ok = not rules_lookup[self.not_match].matches(
                label, rules_lookup, classes_lookup, unicode_database)
            rule_logger.info('Action %s: not-when rule matched: %s', self,
                             rule_ok)

        # Bullets 3 to 5: variant condition, first attribute set wins.
        variants_ok = True
        if self.any_variant is not None:
            # "any-variant": one disposition in common is enough.
            variants_ok = bool(self.any_variant & disp_set)
            rule_logger.info('Action %s: any-variant matched: %s', self,
                             variants_ok)
        elif self.all_variants is not None:
            # "all-variants": there must be at least one disposition and
            # every one of them must be among the listed types.
            variants_ok = (bool(disp_set)
                           and disp_set.issubset(self.all_variants))
            rule_logger.info('Action %s: all-variants matched: %s', self,
                             variants_ok)
        elif self.only_variants is not None:
            # "only-variants": same as "all-variants", with the extra
            # requirement that the label is only made of code points
            # generated from variant mappings (including reflexive ones).
            variants_ok = (only_variants and bool(disp_set)
                           and disp_set.issubset(self.only_variants))
            rule_logger.info('Action %s: only-variants matched: %s', self,
                             variants_ok)

        # Last bullet: both conditions default to True when not set, so an
        # action without any condition always triggers.
        if not (rule_ok and variants_ok):
            rule_logger.info('Action %s not triggered', self)
            return None

        rule_logger.info('Action %s triggered, disposition: %s', self,
                         self.disp)
        return self.disp
Exemple #30
0
def check_label(lgr,
                label,
                generate_variants=False,
                merged_lgr=None,
                set_labels=None):
    """
    Test a label against an LGR and report the outcome with write_output.

    The eligibility and the disposition of the label are always reported.
    For an eligible label, collisions with the LGR set labels and the
    label variants can be reported as well. For a label which is not
    eligible, its valid and invalid code points are listed.

    :param lgr: The LGR used to test the label.
    :param label: The label to test, as a string.
    :param generate_variants: If True, list the variants of an eligible
                              label along with their dispositions.
    :param merged_lgr: The LGR handed to get_collisions to look for
                       collisions with the set labels.
    :param set_labels: The list of labels of the LGR set. Collisions are
                       only checked when both merged_lgr and set_labels
                       are provided.
    :return: None. The function also stops early, after logging an error
             and before any variant is listed, when the collisions
             computed are not consistent with the set labels.
    """
    from lgr.utils import format_cp
    label_cp = tuple(ord(c) for c in label)

    write_output("\nLabel: %s [%s]" % (label, format_cp(label_cp)))

    (eligible, label_parts, label_invalid_parts, disp, _,
     _) = lgr.test_label_eligible(label_cp)
    write_output("\tEligible: %s" % eligible)
    write_output("\tDisposition: %s" % disp)

    if eligible:
        if merged_lgr and set_labels:
            write_output("Collisions:")
            if label in set_labels:
                write_output("Labels is in the LGR set labels")
            else:
                indexes = get_collisions(merged_lgr,
                                         set_labels + [label],
                                         quiet=True)
                if len(indexes) > 1:
                    # there should be one collision except if set labels are not checked
                    logger.error(
                        'More than one collision, please check your LGR set labels'
                    )
                    return
                elif len(indexes) > 0:
                    # Single collision group: take the labels of its only key
                    collisions = indexes[next(iter(indexes))]
                    collision = None
                    collide_with = []
                    # retrieve label in collision list
                    for col in collisions:
                        if col['label'] == label:
                            collision = col
                        if col['label'] in set_labels:
                            collide_with.append(col)

                    if not collision:
                        # this should not happen except if set labels are not checked
                        logger.error(
                            'Cannot retrieve label in collisions, please check your LGR set labels'
                        )
                        return

                    if len(collide_with) != 1:
                        logger.error(
                            'Collision with more than one label in the LGR set labels, '
                            'please check your LGR set labels')
                        return

                    write_output("Label collides with LGR set label '%s'" %
                                 collide_with[0]['label'])
                else:
                    write_output('\tNone')

        if generate_variants:
            write_output("Variants:")
            # Only the per-variant entries are reported, not the summary
            _, labels = lgr.compute_label_disposition_summary(label_cp)
            for (variant_cp, var_disp, _, _, _) in labels:
                variant_u = cp_to_ulabel(variant_cp)
                write_output("\tVariant %s [%s]" %
                             (variant_u, format_cp(variant_cp)))
                write_output("\t- Disposition: '%s'" % var_disp)
    else:
        write_output("- Valid code points from label: %s" %
                     u' '.join(u"{:04X}".format(cp) for cp in label_parts))
        if label_invalid_parts:
            # Each invalid part is a (code point, rules) pair; rules is None
            # when the code point is not in the repertoire
            write_output("- Invalid code points from label: {}".format(
                ' '.join("{:04X} ({})".format(
                    cp,
                    "not in repertoire" if rules is None else ','.join(rules))
                         for cp, rules in label_invalid_parts)))