Example #1
0
def _get_variants(lgr, label_cplist, threshold_include_vars, idna_encoder,
                  lgr_actions):
    """
    Compute the variants of a label along with their display information.

    :param lgr: The LGR object used to compute the variants.
    :param label_cplist: The label, as a sequence of code points.
    :param threshold_include_vars: Maximum number of variants for which the
                                   detailed list is included in the result.
                                   A negative value always includes it.
    :param idna_encoder: Function converting a U-label into an A-label.
    :param lgr_actions: Sequence of actions, indexed by the action index
                        reported for each variant.
    :return: A dictionary with the keys 'summary', 'num_variants' and
             'threshold_include_vars', plus 'variants' (one dictionary per
             variant) when the detailed list is included.
    """
    summary, label_dispositions = lgr.compute_label_disposition_summary(
        label_cplist, include_invalid=True)
    res = {
        'summary': summary,
        'num_variants': len(label_dispositions),
        'threshold_include_vars': threshold_include_vars,
    }
    if 0 <= threshold_include_vars < len(label_dispositions):
        # More variants than the threshold allows: only report the summary
        return res

    var_results = []
    for (variant_cp, var_disp, var_invalid_parts, action_idx, disp_set,
         logs) in label_dispositions:
        invalid_codepoints = {c for c, _ in var_invalid_parts or []}

        # Code points are rendered as numeric character references, so the
        # resulting markup never contains raw label characters.
        html_parts = []
        for cp in variant_cp:
            cp_html = u"U+{:04X} (&#{};)".format(cp, cp)
            if cp in invalid_codepoints:
                cp_html = u'<span class="text-danger not-in-rep">{}</span>'.format(
                    cp_html)
            html_parts.append(cp_html)
        variant_display_html = mark_safe(u' '.join(html_parts))

        variant_display = u' '.join(u"U+{:04X}".format(cp)
                                    for cp in variant_cp)
        variant_input = u' '.join(u"U+{:04X}".format(cp)
                                  for cp in variant_cp)

        variant_u = cp_to_ulabel(variant_cp)
        try:
            variant_a = idna_encoder(variant_u)
        except UnicodeError as e:
            # A variant that cannot be IDNA-encoded must not abort the
            # whole computation: report the error in place of the A-label.
            variant_a = '!ERROR - {}!'.format(e)

        var_results.append({
            'u_label': variant_u,
            'a_label': variant_a,
            'cp_display_html': variant_display_html,
            'cp_display': variant_display,
            'cp_input': variant_input,
            'disposition': var_disp,
            'label_invalid_parts': var_invalid_parts,
            'action_idx': action_idx,
            # A negative index means that no action is associated
            'action': lgr_actions[action_idx] if action_idx >= 0 else None,
            'disp_set': disp_set,
            'logs': logs,
        })
    res['variants'] = var_results

    return res
Example #2
0
def validate_label_task(lgr_json, label, email_address, storage_path):
    """
    Run the validation of a label against a LGR as a tool task.

    The actual processing is delegated to ``_lgr_tool_task``, using
    ``lgr_validate_label`` as callback.

    :param lgr_json: The LGRInfo as a JSON object.
    :param label: The label to validate, as a list of code points.
    :param email_address: The e-mail address where the results will be sent
    :param storage_path: The place where results will be stored
    """
    lgr_info = LGRInfo.from_dict(lgr_json)
    udata = get_db_by_version(lgr_info.lgr.metadata.unicode_version)

    logger.info("Starting task 'validate label' for %s, for input label '%s'",
                lgr_info.name, label)

    # The body stops after "has": presumably the task helper appends the
    # outcome of the processing - TODO confirm against _lgr_tool_task.
    body_template = "Hi,\nThe processing of label validation for label '{label}' in LGR '{lgr}' has"
    email_body = body_template.format(label=cp_to_ulabel(label),
                                      lgr=lgr_info.name)

    task_options = {
        'base_filename': 'label_validation_{0}'.format(lgr_info.name),
        'email_subject': 'LGR Toolset label validation result',
        'email_body': email_body,
        'email_address': email_address,
        'cb': lgr_validate_label,
        'lgr': lgr_info.lgr,
        'label': label,
        'udata': udata,
    }
    _lgr_tool_task(storage_path, **task_options)
Example #3
0
def find_variants_to_block(lgr, label_ref, label):
    """
    Report the variants of a label that are also variants of a reference label.

    :param lgr: The LGR used to generate variants and dispositions.
    :param label_ref: The reference label, as a sequence of code points.
    :param label: The label to check, as a sequence of code points.
    """
    reference_variants = [
        variant for variant, _, _ in lgr._generate_label_variants(label_ref)
    ]

    for disposition in lgr.compute_label_disposition(label):
        variant_cp, disp, _, _, disp_set, _ = disposition
        if variant_cp not in reference_variants:
            continue
        message = ("Variant '%s' [%s] with disposition set '%s' "
                   "should be blocked (current disposition :%s)")
        write_output(message % (cp_to_ulabel(variant_cp),
                                format_cp(variant_cp), disp_set, disp))
Example #4
0
def cross_script_variants(lgr, labels_input):
    """
    Compute cross-script variants of labels.

    This is a generator: the checks on the LGR are only performed once
    iteration starts.

    :param lgr: The LGR to use for variant generation.
    :param labels_input: The file containing the labels
    :return: Generator of the output text, chunk by chunk.
    :raises Exception: If the LGR has no metadata or no unicode database
                       attached.
    """
    if lgr.metadata is None:
        message = ("Cannot generate cross-scripts variants "
                   "for LGR without metadata")
        logger.error(message)
        # Same exception type as before, but now carrying the reason
        raise Exception(message)
    if lgr.unicode_database is None:
        message = ("Cannot generate cross-scripts variants "
                   "for LGR without unicode database attached")
        logger.error(message)
        raise Exception(message)

    found = False
    for label, valid, error in read_labels(labels_input, lgr.unicode_database):
        if not valid:
            yield "Input label {}: {}\n".format(label, error)
            continue

        label_cp = tuple(ord(c) for c in label)
        result, _, _, _, _, _ = lgr.test_label_eligible(label_cp)
        if not result:
            # Labels that are not eligible are silently skipped
            continue

        label_displayed = False
        for variant, disp, script_mapping in _generate_variants(lgr, label_cp):
            if not label_displayed:
                # Only display input label if it has x-variants
                yield "Input label {} ({}) has cross-script variants:\n".format(
                    format_cp(label_cp), label)
                label_displayed = True
                found = True
            yield "\t- Cross-variant {} ({}), disposition {}:\n".format(
                format_cp(variant), cp_to_ulabel(variant), disp)
            yield '\t\t+ ' + '\t\t+ '.join([
                "{} ({}): {}\n".format(format_cp(c), cp_to_ulabel(c), s)
                for c, s in script_mapping.items()
            ])

    if not found:
        yield 'No cross-script variants for input!'
Example #5
0
def check_label(lgr, label, invalid, test):
    """
    Validate a label against an LGR and report the result.

    Details are logged; the validation result, the disposition and the
    variants are emitted through ``write_output``.

    :param lgr: The LGR object used to validate the label.
    :param label: The label to validate, as a unicode string.
    :param invalid: Passed as ``include_invalid`` when computing the variants.
    :param test: Forwarded unchanged as second argument of ``write_output``.
    """
    from lgr.utils import format_cp

    def describe_action(idx):
        # The displayed index of a default action is `idx - len(lgr.actions)`,
        # so the first default action sits exactly at len(lgr.actions): the
        # comparison has to be inclusive, otherwise DefaultAction[0] would be
        # reported as an out-of-range Action[len(lgr.actions)].
        nb_actions = len(lgr.actions)
        if idx >= nb_actions:
            return "DefaultAction", idx - nb_actions
        return "Action", idx

    def format_invalid_parts(parts):
        # Each part is a (code point, rules) pair; rules is None when the
        # code point has no rule information attached.
        return ' '.join(
            "{:04X} ({})".format(
                cp, "not in repertoire" if rules is None else ','.join(rules))
            for cp, rules in parts)

    label_cp = tuple(ord(c) for c in label)
    label_display = ' '.join("{:04X}".format(cp) for cp in label_cp)

    logger.info("- Code points: %s", label_display)

    (eligible, label_parts, label_invalid_parts, disp, action_idx,
     logs) = lgr.test_label_eligible(label_cp)
    logger.info("- Eligible: %s", eligible)
    logger.info("- Disposition: %s", disp)
    action_name, actual_index = describe_action(action_idx)
    logger.info("- Action triggered: %s[%d]", action_name, actual_index)
    logger.info("- Logs: %s", logs)
    write_output("Validation: {} ({}): Result: {}".format(
        label, label_display, "valid" if eligible else "INVALID"), test)

    if not eligible:
        logger.info("- Valid code points from label: %s",
                    ' '.join("{:04X}".format(cp) for cp in label_parts))
        logger.info("- Invalid code points from label: %s",
                    format_invalid_parts(label_invalid_parts))
        return

    write_output("Disposition: {} ({}): Result: {} due to {}[{}]".format(
        label, label_display, disp, action_name, actual_index), test)

    summary, labels = lgr.compute_label_disposition_summary(
        label_cp, include_invalid=invalid)
    logger.info("Summary: %s", summary)
    for (variant_cp, var_disp, variant_invalid_parts, action_idx, disp_set,
         logs) in labels:
        variant_u = cp_to_ulabel(variant_cp)
        variant_display = ' '.join("{:04X}".format(cp) for cp in variant_cp)
        logger.info("\tVariant '%s'", variant_u)
        logger.info("\t- Code points: %s", format_cp(variant_cp))
        logger.info("\t- Disposition: '%s'", var_disp)

        if variant_invalid_parts:
            logger.info("\t- Invalid code points from variant: %s",
                        format_invalid_parts(variant_invalid_parts))

        action_name, actual_index = describe_action(action_idx)
        logger.info("\t- Action triggered: %s[%d]", action_name, actual_index)
        disp_set_display = '{%s}' % ','.join(disp_set)
        write_output("Variant: ({}): [{}] ==> {} due to {}[{}]".format(
            variant_display, disp_set_display, var_disp, action_name,
            actual_index), test)

        logger.info("\t- Logs: %s", logs)
Example #6
0
def parse_label_input(s,
                      idna_decoder=lambda x: x.encode('utf-8').decode('idna'),
                      as_cp=True):
    """
    Parses a label from user input, applying a bit of auto-detection smarts

    :param s: input string in A-label, U-label or space-separated hex sequences.
    :param idna_decoder: IDNA decode function.
    :param as_cp: If True, returns a list of code points. Otherwise, unicode string.
    :return: list of code points if `as_cp` is True, unicode string otherwise
    :raises ValueError: if the input contains spaces and cannot be parsed
                        as a sequence of code points.

    >>> parse_label_input('0061')  # treated as U-label - probably the only confusing result
    [48, 48, 54, 49]
    >>> parse_label_input('U+0061')  # this is how to signal that you want hex
    [97]
    >>> parse_label_input('abc')
    [97, 98, 99]
    >>> parse_label_input('a b c')
    [97, 98, 99]
    >>> parse_label_input('xn--m-0ga')  # "öm"
    [246, 109]
    """
    lowered = s.lower()
    if lowered.startswith('xn--'):
        # A-label: decode it once, then convert if requested
        u_label = idna_decoder(lowered)
        return [ord(c) for c in u_label] if as_cp else u_label

    if ' ' in s or 'U+' in s.upper():
        try:
            label_cp = parse_codepoint_input(s)
        except Exception:
            # Only trap real errors: a bare except would also intercept
            # KeyboardInterrupt and SystemExit.
            if ' ' in s:
                raise ValueError("Label '{}' contains spaces "
                                 "that are not PVALID for IDNA2008".format(s))
            raise
        return label_cp if as_cp else cp_to_ulabel(label_cp)

    # treat as unicode
    return [ord(c) for c in s] if as_cp else s
Example #7
0
def lgr_set_validate_label_task(lgr_json, script_lgr_json, label, email_address, storage_path):
    """
    Run the validation of a label against a LGR set as a tool task.

    The actual processing is delegated to ``_lgr_tool_task``, using
    ``lgr_set_validate_label`` as callback.

    :param lgr_json: The LGRInfo as a JSON object.
    :param script_lgr_json: The LGRInfo for the script used to check label validity as a JSON object.
    :param label: The label to validate, as a list of code points.
    :param email_address: The e-mail address where the results will be sent
    :param storage_path: The place where results will be stored
    """
    lgr_info = LGRInfo.from_dict(lgr_json)
    udata = get_db_by_version(lgr_info.lgr.metadata.unicode_version)
    script_lgr = LGRInfo.from_dict(script_lgr_json).lgr

    # Fall back on an empty set of labels when the LGR info provides none
    set_labels_info = lgr_info.set_labels_info
    if set_labels_info is None:
        set_labels_info = LabelInfo(name='None', labels=[])

    logger.info("Starting task 'validate label' for %s, for input label '%s'",
                lgr_info.name, label)

    # The body stops after "has": presumably the task helper appends the
    # outcome of the processing - TODO confirm against _lgr_tool_task.
    body_template = ("Hi,\nThe processing of label validation for label '{label}'"
                     " in LGR set '{lgr}' with script '{script}' has")
    email_body = body_template.format(label=cp_to_ulabel(label),
                                      lgr=lgr_info.lgr.name,
                                      script=script_lgr.name)

    task_options = {
        'base_filename': 'label_validation_{0}'.format(lgr_info.name),
        'email_subject': 'LGR Toolset label validation result',
        'email_body': email_body,
        'email_address': email_address,
        'cb': lgr_set_validate_label,
        'lgr': lgr_info.lgr,
        'script_lgr': script_lgr,
        'set_labels': set_labels_info.labels,
        'label': label,
        'udata': udata,
    }
    _lgr_tool_task(storage_path, **task_options)
Example #8
0
def collision(lgr, labels_input, show_dump=False, quiet=False):
    """
    Generate the report of collisions in a list of labels for a given LGR.

    This is a generator yielding the report chunk by chunk.

    :param lgr: The LGR object.
    :param labels_input: The file containing the labels
    :param show_dump: Generate a full dump
    :param quiet: Do not print rules
    """
    from lgr.tools.utils import read_labels

    valid_labels = set()
    for label, valid, error in read_labels(labels_input, lgr.unicode_database):
        if not valid:
            # Invalid labels are reported right away and left out
            yield "Label {}: {}\n".format(label, error)
            continue
        valid_labels.add(label)

    # Labels without collision are only kept when a full dump is requested
    label_indexes, not_in_lgr = _generate_indexes(
        lgr, valid_labels, keep=show_dump, quiet=quiet)

    if not_in_lgr:
        yield "\n# Labels not in LGR #\n\n"
        for rejected_cp in not_in_lgr:
            yield "Label {}\n".format(cp_to_ulabel(rejected_cp))

    # Collisions section
    yield "\n# Collisions #\n\n"
    for chunk in _write_complete_output(label_indexes):
        yield chunk

    if not show_dump:
        return

    # Optional summary section
    yield "\n# Summary #\n\n"
    for chunk in _full_dump(label_indexes):
        yield chunk
Example #9
0
def _get_validity(lgr, label_cplist, idna_encoder):
    """
    Test the eligibility of a label and gather its display information.

    :param lgr: The LGR object used to test the label.
    :param label_cplist: The label, as a sequence of code points.
    :param idna_encoder: Function converting a U-label into an A-label.
    :return: A tuple (result dictionary, actions), where the actions are the
             value of ``lgr.effective_actions_xml``.
    """
    label_u = cp_to_ulabel(label_cplist)
    try:
        label_a = idna_encoder(label_u)
    except UnicodeError as e:
        # Report the encoding failure in place of the A-label
        label_a = '!ERROR - {}!'.format(e)

    eligibility = lgr.test_label_eligible(label_cplist)
    (eligible, label_valid_parts, label_invalid_parts, disp, action_idx,
     logs) = eligibility

    invalid_codepoints = {c for c, _ in label_invalid_parts}

    # HTML rendering: invalid code points get wrapped in a highlighting span
    html_parts = []
    for cp in label_cplist:
        cp_html = u"U+{:04X} (&#{};)".format(cp, cp)
        if cp in invalid_codepoints:
            cp_html = u'<span class="text-danger not-in-rep">{}</span>'.format(
                cp_html)
        html_parts.append(cp_html)
    label_display_html = mark_safe(u' '.join(html_parts))

    label_display_text = u' '.join(
        [u"U+{:04X}".format(cp) for cp in label_cplist])

    # Read the actions only once and hand them back to the caller
    lgr_actions = lgr.effective_actions_xml
    action = lgr_actions[action_idx] if action_idx >= 0 else None

    validity = {
        'u_label': label_u,
        'a_label': label_a,
        'cp_display_html': label_display_html,
        'cp_display': label_display_text,
        'eligible': eligible,
        'disposition': disp,
        'label_invalid_parts': label_invalid_parts,
        'action_idx': action_idx,
        'action': action,
        'logs': logs
    }
    return validity, lgr_actions
Example #10
0
 def __unicode__(self):
     """Return the result of ``cp_to_ulabel`` applied to ``self.cp``."""
     u_label = cp_to_ulabel(self.cp)
     return u_label
Example #11
0
def main():
    """
    Command-line entry point: check input labels for collisions.

    The reference labels are read from the file given with ``--set``; the
    labels to check are read from standard input, one per line. A label
    collides with a reference label when both have the same label index in
    the LGR.
    """
    parser = argparse.ArgumentParser(description='LGR Collision')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='be verbose')
    parser.add_argument('-g',
                        '--generate',
                        action='store_true',
                        help='Generate variants')
    parser.add_argument('-l',
                        '--libs',
                        metavar='LIBS',
                        help='ICU libraries',
                        required=True)
    parser.add_argument('-s',
                        '--set',
                        metavar='SET FILE',
                        help='Filepath to the set of reference labels',
                        required=True)
    parser.add_argument('xml', metavar='XML')

    args = parser.parse_args()

    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(stream=sys.stdout, level=log_level)

    lgr_parser = XMLParser(args.xml)

    # --libs is expected to hold three '#'-separated fields
    libpath, i18n_libpath, libver = args.libs.split('#')
    manager = UnicodeDataVersionManager()
    unidb = manager.register(None, libpath, i18n_libpath, libver)

    lgr_parser.unicode_database = unidb

    lgr = lgr_parser.parse_document()
    if lgr is None:
        logger.error("Error while parsing LGR file.")
        logger.error("Please check compliance with RNG.")
        return

    ref_label_indexes = {}

    # Compute index label for set or reference labels
    with io.open(args.set, 'r', encoding='utf-8') as ref_set:
        for ref_label in ref_set:
            label_cp = tuple(ord(c) for c in ref_label.strip())
            try:
                label_index = compute_label_index(lgr, label_cp)
            except NotInLGR:
                # Reference labels that are not in the LGR are ignored
                continue
            ref_label_indexes[label_index] = label_cp

    # Deal with input
    for label in get_stdin().read().splitlines():
        write_output("Check label '%s'" % label)
        label_cp = tuple(ord(c) for c in label)
        label_disp = format_cp(label_cp)
        try:
            label_index = compute_label_index(lgr, label_cp)
        except NotInLGR:
            # Same guard as for the reference labels: one input label that
            # is not in the LGR must not abort the remaining checks.
            write_output("Label '%s' [%s] is not in LGR" %
                         (label, label_disp))
            continue

        if label_index not in ref_label_indexes:
            write_output("No collision for label %s [%s]" %
                         (label, label_disp))
            continue

        ref_label_cp = ref_label_indexes[label_index]
        ref_label_disp = format_cp(ref_label_cp)
        ref_label_u = cp_to_ulabel(ref_label_cp)

        write_output("Collision for label '%s' [%s] with '%s' [%s]" %
                     (label, label_disp, ref_label_u, ref_label_disp))
        if args.generate:
            find_variants_to_block(lgr, ref_label_cp, label_cp)
Example #12
0
    def matches(self,
                label,
                rules_lookup,
                classes_lookup,
                unicode_database,
                anchor=None,
                index=0):
        """
        Tell whether this rule matches a label.

        :param label: Label to test, as a sequence of code points.
        :param rules_lookup: Dictionary of defined rules in the LGR to use
                             for by-ref rules.
        :param classes_lookup: Dictionary of defined classes in the LGR to use
                               for by-ref classes.
        :param unicode_database: The Unicode Database.
        :param anchor: Optional anchor to use for look-around rules.
        :param index: If anchor is used, its index (0-based).
        :return: True if label is matched by the rule, False otherwise.
        :raises RuleError: If the pattern cannot be retrieved or compiled.
        """
        label_display = format_cp(label)
        anchor_display = format_cp(anchor) if anchor else anchor
        rule_logger.debug(
            "Test match on %s for label '%s' with anchor '%s' (%d)", self,
            label_display, anchor_display, index)

        try:
            pattern = self.get_pattern(rules_lookup, classes_lookup,
                                       unicode_database)
        except (re.error, PICUException) as pattern_exc:
            rule_logger.error('Cannot get pattern for rule %s: %s', self,
                              pattern_exc)
            raise RuleError(self.name, pattern_exc)

        if len(pattern) < 1:
            # An empty pattern cannot match anything
            rule_logger.debug('Empty pattern')
            return False

        has_anchor = anchor is not None
        if has_anchor:
            if '%(anchor)s' not in pattern:
                # Not a parameterized context-rule: the index is reset to 0
                rule_logger.debug('Not a parameterized context rule')
                index = 0
            # The anchor may be a sequence: every code point is escaped.
            # Old-style formatting is used, see note in matcher.AnchorMatcher
            escaped_anchor = ''.join(
                '\\x{{{:X}}}'.format(c) for c in anchor)
            pattern = pattern % {'anchor': escaped_anchor}
        rule_logger.debug("Pattern for rule %s: '%s'", self, pattern)

        try:
            regex = unicode_database.compile_regex(pattern)
        except (re.error, PICUException) as compile_exc:
            rule_logger.error('Cannot compile regex: %s', compile_exc)
            raise RuleError(self.name, compile_exc)

        rule_logger.debug("Index: %d", index)

        # The regex works on the U-label form of the label.
        # "search" is required rather than "match", as a rule may not match
        # at the beginning of a label.
        result = regex.search(cp_to_ulabel(label), index=index)
        rule_logger.debug("Result of match: %s", result)
        if result is None:
            return False

        if has_anchor:
            match_index = result.start()
            rule_logger.debug('Match index: %d - Index: %d', match_index,
                              index)
            if match_index > index:
                # With an anchor, a match starting after the index is rejected
                rule_logger.debug('Match found after index, invalid')
                return False
        return True
Example #13
0
def check_label(lgr,
                label,
                generate_variants=False,
                merged_lgr=None,
                set_labels=None):
    """
    Check a label against an LGR and output the result.

    :param lgr: The LGR object used to test the label.
    :param label: The label to check, as a unicode string.
    :param generate_variants: If True, also output the variants of the label.
    :param merged_lgr: Optional LGR used to look for collisions with the
                       set labels.
    :param set_labels: Optional list of the LGR set labels. Collisions are
                       only checked when both `merged_lgr` and `set_labels`
                       are provided.
    """
    from lgr.utils import format_cp
    label_cp = tuple(ord(c) for c in label)

    write_output("\nLabel: %s [%s]" % (label, format_cp(label_cp)))

    (eligible, label_parts, label_invalid_parts, disp, _,
     _) = lgr.test_label_eligible(label_cp)
    write_output("\tEligible: %s" % eligible)
    write_output("\tDisposition: %s" % disp)

    if not eligible:
        write_output("- Valid code points from label: %s" %
                     u' '.join(u"{:04X}".format(cp) for cp in label_parts))
        if label_invalid_parts:
            write_output("- Invalid code points from label: {}".format(
                ' '.join("{:04X} ({})".format(
                    cp,
                    "not in repertoire" if rules is None else ','.join(rules))
                         for cp, rules in label_invalid_parts)))
        return

    if merged_lgr and set_labels:
        write_output("Collisions:")
        if label in set_labels:
            write_output("Labels is in the LGR set labels")
        else:
            indexes = get_collisions(merged_lgr,
                                     set_labels + [label],
                                     quiet=True)
            if len(indexes) > 1:
                # there should be one collision except if set labels are not checked
                logger.error(
                    'More than one collision, please check your LGR set labels'
                )
                return
            elif len(indexes) > 0:
                collisions = indexes[list(indexes.keys())[0]]
                collision = None
                collide_with = []
                # retrieve label in collision list
                for col in collisions:
                    if col['label'] == label:
                        collision = col
                    if col['label'] in set_labels:
                        collide_with.append(col)

                if not collision:
                    # this should not happen except if set labels are not checked
                    logger.error(
                        'Cannot retrieve label in collisions, please check your LGR set labels'
                    )
                    return

                if len(collide_with) != 1:
                    logger.error(
                        'Collision with more than one label in the LGR set labels, '
                        'please check your LGR set labels')
                    return

                write_output("Label collides with LGR set label '%s'" %
                             collide_with[0]['label'])
            else:
                write_output('\tNone')

    if generate_variants:
        write_output("Variants:")
        _, labels = lgr.compute_label_disposition_summary(label_cp)
        for variant in labels:
            # Only the first two fields are needed. Slicing avoids depending
            # on the exact number of fields of a disposition entry (a fixed
            # 5-value unpacking fails on 6-field entries).
            variant_cp, var_disp = variant[:2]
            variant_u = cp_to_ulabel(variant_cp)
            write_output("\tVariant %s [%s]" %
                         (variant_u, format_cp(variant_cp)))
            write_output("\t- Disposition: '%s'" % var_disp)
Example #14
0
def _get_collisions(lgr, label_cplist, set_labels, idna_encoder, lgr_actions):
    """
    Look for a collision between a label and the labels of an LGR set.

    :param lgr: The LGR object used to compute the collisions.
    :param label_cplist: The label, as a sequence of code points.
    :param set_labels: The labels of the LGR set, as an iterable of strings
                       (surrounding whitespace is stripped).
    :param idna_encoder: Function converting a U-label into an A-label.
    :param lgr_actions: Sequence of actions, indexed by action index.
    :return: A dictionary that is empty when there is no collision, contains
             'collisions_error' when the collision cannot be reported, or
             'collision' with the details of the collided set label.
    """
    res = {}
    label_u = cp_to_ulabel(label_cplist)
    set_labels = [l.strip() for l in set_labels]

    # if label is in the LGR set labels skip
    if label_u in set_labels:
        res['collisions_error'] = _('The label is in the LGR set labels.')
        return res

    # check for collisions
    indexes = get_collisions(lgr, set_labels + [label_u], quiet=False)
    if len(indexes) > 1:
        # there should be one collision as set labels are checked, this error should not happen
        res['collisions_error'] = _(
            'ERROR more than one collision, please check your LGR set labels')
        return res

    if len(indexes) == 0:
        return res

    collisions = indexes[list(indexes.keys())[0]]
    collision = None
    collide_with = []
    # retrieve label in collision list
    for col in collisions:
        if col['label'] == label_u:
            collision = col
        if col['label'] in set_labels:
            collide_with.append(col)

    if not collision:
        # this should not happen
        res['collisions_error'] = _(
            'ERROR cannot retrieve label in collisions, please check your LGR set labels'
        )
        return res

    if len(collide_with) != 1:
        # NOTE(review): the two literals concatenate without a space; left
        # untouched here since the string is passed to the translation call.
        res['collisions_error'] = _(
            'ERROR collision with more than one label in the LGR set labels,'
            'please check your LGR set labels')
        return res

    collide_with = collide_with[0]
    # The collided set label is already in U-label form: encoding it here
    # would store an A-label under 'u_label' and encode it a second time below.
    variant_u = collide_with['label']
    # Numeric character references keep raw label characters out of the
    # markup that is flagged as safe.
    variant_display_html = mark_safe(u' '.join(
        u"U+{:04X} (&#{};)".format(cp, cp) for cp in collide_with['cp']))
    variant_display = u' '.join(u"U+{:04X}".format(cp)
                                for cp in collide_with['cp'])
    try:
        variant_a = idna_encoder(variant_u)
    except UnicodeError as e:
        variant_a = '!ERROR - {}!'.format(e)

    # XXX Collided variants info may be retrieved in script LGR rather than in merged LGR
    action_idx = collision['action_idx'][collide_with['label']]
    res['collision'] = {
        'input': collide_with['label'],
        'u_label': variant_u,
        'a_label': variant_a,
        'cp_display_html': variant_display_html,
        'cp_display': variant_display,
        'disposition': collision['disp'][collide_with['label']],
        'action_idx': action_idx,
        # A negative index means that no action is associated
        'action': lgr_actions[action_idx] if action_idx >= 0 else None,
        'rules': collision['rules'][collide_with['label']]
    }

    return res
Example #15
0
def diff(lgr_1,
         lgr_2,
         labels_input,
         show_collision=True,
         show_dump=False,
         quiet=False):
    """
    Generate the comparison of a list of labels between 2 LGR.

    This is a generator yielding the report chunk by chunk.

    :param lgr_1: The first LGR info object.
    :param lgr_2: The second LGR info object.
    :param labels_input: The file containing the labels
    :param show_collision: Output collisions
    :param show_dump: Generate a full dump
    :param quiet: Do not print rules
    """
    from lgr.tools.utils import read_labels

    labels = set()
    for label, valid, error in read_labels(labels_input,
                                           lgr_1.unicode_database):
        if not valid:
            yield "Label {}: {}\n".format(label, error)
            continue
        labels.add(label)

    # Indexes of the labels and their variants for each LGR; labels without
    # collision are kept since they are needed for the comparison
    label1_indexes, not_in_lgr_1 = _generate_indexes(
        lgr_1, labels, keep=True, quiet=quiet)
    label2_indexes, not_in_lgr_2 = _generate_indexes(
        lgr_2, labels, keep=True, quiet=quiet)

    if not_in_lgr_1 or not_in_lgr_2:
        # Both sections are written as soon as one of them has content
        for number, rejected in enumerate([not_in_lgr_1, not_in_lgr_2], 1):
            yield "# Labels not in LGR {} #\n\n".format(number)
            for rejected_cp in rejected:
                yield "Label {}\n".format(cp_to_ulabel(rejected_cp))
            yield '\n'

    # Map each label to its pair of indexes (first LGR, second LGR)
    labels_dic = {}
    yield "\n# LGR comparison #\n"
    for label in labels:
        label_cp = tuple(ord(c) for c in label)
        computed = []
        for current_lgr in (lgr_1, lgr_2):
            try:
                computed.append(current_lgr.generate_index_label(label_cp))
            except NotInLGR:
                yield "Label {} not in LGR {}\n".format(label, current_lgr)
                break
        else:
            # Only reached when the label has an index in both LGR
            labels_dic[label] = tuple(computed)

    for chunk in _compare(labels_dic, label1_indexes, label2_indexes):
        yield chunk

    if not show_collision:
        return

    # Collisions (and optional summary) for each LGR
    sections = (
        ("\n\n# Collisions for LGR1 #\n", "\n# Summary for LGR1 #\n",
         label1_indexes),
        ("\n\n# Collisions for LGR2 #\n", "\n# Summary for LGR2 #\n\n",
         label2_indexes),
    )
    for collisions_header, summary_header, indexes in sections:
        yield collisions_header
        for chunk in _write_complete_output(indexes):
            yield chunk
        if show_dump:
            yield summary_header
            for chunk in _full_dump(indexes):
                yield chunk
Example #16
0
def _generate_indexes(lgr, labels, keep=False, quiet=False):
    """
    Group the provided labels, and their variants, by label index.

    :param lgr: The current LGR
    :param labels: The list of labels, as a list of U-Labels.
    :param keep: Do we keep labels without collision in the output
    :param quiet: If True, do not collect rule log.

    :return: (label_indexes, not_in_lgr), with:
              - label_indexes: the dictionary containing the primary labels
                               and their variants (with various information) for each index.
              - not_in_lgr: List of labels (as code point tuples) for which
                            the index generation raised NotInLGR.
    """
    label_indexes = {}
    not_in_lgr = []

    # First pass: register every label as a primary under its index
    for label in labels:
        label_cp = tuple(ord(c) for c in label)
        try:
            label_index = lgr.generate_index_label(label_cp)
        except NotInLGR:
            not_in_lgr.append(label_cp)
            continue

        label_cp_out = format_cp(label_cp)
        primary_entry = {
            'label': label,
            'bidi': "%s'%s'%s" % (LRI, label, PDI),
            'cat': PRIMARY,
            'cp': label_cp,
            'cp_out': label_cp_out,
            'disp': {label: '-'},
            'rules': {label: '-'},
            'action_idx': {label: '-'},
        }
        label_indexes.setdefault(label_index, []).append(primary_entry)

    # Second pass: add the variants. The loop runs on a deep copy since
    # entries are deleted from / appended to label_indexes along the way.
    for label_index, primaries in deepcopy(label_indexes).items():
        if not keep and len(primaries) < 2:
            # Without `keep`, indexes holding a single label are dropped
            del label_indexes[label_index]
            continue

        entries = label_indexes[label_index]
        for primary in primaries:
            label_cp = primary['cp']
            label = primary['label']
            dispositions = lgr.compute_label_disposition(
                label_cp, include_invalid=True, collect_log=not quiet)
            for (variant_cp, variant_disp, _, action_idx, _,
                 log) in dispositions:
                variant = cp_to_ulabel(variant_cp)
                stripped_log = log.strip()
                rule_log = '' if quiet else stripped_log
                variant_cp_out = format_cp(variant_cp)

                # Look for an entry already describing this variant
                known = [entry for entry in entries
                         if entry['label'] == variant]
                if known:
                    # Update the existing entry for the current primary
                    assert len(known) == 1
                    known_entry = known[0]
                    known_entry['disp'][label] = variant_disp
                    known_entry['rules'][label] = rule_log
                    known_entry['action_idx'][label] = action_idx
                else:
                    entries.append({
                        'label': variant,
                        'bidi': "%s'%s'%s" % (LRI, variant, PDI),
                        'cat': VARIANT,
                        'cp': variant_cp,
                        'cp_out': variant_cp_out,
                        'disp': {label: variant_disp},
                        'rules': {label: rule_log},
                        'action_idx': {label: action_idx},
                    })

    return label_indexes, not_in_lgr