Example #1
def find_rules_in_flr_bit(mode, bit):
    # Parse one separator-delimited chunk of an FLR/SLR document and return
    # any rule dicts found in it, warning when the chunk's rule count looks off.
    found = list(find_rules(mode, bit))
    w = None
    ret = []
    if regex.match(b'\n*Rule [0-9]', bit):
        ret.extend(found)
        if len(found) > 1:
            if b'Rule 2389/0' not in bit:
                w = 'got multiple rules in FLR bit'
        elif len(found) == 0:
            if b'Rule 2386/0' not in bit:
                w = 'got no rules in FLR bit'
    else:
        if len(found) > 0:
            w = 'got rule in weird FLR bit'
    if w is not None:
        with warnx():
            print('full: {{{')
            print(util.highlight_spaces(decode(bit)))
            print('}}}')
            print('^-', w)
    return ret
Example #2
def find_stdformat_rules(text, expect_history=False):
    m = regex.search(b'Rule ([0-9]+)', text)
    early_rulenum = None
    if m:
        early_rulenum = m.group(1)
        action = FIXUPS_FOR_RULENUM.get(early_rulenum)
        if action:
            text = action(text)
    # fix CFJ annotations missing brackets altogether
    text = regex.sub(b'\n(CFJ [0-9]+[^\n]*:.*?)(?=\n\n)',
                     br'\n[\1]',
                     text,
                     flags=regex.S)
    # --
    for m in fsfr_regex.finditer(text):
        #print('yaeh', text[m.end():m.end()+100])
        g = m.groupdict()
        full = g['full']
        history = None
        if g['history'] is not None:
            thehist = decode(g['thehist'])
            thehist = regex.sub(
                'The following section is not a portion of the report:.*',
                '',
                thehist,
                flags=regex.S)  # lol, old scam
            history = list(split_history(thehist))
        if expect_history and not history:
            if early_rulenum not in {b'2385', b'2119', b'2001'}:
                with warnx():
                    print(repr(g))
                    print('full: {{{')
                    print(util.highlight_spaces(decode(text)))
                    print('}}}')
                    print('^- no history in this FLR entry')
        rtext = g['text'] or b''
        inumber = int(g['number'])
        extraheader = g['extraheader'] or b''
        if extraheader:
            _extratitle, rtext, extraheader = fix_oldformat_header(
                inumber, rtext, extraheader.rstrip())

        data = {
            'number': inumber,
            'revnum': decode(g['revnum']) if g['revnum'] else None,
            'title': decode(g['title']) if g['title'] else None,
            'header': decode(g['header']),
            'extra': decode(extraheader) if extraheader else None,
            'text': decode(rtext),
            'annotations': decode(g['annotations']) or None,
            'history': history,
        }
        yield data
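
A minimal stand-alone sketch (not part of the project code) of what the bracket fix-up above does to a loose CFJ annotation; the sample ruleset text is invented for illustration and only the third-party regex module is assumed:

import regex

sample = (b'Rule 101/0 (Power=3)\nObey the Rules\n\n'
          b'CFJ 1234: Rules are binding.\n\nHistory:\n')
# Wrap a bare "CFJ nnn: ..." block in square brackets, matching lazily
# up to the next blank line (regex.S lets '.' cross newlines).
fixed = regex.sub(b'\n(CFJ [0-9]+[^\n]*:.*?)(?=\n\n)',
                  br'\n[\1]',
                  sample,
                  flags=regex.S)
assert b'[CFJ 1234: Rules are binding.]' in fixed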
Example #3
 def detect_renumberings():
     latents_by_reduced_sig = defaultdict(list)
     for latent in all_latents:
         if latent.sig is None or isinstance(
                 latent.sig, str) or latent.sig == ('0', 'create'):
             continue
         rsig = latent.sig[1:]
         if rsig[-1] == 'cleaning':
             continue
         latents_by_reduced_sig[rsig].append(latent)
     for rsig, latents in latents_by_reduced_sig.items():
         if len(latents) <= 1: continue
         # potential renumbering
         # are they all in different entries?
         entries = [an._entry for latent in latents for an in latent.ans]
         if len(entries) == len(set(entries)):
             # yes, so... find the most recent one
             # [(date, latent)]
             last_seen = sorted((max(an._entry.date_lower_bound()
                                     for an in latent.ans), latent)
                                for latent in latents)
             # is it strictly more recent?
             if last_seen[-1][0] > last_seen[-2][0]:
                 winnerdate, winner = last_seen[-1]
                 for loserdate, loser in last_seen[:-1]:
                     for an in loser.ans:
                         an._entry.no_link = True
                     loser.merge_into(winner)
                 #print('xxx', some_number)
                 continue
             else:
                 why_not = "can't find newest"
         else:
             # this may not actually be a problem
             #why_not = "duplicates were seen in one entry"
             continue
         with warnx():
             print(
                 "in rule %s, couldn't autofix revision renumbering because %s:"
                 % (
                     rule.numbers,
                     why_not,
                 ))
             for latent in latents:
                 print('--')
                 latent.print_sig_and_texts()
             print('***')
Example #4
def add_guessed_numbers(entry):
    data = entry.data
    # propagate cur_num and revnum forward
    cur_num = None
    revnum = None
    for an in entry.ans:
        if an.is_indeterminate:
            cur_num = None
            revnum = None
            an._guessed_num = None
            an._guessed_revnum = None
            continue
        if an.cur_num is not None:
            cur_num = an.cur_num
        an._guessed_num = cur_num
        if an.revnum is not None:
            revnum = an.revnum
        an._guessed_revnum = revnum
    # propagate prev_num backward
    prev_num = data['number']
    ans = entry.ans
    for i in range(len(ans) - 1, -1, -1):
        an = ans[i]
        if an.is_indeterminate:
            prev_num = None
            continue
        if an._guessed_num is None:
            an._guessed_num = prev_num
        elif (prev_num is not None and an._guessed_num != prev_num
              and not (an._guessed_num == 155 and prev_num == 115)):
            with warnx():
                print('in %s:' % (entry.meta['path'], ))
                print(
                    'disagreement about rule number (on [%d]: going backwards: %r; going forwards: %r) for annotation set:'
                    % (i, prev_num, an._guessed_num))
                for j, an2 in enumerate(entry.ans):
                    print('[%d] %r' % (j, an2))
        if an.num_changed:
            prev_num = an.prev_num
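
The two passes above first carry known numbers forward, then fill remaining gaps backward (warning on disagreement). The core pattern, stripped of the annotation objects, looks like this stand-alone sketch with invented data:

def fill_forward_backward(values):
    # Forward pass: carry the last known value into later gaps.
    out = list(values)
    last = None
    for i, v in enumerate(out):
        if v is None:
            out[i] = last
        else:
            last = v
    # Backward pass: fill any remaining (leading) gaps from the next known value.
    nxt = None
    for i in range(len(out) - 1, -1, -1):
        if out[i] is None:
            out[i] = nxt
        else:
            nxt = out[i]
    return out

assert fill_forward_backward([None, 7, None, None, 9, None]) == [7, 7, 7, 7, 9, 9]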
Example #5
 def break_cycles():
     # break cycles - turns out it doesn't actually happen, but I had to
     # write this code to figure that out, so may as well keep it
     for latent in all_latents:
         latent.nextlist = list(latent.nexts)
     for latent0 in all_latents:
         stack = [[latent0, 0]]
         latent0.stackidx = 0
         while stack:
             latent, nexti = stack[-1]
             if nexti == len(latent.nextlist):
                 latent.stackidx = None
                 stack.pop()
                 continue
             assert latent in all_latents  # XXX
             next = latent.nextlist[nexti]
             stack[-1][1] = nexti + 1
             if next.seen:
                 if next.stackidx is not None:
                     # got a cycle
                     with warnx():
                         print('Got cycle in annotation ordering:')
                         for xlatent, _ in stack[next.stackidx:]:
                             print('--')
                             xlatent.print_sig_and_texts()
                         print('***')
                     # arbitrarily choose the last link to break since it's easier - TODO if there are real cycles, do it better
                     latent.nextlist.remove(next)
                     latent.nexts.remove(next)
                     next.prevs.remove(latent)
                     stack[-1][1] -= 1
                     stack[next.stackidx][1] -= 1
             else:
                 next.seen = True
                 next.stackidx = len(stack)
                 stack.append([next, 0])
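
For reference, the same explicit-stack depth-first search can be written against a plain adjacency dict. This is a self-contained sketch (hypothetical graph data, not the project's latent objects) of detecting a cycle the way break_cycles does, by checking whether a successor is still on the current DFS stack:

def find_cycle(graph):
    # graph: dict mapping node -> list of successor nodes.
    seen = set()
    on_stack = {}  # node -> index of its frame on the current DFS stack
    for start in graph:
        if start in seen:
            continue
        seen.add(start)
        stack = [[start, 0]]
        on_stack[start] = 0
        while stack:
            node, nexti = stack[-1]
            succs = graph.get(node, [])
            if nexti == len(succs):
                del on_stack[node]
                stack.pop()
                continue
            stack[-1][1] = nexti + 1
            succ = succs[nexti]
            if succ in on_stack:
                # Found a cycle: everything from succ's frame onward closes a loop.
                return [frame[0] for frame in stack[on_stack[succ]:]]
            if succ not in seen:
                seen.add(succ)
                on_stack[succ] = len(stack)
                stack.append([succ, 0])
    return None

assert find_cycle({'a': ['b'], 'b': ['c'], 'c': ['a']}) == ['a', 'b', 'c']
assert find_cycle({'a': ['b'], 'b': []}) is None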
Example #6
def walk_doc(metadata, text):
    new_metadata = metadata.copy()
    if 'rcslog' in new_metadata:
        del new_metadata['rcslog']
        del new_metadata['rcsauthor']
    assert isinstance(text, bytes)
    #print(metadata['path'])
    m = regex.match(b'(.{,2048}\n)?THE (FULL |SHORT |)LOGICAL RULESET\n\n',
                    text, regex.S)
    if m:
        # this is a ruleset!
        lr_start = m.end()
        n = regex.search(b'\nEND OF THE [^ ]* LOGICAL RULESET',
                         text,
                         pos=lr_start)
        if n:
            lr_end = n.end()
        else:
            lr_end = len(text)
        ruleset_bits = regex.split(
            b'\n------------------------------+|====================+\n',
            text[lr_start:lr_end])
        have_rulenums = []
        mode = find_rules_mode_for_path(metadata['path'])
        for datas in map(partial(find_rules_in_flr_bit, mode), ruleset_bits):
            for data in datas:
                if data['number'] is not None:
                    have_rulenums.append(data['number'])
                yield {'meta': new_metadata, 'data': data}
        # explicit repeal annotations in RCS?
        if 'rcslog' in metadata and metadata['rcsauthor'] == 'comex':
            # split by semicolon, but not semicolons in parens
            logs = regex.findall(br';\s*((?:\([^\)]*\)|[^;\(]+)*)',
                                 metadata['rcslog'])
            for log in logs:
                log = log.strip()
                if log in {
                        b'formatting', b'update xrefs',
                        b'lots of formatting fixes'
                }:
                    continue  # old stuff I put in
                n = regex.match(br'Rule ([0-9]+) (?:\([^\)]*\) )?repealed', log)
                if not n:
                    raise Exception('unknown RCS annotation %r' % log)
                number = int(n.group(1))
                yield {
                    'meta': new_metadata,
                    'data': {
                        'number': number,
                        'revnum': None,
                        'title': None,
                        'header': None,
                        'extra': None,
                        'text': None,
                        'annotations': None,
                        'history': [decode(log)],
                    }
                }
        # repeals?
        yield {
            'meta': new_metadata,
            'data': {
                'no_rules_except': have_rulenums
            }
        }

        # handle any remaining data
        rest = text[lr_end:].lstrip()
        if rest:
            yield from walk_doc(metadata, rest)
        return
    elif b'THE RULES OF INTERNOMIC' in text:
        # this is ... a fake ruleset!
        return

    else:  # not a ruleset
        if 'rcslog' in metadata and 'current_flr.txt,v' in metadata['path']:
            with warnx():
                print(repr(text))
                print("this should be a flr but doesn't match")
        for data in find_rules(find_rules_mode_for_path(metadata['path']),
                               text):
            yield {'meta': new_metadata, 'data': data}
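
The RCS-log handling above splits a log message on semicolons while leaving semicolons inside parentheses alone. A small stand-alone check (invented log text) of how that findall pattern behaves; note it only captures text that follows a semicolon:

import regex

log = b'; Rule 1741 (see minutes; disputed) repealed; formatting'
parts = [p.strip() for p in
         regex.findall(br';\s*((?:\([^\)]*\)|[^;\(]+)*)', log)]
assert parts == [b'Rule 1741 (see minutes; disputed) repealed', b'formatting']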
Example #7
    def handle_revnum_clashes(full):
        latents_by_revnum = defaultdict(list)
        for latent in latent_by_sig.values():
            if latent.dead: continue
            try:
                revnum = next(
                    an.revnum for an in latent.ans
                    if not an._entry.no_link and an.revnum is not None)
            except StopIteration:
                revnum = next(an.revnum for an in latent.ans)
            if revnum is not None:
                latents_by_revnum[revnum].append(latent)
        for latent in list(all_latents):
            if latent.sig is None:
                an = next(iter(latent.ans))
                if an.revnum is not None:
                    proper = latents_by_revnum[an.revnum]
                    if len(proper) == 1:
                        latent.merge_into(proper[0])
                        continue
                    if len(proper) == 0:
                        # eh... I guess there's nothing for it
                        proper.append(latent)
                        continue
                    # ...can we find a text match?
                    text = an._entry.normalized_text
                    text_matches = [
                        olatent for olatent in proper
                        if any(oan._entry.normalized_text == text
                               for oan in olatent.ans)
                    ]
                    if len(text_matches) == 1:
                        latent.merge_into(text_matches[0])
                        continue
                    # ...can we find a text match with some other revnum?
                    oans = (oan for olatent in latent_by_sig.values()
                            if not olatent.dead for oan in olatent.ans
                            if oan._entry.normalized_text == text)
                    try:
                        oan = next(oans)
                    except StopIteration:
                        pass
                    else:
                        #print('other text match, so ignore')
                        #print('matched:', oan)
                        latent.delete()
                        continue
                    # ...
                    with warnx():
                        print(
                            "in rule %d: couldn't find something to merge this into:"
                            % (some_number, ))
                        print(an)
                        print(an._entry)
                        print('possibilities: (%d)' % (len(proper), ))
                        for olatent in proper:
                            print(olatent)
                        print('--')

        if not full:
            return
        for revnum, latents in latents_by_revnum.items():
            if len(latents) <= 1:
                continue
            dispositions = [
                REVNUM_FIXES.get((some_number, latent.sig))
                for latent in latents
            ]
            nones = [
                latent for (disposition, latent) in zip(dispositions, latents)
                if disposition is None
            ]
            if len(nones) > 1:
                # are they all seen in some single entry? then keep all
                if functools.reduce(set.intersection,
                                    (latent.ans for latent in nones)):
                    continue
                # is all but one from nolink?
                with warnx():
                    print('Duplicate revnums for rule %s:' % (rule.numbers, ))
                    for latent in latents:
                        print('--')
                        latent.print_sig_and_texts()
                    print('***')
                continue

            for disposition, latent in zip(dispositions, latents):
                if disposition == 'merge':
                    latent.merge_into(nones[0])
                elif disposition == 'kill':
                    latent.delete()
                elif disposition == 'allow':
                    pass
                elif disposition is None:
                    pass
                else:
                    raise Exception('? %r %r' % (disposition, latent))
Example #8
def do_stragglers(rules, unowned_entries):
    rules_by_text = defaultdict(set)
    rules_by_number = defaultdict(set)
    for rule in rules:
        for entry in rule.entries:
            rules_by_text[entry.normalized_text].add(rule)
        for number in rule.numbers:
            rules_by_number[number].add(rule)
    unowned_by_number_and_text = defaultdict(lambda: defaultdict(set))
    for entry in unowned_entries:
        unowned_by_number_and_text[entry.data['number']][
            entry.normalized_text].add(entry)
    for number, by_text in unowned_by_number_and_text.items():
        nrules = rules_by_number[number]
        if len(nrules) == 0 or number in {1741}:
            # no history for this rule at all; just assume they're all one rule
            # rule 1741: the unanchored entry is a different rule from the anchored one
            new_rule = Rule()
            new_rule.numbers.add(number)
            for entries in by_text.values():
                new_rule.entries.extend(entries)
            rules.add(new_rule)
            continue
        for normalized_text, entries in by_text.items():
            trules = nrules
            if len(trules) > 1:
                trules = trules.intersection(rules_by_text[normalized_text])
            for entry in entries:
                drules = trules
                edate = entry.meta.get('date')
                if edate is not None and number not in {430}:
                    # rule 430: zefram_rules_text says "ca. Sep. 13 1993",
                    # but it was published on Sep. 8
                    edate = util.datetime_from_timestamp(edate).date()
                    drules = [
                        rule for rule in trules
                        if not rule.definitely_created_after(edate)
                    ]
                drules = list(drules)
                if len(drules) == 1:
                    rule = next(iter(drules))
                    rule.entries.append(entry)
                else:
                    with warnx():
                        print('could not match entry (and copies) to rule:')
                        print(next(iter(entries)))
                        print('date:', entry.date())
                        for i, rule in enumerate(drules):
                            print('***** candidate %d/%d:' %
                                  (i + 1, len(drules)))
                            print(rule)
                            for oentry in rule.entries:
                                print('--')
                                print(oentry)
                                #for an in oentry.ans: print(an)
                        if not drules:
                            print(
                                '***** no candidates! (%d by number alone, but enacted too late)'
                                % (len(nrules), ))
                        print('====')
                    break
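
The two-level grouping used above (rule number, then normalized text) is just a defaultdict of defaultdicts. A tiny stand-alone illustration with made-up records:

from collections import defaultdict

by_number_and_text = defaultdict(lambda: defaultdict(set))
records = [(104, 'obey the rules', 'entry-a'),
           (104, 'obey the rules', 'entry-b'),
           (105, 'other text', 'entry-c')]
for number, text, entry_id in records:
    by_number_and_text[number][text].add(entry_id)

assert by_number_and_text[104]['obey the rules'] == {'entry-a', 'entry-b'}
assert list(by_number_and_text[105]) == ['other text']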
Example #9
def split_into_rules_with_number_timeline(rule_entries):
    by_num_and_numbered_date = defaultdict(lambda: defaultdict(list))
    for entry in rule_entries:
        entry.anchors = []
        for an in entry.ans:
            if (an.is_create or an.num_changed
                ) and an.date is not None and an._guessed_num is not None:
                anchor = (an._guessed_num, an.proposal_num or an.date)
                anchor = ANCHOR_OVERRIDES.get(anchor, anchor)
                by_num_and_numbered_date[an._guessed_num][an.date].append(
                    (entry, anchor))
                entry.anchors.append(anchor)
    timeline_by_num = defaultdict(list)
    for number, by_numbered_date in by_num_and_numbered_date.items():
        timeline_by_num[number] = sorted(by_numbered_date.keys())
    for entry in rule_entries:
        already_anchored = False
        prev_gn = None
        for an in entry.ans:
            if an.num_changed or an.is_indeterminate or an._guessed_num != prev_gn:
                already_anchored = False
            prev_gn = an._guessed_num
            if (an.is_create or an.num_changed) and an.date is not None:
                already_anchored = True
            if already_anchored:
                continue
            if an._guessed_num is not None and an.date is not None:
                number = an._guessed_num
                timeline = timeline_by_num[number]
                if len(timeline) == 0:
                    # eh, assume there weren't multiple copies of this rule
                    entry.anchors.append((number, None))
                    already_anchored = True
                    continue
                i = bisect.bisect_right(timeline, an.date)
                if i == 0 and number not in {1741}:
                    with warnx():
                        print(
                            'Annotation comes before all anchors for rule %d: %s'
                            % (number, an))
                        print('from %s' % entry.meta['path'])
                        print('earliest is at %s: %s' %
                              (timeline[0],
                               by_num_and_numbered_date[number][timeline[0]]))
                        for an2 in entry.ans:
                            print(an2)
                    continue
                if i == len(timeline):
                    i -= 1  # assume the rule stayed the same?
                date = timeline[i]
                oentry, anchor = by_num_and_numbered_date[
                    an._guessed_num][date][-1]
                entry.anchors.append(anchor)
                already_anchored = True

    def print_timeline(num):
        print('timeline for %s:' % (num, ))
        for date in timeline_by_num[num]:
            print('- %s' % (date, ))
            for entry, _anchor in by_num_and_numbered_date[num][date]:
                print(entry)
                for an in entry.ans:
                    print(an)

    #print_timeline(1051); die
    anchor_to_rule = {}
    all_rules = set()
    rule_id = 0
    # unify multiple anchors in the same entry
    for entry in rule_entries:
        first_rule = None
        for i, anchor in enumerate(entry.anchors):
            rule = anchor_to_rule.get(anchor)
            if rule is None:
                rule = Rule()
                rule.anchors.append(anchor)
                all_rules.add(rule)
                anchor_to_rule[anchor] = rule
            if i == 0:
                first_rule = rule
            else:
                if rule is not first_rule:
                    for anchor in rule.anchors:
                        anchor_to_rule[anchor] = first_rule
                    all_rules.remove(rule)
                    first_rule.anchors.extend(rule.anchors)
    unowned_entries = []
    for entry in rule_entries:
        if entry.anchors:
            rule = anchor_to_rule[entry.anchors[0]]
            rule.entries.append(entry)
            for an in entry.ans:
                if an._guessed_num is not None:
                    rule.numbers.add(an._guessed_num)
            assert_(entry.data['number'] in rule.numbers)
        else:
            unowned_entries.append(entry)
    return all_rules, unowned_entries
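
The timeline lookup above uses bisect.bisect_right, which returns how many anchor dates are less than or equal to the annotation's date; an index of 0 is the "comes before all anchors" warning case. A stand-alone sketch with invented dates:

import bisect
import datetime

timeline = [datetime.date(1993, 7, 1),
            datetime.date(1994, 1, 15),
            datetime.date(1995, 6, 30)]

assert bisect.bisect_right(timeline, datetime.date(1993, 1, 1)) == 0
assert bisect.bisect_right(timeline, datetime.date(1994, 1, 15)) == 2
assert bisect.bisect_right(timeline, datetime.date(1999, 1, 1)) == 3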
Example #10
def identify_same(entries):
    def add_with_revnum(entry):
        number = entry.data['number']
        revnum = entry.data['revnum']
        existings = by_number_and_revnum[number][revnum]
        for existing in existings:
            # lol, performance
            if text_match(existing, entry):
                existing.variants.append(entry)
                break
        else:
            entry.variants = [entry]
            existings.append(entry)

    by_number_and_revnum = defaultdict(lambda: defaultdict(list))
    for entry in entries:
        add_with_revnum(entry)
    # deal with revnum=None entries
    for number, by_revnum in by_number_and_revnum.items():
        nones = by_revnum[None]
        del by_revnum[None]
        unmatched_nones = []
        for entry in nones:
            for existing in (existing for xentries in by_revnum.values()
                             for existing in xentries):
                if text_match(existing, entry):
                    existing.variants.append(entry)
                    break
            else:
                unmatched_nones.append(entry)
        still_unmatched_nones = list(unmatched_nones)
        if not still_unmatched_nones:
            continue
        fudge_revnums = None
        if len(still_unmatched_nones) == 1 and len(by_revnum) == 0:
            fudge_revnums = ['0']
        elif number in FUDGE_REVNUMS:
            fudge_revnums = FUDGE_REVNUMS[number]
        if fudge_revnums is not None:
            if len(fudge_revnums) != len(still_unmatched_nones):
                print('...bad fudge_revnums length!')
            else:
                for fudge_revnum, entry in zip(fudge_revnums,
                                               still_unmatched_nones):
                    entry.data['revnum'] = fudge_revnum
                    add_with_revnum(entry)
                continue
        with warnx():
            print('Orphan texts for rule %d:' % (number, ))
            for entry in still_unmatched_nones:
                print(entry.data['text'])
                print('variants:%d meta:%s' % (
                    len(entry.variants),
                    entry.meta,
                ))
                print('last annotation:%s' % (entry.ans[-1], ))
                print('==')
            print('Here are all the numbered texts I have for that rule:')
            have_any = False
            for revnum, xentries in sorted(by_revnum.items(),
                                           key=lambda tup: revnum_key(tup[0])):
                if revnum is None:
                    continue
                print('--')
                for existing in xentries:
                    print('revnum: %s  header: %s' %
                          (existing.data['revnum'], existing.data['header']))
                    print(existing.data['text'])
                    print('variants:%d meta:%s' % (
                        len(existing.variants),
                        existing.meta,
                    ))
                    have_any = True
            if not have_any:
                print('(none)')

    for by_revnum in by_number_and_revnum.values():
        for revnum, xentries in by_revnum.items():
            if revnum is None:
                continue
            for i, entry in enumerate(xentries):
                best = max(entry.variants, key=quality)
                if best is not entry:
                    best.variants = entry.variants
                    del entry.variants
                xentries[i] = best
    return [
        existing for by_revnum in by_number_and_revnum.values()
        for xentries in by_revnum.values() for existing in xentries
    ]
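
add_with_revnum above relies on Python's for/else: the else branch runs only when the loop finished without a break, i.e. when no existing entry's text matched. A stand-alone sketch of the same idiom with a toy matcher and invented data:

def add_or_merge(groups, item, same):
    # Append item to the first matching group; otherwise start a new group.
    for group in groups:
        if same(group[0], item):
            group.append(item)
            break
    else:
        groups.append([item])

groups = []
for word in ['Rule', 'rule', 'Ruleset']:
    add_or_merge(groups, word, lambda a, b: a.lower() == b.lower())
assert groups == [['Rule', 'rule'], ['Ruleset']]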