def parsed_title(text, appendix_letter):
    """Return the first appendix-title match found in `text`, or None.

    Two alternatives are tried, each anchored with `LineStart()`:

    * `Marker(appendix_letter)`, a suppressed '-', then `grammar.a1` (with
      `leaveWhitespace()` applied to a copy), optionally followed by
      `grammar.markerless_upper`, one of `grammar.paren_upper` /
      `grammar.paren_lower`, and `grammar.paren_digit`
    * `Marker("part")` followed by `grammar.aI`

    Only the parse results of the first hit are returned; its start/end
    offsets are discarded.
    """
    # Grow the expression one operand at a time (always `expr + next`) so
    # the resulting grammar keeps the shape of a single left-to-right chain
    lettered_title = Marker(appendix_letter) + Suppress('-')
    lettered_title = lettered_title + grammar.a1.copy().leaveWhitespace()
    lettered_title = lettered_title + Optional(grammar.markerless_upper)
    lettered_title = lettered_title + Optional(
        grammar.paren_upper | grammar.paren_lower)
    lettered_title = lettered_title + Optional(grammar.paren_digit)

    roman_part_title = Marker("part") + grammar.aI

    title_grammar = QuickSearchable(
        LineStart() + (lettered_title | roman_part_title))

    first_hit = next(iter(title_grammar.scanString(text)), None)
    if first_hit is None:
        return None
    tokens, _, _ = first_hit
    return tokens
def parsed_title(text, appendix_letter):
    """Scan `text` for an appendix title at the start of a line.

    A title is either `Marker(appendix_letter)` + '-' + `grammar.a1` with
    optional trailing markers (`grammar.markerless_upper`, then
    `grammar.paren_upper` or `grammar.paren_lower`, then
    `grammar.paren_digit`), or `Marker("part")` followed by `grammar.aI`.

    Returns the parse results of the first match; None if there is no match.
    """
    lettered = (
        Marker(appendix_letter)
        + Suppress('-')
        # Applied to a copy, so the shared grammar.a1 is left untouched
        + grammar.a1.copy().leaveWhitespace()
        + Optional(grammar.markerless_upper)
        + Optional(grammar.paren_upper | grammar.paren_lower)
        + Optional(grammar.paren_digit)
    )
    roman_part = Marker("part") + grammar.aI
    anchored = QuickSearchable(LineStart() + (lettered | roman_part))

    for hit in anchored.scanString(text):
        # Each hit is (tokens, start, end); only the tokens are wanted
        tokens, _start, _end = hit
        return tokens
    return None
class StatutesFinder(FDSYSFinder, FinderBase):
    """Statutes at Large citations, shaped like `<volume> Stat. <page>`."""
    CITE_TYPE = 'STATUTES_AT_LARGE'
    # Digits named "volume", a suppressed "Stat." literal, digits named "page"
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("volume")
        + Suppress("Stat.")
        + Word(string.digits).setResultsName("page")
    )
    # Fixed parameters; presumably consumed by FDSYSFinder (not visible here)
    CONST_PARAMS = {'collection': 'statute'}
class PublicLawFinder(FDSYSFinder, FinderBase):
    """Public Law citations: the markers "Public" and "Law" followed by
    `<congress>-<lawnum>`."""
    CITE_TYPE = 'PUBLIC_LAW'
    # The hyphen is suppressed; only the two digit runs are named results
    GRAMMAR = QuickSearchable(
        Marker("Public")
        + Marker("Law")
        + Word(string.digits).setResultsName("congress")
        + Suppress("-")
        + Word(string.digits).setResultsName("lawnum")
    )
    # Fixed parameters; presumably consumed by FDSYSFinder (not visible here)
    CONST_PARAMS = {'collection': 'plaw', 'lawtype': 'public'}
class USCFinder(FDSYSFinder, FinderBase):
    """U.S. Code citations, shaped like `<title> U.S.C. [Chapter] <section>`."""
    CITE_TYPE = 'USC'
    # "U.S.C." is a plain (unsuppressed) literal; the optional "Chapter" word
    # is suppressed. Title and section are the named digit runs.
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("title")
        + "U.S.C."
        + Suppress(Optional("Chapter"))
        + Word(string.digits).setResultsName("section")
    )
    # Fixed parameters; presumably consumed by FDSYSFinder (not visible here)
    CONST_PARAMS = {'collection': 'uscode'}
class StatutesFinder(FinderBase):
    """Statutes at Large citations, shaped like `<volume> Stat. <page>`."""
    CITE_TYPE = 'STATUTES_AT_LARGE'
    # Digits named "volume", a suppressed "Stat." literal, digits named "page"
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("volume")
        + Suppress("Stat.")
        + Word(string.digits).setResultsName("page")
    )

    def find(self, node):
        """Yield one `Cite` per grammar match in `node.text`.

        Each cite carries the match offsets, the volume/page components and
        the URL built by `fdsys_url` for the 'statute' collection.
        """
        for tokens, begin, finish in self.GRAMMAR.scanString(node.text):
            parts = dict(volume=tokens.volume, page=tokens.page)
            url = fdsys_url(collection='statute', **parts)
            yield Cite(self.CITE_TYPE, begin, finish, parts, url)
class USCFinder(FinderBase):
    """U.S. Code citations: `<title>`, the marker "U.S.C.", then `<section>`."""
    CITE_TYPE = 'USC'
    # Title and section are the named digit runs on either side of the marker
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("title")
        + Marker("U.S.C.")
        + Word(string.digits).setResultsName("section")
    )

    def find(self, node):
        """Yield one `Cite` per grammar match in `node.text`.

        Each cite carries the match offsets, the title/section components and
        the URL built by `fdsys_url` for the 'uscode' collection.
        """
        for tokens, begin, finish in self.GRAMMAR.scanString(node.text):
            parts = dict(title=tokens.title, section=tokens.section)
            url = fdsys_url(collection='uscode', **parts)
            yield Cite(self.CITE_TYPE, begin, finish, parts, url)
class PublicLawFinder(FinderBase):
    """Public Law citations: the markers "Public" and "Law" followed by
    `<congress>-<lawnum>`."""
    CITE_TYPE = 'PUBLIC_LAW'
    # The hyphen is suppressed; only the two digit runs are named results
    GRAMMAR = QuickSearchable(
        Marker("Public")
        + Marker("Law")
        + Word(string.digits).setResultsName("congress")
        + Suppress("-")
        + Word(string.digits).setResultsName("lawnum")
    )

    def find(self, node):
        """Yield one `Cite` per grammar match in `node.text`.

        Each cite carries the match offsets, the congress/lawnum components
        and the URL built by `fdsys_url` for public laws in the 'plaw'
        collection.
        """
        for tokens, begin, finish in self.GRAMMAR.scanString(node.text):
            parts = dict(congress=tokens.congress, lawnum=tokens.lawnum)
            url = fdsys_url(collection='plaw', lawtype='public', **parts)
            yield Cite(self.CITE_TYPE, begin, finish, parts, url)
def make_multiple(head, tail=None, wrap_tail=False):
    """Build a grammar for a run of cited terms, e.g. section 11(a),
    (b)(4), and (5): one `head` followed by one or more conjunction-joined
    `tail` entries.

    :param head: grammar for the first term; stored under "head"
    :param tail: grammar for each later term; defaults to `head`
    :param wrap_tail: when true, each tail entry may be surrounded by
        optional (suppressed) parentheses
    :return: the combined grammar wrapped in `QuickSearchable`
    """
    tail_term = head if tail is None else tail
    head_expr = keep_pos(head).setResultsName("head")
    # "match" lets callers get at the term itself, apart from the
    # conjunctive phrase that introduces it
    tail_match = keep_pos(tail_term).setResultsName("match")
    tail_entry = (atomic.conj_phrases + tail_match).setResultsName(
        "tail", listAllMatches=True)
    if wrap_tail:
        open_paren = Optional(Suppress('('))
        tail_entry = open_paren + tail_entry + Optional(Suppress(')'))
    return QuickSearchable(head_expr + OneOrMore(tail_entry))
Example #10
0
def make_multiple(head, tail=None, wrap_tail=False):
    """Build a grammar for a run of cited terms, e.g. section 11(a),
    (b)(4), and (5): one `head` followed by one or more conjunction-joined
    `tail` entries.

    :param head: grammar for the first term; stored under "head"
    :param tail: grammar for each later term; defaults to `head`
    :param wrap_tail: when true, each tail entry may be surrounded by
        optional (suppressed) parentheses
    :return: the combined grammar wrapped in `QuickSearchable`
    """
    tail_term = head if tail is None else tail

    # `expr + Empty()` is used rather than `expr.copy()`: `head`/`tail` may
    # be single-element grammars, and we don't want to completely rename
    # their results
    head_expr = head + Empty()
    head_expr = head_expr.setParseAction(keep_pos).setResultsName("head")

    # "match" lets callers get at the term itself, apart from the
    # conjunctive phrase that introduces it
    tail_match = tail_term + Empty()
    tail_match = tail_match.setParseAction(keep_pos).setResultsName("match")

    tail_entry = (atomic.conj_phrases + tail_match).setResultsName(
        "tail", listAllMatches=True)
    if wrap_tail:
        open_paren = Optional(Suppress('('))
        tail_entry = open_paren + tail_entry + Optional(Suppress(')'))
    return QuickSearchable(head_expr + OneOrMore(tail_entry))
Example #11
0
from regparser.layer.layer import Layer

# Module-level logger, named after this module
logger = logging.getLogger(__name__)

# One grammar per citation level. Every piece uses `leaveWhitespace()`, so no
# whitespace is skipped before it: the levels and the periods between them
# must be directly adjacent in the text. Each level is stored under its own
# results name ("l1" .. "l6").
level1 = Word("IVXLCDM").leaveWhitespace().setResultsName("l1")
level2 = Word(string.ascii_uppercase).leaveWhitespace().setResultsName("l2")
level3 = Word(string.digits).leaveWhitespace().setResultsName("l3")
level4 = Word(string.ascii_lowercase).leaveWhitespace().setResultsName("l4")
level5 = Word("ivxlcdm").leaveWhitespace().setResultsName("l5")
level6 = Word(string.ascii_lowercase).leaveWhitespace().setResultsName("l6")
# Separator between levels; matched but dropped from the results
period = Suppress(".").leaveWhitespace()

# e.g. I.B, I.B.3, I.B.3.d, I.B.3.d.v, I.B.3.d.v.f
# The first two levels are required; each deeper level is optional and can
# only appear when the level above it is present (nested `Optional`s).
citation = level1 + period + level2 + Optional(period + level3 + Optional(
    period + level4 + Optional(period + level5 + Optional(period + level6))))
citation = QuickSearchable(citation)


class InternalCitations(Layer):
    """Layer with the shorthand 'internal-citations'; keeps the set of node
    labels found in its tree."""
    shorthand = 'internal-citations'

    def __init__(self, tree, **context):
        """Defer to `Layer.__init__`, then start with no known citations."""
        super(InternalCitations, self).__init__(tree, **context)
        self.known_citations = set()

    def pre_process(self):
        """Walk the whole tree once and remember every node's label (as a
        tuple) in `self.known_citations`."""
        def label_as_tuple(node):
            return tuple(node.label)

        self.known_citations = set(self.tree.walk(label_as_tuple))
            pair = (match.a1, 2)
        elif match.aI:
            pair = (match.aI, 2)

        if (pair is not None and reg_part in APPENDIX_IGNORE_SUBHEADER_LABEL
                and pair[0]
                in APPENDIX_IGNORE_SUBHEADER_LABEL[reg_part][appendix_letter]):
            logger.warning("Ignoring subheader label %s of appendix %s",
                           pair[0], appendix_letter)
            pair = None

    return pair


# Alternation over the six marker grammars (parenthesized or period-style;
# upper, lower or digit). `|` tries the alternatives in the order written.
# Scanned by `initial_marker` below.
_parser = QuickSearchable(grammar.paren_upper | grammar.paren_lower
                          | grammar.paren_digit | grammar.period_upper
                          | grammar.period_digit | grammar.period_lower)


def initial_marker(text):
    """Return `(marker, matched_text)` for a marker at the very start of
    `text`, or None when there is no acceptable one.

    Only matches beginning at offset 0 are considered. A marker is accepted
    when it is shorter than three characters or consists solely of the
    characters 'ivxlcdm'. `matched_text` is `text` up to the end of the
    match.
    """
    roman_chars = 'ivxlcdm'
    for tokens, begin, finish in _parser.scanString(text):
        if begin == 0:
            # Exactly one of these named results is expected to be set,
            # depending on which alternative of `_parser` matched
            label = (tokens.paren_upper or tokens.paren_lower
                     or tokens.paren_digit or tokens.period_upper
                     or tokens.period_lower or tokens.period_digit)
            is_short = len(label) < 3
            if is_short or all(ch in roman_chars for ch in label):
                return label, text[:finish]
    return None


def build_non_reg_text(reg_xml, reg_part):
from regparser.grammar import atomic
from regparser.grammar.utils import keep_pos, Marker, QuickSearchable

# "<part>.<section>" building blocks; the period itself is suppressed.
period_section = Suppress(".") + atomic.section
part_section = atomic.part + period_section
# Same, preceded by a section marker whose position is kept (via `keep_pos`)
# under the results name "marker".
marker_part_section = (
    keep_pos(atomic.section_marker).setResultsName("marker") + part_section)

# Paragraph depth chain. Each depthN_p matches the marker for its own level,
# optionally followed by the next deeper level, so the definitions run from
# the deepest level (6) up to the shallowest (1).
depth6_p = atomic.em_roman_p | atomic.plaintext_level6_p
depth5_p = ((atomic.em_digit_p | atomic.plaintext_level5_p) +
            Optional(depth6_p))
depth4_p = atomic.upper_p + Optional(depth5_p)
depth3_p = atomic.roman_p + Optional(depth4_p)
depth2_p = atomic.digit_p + Optional(depth3_p)
# `~FollowedBy(...)` is a negative lookahead: a lower_p immediately followed
# by an upper_p is rejected at this level.
# NOTE(review): the motivating case is not visible here -- confirm.
depth1_p = atomic.lower_p + ~FollowedBy(atomic.upper_p) + Optional(depth2_p)
# A paragraph reference starting at any depth; shallower depths are tried
# first.
any_depth_p = QuickSearchable(depth1_p | depth2_p | depth3_p | depth4_p
                              | depth5_p | depth6_p)

# Analogous chain built from the `*_c` atoms (three levels, plus an optional
# em_digit_c at the bottom).
depth3_c = atomic.upper_c + Optional(atomic.em_digit_c)
depth2_c = atomic.roman_c + Optional(depth3_c)
depth1_c = atomic.digit_c + Optional(depth2_c)
# Either of the two `*_a` atoms
any_a = atomic.upper_a | atomic.digit_a

# A section followed by a `*_c` chain starting at its first depth
section_comment = atomic.section + depth1_c

# A section followed by a (required) paragraph chain
section_paragraph = QuickSearchable(atomic.section + depth1_p)

# "marker part.section" / "part.section", each with an optional paragraph
# chain
mps_paragraph = QuickSearchable(marker_part_section + Optional(depth1_p))
ps_paragraph = part_section + Optional(depth1_p)
# "part.section" where the paragraph chain is required
part_section_paragraph = QuickSearchable(atomic.part + Suppress(".") +
                                         atomic.section + depth1_p)
# Every token grammar combined into one alternation. `|` tries alternatives
# in the order written and takes the first that matches, so ordering is
# significant: more specific patterns must precede the more general ones
# they overlap with (see the "Must come after" notes below).
token_patterns = QuickSearchable(
    put_active | put_passive | post_active | post_passive |
    delete_active | delete_passive | move_active | move_passive |
    designate_active | reserve_active |
    insert_in_order |

    interp | marker_subpart | appendix |
    comment_context_with_section | comment_context_without_section |
    comment_context_under_with_section |
    paragraph_heading_of | section_heading_of |
    multiple_intro_text_of | intro_text_of |
    appendix_section_heading_of |
    intro_text_of_interp |
    comment_heading | appendix_subheading | section_paragraph_heading_of |
    # Must come after other headings as it is a catch-all
    section_heading |
    multiple_paragraph_sections | section_single_par |
    multiple_interp_entries |

    multiple_sections | multiple_paragraphs | multiple_appendices |
    multiple_comment_pars | multiple_comments |
    #   Must come after multiple_appendices
    appendix_section |
    #   Must come after multiple_pars (presumably multiple_paragraphs)
    single_par_section | single_par |
    #   Must come after multiple_comment_pars
    single_comment_with_section | single_comment_par |
    #   Must come after section_single_par
    section |
    #   Must come after intro_text_of
    intro_text |

    definition |

    # Finally allow for an explicit override label
    override_label | subject_group |

    paragraph_context |
    and_token
)
Example #15
0
# vim: set encoding=utf-8
from pyparsing import (LineStart, Literal, OneOrMore, Optional, Regex, SkipTo,
                       srange, Suppress, Word, ZeroOrMore)

from regparser.grammar import atomic, unified
from regparser.grammar.utils import (DocLiteral, keep_pos, Marker,
                                     QuickSearchable)

# A term wrapped in smart quotes: the opening quote is suppressed and
# everything up to (not including) the closing quote is captured, with its
# position kept, under the results name "term".
smart_quotes = QuickSearchable(
    Suppress(DocLiteral(u'“', "left-smart-quote")) + keep_pos(
        SkipTo(DocLiteral(u'”', "right-smart-quote"))).setResultsName("term"))

# A term inside an <E ...>...</E> tag: one or more words made of letters and
# hyphens, captured with position under "term". Both tags are suppressed.
e_tag = (
    Suppress(Regex(r"<E[^>]*>")) +
    keep_pos(OneOrMore(Word(srange("[a-zA-Z-]")))).setResultsName("term") +
    Suppress(Literal("</E>")))

# A definition sentence, anchored at the start of a line:
#   [paragraph markers] <E>term</E> [conjunction <E>term</E>]...
#   [",words," asides] ["this term" / "the term"]
#   then one of: "mean(s)" | "refers [only] to" |
#                "has/have the same meaning as"
# The first tagged term is stored under "head"; each later one (together
# with its conjunctive phrase) is collected under "tail".
xml_term_parser = QuickSearchable(
    LineStart() + Optional(Suppress(unified.any_depth_p)) +
    e_tag.setResultsName("head") +
    ZeroOrMore((atomic.conj_phrases +
                e_tag).setResultsName("tail", listAllMatches=True)) +
    Suppress(ZeroOrMore(Regex(r",[a-zA-Z ]+,"))) +
    Suppress(ZeroOrMore((Marker("this") | Marker("the")) + Marker("term"))) +
    ((Marker("mean") | Marker("means"))
     | (Marker("refers") + ZeroOrMore(Marker("only")) + Marker("to")) | (
         (Marker("has") | Marker("have")) + Marker("the") + Marker("same") +
         Marker("meaning") + Marker("as"))))

key_term_parser = QuickSearchable(
    LineStart() + Optional(Suppress(unified.any_depth_p)) +
Example #16
0

# Work on a copy so that attaching the parse action does not modify the
# shared `unified.any_depth_p` grammar. `_any_depth_parse` is defined
# outside this excerpt.
any_depth_p = unified.any_depth_p.copy().setParseAction(_any_depth_parse)


def initial_markers(text):
    """Return the paragraph markers that open `text` (those appearing before
    any other content) as a list; an empty list when `text` does not parse
    as markers."""
    try:
        markers = list(any_depth_p.parseString(text))
    except pyparsing.ParseException:
        # No leading markers at all
        markers = []
    return markers


# Grammar for markers that appear mid-text: a paragraph marker chain is only
# accepted directly after one of a comma, period, hyphen, em dash, '>' or
# the word "means " (the guard itself is suppressed from the results).
_collapsed_grammar = QuickSearchable(
    # A guard to reduce false positives
    pyparsing.Suppress(pyparsing.Regex(u',|\\.|-|—|>|means ')) +
    any_depth_p)


def collapsed_markers(text):
    """Not all paragraph markers are at the beginning of of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent"""
    potential = [triplet for triplet in _collapsed_grammar.scanString(text)]
    #   remove any that overlap with citations
    potential = [trip for trip in remove_citation_overlaps(text, potential)]
    #   flatten the results
    potential = [pm for pms, _, _ in potential for pm in pms]
    #   remove any matches that aren't (a), (1), (i), etc. -- All other
    #   markers can't be collapsed
    first_markers = [level[0] for level in p_levels]