def parsed_title(text, appendix_letter):
    """Scan `text` for an appendix title and return the parse results of
    the first match, or None when nothing matches at a line start.

    Two title shapes are recognized: "<appendix_letter>-<a1 grammar>" with
    optional trailing markers, and "part <roman numeral>".
    """
    digit_parser = (
        Marker(appendix_letter) +
        Suppress('-') +
        grammar.a1.copy().leaveWhitespace() +
        Optional(grammar.markerless_upper) +
        Optional(grammar.paren_upper | grammar.paren_lower) +
        Optional(grammar.paren_digit))
    roman_parser = Marker("part") + grammar.aI
    searcher = QuickSearchable(LineStart() + (digit_parser | roman_parser))
    # scanString yields (tokens, start, end); only the first hit matters
    for tokens, _, _ in searcher.scanString(text):
        return tokens
def parsed_title(text, appendix_letter):
    """Return the parse results for the first appendix title found in
    `text` (at a line start), or None when no title matches.

    Matches either "<appendix_letter>-<a1>" with optional trailing
    markers, or "part <upper-case roman numeral>".
    """
    digit_str_parser = (Marker(appendix_letter) +
                        Suppress('-') +
                        grammar.a1.copy().leaveWhitespace() +
                        Optional(grammar.markerless_upper) +
                        Optional(grammar.paren_upper | grammar.paren_lower) +
                        Optional(grammar.paren_digit))
    part_roman_parser = Marker("part") + grammar.aI
    parser = QuickSearchable(
        LineStart() + (digit_str_parser | part_roman_parser))
    # Only the first match is relevant; return its tokens immediately
    for match, _, _ in parser.scanString(text):
        return match
class StatutesFinder(FDSYSFinder, FinderBase):
    """Statutes at large"""
    CITE_TYPE = 'STATUTES_AT_LARGE'
    # Matches citations like "124 Stat. 1376": a numeric volume, the
    # literal "Stat." (discarded), then a numeric page
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("volume") + Suppress("Stat.") +
        Word(string.digits).setResultsName("page"))
    # Presumably consumed by FDSYSFinder (defined elsewhere) when
    # constructing the FDSYS url -- TODO confirm
    CONST_PARAMS = dict(collection='statute')
class PublicLawFinder(FDSYSFinder, FinderBase):
    """Public Law"""
    CITE_TYPE = 'PUBLIC_LAW'
    # Matches citations like "Public Law 111-203": congress number,
    # suppressed dash, law number
    GRAMMAR = QuickSearchable(
        Marker("Public") + Marker("Law") +
        Word(string.digits).setResultsName("congress") + Suppress("-") +
        Word(string.digits).setResultsName("lawnum"))
    # Presumably consumed by FDSYSFinder (defined elsewhere) when
    # constructing the FDSYS url -- TODO confirm
    CONST_PARAMS = dict(collection='plaw', lawtype='public')
class USCFinder(FDSYSFinder, FinderBase):
    """U.S. Code"""
    CITE_TYPE = 'USC'
    # Matches citations like "12 U.S.C. 5511" or "12 U.S.C. Chapter 53":
    # numeric title, the literal "U.S.C.", an optional (discarded)
    # "Chapter" marker, then a numeric section/chapter number
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("title") + "U.S.C." +
        Suppress(Optional("Chapter")) +
        Word(string.digits).setResultsName("section"))
    # Presumably consumed by FDSYSFinder (defined elsewhere) when
    # constructing the FDSYS url -- TODO confirm
    CONST_PARAMS = dict(collection='uscode')
class StatutesFinder(FinderBase):
    """Statutes at large"""
    CITE_TYPE = 'STATUTES_AT_LARGE'
    # e.g. "124 Stat. 1376" -- volume, literal "Stat." (dropped), page
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("volume") + Suppress("Stat.") +
        Word(string.digits).setResultsName("page"))

    def find(self, node):
        """Yield a Cite for every Statutes-at-Large citation found in the
        node's text."""
        for result, begin, finish in self.GRAMMAR.scanString(node.text):
            fields = {'volume': result.volume, 'page': result.page}
            url = fdsys_url(collection='statute', **fields)
            yield Cite(self.CITE_TYPE, begin, finish, fields, url)
class USCFinder(FinderBase):
    """U.S. Code"""
    CITE_TYPE = 'USC'
    # e.g. "12 U.S.C. 5511" -- title, literal "U.S.C.", section
    GRAMMAR = QuickSearchable(
        Word(string.digits).setResultsName("title") + Marker("U.S.C.") +
        Word(string.digits).setResultsName("section"))

    def find(self, node):
        """Yield a Cite for every U.S. Code citation found in the node's
        text."""
        for result, begin, finish in self.GRAMMAR.scanString(node.text):
            fields = {'title': result.title, 'section': result.section}
            url = fdsys_url(collection='uscode', **fields)
            yield Cite(self.CITE_TYPE, begin, finish, fields, url)
class PublicLawFinder(FinderBase):
    """Public Law"""
    CITE_TYPE = 'PUBLIC_LAW'
    # e.g. "Public Law 111-203" -- congress, suppressed dash, law number
    GRAMMAR = QuickSearchable(
        Marker("Public") + Marker("Law") +
        Word(string.digits).setResultsName("congress") + Suppress("-") +
        Word(string.digits).setResultsName("lawnum"))

    def find(self, node):
        """Yield a Cite for every Public Law citation found in the node's
        text."""
        for result, begin, finish in self.GRAMMAR.scanString(node.text):
            fields = {'congress': result.congress, 'lawnum': result.lawnum}
            url = fdsys_url(collection='plaw', lawtype='public', **fields)
            yield Cite(self.CITE_TYPE, begin, finish, fields, url)
def make_multiple(head, tail=None, wrap_tail=False):
    """We have a recurring need to parse citations which have a string of
    terms, e.g. section 11(a), (b)(4), and (5). This function is a shorthand
    for setting these elements up"""
    # By default the repeated elements use the same grammar as the head
    if tail is None:
        tail = head
    head = keep_pos(head).setResultsName("head")
    # We need to address just the matching text separately from the
    # conjunctive phrase
    tail = keep_pos(tail).setResultsName("match")
    tail = (atomic.conj_phrases + tail).setResultsName(
        "tail", listAllMatches=True)
    if wrap_tail:
        # Tolerate stray parens around each repeated element
        tail = Optional(Suppress('(')) + tail + Optional(Suppress(')'))
    return QuickSearchable(head + OneOrMore(tail))
def make_multiple(head, tail=None, wrap_tail=False):
    """We have a recurring need to parse citations which have a string of
    terms, e.g. section 11(a), (b)(4), and (5). This function is a shorthand
    for setting these elements up"""
    # By default the repeated elements use the same grammar as the head
    if tail is None:
        tail = head
    # Use `Empty` over `copy` as `head`/`tail` may be single-element
    # grammars, in which case we don't want to completely rename the results
    head = (head + Empty()).setParseAction(keep_pos).setResultsName("head")
    # We need to address just the matching text separately from the
    # conjunctive phrase
    tail = (tail + Empty()).setParseAction(keep_pos).setResultsName("match")
    tail = (atomic.conj_phrases + tail).setResultsName(
        "tail", listAllMatches=True)
    if wrap_tail:
        # Tolerate stray parens around each repeated element
        tail = Optional(Suppress('(')) + tail + Optional(Suppress(')'))
    return QuickSearchable(head + OneOrMore(tail))
from regparser.layer.layer import Layer

# NOTE(review): `logging`, `string`, pyparsing names (Word, Suppress,
# Optional) and QuickSearchable are presumably imported earlier in the
# file -- confirm against the full module
logger = logging.getLogger(__name__)

# One grammar element per level of an interpretations-style citation:
# upper roman, upper letter, digit, lower letter, lower roman, lower letter
level1 = Word("IVXLCDM").leaveWhitespace().setResultsName("l1")
level2 = Word(string.ascii_uppercase).leaveWhitespace().setResultsName("l2")
level3 = Word(string.digits).leaveWhitespace().setResultsName("l3")
level4 = Word(string.ascii_lowercase).leaveWhitespace().setResultsName("l4")
level5 = Word("ivxlcdm").leaveWhitespace().setResultsName("l5")
level6 = Word(string.ascii_lowercase).leaveWhitespace().setResultsName("l6")
period = Suppress(".").leaveWhitespace()
# e.g. I.B, I.B.3, I.B.3.d, I.B.3.d.v, I.B.3.d.v.f
citation = level1 + period + level2 + Optional(period + level3 + Optional(
    period + level4 + Optional(period + level5 + Optional(period + level6))))
citation = QuickSearchable(citation)


class InternalCitations(Layer):
    # Identifier under which this layer's output is keyed
    shorthand = 'internal-citations'

    def __init__(self, tree, **context):
        super(InternalCitations, self).__init__(tree, **context)
        # Set of label tuples present in the tree; populated by pre_process
        self.known_citations = set()

    def pre_process(self):
        """As a preprocessing step, run through the entire tree, collecting
        all labels"""
        labels = self.tree.walk(lambda node: tuple(node.label))
        self.known_citations = set(labels)
        # NOTE(review): this excerpt begins mid-function; the enclosing
        # `def` and the opening `if` branch appear before this chunk
        pair = (match.a1, 2)
    elif match.aI:
        pair = (match.aI, 2)
    # Certain subheader labels are known false positives for specific
    # part/appendix combinations; drop them (with a warning) so they are
    # not treated as labels
    if (pair is not None and reg_part in APPENDIX_IGNORE_SUBHEADER_LABEL
            and pair[0] in
            APPENDIX_IGNORE_SUBHEADER_LABEL[reg_part][appendix_letter]):
        logger.warning("Ignoring subheader label %s of appendix %s",
                       pair[0], appendix_letter)
        pair = None
    return pair


# A single paragraph marker in any of its six styles -- presumably
# paren-wrapped ("(A)", "(a)", "(1)") or period-terminated ("A.", "a.",
# "1.") forms; confirm against the `grammar` module
_parser = QuickSearchable(grammar.paren_upper | grammar.paren_lower |
                          grammar.paren_digit | grammar.period_upper |
                          grammar.period_digit | grammar.period_lower)


def initial_marker(text):
    """Return (marker, matched_prefix_of_text) when `text` begins with a
    paragraph marker; None otherwise. Markers of three or more characters
    are only accepted when composed of lower-case roman-numeral digits."""
    for match, start, end in _parser.scanString(text):
        # Only a marker at the very start of the text counts
        if start != 0:
            continue
        marker = (match.paren_upper or match.paren_lower or
                  match.paren_digit or match.period_upper or
                  match.period_lower or match.period_digit)
        if len(marker) < 3 or all(char in 'ivxlcdm' for char in marker):
            return marker, text[:end]


def build_non_reg_text(reg_xml, reg_part):
    # NOTE(review): function body continues beyond this excerpt
from regparser.grammar import atomic
from regparser.grammar.utils import keep_pos, Marker, QuickSearchable

# ".12" piece of a "1005.12"-style citation
period_section = Suppress(".") + atomic.section
# Full "part.section" pair, e.g. "1005.12" -- TODO confirm against atomic
part_section = atomic.part + period_section
# Marked section, e.g. "Section 1005.12"; the marker's position is kept so
# callers can locate the whole citation in the original text
marker_part_section = (
    keep_pos(atomic.section_marker).setResultsName("marker") +
    part_section)

# Paragraph-depth markers; each level optionally nests the next deeper one
depth6_p = atomic.em_roman_p | atomic.plaintext_level6_p
depth5_p = ((atomic.em_digit_p | atomic.plaintext_level5_p) +
            Optional(depth6_p))
depth4_p = atomic.upper_p + Optional(depth5_p)
depth3_p = atomic.roman_p + Optional(depth4_p)
depth2_p = atomic.digit_p + Optional(depth3_p)
# The negative lookahead rejects a lower-case marker directly followed by
# an upper-case one -- presumably to avoid ambiguous matches; confirm
# against the atomic grammar
depth1_p = atomic.lower_p + ~FollowedBy(atomic.upper_p) + Optional(depth2_p)
# A paragraph citation starting at any depth
any_depth_p = QuickSearchable(depth1_p | depth2_p | depth3_p | depth4_p |
                              depth5_p | depth6_p)

# Comment (interpretation) paragraph depths
depth3_c = atomic.upper_c + Optional(atomic.em_digit_c)
depth2_c = atomic.roman_c + Optional(depth3_c)
depth1_c = atomic.digit_c + Optional(depth2_c)

# Appendix identifiers may be upper-case letters or digits
any_a = atomic.upper_a | atomic.digit_a

# Section followed by a comment-paragraph citation
section_comment = atomic.section + depth1_c
# Section followed by a paragraph citation
section_paragraph = QuickSearchable(atomic.section + depth1_p)
# Marked part/section plus optional paragraph
mps_paragraph = QuickSearchable(marker_part_section + Optional(depth1_p))
# Unmarked part/section plus optional paragraph
ps_paragraph = part_section + Optional(depth1_p)
# Part, section, and a required paragraph
part_section_paragraph = QuickSearchable(
    atomic.part + Suppress(".") + atomic.section + depth1_p)
# Every token an amendment phrase can produce. Ordering matters:
# pyparsing's "|" (MatchFirst) takes the FIRST alternative that matches,
# so more specific grammars must precede the catch-alls that overlap them
# (see the inline ordering notes below).
token_patterns = QuickSearchable(
    put_active | put_passive | post_active | post_passive |
    delete_active | delete_passive | move_active | move_passive |
    designate_active | reserve_active | insert_in_order |

    interp | marker_subpart | appendix |
    comment_context_with_section | comment_context_without_section |
    comment_context_under_with_section |
    paragraph_heading_of | section_heading_of | multiple_intro_text_of |
    intro_text_of |
    appendix_section_heading_of | intro_text_of_interp | comment_heading |
    appendix_subheading | section_paragraph_heading_of |
    # Must come after other headings as it is a catch-all
    section_heading |
    multiple_paragraph_sections | section_single_par |

    multiple_interp_entries |
    multiple_sections | multiple_paragraphs | multiple_appendices |
    multiple_comment_pars | multiple_comments |
    # Must come after multiple_appendices
    appendix_section |
    # Must come after multiple_pars
    single_par_section | single_par |
    # Must come after multiple_comment_pars
    single_comment_with_section | single_comment_par |
    # Must come after section_single_par
    section |
    # Must come after intro_text_of
    intro_text |
    definition |
    # Finally allow for an explicit override label
    override_label |
    subject_group | paragraph_context | and_token
)
# vim: set encoding=utf-8 from pyparsing import (LineStart, Literal, OneOrMore, Optional, Regex, SkipTo, srange, Suppress, Word, ZeroOrMore) from regparser.grammar import atomic, unified from regparser.grammar.utils import (DocLiteral, keep_pos, Marker, QuickSearchable) smart_quotes = QuickSearchable( Suppress(DocLiteral(u'“', "left-smart-quote")) + keep_pos( SkipTo(DocLiteral(u'”', "right-smart-quote"))).setResultsName("term")) e_tag = ( Suppress(Regex(r"<E[^>]*>")) + keep_pos(OneOrMore(Word(srange("[a-zA-Z-]")))).setResultsName("term") + Suppress(Literal("</E>"))) xml_term_parser = QuickSearchable( LineStart() + Optional(Suppress(unified.any_depth_p)) + e_tag.setResultsName("head") + ZeroOrMore((atomic.conj_phrases + e_tag).setResultsName("tail", listAllMatches=True)) + Suppress(ZeroOrMore(Regex(r",[a-zA-Z ]+,"))) + Suppress(ZeroOrMore((Marker("this") | Marker("the")) + Marker("term"))) + ((Marker("mean") | Marker("means")) | (Marker("refers") + ZeroOrMore(Marker("only")) + Marker("to")) | ( (Marker("has") | Marker("have")) + Marker("the") + Marker("same") + Marker("meaning") + Marker("as")))) key_term_parser = QuickSearchable( LineStart() + Optional(Suppress(unified.any_depth_p)) +
# Re-use the unified paragraph-depth grammar, with a local parse action
# (_any_depth_parse, defined elsewhere) attached to shape the results
any_depth_p = unified.any_depth_p.copy().setParseAction(_any_depth_parse)


def initial_markers(text):
    """Pull out a list of the first paragraph markers, i.e. markers before any
    text"""
    try:
        return list(any_depth_p.parseString(text))
    except pyparsing.ParseException:
        # No marker at the start of the text
        return []


# A paragraph marker preceded by punctuation or "means "
_collapsed_grammar = QuickSearchable(
    # A guard to reduce false positives
    pyparsing.Suppress(pyparsing.Regex(u',|\\.|-|—|>|means ')) +
    any_depth_p)


def collapsed_markers(text):
    """Not all paragraph markers are at the beginning of of the text. This
    grabs inner markers like (1) and (i) here:
    (c) cContent —(1) 1Content (i) iContent"""
    potential = [triplet for triplet in _collapsed_grammar.scanString(text)]
    # remove any that overlap with citations
    potential = [trip for trip in remove_citation_overlaps(text, potential)]
    # flatten the results
    potential = [pm for pms, _, _ in potential for pm in pms]
    # remove any matches that aren't (a), (1), (i), etc. -- All other
    # markers can't be collapsed
    first_markers = [level[0] for level in p_levels]
    # NOTE(review): function body continues beyond this excerpt