def is_child_of(child_xml, header_xml):
    """Decide whether child_xml sits underneath header_xml.

    A node is treated as a child when any of the following holds:

    * it is not a header (its tag is not 'HD');
    * its 'SOURCE' attribute compares greater than the header's;
    * the header text contains citations while the node text has none.

    The result is meant to be used for its truthiness: when the header has
    no citations, the (empty) citation result itself is returned.
    """
    if child_xml.tag != 'HD':
        return True
    if child_xml.get('SOURCE') > header_xml.get('SOURCE'):
        return True
    header_cites = internal_citations(header_xml.text, Label())
    # The child is only scanned when the header actually cites something
    return header_cites and not internal_citations(child_xml.text, Label())
def scope_of_text(self, text, label_struct, verify_prefix=True):
    """Work out which definition scope(s) a piece of text points at.

    Candidate scope indicators come from marked citations in the text and
    from the part / section / paragraph / subpart patterns on ``Terms``.
    A candidate is kept when ``verify_prefix`` is false, or when the text
    in front of it matches ``Terms.scope_re`` or ``Terms.scope_used_re``.
    Each kept scope is also emitted a second time with the interpretation
    marker appended.

    Returns a list of label tuples, or None when nothing was found.
    """
    # Citations are searched for before the text is lower-cased
    candidates = [
        (cit.full_start, cit.label.to_list())
        for cit in internal_citations(text, label_struct,
                                      require_marker=True)
    ]

    text = text.lower()
    label_list = label_struct.to_list()

    # Each pattern maps onto a longer prefix of the current label
    for pattern, scope_label in ((Terms.part_re, label_list[:1]),
                                 (Terms.sect_re, label_list[:2]),
                                 (Terms.par_re, label_list)):
        for found in pattern.finditer(text):
            candidates.append((found.start(), scope_label))

    # A subpart mention fans out into every label subpart_scope yields
    for found in Terms.subpart_re.finditer(text):
        for subpart_label in self.subpart_scope(label_list):
            candidates.append((found.start(), subpart_label))

    scopes = []
    for start, label in candidates:
        preceding = text[:start]
        if (not verify_prefix
                or Terms.scope_re.match(preceding)
                or Terms.scope_used_re.match(preceding)):
            scopes.append(label)

    if not scopes:
        return None

    # Interpretations of each scope are in scope as well
    interp_scopes = [scope + [struct.Node.INTERP_MARK] for scope in scopes]
    return [tuple(scope) for scope in scopes + interp_scopes]
def test_section_ref_in_appendix(self):
    # A section citation inside appendix text must resolve to the section
    # label rather than inheriting the appendix context.
    text = u"""(a) Something something § 1005.7(b)(1)."""
    context = Label(part='1005', appendix='A', appendix_section='2',
                    p1='a')
    first = internal_citations(text, context)[0]
    self.assertEqual(first.label.to_list(), ['1005', '7', 'b', '1'])
def segment_tree(text, part, parent_label):
    """Build the interpretation tree for one section, paragraph or appendix.

    The text is split into title and body. The spans of citations found in
    the body (resolved against the part taken from ``parent_label[0]``) are
    passed to the interpretation parser as ``exclude``; the node label is
    derived from the title.
    """
    title, body = utils.title_body(text)

    exclude = []
    for cit in internal_citations(body, Label(part=parent_label[0])):
        exclude.append((cit.full_start, cit.full_end))

    title_labels = text_to_labels(title, Label(part=part, comment=True))
    label = merge_labels(title_labels)
    return interpParser.build_tree(body, 1, exclude, label, title)
def test_single_match_multiple_paragraphs4(self):
    # Two fully qualified section references joined by "and"
    text = "Listing sections 11.55(d) and 321.11 (h)(4)"
    expected = [
        (['11', '55', 'd'], '11.55(d)'),
        (['321', '11', 'h', '4'], '321.11 (h)(4)'),
    ]
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))
    for citation, (label, matched) in zip(citations, expected):
        self.assertEqual(label, citation.label.to_list())
        self.assertEqual(to_text(citation, text), matched)
def add_spaces_to_title(title):
    """Insert a missing space after the citation in an SxS title.

    Walks the citations found in ``title``; for the first one that is
    immediately followed by a letter and does not itself end in a space, a
    single space is inserted at the citation's end. Only one insertion is
    ever made; otherwise the title is returned unchanged.
    """
    for cit in internal_citations(title, Label()):
        cut = cit.end
        if cut >= len(title):
            # Citation runs to the end of the title; nothing follows it
            continue
        if title[cut].isalpha() and title[cut - 1] != ' ':
            return title[:cut] + ' ' + title[cut:]
    return title
def add_spaces_to_title(title):
    """Repair an SxS title whose citation runs straight into the next word.

    The first citation whose end is directly followed by an alphabetic
    character (and whose last character is not a space) gets one space
    inserted after it. Titles with no such citation come back untouched.
    """
    # Lazily look for the first citation end that needs a space after it
    split_at = next(
        (cit.end for cit in internal_citations(title, Label())
         if cit.end < len(title)
         and title[cit.end].isalpha()
         and title[cit.end - 1] != ' '),
        None)
    if split_at is None:
        return title
    return title[:split_at] + ' ' + title[split_at:]
def parse_into_labels(txt, part):
    """Return the dash-joined labels of every citation found in ``txt``.

    Citations are resolved against ``part``. If any of the labels carries
    an 'appendix' setting, only those appendix labels are kept, so a header
    never yields a mix of appendix and regulation-text labels.
    """
    labels = [cit.label for cit in internal_citations(txt, Label(part=part))]
    appendix_labels = [lab for lab in labels if 'appendix' in lab.settings]
    if appendix_labels:
        labels = appendix_labels
    return ['-'.join(lab.to_list()) for lab in labels]
def test_single_match_multiple_paragraphs2(self):
    # One section marker followed by several paragraph references; every
    # reference must expand to a full label.
    def cite(text):
        return internal_citations(text, Label(part='222', section='5'))

    def check_labels(text, expected):
        found = cite(text)
        self.assertEqual(len(expected), len(found))
        for want, got in zip(expected, found):
            self.assertEqual(want, got.label.to_list())
        return found

    text = u'§ 1005.10(a) and (d)'
    found = check_labels(text, [['1005', '10', 'a'], ['1005', '10', 'd']])
    self.assertEqual(to_text(found[0], text), '1005.10(a)')
    self.assertEqual(to_text(found[1], text), '(d)')

    label_cases = [
        (u'§ 1005.7(b)(1), (2) and (3)',
         [['1005', '7', 'b', '1'],
          ['1005', '7', 'b', '2'],
          ['1005', '7', 'b', '3']]),
        (u'§ 1005.15(d)(1)(i) and (ii)',
         [['1005', '15', 'd', '1', 'i'],
          ['1005', '15', 'd', '1', 'ii']]),
        (u'§ 1005.9(a)(5) (i), (ii), or (iii)',
         [['1005', '9', 'a', '5', 'i'],
          ['1005', '9', 'a', '5', 'ii'],
          ['1005', '9', 'a', '5', 'iii']]),
        (u'§ 1005.11(a)(1)(vi) or (vii).',
         [['1005', '11', 'a', '1', 'vi'],
          ['1005', '11', 'a', '1', 'vii']]),
    ]
    for text, expected in label_cases:
        check_labels(text, expected)

    # Only the number of citations is pinned for the long list
    text = u'§§ 1005.3(b)(2) and (3), 1005.10(b), (d), and (e), 1005.13, '
    text += 'and 1005.20'
    self.assertEqual(7, len(cite(text)))

    check_labels('Sections 1005.3, .4, and .5',
                 [['1005', '3'], ['1005', '4'], ['1005', '5']])
def generate_keyterm(node):
    """Record a key term for ``node`` in ``layer`` when one can be derived.

    If the node's label id is already present in ``real_key_terms_layer``
    that entry is copied over. Otherwise the first "sentence" of the
    processed node text (up to the first period that is not inside an
    excluded span or a citation) is used, provided it is at most 15 words,
    does not end in ``part_end`` and does not start with '!['.

    ``layer``, ``real_key_terms_layer``, ``exclude_parser``, ``period`` and
    ``part_end`` are not defined here; they are taken from the surrounding
    scope. Nothing is returned; the result is written into ``layer``.
    """
    label_id = node.label_id()
    if label_id in real_key_terms_layer:
        layer[label_id] = real_key_terms_layer[label_id]
        return

    node_text = key_terms.KeyTerms.process_node_text(node)
    if not node_text:
        return
    # Our Appendix parsing isn't particularly accurate -- avoid keyterms
    if node.node_type == struct.Node.APPENDIX:
        return

    exclude = [(start, end)
               for _, start, end in exclude_parser.scanString(node_text)]
    exclude.extend((pc.full_start, pc.full_end)
                   for pc in internal_citations(node_text, Label()))

    # Drop any periods which are part of an excluded span / citation.
    # This must be a real list: it is tested for emptiness and indexed
    # below, which a lazy filter() object (Python 3) does not support.
    periods = [
        m.start() for m in period.finditer(node_text)
        if all(m.start() < start or m.start() > end
               for start, end in exclude)
    ]

    # Key terms must either have a full "sentence" or end with a hyphen
    if not periods and node_text[-1] != u'—':
        return

    if periods:
        first_p = periods[0]
        # Check for cases where the period is "inside" something;
        # include the closing character along with the period
        next_char = node_text[first_p + 1: first_p + 2]
        if next_char in (')', u'”'):
            first_sentence = node_text[:first_p + 2]
        else:
            first_sentence = node_text[:first_p + 1]
    else:
        first_sentence = node_text

    # Key terms can't be the entire text of a leaf node
    if first_sentence == node_text and not node.children:
        return

    words = first_sentence.split()
    if (not words[-1] == part_end
            and not first_sentence.startswith('![')):
        # key terms are short
        if len(words) <= 15:
            layer[label_id] = [{
                "key_term": first_sentence,
                "locations": [0]
            }]
def is_child_of(child_xml, header_xml, header_citations=None):
    """Decide whether child_xml sits underneath header_xml.

    Non-header nodes (tag other than 'HD') are always children. A header
    node is a child when its 'SOURCE' attribute compares greater than the
    header's, when the header has citations and the node has none, or when
    both carry the same citations. If both have citations and they differ,
    the answer is delegated to ``is_backtrack`` on the header's last and
    the node's first citation. With no header citations the node is not a
    child.

    ``header_citations`` may be supplied by the caller to avoid re-parsing
    the header text; when None it is computed from ``header_xml.text``.
    """
    if child_xml.tag != 'HD':
        return True

    if header_citations is None:
        header_citations = [
            cit.label
            for cit in internal_citations(header_xml.text, Label())]
    child_citations = [
        cit.label for cit in internal_citations(child_xml.text, Label())]

    if child_xml.get('SOURCE') > header_xml.get('SOURCE'):
        return True
    if not header_citations:
        return False
    if not child_citations or header_citations == child_citations:
        return True
    return is_backtrack(header_citations[-1].to_list(),
                        child_citations[0].to_list())
def test_multiple_matches(self):
    # Several appendix-style references inside one sentence
    text = "Please see A-5 and Q-2(r) and Z-12(g)(2)(ii) then more text"
    expected = [
        (['102', 'A', '5'], 'A-5'),
        (['102', 'Q', '2(r)'], 'Q-2(r)'),
        (['102', 'Z', '12(g)(2)(ii)'], 'Z-12(g)(2)(ii)'),
    ]
    citations = internal_citations(text, Label(part='102', section='1'))
    self.assertEqual(3, len(citations))
    for citation, (label, matched) in zip(citations, expected):
        self.assertEqual(citation.label.to_list(), label)
        self.assertEqual(to_text(citation, text), matched)

    # "Appendices X and Y" yields one citation per appendix letter
    text = u"Appendices G and H—Yadda yadda"
    citations = internal_citations(text, Label(part='102'))
    self.assertEqual(2, len(citations))
    first, second = citations
    self.assertEqual(first.label.to_list(), ['102', 'G'])
    self.assertEqual(second.label.to_list(), ['102', 'H'])
def test_single_match_multiple_paragraphs5(self):
    # Two comment references joined by "and"; both become Interp labels
    text = "See, e.g., comments 31(b)(1)(iv)-1 and 31(b)(1)(vi)-1"
    expected = [
        (['222', '31', 'b', '1', 'iv', 'Interp', '1'], '31(b)(1)(iv)-1'),
        (['222', '31', 'b', '1', 'vi', 'Interp', '1'], '31(b)(1)(vi)-1'),
    ]
    citations = internal_citations(text, Label(part='222', section='5'))
    self.assertEqual(2, len(citations))
    for citation, (label, matched) in zip(citations, expected):
        self.assertEqual(label, citation.label.to_list())
        self.assertEqual(to_text(citation, text), matched)
def test_single_match_multiple_paragraphs6(self):
    # "comments X-1 through -3": the start and the end of the range are
    # each reported as a citation.
    text = "comments 5(b)(3)-1 through -3"
    citations = internal_citations(text, Label(part='100', section='5'))
    # Check the count before indexing, so that a missing citation shows up
    # as an assertion failure instead of an IndexError.
    self.assertEqual(2, len(citations))

    citation = citations[0]
    self.assertEqual(['100', '5', 'b', '3', 'Interp', '1'],
                     citation.label.to_list())
    self.assertEqual(to_text(citation, text), '5(b)(3)-1')

    citation = citations[1]
    self.assertEqual(['100', '5', 'b', '3', 'Interp', '3'],
                     citation.label.to_list())
    self.assertEqual(to_text(citation, text), '-3')
def parse(self, text, label):
    """Extract the internal (self-referential) citations from ``text``.

    Only citations introduced by a marker are considered. When
    ``self.verify_citations`` is set, the citations are first passed
    through ``self.remove_missing_citations``. Each remaining citation is
    converted to a dict with its 'offsets' and 'citation' label list, and
    the collection is handed to ``self.strip_whitespace``.
    """
    citations = internal_citations(text, label, require_marker=True)
    if self.verify_citations:
        citations = self.remove_missing_citations(citations, text)
    layer_entries = [
        {'offsets': [(cit.start, cit.end)],
         'citation': cit.label.to_list()}
        for cit in citations
    ]
    return self.strip_whitespace(text, layer_entries)
def build_section_tree(text, part):
    """Build the paragraph tree for one whole section.

    The text must begin with the section identifier: the section number is
    read from the title by looking for "<part>.<digits>". Spans of the
    citations found in the body are passed to the parser as ``exclude``.
    """
    title, body = utils.title_body(text)

    exclude = []
    for cit in internal_citations(body, Label(part=part)):
        exclude.append((cit.full_start, cit.full_end))

    # The title is expected to contain "<part>.<section>"
    section_match = re.search(r'%d\.(\d+)\b' % part, title)
    label = [str(part), section_match.group(1)]
    return regParser.build_tree(
        body, exclude=exclude, label=label, title=title)
def parse(self, text, label, title=None):
    """Extract the internal (self-referential) citations from ``text``.

    ``title`` is forwarded to ``internal_citations`` along with the label;
    only citations introduced by a marker are considered. Citations are
    optionally filtered by ``self.remove_missing_citations`` (when
    ``self.verify_citations`` is set), turned into layer dicts and then
    passed through ``self.strip_whitespace``.
    """
    def as_layer_entry(cit):
        return {'offsets': [(cit.start, cit.end)],
                'citation': cit.label.to_list()}

    citations = internal_citations(text, label, require_marker=True,
                                   title=title)
    if self.verify_citations:
        citations = self.remove_missing_citations(citations, text)
    layer_entries = [as_layer_entry(cit) for cit in citations]
    return self.strip_whitespace(text, layer_entries)
def split_into_ttsr(sxs):
    """Split a list of xml nodes into (title, text, sub sections, rest).

    The first node is the title. The nodes that follow it and are its
    children (per ``is_child_of``) form its section: the leading non-'HD'
    nodes are the text elements and what is left of the section are the
    sub sections. Everything after the section is returned as the
    remaining nodes.
    """
    title, following = sxs[0], sxs[1:]
    # Parse the title's citations once and reuse them for every candidate
    title_citations = [
        cit.label for cit in internal_citations(title.text, Label())]

    def belongs_to_title(element):
        return is_child_of(element, title, title_citations)

    def is_not_header(element):
        return element.tag != 'HD'

    section = list(takewhile(belongs_to_title, following))
    text_elements = list(takewhile(is_not_header, section))
    sub_sections = section[len(text_elements):]
    remaining = following[len(section):]
    return (title, text_elements, sub_sections, remaining)
def build_section_tree(text, part):
    """Construct the tree for a whole section.

    Assumes the text starts with the section's identifier; the section
    number is whatever follows "<part>." in the title. Citation spans found
    in the body are supplied to the parser through ``exclude``.
    """
    title, body = utils.title_body(text)
    body_citations = internal_citations(body, Label(part=part))
    exclude = [(cit.full_start, cit.full_end) for cit in body_citations]

    section_number = re.search(r'%d\.(\d+)\b' % part, title).group(1)
    return regParser.build_tree(body,
                                exclude=exclude,
                                label=[str(part), section_number],
                                title=title)
def parse(self, text, label, title=None):
    """Pull every internal (self-referential) citation out of ``text``.

    Marker-introduced citations are located (with ``title`` forwarded to
    the citation finder), optionally narrowed by
    ``self.remove_missing_citations`` when ``self.verify_citations`` is
    set, and converted to dicts holding their 'offsets' and 'citation'
    label list. The final value comes from ``self.strip_whitespace``.
    """
    found = internal_citations(text, label, require_marker=True,
                               title=title)
    if self.verify_citations:
        found = self.remove_missing_citations(found, text)

    layer_entries = []
    for cit in found:
        layer_entries.append({'offsets': [(cit.start, cit.end)],
                              'citation': cit.label.to_list()})
    return self.strip_whitespace(text, layer_entries)
def test_interp_headers(self):
    # Each header text must produce exactly one citation with this label
    cases = [
        ("Section 102.22Stuff", ['102', '22']),
        ("22(d) Content", ['101', '22', 'd']),
        ("22(d)(5) Content", ['101', '22', 'd', '5']),
        ("22(d)(5)(x) Content", ['101', '22', 'd', '5', 'x']),
        (u"§ 102.22(d)(5)(x) Content", ['102', '22', 'd', '5', 'x']),
        ("22(d)(5)(x)(Q) Content", ['101', '22', 'd', '5', 'x', 'Q']),
        ("Appendix A Heading", ['101', 'A']),
        ("Comment 21(c)-1 Heading", ['101', '21', 'c', 'Interp', '1']),
        ("Paragraph 38(l)(7)(i)(A)(2).",
         ['101', '38', 'l', '7', 'i', 'A', '2']),
        (u'Official Interpretations of § 102.33(c)(2)',
         ['102', '33', 'c', '2', 'Interp']),
    ]
    for header, expected_label in cases:
        found = internal_citations(header, Label(part='101'))
        self.assertEqual(1, len(found))
        self.assertEqual(found[0].label.to_list(), expected_label)
def paragraph_tree(appendix_letter, sections, text, label, title=None):
    """Parse each section of an appendix with the paragraph parser.

    ``sections`` is a sequence of (begin, end) offsets into ``text``. With
    no sections, the whole text becomes a single appendix node. Otherwise
    each section is parsed into a child tree labelled with its section
    number, and the text before the first section becomes the text of the
    returned appendix node.
    """
    if not sections:
        return Node(text, label=label, title=title,
                    node_type=Node.APPENDIX)

    def build_child(begin, end):
        # One appendix section -> one paragraph tree
        seg_title, section_text = utils.title_body(text[begin:end])
        sec_num = carving.get_appendix_section_number(
            seg_title, appendix_letter)
        found = internal_citations(section_text, Label(part=label[0]))
        exclude = [(cit.full_start, cit.full_end) for cit in found]
        return parParser.build_tree(
            section_text, exclude=exclude, label=label + [sec_num],
            title=seg_title)

    children = [build_child(begin, end) for begin, end in sections]
    intro_text = text[:sections[0][0]]
    return Node(intro_text, children, label, title, Node.APPENDIX)
def test_interp_headers(self):
    # header text -> the single label it must be resolved to
    headers = (
        ("Section 102.22Stuff", ['102', '22']),
        ("22(d) Content", ['101', '22', 'd']),
        ("22(d)(5) Content", ['101', '22', 'd', '5']),
        ("22(d)(5)(x) Content", ['101', '22', 'd', '5', 'x']),
        (u"§ 102.22(d)(5)(x) Content", ['102', '22', 'd', '5', 'x']),
        ("22(d)(5)(x)(Q) Content", ['101', '22', 'd', '5', 'x', 'Q']),
        ("Appendix A Heading", ['101', 'A']),
        ("Comment 21(c)-1 Heading", ['101', '21', 'c', 'Interp', '1']),
        ("Paragraph 38(l)(7)(i)(A)(2).",
         ['101', '38', 'l', '7', 'i', 'A', '2']),
        (u'Official Interpretations of § 102.33(c)(2)',
         ['102', '33', 'c', '2', 'Interp']),
    )
    for text, label in headers:
        citations = internal_citations(text, Label(part='101'))
        self.assertEqual(1, len(citations))
        only = citations[0]
        self.assertEqual(only.label.to_list(), label)
def split_paragraph_text(self, text, next_text=''):
    # Splits `text` at every paragraph marker (taken from _first_markers)
    # that is not part of a citation, then appends `next_text` as the final
    # element of the returned list.

    # Use the position of the '(' that follows each match, which skips
    # over the periods, spaces, etc. at the start of the match
    marker_positions = [
        text.index('(', found.start())
        for marker in _first_markers
        for found in marker.finditer(text)
    ]

    # Positions that fall inside a citation are not real markers
    citations = internal_citations(text, require_marker=True)
    marker_positions = [
        pos for pos in marker_positions
        if all(pos < cit.start or pos > cit.end for cit in citations)
    ]

    # Position 0 is dropped; the end of the text closes the last piece
    break_points = [pos for pos in marker_positions if pos]
    break_points.append(len(text))

    starts = [0] + break_points[:-1]
    texts = [text[begin:end] for begin, end in zip(starts, break_points)]
    texts.append(next_text)
    return texts
def split_paragraph_text(text):
    """Split text into a root piece and its children.

    Applies when the text contains collapsed paragraph markers: the text is
    cut at every marker (from ``_first_markers``) that does not fall inside
    a citation. Returns the list of consecutive pieces.
    """
    marker_positions = []
    for marker in _first_markers:
        for found in marker.finditer(text):
            # Use the '(' after the match start to skip over the periods,
            # spaces, etc.
            marker_positions.append(text.index('(', found.start()))

    # Markers that sit within a citation must not split the text
    citations = internal_citations(text, require_marker=True)

    def inside_citation(pos):
        return any(cit.start <= pos and cit.end >= pos
                   for cit in citations)

    # Zero positions are dropped as well; the text length ends the list
    break_points = [pos for pos in marker_positions
                    if not inside_citation(pos) and pos]
    break_points.append(len(text))

    texts = []
    begin = 0
    for end in break_points:
        texts.append(text[begin:end])
        begin = end
    return texts
def test_single_references(self):
    # (text, full text of the expected citation, expected label)
    cases = [
        ("The requirements in paragraph (a)(4)(iii) of",
         'paragraph (a)(4)(iii)',
         ['102', '6', 'a', '4', 'iii']),
        ("Creditors may comply with paragraphs (a)(6) of this section",
         'paragraphs (a)(6)',
         ['102', '6', 'a', '6']),
        (u"date in § 1005.20(h)(1) must disclose",
         u'§ 1005.20(h)(1)',
         ['1005', '20', 'h', '1']),
        ('(a) Solicited issuance. Except as provided in paragraph (b) ' +
         'of this section',
         'paragraph (b)',
         ['102', '6', 'b']),
        ("And Section 222.87(d)(2)(i) says something",
         'Section 222.87(d)(2)(i)',
         ['222', '87', 'd', '2', 'i']),
        ("More in paragraph 22(a)(4).",
         "paragraph 22(a)(4)",
         ["102", "22", "a", "4"]),
        ("See comment 32(b)(3) blah blah",
         'comment 32(b)(3)',
         ['102', '32', 'b', '3', 'Interp']),
        ("refer to comment 36(a)(2)-3 of thing",
         'comment 36(a)(2)-3',
         ['102', '36', 'a', '2', 'Interp', '3']),
        ("See Appendix A-5",
         "Appendix A-5",
         ['102', 'A', '5']),
        ("See Appendix A-5(R)",
         "Appendix A-5(R)",
         ['102', 'A', '5(R)']),
        ("See comment 3(v)-1.v. Another",
         "comment 3(v)-1.v",
         ['102', '3', 'v', 'Interp', '1', 'v']),
        ("See the commentary to 3(b)(1)",
         'commentary to 3(b)(1)',
         ['102', '3', 'b', '1', 'Interp']),
        ("See comment 3(b)(1)-1.v.",
         'comment 3(b)(1)-1.v',
         ['102', '3', 'b', '1', 'Interp', '1', 'v']),
        ("See appendix G, part V.4.D.",
         'appendix G, part V.4.D',
         ['102', 'G', 'V', '4', 'D']),
        ("See comment 3-1 for things",
         'comment 3-1',
         ['102', '3', 'Interp', '1']),
    ]
    for text, link, label in cases:
        found = internal_citations(text, Label(part='102', section='6'))
        self.assertEqual(1, len(found))
        only = found[0]
        self.assertEqual(only.label.to_list(), label)
        self.assertEqual(link, to_full_text(only, text))
def text_to_labels(text, initial_label, warn=True, force_start=False):
    """Turn interpretation header text into interpretation labels.

    For example, 22(a) becomes XXX-22-a-Interp. Only the first citation in
    the text, plus any clause citations directly following it, is used.

    warn: log a warning when the text could not be converted.
    force_start: only accept a citation that sits at the very beginning of
    the text.

    Returns a list of labels (each a list); empty when nothing matched.
    """
    all_citations = sorted(
        internal_citations(text.strip(), initial_label),
        key=lambda cit: cit.start)

    # We care only about the first citation and its clauses
    citations = all_citations[:1]
    if force_start:
        citations = [cit for cit in citations if cit.full_start == 0]

    # Paragraph markers at position 0, used when context must be inferred
    initial_pars = [
        match
        for match, start, _ in unified.any_depth_p.scanString(text)
        if start == 0
    ]

    if citations:
        if citations[0].in_clause:
            # Pick up the clauses still in the first conjunction
            for later in all_citations[1:]:
                if not later.in_clause:
                    break
                citations.append(later)
        return [cit.label.to_list() + [Node.INTERP_MARK]
                for cit in citations]

    if (initial_label.comment and initial_pars
            and initial_label.settings.get('appendix')):
        settings = initial_label.settings
        inferred = ([settings['part'], settings['appendix']]
                    + list(initial_pars[0]) + [Node.INTERP_MARK])
        return [inferred]

    if warn:
        logger.warning("Couldn't turn into label: " + text)
    return []
def test_single_references(self):
    # Every sample holds exactly one citation; both its label and the full
    # matched text (marker included) are checked.
    samples = (
        ("The requirements in paragraph (a)(4)(iii) of",
         'paragraph (a)(4)(iii)', ['102', '6', 'a', '4', 'iii']),
        ("Creditors may comply with paragraphs (a)(6) of this section",
         'paragraphs (a)(6)', ['102', '6', 'a', '6']),
        (u"date in § 1005.20(h)(1) must disclose",
         u'§ 1005.20(h)(1)', ['1005', '20', 'h', '1']),
        ('(a) Solicited issuance. Except as provided in paragraph (b) '
         + 'of this section',
         'paragraph (b)', ['102', '6', 'b']),
        ("And Section 222.87(d)(2)(i) says something",
         'Section 222.87(d)(2)(i)', ['222', '87', 'd', '2', 'i']),
        ("More in paragraph 22(a)(4).",
         "paragraph 22(a)(4)", ["102", "22", "a", "4"]),
        ("See comment 32(b)(3) blah blah",
         'comment 32(b)(3)', ['102', '32', 'b', '3', 'Interp']),
        ("refer to comment 36(a)(2)-3 of thing",
         'comment 36(a)(2)-3', ['102', '36', 'a', '2', 'Interp', '3']),
        ("See Appendix A-5",
         "Appendix A-5", ['102', 'A', '5']),
        ("See Appendix A-5(R)",
         "Appendix A-5(R)", ['102', 'A', '5(R)']),
        ("See comment 3(v)-1.v. Another",
         "comment 3(v)-1.v", ['102', '3', 'v', 'Interp', '1', 'v']),
        ("See the commentary to 3(b)(1)",
         'commentary to 3(b)(1)', ['102', '3', 'b', '1', 'Interp']),
        ("See comment 3(b)(1)-1.v.",
         'comment 3(b)(1)-1.v',
         ['102', '3', 'b', '1', 'Interp', '1', 'v']),
        ("See appendix G, part V.4.D.",
         'appendix G, part V.4.D', ['102', 'G', 'V', '4', 'D']),
        ("See comment 3-1 for things",
         'comment 3-1', ['102', '3', 'Interp', '1']),
    )
    for text, link, label in samples:
        citations = internal_citations(
            text, Label(part='102', section='6'))
        self.assertEqual(1, len(citations))
        citation = citations[0]
        self.assertEqual(citation.label.to_list(), label)
        self.assertEqual(link, to_full_text(citation, text))
def text_to_labels(text, initial_label, warn=True, force_start=False):
    """Convert interpretation header text into interpretation labels.

    E.g. 22(a) becomes XXX-22-a-Interp. The first citation found in the
    stripped text is used, together with the clause citations that follow
    it directly.

    warn: emit a log warning if the conversion fails.
    force_start: require the citation to start at the *beginning* of the
    text.

    Returns a list of label lists, or an empty list on failure.
    """
    found = internal_citations(text.strip(), initial_label)
    all_citations = sorted(found, key=lambda cit: cit.start)

    # Only the first citation (and its clauses) is of interest
    citations = all_citations[:1]
    if force_start:
        citations = [cit for cit in citations if cit.full_start == 0]

    # Under certain situations, the label has to be inferred from context
    initial_pars = []
    for match, start, _ in unified.any_depth_p.scanString(text):
        if start == 0:
            initial_pars.append(match)

    labels = []
    if citations:
        if citations[0].in_clause:
            # Clauses still in the first conjunction
            citations.extend(
                takewhile(lambda cit: cit.in_clause, all_citations[1:]))
        labels = [cit.label.to_list() + [Node.INTERP_MARK]
                  for cit in citations]
    elif (initial_label.comment and initial_pars
            and initial_label.settings.get('appendix')):
        prefix = [initial_label.settings['part'],
                  initial_label.settings['appendix']]
        labels = [prefix + list(initial_pars[0]) + [Node.INTERP_MARK]]
    elif warn:
        logger.warning("Couldn't turn into label: " + text)
    return labels
def split_paragraph_text(text):
    """Split text into a root piece and its children.

    Applies when the text contains collapsed paragraph markers. Text that
    starts with '(' (ignoring leading whitespace) is scanned with
    ``_first_paren_markers``; any other text with
    ``_first_period_markers``. Marker positions inside a citation are
    ignored. Returns the list of consecutive pieces of the text.
    """
    marker_set = (_first_paren_markers if text.lstrip()[:1] == '('
                  else _first_period_markers)

    # A split point is the match end minus the length of its first group
    marker_positions = [
        found.end() - len(found.group(1))
        for marker in marker_set
        for found in marker.finditer(text)
    ]

    # Remove any positions covered by a citation
    citations = internal_citations(text, require_marker=True)
    marker_positions = [
        pos for pos in marker_positions
        if all(pos < cit.start or pos > cit.end for cit in citations)
    ]

    # Drop zeros, then close the final piece at the end of the text
    break_points = [pos for pos in marker_positions if pos]
    break_points.append(len(text))

    starts = [0] + break_points[:-1]
    return [text[begin:end] for begin, end in zip(starts, break_points)]
def test_single_match_multiple_paragraphs1(self):
    # One marker followed by several paragraph references. Each case is
    # (text, Label keyword arguments, [(expected label, matched text)]).
    long_text = "the requirements of paragraphs (c)(3), (d)(2), (e)(1), "
    long_text += "(e)(3), and (f) of this section"
    in_section = {'part': '222', 'section': '5'}
    cases = [
        (long_text, in_section,
         [(['222', '5', 'c', '3'], '(c)(3)'),
          (['222', '5', 'd', '2'], '(d)(2)'),
          (['222', '5', 'e', '1'], '(e)(1)'),
          (['222', '5', 'e', '3'], '(e)(3)'),
          (['222', '5', 'f'], '(f)')]),
        ("set forth in paragraphs (b)(1) or (b)(2)", in_section,
         [(['222', '5', 'b', '1'], '(b)(1)'),
          (['222', '5', 'b', '2'], '(b)(2)')]),
        ('paragraphs (c)(1) and (2) of this section', in_section,
         [(['222', '5', 'c', '1'], '(c)(1)'),
          (['222', '5', 'c', '2'], '(2)')]),
        ('paragraphs (b)(1)(ii) and (iii)', in_section,
         [(['222', '5', 'b', '1', 'ii'], '(b)(1)(ii)'),
          (['222', '5', 'b', '1', 'iii'], '(iii)')]),
        ('see paragraphs (z)(9)(vi)(A) and (D)', in_section,
         [(['222', '5', 'z', '9', 'vi', 'A'], '(z)(9)(vi)(A)'),
          (['222', '5', 'z', '9', 'vi', 'D'], '(D)')]),
        ('see 32(d)(6) and (7) Content content', {'part': '222'},
         [(['222', '32', 'd', '6'], '32(d)(6)'),
          (['222', '32', 'd', '7'], '(7)')]),
    ]
    for text, label_kwargs, expected in cases:
        citations = internal_citations(text, Label(**label_kwargs))
        self.assertEqual(len(expected), len(citations))
        for citation, (label, matched) in zip(citations, expected):
            self.assertEqual(label, citation.label.to_list())
            self.assertEqual(to_text(citation, text), matched)
def test_single_reference_false_positives(self):
    # "commentary." followed by a new paragraph marker is not a citation
    context = Label(part='102', section='1')
    found = internal_citations(
        "See the commentary. (a) child paragraph", context)
    self.assertEqual(0, len(found))
def test_single_match_multiple_p_false_positives(self):
    # Dangling "-9" / "-2." fragments and a following paragraph marker
    # must not be read as citations.
    context = Label(part='100', section='4')
    found = internal_citations(
        "-9 text and stuff -2. (b) new thing", context)
    self.assertEqual(0, len(found))
def test_single_match_multiple_paragraphs8(self):
    # A hyphenated paragraph range yields two citations
    context = Label(part='100', section='2')
    found = internal_citations(u'§ 105.2(a)(1)-(3)', context)
    self.assertEqual(2, len(found))
def parse_into_labels(txt, part):
    """Return the dash-joined label of every citation found in ``txt``.

    Citations are resolved against ``part``; there may be several, in
    which case one string per citation is returned.
    """
    labels = []
    for citation in internal_citations(txt, Label(part=part)):
        labels.append('-'.join(citation.label.to_list()))
    return labels
def test_single_reference_false_positives(self):
    # The word "commentary" followed by an unrelated "(a)" paragraph
    # marker must not produce a citation.
    sample = "See the commentary. (a) child paragraph"
    self.assertEqual(
        0, len(internal_citations(sample, Label(part='102', section='1'))))