Exemple #1
0
def is_child_of(child_xml, header_xml):
    """Decide whether ``child_xml`` belongs underneath ``header_xml``.

    An element counts as a child when any of these holds:
      * it is not a header (its tag is not 'HD');
      * its 'SOURCE' attribute compares greater than the header's;
      * the header text contains internal citations while the child's
        text contains none."""
    if child_xml.tag != 'HD':
        return True
    if child_xml.get('SOURCE') > header_xml.get('SOURCE'):
        return True
    header_citations = internal_citations(header_xml.text, Label())
    return (header_citations
            and not internal_citations(child_xml.text, Label()))
Exemple #2
0
    def scope_of_text(self, text, label_struct, verify_prefix=True):
        """Work out which definition scope(s) a piece of text points at.

        Returns a list of label tuples (every scope found, followed by the
        interpretation counterpart of each). Falls off the end, implicitly
        returning None, when no scope is found."""
        #   Citations are located in the text as given; the regex-based
        #   indicators below are searched for in the lower-cased text
        cited = internal_citations(text, label_struct, require_marker=True)
        indicators = [(cite.full_start, cite.label.to_list())
                      for cite in cited]

        text = text.lower()
        label_list = label_struct.to_list()
        #   "part" maps to the first label component, "section" to the
        #   first two and "paragraph" to the complete label
        for pattern, scope_label in ((Terms.part_re, label_list[:1]),
                                     (Terms.sect_re, label_list[:2]),
                                     (Terms.par_re, label_list)):
            indicators.extend((found.start(), scope_label)
                              for found in pattern.finditer(text))

        #   A subpart indicator expands into one entry per label returned
        #   by subpart_scope
        for found in Terms.subpart_re.finditer(text):
            position = found.start()
            for subpart_label in self.subpart_scope(label_list):
                indicators.append((position, subpart_label))

        #   Keep an indicator when prefix verification is off, or when the
        #   text leading up to it matches one of the scope patterns
        scopes = []
        for start, label in indicators:
            prefix = text[:start]
            if (not verify_prefix or Terms.scope_re.match(prefix)
                    or Terms.scope_used_re.match(prefix)):
                scopes.append(label)

        if scopes:
            #   Add interpretation to scopes
            interps = [s + [struct.Node.INTERP_MARK] for s in scopes]
            return [tuple(s) for s in scopes + interps]
 def test_section_ref_in_appendix(self):
     """A section citation found while parsing appendix text resolves to
     the cited section's label."""
     context = Label(part='1005', appendix='A', appendix_section='2',
                     p1='a')
     text = u"""(a) Something something § 1005.7(b)(1)."""
     found = internal_citations(text, context)
     self.assertEqual(found[0].label.to_list(), ['1005', '7', 'b', '1'])
def segment_tree(text, part, parent_label):
    """Parse one interpretation segment (for a section, paragraph, or
    appendix) into a tree."""
    title, body = utils.title_body(text)

    # Spans of the body occupied by citations are passed to the parser as
    # exclusions
    body_citations = internal_citations(body, Label(part=parent_label[0]))
    exclude = [(cite.full_start, cite.full_end) for cite in body_citations]

    # The node's label is derived from the segment's title
    title_labels = text_to_labels(title, Label(part=part, comment=True))
    label = merge_labels(title_labels)
    return interpParser.build_tree(body, 1, exclude, label, title)
 def test_single_match_multiple_paragraphs4(self):
     """Two full section citations joined by "and" are both detected,
     each with its own part and section."""
     text = "Listing sections 11.55(d) and 321.11 (h)(4)"
     citations = internal_citations(text, Label(part='222', section='5'))
     self.assertEqual(2, len(citations))
     first, second = citations
     self.assertEqual(['11', '55', 'd'], first.label.to_list())
     self.assertEqual(to_text(first, text), '11.55(d)')
     self.assertEqual(['321', '11', 'h', '4'], second.label.to_list())
     self.assertEqual(to_text(second, text), '321.11 (h)(4)')
Exemple #6
0
def add_spaces_to_title(title):
    """Insert a space after a citation in an SxS title when the Federal
    Register ran the citation straight into the following word."""
    for cite in internal_citations(title, Label()):
        cut = cite.end
        if cut >= len(title):
            continue
        # Only split when a letter directly follows the citation and the
        # citation itself does not already end in a space
        if title[cut].isalpha() and title[cut - 1] != ' ':
            # Assumes there is only one paragraph in a title
            return title[:cut] + ' ' + title[cut:]
    return title
Exemple #7
0
def add_spaces_to_title(title):
    """Repair SxS titles in which the Federal Register dropped the space
    between a citation and the word following it."""
    # Find the end of the first citation which is immediately followed by
    # a letter and does not itself end with a space
    split_at = next(
        (cite.end for cite in internal_citations(title, Label())
         if cite.end < len(title)
         and title[cite.end].isalpha()
         and title[cite.end - 1] != ' '),
        None)
    if split_at is None:
        return title
    # Assumes there is only one paragraph in a title
    return title[:split_at] + ' ' + title[split_at:]
Exemple #8
0
def parse_into_labels(txt, part):
    """Return the dash-joined label of every citation found in the text
    (there may be several part+section+(paragraph) references)."""
    found = [cite.label
             for cite in internal_citations(txt, Label(part=part))]
    # odd corner case: headers shouldn't include both an appendix and
    # regtext -- when any appendix label is present, keep only those
    appendix_only = [lbl for lbl in found if 'appendix' in lbl.settings]
    if appendix_only:
        found = appendix_only
    return ['-'.join(lbl.to_list()) for lbl in found]
Exemple #9
0
def parse_into_labels(txt, part):
    """Work out which part+section+(paragraph) labels the text refers to;
    one dash-joined string is returned per citation found."""
    def in_appendix(label):
        return 'appendix' in label.settings

    labels = []
    for citation in internal_citations(txt, Label(part=part)):
        labels.append(citation.label)

    # odd corner case: headers shouldn't include both an appendix and
    # regtext, so appendix labels win whenever at least one is present
    if any(in_appendix(label) for label in labels):
        labels = [label for label in labels if in_appendix(label)]

    return ['-'.join(label.to_list()) for label in labels]
    def test_single_match_multiple_paragraphs2(self):
        """A single citation phrase which fans out over several paragraphs
        (or sections) yields one citation per target."""
        def cite(text):
            # Every lookup uses a fresh label for part 222, section 5
            return internal_citations(text, Label(part='222', section='5'))

        # The first case also checks the exact text each citation covers
        text = u'§ 1005.10(a) and (d)'
        citations = cite(text)
        self.assertEqual(2, len(citations))
        first, second = citations
        self.assertEqual(['1005', '10', 'a'], first.label.to_list())
        self.assertEqual(to_text(first, text), '1005.10(a)')
        self.assertEqual(['1005', '10', 'd'], second.label.to_list())
        self.assertEqual(to_text(second, text), '(d)')

        # (text, expected number of citations, expected labels in order);
        # an empty label list means only the count is checked
        cases = [
            (u'§ 1005.7(b)(1), (2) and (3)', 3,
             [['1005', '7', 'b', '1'],
              ['1005', '7', 'b', '2'],
              ['1005', '7', 'b', '3']]),
            (u'§ 1005.15(d)(1)(i) and (ii)', 2,
             [['1005', '15', 'd', '1', 'i'],
              ['1005', '15', 'd', '1', 'ii']]),
            (u'§ 1005.9(a)(5) (i), (ii), or (iii)', 3,
             [['1005', '9', 'a', '5', 'i'],
              ['1005', '9', 'a', '5', 'ii'],
              ['1005', '9', 'a', '5', 'iii']]),
            (u'§ 1005.11(a)(1)(vi) or (vii).', 2,
             [['1005', '11', 'a', '1', 'vi'],
              ['1005', '11', 'a', '1', 'vii']]),
            (u'§§ 1005.3(b)(2) and (3), 1005.10(b), (d), and (e), 1005.13, '
             + 'and 1005.20', 7, []),
            ('Sections 1005.3, .4, and .5', 3,
             [['1005', '3'], ['1005', '4'], ['1005', '5']]),
        ]
        for text, count, labels in cases:
            citations = cite(text)
            self.assertEqual(count, len(citations))
            for position, label in enumerate(labels):
                self.assertEqual(label, citations[position].label.to_list())
def generate_keyterm(node):
    """Record a key term for ``node`` in the surrounding ``layer`` mapping.

    When ``real_key_terms_layer`` already has an entry for the node's label
    id, that entry is copied over unchanged. Otherwise the first "sentence"
    of the node's processed text is used as the key term, provided that:
      * the node is not an appendix node;
      * the text contains a period outside of any excluded span (parser
        exclusions and internal citations), or ends with an em dash;
      * the sentence is not the entire text of a childless node;
      * its last word is not ``part_end`` and it does not start with '![';
      * it is at most 15 words long.
    Nothing is returned; ``layer`` is updated in place."""
    label_id = node.label_id()
    if label_id in real_key_terms_layer:
        layer[label_id] = real_key_terms_layer[label_id]
        return

    node_text = key_terms.KeyTerms.process_node_text(node)
    if not node_text:
        return

    # Our Appendix parsing isn't particularly accurate -- avoid keyterms
    if node.node_type == struct.Node.APPENDIX:
        return

    exclude = [(start, end) for _, start, end in
               exclude_parser.scanString(node_text)]
    exclude.extend((pc.full_start, pc.full_end) for pc in
                   internal_citations(node_text, Label()))

    # Remove any periods which are part of a citation. This has to be a
    # real list: it is tested for emptiness and indexed below, neither of
    # which works on the lazy iterator ``filter`` returns on Python 3.
    periods = [m.start() for m in period.finditer(node_text)
               if all(m.start() < start or m.start() > end
                      for start, end in exclude)]

    # Key terms must either have a full "sentence" or end with a hyphen
    if not periods and node_text[-1] != u'—':
        return

    if periods:
        first_p = periods[0]
        # Check for cases where the period is "inside" something;
        # include the period
        next_char = node_text[first_p + 1: first_p + 2]
        if next_char in (')', u'”'):
            first_sentence = node_text[:first_p + 2]
        else:
            first_sentence = node_text[:first_p + 1]
    else:
        first_sentence = node_text

    # Key terms can't be the entire text of a leaf node
    if first_sentence == node_text and not node.children:
        return

    words = first_sentence.split()
    if words[-1] == part_end or first_sentence.startswith('!['):
        return

    # key terms are short
    if len(words) <= 15:
        layer[label_id] = [{
            "key_term": first_sentence,
            "locations": [0]
        }]
Exemple #12
0
def generate_keyterm(node):
    """Derive a key term for ``node`` and store it in the surrounding
    ``layer`` mapping (nothing is returned).

    An entry already present in ``real_key_terms_layer`` for the node's
    label id is reused as-is. Otherwise the first "sentence" of the node's
    processed text becomes the key term, unless the node is an appendix
    node, the text has neither a usable period nor a trailing em dash, the
    sentence is the whole text of a childless node, the sentence ends with
    ``part_end`` or starts with '![', or it is longer than 15 words."""
    label_id = node.label_id()
    if label_id in real_key_terms_layer:
        layer[label_id] = real_key_terms_layer[label_id]
        return

    node_text = key_terms.KeyTerms.process_node_text(node)
    if not node_text:
        return

    # Our Appendix parsing isn't particularly accurate -- avoid keyterms
    if node.node_type == struct.Node.APPENDIX:
        return

    exclude = [(start, end) for _, start, end in
               exclude_parser.scanString(node_text)]
    exclude.extend((pc.full_start, pc.full_end) for pc in
                   internal_citations(node_text, Label()))

    def outside_exclusions(position):
        # True when the position lies strictly outside every excluded span
        return all(position < start or position > end
                   for start, end in exclude)

    # Remove any periods which are part of a citation. Built as a list
    # (rather than with ``filter``) because it is checked for emptiness and
    # indexed below; on Python 3 a filter object supports neither.
    periods = [m.start() for m in period.finditer(node_text)
               if outside_exclusions(m.start())]

    # Key terms must either have a full "sentence" or end with a hyphen
    if not periods and node_text[-1] != u'—':
        return

    if periods:
        first_p = periods[0]
        # Check for cases where the period is "inside" something;
        # include the period
        next_char = node_text[first_p + 1: first_p + 2]
        if next_char in (')', u'”'):
            first_sentence = node_text[:first_p + 2]
        else:
            first_sentence = node_text[:first_p + 1]
    else:
        first_sentence = node_text

    # Key terms can't be the entire text of a leaf node
    if first_sentence == node_text and not node.children:
        return

    words = first_sentence.split()
    if words[-1] == part_end or first_sentence.startswith('!['):
        return

    # key terms are short
    if len(words) <= 15:
        layer[label_id] = [{
            "key_term": first_sentence,
            "locations": [0]
        }]
Exemple #13
0
def is_child_of(child_xml, header_xml, header_citations=None):
    """Decide whether ``child_xml`` belongs underneath ``header_xml``.

    Non-header elements (tag other than 'HD') are always children. A
    header element is a child when its 'SOURCE' attribute compares greater
    than the header's, when the header has citations and the child has
    none, or when both carry the same citations. When both have differing
    citations the decision is delegated to ``is_backtrack``; in every
    other case the element is not a child.

    ``header_citations`` may be supplied to avoid re-parsing the header
    text; when None it is computed from ``header_xml.text``."""
    if child_xml.tag != 'HD':
        return True

    if header_citations is None:
        header_citations = [
            cite.label
            for cite in internal_citations(header_xml.text, Label())]
    child_citations = [
        cite.label for cite in internal_citations(child_xml.text, Label())]

    if child_xml.get('SOURCE') > header_xml.get('SOURCE'):
        return True
    if not header_citations:
        # Nothing to compare against
        return False
    if not child_citations or header_citations == child_citations:
        return True
    # Both sides have (different) citations
    return is_backtrack(header_citations[-1].to_list(),
                        child_citations[0].to_list())
    def test_multiple_matches(self):
        """Several appendix citations in one sentence are each found, as
        are both letters of a plural "Appendices X and Y" reference."""
        text = "Please see A-5 and Q-2(r) and Z-12(g)(2)(ii) then more text"
        citations = internal_citations(text, Label(part='102', section='1'))
        self.assertEqual(3, len(citations))
        # (expected label, expected matched text) for each citation in order
        expectations = [
            (['102', 'A', '5'], 'A-5'),
            (['102', 'Q', '2(r)'], 'Q-2(r)'),
            (['102', 'Z', '12(g)(2)(ii)'], 'Z-12(g)(2)(ii)'),
        ]
        for citation, (label, matched) in zip(citations, expectations):
            self.assertEqual(citation.label.to_list(), label)
            self.assertEqual(to_text(citation, text), matched)

        text = u"Appendices G and H—Yadda yadda"
        citations = internal_citations(text, Label(part='102'))
        self.assertEqual(2, len(citations))
        first_appendix, second_appendix = citations
        self.assertEqual(first_appendix.label.to_list(), ['102', 'G'])
        self.assertEqual(second_appendix.label.to_list(), ['102', 'H'])
    def test_multiple_matches(self):
        """Each appendix citation in a sentence is detected on its own, and
        a plural "Appendices" reference produces one citation per letter."""
        sentence = "Please see A-5 and Q-2(r) and Z-12(g)(2)(ii) then more text"
        found = internal_citations(sentence, Label(part='102', section='1'))
        self.assertEqual(3, len(found))

        def check(citation, label, matched):
            # Verify both the resolved label and the text span it covers
            self.assertEqual(citation.label.to_list(), label)
            self.assertEqual(to_text(citation, sentence), matched)

        check(found[0], ['102', 'A', '5'], 'A-5')
        check(found[1], ['102', 'Q', '2(r)'], 'Q-2(r)')
        check(found[2], ['102', 'Z', '12(g)(2)(ii)'], 'Z-12(g)(2)(ii)')

        heading = u"Appendices G and H—Yadda yadda"
        found = internal_citations(heading, Label(part='102'))
        self.assertEqual(2, len(found))
        for citation, letter in zip(found, ['G', 'H']):
            self.assertEqual(citation.label.to_list(), ['102', letter])
 def test_single_match_multiple_paragraphs5(self):
     """Two comment citations joined by "and" each resolve to an
     interpretation label within the current part."""
     text = "See, e.g., comments 31(b)(1)(iv)-1 and 31(b)(1)(vi)-1"
     citations = internal_citations(text, Label(part='222', section='5'))
     self.assertEqual(2, len(citations))
     # (expected label, expected matched text) for each citation in order
     expectations = [
         (['222', '31', 'b', '1', 'iv', 'Interp', '1'], '31(b)(1)(iv)-1'),
         (['222', '31', 'b', '1', 'vi', 'Interp', '1'], '31(b)(1)(vi)-1'),
     ]
     for citation, (label, matched) in zip(citations, expectations):
         self.assertEqual(label, citation.label.to_list())
         self.assertEqual(to_text(citation, text), matched)
 def test_single_match_multiple_paragraphs6(self):
     """A "through" range of comments ("-1 through -3") yields a citation
     for each end of the range."""
     text = "comments 5(b)(3)-1 through -3"
     citations = internal_citations(text, Label(part='100', section='5'))
     # Check the count before indexing, so that a parser regression shows
     # up as an assertion failure rather than an IndexError
     self.assertEqual(2, len(citations))
     citation = citations[0]
     self.assertEqual(['100', '5', 'b', '3', 'Interp', '1'],
                      citation.label.to_list())
     self.assertEqual(to_text(citation, text), '5(b)(3)-1')
     citation = citations[1]
     self.assertEqual(['100', '5', 'b', '3', 'Interp', '3'],
                      citation.label.to_list())
     self.assertEqual(to_text(citation, text), '-3')
 def test_single_match_multiple_paragraphs6(self):
     """Both ends of a comment range written as "-1 through -3" are
     returned as separate interpretation citations."""
     text = "comments 5(b)(3)-1 through -3"
     citations = internal_citations(text, Label(part='100', section='5'))
     # The length is asserted first: indexing an unexpectedly short result
     # would raise IndexError and hide the real failure
     self.assertEqual(2, len(citations))
     range_start, range_end = citations
     self.assertEqual(['100', '5', 'b', '3', 'Interp', '1'],
                      range_start.label.to_list())
     self.assertEqual(to_text(range_start, text), '5(b)(3)-1')
     self.assertEqual(['100', '5', 'b', '3', 'Interp', '3'],
                      range_end.label.to_list())
     self.assertEqual(to_text(range_end, text), '-3')
 def test_single_match_multiple_paragraphs5(self):
     """A pair of comment citations in one sentence yields two
     interpretation labels, each covering its own span of the text."""
     text = "See, e.g., comments 31(b)(1)(iv)-1 and 31(b)(1)(vi)-1"
     citations = internal_citations(text, Label(part='222', section='5'))
     self.assertEqual(2, len(citations))
     first, second = citations
     self.assertEqual(['222', '31', 'b', '1', 'iv', 'Interp', '1'],
                      first.label.to_list())
     self.assertEqual(to_text(first, text), '31(b)(1)(iv)-1')
     self.assertEqual(['222', '31', 'b', '1', 'vi', 'Interp', '1'],
                      second.label.to_list())
     self.assertEqual(to_text(second, text), '31(b)(1)(vi)-1')
    def parse(self, text, label):
        """Parse the provided text, pulling out all the internal
        (self-referential) citations.

        Only citations with a marker are requested. When
        ``self.verify_citations`` is set, the citations are first passed
        through ``self.remove_missing_citations``. Each remaining citation
        becomes a dict holding its offsets and label list, and the result
        of ``self.strip_whitespace`` on the text and those dicts is
        returned."""

        def to_layer(pc):
            # A named function rather than a lambda bound to a variable
            return {'offsets': [(pc.start, pc.end)],
                    'citation': pc.label.to_list()}

        citations = internal_citations(text, label, require_marker=True)
        if self.verify_citations:
            citations = self.remove_missing_citations(citations, text)
        all_citations = [to_layer(pc) for pc in citations]

        return self.strip_whitespace(text, all_citations)
def build_section_tree(text, part):
    """Build the paragraph tree for an entire section. The section's
    title is assumed to contain a "<part>.<section>" identifier."""
    title, body = utils.title_body(text)

    # Citation spans are handed to the parser as exclusions
    citation_spans = []
    for cite in internal_citations(body, Label(part=part)):
        citation_spans.append((cite.full_start, cite.full_end))

    # Pull the section number out of the title
    section_match = re.search(r'%d\.(\d+)\b' % part, title)
    section_number = section_match.group(1)

    return regParser.build_tree(
        body, exclude=citation_spans, label=[str(part), section_number],
        title=title)
    def test_single_match_multiple_paragraphs2(self):
        """One citation phrase listing several paragraphs or sections
        produces a separate citation for each of them."""
        def cite(text):
            # Every lookup uses a fresh label for part 222, section 5
            return internal_citations(text, Label(part='222', section='5'))

        # The first case also checks the exact text each citation covers
        text = u'§ 1005.10(a) and (d)'
        citations = cite(text)
        self.assertEqual(2, len(citations))
        first, second = citations
        self.assertEqual(['1005', '10', 'a'], first.label.to_list())
        self.assertEqual(to_text(first, text), '1005.10(a)')
        self.assertEqual(['1005', '10', 'd'], second.label.to_list())
        self.assertEqual(to_text(second, text), '(d)')

        # (text, expected number of citations, expected labels in order);
        # an empty label list means only the count is checked
        cases = [
            (u'§ 1005.7(b)(1), (2) and (3)', 3,
             [['1005', '7', 'b', '1'],
              ['1005', '7', 'b', '2'],
              ['1005', '7', 'b', '3']]),
            (u'§ 1005.15(d)(1)(i) and (ii)', 2,
             [['1005', '15', 'd', '1', 'i'],
              ['1005', '15', 'd', '1', 'ii']]),
            (u'§ 1005.9(a)(5) (i), (ii), or (iii)', 3,
             [['1005', '9', 'a', '5', 'i'],
              ['1005', '9', 'a', '5', 'ii'],
              ['1005', '9', 'a', '5', 'iii']]),
            (u'§ 1005.11(a)(1)(vi) or (vii).', 2,
             [['1005', '11', 'a', '1', 'vi'],
              ['1005', '11', 'a', '1', 'vii']]),
            (u'§§ 1005.3(b)(2) and (3), 1005.10(b), (d), and (e), 1005.13, '
             + 'and 1005.20', 7, []),
            ('Sections 1005.3, .4, and .5', 3,
             [['1005', '3'], ['1005', '4'], ['1005', '5']]),
        ]
        for text, count, labels in cases:
            citations = cite(text)
            self.assertEqual(count, len(citations))
            for position, label in enumerate(labels):
                self.assertEqual(label, citations[position].label.to_list())
    def parse(self, text, label, title=None):
        """Parse the provided text, pulling out all the internal
        (self-referential) citations.

        ``title`` is forwarded unchanged to ``internal_citations``, and
        only citations with a marker are requested. When
        ``self.verify_citations`` is set, the citations are first passed
        through ``self.remove_missing_citations``. Each remaining citation
        becomes a dict holding its offsets and label list, and the result
        of ``self.strip_whitespace`` on the text and those dicts is
        returned."""

        def to_layer(pc):
            # A named function rather than a lambda bound to a variable
            return {'offsets': [(pc.start, pc.end)],
                    'citation': pc.label.to_list()}

        citations = internal_citations(text, label,
                                       require_marker=True, title=title)
        if self.verify_citations:
            citations = self.remove_missing_citations(citations, text)
        all_citations = [to_layer(pc) for pc in citations]

        return self.strip_whitespace(text, all_citations)
Exemple #24
0
def split_into_ttsr(sxs):
    """Break a list of xml nodes into four pieces: the title node (the
    first element), the text nodes that follow it, the nodes making up
    this header's sub sections, and whatever nodes remain."""
    title = sxs[0]
    # Parsed once here and passed along to every is_child_of call
    title_citations = [
        cite.label for cite in internal_citations(title.text, Label())]

    def belongs_to_title(element):
        return is_child_of(element, title, title_citations)

    section = list(takewhile(belongs_to_title, sxs[1:]))

    # Leading non-header elements are the section's own text
    text_elements = []
    for element in section:
        if element.tag == 'HD':
            break
        text_elements.append(element)

    sub_sections = section[len(text_elements):]
    # Everything after the title and its section
    remaining = sxs[1 + len(section):]
    return (title, text_elements, sub_sections, remaining)
def build_section_tree(text, part):
    """Turn the text of a whole section into a tree. The title is
    expected to start with a "<part>.<section>" identifier."""
    title, body = utils.title_body(text)

    # Character ranges covered by citations, passed on as exclusions
    body_citations = internal_citations(body, Label(part=part))
    exclude = [(cite.full_start, cite.full_end) for cite in body_citations]

    # The section number follows "<part>." in the title
    section = re.search(r'%d\.(\d+)\b' % part, title).group(1)
    tree_kwargs = {
        'exclude': exclude,
        'label': [str(part), section],
        'title': title,
    }
    return regParser.build_tree(body, **tree_kwargs)
    def parse(self, text, label, title=None):
        """Extract every internal (self-referential) citation from the
        text and return the layer entries for them.

        Citations which fail verification are dropped first when
        ``self.verify_citations`` is set; the final result comes from
        ``self.strip_whitespace``."""
        found = internal_citations(text, label, require_marker=True,
                                   title=title)
        if self.verify_citations:
            found = self.remove_missing_citations(found, text)

        # One layer entry per citation: its offsets plus its label as a list
        layer_entries = [
            {'offsets': [(cite.start, cite.end)],
             'citation': cite.label.to_list()}
            for cite in found
        ]
        return self.strip_whitespace(text, layer_entries)
    def test_interp_headers(self):
        """Each interpretation-style header contains exactly one citation,
        which resolves to the expected label."""
        # (header text, expected label)
        cases = [
            ("Section 102.22Stuff", ['102', '22']),
            ("22(d) Content", ['101', '22', 'd']),
            ("22(d)(5) Content", ['101', '22', 'd', '5']),
            ("22(d)(5)(x) Content", ['101', '22', 'd', '5', 'x']),
            (u"§ 102.22(d)(5)(x) Content", ['102', '22', 'd', '5', 'x']),
            ("22(d)(5)(x)(Q) Content", ['101', '22', 'd', '5', 'x', 'Q']),
            ("Appendix A Heading", ['101', 'A']),
            ("Comment 21(c)-1 Heading", ['101', '21', 'c', 'Interp', '1']),
            ("Paragraph 38(l)(7)(i)(A)(2).",
             ['101', '38', 'l', '7', 'i', 'A', '2']),
            (u'Official Interpretations of § 102.33(c)(2)',
             ['102', '33', 'c', '2', 'Interp']),
        ]
        for header, expected in cases:
            found = internal_citations(header, Label(part='101'))
            self.assertEqual(1, len(found))
            self.assertEqual(found[0].label.to_list(), expected)
Exemple #28
0
def paragraph_tree(appendix_letter, sections, text, label, title=None):
    """Run the paragraph parser over every section of this appendix and
    return the appendix node. Without sections, the whole text becomes a
    single appendix node with no children."""
    if not sections:
        return Node(text, label=label, title=title, node_type=Node.APPENDIX)

    def build_child(begin, end):
        # Parse one (begin, end) slice of the appendix into a child tree
        seg_title, section_text = utils.title_body(text[begin:end])
        sec_num = carving.get_appendix_section_number(
            seg_title, appendix_letter)
        section_citations = internal_citations(
            section_text, Label(part=label[0]))
        exclude = [(cite.full_start, cite.full_end)
                   for cite in section_citations]
        return parParser.build_tree(
            section_text, exclude=exclude, label=label + [sec_num],
            title=seg_title)

    children = [build_child(begin, end) for begin, end in sections]
    # Text ahead of the first section stays on the appendix node itself
    intro_text = text[:sections[0][0]]
    return Node(intro_text, children, label, title, Node.APPENDIX)
    def test_interp_headers(self):
        """Headers as they appear in interpretations each yield a single
        citation carrying the expected label."""
        expected_labels = (
            ("Section 102.22Stuff", ['102', '22']),
            ("22(d) Content", ['101', '22', 'd']),
            ("22(d)(5) Content", ['101', '22', 'd', '5']),
            ("22(d)(5)(x) Content", ['101', '22', 'd', '5', 'x']),
            (u"§ 102.22(d)(5)(x) Content", ['102', '22', 'd', '5', 'x']),
            ("22(d)(5)(x)(Q) Content", ['101', '22', 'd', '5', 'x', 'Q']),
            ("Appendix A Heading", ['101', 'A']),
            ("Comment 21(c)-1 Heading", ['101', '21', 'c', 'Interp', '1']),
            ("Paragraph 38(l)(7)(i)(A)(2).",
             ['101', '38', 'l', '7', 'i', 'A', '2']),
            (u'Official Interpretations of § 102.33(c)(2)',
             ['102', '33', 'c', '2', 'Interp']),
        )
        for heading, wanted in expected_labels:
            citations = internal_citations(heading, Label(part='101'))
            self.assertEqual(1, len(citations))
            only_citation = citations[0]
            self.assertEqual(only_citation.label.to_list(), wanted)
 def split_paragraph_text(self, text, next_text=''):
     """Cut the text at every collapsed first-marker which is not part of
     a citation. Returns the resulting pieces followed by ``next_text``."""
     #   text.index('(') to skip over the periods, spaces, etc.
     candidates = [text.index('(', found.start())
                   for marker in _first_markers
                   for found in marker.finditer(text)]

     #   Remove any citations
     citations = internal_citations(text, require_marker=True)

     def inside_citation(pos):
         return any(cit.start <= pos <= cit.end for cit in citations)

     outside = [pos for pos in candidates if not inside_citation(pos)]
     #   Drop Zeros, add the end
     break_points = [pos for pos in outside if pos]
     break_points.append(len(text))

     #   Each piece runs from the previous break point (or 0) to the next
     starts = [0] + break_points[:-1]
     texts = [text[begin:stop] for begin, stop in zip(starts, break_points)]
     texts.append(next_text)
     return texts
def split_paragraph_text(text):
    """Split text into a root node and its children (if the text contains
    collapsed markers). Markers which fall inside a citation are not
    treated as split points."""
    #   text.index('(') to skip over the periods, spaces, etc.
    candidates = [text.index('(', found.start())
                  for marker in _first_markers
                  for found in marker.finditer(text)]

    #   Remove any citations
    citations = internal_citations(text, require_marker=True)

    def inside_citation(pos):
        return any(cit.start <= pos <= cit.end for cit in citations)

    outside = [pos for pos in candidates if not inside_citation(pos)]
    #   Drop Zeros, add the end
    break_points = [pos for pos in outside if pos]
    break_points.append(len(text))

    #   Each piece runs from the previous break point (or 0) to the next
    starts = [0] + break_points[:-1]
    return [text[begin:stop] for begin, stop in zip(starts, break_points)]
    def test_single_references(self):
        """Sentences holding a single reference yield exactly one citation
        with the expected label and the expected full (marker included)
        text."""
        # (sentence, expected full citation text, expected label)
        cases = [
            ("The requirements in paragraph (a)(4)(iii) of",
             'paragraph (a)(4)(iii)', ['102', '6', 'a', '4', 'iii']),
            ("Creditors may comply with paragraphs (a)(6) of this section",
             'paragraphs (a)(6)', ['102', '6', 'a', '6']),
            (u"date in § 1005.20(h)(1) must disclose", u'§ 1005.20(h)(1)',
             ['1005', '20', 'h', '1']),
            ('(a) Solicited issuance. Except as provided in paragraph (b) ' +
             'of this section', 'paragraph (b)', ['102', '6', 'b']),
            ("And Section 222.87(d)(2)(i) says something",
             'Section 222.87(d)(2)(i)', ['222', '87', 'd', '2', 'i']),
            ("More in paragraph 22(a)(4).", "paragraph 22(a)(4)",
             ["102", "22", "a", "4"]),
            ("See comment 32(b)(3) blah blah", 'comment 32(b)(3)',
             ['102', '32', 'b', '3', 'Interp']),
            ("refer to comment 36(a)(2)-3 of thing", 'comment 36(a)(2)-3',
             ['102', '36', 'a', '2', 'Interp', '3']),
            ("See Appendix A-5", "Appendix A-5", ['102', 'A', '5']),
            ("See Appendix A-5(R)", "Appendix A-5(R)", ['102', 'A', '5(R)']),
            ("See comment 3(v)-1.v. Another", "comment 3(v)-1.v",
             ['102', '3', 'v', 'Interp', '1', 'v']),
            ("See the commentary to 3(b)(1)", 'commentary to 3(b)(1)',
             ['102', '3', 'b', '1', 'Interp']),
            ("See comment 3(b)(1)-1.v.", 'comment 3(b)(1)-1.v',
             ['102', '3', 'b', '1', 'Interp', '1', 'v']),
            ("See appendix G, part V.4.D.", 'appendix G, part V.4.D',
             ['102', 'G', 'V', '4', 'D']),
            ("See comment 3-1 for things", 'comment 3-1',
             ['102', '3', 'Interp', '1']),
        ]
        for sentence, full_text, expected in cases:
            context = Label(part='102', section='6')
            found = internal_citations(sentence, context)
            self.assertEqual(1, len(found))
            only_citation = found[0]
            self.assertEqual(only_citation.label.to_list(), expected)
            self.assertEqual(full_text, to_full_text(only_citation, sentence))
Exemple #33
0
def text_to_labels(text, initial_label, warn=True, force_start=False):
    """Turn interpretation header text into interpretation labels (e.g.
    22(a) becomes XXX-22-a-Interp).

    text: the header text to scan for citations.
    initial_label: label giving the context used to resolve citations.
    warn: log a warning when no label could be derived.
    force_start: only accept a citation located at the very beginning of
                 the (stripped) text.

    Returns a list of labels, each a list of strings ending with the
    interpretation mark; the list is empty when nothing could be derived.
    """
    ordered = sorted(internal_citations(text.strip(), initial_label),
                     key=lambda cit: cit.start)

    #   Only the earliest citation (plus its clauses, below) matters
    leading = ordered[:1]
    if force_start:
        leading = [cit for cit in leading if cit.full_start == 0]

    #   Paragraph markers at offset zero; used to infer from context
    pars_at_start = []
    for tokens, begin, _ in unified.any_depth_p.scanString(text):
        if begin == 0:
            pars_at_start.append(tokens)

    if leading:
        if leading[0].in_clause:
            #   Keep consuming citations while they remain in the clause
            for cit in ordered[1:]:
                if not cit.in_clause:
                    break
                leading.append(cit)
        return [cit.label.to_list() + [Node.INTERP_MARK] for cit in leading]

    appendix_comment = (initial_label.comment and pars_at_start
                        and initial_label.settings.get('appendix'))
    if appendix_comment:
        prefix = [initial_label.settings['part'],
                  initial_label.settings['appendix']]
        return [prefix + list(pars_at_start[0]) + [Node.INTERP_MARK]]

    if warn:
        logger.warning("Couldn't turn into label: " + text)
    return []
    def test_single_references(self):
        """Each sample text holds exactly one citation; verify the label it
        resolves to and the full text span it covers."""
        cases = [
            ("The requirements in paragraph (a)(4)(iii) of",
             'paragraph (a)(4)(iii)',
             ['102', '6', 'a', '4', 'iii']),
            ("Creditors may comply with paragraphs (a)(6) of this section",
             'paragraphs (a)(6)',
             ['102', '6', 'a', '6']),
            (u"date in § 1005.20(h)(1) must disclose",
             u'§ 1005.20(h)(1)',
             ['1005', '20', 'h', '1']),
            ('(a) Solicited issuance. Except as provided in paragraph (b) '
             'of this section',
             'paragraph (b)',
             ['102', '6', 'b']),
            ("And Section 222.87(d)(2)(i) says something",
             'Section 222.87(d)(2)(i)',
             ['222', '87', 'd', '2', 'i']),
            ("More in paragraph 22(a)(4).",
             "paragraph 22(a)(4)",
             ["102", "22", "a", "4"]),
            ("See comment 32(b)(3) blah blah",
             'comment 32(b)(3)',
             ['102', '32', 'b', '3', 'Interp']),
            ("refer to comment 36(a)(2)-3 of thing",
             'comment 36(a)(2)-3',
             ['102', '36', 'a', '2', 'Interp', '3']),
            ("See Appendix A-5",
             "Appendix A-5",
             ['102', 'A', '5']),
            ("See Appendix A-5(R)",
             "Appendix A-5(R)",
             ['102', 'A', '5(R)']),
            ("See comment 3(v)-1.v. Another",
             "comment 3(v)-1.v",
             ['102', '3', 'v', 'Interp', '1', 'v']),
            ("See the commentary to 3(b)(1)",
             'commentary to 3(b)(1)',
             ['102', '3', 'b', '1', 'Interp']),
            ("See comment 3(b)(1)-1.v.",
             'comment 3(b)(1)-1.v',
             ['102', '3', 'b', '1', 'Interp', '1', 'v']),
            ("See appendix G, part V.4.D.",
             'appendix G, part V.4.D',
             ['102', 'G', 'V', '4', 'D']),
            ("See comment 3-1 for things",
             'comment 3-1',
             ['102', '3', 'Interp', '1']),
        ]

        for sample, expected_link, expected_label in cases:
            found = internal_citations(
                sample, Label(part='102', section='6'))
            self.assertEqual(1, len(found))
            only = found[0]
            self.assertEqual(only.label.to_list(), expected_label)
            self.assertEqual(expected_link, to_full_text(only, sample))
def text_to_labels(text, initial_label, warn=True, force_start=False):
    """Derive interpretation labels from the header text used in
    interpretations (e.g. 22(a) becomes XXX-22-a-Interp).

    warn: when true, log a warning if the text yields no label.
    force_start: when true, the citation must sit at the *beginning* of
                 the (stripped) text to be accepted.

    Returns a list of label lists, each terminated by the interpretation
    mark, or an empty list if no label could be produced."""
    found = internal_citations(text.strip(), initial_label)
    by_position = sorted(found, key=lambda entry: entry.start)

    #   Start from the first citation only; clauses are appended later
    chosen = by_position[:1]
    if force_start:
        chosen = [entry for entry in chosen if entry.full_start == 0]

    #   Context fallback: paragraph markers matched at offset zero
    scan = unified.any_depth_p.scanString(text)
    opening_pars = [tokens for tokens, offset, _ in scan if offset == 0]

    if chosen:
        if chosen[0].in_clause:
            #   Pull in the following citations of the same conjunction
            for entry in by_position[1:]:
                if not entry.in_clause:
                    break
                chosen.append(entry)
        labels = []
        for entry in chosen:
            labels.append(entry.label.to_list() + [Node.INTERP_MARK])
        return labels

    if (initial_label.comment and opening_pars and
            initial_label.settings.get('appendix')):
        label = [initial_label.settings['part'],
                 initial_label.settings['appendix']]
        label = label + list(opening_pars[0]) + [Node.INTERP_MARK]
        return [label]

    if warn:
        logger.warning("Couldn't turn into label: " + text)
    return []
def split_paragraph_text(text):
    """Cut `text` into consecutive pieces at collapsed paragraph markers.

    The first piece is the root text and the remaining pieces are its
    children. Markers that fall inside a citation are not treated as
    break points. Returns the list of pieces in break-point order."""
    #   Text opening with a parenthesis uses the paren-style marker set
    if text.lstrip().startswith('('):
        patterns = _first_paren_markers
    else:
        patterns = _first_period_markers

    #   A marker begins where its first captured group begins
    candidates = [found.end() - len(found.group(1))
                  for pattern in patterns
                  for found in pattern.finditer(text)]

    #   Discard candidates located within a citation
    cited = internal_citations(text, require_marker=True)

    def inside_citation(position):
        return any(cit.start <= position <= cit.end for cit in cited)

    #   Position zero is never a split point; the end of the text always is
    cut_points = [position for position in candidates
                  if position and not inside_citation(position)]
    cut_points.append(len(text))

    pieces = []
    previous = 0
    for cut in cut_points:
        pieces.append(text[previous:cut])
        previous = cut
    return pieces
    def test_single_match_multiple_paragraphs1(self):
        """A single citation phrase listing several paragraphs yields one
        citation per paragraph, each with its own label and text span."""
        #   (text, Label keyword arguments, [(expected label, span), ...])
        scenarios = [
            ("the requirements of paragraphs (c)(3), (d)(2), (e)(1), "
             "(e)(3), and (f) of this section",
             {'part': '222', 'section': '5'},
             [(['222', '5', 'c', '3'], '(c)(3)'),
              (['222', '5', 'd', '2'], '(d)(2)'),
              (['222', '5', 'e', '1'], '(e)(1)'),
              (['222', '5', 'e', '3'], '(e)(3)'),
              (['222', '5', 'f'], '(f)')]),
            ("set forth in paragraphs (b)(1) or (b)(2)",
             {'part': '222', 'section': '5'},
             [(['222', '5', 'b', '1'], '(b)(1)'),
              (['222', '5', 'b', '2'], '(b)(2)')]),
            ('paragraphs (c)(1) and (2) of this section',
             {'part': '222', 'section': '5'},
             [(['222', '5', 'c', '1'], '(c)(1)'),
              (['222', '5', 'c', '2'], '(2)')]),
            ('paragraphs (b)(1)(ii) and (iii)',
             {'part': '222', 'section': '5'},
             [(['222', '5', 'b', '1', 'ii'], '(b)(1)(ii)'),
              (['222', '5', 'b', '1', 'iii'], '(iii)')]),
            ('see paragraphs (z)(9)(vi)(A) and (D)',
             {'part': '222', 'section': '5'},
             [(['222', '5', 'z', '9', 'vi', 'A'], '(z)(9)(vi)(A)'),
              (['222', '5', 'z', '9', 'vi', 'D'], '(D)')]),
            ('see 32(d)(6) and (7) Content content',
             {'part': '222'},
             [(['222', '32', 'd', '6'], '32(d)(6)'),
              (['222', '32', 'd', '7'], '(7)')]),
        ]

        for text, label_kwargs, expected in scenarios:
            citations = internal_citations(text, Label(**label_kwargs))
            self.assertEqual(len(expected), len(citations))
            for citation, (label, span) in zip(citations, expected):
                self.assertEqual(label, citation.label.to_list())
                self.assertEqual(to_text(citation, text), span)
 def test_single_reference_false_positives(self):
     """A paragraph marker following "See the commentary." must not be
     reported as a citation."""
     context = Label(part='102', section='1')
     found = internal_citations(
         "See the commentary. (a) child paragraph", context)
     self.assertEqual(0, len(found))
 def test_single_match_multiple_p_false_positives(self):
     """Dashed numbers followed by a new paragraph marker must not be
     reported as citations."""
     context = Label(part='100', section='4')
     found = internal_citations(
         "-9 text and stuff -2. (b) new thing", context)
     self.assertEqual(0, len(found))
 def test_single_match_multiple_paragraphs8(self):
     """A section citation with a dashed paragraph range yields two
     citations."""
     context = Label(part='100', section='2')
     found = internal_citations(u'§ 105.2(a)(1)-(3)', context)
     self.assertEqual(2, len(found))
 def test_single_match_multiple_paragraphs8(self):
     """Expect two citations from a section reference whose paragraphs
     are written as a dashed range."""
     sample = u'§ 105.2(a)(1)-(3)'
     count = len(internal_citations(
         sample, Label(part='100', section='2')))
     self.assertEqual(2, count)
 def test_single_match_multiple_p_false_positives(self):
     """Expect no citations from text made of dashed numbers followed by
     a new paragraph marker."""
     sample = "-9 text and stuff -2. (b) new thing"
     count = len(internal_citations(
         sample, Label(part='100', section='4')))
     self.assertEqual(0, count)
Exemple #43
0
def parse_into_labels(txt, part):
    """Return one dash-joined label string for every citation found in
    `txt`; citations are resolved against a label built from `part`.
    Several citations produce several strings."""
    joined = []
    for found in internal_citations(txt, Label(part=part)):
        joined.append('-'.join(found.label.to_list()))
    return joined
 def test_single_reference_false_positives(self):
     """Expect no citations when a paragraph marker merely follows the
     sentence "See the commentary."."""
     sample = "See the commentary. (a) child paragraph"
     count = len(internal_citations(
         sample, Label(part='102', section='1')))
     self.assertEqual(0, count)
    def test_single_match_multiple_paragraphs1(self):
        """Phrases that cite several paragraphs at once must produce one
        citation per paragraph, with the expected label and text span."""
        in_section = {'part': '222', 'section': '5'}
        part_only = {'part': '222'}

        #   Each entry: text, Label keyword arguments, expected
        #   (label, span) pairs in citation order
        expectations = (
            ("the requirements of paragraphs (c)(3), (d)(2), (e)(1), "
             "(e)(3), and (f) of this section",
             in_section,
             ((['222', '5', 'c', '3'], '(c)(3)'),
              (['222', '5', 'd', '2'], '(d)(2)'),
              (['222', '5', 'e', '1'], '(e)(1)'),
              (['222', '5', 'e', '3'], '(e)(3)'),
              (['222', '5', 'f'], '(f)'))),
            ("set forth in paragraphs (b)(1) or (b)(2)",
             in_section,
             ((['222', '5', 'b', '1'], '(b)(1)'),
              (['222', '5', 'b', '2'], '(b)(2)'))),
            ('paragraphs (c)(1) and (2) of this section',
             in_section,
             ((['222', '5', 'c', '1'], '(c)(1)'),
              (['222', '5', 'c', '2'], '(2)'))),
            ('paragraphs (b)(1)(ii) and (iii)',
             in_section,
             ((['222', '5', 'b', '1', 'ii'], '(b)(1)(ii)'),
              (['222', '5', 'b', '1', 'iii'], '(iii)'))),
            ('see paragraphs (z)(9)(vi)(A) and (D)',
             in_section,
             ((['222', '5', 'z', '9', 'vi', 'A'], '(z)(9)(vi)(A)'),
              (['222', '5', 'z', '9', 'vi', 'D'], '(D)'))),
            ('see 32(d)(6) and (7) Content content',
             part_only,
             ((['222', '32', 'd', '6'], '32(d)(6)'),
              (['222', '32', 'd', '7'], '(7)'))),
        )

        for sample, label_args, wanted in expectations:
            found = internal_citations(sample, Label(**label_args))
            self.assertEqual(len(wanted), len(found))
            for cit, (wanted_label, wanted_span) in zip(found, wanted):
                self.assertEqual(wanted_label, cit.label.to_list())
                self.assertEqual(to_text(cit, sample), wanted_span)