def test_labels_until_sections(self):
     """We can fill in sections"""
     start = Label(cfr_title='11', part='222', section='33')
     end = Label(cfr_title='11', part='222', section='36')
     self.assertEqual(list(start.labels_until(end)),
                      [Label(cfr_title='11', part='222', section='34'),
                       Label(cfr_title='11', part='222', section='35')])
 def test_determine_schema(self):
     self.assertEqual(Label.app_sect_schema,
                      Label.determine_schema({'appendix_section': '1'}))
     self.assertEqual(Label.app_schema,
                      Label.determine_schema({'appendix': 'A'}))
     self.assertEqual(Label.regtext_schema,
                      Label.determine_schema({'section': '12'}))
     self.assertEqual(None, Label.determine_schema({}))
 def test_labels_until_paragraphs(self):
     """We can fill in paragraphs"""
     start = Label(cfr_title='11', part='222', section='33', p1='a', p2='2')
     end = Label(cfr_title='11', part='222', section='33', p1='a', p2='6')
     self.assertEqual(
         list(start.labels_until(end)),
         [Label(cfr_title='11', part='222', section='33', p1='a', p2='3'),
          Label(cfr_title='11', part='222', section='33', p1='a', p2='4'),
          Label(cfr_title='11', part='222', section='33', p1='a', p2='5')])
Example No. 5
def segment_tree(text, part, parent_label):
    """Build a tree representing the interpretation of a section, paragraph,
    or appendix."""
    title, body = utils.title_body(text)
    exclude = [(pc.full_start, pc.full_end) for pc in
               internal_citations(body, Label(part=parent_label[0]))]

    label = merge_labels(text_to_labels(title, Label(part=part, comment=True)))
    return interpParser.build_tree(body, 1, exclude, label, title)
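
A minimal usage sketch (hypothetical interpretation text; assumes the
project-level helpers used above, such as utils and interpParser, are
importable):

interp_text = ("Section 1005.2 Definitions\n"
               "2(a) Access Device\n"
               "1. Examples of access devices.")
node = segment_tree(interp_text, '1005', ['1005'])  # parent_label is a list
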
    def test_to_list(self):
        label = Label(part='222', section='11', p1='c', p2='2')
        self.assertEqual(['222', '11', 'c', '2'], label.to_list())

        label = Label(part='222', p1='d', appendix='R3')
        self.assertEqual(['222', 'R3', 'd'], label.to_list())

        label = Label(part='222', p1='d', appendix='R', appendix_section='4')
        self.assertEqual(['222', 'R', '4', 'd'], label.to_list())
def _p_with_label_in_child(xml_node):
    """E.g. <P><E>22(a)</E>.</P>"""
    children = xml_node.getchildren()
    return (xml_node.tag.upper() == 'P' and not (xml_node.text or '').strip()
            and len(children) == 1
            and not (children[0].tail or '').strip(" \n\t.")
            and text_to_labels(children[0].text, Label(), warn=False))
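
To see the shape this predicate accepts (a sketch; assumes lxml and this
module's text_to_labels/Label are importable):

from lxml import etree

# Truthy: the <P> has no text of its own, a single child whose text parses
# as a label, and only whitespace/periods after the child
_p_with_label_in_child(etree.fromstring('<P><E>22(a)</E>.</P>'))

# Falsy: the <P> itself carries text before the child
_p_with_label_in_child(etree.fromstring('<P>See <E>22(a)</E></P>'))
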
Example No. 8
    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        included_defs = []
        excluded_defs = []

        def add_match(n, term, pos):
            if (self.is_exclusion(term, n)):
                excluded_defs.append(Ref(term, n.label_id(), pos))
            else:
                included_defs.append(Ref(term, n.label_id(), pos))

        try:
            cfr_part = node.label[0]
        except IndexError:
            cfr_part = None

        if settings.INCLUDE_DEFINITIONS_IN.get(cfr_part):
            for included_term, context in settings.INCLUDE_DEFINITIONS_IN[
                    cfr_part]:
                if context in node.text and included_term in node.text:
                    pos_start = node.text.index(included_term)
                    add_match(node, included_term.lower(),
                              (pos_start, pos_start + len(included_term)))

        if stack and self.has_parent_definitions_indicator(stack):
            for match, _, _ in grammar.smart_quotes.scanString(node.text):
                term = match.term.tokens[0].lower().strip(',.;')
                #   Don't use pos_end because we are stripping some chars
                pos_start = match.term.pos[0]
                add_match(node,
                          term,
                          (pos_start, pos_start + len(term)))

        for match, _, _ in grammar.scope_term_type_parser.scanString(
                node.text):
            # Check that both scope and term look valid
            if (self.scope_of_text(match.scope, Label.from_node(node),
                                   verify_prefix=False)
                    and re.match("^[a-z ]+$", match.term.tokens[0])):
                term = match.term.tokens[0].strip()
                pos_start = node.text.index(term, match.term.pos[0])
                add_match(node, term, (pos_start, pos_start + len(term)))

        if hasattr(node, 'tagged_text'):
            for match, _, _ in grammar.xml_term_parser.scanString(
                    node.tagged_text):
                """Position in match reflects XML tags, so its dropped in
                preference of new values based on node.text."""
                for match in chain([match.head], match.tail):
                    pos_start = self.pos_start_excluding(
                        match.term.tokens[0], node.text,
                        included_defs + excluded_defs)
                    term = node.tagged_text[
                        match.term.pos[0]:match.term.pos[1]].lower()
                    match_len = len(term)
                    add_match(node,
                              term,
                              (pos_start, pos_start + match_len))

        return included_defs, excluded_defs
Example No. 9
    def determine_scope(self, stack):
        for node in stack.lineage():
            scopes = self.scope_of_text(node.text, Label.from_node(node))
            if scopes:
                return [tuple(s) for s in scopes]

        #   Couldn't determine scope; default to the entire reg
        return [tuple(node.label[:1])]
def is_title(xml_node):
    """Not all titles are created equal. Sometimes a title appears as a
    paragraph tag, mostly to add confusion."""
    if xml_node.getchildren():
        child = xml_node.getchildren()[0]
    else:
        child = None
    return bool(
        (xml_node.tag.upper() == 'HD' and xml_node.attrib['SOURCE'] != 'HED')
        or (xml_node.tag.upper() == 'P' and
            (xml_node.text is None or not xml_node.text.strip())
            and len(xml_node.getchildren()) == 1 and
            (child.tail is None or not child.tail.strip(" \n\t."))
            and text_to_labels(child.text, Label(), warn=False)) or
        (xml_node.tag.upper() == 'P' and len(xml_node.getchildren()) == 0
         and xml_node.text and not get_first_interp_marker(xml_node.text) and
         text_to_labels(xml_node.text, Label(), warn=False, force_start=True)))
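
For instance (a sketch; assumes lxml plus this module's helpers):

from lxml import etree

is_title(etree.fromstring('<HD SOURCE="HD1">Appendix A</HD>'))    # True
is_title(etree.fromstring('<HD SOURCE="HED">Supplement I</HD>'))  # False
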
 def test_section_ref_in_appendix(self):
     text = u"""(a) Something something § 1005.7(b)(1)."""
     citations = internal_citations(
         text, Label(part='1005',
                     appendix='A',
                     appendix_section='2',
                     p1='a'))
     self.assertEqual(citations[0].label.to_list(), ['1005', '7', 'b', '1'])
Example No. 13
def _non_interp_p_with_label(xml_node):
    """E.g. <P>22(a)</P> but not <P>ii. 22(a)</P>"""
    return (
        xml_node.tag.upper() == 'P' and
        not xml_node.getchildren() and
        xml_node.text and not get_first_interp_marker(xml_node.text) and
        text_to_labels(xml_node.text, Label(), warn=False, force_start=True)
    )
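
The docstring's two cases, spelled out (a sketch; same assumptions as the
helpers above):

from lxml import etree

_non_interp_p_with_label(etree.fromstring('<P>22(a)</P>'))      # truthy
_non_interp_p_with_label(etree.fromstring('<P>ii. 22(a)</P>'))  # falsy: leads
                                                                # with a marker
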
Example No. 14
def test_text_to_labels():
    text = u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and "
    text += "(b)(2)"
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', '9', 'c', '2', 'iii', 'Interp']]

    text = "Paragraphs 4(b)(7) and (b)(8)."
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', '4', 'b', '7', 'Interp'],
                      ['1111', '4', 'b', '8', 'Interp']]

    text = "Appendices G and H-Something"
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', 'G', 'Interp'], ['1111', 'H', 'Interp']]

    text = "Paragraph 38(l)(7)(i)(A)(2)."
    result = tree.text_to_labels(text, Label(part='1111', comment=True))
    assert result == [['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']]
 def test_single_match_multiple_paragraphs4(self):
     text = "Listing sections 11.55(d) and 321.11 (h)(4)"
     citations = internal_citations(text, Label(part='222', section='5'))
     self.assertEqual(2, len(citations))
     citation = citations[0]
     self.assertEqual(['11', '55', 'd'], citation.label.to_list())
     self.assertEqual(to_text(citation, text), '11.55(d)')
     citation = citations[1]
     self.assertEqual(['321', '11', 'h', '4'], citation.label.to_list())
     self.assertEqual(to_text(citation, text), '321.11 (h)(4)')
Example No. 17
def parse_into_labels(txt, part):
    """Find what part+section+(paragraph) (could be multiple) this text is
    related to."""
    citations = internal_citations(txt, Label(part=part))
    # odd corner case: headers shouldn't include both an appendix and regtext
    labels = [c.label for c in citations]
    if any('appendix' in l.settings for l in labels):
        labels = [l for l in labels if 'appendix' in l.settings]
    labels = ['-'.join(l.to_list()) for l in labels]
    return labels
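
A usage sketch (hypothetical header text; the output shape follows from the
'-'.join over to_list above):

parse_into_labels(u'Section 1005.7(b)', '1005')
# -> something like ['1005-7-b']
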
Example No. 18
def add_spaces_to_title(title):
    """Federal Register often seems to miss spaces in the title of SxS
    sections. Make sure spaces get added if appropriate"""
    for citation in internal_citations(title, Label()):
        end = citation.end
        # Next char is an alpha and last char isn't a space
        if end < len(title) and title[end].isalpha() and title[end - 1] != ' ':
            title = title[:end] + ' ' + title[end:]
            break   # Assumes there is only one paragraph in a title
    return title
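
For example (a sketch; assumes internal_citations recognizes the citation):

add_spaces_to_title(u'Section 1005.7(b)Unauthorized Transfers')
# -> u'Section 1005.7(b) Unauthorized Transfers'
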
Example No. 19
def generate_keyterm(node):
    label_id = node.label_id()
    if label_id in real_key_terms_layer:
        layer[label_id] = real_key_terms_layer[label_id]
    else:
        node_text = key_terms.KeyTerms.process_node_text(node)
        if not node_text:
            return

        # Our Appendix parsing isn't particularly accurate -- avoid keyterms
        if node.node_type == struct.Node.APPENDIX:
            return

        exclude = [(start, end) for _, start, end in
                   exclude_parser.scanString(node_text)]
        exclude.extend((pc.full_start, pc.full_end) for pc in
                       internal_citations(node_text, Label()))

        periods = [m.start() for m in period.finditer(node_text)]
        # Remove any periods which are part of a citation
        periods = [p for p in periods
                   if all(p < start or p > end for start, end in exclude)]

        # Key terms must either have a full "sentence" or end with an em dash
        if not periods and node_text[-1] != u'—':
            return

        if periods:
            first_p = periods[0]
            # Check for cases where the period is "inside" something;
            # include the period
            next_char = node_text[first_p + 1: first_p + 2]
            if next_char in (')', u'”'):
                first_sentence = node_text[:first_p + 2]
            else:
                first_sentence = node_text[:first_p + 1]
        else:
            first_sentence = node_text

        # Key terms can't be the entire text of a leaf node
        if first_sentence == node_text and not node.children:
            return

        words = first_sentence.split()
        if (not words[-1] == part_end and
                not first_sentence.startswith('![')):
            num_words = len(words)

            # key terms are short
            if num_words <= 15:
                layer_element = {
                    "key_term": first_sentence,
                    "locations": [0]
                }
                layer[label_id] = [layer_element]
 def test_labels_until_paragraphs(self):
     """We can fill in paragraphs"""
     start = Label(cfr_title='11', part='222', section='33', p1='a', p2='2')
     end = Label(cfr_title='11', part='222', section='33', p1='a', p2='6')
     self.assertEqual(list(start.labels_until(end)), [
         Label(cfr_title='11', part='222', section='33', p1='a', p2='3'),
         Label(cfr_title='11', part='222', section='33', p1='a', p2='4'),
         Label(cfr_title='11', part='222', section='33', p1='a', p2='5')
     ])
 def test_single_match_multiple_paragraphs5(self):
     text = "See, e.g., comments 31(b)(1)(iv)-1 and 31(b)(1)(vi)-1"
     citations = internal_citations(text, Label(part='222', section='5'))
     self.assertEqual(2, len(citations))
     citation = citations[0]
     self.assertEqual(['222', '31', 'b', '1', 'iv', 'Interp', '1'],
                      citation.label.to_list())
     self.assertEqual(to_text(citation, text), '31(b)(1)(iv)-1')
     citation = citations[1]
     self.assertEqual(['222', '31', 'b', '1', 'vi', 'Interp', '1'],
                      citation.label.to_list())
     self.assertEqual(to_text(citation, text), '31(b)(1)(vi)-1')
 def test_single_match_multiple_paragraphs6(self):
     text = "comments 5(b)(3)-1 through -3"
     citations = internal_citations(text, Label(part='100', section='5'))
     citation = citations[0]
     self.assertEqual(2, len(citations))
     self.assertEqual(['100', '5', 'b', '3', 'Interp', '1'],
                      citation.label.to_list())
     self.assertEqual(to_text(citation, text), '5(b)(3)-1')
     citation = citations[1]
     self.assertEqual(['100', '5', 'b', '3', 'Interp', '3'],
                      citation.label.to_list())
     self.assertEqual(to_text(citation, text), '-3')
    def test_multiple_matches(self):
        text = "Please see A-5 and Q-2(r) and Z-12(g)(2)(ii) then more text"
        citations = internal_citations(text, Label(part='102', section='1'))
        self.assertEqual(3, len(citations))
        citation = citations[0]
        self.assertEqual(citation.label.to_list(), ['102', 'A', '5'])
        self.assertEqual(to_text(citation, text), 'A-5')
        citation = citations[1]
        self.assertEqual(citation.label.to_list(), ['102', 'Q', '2(r)'])
        self.assertEqual(to_text(citation, text), 'Q-2(r)')
        citation = citations[2]
        self.assertEqual(citation.label.to_list(),
                         ['102', 'Z', '12(g)(2)(ii)'])
        self.assertEqual(to_text(citation, text), 'Z-12(g)(2)(ii)')

        text = u"Appendices G and H—Yadda yadda"
        citations = internal_citations(text, Label(part='102'))
        self.assertEqual(2, len(citations))
        citG, citH = citations
        self.assertEqual(citG.label.to_list(), ['102', 'G'])
        self.assertEqual(citH.label.to_list(), ['102', 'H'])
Example No. 24
 def find(self, node):
     refs = []
     for match, _, _ in grammar.scope_term_type_parser.scanString(
             node.text):
         valid_scope = self.finder.scope_of_text(
             match.scope, Label.from_node(node), verify_prefix=False)
         valid_term = re.match("^[a-z ]+$", match.term.tokens[0])
         if valid_scope and valid_term:
             term = match.term.tokens[0].strip()
             pos_start = node.text.index(term, match.term.pos[0])
             refs.append(Ref(term, node.label_id(), pos_start))
     return refs
    def test_single_match_multiple_paragraphs2(self):
        text = u'§ 1005.10(a) and (d)'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(2, len(citations))
        citation = citations[0]
        self.assertEqual(['1005', '10', 'a'], citation.label.to_list())
        self.assertEqual(to_text(citation, text), '1005.10(a)')
        citation = citations[1]
        self.assertEqual(['1005', '10', 'd'], citation.label.to_list())
        self.assertEqual(to_text(citation, text), '(d)')

        text = u'§ 1005.7(b)(1), (2) and (3)'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(3, len(citations))
        self.assertEqual(['1005', '7', 'b', '1'], citations[0].label.to_list())
        self.assertEqual(['1005', '7', 'b', '2'], citations[1].label.to_list())
        self.assertEqual(['1005', '7', 'b', '3'], citations[2].label.to_list())

        text = u'§ 1005.15(d)(1)(i) and (ii)'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(2, len(citations))
        self.assertEqual(['1005', '15', 'd', '1', 'i'],
                         citations[0].label.to_list())
        self.assertEqual(['1005', '15', 'd', '1', 'ii'],
                         citations[1].label.to_list())

        text = u'§ 1005.9(a)(5) (i), (ii), or (iii)'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(3, len(citations))
        self.assertEqual(['1005', '9', 'a', '5', 'i'],
                         citations[0].label.to_list())
        self.assertEqual(['1005', '9', 'a', '5', 'ii'],
                         citations[1].label.to_list())
        self.assertEqual(['1005', '9', 'a', '5', 'iii'],
                         citations[2].label.to_list())

        text = u'§ 1005.11(a)(1)(vi) or (vii).'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(2, len(citations))
        self.assertEqual(['1005', '11', 'a', '1', 'vi'],
                         citations[0].label.to_list())
        self.assertEqual(['1005', '11', 'a', '1', 'vii'],
                         citations[1].label.to_list())

        text = u'§§ 1005.3(b)(2) and (3), 1005.10(b), (d), and (e), 1005.13, '
        text += 'and 1005.20'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(7, len(citations))

        text = 'Sections 1005.3, .4, and .5'
        citations = internal_citations(text, Label(part='222', section='5'))
        self.assertEqual(3, len(citations))
        self.assertEqual(['1005', '3'], citations[0].label.to_list())
        self.assertEqual(['1005', '4'], citations[1].label.to_list())
        self.assertEqual(['1005', '5'], citations[2].label.to_list())
Example No. 26
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations"""

    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)

            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
Example No. 29
    def test_text_to_labels(self):
        text = u"9(c)(2)(iii) Charges not Covered by § 1026.6(b)(1) and "
        text += "(b)(2)"
        self.assertEqual([['1111', '9', 'c', '2', 'iii', 'Interp']],
                         interpretation.text_to_labels(
                             text, Label(part='1111', comment=True)))

        text = "Paragraphs 4(b)(7) and (b)(8)."
        self.assertEqual([['1111', '4', 'b', '7', 'Interp'],
                          ['1111', '4', 'b', '8', 'Interp']],
                         interpretation.text_to_labels(
                             text, Label(part='1111', comment=True)))

        text = "Appendices G and H-Something"
        self.assertEqual([['1111', 'G', 'Interp'], ['1111', 'H', 'Interp']],
                         interpretation.text_to_labels(
                             text, Label(part='1111', comment=True)))

        text = "Paragraph 38(l)(7)(i)(A)(2)."
        self.assertEqual([['1111', '38', 'l', '7', 'i', 'A', '2', 'Interp']],
                         interpretation.text_to_labels(
                             text, Label(part='1111', comment=True)))
 def test_labels_until_sections(self):
     """We can fill in sections"""
     start = Label(cfr_title='11', part='222', section='33')
     end = Label(cfr_title='11', part='222', section='36')
     self.assertEqual(list(start.labels_until(end)), [
         Label(cfr_title='11', part='222', section='34'),
         Label(cfr_title='11', part='222', section='35')
     ])
Example No. 31
def build_section_tree(text, part):
    """Construct the tree for a whole section. Assumes the section starts
    with an identifier"""
    title, text = utils.title_body(text)

    exclude = [(pc.full_start, pc.full_end)
               for pc in internal_citations(text, Label(part=part))]
    section = re.search(r'%d\.(\d+)\b' % part, title).group(1)
    label = [str(part), section]
    p_tree = regParser.build_tree(text,
                                  exclude=exclude,
                                  label=label,
                                  title=title)
    return p_tree
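
Note that part is interpolated with %d, so it is expected to be an int here.
A usage sketch (hypothetical section text):

text = u'§ 1005.3 Definitions.\n(a) First paragraph text...'
build_section_tree(text, 1005)  # the regex pulls out section '3'
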
 def test_lt(self):
     """Comparisons between labels"""
     self.assertTrue(
         Label(part='105', section='3') < Label(part='105', section='4'))
     self.assertTrue(
         Label(part='105', section='3') < Label(
             part='105', section='3', p1='a'))
     self.assertTrue(
         Label(part='105', section='3', p1='a') < Label(part='222'))
Example No. 33
    def test_to_list(self):
        label = Label(part='222', section='11', p1='c', p2='2')
        self.assertEqual(['222', '11', 'c', '2'], label.to_list())

        label = Label(part='222', p1='d', appendix='R3')
        self.assertEqual(['222', 'R3', 'd'], label.to_list())

        label = Label(part='222', p1='d', appendix='R', appendix_section='4')
        self.assertEqual(['222', 'R', '4', 'd'], label.to_list())
Example No. 34
    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text. 'Act' is a special case,
        as it is also defined as an external citation."""
        included_defs = []
        excluded_defs = []

        def add_match(n, term, pos):
            if ((term == 'act' and list(uscode.scanString(n.text)))
                    or self.is_exclusion(term, n)):
                excluded_defs.append(Ref(term, n.label_id(), pos))
            else:
                included_defs.append(Ref(term, n.label_id(), pos))

        if stack and self.has_parent_definitions_indicator(stack):
            for match, _, _ in grammar.smart_quotes.scanString(node.text):
                term = match.term.tokens[0].lower().strip(',.;')
                #   Don't use pos_end because we are stripping some chars
                pos_start = match.term.pos[0]
                add_match(node,
                          term,
                          (pos_start, pos_start + len(term)))

        for match, _, _ in grammar.scope_term_type_parser.scanString(
                node.text):
            # Check that both scope and term look valid
            if (self.scope_of_text(match.scope, Label.from_node(node),
                                   verify_prefix=False)
                    and re.match("^[a-z ]+$", match.term.tokens[0])):
                term = match.term.tokens[0].strip()
                pos_start = node.text.index(term, match.term.pos[0])
                add_match(node, term, (pos_start, pos_start + len(term)))

        if hasattr(node, 'tagged_text'):
            for match, _, _ in grammar.xml_term_parser.scanString(
                    node.tagged_text):
                """Position in match reflects XML tags, so its dropped in
                preference of new values based on node.text."""
                for match in chain([match.head], match.tail):
                    pos_start = self.pos_start_excluding(
                        match.term.tokens[0], node.text,
                        included_defs + excluded_defs)
                    term = node.tagged_text[
                        match.term.pos[0]:match.term.pos[1]].lower()
                    match_len = len(term)
                    add_match(node,
                              term,
                              (pos_start, pos_start + match_len))

        return included_defs, excluded_defs
Example No. 35
        def per_node(node):
            if (node.node_type != struct.Node.INTERP
                    or node.label[-1] != struct.Node.INTERP_MARK):
                return

            #   Always add a connection based on the interp's label
            self.lookup_table[tuple(node.label[:-1])].append(node)

            #   Also add connections based on the title
            for label in text_to_labels(node.title or '',
                                        Label.from_node(node),
                                        warn=False):
                label = tuple(label[:-1])   # Remove Interp marker
                if node not in self.lookup_table[label]:
                    self.lookup_table[label].append(node)
Example No. 38
 def test_from_node(self):
     for lst, typ in [(['111'], Node.REGTEXT),
                      (['111', '31', 'a', '3'], Node.REGTEXT),
                      (['111', 'A', 'b'], Node.APPENDIX),
                      (['111', 'A', '4', 'a'], Node.APPENDIX),
                      (['111', '21', 'Interp'], Node.INTERP),
                      (['111', '21', 'Interp', '1'], Node.INTERP),
                      (['111', '21', 'r', 'Interp'], Node.INTERP),
                      (['111', '21', 'r', 'Interp', '2'], Node.INTERP),
                      (['111', 'G', 'Interp'], Node.INTERP),
                      (['111', 'G3', 'r', 'Interp'], Node.INTERP),
                      (['111', 'G', '2', 'Interp'], Node.INTERP),
                      (['111', 'G3', 'r', 'Interp', '3'], Node.INTERP),
                      (['111', 'G', '2', 'Interp', '5'], Node.INTERP),
                      (['111', 'Subpart', 'A'], Node.SUBPART),
                      (['111', 'Subpart'], Node.EMPTYPART)]:
         n = Node(label=lst, node_type=typ)
         self.assertEqual(Label.from_node(n).to_list(), lst)
Example No. 40
def paragraph_tree(appendix_letter, sections, text, label, title=None):
    """Use the paragraph parser to parse through each section in this
    appendix."""
    if not sections:
        return Node(text, label=label, title=title, node_type=Node.APPENDIX)
    children = []
    for begin, end in sections:
        seg_title, section_text = utils.title_body(text[begin:end])
        sec_num = carving.get_appendix_section_number(
            seg_title, appendix_letter)
        exclude = [(pc.full_start, pc.full_end) for pc in
                   internal_citations(section_text, Label(part=label[0]))]

        child = parParser.build_tree(
            section_text, exclude=exclude, label=label + [sec_num],
            title=seg_title)

        children.append(child)
    return Node(text[:sections[0][0]], children, label, title, Node.APPENDIX)
    def test_interp_headers(self):
        for text, label in [
            ("Section 102.22Stuff", ['102', '22']),
            ("22(d) Content", ['101', '22', 'd']),
            ("22(d)(5) Content", ['101', '22', 'd', '5']),
            ("22(d)(5)(x) Content", ['101', '22', 'd', '5', 'x']),
            (u"§ 102.22(d)(5)(x) Content", ['102', '22', 'd', '5', 'x']),
            ("22(d)(5)(x)(Q) Content", ['101', '22', 'd', '5', 'x', 'Q']),
            ("Appendix A Heading", ['101', 'A']),
            ("Comment 21(c)-1 Heading", ['101', '21', 'c', 'Interp', '1']),
            ("Paragraph 38(l)(7)(i)(A)(2).",
             ['101', '38', 'l', '7', 'i', 'A', '2']),
            (u'Official Interpretations of § 102.33(c)(2)',
             ['102', '33', 'c', '2', 'Interp'])
        ]:

            citations = internal_citations(text, Label(part='101'))
            self.assertEqual(1, len(citations))
            self.assertEqual(citations[0].label.to_list(), label)
Example No. 42
 def test_from_node(self):
     for lst, typ in [(['111'], Node.REGTEXT),
                      (['111', '31', 'a', '3'], Node.REGTEXT),
                      # _Very_ deeply nested, ignoring the recommended
                      # 6-level paragraph limit
                      (['111', '2', 'c', '4', 'v', 'F', '7', 'viii',
                        'p1', 'p1', 'p1'], Node.REGTEXT),
                      (['111', 'A', 'b'], Node.APPENDIX),
                      (['111', 'A', '4', 'a'], Node.APPENDIX),
                      (['111', '21', 'Interp'], Node.INTERP),
                      (['111', '21', 'Interp', '1'], Node.INTERP),
                      (['111', '21', 'r', 'Interp'], Node.INTERP),
                      (['111', '21', 'r', 'Interp', '2'], Node.INTERP),
                      (['111', 'G', 'Interp'], Node.INTERP),
                      (['111', 'G3', 'r', 'Interp'], Node.INTERP),
                      (['111', 'G', '2', 'Interp'], Node.INTERP),
                      (['111', 'G3', 'r', 'Interp', '3'], Node.INTERP),
                      (['111', 'G', '2', 'Interp', '5'], Node.INTERP),
                      (['111', 'Subpart', 'A'], Node.SUBPART),
                      (['111', 'Subpart'], Node.EMPTYPART)]:
         n = Node(label=lst, node_type=typ)
         self.assertEqual(Label.from_node(n).to_list(), lst)
Example No. 43
    def test_copy(self):
        label = Label(part='222', section='11', p1='c', p2='2')
        label = label.copy(p3='ii')
        self.assertEqual(['222', '11', 'c', '2', 'ii'], label.to_list())

        label = label.copy(p2='4', p3='iv')
        self.assertEqual(['222', '11', 'c', '4', 'iv'], label.to_list())

        label = label.copy(section='12', p1='d')
        self.assertEqual(['222', '12', 'd'], label.to_list())

        label = label.copy(appendix='D', appendix_section='4')
        self.assertEqual(['222', 'D', '4'], label.to_list())

        label = label.copy(p1='c', p2='3')
        self.assertEqual(['222', 'D', '4', 'c', '3'], label.to_list())
 def process(self, node):
     citations_list = self.parse(node.text,
                                 label=Label.from_node(node),
                                 title=str(self.cfr_title))
     if citations_list:
         return citations_list