Python examples showing Label.from_node in use.

Example #1
    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text."""
        included_defs = []
        excluded_defs = []

        def add_match(n, term, pos):
            if (self.is_exclusion(term, n)):
                excluded_defs.append(Ref(term, n.label_id(), pos))
            else:
                included_defs.append(Ref(term, n.label_id(), pos))

        try:
            cfr_part = node.label[0]
        except IndexError:
            cfr_part = None

        if settings.INCLUDE_DEFINITIONS_IN.get(cfr_part):
            for included_term, context in settings.INCLUDE_DEFINITIONS_IN[
                    cfr_part]:
                if context in node.text and included_term in node.text:
                    pos_start = node.text.index(included_term)
                    add_match(node, included_term.lower(),
                              (pos_start, pos_start + len(included_term)))

        if stack and self.has_parent_definitions_indicator(stack):
            for match, _, _ in grammar.smart_quotes.scanString(node.text):
                term = match.term.tokens[0].lower().strip(',.;')
                #   Don't use pos_end because we are stripping some chars
                pos_start = match.term.pos[0]
                add_match(node,
                          term,
                          (pos_start, pos_start + len(term)))

        for match, _, _ in grammar.scope_term_type_parser.scanString(
                node.text):
            # Check that both scope and term look valid
            if (self.scope_of_text(match.scope, Label.from_node(node),
                                   verify_prefix=False)
                    and re.match("^[a-z ]+$", match.term.tokens[0])):
                term = match.term.tokens[0].strip()
                pos_start = node.text.index(term, match.term.pos[0])
                add_match(node, term, (pos_start, pos_start + len(term)))

        if hasattr(node, 'tagged_text'):
            for match, _, _ in grammar.xml_term_parser.scanString(
                    node.tagged_text):
                """Position in match reflects XML tags, so its dropped in
                preference of new values based on node.text."""
                for match in chain([match.head], match.tail):
                    pos_start = self.pos_start_excluding(
                        match.term.tokens[0], node.text,
                        included_defs + excluded_defs)
                    term = node.tagged_text[
                        match.term.pos[0]:match.term.pos[1]].lower()
                    match_len = len(term)
                    add_match(node,
                              term,
                              (pos_start, pos_start + match_len))

        return included_defs, excluded_defs
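Ref itself is not shown on this page; a minimal stand-in consistent with how these examples use it (a term, the defining node's label id, and a position) could be the namedtuple below. The field names are assumptions, not the project's actual definition.

from collections import namedtuple

# Hypothetical stand-in for Ref: the examples pass a term string, a label
# id string, and either a (start, end) tuple or a bare start offset.
Ref = namedtuple('Ref', ['term', 'label', 'position'])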
Example #2
    def determine_scope(self, stack):
        for node in stack.lineage():
            scopes = self.scope_of_text(node.text, Label.from_node(node))
            if scopes:
                return [tuple(s) for s in scopes]

        #   Couldn't determine scope; default to the entire reg
        return [tuple(node.label[:1])]
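When no node in the lineage declares a scope, tuple(node.label[:1]) is just the CFR part number, so the definition is treated as applying to the whole regulation.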
Example #3

    def find(self, node):
        refs = []
        for match, _, _ in grammar.scope_term_type_parser.scanString(
                node.text):
            valid_scope = self.finder.scope_of_text(
                match.scope, Label.from_node(node), verify_prefix=False)
            valid_term = re.match("^[a-z ]+$", match.term.tokens[0])
            if valid_scope and valid_term:
                term = match.term.tokens[0].strip()
                pos_start = node.text.index(term, match.term.pos[0])
                refs.append(Ref(term, node.label_id(), pos_start))
        return refs
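This finder runs the same scope/term grammar and Label.from_node context check as node_definitions above, but returns a flat list of Refs rather than separate included and excluded lists.

Example #4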
def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and notice
    parsing. root is the root interpretation node (e.g. a Node with label
    '1005-Interp'). xml_nodes contains all XML nodes which will be relevant
    to the interpretations"""

    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        #   Explicitly ignore "subpart" headers, as they are inconsistent
        #   and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)

            process_inner_children(inner_stack, ch, parent=node)

            while inner_stack.size() > 1:
                inner_stack.unwind()

            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]
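A sketch of a call site implied by the docstring. The XML content, the part number 1005, and the choice of child nodes are illustrative assumptions, not the project's actual driver code.

from lxml import etree

# Hypothetical driver: build the interpretation tree for part 1005 from a
# notice's supplement section (content and selection are assumed).
notice_xml = b"<SUPPLEMENT><HD>Supplement I</HD><P>...</P></SUPPLEMENT>"
notice = etree.fromstring(notice_xml)
root = Node(label=['1005', 'Interp'], node_type=Node.INTERP)
interp_tree = parse_from_xml(root, list(notice))  # the supplement's children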
Example #5
    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text. 'Act' is a special case,
        as it is also defined as an external citation."""
        included_defs = []
        excluded_defs = []

        def add_match(n, term, pos):
            if ((term == 'act' and list(uscode.scanString(n.text)))
                    or self.is_exclusion(term, n)):
                excluded_defs.append(Ref(term, n.label_id(), pos))
            else:
                included_defs.append(Ref(term, n.label_id(), pos))

        if stack and self.has_parent_definitions_indicator(stack):
            for match, _, _ in grammar.smart_quotes.scanString(node.text):
                term = match.term.tokens[0].lower().strip(',.;')
                #   Don't use pos_end because we are stripping some chars
                pos_start = match.term.pos[0]
                add_match(node,
                          term,
                          (pos_start, pos_start + len(term)))

        for match, _, _ in grammar.scope_term_type_parser.scanString(
                node.text):
            # Check that both scope and term look valid
            if (self.scope_of_text(match.scope, Label.from_node(node),
                                   verify_prefix=False)
                    and re.match("^[a-z ]+$", match.term.tokens[0])):
                term = match.term.tokens[0].strip()
                pos_start = node.text.index(term, match.term.pos[0])
                add_match(node, term, (pos_start, pos_start + len(term)))

        if hasattr(node, 'tagged_text'):
            for match, _, _ in grammar.xml_term_parser.scanString(
                    node.tagged_text):
                """Position in match reflects XML tags, so its dropped in
                preference of new values based on node.text."""
                for match in chain([match.head], match.tail):
                    pos_start = self.pos_start_excluding(
                        match.term.tokens[0], node.text,
                        included_defs + excluded_defs)
                    term = node.tagged_text[
                        match.term.pos[0]:match.term.pos[1]].lower()
                    match_len = len(term)
                    add_match(node,
                              term,
                              (pos_start, pos_start + match_len))

        return included_defs, excluded_defs
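This version adds a branch to add_match: 'act' is excluded whenever uscode.scanString also matches the node's text, because, as the docstring notes, the term is then an external citation rather than a definition.

Example #6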
        def per_node(node):
            if (node.node_type != struct.Node.INTERP
                    or node.label[-1] != struct.Node.INTERP_MARK):
                return

            #   Always add a connection based on the interp's label
            self.lookup_table[tuple(node.label[:-1])].append(node)

            #   Also add connections based on the title
            for label in text_to_labels(node.title or '',
                                        Label.from_node(node),
                                        warn=False):
                label = tuple(label[:-1])  # Remove Interp marker
                if node not in self.lookup_table[label]:
                    self.lookup_table[label].append(node)
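Each interpretation node is indexed under its own label (minus the trailing Interp marker) and under any labels parsed from its title, so the lookup table can reach an interpretation from either direction.

Example #7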
    def test_from_node(self):
        for lst, typ in [(['111'], Node.REGTEXT),
                         (['111', '31', 'a', '3'], Node.REGTEXT),
                         (['111', 'A', 'b'], Node.APPENDIX),
                         (['111', 'A', '4', 'a'], Node.APPENDIX),
                         (['111', '21', 'Interp'], Node.INTERP),
                         (['111', '21', 'Interp', '1'], Node.INTERP),
                         (['111', '21', 'r', 'Interp'], Node.INTERP),
                         (['111', '21', 'r', 'Interp', '2'], Node.INTERP),
                         (['111', 'G', 'Interp'], Node.INTERP),
                         (['111', 'G3', 'r', 'Interp'], Node.INTERP),
                         (['111', 'G', '2', 'Interp'], Node.INTERP),
                         (['111', 'G3', 'r', 'Interp', '3'], Node.INTERP),
                         (['111', 'G', '2', 'Interp', '5'], Node.INTERP),
                         (['111', 'Subpart', 'A'], Node.SUBPART),
                         (['111', 'Subpart'], Node.EMPTYPART)]:
            n = Node(label=lst, node_type=typ)
            self.assertEqual(Label.from_node(n).to_list(), lst)
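The assertions imply that Label.from_node preserves a node's label list exactly. A reduced sketch of that contract, assuming Label merely wraps the label parts (the real class carries more structure), could read:

class Label(object):
    # Hypothetical reduced Label: models only the round trip that the
    # test above relies on, not the project's full implementation.
    def __init__(self, parts):
        self.parts = list(parts)

    @classmethod
    def from_node(cls, node):
        return cls(node.label)

    def to_list(self):
        return list(self.parts)

# Round trip, mirroring the test:
class _Node(object):
    def __init__(self, label):
        self.label = label

assert Label.from_node(_Node(['111', '21', 'Interp'])).to_list() == \
    ['111', '21', 'Interp']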
Example #8

    def test_from_node(self):
        for lst, typ in [(['111'], Node.REGTEXT),
                         (['111', '31', 'a', '3'], Node.REGTEXT),
                         # _Very_ deeply nested, ignoring the recommended
                         # 6-level paragraph limit
                         (['111', '2', 'c', '4', 'v', 'F', '7', 'viii',
                           'p1', 'p1', 'p1'], Node.REGTEXT),
                         (['111', 'A', 'b'], Node.APPENDIX),
                         (['111', 'A', '4', 'a'], Node.APPENDIX),
                         (['111', '21', 'Interp'], Node.INTERP),
                         (['111', '21', 'Interp', '1'], Node.INTERP),
                         (['111', '21', 'r', 'Interp'], Node.INTERP),
                         (['111', '21', 'r', 'Interp', '2'], Node.INTERP),
                         (['111', 'G', 'Interp'], Node.INTERP),
                         (['111', 'G3', 'r', 'Interp'], Node.INTERP),
                         (['111', 'G', '2', 'Interp'], Node.INTERP),
                         (['111', 'G3', 'r', 'Interp', '3'], Node.INTERP),
                         (['111', 'G', '2', 'Interp', '5'], Node.INTERP),
                         (['111', 'Subpart', 'A'], Node.SUBPART),
                         (['111', 'Subpart'], Node.EMPTYPART)]:
            n = Node(label=lst, node_type=typ)
            self.assertEqual(Label.from_node(n).to_list(), lst)
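This later revision of the test adds a very deeply nested REGTEXT label, checking that the round trip still holds beyond the six-level paragraph depth the comment mentions.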
Example #9

    def process(self, node):
        citations_list = self.parse(node.text,
                                    label=Label.from_node(node),
                                    title=str(self.cfr_title))
        if citations_list:
            return citations_list
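process is the smallest integration on this page: Label.from_node hands the citation parser the node's position so that relative citations can be resolved, and when nothing is found the method implicitly returns None.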