def node_definitions(self, node, stack=None):
    """Find defined terms in this node's text."""
    included_defs = []
    excluded_defs = []

    def add_match(n, term, pos):
        if self.is_exclusion(term, n):
            excluded_defs.append(Ref(term, n.label_id(), pos))
        else:
            included_defs.append(Ref(term, n.label_id(), pos))

    try:
        cfr_part = node.label[0]
    except IndexError:
        cfr_part = None

    if settings.INCLUDE_DEFINITIONS_IN.get(cfr_part):
        for included_term, context in settings.INCLUDE_DEFINITIONS_IN[
                cfr_part]:
            if context in node.text and included_term in node.text:
                pos_start = node.text.index(included_term)
                add_match(node, included_term.lower(),
                          (pos_start, pos_start + len(included_term)))

    if stack and self.has_parent_definitions_indicator(stack):
        for match, _, _ in grammar.smart_quotes.scanString(node.text):
            term = match.term.tokens[0].lower().strip(',.;')
            # Don't use pos_end because we are stripping some chars
            pos_start = match.term.pos[0]
            add_match(node, term, (pos_start, pos_start + len(term)))

    for match, _, _ in grammar.scope_term_type_parser.scanString(
            node.text):
        # Check that both scope and term look valid
        if (self.scope_of_text(match.scope, Label.from_node(node),
                               verify_prefix=False)
                and re.match("^[a-z ]+$", match.term.tokens[0])):
            term = match.term.tokens[0].strip()
            pos_start = node.text.index(term, match.term.pos[0])
            add_match(node, term, (pos_start, pos_start + len(term)))

    if hasattr(node, 'tagged_text'):
        for match, _, _ in grammar.xml_term_parser.scanString(
                node.tagged_text):
            # Position in match reflects XML tags, so it's dropped in
            # preference of new values based on node.text
            for match in chain([match.head], match.tail):
                pos_start = self.pos_start_excluding(
                    match.term.tokens[0], node.text,
                    included_defs + excluded_defs)
                term = node.tagged_text[
                    match.term.pos[0]:match.term.pos[1]].lower()
                match_len = len(term)
                add_match(node, term,
                          (pos_start, pos_start + match_len))

    return included_defs, excluded_defs

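# The position bookkeeping above leans on pyparsing's scanString, which
# yields a (tokens, start, end) triple for each non-overlapping match in
# the input string. A minimal, self-contained sketch of the idea, using a
# hypothetical quoted-term parser in place of the project's
# grammar.smart_quotes (whose `.term`/`.pos` result names come from the
# project's grammar module):
from pyparsing import QuotedString

# Defined terms appear inside curly ("smart") quotes in regulation text
quoted_term = QuotedString(u'\u201c', endQuoteChar=u'\u201d')

sample = u'The term \u201cState\u201d means any State of the United States.'
for tokens, start, end in quoted_term.scanString(sample):
    # tokens[0] is the text between the quotes; start and end index into
    # `sample`, which is how (pos_start, pos_end) pairs are derived
    print(tokens[0], (start, end))
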
def determine_scope(self, stack):
    for node in stack.lineage():
        scopes = self.scope_of_text(node.text, Label.from_node(node))
        if scopes:
            return [tuple(s) for s in scopes]

    # Couldn't determine scope; default to the entire reg
    return [tuple(node.label[:1])]

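# A quick illustration of the lineage walk above, with stand-ins for
# stack.lineage() and self.scope_of_text (both hypothetical here): the
# nearest ancestor whose text contains scoping language wins, and the
# CFR part as a whole is the fallback.
import collections

FakeNode = collections.namedtuple('FakeNode', ['text', 'label'])

lineage = [
    FakeNode('Definitions. For purposes of this section...',
             ['1005', '2', 'a']),
    FakeNode('Some unrelated paragraph.', ['1005', '2']),
]

def fake_scope_of_text(text, label):
    # Pretend "this section" scopes definitions to the enclosing section
    return [label[:2]] if 'this section' in text else []

for node in lineage:
    scopes = fake_scope_of_text(node.text, node.label)
    if scopes:
        print([tuple(s) for s in scopes])   # [('1005', '2')]
        break
else:
    print([tuple(node.label[:1])])          # fallback: the whole part
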
def find(self, node):
    refs = []
    for match, _, _ in grammar.scope_term_type_parser.scanString(
            node.text):
        valid_scope = self.finder.scope_of_text(
            match.scope, Label.from_node(node), verify_prefix=False)
        valid_term = re.match("^[a-z ]+$", match.term.tokens[0])
        if valid_scope and valid_term:
            term = match.term.tokens[0].strip()
            pos_start = node.text.index(term, match.term.pos[0])
            refs.append(Ref(term, node.label_id(), pos_start))
    return refs

def parse_from_xml(root, xml_nodes):
    """Core of supplement processing; shared by whole XML parsing and
    notice parsing. root is the root interpretation node (e.g. a Node
    with label '1005-Interp'). xml_nodes contains all XML nodes which
    will be relevant to the interpretations"""
    supplement_nodes = [root]

    last_label = root.label
    header_count = 0
    for ch in xml_nodes:
        node = Node(label=last_label, node_type=Node.INTERP)
        label_obj = Label.from_node(node)

        # Explicitly ignore "subpart" headers, as they are inconsistent
        # and they will be reconstructed as subterps client-side
        text = tree_utils.get_node_text(ch, add_spaces=True)
        if is_title(ch) and 'subpart' not in text.lower():
            labels = text_to_labels(text, label_obj)
            if labels:
                label = merge_labels(labels)
            else:   # Header without a label, like an Introduction, etc.
                header_count += 1
                label = root.label[:2] + ['h%d' % header_count]

            inner_stack = tree_utils.NodeStack()
            missing = missing_levels(last_label, label)
            supplement_nodes.extend(missing)
            last_label = label

            node = Node(node_type=Node.INTERP, label=label,
                        title=text.strip())
            inner_stack.add(2, node)
            process_inner_children(inner_stack, ch, parent=node)
            while inner_stack.size() > 1:
                inner_stack.unwind()
            ch_node = inner_stack.m_stack[0][0][1]
            supplement_nodes.append(ch_node)

    supplement_tree = treeify(supplement_nodes)

    def per_node(node):
        node.label = [l.replace('<E T="03">', '') for l in node.label]
        for child in node.children:
            per_node(child)
    for node in supplement_tree:
        per_node(node)

    return supplement_tree[0]

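# parse_from_xml accumulates a flat, ordered list of labeled nodes and
# hands it to treeify to recover the hierarchy. A minimal sketch of that
# technique (an illustration, not the project's implementation): each
# node's parent is the node with the longest label that is a proper
# prefix of its own label; nodes with no such parent become roots.
def treeify_sketch(nodes):
    roots = []
    for node in nodes:
        parent = None
        for candidate in nodes:
            if (len(candidate.label) < len(node.label)
                    and node.label[:len(candidate.label)] == candidate.label
                    and (parent is None
                         or len(candidate.label) > len(parent.label))):
                parent = candidate
        if parent is None:
            roots.append(node)
        else:
            parent.children.append(node)
    return roots
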
def node_definitions(self, node, stack=None):
    """Find defined terms in this node's text. 'Act' is a special case,
    as it is also defined as an external citation."""
    included_defs = []
    excluded_defs = []

    def add_match(n, term, pos):
        if ((term == 'act' and list(uscode.scanString(n.text)))
                or self.is_exclusion(term, n)):
            excluded_defs.append(Ref(term, n.label_id(), pos))
        else:
            included_defs.append(Ref(term, n.label_id(), pos))

    if stack and self.has_parent_definitions_indicator(stack):
        for match, _, _ in grammar.smart_quotes.scanString(node.text):
            term = match.term.tokens[0].lower().strip(',.;')
            # Don't use pos_end because we are stripping some chars
            pos_start = match.term.pos[0]
            add_match(node, term, (pos_start, pos_start + len(term)))

    for match, _, _ in grammar.scope_term_type_parser.scanString(
            node.text):
        # Check that both scope and term look valid
        if (self.scope_of_text(match.scope, Label.from_node(node),
                               verify_prefix=False)
                and re.match("^[a-z ]+$", match.term.tokens[0])):
            term = match.term.tokens[0].strip()
            pos_start = node.text.index(term, match.term.pos[0])
            add_match(node, term, (pos_start, pos_start + len(term)))

    if hasattr(node, 'tagged_text'):
        for match, _, _ in grammar.xml_term_parser.scanString(
                node.tagged_text):
            # Position in match reflects XML tags, so it's dropped in
            # preference of new values based on node.text
            for match in chain([match.head], match.tail):
                pos_start = self.pos_start_excluding(
                    match.term.tokens[0], node.text,
                    included_defs + excluded_defs)
                term = node.tagged_text[
                    match.term.pos[0]:match.term.pos[1]].lower()
                match_len = len(term)
                add_match(node, term,
                          (pos_start, pos_start + match_len))

    return included_defs, excluded_defs

def per_node(node):
    if (node.node_type != struct.Node.INTERP
            or node.label[-1] != struct.Node.INTERP_MARK):
        return

    # Always add a connection based on the interp's label
    self.lookup_table[tuple(node.label[:-1])].append(node)

    # Also add connections based on the title
    for label in text_to_labels(node.title or '',
                                Label.from_node(node), warn=False):
        label = tuple(label[:-1])   # Remove Interp marker
        if node not in self.lookup_table[label]:
            self.lookup_table[label].append(node)

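# self.lookup_table above behaves like a defaultdict(list) keyed by label
# tuples, so an interpretation can be found both through its own label and
# through any labels parsed out of its title. A small sketch of that
# indexing pattern (the keys and values here are illustrative):
from collections import defaultdict

lookup_table = defaultdict(list)

# Index an interp under its structural label...
lookup_table[('1005', '2')].append('interp-for-1005-2')
# ...and again under a label mentioned in its title
lookup_table[('1005', '2', 'a')].append('interp-for-1005-2')

# Lookups never raise; unknown labels simply yield an empty list
assert lookup_table[('1005', '3')] == []
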
def test_from_node(self):
    for lst, typ in [(['111'], Node.REGTEXT),
                     (['111', '31', 'a', '3'], Node.REGTEXT),
                     # _Very_ deeply nested, ignoring the recommended
                     # 6-level paragraph limit
                     (['111', '2', 'c', '4', 'v', 'F', '7', 'viii',
                       'p1', 'p1', 'p1'], Node.REGTEXT),
                     (['111', 'A', 'b'], Node.APPENDIX),
                     (['111', 'A', '4', 'a'], Node.APPENDIX),
                     (['111', '21', 'Interp'], Node.INTERP),
                     (['111', '21', 'Interp', '1'], Node.INTERP),
                     (['111', '21', 'r', 'Interp'], Node.INTERP),
                     (['111', '21', 'r', 'Interp', '2'], Node.INTERP),
                     (['111', 'G', 'Interp'], Node.INTERP),
                     (['111', 'G3', 'r', 'Interp'], Node.INTERP),
                     (['111', 'G', '2', 'Interp'], Node.INTERP),
                     (['111', 'G3', 'r', 'Interp', '3'], Node.INTERP),
                     (['111', 'G', '2', 'Interp', '5'], Node.INTERP),
                     (['111', 'Subpart', 'A'], Node.SUBPART),
                     (['111', 'Subpart'], Node.EMPTYPART)]:
        n = Node(label=lst, node_type=typ)
        self.assertEqual(Label.from_node(n).to_list(), lst)

def process(self, node):
    citations_list = self.parse(node.text, label=Label.from_node(node),
                                title=str(self.cfr_title))
    if citations_list:
        return citations_list

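# process() is the per-node hook of a layer generator: it returns the
# citations found in a single node, or None when there are none. A hedged
# sketch of how such a hook is typically driven over a whole tree
# (build_layer and the layer shape are illustrative, not the project's
# exact API):
def build_layer(root, process_fn):
    layer = {}
    stack = [root]
    while stack:
        node = stack.pop()
        result = process_fn(node)
        if result:
            # Keyed by label_id so citations can be reattached to the
            # right paragraph client-side
            layer[node.label_id()] = result
        stack.extend(node.children)
    return layer
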