def section_node(self):
    """Build a three-level fixture tree: part 200 -> section 200-2 ->
    paragraph 200-2-a."""
    leaf = Node('n2a', label=['200', '2', 'a'])
    section = Node('n2', label=['200', '2'])
    section.children = [leaf]
    return Node('root', label=['200'], children=[section])
def build_tree(reg_xml):
    """Parse a regulation's XML string into a Node tree.

    When the PART element has SUBPART children, sections are grouped under
    those subparts; otherwise all sections are placed under a synthetic
    "empty part".  Non-regulation text (appendices, interpretations, etc.)
    is appended after the regulation body.
    """
    doc = etree.fromstring(reg_xml)
    preprocess_xml(doc)

    reg_part = get_reg_part(doc)
    title = get_title(doc)
    tree = Node("", [], [reg_part], title)

    part = doc.xpath('//PART')[0]
    subpart_xmls = [child for child in part.getchildren()
                    if child.tag == 'SUBPART']
    if subpart_xmls:
        tree.children = [build_subpart(reg_part, subpart)
                         for subpart in subpart_xmls]
    else:
        sections = []
        for child in part.getchildren():
            if child.tag == 'SECTION':
                sections.extend(build_from_section(reg_part, child))
        empty_part = reg_text.build_empty_part(reg_part)
        empty_part.children = sections
        tree.children = [empty_part]

    tree.children += build_non_reg_text(doc, reg_part)
    return tree
def test_dict_to_node(self):
    """dict_to_node converts a dict with text/label/node_type into a Node,
    carries tagged_text through, and returns the input unchanged (still a
    dict) when the required keys are missing."""
    dict_node = {
        'text': 'node text',
        'label': ['205', 'A'],
        'node_type': 'appendix'}
    node = compiler.dict_to_node(dict_node)
    self.assertEqual(
        node,
        Node('node text', [], ['205', 'A'], None, 'appendix'))

    dict_node['tagged_text'] = '<E> Tagged </E> text.'
    # FIX: removed a redundant dict_to_node call whose result was never
    # used (it was immediately recomputed as created_node below).
    actual_node = Node('node text', [], ['205', 'A'], None, 'appendix')
    actual_node.tagged_text = '<E> Tagged </E> text.'
    created_node = compiler.dict_to_node(dict_node)
    self.assertEqual(actual_node, created_node)
    self.assertEqual(actual_node.tagged_text, created_node.tagged_text)

    dict_node = {'text': 'node text'}
    node = compiler.dict_to_node(dict_node)
    # Missing label/node_type: the dict itself is returned
    self.assertEqual(node, dict_node)
def test_create_xml_changes_child_stars(self):
    """A PUT amendment on an XML-backed node: the change record carries no
    `field` while the node text matches its source XML, and is scoped to
    `[text]` once the text diverges."""
    labels_amended = [Amendment('PUT', '200-2-a')]
    xml = etree.fromstring("<ROOT><P>(a) Content</P><STARS /></ROOT>")
    n2a = Node('(a) Content', label=['200', '2', 'a'],
               source_xml=xml.xpath('//P')[0])
    n2b = Node('(b) Content', label=['200', '2', 'b'])
    n2 = Node('n2', label=['200', '2'], children=[n2a, n2b])
    root = Node('root', label=['200'], children=[n2])
    notice_changes = changes.NoticeChanges()
    build.create_xml_changes(labels_amended, root, notice_changes)

    self.assertTrue('200-2-a' in notice_changes.changes)
    # BUG FIX: assertTrue(1, len(...)) always passed because the second
    # argument is only the failure message; use assertEqual to compare.
    self.assertEqual(1, len(notice_changes.changes['200-2-a']))
    change = notice_changes.changes['200-2-a'][0]
    self.assertEqual('PUT', change['action'])
    self.assertFalse('field' in change)

    n2a.text = n2a.text + ":"
    n2a.source_xml.text = n2a.source_xml.text + ":"
    notice_changes = changes.NoticeChanges()
    build.create_xml_changes(labels_amended, root, notice_changes)

    self.assertTrue('200-2-a' in notice_changes.changes)
    # BUG FIX: same assertTrue -> assertEqual correction as above.
    self.assertEqual(1, len(notice_changes.changes['200-2-a']))
    change = notice_changes.changes['200-2-a'][0]
    self.assertEqual('PUT', change['action'])
    self.assertEqual('[text]', change.get('field'))
def section_node(self):
    """Return a small fixture: part 200 containing section 200-2 with a
    single child paragraph 200-2-a."""
    paragraph = Node("n2a", label=["200", "2", "a"])
    section = Node("n2", label=["200", "2"])
    section.children = [paragraph]
    root = Node("root", label=["200"], children=[section])
    return root
def test_keyterm_is_first_not_first(self):
    """An emphasized phrase that does not open the paragraph must not be
    treated as a leading keyterm."""
    node = Node('(a) This has a list: apples et seq.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'
    self.assertFalse(KeyTerms(None).keyterm_is_first(node, 'et seq.'))
def collapsed_markers_matches(node_text, tagged_text): """Find collapsed markers, i.e. tree node paragraphs that begin within a single XML node, within this text. Remove citations and other false positives. This is pretty hacky right now -- it focuses on the plain text but takes cues from the tagged text. @todo: streamline logic""" # In addition to the regex above, keyterms are an acceptable prefix. We # therefore convert keyterms to satisfy the above regex node_for_keyterms = Node(node_text, node_type=Node.INTERP, label=[get_first_interp_marker(node_text)]) node_for_keyterms.tagged_text = tagged_text keyterm = KeyTerms.get_keyterm(node_for_keyterms) if keyterm: node_text = node_text.replace(keyterm, '.'*len(keyterm)) collapsed_markers = [] for marker in _first_markers: possible = ((m, m.start(), m.end()) for m in marker.finditer(node_text) if m.start() > 0) possible = remove_citation_overlaps(node_text, possible) # If certain characters follow, kill it for following in ("e.", ")", u"”", '"', "'"): possible = [(m, s, end) for m, s, end in possible if not node_text[end:].startswith(following)] possible = [m for m, _, _ in possible] # As all "1." collapsed markers must be emphasized, run a quick # check to weed out some false positives if '<E T="03">1' not in tagged_text: possible = filter(lambda m: m.group(1) != '1', possible) collapsed_markers.extend(possible) return collapsed_markers
def test_create_xml_changes_child_stars():
    """PUT on an XML-backed node: the change has no `field` while the text
    matches the source XML, and `[text]` once the text diverges."""
    amendment_list = [Amendment('PUT', '200-?-2-a')]
    with XMLBuilder("ROOT") as ctx:
        ctx.P("(a) Content")
        ctx.STARS()
    child_a = Node('(a) Content', label=['200', '2', 'a'],
                   source_xml=ctx.xml.xpath('//P')[0])
    child_b = Node('(b) Content', label=['200', '2', 'b'])
    section = Node('n2', label=['200', '2'], children=[child_a, child_b])
    root = Node('root', label=['200'], children=[section])

    notice_changes = changes.NoticeChanges()
    fetch.create_xml_changes(amendment_list, root, notice_changes)
    data = notice_changes[None]
    assert '200-2-a' in data
    assert len(data['200-2-a']) == 1
    change = data['200-2-a'][0]
    assert change['action'] == 'PUT'
    assert 'field' not in change

    # Diverge the node's text from its source XML and re-run
    child_a.text = child_a.text + ":"
    child_a.source_xml.text = child_a.source_xml.text + ":"
    notice_changes = changes.NoticeChanges()
    fetch.create_xml_changes(amendment_list, root, notice_changes)
    data = notice_changes[None]
    assert '200-2-a' in data
    assert len(data['200-2-a']) == 1
    change = data['200-2-a'][0]
    assert change['action'] == 'PUT'
    assert change.get('field') == '[text]'
def test_no_keyterm(self):
    """A paragraph with no emphasized text yields no keyterm data."""
    node = Node('(a) Apples are grown in New Zealand.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) Apples are grown in New Zealand.'
    kt = KeyTerms(None)
    results = kt.process(node)
    # FIX: assertEquals is a deprecated alias; assertIsNone states the
    # intent directly.
    self.assertIsNone(results)
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc.

    Split `text` into its component paragraphs and append one APPENDIX
    Node per piece to self.nodes, each carrying the original tagged text.
    """
    # To aid in determining collapsed paragraphs, replace any
    # keyterms present
    node_for_keyterms = Node(text, node_type=Node.APPENDIX)
    node_for_keyterms.tagged_text = tagged_text
    node_for_keyterms.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)

    # Mask the keyterm with same-length filler so it cannot confuse the
    # paragraph splitter; restored in each piece below
    if keyterm:
        mtext = text.replace(keyterm, ';'*len(keyterm))
    else:
        mtext = text

    # FIX: removed a large block of commented-out duplicate-label handling
    # (dead code) that obscured the loop body.
    for mtext in split_paragraph_text(mtext):
        if keyterm:    # still need the original text
            mtext = mtext.replace(';'*len(keyterm), keyterm)
        node = Node(mtext, node_type=Node.APPENDIX,
                    label=[initial_marker(mtext)[0]])
        node.tagged_text = tagged_text
        self.nodes.append(node)
def test_create_xml_changes_child_stars(self):
    """PUT amendment on an XML-backed node: no `field` when the text
    matches its source XML; `[text]` once the text diverges."""
    labels_amended = [Amendment('PUT', '200-?-2-a')]
    with XMLBuilder("ROOT") as ctx:
        ctx.P("(a) Content")
        ctx.STARS()
    n2a = Node('(a) Content', label=['200', '2', 'a'],
               source_xml=ctx.xml.xpath('//P')[0])
    n2b = Node('(b) Content', label=['200', '2', 'b'])
    n2 = Node('n2', label=['200', '2'], children=[n2a, n2b])
    root = Node('root', label=['200'], children=[n2])
    notice_changes = changes.NoticeChanges()
    amendments.create_xml_changes(labels_amended, root, notice_changes)

    data = notice_changes.changes_by_xml[None]
    self.assertIn('200-2-a', data)
    # BUG FIX: assertTrue(1, len(...)) always passed (second argument is
    # the failure message); use assertEqual to actually compare.
    self.assertEqual(1, len(data['200-2-a']))
    change = data['200-2-a'][0]
    self.assertEqual('PUT', change['action'])
    self.assertNotIn('field', change)

    n2a.text = n2a.text + ":"
    n2a.source_xml.text = n2a.source_xml.text + ":"
    notice_changes = changes.NoticeChanges()
    amendments.create_xml_changes(labels_amended, root, notice_changes)

    data = notice_changes.changes_by_xml[None]
    self.assertIn('200-2-a', data)
    # BUG FIX: same assertTrue -> assertEqual correction as above.
    self.assertEqual(1, len(data['200-2-a']))
    change = data['200-2-a'][0]
    self.assertEqual('PUT', change['action'])
    self.assertEqual('[text]', change.get('field'))
def test_create_xml_changes_child_stars(self):
    """PUT amendment on an XML-backed node: no `field` when the text
    matches its source XML; `[text]` once the text diverges."""
    labels_amended = [Amendment("PUT", "200-2-a")]
    xml = etree.fromstring("<ROOT><P>(a) Content</P><STARS /></ROOT>")
    n2a = Node("(a) Content", label=["200", "2", "a"],
               source_xml=xml.xpath("//P")[0])
    n2b = Node("(b) Content", label=["200", "2", "b"])
    n2 = Node("n2", label=["200", "2"], children=[n2a, n2b])
    root = Node("root", label=["200"], children=[n2])
    notice_changes = changes.NoticeChanges()
    build.create_xml_changes(labels_amended, root, notice_changes)

    self.assertTrue("200-2-a" in notice_changes.changes)
    # BUG FIX: assertTrue(1, len(...)) always passed (second argument is
    # the failure message); use assertEqual to actually compare.
    self.assertEqual(1, len(notice_changes.changes["200-2-a"]))
    change = notice_changes.changes["200-2-a"][0]
    self.assertEqual("PUT", change["action"])
    self.assertFalse("field" in change)

    n2a.text = n2a.text + ":"
    n2a.source_xml.text = n2a.source_xml.text + ":"
    notice_changes = changes.NoticeChanges()
    build.create_xml_changes(labels_amended, root, notice_changes)

    self.assertTrue("200-2-a" in notice_changes.changes)
    # BUG FIX: same assertTrue -> assertEqual correction as above.
    self.assertEqual(1, len(notice_changes.changes["200-2-a"]))
    change = notice_changes.changes["200-2-a"][0]
    self.assertEqual("PUT", change["action"])
    self.assertEqual("[text]", change.get("field"))
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin within
    a single XML node, within this text. Remove citations and other false
    positives. This is pretty hacky right now -- it focuses on the plain
    text but takes cues from the tagged text. @todo: streamline logic"""
    # Keyterms are an acceptable prefix; mask them with same-length dots
    # so the marker regexes can match past them while offsets stay valid
    keyterm_node = Node(node_text, node_type=Node.INTERP,
                        label=[get_first_interp_marker(node_text)])
    keyterm_node.tagged_text = tagged_text
    keyterm = KeyTerms.keyterm_in_node(keyterm_node)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    matches = []
    for marker in _first_markers:
        candidates = [(m, m.start(), m.end())
                      for m in marker.finditer(node_text)]
        candidates = remove_citation_overlaps(node_text, candidates)
        for match, _, _ in candidates:
            if not false_collapsed_marker(match, node_text, tagged_text):
                matches.append(match)
    return matches
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation
    paragraph, split it into sub-paragraphs and account for trailing
    stars.

    Yields a Node for the leading paragraph, then one Node per collapsed
    marker found inside it; after any piece ending in '* * *' an
    INLINE_STARS placeholder Node is also yielded."""
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)

    #   -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    #   Node for this paragraph
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP)
    n.tagged_text = text_with_tags
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    #   Collapsed-marker children
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            # A collapsed "1." marker is always emphasized in the source
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
def test_keyterm_and_emphasis(self):
    """Only the leading emphasized phrase is the keyterm, even when more
    emphasis appears later in the paragraph."""
    node = Node('(a) Apples. Apples are grown in New Zealand.',
                label=['101', '22', 'a'])
    node.tagged_text = ('(a) <E T="03">Apples.</E> Apples are grown in '
                        'New <E T="03">Zealand.</E>')
    results = KeyTerms(None).process(node)
    self.assertNotEqual(results, None)
    self.assertEqual(results[0]['key_term'], 'Apples.')
    self.assertEqual(results[0]['locations'], [0])
def tree_with_subparts(self):
    """Fixture: part 205 containing subparts A and B plus appendix C."""
    subpart_a = Node("nsa", label=["205", "Subpart", "A"],
                     node_type=Node.SUBPART)
    subpart_b = Node("nsb", label=["205", "Subpart", "B"],
                     node_type=Node.SUBPART)
    appendix = Node("nappa", label=["205", "Appendix", "C"],
                    node_type=Node.APPENDIX)
    root = Node("", label=["205"])
    root.children = [subpart_a, subpart_b, appendix]
    return root
def test_emphasis_close_to_front(self):
    """An emphasized word is close to the front, but is not a key term."""
    node = Node('(a) T et seq. has a list: apples',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) T <E T="03">et seq.</E> has a list: apples'
    self.assertFalse(KeyTerms(None).keyterm_is_first(node, 'et seq.'))
def test_interpretation_markers(self):
    """Keyterms are also detected in interpretation paragraphs."""
    node = Node('3. et seq. has a list: apples',
                label=['101', 'c', Node.INTERP_MARK, '3'],
                node_type=Node.INTERP)
    node.tagged_text = '3. <E T="03">et seq.</E> has a list: apples'
    results = KeyTerms(None).process(node)
    self.assertNotEqual(results, None)
    self.assertEqual(results[0]['key_term'], 'et seq.')
    self.assertEqual(results[0]['locations'], [0])
def paragraph_with_marker(self, text, next_text=''):
    """The paragraph has an (a) or a. etc.

    Determine this paragraph's depth (`p_level`) by disambiguating its
    marker against the *next* paragraph's marker and against markers
    already on the stack, then push the node onto the marker stack.
    """
    marker, _ = initial_marker(text)
    n = Node(text, node_type=Node.APPENDIX, label=[marker])

    if initial_marker(next_text):
        next_marker, _ = initial_marker(next_text)
    else:
        next_marker = None

    # Every level this marker could belong to (ambiguous markers such as
    # 'i' appear in more than one level)
    this_p_levels = set(idx for idx, lvl in enumerate(p_levels)
                        if marker in lvl)
    next_p_levels = set(idx for idx, lvl in enumerate(p_levels)
                        if next_marker in lvl)
    previous_levels = [l for l in self.m_stack.m_stack if l]
    previous_p_levels = set()
    for stack_level in previous_levels:
        previous_p_levels.update(sn.p_level for _, sn in stack_level
                                 if hasattr(sn, 'p_level'))

    # Ambiguity, e.g. 'i', 'v'. Disambiguate by looking forward
    if len(this_p_levels) > 1 and len(next_p_levels) == 1:
        next_p_level = next_p_levels.pop()
        # e.g. an 'i' followed by a 'ii'
        if next_p_level in this_p_levels:
            this_p_idx = p_levels[next_p_level].index(marker)
            next_p_idx = p_levels[next_p_level].index(next_marker)
            if this_p_idx < next_p_idx:     # Heuristic
                n.p_level = next_p_level
        # e.g. (a)(1)(i) followed by an 'A'
        new_level = this_p_levels - previous_p_levels
        if next_p_level not in previous_p_levels and new_level:
            n.p_level = new_level.pop()

    # Ambiguity. Disambiguate by looking backwards
    if len(this_p_levels) > 1 and not hasattr(n, 'p_level'):
        for stack_level in previous_levels:
            for lvl, stack_node in stack_level:
                if getattr(stack_node, 'p_level', None) in this_p_levels:
                    # Later levels replace earlier ones
                    n.p_level = stack_node.p_level

    # Simple case (no ambiguity) and cases not seen above
    if not getattr(n, 'p_level', None):
        n.p_level = min(this_p_levels)  # rule of thumb: favor lower case

    # Check if we've seen this type of marker before
    found_in_prev = False
    for stack_level in previous_levels:
        if stack_level and in_same_p_level(n, stack_level):
            found_in_prev = True
            self.depth = stack_level[-1][0]
    if not found_in_prev:   # New type of marker
        self.depth += 1
    self.m_stack.add(self.depth, n)
def test_node_definitions_multiple_xml(self):
    """Find xml definitions which are separated by `and`"""
    stack = ParentStack().add(0, Node(label=['9999']))
    winter = Node("(4) Cold and dreary mean winter.", label=['9999', '4'])
    winter.tagged_text = ('(4) <E T="03">Cold</E> and '
                          '<E T="03">dreary</E> mean winter.')
    included, _ = Terms(None).node_definitions(winter, stack)
    self.assertEqual(len(included), 2)
    self.assertEqual(included[0], Ref('cold', '9999-4', 4))
    self.assertEqual(included[1], Ref('dreary', '9999-4', 13))
def build_tree(self):
    """Fixture: part 200 with sections 1 (child i), 2, and 3 (child a)."""
    n1 = Node('n1', label=['200', '1'])
    # BUG FIX: the middle label segment was the integer 1; every other
    # label segment in these fixtures is a string.
    n2 = Node('n1i', label=['200', '1', 'i'])
    n3 = Node('n2', label=['200', '2'])
    n4 = Node('n3', label=['200', '3'])
    n5 = Node('n3a', label=['200', '3', 'a'])
    n1.children = [n2]
    n4.children = [n5]
    root = Node('root', label=['200'], children=[n1, n3, n4])
    return root
def assert_finds_result(self, tagged_text, parent_title, *refs):
    """Strip XML tags from `tagged_text`, run the DefinitionKeyterm finder
    under a parent titled `parent_title`, and verify that exactly the
    expected `refs` (term + start offset) are returned."""
    plain_text = re.sub(r"<[^>]*>", "", tagged_text)   # removes tags
    node = Node(plain_text)
    node.tagged_text = tagged_text
    parent = Node(label=['1000', '1'], title=parent_title)
    found = def_finders.DefinitionKeyterm(parent).find(node)
    self.assertEqual(len(found), len(refs))
    for want, got in zip(refs, found):
        self.assertEqual(want.term, got.term)
        self.assertEqual(want.start, got.start)
def build_tree(self):
    """Fixture: part 200 with sections 1 (child i), 2, and 3 (child a)."""
    n1 = Node("n1", label=["200", "1"])
    # BUG FIX: the middle label segment was the integer 1; every other
    # label segment in these fixtures is a string.
    n2 = Node("n1i", label=["200", "1", "i"])
    n3 = Node("n2", label=["200", "2"])
    n4 = Node("n3", label=["200", "3"])
    n5 = Node("n3a", label=["200", "3", "a"])
    n1.children = [n2]
    n4.children = [n5]
    root = Node("root", label=["200"], children=[n1, n3, n4])
    return root
def test_keyterm_see(self):
    """Keyterm tags sometimes enclose phrases such as 'See also' because
    those tags are also used for emphasis; the keyterm should stop at the
    sentence end."""
    node = Node('(a) Apples. See Section 101.2', label=['101', '22', 'a'])
    node.tagged_text = '(a) <E T="03">Apples. See also</E>'
    results = KeyTerms(None).process(node)
    self.assertEqual('Apples.', results[0]['key_term'])
def test_emphasis_later(self):
    """Don't pick up something that is emphasized later in a paragraph as
    a key-term."""
    node = Node('(a) This has a list: apples et seq.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'
    self.assertEqual(KeyTerms(None).process(node), None)
def _assert_finds(self, tagged_text, *refs):
    """Compare the derived results to an expected number of references."""
    finder = def_finders.XMLTermMeans()
    text = re.sub(r"<[^>]*>", "", tagged_text)   # removes tags
    node = Node(text)
    node.tagged_text = tagged_text
    actual = finder.find(node)
    self.assertEqual(len(refs), len(actual))
    # FIX: the loop variable previously rebound `actual`, shadowing the
    # list being iterated; use a distinct name.
    for expected, found in zip(refs, actual):
        self.assertEqual(expected.term, found.term)
        self.assertEqual(expected.start, found.start)
def tree_with_paragraphs(self):
    """Fixture: part 205 with sections 1, 2 (children a and b), and 4."""
    para_a = Node("n2a", label=["205", "2", "a"])
    para_b = Node("n2b", label=["205", "2", "b"])
    section2 = Node("n2", label=["205", "2"])
    section2.children = [para_a, para_b]
    root = Node("", label=["205"])
    root.children = [Node("n1", label=["205", "1"]), section2,
                     Node("n4", label=["205", "4"])]
    return root
def test_node_definitions_xml_or(self):
    """Find xml definitions which are separated by `or`"""
    stack = ParentStack().add(0, Node(label=['9999']))
    node = Node("(i) Hot tamale or tamale means nom nom",
                label=['9999', '4'])
    node.tagged_text = ('(i) <E T="03">Hot tamale</E> or <E T="03"> '
                        'tamale</E> means nom nom ')
    included, _ = Terms(None).node_definitions(node, stack)
    self.assertEqual(len(included), 2)
    self.assertEqual(included[0], Ref('hot tamale', '9999-4', 4))
    self.assertEqual(included[1], Ref('tamale', '9999-4', 18))
def derive_nodes(self, xml, processor=None):
    """Split `xml` into one Node per marker, preserving each chunk's
    tagged text; append an INLINE_STARS placeholder when the final chunk
    ends in '* * *'.

    NOTE(review): `processor` is accepted but unused here — presumably
    kept for interface parity with other derive_nodes implementations.
    """
    nodes = []
    plain_text = ''   # survives the loop; stays '' when there are no markers
    for marker, plain_text, tagged_text in split_by_markers(xml):
        node = Node(text=plain_text.strip(), label=[marker],
                    source_xml=xml)
        node.tagged_text = six.text_type(tagged_text.strip())
        nodes.append(node)
    if plain_text.endswith('* * *'):    # last in loop
        nodes.append(Node(label=[mtypes.INLINE_STARS]))
    return nodes
def tree_with_paragraphs(self):
    """Build a part-205 fixture: sections 1, 2 (with paragraphs a/b), 4."""
    section2 = Node('n2', label=['205', '2'])
    section2.children = [Node('n2a', label=['205', '2', 'a']),
                         Node('n2b', label=['205', '2', 'b'])]
    root = Node('', label=['205'])
    root.children = [Node('n1', label=['205', '1']), section2,
                     Node('n4', label=['205', '4'])]
    return root
def test_move_interps(self):
    """RegulationTree.move relocates an interpretation paragraph
    (205-2-a-Interp-1) under a new label (205-4-c-Interp-5)."""
    interp = Node.INTERP
    n2a1 = Node('1. First', label=['205', '2', 'a', 'Interp', '1'],
                node_type=interp)
    n2a = Node('n2a', label=['205', '2', 'a', 'Interp'],
               node_type=interp, children=[n2a1])
    n2b = Node('n2b', label=['205', '2', 'b', 'Interp'], node_type=interp)
    n2 = Node('n2', label=['205', '2', 'Interp'], node_type=interp,
              children=[n2a, n2b])
    n4c = Node('n4c', label=['205', '4', 'c', 'Interp'], node_type=interp)
    n4 = Node('n4', label=['205', '4', 'Interp'], node_type=interp,
              children=[n4c])
    n1 = Node('n1', label=['205', '1', 'Interp'], node_type=interp)
    root = Node('', label=['205', 'Interp'], node_type=interp,
                children=[n1, n2, n4])
    reg_tree = compiler.RegulationTree(root)
    reg_tree.move('205-2-a-Interp-1', ['205', '4', 'c', 'Interp', '5'])
def test_underparagraph(self):
    """'underparagraphs (a)(4) through (5)' yields two citations."""
    text = 'Something something underparagraphs (a)(4) through (5)'
    node = Node(text, label=['1005', '6'])
    self.assertEqual(2, len(self.parser.process(node)))
def test_has_parent_definitions_indicator_p_marker(self):
    """A parent paragraph phrased 'For purposes of this section' signals
    that definitions follow."""
    stack = ParentStack()
    stack.add(0, Node("(a) Definitions. For purposes of this "
                      "section except blah"))
    self.assertTrue(Terms(None).has_parent_definitions_indicator(stack))
def test_multiple_paragraph_or(self):
    """ Ensure that an 'or' between internal citations is matched
    correctly. """
    text = u"set forth in paragraphs (b)(1) or (b)(2)"
    citations = self.parser.process(Node(text, label=['1005', '6']))
    # FIX: assertEquals is a deprecated alias for assertEqual.
    self.assertEqual(2, len(citations))
def derive_nodes(self, xml, processor=None):
    """Process a US-Code XML element into a single-node list.

    NOTE(review): the `processor` argument is ignored — a fresh
    USCodeProcessor is always created; the parameter is presumably kept
    for interface parity with other derive_nodes implementations.
    """
    processor = USCodeProcessor()
    node = Node(label=[mtypes.MARKERLESS], source_xml=xml)
    return [processor.process(xml, node)]
def test_pre_process(self):
    """End-to-end check of Terms.pre_process: a definition in an unnamed
    subpart scopes to the whole part, definitions in a named subpart scope
    to their section/paragraph, the subpart map is built, and every
    definition lands in the layer's `referenced` map."""
    noname_subpart = Node(
        '',
        label=['88', 'Subpart'],
        node_type=Node.EMPTYPART,
        children=[
            Node(u"Definition. For the purposes of this part, " +
                 u"“abcd” is an alphabet", label=['88', '1'])])
    xqxq_subpart = Node(
        '',
        title='Subpart XQXQ: The unreadable',
        label=['88', 'Subpart', 'XQXQ'],
        node_type=Node.SUBPART,
        children=[
            Node(label=['88', '2'], children=[
                Node(label=['88', '2', 'a'],
                     text="Definitions come later for the purposes of " +
                          "this section ",
                     children=[
                         Node(u"“AXAX” means axe-cop",
                              label=['88', '2', 'a', '1'])]),
                Node(label=['88', '2', 'b'], children=[
                    Node(label=['88', '2', 'b', 'i'], children=[
                        Node(label=['88', '2', 'b', 'i', 'A'],
                             text=u"Definition. “Awesome sauce” means " +
                                  "great for the purposes of this " +
                                  "paragraph",)])])])])
    tree = Node(label=['88'], children=[noname_subpart, xqxq_subpart])
    t = Terms(tree)
    t.pre_process()

    # The unnamed subpart's definition is scoped to the whole part
    self.assertTrue(('88',) in t.scoped_terms)
    self.assertEqual([Ref('abcd', '88-1', (44, 48))],
                     t.scoped_terms[('88',)])
    # "for the purposes of this section" scopes to the section
    self.assertTrue(('88', '2') in t.scoped_terms)
    self.assertEqual([Ref('axax', '88-2-a-1', (1, 5))],
                     t.scoped_terms[('88', '2')])
    # "for the purposes of this paragraph" scopes to the paragraph
    self.assertTrue(('88', '2', 'b', 'i', 'A') in t.scoped_terms)
    self.assertEqual([Ref('awesome sauce', '88-2-b-i-A', (13, 26))],
                     t.scoped_terms[('88', '2', 'b', 'i', 'A')])

    # Check subparts are correct
    self.assertEqual({None: ['1'], 'XQXQ': ['2']}, dict(t.subpart_map))

    # Finally, make sure the references are added
    referenced = t.layer['referenced']
    self.assertTrue('abcd:88-1' in referenced)
    self.assertEqual('abcd', referenced['abcd:88-1']['term'])
    self.assertEqual('88-1', referenced['abcd:88-1']['reference'])
    self.assertEqual((44, 48), referenced['abcd:88-1']['position'])
    self.assertTrue('axax:88-2-a-1' in referenced)
    self.assertEqual('axax', referenced['axax:88-2-a-1']['term'])
    self.assertEqual('88-2-a-1',
                     referenced['axax:88-2-a-1']['reference'])
    self.assertEqual((1, 5), referenced['axax:88-2-a-1']['position'])
    self.assertTrue('awesome sauce:88-2-b-i-A' in referenced)
    self.assertEqual('awesome sauce',
                     referenced['awesome sauce:88-2-b-i-A']['term'])
    self.assertEqual('88-2-b-i-A',
                     referenced['awesome sauce:88-2-b-i-A']['reference'])
    self.assertEqual((13, 26),
                     referenced['awesome sauce:88-2-b-i-A']['position'])
def test_write_notice(self, mock_preamble, mock_fdsys,
                      mock_build_analysis):
    """write_notice requires a reg_tree, then emits a changeset whose
    change elements mirror the `changes` dict and inlines the (mocked)
    analysis, fdsys and preamble sections."""
    changes = {'1234-2': {'op': 'modified'},
               '1234-3': {'op': 'deleted'},
               '1234-4': {'op': 'added'}}
    reg_tree = Node("I'm the root", label=['1234'], children=[
        Node("I'll get analysis", label=['1234', '1']),
        Node("I will be modified", label=['1234', '2']),
        Node("I will be deleted", label=['1234', '3']),
        Node("I will be added", label=['1234', '4']),
    ])
    # Ensure we have some analysis just to include
    layers = {'analyses': {'1234-1': [{}]}}
    mock_build_analysis.return_value = etree.fromstring(""" <analysisSection target="1234-1" notice="2015-12345" date=""> This is some analysis </analysisSection> """)
    # An FDSYS
    mock_fdsys.return_value = etree.fromstring(""" <fdsys> This is an fdsys </fdsys> """)
    # A preamble
    mock_preamble.return_value = etree.fromstring(""" <preamble> This is the preamble </preamble> """)
    writer = XMLWriteContent("a/path", '2015-12345',
                             layers=layers, notices={})
    # Without reg_tree
    with self.assertRaises(RuntimeError):
        writer.write_notice({})
    # Write a notice file
    mock_file = mock_open()
    with patch.object(builtins, 'open', mock_file, create=True):
        writer.write_notice({}, changes=changes, reg_tree=reg_tree,
                            left_doc_number='2015-01234')
    # Get the resulting XML
    file_handle = mock_file()
    xml_string = file_handle.write.call_args[0][0]
    notice_xml = etree.fromstring(xml_string)
    # Introspect our changes
    changeset = notice_xml.find('.//{eregs}changeset')
    self.assertEqual('2015-01234', changeset.get('leftDocumentNumber'))
    self.assertEqual('2015-12345', changeset.get('rightDocumentNumber'))
    changes = notice_xml.findall('.//{eregs}change')
    # Four change elements from three ops: a 'modified' node emits two
    self.assertEqual(len(changes), 4)
    self.assertEqual(
        2, len([c for c in changes if c.get('operation') == 'modified']))
    self.assertEqual(
        1, len([c for c in changes if c.get('operation') == 'deleted']))
    self.assertEqual(
        1, len([c for c in changes if c.get('operation') == 'added']))
    self.assertEqual(
        1, len(notice_xml.findall('./{eregs}analysis')))
def test_node_definitions(self):
    """Exercise Terms.node_definitions across every definition style:
    smart-quoted terms (recognized only under a 'Definitions' parent),
    XML-emphasis "X means ..." patterns, and scope-narrowing phrases."""
    t = Terms(None)
    # Smart-quoted candidate terms and the Refs expected once a
    # 'Definitions' parent is on the stack
    smart_quotes = [
        (u'This has a “worD” and then more',
         [Ref('word', 'aaa', (12, 16))]),
        (u'I have “anotheR word” term and “moree”',
         [Ref('another word', 'bbb', (8, 20)),
          Ref('moree', 'bbb', (32, 37))]),
        (u'But the child “DoeS sEe”?',
         [Ref('does see', 'ccc', (15, 23))]),
        (u'Start with “this,”', [Ref('this', 'hhh', (12, 16))]),
        (u'Start with “this;”', [Ref('this', 'iii', (12, 16))]),
        (u'Start with “this.”', [Ref('this', 'jjj', (12, 16))]),
        (u'As do “subchildren”', [Ref('subchildren', 'ddd', (7, 18))])
    ]
    no_defs = [
        u'This has no defs',
        u'Also has no terms',
        u'Still no terms, but',
        u'the next one does'
    ]
    # (plain text, tagged text, expected Ref) triples
    xml_defs = [
        (u'(4) Thing means a thing that is defined',
         u'(4) <E T="03">Thing</E> means a thing that is defined',
         Ref('thing', 'eee', (4, 9))),
        (u'(e) Well-meaning lawyers means people who do weird things',
         u'(e) <E T="03">Well-meaning lawyers</E> means people who do '
         u'weird things',
         Ref('well-meaning lawyers', 'fff', (4, 24))),
        (u'(e) Words have the same meaning as in a dictionary',
         u'(e) <E T="03">Words</E> have the same meaning as in a '
         u'dictionary',
         Ref('words', 'ffg', (4, 9))),
        (u'(e) Banana has the same meaning as bonono',
         u'(e) <E T="03">Banana</E> has the same meaning as bonono',
         Ref('banana', 'fgf', (4, 10))),
        (u'(f) Huge billowy clouds means I want to take a nap',
         u'(f) <E T="03">Huge billowy clouds</E> means I want to take a '
         u'nap',
         Ref('huge billowy clouds', 'ggg', (4, 23))),
        (u'(v) Lawyers, in relation to coders, means something very '
         u'different',
         u'(v) <E T="03">Lawyers</E>, in relation to coders, means '
         u'something very different',
         Ref(u'lawyers', '', (4, 11))),
    ]
    xml_no_defs = [
        (u'(d) Term1 or term2 means stuff',
         u'(d) <E T="03">Term1</E> or <E T="03">term2></E> means stuff')
    ]
    # Definitions whose own text narrows the scope
    scope_term_defs = [
        ('For purposes of this section, the term blue means the color',
         Ref('blue', '11-11', (39, 43))),
        ('For purposes of paragraph (a)(1) of this section, the term ' +
         'cool bro means hip cat',
         Ref('cool bro', '11-22', (59, 67))),
        ('For purposes of this paragraph, po jo means "poor Joe"',
         Ref('po jo', '11-33', (32, 37)))
    ]

    stack = ParentStack()
    stack.add(0, Node(label=['999']))
    # Without a 'Definitions' parent, smart quotes yield nothing
    for txt in no_defs:
        defs, exc = t.node_definitions(Node(txt), stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, refs in smart_quotes:
        defs, exc = t.node_definitions(Node(txt), stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, xml in xml_no_defs:
        node = Node(txt)
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    # Emphasis-based definitions work regardless of the parent
    for txt, xml, ref in xml_defs:
        node = Node(txt, label=[ref.label])
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([ref], defs)
        self.assertEqual([], exc)
    for txt, ref in scope_term_defs:
        defs, exc = t.node_definitions(
            Node(txt, label=ref.label.split('-')), stack)
        self.assertEqual([ref], defs)
        self.assertEqual([], exc)

    # smart quotes are affected by the parent
    stack.add(1, Node('Definitions', label=['999', '1']))
    for txt in no_defs:
        defs, exc = t.node_definitions(Node(txt), stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, refs in smart_quotes:
        defs, exc = t.node_definitions(Node(txt, label=[refs[0].label]),
                                       stack)
        self.assertEqual(refs, defs)
        self.assertEqual([], exc)
    for txt, xml in xml_no_defs:
        node = Node(txt)
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, xml, ref in xml_defs:
        node = Node(txt, label=[ref.label])
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([ref], defs)
        self.assertEqual([], exc)
def test_determine_scope(self):
    """determine_scope walks the parent stack: 'this part' scopes to the
    part, 'this subpart' expands via subpart_map, 'this section' /
    'this paragraph' scope to the nearest matching label, and each scope
    also includes its interpretation (Interp) counterpart."""
    stack = ParentStack()
    t = Terms(None)
    stack.add(0, Node(label=['1000']))
    stack.add(1, Node(label=['1000', '1']))

    # Defaults to the entire reg
    self.assertEqual([('1000',)], t.determine_scope(stack))

    # 'this part' -> the part plus its interpretations
    stack.add(1, Node('For the purposes of this part, blah blah',
                      label=['1001', '2']))
    self.assertEqual([('1001',), ('1001', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # 'this subpart' expands to every section in the subpart
    t.subpart_map = {
        'SubPart 1': ['A', '3'],
        'Other': []
    }
    stack.add(1, Node(label=['1000', '3']))
    stack.add(2, Node('For the purposes of this subpart, yada yada',
                      label=['1000', '3', 'c']))
    self.assertEqual([('1000', 'A'), ('1000', '3'),
                      ('1000', 'A', Node.INTERP_MARK),
                      ('1000', '3', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # 'this section'
    stack.add(2, Node('For the purposes of this section, blah blah',
                      label=['1000', '3', 'd']))
    self.assertEqual([('1000', '3'), ('1000', '3', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # 'this paragraph'
    stack.add(3, Node('For the purposes of this paragraph, blah blah',
                      label=['1000', '3', 'd', '5']))
    self.assertEqual([('1000', '3', 'd', '5'),
                      ('1000', '3', 'd', '5', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # A sibling without scope language falls back to the section scope
    stack.add(3, Node(label=['1002', '3', 'd', '6']))
    self.assertEqual([('1000', '3'), ('1000', '3', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # 'as used in this paragraph'
    stack.add(3, Node('Blah as used in this paragraph, blah blah',
                      label=['1000', '3', 'd', '7']))
    self.assertEqual([('1000', '3', 'd', '7'),
                      ('1000', '3', 'd', '7', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # Explicit citation form
    stack.add(4, Node(u'For the purposes of this § 1000.3(d)(6)(i), blah',
                      label=['1000', '3', 'd', '6', 'i']))
    self.assertEqual([('1000', '3', 'd', '6', 'i'),
                      ('1000', '3', 'd', '6', 'i', Node.INTERP_MARK)],
                     t.determine_scope(stack))
    stack.add(4, Node(u'For the purposes of § 1000.3, blah',
                      label=['1000', '3', 'd', '6', 'ii']))
    self.assertEqual([('1000', '3'), ('1000', '3', Node.INTERP_MARK)],
                     t.determine_scope(stack))

    # 'As used in this section'
    stack.add(4, Node('As used in this section, blah blah',
                      label=['1000', '3', 'd', '6', 'iii']))
    self.assertEqual(
        [('1000', '3'), ('1000', '3', Node.INTERP_MARK)],
        t.determine_scope(stack))
def test_pre_process_subpart(self):
    """Two subparts each define "totes": the first definition scopes to
    the whole part, the second (subpart-scoped by its section text) only
    to its own section."""
    def_one = Node(u"“totes” means in total", label=['1212', '2', 'a'])
    section_two = Node("", label=['1212', '2'], title='1212.2',
                       children=[def_one])
    subpart_a = Node("", label=['1212', 'Subpart', 'A'],
                     title='Subpart A', children=[section_two])
    def_two = Node(u"“totes” means in extremely",
                   label=['1212', '22', 'a'])
    section_twentytwo = Node("\nFor the purposes of this subpart",
                             label=['1212', '22'], title='1212.22',
                             children=[def_two])
    subpart_b = Node("", label=['1212', 'Subpart', 'B'],
                     title='Subpart B', children=[section_twentytwo])
    root = Node("", label=['1212'], children=[subpart_a, subpart_b])

    terms = Terms(root)
    terms.pre_process()

    self.assertTrue(('1212',) in terms.scoped_terms)
    self.assertEqual(len(terms.scoped_terms[('1212',)]), 1)
    self.assertEqual('1212-2-a', terms.scoped_terms[('1212',)][0].label)
    self.assertTrue(('1212', '22') in terms.scoped_terms)
    self.assertEqual(len(terms.scoped_terms[('1212', '22')]), 1)
    self.assertEqual('1212-22-a',
                     terms.scoped_terms[('1212', '22')][0].label)
def test_write(self):
    """Integration test: write a regulation tree to a git repository twice
    (two "versions"), then verify the on-disk directory layout and the
    resulting commit history."""
    # Build a small but representative tree: a subpart with a section and
    # two paragraphs, an appendix, and a Supplement I (interpretations).
    p3a = Node('(a) Par a', label=['1111', '3', 'a'])
    p3b = Node('(b) Par b', label=['1111', '3', 'b'])
    p3 = Node('Things like: ', label=['1111', '3'], title='Section 3',
              children=[p3a, p3b])
    sub = Node('', label=['1111', 'Subpart', 'E'], title='Subpart E',
               node_type=Node.SUBPART, children=[p3])
    a3a = Node('Appendix A-3(a)', label=['1111', 'A', '3(a)'],
               title='A-3(a) - Some Title', node_type=Node.APPENDIX)
    app = Node('', label=['1111', 'A'], title='Appendix A',
               node_type=Node.APPENDIX, children=[a3a])
    i3a1 = Node('1. P1', label=['1111', '3', 'a', 'Interp', '1'],
                node_type=Node.INTERP)
    i3a = Node('', label=['1111', '3', 'a', 'Interp'],
               node_type=Node.INTERP, children=[i3a1],
               title='Paragraph 3(a)')
    i31 = Node('1. Section 3', label=['1111', '3', 'Interp', '1'],
               node_type=Node.INTERP)
    i3 = Node('', label=['1111', '3', 'Interp'], node_type=Node.INTERP,
              title='Section 1111.3', children=[i3a, i31])
    i = Node('', label=['1111', 'Interp'], node_type=Node.INTERP,
             title='Supplement I', children=[i3])
    tree = Node('Root text', label=['1111'], title='Regulation Joe',
                children=[sub, app, i])

    # First write: version v1v1
    writer = GitWriteContent("/regulation/1111/v1v1")
    writer.write(tree)

    dir_path = settings.GIT_OUTPUT_DIR + "regulation" + os.path.sep
    dir_path += '1111' + os.path.sep

    self.assertTrue(os.path.exists(dir_path + '.git'))
    # Collect all dirs/files written, skipping git's own metadata
    dirs, files = [], []
    for dirname, child_dirs, filenames in os.walk(dir_path):
        if ".git" not in dirname:
            dirs.extend(
                os.path.join(dirname, c) for c in child_dirs
                if c != '.git')
            files.extend(os.path.join(dirname, f) for f in filenames)
    # Every node in the tree should map to a directory with an index.md
    for path in (('Subpart-E', ), ('Subpart-E', '3'),
                 ('Subpart-E', '3', 'a'), ('Subpart-E', '3', 'b'),
                 ('A', ), ('A', '3(a)'), ('Interp', ),
                 ('Interp', '3-Interp'), ('Interp', '3-Interp', '1'),
                 ('Interp', '3-Interp', 'a-Interp'),
                 ('Interp', '3-Interp', 'a-Interp', '1')):
        path = dir_path + os.path.join(*path)
        self.assertTrue(path in dirs)
        self.assertTrue(path + os.path.sep + 'index.md' in files)

    # Second write: rename paragraph (b) to (c) and commit as v2v2
    p3c = p3b
    p3c.text = '(c) Moved!'
    p3c.label = ['1111', '3', 'c']

    writer = GitWriteContent("/regulation/1111/v2v2")
    writer.write(tree)

    dir_path = settings.GIT_OUTPUT_DIR + "regulation" + os.path.sep
    dir_path += '1111' + os.path.sep

    self.assertTrue(os.path.exists(dir_path + '.git'))
    dirs, files = [], []
    for dirname, child_dirs, filenames in os.walk(dir_path):
        if ".git" not in dirname:
            dirs.extend(
                os.path.join(dirname, c) for c in child_dirs
                if c != '.git')
            files.extend(os.path.join(dirname, f) for f in filenames)
    # Paragraph 'c' replaces 'b' in the new layout
    for path in (('Subpart-E', ), ('Subpart-E', '3'),
                 ('Subpart-E', '3', 'a'), ('Subpart-E', '3', 'c'),
                 ('A', ), ('A', '3(a)'), ('Interp', ),
                 ('Interp', '3-Interp'), ('Interp', '3-Interp', '1'),
                 ('Interp', '3-Interp', 'a-Interp'),
                 ('Interp', '3-Interp', 'a-Interp', '1')):
        path = dir_path + os.path.join(*path)
        self.assertTrue(path in dirs)
        self.assertTrue(path + os.path.sep + 'index.md' in files)
    self.assertFalse(dir_path + os.path.join('Subpart-E', '3', 'b')
                     in dirs)

    # History: v2v2 on top of v1v1 on top of the initial '1111' commit
    commit = Repo(dir_path).head.commit
    self.assertTrue('v2v2' in commit.message)
    self.assertEqual(1, len(commit.parents))
    commit = commit.parents[0]
    self.assertTrue('v1v1' in commit.message)
    self.assertEqual(1, len(commit.parents))
    commit = commit.parents[0]
    self.assertTrue('1111' in commit.message)
    self.assertEqual(0, len(commit.parents))
def derive_nodes(self, xml, processor=None): node = Node(table_xml_to_plaintext(xml), label=[mtypes.MARKERLESS], source_xml=xml) node.tagged_text = etree.tounicode(xml).strip() return [node]
def test_pre_process(self): el = ExampleLayer(Node('some text')) self.assertEqual(None, el.pre_process())
def derive_nodes(self, xml, processor=None): text = '' for gid_xml in xml.xpath('./GID'): text += '![]({0})'.format(gid_xml.text) return [Node(text, label=[mtypes.MARKERLESS])]
def derive_nodes(self, xml, processor=None): tagged = tree_utils.get_node_text_tags_preserved(xml).strip() return [Node(text=tree_utils.get_node_text(xml).strip(), tagged_text=tagged, label=[mtypes.MARKERLESS])]
def derive_nodes(self, xml, processor=None): processor = FlatParagraphProcessor() text = (xml.text or '').strip() node = Node(text=text, node_type=self.node_type, label=[mtypes.MARKERLESS]) return [processor.process(xml, node)]
def derive_nodes(self, xml, processor=None): return [Node(label=[mtypes.STARS_TAG])]
def test_process_method(self): node = Node("The requirements in paragraph (a)(4)(iii) of", label=['1005', '6']) citations = self.parser.process(node) self.assertEqual(len(citations), 1)
def derive_nodes(self, xml, processor=None): processor = SimpleHierarchyProcessor() node = Node(label=[mtypes.MARKERLESS], source_xml=xml, node_type=self.node_type) return [processor.process(xml, node)]
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: parent stack; derived nodes are pushed onto it at
        depths >= 3
    :param xml_node: the title element; its *siblings* (up to the next
        title) are consumed here
    """
    # Everything between this title and the next is a child paragraph
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # No marker found: treat this paragraph as a continuation of
            # the previous node rather than a sibling
            logger.warning("Couldn't determine interp marker. Appending to "
                           "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            # A paragraph may carry several markers collapsed into one
            # body of text, e.g. "1. Intro. i. Detail"
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    # Disambiguate an integer '1' from an emphasized '1'
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper),
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip the emphasis tags introduced above before storing
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                # Interp children start at depth 3 within the stack
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def test_process(self): root = Node(children=[ Node("Interp11a", [Node("child1"), Node("child2")], ['102', '11', 'a', Node.INTERP_MARK], node_type=Node.INTERP), Node("Interp11c5v", label=['102', '11', 'c', '5', 'v', Node.INTERP_MARK], node_type=Node.INTERP), Node("InterpB5ii", label=['102', 'B', '5', 'ii', Node.INTERP_MARK], node_type=Node.INTERP), Node(children=[ Node(children=[ Node("Interp9c1", label=['102', '9', 'c', '1', Node.INTERP_MARK], node_type=Node.INTERP) ], label=['102']) ]) ]) interp = Interpretations(root) interp.pre_process() interp11a = interp.process(Node(label=['102', '11', 'a'])) interp11c5v = interp.process(Node(label=['102', '11', 'c', '5', 'v'])) interpB5ii = interp.process(Node(label=['102', 'B', '5', 'ii'])) interp9c1 = interp.process(Node(label=['102', '9', 'c', '1'])) self.assertEqual(1, len(interp11a)) self.assertEqual(1, len(interp11c5v)) self.assertEqual(1, len(interpB5ii)) self.assertEqual(1, len(interp9c1)) self.assertEqual('102-11-a-Interp', interp11a[0]['reference']) self.assertEqual('102-11-c-5-v-Interp', interp11c5v[0]['reference']) self.assertEqual('102-B-5-ii-Interp', interpB5ii[0]['reference']) self.assertEqual('102-9-c-1-Interp', interp9c1[0]['reference']) self.assertEqual(None, interp.process(Node(label=["102", "10", "a"])))
def test_add_child_interp(self): reg_tree = compiler.RegulationTree(None) n1 = Node('n1', label=['205', '1', 'Interp']) n5 = Node('n5', label=['205', '5', 'Interp']) n9 = Node('n9', label=['205', '9', 'Interp']) n10 = Node('n10', label=['205', '10', 'Interp']) children = [n1, n5, n10] children = reg_tree.add_child(children, n9) self.assertEqual(children, [n1, n5, n9, n10]) n1.label = ['205', '1', 'a', '1', 'i', 'Interp'] n5.label = ['205', '1', 'a', '1', 'v', 'Interp'] n9.label = ['205', '1', 'a', '1', 'ix', 'Interp'] n10.label = ['205', '1', 'a', '1', 'x', 'Interp'] children = [n1, n5, n10] children = reg_tree.add_child(children, n9) self.assertEqual(children, [n1, n5, n9, n10]) n1.label = ['205', '1', 'a', 'Interp', '1', 'i'] n5.label = ['205', '1', 'a', 'Interp', '1', 'v'] n9.label = ['205', '1', 'a', 'Interp', '1', 'ix'] n10.label = ['205', '1', 'a', 'Interp', '1', 'x'] children = [n1, n5, n10] children = reg_tree.add_child(children, n9) self.assertEqual(children, [n1, n5, n9, n10]) n1.label = ['205', '1', 'Interp', '1'] n5.label = ['205', '1', 'a', 'Interp'] children = [n1] children = reg_tree.add_child(children, n5) self.assertEqual(children, [n1, n5]) children = [n5] children = reg_tree.add_child(children, n1) self.assertEqual(children, [n1, n5])
def test_cfr_format(self): """We aren't processing this form yet""" text = "12 CFR 1026.3(d)" result = self.parser.process(Node(text, label=['1111'])) self.assertEqual(None, result)
def test_process(self): el = ExampleLayer(Node("other text")) self.assertEqual(NotImplemented, el.process(Node("oo")))
def test_to_xml_interp(self):
    """ Test that interpretations get formatted correctly: paragraph
    targets, keyterm titles (with keyterm text stripped from content),
    and paragraph markers. """
    # Supplement I tree: part -> section interp -> paragraph interp with
    # two keyterm sub-paragraphs
    interp_nodes = Node(
        text=u'',
        children=[
            Node(text=u'Interp for section',
                 children=[
                     Node(text=u'Interp targetting reg paragraph',
                          children=[
                              Node(text=u'A Keyterm. Interp sp.',
                                   children=[],
                                   label=[u'1111', u'1', 'a', u'Interp',
                                          u'1'],
                                   title=None,
                                   node_type=u'interp'),
                              Node(text=u'Lone Keyterm. Or not.',
                                   children=[],
                                   label=[u'1111', u'1', 'a', u'Interp',
                                          u'2'],
                                   title=None,
                                   node_type=u'interp'),
                          ],
                          label=[u'1111', u'1', 'a', u'Interp'],
                          title=u'1111.1 (a) Interp',
                          node_type=u'interp'),
                 ],
                 label=[u'1111', u'1', u'Interp'],
                 title=u'1111.1 Interp',
                 node_type=u'interp'),
        ],
        label=[u'1111', u'Interp'],
        title=u'Interpretations',
        node_type=u'interp')
    # Layer data the writer consults while serializing
    layers = {
        'terms': {
            "1111-1-a-Interp-2": [{
                "offsets": [[0, 12]],
                "ref": "lone keyterm:1111-1-a"
            }],
            'referenced': {}},
        'graphics': {},
        'keyterms': {
            u'1111-1-a-Interp-1': [{'locations': [0],
                                    'key_term': u'A Keyterm.'}],
            u'1111-1-a-Interp-2': [{'locations': [0],
                                    'key_term': u'Lone Keyterm.'}],
        },
        'interpretations': {
            u'1111-1-a': [{'reference': u'1111-1-a-Interp'}],
        },
        'paragraph-markers': {
            u'1111-1-a-Interp-1': [{"text": "1.", "locations": [0]}],
            u'1111-1-a-Interp-2': [{"text": "2.", "locations": [0]}],
        },
    }
    notices = [{
        'document_number': '2015-12345',
    }]

    writer = XMLWriteContent("a/path", '2015-12345',
                             layers=layers, notices=notices)
    elm = writer.to_xml(interp_nodes)

    interp_para = elm.find(
        './/interpParagraph[@label="1111-1-a-Interp"]')
    interp_sub_paras = interp_para.findall(
        'interpParagraph')

    # Check that paragraph targets are correct.
    self.assertEqual(interp_para.get('target'), '1111-1-a')
    self.assertEqual(interp_sub_paras[0].get('target'), None)

    # Check that title keyterm is correct
    self.assertNotEqual(interp_para.find('title'), None)
    self.assertEqual(interp_sub_paras[0].find('title').get('type'),
                     'keyterm')
    # The keyterm moved to <title> and must not remain in <content>
    self.assertTrue('A Keyterm.'
                    not in interp_sub_paras[0].find('content').text)

    # For the second sub para there should be a <ref> in <title> and
    # nothing in content
    self.assertEqual(interp_sub_paras[1].find('title').get('type'),
                     'keyterm')
    self.assertTrue(interp_sub_paras[1].find('content').text is None)
    # self.assertTrue(len(interp_sub_paras[1].find('content')) is 0)

    # Check that paragraph markers are correct
    self.assertEqual(interp_para.get('marker'), None)
    self.assertEqual(interp_sub_paras[0].get('marker'), '1.')
    self.assertEqual(interp_sub_paras[1].get('marker'), '2.')
def test_replace_node_and_subtree(self): n1 = Node('n1', label=['205', '1']) n2 = Node('n2', label=['205', '2']) n4 = Node('n4', label=['205', '4']) n2a = Node('n2a', label=['205', '2', 'a']) n2b = Node('n2b', label=['205', '2', 'b']) n2.children = [n2a, n2b] root = Node('', label=['205']) root.children = [n1, n2, n4] reg_tree = compiler.RegulationTree(root) a2 = Node('a2', label=['205', '2']) a2e = Node('a2e', label=['205', '2', 'e']) a2f = Node('a2f', label=['205', '2', 'f']) a2.children = [a2e, a2f] reg_tree.replace_node_and_subtree(a2) new_tree = Node('', label=[205]) new_tree.children = [n1, a2, n4] self.assertEqual(new_tree, reg_tree.tree) self.assertEqual(None, find(reg_tree.tree, '205-2-a'))
def test_look_for_defs(self, node_definitions): """We should be walking through the tree to find terms. Test this by documenting which nodes are touched. We should be _ignoring_ certain subtrees (notable, any which aren't associated w/ regtext)""" node_definitions.side_effect = lambda n, _: ([], [n.label_id()]) t = Terms(None) root = Node(label=['111'], children=[ Node(label=['111', 'Subpart'], node_type=Node.EMPTYPART, children=[ Node(label=['111', '1'], children=[ Node(label=['111', '1', 'a']), Node(label=['111', '1', 'b']), Node(label=['111', '1', 'c'])]), Node(label=['111', '2'], children=[ Node(label=['111', '2', 'p1'], node_type=Node.EXTRACT, children=[Node(label=['111', '2', 'p1', 'p1'])]) ])]), Node(label=['111', 'A'], node_type=Node.APPENDIX, children=[ Node(label=['111', 'A', '1'], node_type=Node.APPENDIX)])]) t.look_for_defs(root) self.assertItemsEqual( t.scoped_terms['EXCLUDED'], # note the absence of APPENDIX, and anything below an EXTRACT ['111', '111-Subpart', '111-1', '111-1-a', '111-1-b', '111-1-c', '111-2'])
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: parent stack; derived nodes are pushed onto it at
        depths >= 3
    :param xml_node: the title element; its *siblings* (up to the next
        title) are consumed here
    """
    # Manual hierarchy should work here too: if PARAGRAPH_HIERARCHY holds a
    # pre-specified depth list for this "part.section-Interp", prefer it.
    manual_hierarchy_flag = False
    try:
        # Fix: raw string for the regex (avoids the invalid '\.' escape)
        part_and_section = re.search(r'[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy_flag = True
    except (AttributeError, TypeError):
        # AttributeError: the regex found no match (None.group);
        # TypeError: the title element has no text at all.
        # Either way, fall back to derived depths.
        pass

    # Everything between this title and the next is a child paragraph
    children = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # No marker: keep the paragraph as its own node with a
            # positional label; the manual hierarchy can still place it
            logging.warning("Couldn't determine interp marker. Appending "
                            "node and hoping that manual hierarchy is "
                            "specified")
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        else:
            # A paragraph may carry several markers collapsed into one
            # body of text, e.g. "1. Intro. i. Detail"
            collapsed = collapsed_markers_matches(node_text, text_with_tags)
            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    # Disambiguate an integer '1' from an emphasized '1'
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments,
    # unless a manual hierarchy was specified.
    # Fix: initialize depths so it is always bound, even when the
    # derivation is skipped.
    depths = None
    if not manual_hierarchy_flag:
        depths = derive_depths(
            [n.label[0] for n in nodes],
            [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                     (mtypes.roman, mtypes.upper),
                                     mtypes.upper, mtypes.em_ints,
                                     mtypes.em_roman])])

    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip the emphasis tags introduced above before storing
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[part][part_and_section]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [
                    l.replace('<E T="03">', '').replace('</E>', '')
                    for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!')
    elif nodes and not manual_hierarchy_flag:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [n.label[0] for n in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [
                l.replace('<E T="03">', '').replace('</E>', '')
                for l in node.label]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
def paragraph_no_marker(self, text): """The paragraph has no (a) or a. etc.""" self.paragraph_counter += 1 n = Node(text, node_type=Node.APPENDIX, label=['p' + str(self.paragraph_counter)]) self.nodes.append(n)
def derive_nodes(self, xml, processor=None): # This should match HD elements only at lower levels, and for now we'll # just put them into the titles return [Node(text='', title=tree_utils.get_node_text(xml).strip(), label=[mtypes.MARKERLESS])]