def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc.

    Splits the text into sub-paragraphs and appends one APPENDIX node
    per piece to ``self.nodes``.
    """
    # To aid in determining collapsed paragraphs, replace any keyterm
    # present with a same-length placeholder; keyterm punctuation would
    # otherwise confuse the paragraph splitter
    node_for_keyterms = Node(text, node_type=Node.APPENDIX)
    node_for_keyterms.tagged_text = tagged_text
    node_for_keyterms.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)
    if keyterm:
        mtext = text.replace(keyterm, ';' * len(keyterm))
    else:
        mtext = text

    for mtext in split_paragraph_text(mtext):
        if keyterm:     # still need the original text
            mtext = mtext.replace(';' * len(keyterm), keyterm)
        # (removed a long block of commented-out duplicate-label
        # handling that was dead code)
        node = Node(mtext, node_type=Node.APPENDIX,
                    label=[initial_marker(mtext)[0]])
        node.tagged_text = tagged_text
        self.nodes.append(node)
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc."""
    # Keyterms may contain punctuation that trips up the paragraph
    # splitter, so temporarily mask any keyterm with a same-length run
    # of semicolons before splitting
    keyterm_probe = Node(text, node_type=Node.APPENDIX)
    keyterm_probe.tagged_text = tagged_text
    keyterm_probe.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(keyterm_probe)

    if keyterm:
        masked = text.replace(keyterm, ';' * len(keyterm))
    else:
        masked = text

    for piece in split_paragraph_text(masked):
        if keyterm:     # restore the original keyterm text
            piece = piece.replace(';' * len(keyterm), keyterm)
        new_node = Node(piece, node_type=Node.APPENDIX,
                        label=[initial_marker(piece)[0]])
        new_node.tagged_text = tagged_text
        self.nodes.append(new_node)
def test_dict_to_node(self):
    """dict_to_node builds a Node (carrying tagged_text when present)
    from a complete dict and passes incomplete dicts through
    unchanged."""
    dict_node = {
        'text': 'node text',
        'label': ['205', 'A'],
        'node_type': 'appendix'}
    node = compiler.dict_to_node(dict_node)
    self.assertEqual(
        node,
        Node('node text', [], ['205', 'A'], None, 'appendix'))

    # tagged_text should be carried over onto the created node.
    # (Removed a duplicate dict_to_node call whose result was unused.)
    dict_node['tagged_text'] = '<E> Tagged </E> text.'
    actual_node = Node('node text', [], ['205', 'A'], None, 'appendix')
    actual_node.tagged_text = '<E> Tagged </E> text.'
    created_node = compiler.dict_to_node(dict_node)
    self.assertEqual(actual_node, created_node)
    self.assertEqual(actual_node.tagged_text, created_node.tagged_text)

    # Missing required fields: the dict is returned as-is
    dict_node = {
        'text': 'node text'
    }
    node = compiler.dict_to_node(dict_node)
    self.assertEqual(node, dict_node)
def collapsed_markers_matches(node_text, tagged_text):
    """Find collapsed markers, i.e. tree node paragraphs that begin
    within a single XML node, within this text. Remove citations and
    other false positives. This is pretty hacky right now -- it focuses
    on the plain text but takes cues from the tagged text. @todo:
    streamline logic"""
    # In addition to the regex above, keyterms are an acceptable prefix.
    # We therefore convert keyterms to satisfy the above regex
    node_for_keyterms = Node(node_text, node_type=Node.INTERP,
                             label=[get_first_interp_marker(node_text)])
    node_for_keyterms.tagged_text = tagged_text
    keyterm = KeyTerms.get_keyterm(node_for_keyterms)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    collapsed_markers = []
    for marker in _first_markers:
        possible = [(m, m.start(), m.end())
                    for m in marker.finditer(node_text) if m.start() > 0]
        possible = remove_citation_overlaps(node_text, possible)
        # If certain characters follow, kill it
        for following in ("e.", ")", u"”", '"', "'"):
            possible = [(m, s, end) for m, s, end in possible
                        if not node_text[end:].startswith(following)]
        possible = [m for m, _, _ in possible]
        # As all "1." collapsed markers must be emphasized, run a quick
        # check to weed out some false positives.  A list comprehension
        # (not `filter` + lambda) keeps list semantics on Python 3,
        # where filter() returns a lazy iterator
        if '<E T="03">1' not in tagged_text:
            possible = [m for m in possible if m.group(1) != '1']
        collapsed_markers.extend(possible)
    return collapsed_markers
def test_keyterm_is_first_not_first(self):
    """Emphasis that appears later in the paragraph is not 'first'."""
    node = Node('(a) This has a list: apples et seq.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'
    self.assertFalse(KeyTerms(None).keyterm_is_first(node, 'et seq.'))
def test_keyterm_definition(self):
    """Phrases that *define* a term should not be treated as keyterms.
    (Removed a dead fixture node that was built and then immediately
    overwritten without ever being processed.)"""
    node = Node("(a) Terminator means I'll be back",
                label=['101', '22', 'a'])
    node.tagged_text = """(a) <E T="03">Terminator</E> means I'll be """
    node.tagged_text += 'back'
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)

    node = Node("(1) Act means the Truth in Lending Act (15 U.S.C. 1601 "
                "et seq.).",
                label=['1026', '2', 'a', '1'])
    node.tagged_text = """(1) <E T="03">Act</E> means the Truth in Lending Act (15 U.S.C. 1601 <E T="03">et seq.</E>)."""
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)
def test_no_keyterm(self):
    """A paragraph without any emphasis yields no keyterm results."""
    node = Node('(a) Apples are grown in New Zealand.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) Apples are grown in New Zealand.'
    kt = KeyTerms(None)
    results = kt.process(node)
    # assertEquals is a deprecated alias; use assertEqual
    self.assertEqual(results, None)
def collapsed_markers_matches(node_text, tagged_text):
    """Locate "collapsed" markers -- tree-node paragraphs that begin in
    the middle of a single XML node -- within this text, filtering out
    citations and other false positives.  Still hacky: works on the
    plain text while consulting the tagged text.  @todo: streamline
    logic"""
    # Keyterms are an acceptable prefix as well, so mask any keyterm
    # with periods to satisfy the marker regexes
    probe = Node(node_text, node_type=Node.INTERP,
                 label=[get_first_interp_marker(node_text)])
    probe.tagged_text = tagged_text
    keyterm = KeyTerms.keyterm_in_node(probe)
    if keyterm:
        node_text = node_text.replace(keyterm, '.' * len(keyterm))

    matches = []
    for regex in _first_markers:
        candidates = remove_citation_overlaps(
            node_text,
            [(m, m.start(), m.end()) for m in regex.finditer(node_text)])
        for match, _, _ in candidates:
            if not false_collapsed_marker(match, node_text, tagged_text):
                matches.append(match)
    return matches
def nodes_from_interp_p(xml_node):
    """Given an XML node that contains text for an interpretation
    paragraph, split it into sub-paragraphs and account for trailing
    stars"""
    node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
    first_marker = get_first_interp_marker(text_with_tags)
    collapsed = collapsed_markers_matches(node_text, text_with_tags)

    # -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    # Node for this paragraph (text up to the first collapsed marker)
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP)
    n.tagged_text = text_with_tags
    yield n
    if n.text.endswith('* * *'):
        yield Node(label=[mtypes.INLINE_STARS])

    # Collapsed-marker children
    for match, end in zip(collapsed, ends):
        marker = match.group(1)
        if marker == '1':
            # a collapsed "1." marker is stored in its emphasized form
            # -- presumably to match how it appears in the tagged text;
            # TODO confirm against downstream consumers
            marker = '<E T="03">1</E>'
        n = Node(node_text[match.end() - 2:end], label=[marker],
                 node_type=Node.INTERP)
        yield n
        if n.text.endswith('* * *'):
            yield Node(label=[mtypes.INLINE_STARS])
def test_keyterm_definition(self):
    """Phrases that *define* a term should not be treated as keyterms.
    (Removed a dead fixture node that was built and then immediately
    overwritten without ever being processed.)"""
    node = Node("(a) Terminator means I'll be back",
                label=['101', '22', 'a'])
    node.tagged_text = """(a) <E T="03">Terminator</E> means I'll be """
    node.tagged_text += 'back'
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)

    node = Node(
        "(1) Act means the Truth in Lending Act (15 U.S.C. 1601 et seq.).",
        label=['1026', '2', 'a', '1'])
    node.tagged_text = """(1) <E T="03">Act</E> means the Truth in Lending Act (15 U.S.C. 1601 <E T="03">et seq.</E>)."""
    kt = KeyTerms(None)
    results = kt.process(node)
    self.assertEqual(results, None)
def test_keyterm_see(self):
    """Keyterm tags sometimes enclose phrases such as 'See also'
    because those tags are also used for emphasis."""
    node = Node('(a) Apples. See Section 101.2',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) <E T="03">Apples. See also</E>'
    results = KeyTerms(None).process(node)
    self.assertEqual('Apples.', results[0]['key_term'])
def test_keyterm_and_emphasis(self):
    """A leading keyterm is found even when unrelated emphasis appears
    later in the paragraph."""
    node = Node('(a) Apples. Apples are grown in New Zealand.',
                label=['101', '22', 'a'])
    node.tagged_text = ('(a) <E T="03">Apples.</E> Apples are grown in '
                        'New <E T="03">Zealand.</E>')
    results = KeyTerms(None).process(node)
    self.assertNotEqual(results, None)
    first = results[0]
    self.assertEqual(first['key_term'], 'Apples.')
    self.assertEqual(first['locations'], [0])
def test_emphasis_close_to_front(self):
    """An emphasized word is close to the front, but is not a key
    term."""
    node = Node('(a) T et seq. has a list: apples',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) T <E T="03">et seq.</E> has a list: apples'
    self.assertFalse(KeyTerms(None).keyterm_is_first(node, 'et seq.'))
def test_interpretation_markers(self):
    """Keyterms are also recognized in interpretation paragraphs."""
    node = Node('3. et seq. has a list: apples',
                label=['101', 'c', Node.INTERP_MARK, '3'],
                node_type=Node.INTERP)
    node.tagged_text = '3. <E T="03">et seq.</E> has a list: apples'
    results = KeyTerms(None).process(node)
    self.assertNotEqual(results, None)
    first = results[0]
    self.assertEqual(first['key_term'], 'et seq.')
    self.assertEqual(first['locations'], [0])
def _assert_finds(self, tagged_text, *refs):
    """Compare the derived results to an expected number of references"""
    finder = def_finders.XMLTermMeans()
    text = re.sub(r"<[^>]*>", "", tagged_text)  # removes tags
    node = Node(text)
    node.tagged_text = tagged_text
    actual = finder.find(node)
    self.assertEqual(len(refs), len(actual))
    # Distinct loop names: the original reused `actual` as both the
    # result list and the loop variable, shadowing the list mid-loop
    for expected, found in zip(refs, actual):
        self.assertEqual(expected.term, found.term)
        self.assertEqual(expected.start, found.start)
def test_node_definitions_multiple_xml(self):
    """Find xml definitions which are separated by `and`"""
    stack = ParentStack().add(0, Node(label=['9999']))
    winter = Node("(4) Cold and dreary mean winter.",
                  label=['9999', '4'])
    winter.tagged_text = ('(4) <E T="03">Cold</E> and '
                          '<E T="03">dreary</E> mean winter.')
    included, _ = Terms(None).node_definitions(winter, stack)
    self.assertEqual(len(included), 2)
    self.assertEqual(included[0], Ref('cold', '9999-4', 4))
    self.assertEqual(included[1], Ref('dreary', '9999-4', 13))
def assert_finds_result(self, tagged_text, parent_title, *refs):
    """Given the tags and a title for a parent node, verify that the
    provided references are found"""
    parent = Node(label=['1000', '1'], title=parent_title)
    plain = re.sub(r"<[^>]*>", "", tagged_text)     # strip the tags
    node = Node(plain)
    node.tagged_text = tagged_text
    found = def_finders.DefinitionKeyterm(parent).find(node)
    self.assertEqual(len(found), len(refs))
    for expected, actual in zip(refs, found):
        self.assertEqual(expected.term, actual.term)
        self.assertEqual(expected.start, actual.start)
def test_emphasis_later(self):
    """Don't pick up something that is emphasized later in a paragraph
    as a key-term."""
    node = Node('(a) This has a list: apples et seq.',
                label=['101', '22', 'a'])
    node.tagged_text = '(a) This has a list: apples <E T="03">et seq.</E>'
    self.assertEqual(KeyTerms(None).process(node), None)
def test_node_definitions_xml_or(self):
    """Find xml definitions which are separated by `or`"""
    stack = ParentStack().add(0, Node(label=['9999']))
    tamale = Node("(i) Hot tamale or tamale means nom nom",
                  label=['9999', '4'])
    tamale.tagged_text = ('(i) <E T="03">Hot tamale</E> or <E T="03"> '
                          'tamale</E> means nom nom ')
    included, _ = Terms(None).node_definitions(tamale, stack)
    self.assertEqual(len(included), 2)
    self.assertEqual(included[0], Ref('hot tamale', '9999-4', 4))
    self.assertEqual(included[1], Ref('tamale', '9999-4', 18))
def derive_nodes(self, xml, processor=None):
    """Build one Node per marker found within the XML element,
    attaching the tagged (markup-preserving) text to each."""
    nodes = []
    # initialized so the endswith() check below is safe when the loop
    # body never runs (no markers found)
    plain_text = ''
    for marker, plain_text, tagged_text in split_by_markers(xml):
        node = Node(text=plain_text.strip(), label=[marker],
                    source_xml=xml)
        node.tagged_text = six.text_type(tagged_text.strip())
        nodes.append(node)

    # checks the *final* paragraph's text for trailing inline stars
    if plain_text.endswith('* * *'):    # last in loop
        nodes.append(Node(label=[mtypes.INLINE_STARS]))
    return nodes
def test_node_definitions_xml_commas(self):
    """Find xml definitions which have commas separating them"""
    stack = ParentStack().add(0, Node(label=['9999']))
    summer = Node("(i) Hot, humid, or dry means summer.",
                  label=['9999', '4'])
    summer.tagged_text = ('(i) <E T="03">Hot</E>, <E T="03">humid</E>, '
                          'or <E T="03">dry</E> means summer.')
    included, _ = Terms(None).node_definitions(summer, stack)
    self.assertEqual(len(included), 3)
    self.assertEqual(included[0], Ref('hot', '9999-4', 4))
    self.assertEqual(included[1], Ref('humid', '9999-4', 9))
    self.assertEqual(included[2], Ref('dry', '9999-4', 19))
def derive_nodes(self, xml, processor=None):
    """Build one Node per marker found within the XML element,
    attaching the tagged (markup-preserving) text to each.  If the
    final paragraph's text ends with inline stars, append an
    INLINE_STARS placeholder node."""
    nodes = []
    # initialized so the endswith() check below is safe when no
    # markers are found
    plain_text = ''
    for marker, plain_text, tagged_text in split_by_markers(xml):
        node = Node(text=plain_text.strip(), label=[marker],
                    source_xml=xml)
        # six.text_type instead of the Python-2-only `unicode` builtin,
        # matching the sibling derive_nodes implementations
        node.tagged_text = six.text_type(tagged_text.strip())
        nodes.append(node)

    if plain_text.endswith('* * *'):    # last in loop
        nodes.append(Node(label=[mtypes.INLINE_STARS]))
    return nodes
def derive_nodes(self, xml, processor=None):
    """Build one Node per marker found in this XML element's text,
    appending an INLINE_STARS placeholder when the final paragraph's
    text ends with inline stars."""
    # initialized so the endswith() check below is safe when no
    # markers are found
    text = ''
    tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(tagged_text, self.next_marker(xml))
    nodes = []
    for m, node_text in get_markers_and_text(xml, markers_list):
        text, tagged_text = node_text
        node = Node(text=text.strip(), label=[m], source_xml=xml)
        # six.text_type instead of the Python-2-only `unicode` builtin,
        # matching the sibling derive_nodes implementations
        node.tagged_text = six.text_type(tagged_text.strip())
        nodes.append(node)
    if text.endswith('* * *'):
        nodes.append(Node(label=[mtypes.INLINE_STARS]))
    return nodes
def derive_nodes(self, xml, processor=None):
    """Split this element's text on its paragraph markers, producing
    one Node per marker with plain and tagged text attached."""
    nodes = []
    text = tree_utils.get_node_text(xml).strip()
    tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = self.paragraph_markers(text)
    with_parens = ['({})'.format(m) for m in markers_list]
    triplets = zip(markers_list,
                   tree_utils.split_text(text, with_parens),
                   tree_utils.split_text(tagged_text, with_parens))
    for m, text, tagged_text in triplets:
        node = Node(text=text.strip(), label=[m], source_xml=xml)
        # six.text_type instead of the Python-2-only `unicode` builtin,
        # matching the sibling derive_nodes implementations
        node.tagged_text = six.text_type(tagged_text.strip())
        nodes.append(node)
    return nodes
def derive_nodes(self, xml, processor=None):
    """Produce a single Node for this XML element, labeled with its
    leading marker -- or MARKERLESS if no marker matches."""
    text = tree_utils.get_node_text(xml).strip()
    node = Node(text=text, source_xml=xml)
    node.tagged_text = six.text_type(
        tree_utils.get_node_text_tags_preserved(xml).strip())
    # choose the marker style by the first character: "(a)" vs "a."
    regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
    match = regex.match(text)
    if match:
        node.label = [match.group('marker')]
    else:
        node.label = [mtypes.MARKERLESS]
    return [node]
def derive_nodes(self, xml, processor=None):
    """Produce a single Node for this XML element, labeled with its
    leading marker -- or MARKERLESS if no marker matches."""
    text = tree_utils.get_node_text(xml).strip()
    node = Node(text=text, source_xml=xml)
    # six.text_type instead of the Python-2-only `unicode` builtin,
    # matching the sibling derive_nodes implementations
    node.tagged_text = six.text_type(
        tree_utils.get_node_text_tags_preserved(xml).strip())
    # choose the marker style by the first character: "(a)" vs "a."
    regex = self._PAREN_REGEX if text[:1] == '(' else self._PERIOD_REGEX
    match = regex.match(text)
    if match:
        node.label = [match.group('marker')]
    else:
        node.label = [mtypes.MARKERLESS]
    return [node]
def derive_nodes(self, xml, processor=None):
    """Split this element's text on its paragraph markers, producing
    one Node per marker with plain and tagged text attached."""
    plain = tree_utils.get_node_text(xml).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers = self.paragraph_markers(plain)
    wrapped = ['({})'.format(marker) for marker in markers]
    pieces = zip(markers,
                 tree_utils.split_text(plain, wrapped),
                 tree_utils.split_text(tagged, wrapped))
    nodes = []
    for marker, plain_piece, tagged_piece in pieces:
        node = Node(text=plain_piece.strip(), label=[marker],
                    source_xml=xml)
        node.tagged_text = six.text_type(tagged_piece.strip())
        nodes.append(node)
    return nodes
def test_node_definitions_multiple_xml(self):
    """node_definitions should locate several definitions within a
    single paragraph: terms joined by 'and', by commas, and by 'or'."""
    t = Terms(None)
    stack = ParentStack()
    stack.add(0, Node(label=['9999']))

    # Two terms joined by "and"
    winter = Node("(4) Cold and dreary mean winter.",
                  label=['9999', '4'])
    tagged = '(4) <E T="03">Cold</E> and <E T="03">dreary</E> mean '
    tagged += 'winter.'
    winter.tagged_text = tagged
    inc, _ = t.node_definitions(winter, stack)
    self.assertEqual(len(inc), 2)
    cold, dreary = inc
    self.assertEqual(cold, Ref('cold', '9999-4', (4, 8)))
    self.assertEqual(dreary, Ref('dreary', '9999-4', (13, 19)))

    # Three terms separated by commas and "or"
    summer = Node("(i) Hot, humid, or dry means summer.",
                  label=['9999', '4'])
    tagged = '(i) <E T="03">Hot</E>, <E T="03">humid</E>, or '
    tagged += '<E T="03">dry</E> means summer.'
    summer.tagged_text = tagged
    inc, _ = t.node_definitions(summer, stack)
    self.assertEqual(len(inc), 3)
    hot, humid, dry = inc
    self.assertEqual(hot, Ref('hot', '9999-4', (4, 7)))
    self.assertEqual(humid, Ref('humid', '9999-4', (9, 14)))
    self.assertEqual(dry, Ref('dry', '9999-4', (19, 22)))

    # Multi-word term plus a second term joined by "or"
    tamale = Node("(i) Hot tamale or tamale means nom nom",
                  label=['9999', '4'])
    tagged = '(i) <E T="03">Hot tamale</E> or <E T="03"> tamale</E> '
    tagged += 'means nom nom '
    tamale.tagged_text = tagged
    inc, _ = t.node_definitions(tamale, stack)
    self.assertEqual(len(inc), 2)
    hot, tamale = inc
    self.assertEqual(hot, Ref('hot tamale', '9999-4', (4, 14)))
    self.assertEqual(tamale, Ref('tamale', '9999-4', (18, 24)))
def build_from_section(reg_part, section_xml):
    """Build section Node(s) from a SECTION XML element.  A SECTNO
    spanning more than three sections is kept as a single node titled
    with the span; shorter spans are expanded into one node per
    section number."""
    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = (subject_xml[0].text or '').strip()
    section_nums = []
    for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no):
        secnum_candidate = match.group(1)
        # numbers like "12" become ints so ranges can be expanded;
        # suffixed numbers like "12a" stay strings
        if secnum_candidate.isdigit():
            secnum_candidate = int(secnum_candidate)
        section_nums.append(secnum_candidate)

    # Merge spans longer than 3 sections
    section_span_end = None
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        if last - first + 1 > 3:
            # too long to expand: keep one node, remember the span end
            section_span_end = str(last)
            section_nums = [first]
        else:
            section_nums = []
            for i in range(first, last + 1):
                section_nums.append(i)

    section_nodes = []
    for section_number in section_nums:
        section_number = str(section_number)
        section_text = (section_xml.text or '').strip()
        tagged_section_text = section_xml.text

        if section_span_end:
            section_title = u"§§ {}.{}-{}".format(
                reg_part, section_number, section_span_end)
        else:
            section_title = u"§ {}.{}".format(reg_part, section_number)
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text
        section_nodes.append(
            RegtextParagraphProcessor().process(section_xml, sect_node)
        )
    return section_nodes
def dict_to_node(node_dict):
    """ Convert a dictionary representation of a node into a Node
    object if it contains the minimum required fields. Otherwise,
    pass it through unchanged. """
    # set literal instead of set(tuple(...)) -- same contents, clearer
    minimum_fields = {'text', 'label', 'node_type'}
    if minimum_fields.issubset(node_dict.keys()):
        node = Node(node_dict['text'], [], node_dict['label'],
                    node_dict.get('title', None),
                    node_dict['node_type'])
        # optional attributes are copied over only when present
        if 'tagged_text' in node_dict:
            node.tagged_text = node_dict['tagged_text']
        if 'child_labels' in node_dict:
            node.child_labels = node_dict['child_labels']
        return node
    else:
        return node_dict
def dict_to_node(node_dict):
    """ Convert a dictionary representation of a node into a Node
    object if it contains the minimum required fields. Otherwise,
    pass it through unchanged. """
    required = set(('text', 'label', 'node_type'))
    if not required.issubset(node_dict.keys()):
        return node_dict

    node = Node(node_dict['text'], [], node_dict['label'],
                node_dict.get('title', None), node_dict['node_type'])
    if 'tagged_text' in node_dict:
        node.tagged_text = node_dict['tagged_text']
    if 'child_labels' in node_dict:
        node.child_labels = node_dict['child_labels']
    return node
def paragraph_with_marker(self, text, tagged_text):
    """The paragraph has a marker, like (a) or a. etc."""
    # Mask any keyterm with a same-length run of periods so its
    # punctuation doesn't confuse the paragraph splitter
    probe = Node(text, node_type=Node.APPENDIX)
    probe.tagged_text = tagged_text
    probe.label = [initial_marker(text)[0]]
    keyterm = KeyTerms.get_keyterm(probe)

    if keyterm:
        masked = text.replace(keyterm, '.' * len(keyterm))
    else:
        masked = text

    for piece in split_paragraph_text(masked):
        if keyterm:     # restore the original keyterm text
            piece = piece.replace('.' * len(keyterm), keyterm)
        self.nodes.append(Node(piece, node_type=Node.APPENDIX,
                               label=[initial_marker(piece)[0]]))
def build_from_section(reg_part, section_xml):
    """Build one section Node per section number found in the SECTNO
    tag, processing the section's paragraphs for each."""
    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = (subject_xml[0].text or '').strip()
    section_nums = []
    for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no):
        secnum_candidate = match.group(1)
        # purely numeric section numbers become ints so spans can be
        # expanded below; suffixed ones like "12a" stay strings
        if secnum_candidate.isdigit():
            secnum_candidate = int(secnum_candidate)
        section_nums.append(secnum_candidate)

    # Span of section numbers (e.g. §§ 100.1-100.3) -- expand into the
    # full list of numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    section_nodes = []
    for section_number in section_nums:
        section_number = str(section_number)
        section_text = (section_xml.text or '').strip()
        tagged_section_text = section_xml.text
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text
        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text
        section_nodes.append(
            RegtextParagraphProcessor().process(section_xml, sect_node)
        )
    return section_nodes
def build_from_section(reg_part, section_xml):
    """Construct one section Node per section number found in the
    SECTNO tag, running the paragraph processor for each."""
    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = (subject_xml[0].text or '').strip()

    section_nums = []
    for match in re.finditer(r'%s\.(\d+[a-z]*)' % reg_part, section_no):
        candidate = match.group(1)
        if candidate.isdigit():
            candidate = int(candidate)
        section_nums.append(candidate)

    # Expand a span of section numbers (e.g. §§ 100.1-100.3)
    if section_no[:2] == u'§§' and '-' in section_no:
        start, end = section_nums
        section_nums = list(range(start, end + 1))

    section_nodes = []
    for num in section_nums:
        num = str(num)
        section_text = (section_xml.text or '').strip()
        tagged_section_text = section_xml.text
        section_title = u"§ " + reg_part + "." + num
        if subject_text:
            section_title += " " + subject_text
        sect_node = Node(section_text, label=[reg_part, num],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text
        section_nodes.append(
            RegtextParagraphProcessor().process(section_xml, sect_node))
    return section_nodes
def child_with_marker(child_node, stack):
    """Machinery to build a node for an interp's inner child.  Assumes
    the paragraph begins with a paragraph marker."""
    node_text = tree_utils.get_node_text(child_node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(child_node)
    first_marker = get_first_interp_marker(text_with_tags)

    # NOTE(review): called with one argument here; other revisions of
    # collapsed_markers_matches take (node_text, tagged_text) -- verify
    # this matches the signature in scope
    collapsed = collapsed_markers_matches(node_text)

    # -2 throughout to account for matching the character + period
    ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
    starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

    # Node for this paragraph (text up to the first collapsed marker)
    n = Node(node_text[0:starts[0]], label=[first_marker],
             node_type=Node.INTERP)
    n.tagged_text = text_with_tags
    last = stack.peek()

    if len(last) == 0:
        # empty stack level: seed it with this node
        stack.push_last((interpretation_level(first_marker), n))
    else:
        node_level = interpretation_level(first_marker, last[0][0])
        if node_level is None:
            logging.warning("Couldn't determine node_level for this "
                            + "interpretation paragraph: " + n.text)
            # fall back to one level deeper than the stack's current top
            node_level = last[0][0] + 1
        stack.add(node_level, n)

    # Collapsed-marker children
    for match, end in zip(collapsed, ends):
        n = Node(node_text[match.end() - 2:end], label=[match.group(1)],
                 node_type=Node.INTERP)
        node_level = interpretation_level(match.group(1))
        last = stack.peek()
        if len(last) == 0:
            stack.push_last((node_level, n))
        else:
            stack.add(node_level, n)
def test_dict_to_node(self):
    """dict_to_node converts well-formed dicts into Nodes and passes
    incomplete dicts through unchanged."""
    source = {"text": "node text", "label": ["205", "A"],
              "node_type": "appendix"}
    result = compiler.dict_to_node(source)
    self.assertEqual(
        result, Node("node text", [], ["205", "A"], None, "appendix"))

    source["tagged_text"] = "<E> Tagged </E> text."
    result = compiler.dict_to_node(source)
    expected = Node("node text", [], ["205", "A"], None, "appendix")
    expected.tagged_text = "<E> Tagged </E> text."
    created = compiler.dict_to_node(source)
    self.assertEqual(expected, created)
    self.assertEqual(expected.tagged_text, created.tagged_text)

    incomplete = {"text": "node text"}
    self.assertEqual(compiler.dict_to_node(incomplete), incomplete)
def build_from_section(reg_part, section_xml):
    """Build section Node(s) from a SECTION XML element, deriving the
    paragraph hierarchy either via constraint solving (derive_depths)
    or, for known-problematic sections, a manually supplied hierarchy
    from PARAGRAPH_HIERARCHY."""
    section_texts = []
    nodes = []
    section_no = section_xml.xpath('SECTNO')[0].text
    section_no_without_marker = re.search('[0-9]+\.[0-9]+',
                                          section_no).group(0)
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        subject_xml = section_xml.xpath('RESERVED')
    subject_text = subject_xml[0].text

    # Does this section have a hand-maintained depth list?
    manual_hierarchy_flag = False
    if reg_part in PARAGRAPH_HIERARCHY and \
            section_no_without_marker in PARAGRAPH_HIERARCHY[reg_part]:
        manual_hierarchy_flag = True

    # Collect paragraph markers and section text (intro text for the
    # section)
    i = 0   # counter for generated "defN" markers
    children = [ch for ch in section_xml.getchildren()
                if ch.tag in ['P', 'STARS']]
    for ch in children:
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # is this a bunch of definitions that don't have numbers
            # next to them?
            if len(nodes) > 0:
                # NOTE(review): the second operand lacks "> -1";
                # str.find returns -1 (truthy) when absent, so this
                # condition is effectively always true once
                # 'Definitions.' check fails -- likely a bug, confirm
                # intended behavior before changing
                if (subject_text.find('Definitions.') > -1 or
                        nodes[-1].text.find(
                            'For the purposes of this section')):
                    # TODO: create a grammar for definitions
                    if text.find('means') > -1:
                        # synthesize a CamelCase marker from the words
                        # preceding "means"
                        def_marker = text.split('means')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    elif text.find('shall have the same meaning') > -1:
                        def_marker = text.split('shall')[0].strip().split()
                        def_marker = ''.join([word[0].upper() + word[1:]
                                              for word in def_marker])
                    else:
                        def_marker = 'def{0}'.format(i)
                        i += 1
                    n = Node(text, label=[def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    #nodes[-1].children.append(n)
                    nodes.append(n)
                else:
                    section_texts.append((text, tagged_text))
            else:
                if len(children) > 1:
                    def_marker = 'def{0}'.format(i)
                    n = Node(text, [], [def_marker], source_xml=ch)
                    n.tagged_text = tagged_text
                    i += 1
                    nodes.append(n)
                else:
                    # this is the only node around
                    section_texts.append((text, tagged_text))
        else:
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
                if node_text[0].endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore
    # them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    m_stack = tree_utils.NodeStack()

    # Use constraint programming to figure out possible depth
    # assignments
    if not manual_hierarchy_flag:
        depths = derive_depths(
            [n.label[0] for n in nodes],
            [rules.depth_type_order([mtypes.lower, mtypes.ints,
                                     mtypes.roman, mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])
    if not manual_hierarchy_flag and depths:
        # Find the assignment which violates the least of our
        # heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                # strip emphasis tags that leaked into marker labels
                node.label = [l.replace('<E T="03">', '')
                               .replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)
    elif nodes and manual_hierarchy_flag:
        logging.warning('Using manual depth hierarchy.')
        depths = PARAGRAPH_HIERARCHY[reg_part][section_no_without_marker]
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = m_stack.peek()
                node.label = [l.replace('<E T="03">', '')
                               .replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + depth, node))
                else:
                    m_stack.add(1 + depth, node)
        else:
            logging.error(
                'Manual hierarchy length does not match node list length!'
                ' ({0} nodes but {1} provided)'.format(
                    len(nodes), len(depths)))
    elif nodes and not manual_hierarchy_flag:
        # no consistent depth assignment found; flatten everything to
        # an arbitrary depth of 3
        logging.warning(
            'Could not determine depth when parsing {0}:\n{1}'.format(
                section_no_without_marker, [n.label[0] for n in nodes]))
        for node in nodes:
            last = m_stack.peek()
            node.label = [l.replace('<E T="03">', '')
                           .replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                m_stack.push_last((3, node))
            else:
                m_stack.add(3, node)

    nodes = []
    section_nums = []
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers -- expand into the full list
    if u'§§' == section_no[:2] and '-' in section_no:
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] +
                                       tagged_sect_texts)

        sect_node = Node(section_text,
                         label=[reg_part, section_number],
                         title=section_title)
        sect_node.tagged_text = tagged_section_text

        # unwind the stack to attach all paragraph nodes beneath the
        # section node
        m_stack.add_to_bottom((1, sect_node))
        while m_stack.size() > 1:
            m_stack.unwind()
        nodes.append(m_stack.pop()[0][1])

    return nodes
def build_from_section(reg_part, section_xml):
    """Build regulation Node(s) for a SECTION element of regulation XML.

    Collects paragraph markers and intro text from the section's P/STARS
    children, derives a paragraph-depth assignment via constraint
    programming, then assembles one section Node (with the marked
    paragraphs as descendants) per section number found in SECTNO.

    :param reg_part: CFR part number as a string (e.g. '1026')
    :param section_xml: lxml element for the SECTION
    :return: list of section Nodes (usually one; several for a
        u'§§ X.a-X.b' span)
    """
    section_texts = []
    nodes = []
    # Collect paragraph markers and section text (intro text for the
    # section)
    for ch in filter(lambda ch: ch.tag in ('P', 'STARS'),
                     section_xml.getchildren()):
        text = tree_utils.get_node_text(ch, add_spaces=True)
        tagged_text = tree_utils.get_node_text_tags_preserved(ch)
        markers_list = get_markers(tagged_text.strip())

        if ch.tag == 'STARS':
            # Ellipsis placeholder ("* * *") element, not real content
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not markers_list:
            # Unmarked paragraph: becomes part of the section's intro text
            section_texts.append((text, tagged_text))
        else:
            # One Node per marker found within this P element
            for m, node_text in get_markers_and_text(ch, markers_list):
                n = Node(node_text[0], [], [m], source_xml=ch)
                n.tagged_text = unicode(node_text[1])
                nodes.append(n)
            # NOTE(review): relies on the loop variable leaking out of the
            # for above — only the *last* marker's text is checked for a
            # trailing ellipsis; confirm this is intended
            if node_text[0].endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([mtypes.lower, mtypes.ints, mtypes.roman,
                                 mtypes.upper, mtypes.em_ints,
                                 mtypes.em_roman])])
    m_stack = tree_utils.NodeStack()
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = m_stack.peek()
                # Strip emphasis tags so labels are plain markers
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    m_stack.push_last((1 + par.depth, node))
                else:
                    m_stack.add(1 + par.depth, node)

    section_no = section_xml.xpath('SECTNO')[0].text
    subject_xml = section_xml.xpath('SUBJECT')
    if not subject_xml:
        # Reserved sections carry their title in RESERVED instead
        subject_xml = section_xml.xpath('RESERVED')
    # NOTE(review): raises IndexError if neither SUBJECT nor RESERVED is
    # present — presumably guaranteed by the schema; confirm
    subject_text = subject_xml[0].text

    nodes = []
    section_nums = []
    # e.g. u'§ 1026.3' -> [3]; u'§§ 1026.3-5' -> [3, 5]
    for match in re.finditer(r'%s\.(\d+)' % reg_part, section_no):
        section_nums.append(int(match.group(1)))

    # Span of section numbers
    if u'§§' == section_no[:2] and '-' in section_no:
        # Expand the inclusive range, e.g. [3, 5] -> [3, 4, 5]
        first, last = section_nums
        section_nums = []
        for i in range(first, last + 1):
            section_nums.append(i)

    for section_number in section_nums:
        section_number = str(section_number)
        plain_sect_texts = [s[0] for s in section_texts]
        tagged_sect_texts = [s[1] for s in section_texts]

        # Intro text = element text plus any unmarked paragraphs
        section_text = ' '.join([section_xml.text] + plain_sect_texts)
        tagged_section_text = ' '.join([section_xml.text] + tagged_sect_texts)
        section_title = u"§ " + reg_part + "." + section_number
        if subject_text:
            section_title += " " + subject_text

        sect_node = Node(
            section_text, label=[reg_part, section_number],
            title=section_title)
        sect_node.tagged_text = tagged_section_text

        # Unwind the stack into a tree rooted at this section node
        m_stack.add_to_bottom((1, sect_node))

        while m_stack.size() > 1:
            m_stack.unwind()

        nodes.append(m_stack.pop()[0][1])

    return nodes
def process_inner_children(inner_stack, xml_node, parent=None):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: NodeStack the depth-assigned nodes are pushed onto
    :param xml_node: the title element; siblings up to the next title are
        the paragraphs to process
    :param parent: fallback Node to append orphan text to when no nodes
        have been collected yet
    """
    # manual hierarchy should work here too
    manual_hierarchy = []
    try:
        # e.g. '1026.7' parsed out of the title text; the '-Interp' key
        # selects the interpretation-specific manual hierarchy
        part_and_section = re.search('[0-9]+\.[0-9]+',
                                     xml_node.text).group(0)
        part, section = part_and_section.split('.')
        part_and_section += '-Interp'
        if (part in PARAGRAPH_HIERARCHY
                and part_and_section in PARAGRAPH_HIERARCHY[part]):
            manual_hierarchy = PARAGRAPH_HIERARCHY[part][part_and_section]
    except Exception:
        # Best-effort lookup: missing/unparseable title just means no
        # configured manual hierarchy
        pass

    children = itertools.takewhile(lambda x: not is_title(x),
                                   xml_node.itersiblings())
    nodes = []
    for i, xml_node in enumerate(
            filter(lambda c: c.tag in ('P', 'STARS'), children)):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)

        # If the node has a 'DEPTH' attribute, we're in manual
        # hierarchy mode, just constructed from the XML instead of
        # specified in configuration.
        # This presumes that every child in the section has DEPTH
        # specified, if not, things will break in and around
        # derive_depths below.
        if xml_node.get("depth") is not None:
            manual_hierarchy.append(int(xml_node.get("depth")))

        if xml_node.tag == 'STARS':
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes and manual_hierarchy:
            # No marker but depths are supplied: keep the paragraph as its
            # own node, labeled by position
            logging.warning("Couldn't determine interp marker. "
                            "Manual hierarchy is specified")
            n = Node(node_text, label=[str(i)], node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
        elif not first_marker and not manual_hierarchy:
            # No marker and no depths: fold the text into the previous
            # node (or the parent if this is the first paragraph)
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            if nodes:
                previous = nodes[-1]
            else:
                previous = parent
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    # Keyterm-style '1' markers carry emphasis tags
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    # use manual hierarchy if it's specified
    if not manual_hierarchy:
        depths = derive_depths(
            [node.label[0] for node in nodes],
            [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                     (mtypes.lower, mtypes.roman,
                                      mtypes.upper), mtypes.upper,
                                     mtypes.em_ints, mtypes.em_roman])])

    if not manual_hierarchy and depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip emphasis tags so labels are plain markers
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
    elif nodes and manual_hierarchy:
        logging.warning('Using manual depth hierarchy.')
        depths = manual_hierarchy
        if len(nodes) == len(depths):
            for node, depth in zip(nodes, depths):
                last = inner_stack.peek()
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + depth, node))
                else:
                    inner_stack.add(3 + depth, node)
        else:
            # Mismatch: nodes are dropped rather than mis-assigned
            logging.error(
                'Manual hierarchy length does not match node list length!')
    elif nodes and not manual_hierarchy:
        logging.warning('Could not derive depth (interp):\n {}'.format(
            [node.label[0] for node in nodes]))
        # just add nodes in sequential order then
        for node in nodes:
            last = inner_stack.peek()
            node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                          for l in node.label]
            if len(last) == 0:
                inner_stack.push_last((3, node))
            else:
                inner_stack.add(3, node)
def derive_nodes(self, xml, processor=None):
    """Return a single markerless Node for a table element.

    The node's text is the plain-text rendering of the table; its
    tagged_text keeps the serialized XML so downstream layers can
    recover the original markup.
    """
    plain = table_xml_to_plaintext(xml)
    table_node = Node(plain, label=[mtypes.MARKERLESS], source_xml=xml)
    table_node.tagged_text = etree.tounicode(xml).strip()
    return [table_node]
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: NodeStack the depth-assigned nodes are pushed onto
    :param xml_node: the title element; siblings up to the next title are
        the paragraphs to process
    """
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            # Ellipsis placeholder ("* * *") element, not real content
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # Unmarked paragraph: fold its text into the previous node
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    # Keyterm-style '1' markers carry emphasis tags
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [n.label[0] for n in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper), mtypes.upper,
                                 mtypes.em_ints, mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip emphasis tags so labels are plain markers
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)
def test_node_definitions(self):
    """Exercise Terms.node_definitions across fixture categories:
    smart-quoted terms, paragraphs with no definitions, XML-tagged
    definitions, XML-tagged non-definitions, and scoped 'for purposes
    of' definitions — first outside, then inside a Definitions parent
    (smart quotes only count under a Definitions heading)."""
    t = Terms(None)
    # (text, expected Refs) pairs where the term is inside smart quotes
    smart_quotes = [
        (u'This has a “worD” and then more',
         [Ref('word', 'aaa', (12, 16))]),
        (u'I have “anotheR word” term and “moree”',
         [Ref('another word', 'bbb', (8, 20)),
          Ref('moree', 'bbb', (32, 37))]),
        (u'But the child “DoeS sEe”?',
         [Ref('does see', 'ccc', (15, 23))]),
        (u'Start with “this,”', [Ref('this', 'hhh', (12, 16))]),
        (u'Start with “this;”', [Ref('this', 'iii', (12, 16))]),
        (u'Start with “this.”', [Ref('this', 'jjj', (12, 16))]),
        (u'As do “subchildren”',
         [Ref('subchildren', 'ddd', (7, 18))])]

    # Plain paragraphs that should never produce definitions
    no_defs = [
        u'This has no defs',
        u'Also has no terms',
        u'Still no terms, but',
        u'the next one does']

    # (plain text, tagged text, expected Ref) — definitions marked with
    # <E T="03"> emphasis in the tagged text
    xml_defs = [
        (u'(4) Thing means a thing that is defined',
         u'(4) <E T="03">Thing</E> means a thing that is defined',
         Ref('thing', 'eee', (4, 9))),
        (u'(e) Well-meaning lawyers means people who do weird things',
         u'(e) <E T="03">Well-meaning lawyers</E> means people who do ' +
         'weird things',
         Ref('well-meaning lawyers', 'fff', (4, 24))),
        (u'(e) Words have the same meaning as in a dictionary',
         u'(e) <E T="03">Words</E> have the same meaning as in a ' +
         'dictionary',
         Ref('words', 'ffg', (4, 9))),
        (u'(e) Banana has the same meaning as bonono',
         u'(e) <E T="03">Banana</E> has the same meaning as bonono',
         Ref('banana', 'fgf', (4, 10))),
        (u'(f) Huge billowy clouds means I want to take a nap',
         u'(f) <E T="03">Huge billowy clouds</E> means I want to take a ' +
         'nap',
         Ref('huge billowy clouds', 'ggg', (4, 23)))]

    # Tagged text that should NOT be treated as a definition
    xml_no_defs = [
        (u'(d) Term1 or term2 means stuff',
         u'(d) <E T="03">Term1</E> or <E T="03">term2></E> means stuff'),
        (u'This term means should not match',
         u'<E T="03">This term</E> means should not match')]

    # 'For purposes of ...' scope-limited definitions
    scope_term_defs = [
        ('For purposes of this section, the term blue means the color',
         Ref('blue', '11-11', (39, 43))),
        ('For purposes of paragraph (a)(1) of this section, the term ' +
         'cool bro means hip cat',
         Ref('cool bro', '11-22', (59, 67))),
        ('For purposes of this paragraph, po jo means "poor Joe"',
         Ref('po jo', '11-33', (32, 37)))]

    stack = ParentStack()
    stack.add(0, Node(label=['999']))
    for txt in no_defs:
        defs, exc = t.node_definitions(Node(txt), stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    # Outside a Definitions parent, smart quotes do not count
    for txt, refs in smart_quotes:
        defs, exc = t.node_definitions(Node(txt), stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, xml in xml_no_defs:
        node = Node(txt)
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, xml, ref in xml_defs:
        node = Node(txt, label=[ref.label])
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([ref], defs)
        self.assertEqual([], exc)
    for txt, ref in scope_term_defs:
        defs, exc = t.node_definitions(
            Node(txt, label=ref.label.split('-')), stack)
        self.assertEqual([ref], defs)
        self.assertEqual([], exc)

    # smart quotes are affected by the parent
    stack.add(1, Node('Definitions', label=['999', '1']))
    for txt in no_defs:
        defs, exc = t.node_definitions(Node(txt), stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    # Under a Definitions heading, smart-quoted terms ARE definitions
    for txt, refs in smart_quotes:
        defs, exc = t.node_definitions(Node(txt, label=[refs[0].label]),
                                       stack)
        self.assertEqual(refs, defs)
        self.assertEqual([], exc)
    for txt, xml in xml_no_defs:
        node = Node(txt)
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([], defs)
        self.assertEqual([], exc)
    for txt, xml, ref in xml_defs:
        node = Node(txt, label=[ref.label])
        node.tagged_text = xml
        defs, exc = t.node_definitions(node, stack)
        self.assertEqual([ref], defs)
        self.assertEqual([], exc)
def process_inner_children(inner_stack, xml_node):
    """Process the following nodes as children of this interpretation. This
    is very similar to reg_text.py:build_from_section()

    :param inner_stack: NodeStack the depth-assigned nodes are pushed onto
    :param xml_node: the title element; siblings up to the next title are
        the paragraphs to process
    """
    children = itertools.takewhile(
        lambda x: not is_title(x), xml_node.itersiblings())
    nodes = []
    for xml_node in filter(lambda c: c.tag in ('P', 'STARS'), children):
        node_text = tree_utils.get_node_text(xml_node, add_spaces=True)
        text_with_tags = tree_utils.get_node_text_tags_preserved(xml_node)
        first_marker = get_first_interp_marker(text_with_tags)
        if xml_node.tag == 'STARS':
            # Ellipsis placeholder ("* * *") element, not real content
            nodes.append(Node(label=[mtypes.STARS_TAG]))
        elif not first_marker and nodes:
            # Unmarked paragraph: fold its text into the previous node
            logging.warning("Couldn't determine interp marker. Appending to "
                            "previous paragraph: %s", node_text)
            previous = nodes[-1]
            previous.text += "\n\n" + node_text
            if hasattr(previous, 'tagged_text'):
                previous.tagged_text += "\n\n" + text_with_tags
            else:
                previous.tagged_text = text_with_tags
        else:
            collapsed = collapsed_markers_matches(node_text, text_with_tags)

            # -2 throughout to account for matching the character + period
            ends = [m.end() - 2 for m in collapsed[1:]] + [len(node_text)]
            starts = [m.end() - 2 for m in collapsed] + [len(node_text)]

            # Node for this paragraph
            n = Node(node_text[0:starts[0]], label=[first_marker],
                     node_type=Node.INTERP)
            n.tagged_text = text_with_tags
            nodes.append(n)
            if n.text.endswith('* * *'):
                nodes.append(Node(label=[mtypes.INLINE_STARS]))

            # Collapsed-marker children
            for match, end in zip(collapsed, ends):
                marker = match.group(1)
                if marker == '1':
                    # Keyterm-style '1' markers carry emphasis tags
                    marker = '<E T="03">1</E>'
                n = Node(node_text[match.end() - 2:end], label=[marker],
                         node_type=Node.INTERP)
                nodes.append(n)
                if n.text.endswith('* * *'):
                    nodes.append(Node(label=[mtypes.INLINE_STARS]))

    # Trailing stars don't matter; slightly more efficient to ignore them
    while nodes and nodes[-1].label[0] in mtypes.stars:
        nodes = nodes[:-1]

    # Use constraint programming to figure out possible depth assignments
    depths = derive_depths(
        [node.label[0] for node in nodes],
        [rules.depth_type_order([(mtypes.ints, mtypes.em_ints),
                                 (mtypes.roman, mtypes.upper), mtypes.upper,
                                 mtypes.em_ints, mtypes.em_roman])])
    if depths:
        # Find the assignment which violates the least of our heuristics
        depths = heuristics.prefer_multiple_children(depths, 0.5)
        depths = sorted(depths, key=lambda d: d.weight, reverse=True)
        depths = depths[0]
        for node, par in zip(nodes, depths):
            if par.typ != mtypes.stars:
                last = inner_stack.peek()
                # Strip emphasis tags so labels are plain markers
                node.label = [l.replace('<E T="03">', '').replace('</E>', '')
                              for l in node.label]
                if len(last) == 0:
                    inner_stack.push_last((3 + par.depth, node))
                else:
                    inner_stack.add(3 + par.depth, node)