def derive_nodes(self, xml, processor=None):
    """Build one Node per paragraph marker found in the text of ``xml``.

    The plain text and the tag-preserving text of ``xml`` are both split on
    the parenthesised markers; each resulting (marker, plain, tagged) triple
    becomes a Node labelled with that marker. ``processor`` is accepted but
    not used here.

    Returns the list of created nodes, in marker order.
    """
    plain = tree_utils.get_node_text(xml).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = self.paragraph_markers(plain)
    with_parens = ['({})'.format(marker) for marker in markers_list]

    plain_pieces = tree_utils.split_text(plain, with_parens)
    tagged_pieces = tree_utils.split_text(tagged, with_parens)

    nodes = []
    for marker, plain_piece, tagged_piece in zip(
            markers_list, plain_pieces, tagged_pieces):
        created = Node(text=plain_piece.strip(), label=[marker],
                       source_xml=xml)
        # The tagged variant is stored as a unicode string on the node.
        created.tagged_text = unicode(tagged_piece.strip())
        nodes.append(created)
    return nodes
def get_markers_and_text(node, markers_list):
    """Pair each paragraph marker with the text that follows it in ``node``.

    :param node: XML element whose text is being split.
    :param markers_list: markers (without parentheses) found in the node;
        a marker may carry ``<E T="03">``/``</E>`` emphasis tags.
    :returns: the result of ``zip(markers_list, pairs)`` where each pair is
        ``(plain_text, text_with_tags)`` for that marker. An empty
        ``markers_list`` yields an empty result.
    """
    if not markers_list:
        # Previously this case fell through both branches below and hit the
        # final zip() with `node_text_list` unbound (UnboundLocalError).
        return zip(markers_list, ())

    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        # The plain text has no emphasis tags, so strip them from the
        # markers before splitting it.
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    else:
        # A single marker owns the whole paragraph; no splitting needed.
        node_text_list = [(node_text, text_with_tags)]

    return zip(markers_list, node_text_list)
def derive_nodes(self, xml, processor=None):
    """Create a Node for every paragraph marker detected in ``xml``.

    Both the plain text and the tag-preserving text of the element are split
    on the parenthesised markers, and the pieces are matched up with their
    markers. ``processor`` is accepted but not used here.

    Returns a list of nodes, one per (marker, plain, tagged) triple.
    """
    stripped_text = tree_utils.get_node_text(xml).strip()
    stripped_tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = self.paragraph_markers(stripped_text)
    with_parens = ['({})'.format(marker) for marker in markers_list]

    triplets = zip(
        markers_list,
        tree_utils.split_text(stripped_text, with_parens),
        tree_utils.split_text(stripped_tagged, with_parens),
    )

    nodes = []
    for marker, plain_part, tagged_part in triplets:
        new_node = Node(text=plain_part.strip(), label=[marker],
                        source_xml=xml)
        # six.text_type keeps the tagged text a text (not bytes) string on
        # both Python 2 and 3.
        new_node.tagged_text = six.text_type(tagged_part.strip())
        nodes.append(new_node)
    return nodes
def get_markers_and_text(node, markers_list):
    """Pair each paragraph marker with the text that follows it in ``node``.

    :param node: XML element whose text is being split.
    :param markers_list: markers (without parentheses) found in the node;
        a marker may carry ``<E T="03">``/``</E>`` emphasis tags.
        NOTE: when the split produces more pieces than markers,
        ``mtypes.MARKERLESS`` is inserted at the front of this list in place,
        so the caller's list is modified.
    :returns: ``zip(markers_list, pairs)`` where each pair is
        ``(plain_text, text_with_tags)``.
    """
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    actual_markers = ['(%s)' % m for m in markers_list]
    # The plain text has no emphasis tags, so strip them from the markers
    # before splitting it.
    plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                     for m in actual_markers]
    node_texts = tree_utils.split_text(node_text, plain_markers)
    tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)

    # Materialise the pairs: on Python 3 zip() is a lazy iterator and has no
    # len(), which the check below needs.
    node_text_list = list(zip(node_texts, tagged_texts))

    if len(node_text_list) > len(markers_list):     # diff can only be 1
        # The extra leading piece is text that precedes the first marker.
        markers_list.insert(0, mtypes.MARKERLESS)
    return zip(markers_list, node_text_list)
def test_consecutive_markers(self):
    # Two markers with nothing between them must still be split apart:
    # the first piece is just the bare "(A)" marker.
    pieces = tree_utils.split_text("(A)(2) Bananas", ['(A)', '(2)'])
    self.assertEqual(['(A)', '(2) Bananas'], pieces)
def test_split_text(self):
    # "(Z)" is not among the tokens, so it must stay inside the "(B)" piece.
    source = "(A) Apples (B) Bananas (Z) Zebras"
    pieces = tree_utils.split_text(source, ['(A)', '(B)'])
    self.assertEqual(['(A) Apples ', '(B) Bananas (Z) Zebras'], pieces)
def split_by_markers(xml):
    """Split an xml node into per-subparagraph triplets.

    Each triplet is (marker, plain text following the marker, tag-preserving
    text following the marker). When the text splits into one more piece than
    there are markers, the leading piece is labelled ``mtypes.MARKERLESS``.
    """
    plain_text = tree_utils.get_node_text(xml, add_spaces=True).strip()
    tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(tagged_text, next_marker(xml))

    # Markers are de-emphasised for the plain text and used as-is for the
    # tagged text.
    plain_tokens = []
    tagged_tokens = []
    for marker in markers_list:
        plain_tokens.append('({})'.format(mtypes.deemphasize(marker)))
    node_texts = tree_utils.split_text(plain_text, plain_tokens)
    for marker in markers_list:
        tagged_tokens.append('({})'.format(marker))
    tagged_texts = tree_utils.split_text(tagged_text, tagged_tokens)

    has_leading_text = len(node_texts) > len(markers_list)
    if has_leading_text:     # due to initial MARKERLESS
        markers_list.insert(0, mtypes.MARKERLESS)

    triplets = zip(markers_list, node_texts, tagged_texts)
    return list(triplets)
def test_split_text(self):
    # Only the listed tokens act as split points; "(Z)" is left in place.
    expected = ["(A) Apples ", "(B) Bananas (Z) Zebras"]
    actual = tree_utils.split_text(
        "(A) Apples (B) Bananas (Z) Zebras", ["(A)", "(B)"])
    self.assertEqual(expected, actual)
def test_split_text_with_prefix(self):
    """Don't wipe out the intro text, if present"""
    # Text ahead of the first token comes back as its own leading piece.
    expected = ["Some content here ", "(A) Apples ", "(B) Bananas (Z) Zebras"]
    actual = tree_utils.split_text(
        "Some content here (A) Apples (B) Bananas (Z) Zebras",
        ["(A)", "(B)"])
    self.assertEqual(expected, actual)
def _text_with_tails(element):
    """Join an element's own text with the tail text of each child.

    A ``None`` ``element.text`` (presumably an element that opens with a
    child tag -- confirm against the XML library in use) is skipped so the
    join never receives a non-string.
    """
    parts = [] if element.text is None else [element.text]
    parts.extend(c.tail for c in element if c.tail)
    return ' '.join(parts)


def process_appendix(m_stack, current_section, child):
    """Walk the children of ``child`` and add appendix nodes to ``m_stack``.

    * ``HD`` children become level-2 APPENDIX nodes whose label comes from
      ``get_appendix_section_number`` or, failing that,
      ``determine_next_section``.
    * ``P`` children containing paragraph markers become one APPENDIX node
      per marker, placed at the level ``determine_level`` reports.
    * ``P`` children without markers are appended to the text of the last
      node on the stack.

    :param m_stack: node stack being built up; modified in place.
    :param current_section: passed through to
        ``get_appendix_section_number``.
    :param child: XML element whose children are processed.
    """
    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)
            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)

            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)
            tree_utils.add_to_stack(m_stack, 2, n)

        if ch.tag == 'P':
            markers_list = tree_utils.get_paragraph_markers(
                _text_with_tails(ch))
            node_text = tree_utils.get_node_text(ch)

            if not markers_list:
                # Marker-less paragraph: continuation of the previous node.
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
                continue

            if len(markers_list) > 1:
                actual_markers = ['(%s)' % m for m in markers_list]
                pieces = tree_utils.split_text(node_text, actual_markers)
            else:
                pieces = [node_text]

            for m, piece in zip(markers_list, pieces):
                n = Node(piece, label=[str(m)], node_type=Node.APPENDIX)

                last = m_stack.peek()
                node_level = determine_level(m, last[0][0])

                if m == 'i':
                    # This is bit of a hack, since we can't easily
                    # distinguish between the Roman numeral (i) and the
                    # letter (i) to determine the level. We look ahead to
                    # help. This is not a complete solution and we should
                    # circle back at some point.
                    following = ch.getnext()
                    # No following sibling, or one without markers, simply
                    # means the look-ahead gives no extra information.
                    if following is not None:
                        next_markers = tree_utils.get_paragraph_markers(
                            _text_with_tails(following))
                        if next_markers and next_markers[0] == 'ii':
                            node_level = 5
                tree_utils.add_to_stack(m_stack, node_level, n)
def test_split_text_with_prefix(self):
    """Don't wipe out the intro text, if present"""
    # The intro text ahead of "(A)" must survive as the first piece.
    source = "Some content here (A) Apples (B) Bananas (Z) Zebras"
    pieces = tree_utils.split_text(source, ['(A)', '(B)'])
    self.assertEqual(
        ['Some content here ', '(A) Apples ', '(B) Bananas (Z) Zebras'],
        pieces)
def build_section(reg_part, section_xml):
    """Build the node tree for one section element.

    Each ``P`` child is split on its paragraph markers and every marked piece
    becomes a Node placed on a stack at the level ``determine_level``
    reports. Paragraphs without markers are collected and become part of the
    section node's own text.

    :param reg_part: regulation part; used in the section label and to
        recognise the section number in the title.
    :param section_xml: the section XML element (its ``SECTNO`` and
        ``SUBJECT`` children supply the title).
    :returns: the section Node, or ``None`` when the title does not contain
        ``<reg_part>.<number>``.
    """
    p_level = 1
    m_stack = NodeStack()
    section_texts = []

    for ch in section_xml.getchildren():
        if ch.tag != 'P':
            continue

        # ch.text may be None (presumably when the paragraph opens with a
        # child tag); leave it out rather than let join() fail on it.
        lead = [] if ch.text is None else [ch.text]
        text = ' '.join(lead + [c.tail for c in ch if c.tail])
        markers_list = tree_utils.get_paragraph_markers(text)
        node_text = tree_utils.get_node_text(ch)

        if len(markers_list) > 1:
            actual_markers = ['(%s)' % m for m in markers_list]
            pieces = tree_utils.split_text(node_text, actual_markers)
        elif markers_list:
            pieces = [node_text]
        else:   # Does not contain paragraph markers
            section_texts.append(node_text)
            pieces = []

        for m, piece in zip(markers_list, pieces):
            n = Node(piece, [], [str(m)])

            new_p_level = determine_level(m, p_level)
            last = m_stack.peek()
            if len(last) == 0:
                m_stack.push_last((new_p_level, n))
            else:
                tree_utils.add_to_stack(m_stack, new_p_level, n)
            p_level = new_p_level

    # An empty SECTNO would otherwise break the concatenation and the regex
    # search below; treat it as an empty title, which never matches.
    section_title = section_xml.xpath('SECTNO')[0].text or ''
    subject_text = section_xml.xpath('SUBJECT')[0].text
    if subject_text:
        section_title += " " + subject_text

    section_number_match = re.search(r'%s\.(\d+)' % reg_part, section_title)
    #   Sometimes not reg text sections get mixed in
    if not section_number_match:
        return None

    section_number = section_number_match.group(1)

    intro = [] if section_xml.text is None else [section_xml.text]
    section_text = ' '.join(intro + section_texts)
    sect_node = Node(
        section_text, label=[reg_part, section_number],
        title=section_title)

    m_stack.add_to_bottom((1, sect_node))

    # Collapse the stack until only the section node (with all of its
    # descendants attached) remains.
    while m_stack.size() > 1:
        tree_utils.unwind_stack(m_stack)

    return m_stack.pop()[0][1]