Example #1
0
 def derive_nodes(self, xml, processor=None):
     """Build one Node for each paragraph marker found in `xml`.

     The node's plain text and its tag-preserving text are both split on
     the parenthesized markers; the pieces are paired up with the markers
     (in order) and each triple becomes a Node labeled with its marker.
     `processor` is accepted but not used here.
     """
     plain = tree_utils.get_node_text(xml).strip()
     tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
     markers = self.paragraph_markers(plain)
     parenthesized = ['({})'.format(marker) for marker in markers]
     plain_pieces = tree_utils.split_text(plain, parenthesized)
     tagged_pieces = tree_utils.split_text(tagged, parenthesized)

     derived = []
     for marker, plain_piece, tagged_piece in zip(
             markers, plain_pieces, tagged_pieces):
         paragraph = Node(text=plain_piece.strip(), label=[marker],
                          source_xml=xml)
         paragraph.tagged_text = unicode(tagged_piece.strip())
         derived.append(paragraph)
     return derived
def get_markers_and_text(node, markers_list):
    """Pair each paragraph marker with the text belonging to it.

    :param node: XML node whose text is being split.
    :param markers_list: markers (without parentheses) found in the node;
        they may contain ``<E T="03">`` emphasis tags.
    :returns: the zip of ``markers_list`` with ``(plain_text, tagged_text)``
        pairs. Nothing is paired when ``markers_list`` is empty.
    """
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        # The plain text has no emphasis tags, so strip them from the
        # markers before using them as split points in it.
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        # A single marker owns the whole text; no splitting needed.
        node_text_list = [(node_text, text_with_tags)]
    else:
        # Without this branch the name would be unbound below and the
        # function would raise UnboundLocalError for marker-less input.
        node_text_list = []
    return zip(markers_list, node_text_list)
def get_markers_and_text(node, markers_list):
    """Pair each paragraph marker with the text belonging to it.

    :param node: XML node whose text is being split.
    :param markers_list: markers (without parentheses) found in the node;
        they may contain ``<E T="03">`` emphasis tags.
    :returns: the zip of ``markers_list`` with ``(plain_text, tagged_text)``
        pairs. Nothing is paired when ``markers_list`` is empty.
    """
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    # Default covers the marker-less case; previously the name was left
    # unbound there and the final zip raised UnboundLocalError.
    node_text_list = []
    if len(markers_list) > 1:
        actual_markers = ['(%s)' % m for m in markers_list]
        # Emphasis tags do not appear in the plain text, so remove them
        # from the markers used to split it.
        plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                         for m in actual_markers]
        node_texts = tree_utils.split_text(node_text, plain_markers)
        tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
        node_text_list = zip(node_texts, tagged_texts)
    elif markers_list:
        # A single marker owns the whole text; no splitting needed.
        node_text_list = [(node_text, text_with_tags)]
    return zip(markers_list, node_text_list)
 def derive_nodes(self, xml, processor=None):
     """Turn the paragraph in `xml` into a list of marker-labeled Nodes.

     Plain and tag-preserving renderings of the node are split on the
     parenthesized paragraph markers, then zipped with the markers so each
     resulting Node carries its marker as label, the stripped plain text,
     and the stripped tagged text. `processor` is not consulted.
     """
     full_plain = tree_utils.get_node_text(xml).strip()
     full_tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
     found_markers = self.paragraph_markers(full_plain)
     split_points = ['({})'.format(found) for found in found_markers]
     plain_parts = tree_utils.split_text(full_plain, split_points)
     tagged_parts = tree_utils.split_text(full_tagged, split_points)

     result = []
     for label, plain_part, tagged_part in zip(
             found_markers, plain_parts, tagged_parts):
         built = Node(text=plain_part.strip(), label=[label], source_xml=xml)
         built.tagged_text = six.text_type(tagged_part.strip())
         result.append(built)
     return result
def get_markers_and_text(node, markers_list):
    """Pair each paragraph marker with the text belonging to it.

    :param node: XML node whose text is being split.
    :param markers_list: markers (without parentheses) found in the node;
        they may contain ``<E T="03">`` emphasis tags. NOTE: this list is
        modified in place (``MARKERLESS`` is inserted at the front) when
        the split produces more pieces than there are markers.
    :returns: the zip of the (possibly extended) ``markers_list`` with
        ``(plain_text, tagged_text)`` pairs.
    """
    node_text = tree_utils.get_node_text(node, add_spaces=True)
    text_with_tags = tree_utils.get_node_text_tags_preserved(node)

    actual_markers = ['(%s)' % m for m in markers_list]
    # The plain text carries no emphasis tags, so strip them from the
    # markers before splitting it.
    plain_markers = [m.replace('<E T="03">', '').replace('</E>', '')
                     for m in actual_markers]
    node_texts = tree_utils.split_text(node_text, plain_markers)
    tagged_texts = tree_utils.split_text(text_with_tags, actual_markers)
    # Materialize the pairs: on Python 3 zip() is an iterator and has no
    # len(), which made the length comparison below raise TypeError.
    node_text_list = list(zip(node_texts, tagged_texts))

    if len(node_text_list) > len(markers_list):     # diff can only be 1
        # The extra leading piece has no marker of its own.
        markers_list.insert(0, mtypes.MARKERLESS)
    return zip(markers_list, node_text_list)
    def test_consecutive_markers(self):
        """Back-to-back markers are split even with no text between them."""
        pieces = tree_utils.split_text("(A)(2) Bananas", ['(A)', '(2)'])
        self.assertEqual(['(A)', '(2) Bananas'], pieces)
    def test_split_text(self):
        """Text is cut only at the given tokens; other markers stay put."""
        source = "(A) Apples (B) Bananas (Z) Zebras"
        split_points = ['(A)', '(B)']
        # (Z) is not a split point, so it stays inside the (B) piece.
        wanted = ['(A) Apples ', '(B) Bananas (Z) Zebras']

        self.assertEqual(wanted, tree_utils.split_text(source, split_points))
    def test_consecutive_markers(self):
        """Two markers in a row each start their own piece of text."""
        source = "(A)(2) Bananas"
        wanted = ['(A)', '(2) Bananas']

        actual = tree_utils.split_text(source, ['(A)', '(2)'])
        self.assertEqual(wanted, actual)
    def test_split_text(self):
        """Splitting on (A) and (B) leaves the unlisted (Z) in the last piece."""
        pieces = tree_utils.split_text(
            "(A) Apples (B) Bananas (Z) Zebras", ['(A)', '(B)'])
        self.assertEqual(['(A) Apples ', '(B) Bananas (Z) Zebras'], pieces)
def split_by_markers(xml):
    """Break an xml node into a list of
        (marker, plain text after it, tag-preserving text after it)
    triplets, one per subparagraph.

    When splitting yields one more piece of text than there are markers,
    the extra leading piece is labeled with mtypes.MARKERLESS."""
    stripped_plain = tree_utils.get_node_text(xml, add_spaces=True).strip()
    stripped_tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(stripped_tagged, next_marker(xml))

    # Plain text carries no emphasis, so its split points are deemphasized.
    plain_tokens = []
    for marker in markers_list:
        plain_tokens.append('({})'.format(mtypes.deemphasize(marker)))
    node_texts = tree_utils.split_text(stripped_plain, plain_tokens)

    tagged_tokens = ['({})'.format(marker) for marker in markers_list]
    tagged_texts = tree_utils.split_text(stripped_tagged, tagged_tokens)

    has_unmarked_intro = len(node_texts) > len(markers_list)
    if has_unmarked_intro:
        markers_list.insert(0, mtypes.MARKERLESS)
    return list(zip(markers_list, node_texts, tagged_texts))
Example #11
0
def split_by_markers(xml):
    """Return (marker, plain text, tagged text) triplets for every
    subparagraph found in the given xml node.

    Text that splits off ahead of the first marker is paired with
    mtypes.MARKERLESS."""
    plain = tree_utils.get_node_text(xml, add_spaces=True).strip()
    tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(tagged, next_marker(xml))

    def parenthesize(value):
        return '({})'.format(value)

    # The plain text is split on deemphasized markers, the tagged text on
    # the markers exactly as found.
    node_texts = tree_utils.split_text(
        plain, [parenthesize(mtypes.deemphasize(m)) for m in markers_list])
    tagged_texts = tree_utils.split_text(
        tagged, [parenthesize(m) for m in markers_list])

    if len(markers_list) < len(node_texts):     # initial MARKERLESS text
        markers_list.insert(0, mtypes.MARKERLESS)
    triplets = zip(markers_list, node_texts, tagged_texts)
    return list(triplets)
    def test_split_text(self):
        """Only the listed tokens act as split points."""
        pieces = tree_utils.split_text(
            "(A) Apples (B) Bananas (Z) Zebras", ["(A)", "(B)"]
        )
        # "(Z)" was not listed, so it remains part of the "(B)" piece.
        self.assertEqual(["(A) Apples ", "(B) Bananas (Z) Zebras"], pieces)
    def test_split_text_with_prefix(self):
        """Text ahead of the first token is kept as its own leading piece."""
        source = "Some content here (A) Apples (B) Bananas (Z) Zebras"
        wanted = ["Some content here ", "(A) Apples ", "(B) Bananas (Z) Zebras"]

        actual = tree_utils.split_text(source, ["(A)", "(B)"])
        self.assertEqual(wanted, actual)
def _appendix_paragraph_text(paragraph):
    """Join a paragraph's leading text with the tail text of its children.

    A missing (None) leading text is skipped instead of being passed to
    ``str.join``, which would raise TypeError.
    """
    pieces = [] if paragraph.text is None else [paragraph.text]
    pieces.extend(c.tail for c in paragraph if c.tail)
    return ' '.join(pieces)


def _next_paragraph_starts_with_ii(paragraph):
    """Report whether the element following `paragraph` opens with 'ii'.

    Returns False when there is no following element or it has no markers.
    """
    following = paragraph.getnext()
    if following is None:
        return False
    next_markers = tree_utils.get_paragraph_markers(
        _appendix_paragraph_text(following))
    return bool(next_markers) and next_markers[0] == 'ii'


def process_appendix(m_stack, current_section, child):
    """Add the headers and paragraphs of an appendix to the node stack.

    Each ``HD`` child becomes a level-2 appendix Node. Each ``P`` child is
    split on its paragraph markers and every piece is added to the stack as
    an appendix Node; a paragraph without markers is appended to the text
    of the last node on the stack.

    :param m_stack: node stack being built up; modified in place.
    :param current_section: passed to get_appendix_section_number when
        working out a header's section.
    :param child: XML element whose children are processed.
    """
    for ch in child.getchildren():
        if ch.tag == 'HD':
            appendix_section = get_appendix_section_number(
                ch.text, current_section)

            if appendix_section is None:
                appendix_section = determine_next_section(m_stack, 2)

            n = Node(
                node_type=Node.APPENDIX, label=[appendix_section],
                title=ch.text)

            node_level = 2
            tree_utils.add_to_stack(m_stack, node_level, n)
        if ch.tag == 'P':
            text = _appendix_paragraph_text(ch)
            markers_list = tree_utils.get_paragraph_markers(text)

            node_text = tree_utils.get_node_text(ch)

            if len(markers_list) == 0:
                # No markers: this paragraph continues the previous node.
                last = m_stack.peek_last()
                last[1].text = last[1].text + '\n %s' % node_text
                continue

            if len(markers_list) > 1:
                actual_markers = ['(%s)' % m for m in markers_list]
                marker_texts = tree_utils.split_text(
                    node_text, actual_markers)
            else:
                marker_texts = [node_text]

            for m, marker_text in zip(markers_list, marker_texts):
                n = Node(
                    marker_text, label=[str(m)], node_type=Node.APPENDIX)

                last = m_stack.peek()
                node_level = determine_level(m, last[0][0])

                # This is a bit of a hack: we can't easily distinguish
                # between the Roman numeral (i) and the letter (i) to
                # determine the level, so we look ahead at the next
                # paragraph to help. This is not a complete solution and
                # we should circle back at some point.
                if m == 'i' and _next_paragraph_starts_with_ii(ch):
                    node_level = 5
                tree_utils.add_to_stack(m_stack, node_level, n)
    def test_split_text_with_prefix(self):
        """Intro text before the first token survives as a separate piece."""
        pieces = tree_utils.split_text(
            "Some content here (A) Apples (B) Bananas (Z) Zebras",
            ['(A)', '(B)'])
        self.assertEqual(
            ['Some content here ', '(A) Apples ', '(B) Bananas (Z) Zebras'],
            pieces)
def build_section(reg_part, section_xml):
    """Build the node tree for one section of regulation text.

    Every ``P`` child is split on its paragraph markers and the pieces are
    pushed onto a node stack at the level determine_level assigns them;
    paragraphs without markers are collected into the section's own text.

    :param reg_part: regulation part; used to find the section number in
        the title (``<reg_part>.<number>``) and as the first label element.
    :param section_xml: XML element for the section; its ``SECTNO`` and
        ``SUBJECT`` children supply the title.
    :returns: the node left on the stack after unwinding it, or None when
        the title contains no section number for ``reg_part``.
    """
    p_level = 1
    m_stack = NodeStack()
    section_texts = []
    for ch in section_xml.getchildren():
        if ch.tag != 'P':
            continue

        # A paragraph may have no leading text (None); skip it rather than
        # letting str.join raise TypeError.
        leading = [] if ch.text is None else [ch.text]
        text = ' '.join(leading + [c.tail for c in ch if c.tail])
        markers_list = tree_utils.get_paragraph_markers(text)
        node_text = tree_utils.get_node_text(ch)

        if len(markers_list) > 1:
            actual_markers = ['(%s)' % m for m in markers_list]
            marker_texts = tree_utils.split_text(node_text, actual_markers)
        elif markers_list:
            marker_texts = [node_text]
        else:   # Does not contain paragraph markers
            section_texts.append(node_text)
            marker_texts = []

        for m, marker_text in zip(markers_list, marker_texts):
            n = Node(marker_text, [], [str(m)])

            new_p_level = determine_level(m, p_level)
            last = m_stack.peek()
            if len(last) == 0:
                m_stack.push_last((new_p_level, n))
            else:
                tree_utils.add_to_stack(m_stack, new_p_level, n)
            p_level = new_p_level

    section_title = section_xml.xpath('SECTNO')[0].text
    subject_text = section_xml.xpath('SUBJECT')[0].text
    if subject_text:
        section_title += " " + subject_text

    section_number_match = re.search(r'%s\.(\d+)' % reg_part, section_title)
    #   Sometimes not reg text sections get mixed in
    if not section_number_match:
        return None

    section_number = section_number_match.group(1)
    # The section element itself may carry no text (None).
    own_text = [] if section_xml.text is None else [section_xml.text]
    section_text = ' '.join(own_text + section_texts)
    sect_node = Node(
        section_text, label=[reg_part, section_number],
        title=section_title)

    m_stack.add_to_bottom((1, sect_node))

    while m_stack.size() > 1:
        tree_utils.unwind_stack(m_stack)

    return m_stack.pop()[0][1]