def build_hierarchy(self, root, nodes, depths):
     """Assemble a tree beneath ``root`` from a flat list of nodes.

     ``nodes`` and ``depths`` are walked in lockstep. The root is placed on
     the stack at level 0, so every other node is handled one level below
     the depth reported for it. Each node has its label parts passed
     through ``mtypes.deemphasize`` and is then handed to
     ``replace_markerless`` and ``carry_label_to_children``. Nodes whose
     depth entry is of type ``mtypes.stars`` are not added to the stack.

     Returns the result of collapsing the node stack.
     """
     node_stack = tree_utils.NodeStack()
     node_stack.add(0, root)

     for child, info in zip(nodes, depths):
         child.label = list(map(mtypes.deemphasize, child.label))
         # The root sits at level 0, hence the shift by one.
         level = info.depth + 1
         self.replace_markerless(node_stack, child, level)
         self.carry_label_to_children(child)
         if info.typ != mtypes.stars:
             node_stack.add(level, child)

     return node_stack.collapse()
 def build_hierarchy(self, root, nodes, depths):
     """Build a node hierarchy around ``root``.

     Takes the root node, a flat list of child nodes and a matching list of
     depth entries. Every child gets a de-emphasized label, is passed to
     ``replace_markerless`` and ``carry_label_to_children``, and -- unless
     its depth entry has type ``mtypes.stars`` -- is added to the stack one
     level deeper than its reported depth (level 0 is taken by the root).

     Returns the collapsed stack.
     """
     hierarchy = tree_utils.NodeStack()
     hierarchy.add(0, root)

     for current, depth_entry in zip(nodes, depths):
         clean_label = [mtypes.deemphasize(part) for part in current.label]
         current.label = clean_label

         target_level = depth_entry.depth + 1
         self.replace_markerless(hierarchy, current, target_level)
         self.carry_label_to_children(current)

         keep = depth_entry.typ != mtypes.stars
         if keep:
             hierarchy.add(target_level, current)

     return hierarchy.collapse()
def split_by_markers(xml):
    """Break an xml node into one triplet per subparagraph.

    Each triplet has the form
        (marker, plain-text following, text-with-tags following)
    The plain text is split on the de-emphasized, parenthesized markers;
    the tagged text is split on the markers as found, parenthesized.
    """
    parenthesize = '({})'.format

    text = tree_utils.get_node_text(xml, add_spaces=True).strip()
    text_with_tags = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(text_with_tags, next_marker(xml))

    plain_chunks = tree_utils.split_text(
        text,
        [parenthesize(mtypes.deemphasize(m)) for m in markers_list])
    tagged_chunks = tree_utils.split_text(
        text_with_tags,
        [parenthesize(m) for m in markers_list])

    # More text chunks than markers: per the original note this is caused
    # by an initial marker-less chunk, so pad the front of the marker list.
    if len(plain_chunks) > len(markers_list):
        markers_list.insert(0, mtypes.MARKERLESS)

    return list(zip(markers_list, plain_chunks, tagged_chunks))
Esempio n. 4
0
def split_by_markers(xml):
    """Return ``(marker, plain text, tagged text)`` triplets for ``xml``.

    One triplet is produced per subparagraph found in the node. Markers
    come from ``get_markers``; the node's plain text and tag-preserving
    text are each split on the parenthesized markers (de-emphasized for
    the plain text).
    """
    def _in_parens(marker):
        # Markers appear in the text wrapped in parentheses, e.g. "(a)".
        return '({})'.format(marker)

    stripped_plain = tree_utils.get_node_text(xml, add_spaces=True).strip()
    stripped_tagged = tree_utils.get_node_text_tags_preserved(xml).strip()
    markers_list = get_markers(stripped_tagged, next_marker(xml))

    plain_separators = [_in_parens(mtypes.deemphasize(marker))
                        for marker in markers_list]
    plain_pieces = tree_utils.split_text(stripped_plain, plain_separators)

    tagged_separators = [_in_parens(marker) for marker in markers_list]
    tagged_pieces = tree_utils.split_text(stripped_tagged, tagged_separators)

    extra_leading_piece = len(plain_pieces) > len(markers_list)
    if extra_leading_piece:     # due to initial MARKERLESS
        markers_list.insert(0, mtypes.MARKERLESS)

    return list(zip(markers_list, plain_pieces, tagged_pieces))
Esempio n. 5
0
    def process(self, xml, root):
        """Parse ``xml`` into nodes and attach them beneath ``root``.

        Any intro node reported by ``separate_intro`` is folded into the
        root: its text is appended to the root's text and, where present,
        the ``tagged_text`` attributes are joined as well. If no other nodes
        remain, ``root`` is returned as is. Otherwise paragraph depths are
        derived from the first label part of each node (retrying once with
        relaxed constraints and de-emphasized markers), one solution is
        picked via ``select_depth`` and the result of ``build_hierarchy`` is
        returned.
        """
        parsed = self.parse_nodes(xml)
        intro_node, nodes = self.separate_intro(parsed)

        if intro_node:
            root.text = " ".join([root.text, intro_node.text]).strip()
            # @todo - this is ugly. Make tagged_text a legitimate field on Node
            tagged_parts = [
                holder.tagged_text
                for holder in (root, intro_node)
                if getattr(holder, 'tagged_text', None)
            ]
            if tagged_parts:
                root.tagged_text = ' '.join(tagged_parts)

        # Nothing to arrange below the root.
        if not nodes:
            return root

        markers = [node.label[0] for node in nodes]
        constraints = self.additional_constraints()
        depths = derive_depths(markers, constraints)

        if not depths:
            logging.warning("Could not derive paragraph depths."
                            " Retrying with relaxed constraints.")
            deemphasized_markers = [deemphasize(m) for m in markers]
            constraints = self.relaxed_constraints()
            depths = derive_depths(deemphasized_markers, constraints)

        if not depths:
            # Report the first marker that cannot be placed, along with the
            # depths derived for the markers preceding it.
            fails_at = debug_idx(markers, constraints)
            tag = xml.tag
            label = root.label_id()
            solved_prefix = derive_depths(markers[:fails_at], constraints)[0]
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                tag, label, solved_prefix.pretty_str(),
                markers[fails_at], markers[fails_at + 1:])
            # NOTE(review): the still-empty ``depths`` is passed on to
            # select_depth below -- confirm that is the intended behavior.

        chosen = self.select_depth(depths)
        return self.build_hierarchy(root, nodes, chosen)