def test_debug_idx(self):
     """debug_idx should report the index of the first error hit while
     attempting to derive depths"""
     # (markers, expected index) pairs checked without extra constraints
     unconstrained_cases = (
         (['1', '2', '3'], 3),
         (['1', '4'], 1),
         (['1', '2', '4'], 2),
     )
     for markers, expected in unconstrained_cases:
         self.assertEqual(debug_idx(markers), expected)

     # Same check with an additional constraint passed through
     constrained_idx = debug_idx(
         ['1', 'a', '2', 'A'], [rules.depth_type_inverses])
     self.assertEqual(constrained_idx, 3)
 def test_debug_idx(self):
     """debug_idx should report the index of the first error hit while
     attempting to derive depths"""
     # (markers, expected index) pairs checked without extra constraints
     unconstrained_cases = (
         (['1', '2', '3'], 3),
         (['1', 'c'], 1),
         (['1', '2', 'c'], 2),
     )
     for markers, expected in unconstrained_cases:
         self.assertEqual(debug_idx(markers), expected)

     # Same check with an additional constraint passed through
     constrained_idx = debug_idx(
         ['1', 'a', '2', 'A'], [optional_rules.depth_type_inverses])
     self.assertEqual(constrained_idx, 3)
 def process(self, xml, root):
     """Fold any intro node into `root`, then arrange the remaining parsed
     nodes beneath it.

     The nodes come from self.parse_nodes(xml); self.separate_intro splits
     off an optional intro node whose text and tagged_text are appended to
     root's. Depths for the remaining nodes are derived from the first
     component of each node's label.

     Returns `root` itself when no nodes remain, otherwise the result of
     self.build_hierarchy(root, nodes, depths)."""
     nodes = self.parse_nodes(xml)
     intro_node, nodes = self.separate_intro(nodes)
     if intro_node:
         root.text = " ".join([root.text, intro_node.text]).strip()
         # @todo - this is ugly. Make tagged_text a legitimate field on Node
         # getattr(..., None) rather than hasattr: an attribute that is
         # present but None/empty must be skipped, otherwise the join below
         # raises TypeError (None) or adds a stray space (empty string)
         tagged_text_list = [
             node.tagged_text for node in (root, intro_node)
             if getattr(node, 'tagged_text', None)]
         if tagged_text_list:
             root.tagged_text = ' '.join(tagged_text_list)
     if nodes:
         markers = [node.label[0] for node in nodes]
         constraints = self.additional_constraints()
         depths = derive_depths(markers, constraints)
         if not depths:
             # Find where derivation breaks and log the layout derived for
             # the markers before that point, the offending marker, and
             # whatever follows it
             fails_at = debug_idx(markers, constraints)
             logging.error(
                 "Could not determine paragraph depths (<%s /> %s):\n"
                 "%s\n"
                 "?? %s\n"
                 "Remaining markers: %s",
                 xml.tag, root.label_id(),
                 derive_depths(markers[:fails_at],
                               constraints)[0].pretty_str(),
                 markers[fails_at], markers[fails_at + 1:])
         # NOTE: execution continues here even after the error above has
         # been logged, so select_depth receives the empty result
         depths = self.select_depth(depths)
         return self.build_hierarchy(root, nodes, depths)
     else:
         return root
Ejemplo n.º 4
0
    def process(self, xml, root):
        """Merge any intro node into `root` and arrange the remaining parsed
        nodes beneath it.

        Depths are first derived with self.additional_constraints(); when
        that yields nothing, derivation is retried on de-emphasized markers
        with self.relaxed_constraints(). If the retry also yields nothing,
        an error describing where derivation fails is logged.

        Returns `root` itself when no nodes remain, otherwise the result of
        self.build_hierarchy(root, nodes, depths)."""
        parsed = self.parse_nodes(xml)
        intro_node, nodes = self.separate_intro(parsed)
        if intro_node:
            root.text = " ".join([root.text, intro_node.text]).strip()
            # @todo - this is ugly. Make tagged_text a legitimate field on Node
            tagged_pieces = [
                holder.tagged_text for holder in (root, intro_node)
                if getattr(holder, 'tagged_text', None)]
            if tagged_pieces:
                root.tagged_text = ' '.join(tagged_pieces)

        # Nothing left to place beneath root
        if not nodes:
            return root

        markers = [node.label[0] for node in nodes]
        constraints = self.additional_constraints()
        depths = derive_depths(markers, constraints)

        if not depths:
            logging.warning("Could not derive paragraph depths."
                            " Retrying with relaxed constraints.")
            deemphasized_markers = [deemphasize(m) for m in markers]
            # From here on `constraints` refers to the relaxed set
            constraints = self.relaxed_constraints()
            depths = derive_depths(deemphasized_markers, constraints)

        if not depths:
            fails_at = debug_idx(markers, constraints)
            # Gather the log arguments in the order they appear in the
            # message
            tag = xml.tag
            label_id = root.label_id()
            partial_solutions = derive_depths(markers[:fails_at],
                                              constraints)
            derived_so_far = partial_solutions[0].pretty_str()
            logging.error(
                "Could not determine paragraph depths (<%s /> %s):\n"
                "%s\n"
                "?? %s\n"
                "Remaining markers: %s",
                tag, label_id, derived_so_far,
                markers[fails_at], markers[fails_at + 1:])

        # Runs even after the error above has been logged
        depths = self.select_depth(depths)
        return self.build_hierarchy(root, nodes, depths)