def test_debug_idx(self):
    """debug_idx should report the index of the first marker at which
    depth derivation fails."""
    # (markers, expected index) pairs checked with the default constraints.
    # NOTE(review): in the first case the expected value equals
    # len(markers) -- presumably "no failure found"; confirm against
    # debug_idx itself.
    default_constraint_cases = (
        (['1', '2', '3'], 3),
        (['1', '4'], 1),
        (['1', '2', '4'], 2),
    )
    for marker_list, first_bad_idx in default_constraint_cases:
        self.assertEqual(debug_idx(marker_list), first_bad_idx)

    # Same check, this time passing an explicit list of extra constraints.
    extra_constraints = [rules.depth_type_inverses]
    self.assertEqual(
        debug_idx(['1', 'a', '2', 'A'], extra_constraints), 3)
def test_debug_idx(self):
    """debug_idx should report the index of the first marker at which
    depth derivation fails."""
    # Each entry pairs a marker sequence with the index debug_idx is
    # expected to return when no extra constraints are supplied.
    # NOTE(review): for ['1', '2', '3'] the expected value equals
    # len(markers) -- presumably "nothing failed"; confirm in debug_idx.
    expected_by_markers = [
        (['1', '2', '3'], 3),
        (['1', 'c'], 1),
        (['1', '2', 'c'], 2),
    ]
    for sequence, expected_idx in expected_by_markers:
        actual_idx = debug_idx(sequence)
        self.assertEqual(actual_idx, expected_idx)

    # With the optional depth_type_inverses constraint supplied explicitly.
    constrained_idx = debug_idx(
        ['1', 'a', '2', 'A'], [optional_rules.depth_type_inverses])
    self.assertEqual(constrained_idx, 3)
def process(self, xml, root):
    """Parse the paragraphs found in `xml` and hang them off `root`.

    The nodes returned by `self.parse_nodes(xml)` are split by
    `self.separate_intro` into an optional intro node and the remaining
    nodes. When an intro node is present its text (and tagged text, if
    any) is merged into `root`. The remaining nodes have their depths
    derived from the first component of each node's label and are then
    handed to `self.build_hierarchy`.

    :param xml: element passed to `self.parse_nodes`; its `tag` is only
        read when reporting a depth-derivation failure
    :param root: node that receives the intro text and the hierarchy
    :returns: the result of `self.build_hierarchy(root, nodes, depths)`
        when there are nodes to place, otherwise `root` itself
    """
    nodes = self.parse_nodes(xml)
    intro_node, nodes = self.separate_intro(nodes)
    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        # Keep only truthy values: an attribute that exists but is None
        # (or empty) must not reach str.join, which would raise TypeError.
        tagged_text_list = [
            tagged_text
            for tagged_text in (getattr(root, 'tagged_text', None),
                                getattr(intro_node, 'tagged_text', None))
            if tagged_text
        ]
        if tagged_text_list:
            root.tagged_text = ' '.join(tagged_text_list)

    if not nodes:
        return root

    markers = [node.label[0] for node in nodes]
    constraints = self.additional_constraints()
    depths = derive_depths(markers, constraints)

    if not depths:
        # Report how far derivation got before it broke down.
        fails_at = debug_idx(markers, constraints)
        logging.error(
            "Could not determine paragraph depths (<%s /> %s):\n"
            "%s\n"
            "?? %s\n"
            "Remaining markers: %s",
            xml.tag, root.label_id(),
            derive_depths(markers[:fails_at],
                          constraints)[0].pretty_str(),
            markers[fails_at], markers[fails_at + 1:])
    # NOTE(review): when derivation failed, `depths` is still empty here
    # and is passed to select_depth regardless -- confirm it copes.
    depths = self.select_depth(depths)
    return self.build_hierarchy(root, nodes, depths)
def process(self, xml, root):
    """Parse the paragraphs found in `xml` and hang them off `root`.

    The nodes returned by `self.parse_nodes(xml)` are split by
    `self.separate_intro` into an optional intro node and the remaining
    nodes. When an intro node is present its text (and tagged text, if
    any) is merged into `root`. Depths for the remaining nodes are
    derived from the first component of each node's label, first with
    `self.additional_constraints()` and, if that yields nothing, a second
    time with de-emphasized markers and `self.relaxed_constraints()`.

    :param xml: element passed to `self.parse_nodes`; its `tag` is only
        read when reporting a depth-derivation failure
    :param root: node that receives the intro text and the hierarchy
    :returns: the result of `self.build_hierarchy(root, nodes, depths)`
        when there are nodes to place, otherwise `root` itself
    """
    nodes = self.parse_nodes(xml)
    intro_node, nodes = self.separate_intro(nodes)
    if intro_node:
        root.text = " ".join([root.text, intro_node.text]).strip()
        # @todo - this is ugly. Make tagged_text a legitimate field on Node
        tagged_text_list = [
            tagged_text
            for tagged_text in (getattr(root, 'tagged_text', None),
                                getattr(intro_node, 'tagged_text', None))
            if tagged_text
        ]
        if tagged_text_list:
            root.tagged_text = ' '.join(tagged_text_list)

    if not nodes:
        return root

    markers = [node.label[0] for node in nodes]
    # Track the marker list used by the most recent derivation attempt so
    # that failure diagnostics describe the attempt that actually failed.
    attempted_markers = markers
    constraints = self.additional_constraints()
    depths = derive_depths(attempted_markers, constraints)

    if not depths:
        logging.warning("Could not derive paragraph depths."
                        " Retrying with relaxed constraints.")
        attempted_markers = [deemphasize(m) for m in markers]
        constraints = self.relaxed_constraints()
        depths = derive_depths(attempted_markers, constraints)

    if not depths:
        # Diagnose with the same markers/constraints pair that just
        # failed; mixing the original markers with the relaxed constraints
        # would describe a derivation that was never attempted.
        fails_at = debug_idx(attempted_markers, constraints)
        logging.error(
            "Could not determine paragraph depths (<%s /> %s):\n"
            "%s\n"
            "?? %s\n"
            "Remaining markers: %s",
            xml.tag, root.label_id(),
            derive_depths(attempted_markers[:fails_at],
                          constraints)[0].pretty_str(),
            attempted_markers[fails_at],
            attempted_markers[fails_at + 1:])
    # NOTE(review): when both attempts failed, `depths` is still empty
    # here and is passed to select_depth regardless -- confirm it copes.
    depths = self.select_depth(depths)
    return self.build_hierarchy(root, nodes, depths)