def _get_text_values_for_schema_node(self, node, value):
    """Collect the text strings found in ``value`` by walking ``node``.

    The schema node's ``typ`` decides how ``value`` is traversed:

    * ``colander.Mapping``: each child node is looked up in ``value`` by
      its ``name``; truthy entries are processed recursively.
    * ``colander.Sequence``: every item of ``value`` is processed with the
      first child node (nothing is collected if the node has no children).
    * ``colander.Tuple``: child nodes and items of ``value`` are paired
      positionally.  If ``value`` has fewer items than the node has
      children, the unmatched children are skipped rather than raising
      ``IndexError``.
    * ``colander.String``: ``value`` itself is collected, unless the node
      has a false ``include_in_text`` attribute (treated as true when
      absent).  If the node has a true ``is_html`` attribute (treated as
      false when absent), ``value`` is first passed through
      ``html_to_text`` and collected only if the result is non-empty.

    Any other schema type contributes nothing.  Type checks use
    ``isinstance`` so that subclasses of the colander types above are
    traversed like their base type.

    :param node: schema node describing ``value``; must provide ``typ``
        and ``children``.
    :param value: the data matching ``node``; a falsy value (``None``,
        empty string, empty container) yields an empty list.
    :returns: a flat list of the collected strings, in traversal order.
    """
    result = []
    if not value:
        return result

    typ = node.typ
    if isinstance(typ, colander.Mapping):
        for cnode in node.children:
            val = value.get(cnode.name)
            if val:
                result.extend(
                    self._get_text_values_for_schema_node(cnode, val))
    elif isinstance(typ, colander.Sequence):
        if node.children:
            # Only the first child is consulted as the schema for all items.
            cnode = node.children[0]
            for val in value:
                result.extend(
                    self._get_text_values_for_schema_node(cnode, val))
    elif isinstance(typ, colander.Tuple):
        # zip() stops at the shorter of the two, so a short value cannot
        # trigger an IndexError.
        for cnode, val in zip(node.children, value):
            result.extend(
                self._get_text_values_for_schema_node(cnode, val))
    elif isinstance(typ, colander.String):
        if getattr(node, 'include_in_text', True):
            if getattr(node, 'is_html', False):
                # NOTE(review): the meaning of the second positional
                # argument (0) is defined by html_to_text, which is not
                # visible here -- confirm against its signature.
                value = html_to_text(value, 0)
            if value:
                result.append(value)
    # FIXME: deform.FileData values are not handled (PDF, Word, etc.?).
    return result
def test_htmlutil_html_to_text(self):
    # Checks audrey.htmlutil.html_to_text against three fixtures:
    # a full HTML document with default options, a lone anchor with
    # show_link_urls enabled, and text containing an unrecognised entity
    # with an explicit unknown_entity_replacement.
    from audrey import htmlutil
    to_text = htmlutil.html_to_text

    # Full document, default options.
    document = u'''<html><head><title>Title</title></head><body><h1>Header1</h1><p>Hello <unknown foo="bar">world!</unknown> Perhaps some other Æon...<br/><a href="http://python.org">Ooh, a link!</a><ul><li>animal</li><li>vegetable</li><li>mineral</li></ul></p><p>© 2012</p></body></html>'''
    expected_document_text = u'''Header1\n\n\n\nHello world! Perhaps some other \xc6on...\nOoh, a link!\n\n- animal\n\n- vegetable\n\n- mineral\n\n\xa9 2012'''
    self.assertEqual(to_text(document), expected_document_text)

    # Anchor with show_link_urls=True: the href is expected in brackets
    # after the link text.
    anchor = '''<a href="http://python.org">Ooh, a link!</a>'''
    expected_anchor_text = '''Ooh, a link! [http://python.org]'''
    self.assertEqual(
        to_text(anchor, show_link_urls=True),
        expected_anchor_text,
    )

    # Unrecognised entity: expected to be swapped for the replacement.
    with_entity = '''Foo &meh; Bar'''
    self.assertEqual(
        to_text(with_entity, unknown_entity_replacement='?'),
        'Foo ? Bar',
    )