def test_flat_doc(self):
    '''Input doc with just top-level text

    Trims a nine-word, single-element document to every limit from 1
    through 10 and checks that the trimmed tree holds exactly
    min(limit, 9) words.
    '''
    total_words = 9  # number of words in the sample document below
    x = amara.parse('<a>one two three four five six seven eight nine</a>')
    for limit in range(1, 11):  # the last limit exceeds the word count
        trimmed_tree = util.trim_word_count(x, limit)
        # XPath string(.) gives the text content without markup, so a
        # whitespace split counts words only.
        word_count = len(trimmed_tree.xml_select(u'string(.)').split())
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias.
        self.assertEqual(word_count, min(limit, total_words))
def test_nested_doc(self):
    '''Input doc with text in nested elements

    Trims a nine-word document whose words are spread over nested
    child elements to every limit from 1 through 10 and checks that the
    trimmed tree holds exactly min(limit, 9) words.
    '''
    total_words = 9  # number of words in the sample document below
    x = amara.parse('<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>')
    for limit in range(1, 11):  # the last limit exceeds the word count
        trimmed_tree = util.trim_word_count(x, limit)
        # XPath string(.) concatenates descendant text without markup, so
        # a whitespace split counts words only.
        word_count = len(trimmed_tree.xml_select(u'string(.)').split())
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias.
        self.assertEqual(word_count, min(limit, total_words))
def test_nested_doc(self):
    '''Input doc with text in nested elements

    Trims a nine-word document whose words are spread over nested
    child elements to every limit from 1 through 10 and checks that the
    trimmed tree holds exactly min(limit, 9) words.
    '''
    total_words = 9  # number of words in the sample document below
    x = amara.parse(
        '<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>'
    )
    for limit in range(1, 11):  # the last limit exceeds the word count
        trimmed_tree = util.trim_word_count(x, limit)
        # XPath string(.) concatenates descendant text without markup, so
        # a whitespace split counts words only.
        word_count = len(trimmed_tree.xml_select(u'string(.)').split())
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias.
        self.assertEqual(word_count, min(limit, total_words))
def akara_twc(body, ctype, max=None, html='no'):
    '''
    Trim POSTed markup down to a maximum number of words and return the
    trimmed result. The trimming is markup-aware: tags are not counted as
    words, and the structure of the sub-elements that make it into the
    result is preserved.

    body - the POSTed markup to trim
    ctype - content type of the request (not used by this function)
    max (query parameter) - maximum word count of the resulting text;
        500 is used when it is missing or empty
    html (query parameter) - if 'yes', try to parse the input as HTML

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.twc?max=7"
    '''
    # int() raises ValueError when max is not numeric, and that error is
    # left to propagate. Open design question from the original author:
    # could Akara offer a "Maybe"-like idiom for error handling, with the
    # simple expressiveness of assert?
    if max:
        word_limit = int(max)
    else:
        word_limit = 500
    # Pick the parser only after the limit is validated; anything other
    # than the exact string 'yes' means the XML parser.
    parse = htmldoc.parse if html == 'yes' else amara.parse
    return trim_word_count(parse(body), word_limit)